Source code for moldesign.interfaces.biopython_interface

# Copyright 2016 Autodesk Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import string

import Bio.PDB
import Bio.PDB.MMCIF2Dict
import numpy as np

import moldesign as mdt
from moldesign import units as u
from moldesign.helpers.pdb import BioAssembly


[docs]def parse_mmcif(f): """Parse an mmCIF file (using the Biopython parser) and return a molecule Note: This routine is not currently called by any part of the user-facing API! The OpenBabel parser appears to give more accurate results for the time being. The molecules created using this routine will NOT have any bond topology! Args: f (file): file-like object containing the mmCIF file Returns: moldesign.Molecule: parsed molecule """ return _parse_file(f, Bio.PDB.MMCIFParser)
[docs]def parse_pdb(f): """Parse a PDB file (using the Biopython parser) and return the basic structure Note: This structure will be missing some key data - most notably bonds, but also any biomolecular assembly information. Therefore, our default parser combines this routine with a few other methods to create the final Molecule object See also: moldesign.fileio.read_pdb Args: f (file): file-like object containing the PDB file Returns: moldesign.Molecule: parsed molecule """ # TODO: this needs to handle strings and streams # TODO: deal with alternate locations return _parse_file(f, Bio.PDB.PDBParser)
def _parse_file(f, parser_type): parser = parser_type() struc = parser.get_structure('no name', f) mol = biopy_to_mol(struc) return mol
[docs]def biopy_to_mol(struc): """Convert a biopython PDB structure to an MDT molecule. Note: Biopython doesn't deal with bond data, so no bonds will be present in the Molecule Args: struc (Bio.PDB.Structure.Structure): Biopython PDB structure to convert Returns: moldesign.Molecule: converted molecule """ # TODO: assign bonds using 1) CONECT records, 2) residue templates, 3) distance newatoms = [] backup_chain_names = list(string.ascii_uppercase) for chain in struc.get_chains(): tmp, pdbidx, pdbid = chain.get_full_id() if not pdbid.strip(): pdbid = backup_chain_names.pop() newchain = mdt.Chain(pdbname=pdbid.strip()) for residue in chain.get_residues(): newresidue = mdt.Residue(pdbname=residue.resname.strip(), pdbindex=residue.id[1]) newchain.add(newresidue) for atom in residue.get_atom(): elem = atom.element if len(elem) == 2: elem = elem[0] + elem[1].lower() newatom = mdt.Atom(element=elem, name=atom.get_name(), pdbname=atom.get_name(), pdbindex=atom.get_serial_number()) newatom.position = atom.coord * u.angstrom newresidue.add(newatom) newatoms.append(newatom) return mdt.Molecule(newatoms, name=struc.get_full_id()[0])
[docs]def get_mmcif_assemblies(fileobj=None, mmcdata=None): """Parse an mmCIF file, return biomolecular assembly specifications Args: fileobj (file-like): File-like object for the PDB file (this object will be rewound before returning) mmcdata (dict): dict version of complete mmCIF data structure (if passed, this will not be read again from fileobj) Returns: Mapping[str, BioAssembly]: dict mapping assembly ids to BioAssembly instances """ if mmcdata is None: mmcdata = _getmmcdata(fileobj) if '_pdbx_struct_assembly.id' not in mmcdata: return {} # no assemblies present # Get assembly metadata ids = mmcdata['_pdbx_struct_assembly.id'] details = mmcdata['_pdbx_struct_assembly.details'] chains = mmcdata['_pdbx_struct_assembly_gen.asym_id_list'] opers = mmcdata['_pdbx_struct_assembly_gen.oper_expression'] transform_ids = mmcdata['_pdbx_struct_oper_list.id'] # Get matrix transformations tmat = np.zeros((4, 4)).tolist() for i in xrange(3): # construct displacement vector tmat[i][3] = mmcdata['_pdbx_struct_oper_list.vector[%d]' % (i+1)] for i, j in itertools.product(xrange(0, 3), xrange(0, 3)): # construct rotation matrix tmat[i][j] = mmcdata['_pdbx_struct_oper_list.matrix[%d][%d]' % (i+1, j+1)] transforms = _make_transform_dict(tmat, transform_ids) # Make sure it's a list if not isinstance(ids, list): ids = [ids] details = [details] chains = [chains] opers = [opers] # now create the assembly specifications assemblies = {} for id, detail, chainlist, operlist in zip(ids, details, chains, opers): assert id not in assemblies transforms = [transforms[i] for i in operlist.split(',')] assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms) return assemblies
def _make_transform_dict(tmat, transform_ids): if isinstance(transform_ids, list): for i, j in itertools.product(xrange(0, 3), xrange(0, 4)): tmat[i][j] = map(float, tmat[i][j]) tmat[3][3] = [1.0]*len(transform_ids) tmat[3][0] = tmat[3][1] = tmat[3][2] = [0.0]*len(transform_ids) tmat = np.array(tmat) transforms = {id: tmat[:, :, i] for i, id in enumerate(transform_ids)} else: for i, j in itertools.product(xrange(0, 4), xrange(0, 4)): tmat[i][j] = float(tmat[i][j]) tmat[3][3] = 1.0 tmat = np.array(tmat) transforms = {transform_ids: tmat} return transforms def _getmmcdata(fileobj): mmcdata = Bio.PDB.MMCIF2Dict.MMCIF2Dict(fileobj) fileobj.seek(0) # rewind for future access return mmcdata