Source code for moldesign.interfaces.biopython_interface

# Copyright 2016 Autodesk Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import string

import Bio.PDB
import Bio.PDB.MMCIF2Dict
import numpy as np

import moldesign as mdt
from moldesign import units as u
from moldesign.helpers.pdb import BioAssembly


[docs]def parse_mmcif(f):
    """Parse an mmCIF file (using the Biopython parser) and return a molecule

    Note:
        This routine is not currently called by any part of the user-facing API! The
        OpenBabel parser appears to give more accurate results for the time being. The
        molecules created using this routine will NOT have any bond topology!

    Args:
        f (file): file-like object containing the mmCIF file

    Returns:
        moldesign.Molecule: parsed molecule
    """
    return _parse_file(f, Bio.PDB.MMCIFParser)


[docs]def parse_pdb(f):
    """Parse a PDB file (using the Biopython parser) and return the basic structure

    Note:
        This structure will be missing some key data - most notably bonds, but also
        any biomolecular assembly information. Therefore, our default parser combines
        this routine with a few other methods to create the final Molecule object

    See also:
        moldesign.fileio.read_pdb

    Args:
        f (file): file-like object containing the PDB file

    Returns:
        moldesign.Molecule: parsed molecule
    """
    # TODO: this needs to handle strings and streams
    # TODO: deal with alternate locations

    return _parse_file(f, Bio.PDB.PDBParser)


def _parse_file(f, parser_type):
    parser = parser_type()
    struc = parser.get_structure('no name', f)
    mol = biopy_to_mol(struc)
    return mol


[docs]def biopy_to_mol(struc):
    """Convert a biopython PDB structure to an MDT molecule.

    Note:
        Biopython doesn't deal with bond data, so no bonds will be present
        in the Molecule

    Args:
        struc (Bio.PDB.Structure.Structure): Biopython PDB structure to convert

    Returns:
        moldesign.Molecule: converted molecule
    """
    # TODO: assign bonds using 1) CONECT records, 2) residue templates, 3) distance
    newatoms = []
    backup_chain_names = list(string.ascii_uppercase)

    for chain in struc.get_chains():
        tmp, pdbidx, pdbid = chain.get_full_id()
        if not pdbid.strip():
            pdbid = backup_chain_names.pop()
        newchain = mdt.Chain(pdbname=pdbid.strip())

        for residue in chain.get_residues():
            newresidue = mdt.Residue(pdbname=residue.resname.strip(),
                                     pdbindex=residue.id[1])

            newchain.add(newresidue)

            for atom in residue.get_atom():
                elem = atom.element
                if len(elem) == 2:
                    elem = elem[0] + elem[1].lower()
                newatom = mdt.Atom(element=elem,
                                   name=atom.get_name(),
                                   pdbname=atom.get_name(),
                                   pdbindex=atom.get_serial_number())
                newatom.position = atom.coord * u.angstrom
                newresidue.add(newatom)

                newatoms.append(newatom)

    return mdt.Molecule(newatoms,
                        name=struc.get_full_id()[0])


[docs]def get_mmcif_assemblies(fileobj=None, mmcdata=None):
    """Parse an mmCIF file, return biomolecular assembly specifications

    Args:
        fileobj (file-like): File-like object for the PDB file
            (this object will be rewound before returning)
        mmcdata (dict): dict version of complete mmCIF data structure (if passed, this will
            not be read again from fileobj)

    Returns:
        Mapping[str, BioAssembly]: dict mapping assembly ids to BioAssembly instances
    """
    if mmcdata is None:
        mmcdata = _getmmcdata(fileobj)

    if '_pdbx_struct_assembly.id' not in mmcdata:
        return {}  # no assemblies present

    # Get assembly metadata
    ids = mmcdata['_pdbx_struct_assembly.id']
    details = mmcdata['_pdbx_struct_assembly.details']
    chains = mmcdata['_pdbx_struct_assembly_gen.asym_id_list']
    opers = mmcdata['_pdbx_struct_assembly_gen.oper_expression']
    transform_ids = mmcdata['_pdbx_struct_oper_list.id']

    # Get matrix transformations
    tmat = np.zeros((4, 4)).tolist()
    for i in xrange(3):  # construct displacement vector
        tmat[i][3] = mmcdata['_pdbx_struct_oper_list.vector[%d]' % (i+1)]
    for i, j in itertools.product(xrange(0, 3), xrange(0, 3)):  # construct rotation matrix
        tmat[i][j] = mmcdata['_pdbx_struct_oper_list.matrix[%d][%d]' % (i+1, j+1)]
    transforms = _make_transform_dict(tmat, transform_ids)

    # Make sure it's a list
    if not isinstance(ids, list):
        ids = [ids]
        details = [details]
        chains = [chains]
        opers = [opers]

    # now create the assembly specifications
    assemblies = {}
    for id, detail, chainlist, operlist in zip(ids, details, chains, opers):
        assert id not in assemblies
        transforms = [transforms[i] for i in operlist.split(',')]
        assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms)

    return assemblies


def _make_transform_dict(tmat, transform_ids):
    if isinstance(transform_ids, list):
        for i, j in itertools.product(xrange(0, 3), xrange(0, 4)):
            tmat[i][j] = map(float, tmat[i][j])
        tmat[3][3] = [1.0]*len(transform_ids)
        tmat[3][0] = tmat[3][1] = tmat[3][2] = [0.0]*len(transform_ids)
        tmat = np.array(tmat)
        transforms = {id: tmat[:, :, i] for i, id in enumerate(transform_ids)}
    else:
        for i, j in itertools.product(xrange(0, 4), xrange(0, 4)):
            tmat[i][j] = float(tmat[i][j])
        tmat[3][3] = 1.0
        tmat = np.array(tmat)
        transforms = {transform_ids: tmat}

    return transforms


def _getmmcdata(fileobj):
    mmcdata = Bio.PDB.MMCIF2Dict.MMCIF2Dict(fileobj)
    fileobj.seek(0)  # rewind for future access
    return mmcdata