Source code for moldesign.helpers.pdb

# Copyright 2016 Autodesk Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PDB file parsing utilities

We don't yet (and hopefully will never need) an internal PDB parser or writer. For now,
routines in this module read and write data that's not necessarily parsed by other implementations.
"""
import collections
from cStringIO import StringIO

import numpy as np

import moldesign as mdt

BioAssembly = collections.namedtuple('BioAssembly', 'desc chains transforms')


[docs]def insert_ter_records(mol, pdbfile):
    """ Inserts TER records to indicate the end of the biopolymeric part of a chain

    Many common PDB writers - including OpenBabel - don't insert TER records. This can
    cause a problem for situations such as forcefield assignment. This routine is one
    solution to that problem.

    What it does:
    This routine inserts 'TER' records
    1) after any protein residue not bound to the next residue via backbone peptide bond, and
    2) after any DNA residue not bound to the next residue via a backbone phosphate bond

    In the input PDB file, the ATOM records be formatted with the proper columns (see
    http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM) and
    the chain names and residue numbers must match ``chain.pdbname`` and ``residue.pdbindex``,
    respectively.

    Args:
        mol (moldesign.Molecule): the MDT version of the molecule that pdbfile describes
        pdbfile (TextIO OR str): pdb file (file-like or string)

    Returns:
        cStringIO.StringIO OR str: copy of the pdb file with added TER records - it will be
         returned as the same type passed (i.e., as a filelike buffer or as a string)
    """

    is_string = False
    if isinstance(pdbfile, basestring):
        pdbfile = StringIO(pdbfile)
        is_string = True

    # First, identify where to insert records (if anywhere)
    ter_residues = set()
    for chain in mol.chains:
        if chain.type == 'protein':
            ter_residues.add((chain.pdbname, chain.c_terminal.pdbindex))
        elif chain.type == 'dna':
            ter_residues.add((chain.pdbname, chain.threeprime_end.pdbindex))

    # insert the records (if necessary)
    newf = StringIO()
    pdbfile.seek(0)
    watchres = None
    for line in pdbfile:
        fields = line.split()
        if fields and fields[0] in ('ATOM','HETATM'):  # line is an atom record
            res = (line[21], int(line[22:26].strip()))
            if watchres:
                if res != watchres:
                    print >> newf, 'TER'
                    watchres = None
            if res in ter_residues:
                watchres = res

        elif watchres is not None:  # line is not an atom record
            watchres = None
            if line.strip() != 'TER':
                print >> newf, 'TER'

        newf.write(line)

    newf.seek(0)
    if is_string:
        return newf.read()
    else:
        return newf

[docs]def warn_assemblies(mol, assemblies):
    """ Print a warning message if the PDB structure contains a biomolecular assembly
    """

    # Don't warn if the only assembly is the asymmetric unit
    if len(assemblies) > 1 or len(assemblies.values()[0].transforms) > 1:
        print "WARNING: This PDB file contains the following biomolecular assemblies:"
        for name, asm in assemblies.iteritems():
            print 'WARNING: Assembly "%s": %d copies of chains %s'%(
                name, len(asm.transforms), ', '.join(asm.chains))
            print 'WARNING: Use ``mdt.build_assembly([molecule],[assembly_name])``' \
                  ' to build one of the above assemblies'


[docs]def guess_atnum_from_name(s):
    """ Guess an atomic number given a name string (usually 1-3 characters).

    Args:
        s (str): atomic number

    Returns:
        int: atomic number

    Raises:
        KeyError: if atomic number can't be determined

    Examples:
        >>> guess_atnum_from_name('C')
        6
        >>> guess_atnum_from_name('C1')
        6
        >>> guess_atnum_from_name('cl3')
        17
        >>> guess_atnum_from_name('CL')
        17
    """
    try:  # the unmodified string
        return mdt.data.ATOMIC_NUMBERS[s]
    except KeyError:
        pass

    cleaned = ''.join((c.upper() if i==0 else c.lower())
                      for i,c in enumerate(s)
                      if c.isalpha())

    try:  # just the letters, with proper capitalization
        return mdt.data.ATOMIC_NUMBERS[cleaned]
    except KeyError:
        pass

    # otherwise, just the first letter
    return mdt.data.ATOMIC_NUMBERS[cleaned[0]]


[docs]def get_conect_records(pdbfile):
    """Parse a PDB file, return CONECT records

    Bond orders are assigned as 1 by default. Repeated CONECT records are interpreted as
    higher order bonds.

    Args:
        pdbfile (file): file-like object

    Example:
        > CONECT   1   2   3
        > CONECT   1   2
        > CONECT   2   1   1
        > CONECT   3   1
        These records are interpreted as a double-bond between atoms 1 and 2
        and a single bond between atoms 1 and 3

    Note:
        This only returns the covalent CONECT records (the first 4 entries) - it doesn't
        return salt bridges or hydrogen bonds

    Returns:
        dict: CONECT records using serial numbers -
             ``{serial1: {serial2:order. serial3:order, }, ...}``
    """
    conect = {}
    for line in pdbfile:
        fields = line.split()
        if len(fields) == 0:
            continue
        if fields[0] != 'CONECT':
            continue

        atombonds = conect.setdefault(int(fields[1]), {})
        for f in fields[2:6]:  # TODO: check the end bound
            serial = int(f)
            if serial not in atombonds:
                atombonds[serial] = 0
            atombonds[serial] += 1
    return conect


[docs]def get_pdb_assemblies(fileobj):
    """Parse a PDB file, return biomolecular assembly specifications

    Args:
        fileobj (file-like): File-like object for the PDB file
            (this object will be rewound before returning)

    Returns:
        Mapping[str, BioAssembly]: dict mapping assembly ids to BioAssembly instances
    """
    assemblies = {}
    lineiter = iter(fileobj)
    while True:  # first, search for assembly transformations
        line = lineiter.next()
        fields = line.split()

        # Conditions that indicate we're past the "REMARK 350" section
        if fields[0] in ('ATOM', 'HETATM', 'CONECT'):
            break
        if fields[0] == 'REMARK' and int(fields[1]) > 350:
            break

        # look for start of a assembly transformation, i.e. "REMARK 350 BIOMOLECULE: [name]  "
        if fields[:3] == 'REMARK 350 BIOMOLECULE:'.split():
            assembly_name = fields[-1]
            assemblies[assembly_name] = _read_pdb_assembly(lineiter)

    return assemblies


def _read_pdb_assembly(lineiter):
    """Helper for get_pdb_assemblies
    """
    # First, there's description lines: "REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: OCTAMERIC"
    description_lines = []
    line = lineiter.next()
    fields = line.split()
    while fields[:7] != 'REMARK 350 APPLY THE FOLLOWING TO CHAINS:'.split():
        description_lines.append(line[len('REMARK 350 '):])
        line = lineiter.next()
        fields = line.split()
    description = (''.join(description_lines)).strip()

    # Next, we get the chains in this assembly: "REMARK 350 APPLY THE FOLLOWING TO CHAINS:  C, D"
    assert fields[:7] == 'REMARK 350 APPLY THE FOLLOWING TO CHAINS:'.split()
    chain_names = [x.rstrip(',') for x in fields[7:]]
    while fields[-1][-1] == ',':  # deal with multi-line lists of chains
        line = lineiter.next()
        fields = line.split()
        assert fields[2:4] == ['AND', 'CHAINS:']
        chain_names.extend(x.rstrip(',') for x in fields[4:])

    transforms = []
    while True:  # loop over each assembly transformation
        # example: "REMARK 350   BIOMT1   1  1.000000  0.000000  0.000000        0.00000 "
        t = np.zeros((4, 4))
        t[3, 3] = 1.0

        for idim in xrange(3):
            line = lineiter.next()
            fields = line.split()
            if idim == 0 and len(fields) == 2:
                return BioAssembly(description, chain_names, transforms)

            assert int(fields[3]) == len(transforms)+1
            assert fields[2] == ('BIOMT%d' % (idim+1))
            t[idim, :] = map(float, fields[4:8])

        transforms.append(t)


[docs]def assign_biopolymer_bonds(mol):
    """ Assign bonds to all standard residues using the PDB chemical component dictionary

    Any unrecognized residues are ignored.

    References:
        http://www.wwpdb.org/data/ccd
    """
    for chain in mol.chains:
        try:
            chain.assign_biopolymer_bonds()
        except KeyError:
            print('WARNING: failed to assign backbone bonds for %s') % str(chain)
    for residue in mol.residues:
        try:
            residue.assign_template_bonds()
        except KeyError:
            print('WARNING: failed to assign bonds for %s; use '
                  '``residue.assign_distance.bonds`` to guess the topology') % str(residue)


[docs]def assign_unique_hydrogen_names(mol):
    """ Assign unique names to all hydrogens, based on either:
    1) information in the Chemical Component Database, or
    2) newly generated, unique names

    Args:
        mol (moldesign.Molecule):
    """

    for residue in mol.residues:
        if residue.resname in mdt.data.RESIDUE_BONDS:
            _assign_hydrogen_names_from_ccd(residue)
        else:
            _assign_unique_hydrogen_names_in_order(residue)

        residue.rebuild()


def _assign_hydrogen_names_from_ccd(residue):
    ccd_bonds = mdt.data.RESIDUE_BONDS[residue.resname]
    taken = set(atom.name for atom in residue.atoms)

    if 'H' not in taken:
        return  # nothing to do
    if 'H' in ccd_bonds:
        taken.remove('H')  # someone will actually need to be named "H'

    for atom in residue:
        if atom.atnum != 1 or atom.name != 'H':
            continue
        assert atom.num_bonds == 1, 'Hydrogen has more than one bond'
        bond = atom.bonds[0]
        other = bond.partner(atom).name
        for partner in ccd_bonds[other]:
            if partner[0] == 'H' and partner not in taken:
                assert ccd_bonds[other][partner] == 1, 'Hydrogen bond order is not 1'
                atom.name = partner
                taken.add(partner)
                break


def _assign_unique_hydrogen_names_in_order(residue):
    n_hydrogen = 1
    namecounts = collections.Counter(x.name for x in residue.atoms)
    if namecounts.get('H', 0) > 1:
        used_names = set(atom.name for atom in residue.atoms)
        for atom in residue.atoms:
            if atom.name == 'H':
                name = 'H%d' % n_hydrogen
                while name in used_names:
                    n_hydrogen += 1
                    name = 'H%d' % n_hydrogen
                atom.name = name
                used_names.add(name)