Source code for moldesign.fileio

# Copyright 2016 Autodesk Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import cPickle as pkl # TODO: if cpickle fails, retry with regular pickle to get a better traceback
import cStringIO as StringIO
import functools
import gzip
import os

import moldesign as mdt
from moldesign.interfaces import biopython_interface
import moldesign.interfaces.openbabel as openbabel_interface
from moldesign.interfaces.openmm import amber_to_mol as read_amber
from moldesign.helpers import pdb
from moldesign import chemjson

def exports(o, name=None):
    __all__.append(o.__name__)
    return o

__all__ = ['from_smiles', 'read_amber']


from_smiles = openbabel_interface.from_smiles


@exports
[docs]def read(f, format=None): """Read in a molecule from a file, file-like object, or string. Will also depickle a pickled object. Note: Files with ``.bz2`` or ``.gz`` suffixes will be automatically decompressed. Currently does not support files with more than one record - only returns the first record Args: f (str or file-like): Either a path to a file, OR a string with the file's contents, OR a file-like object format (str): molecule format (pdb, xyz, sdf, etc.) or pickle format (recognizes p, pkl, or pickle); guessed from filename if not passed Returns: moldesign.Molecule or object: molecule parsed from the file (or python object, for pickle files) Raises: ValueError: if ``f`` isn't recognized as a string, file path, or file-like object """ filename = None # Open a file-like object if isinstance(f, basestring) and os.path.exists(f): # it's a path to a file filename = f format, compression = _get_format(filename, format) fileobj = COMPRESSION[compression](filename, mode='r') elif hasattr(f, 'open'): # we can get a file-like object fileobj = f.open('r') elif hasattr(f, 'read'): # it's already file-like fileobj = f elif isinstance(f, basestring): # it's just a string fileobj = StringIO.StringIO(f) else: raise ValueError('Parameter to moldesign.read (%s) not ' % str(f) + 'recognized as string, file path, or file-like object') if format in READERS: mol = READERS[format](fileobj) else: # default to openbabel if there's not an explicit reader for this format mol = openbabel_interface.read_stream(fileobj, format) if filename is not None: mol.name = filename return mol
@exports
[docs]def write(obj, filename=None, format=None, mode='w'): """ Write a molecule to a file or string. Will also pickle arbitrary python objects. Note: Files with ``.bz2`` or ``.gz`` suffixes will be automatically compressed. Args: obj (moldesign.Molecule or object): the molecule to be written (or python object to be pickled) filename (str): filename (if not passed, then a string is returned) format (str): molecule format (pdb, xyz, sdf, etc.) or a pickle file extension ('pkl' and 'mdt' are both accepted) Returns: str: if filename is none, return the output file as a string (otherwise returns ``None``) """ # TODO: handle writing and returning file-like objects instead of strings # lets users call mdt.write(obj, 'pdb') and get a string (without needing the "format" keyword if (format is None and filename is not None and len(filename) < 5 and '.' not in filename): filename, format = None, filename format, compression = _get_format(filename, format) # First, create an object to write to (either file handle or file-like buffer) if filename: return_string = False fileobj = COMPRESSION[compression](filename, mode=mode) else: return_string = True fileobj = StringIO.StringIO() # Now, write to the object if format in WRITERS: WRITERS[format](obj, fileobj) else: fileobj.write(openbabel_interface.write_string(obj, format)) # Return a string if necessary if return_string: return fileobj.getvalue() else: fileobj.close()
@exports
[docs]def write_trajectory(traj, filename=None, format=None, overwrite=True): """ Write trajectory a file (if filename provided) or file-like buffer Args: traj (moldesign.molecules.Trajectory): trajectory to write filename (str): name of file (return a file-like object if not passed) format (str): file format (guessed from filename if None) overwrite (bool): overwrite filename if it exists Returns: StringIO: file-like object (only if filename not passed) """ format, compression = _get_format(filename, format) # If user is requesting a pickle, just dump the whole thing now and return if format.lower() in PICKLE_EXTENSIONS: write(traj, filename=filename, format=format) # for traditional molecular file formats, write the frames one after another else: tempmol = traj._tempmol if filename and (not overwrite) and os.path.exists(filename): raise IOError('%s exists' % filename) if not filename: fileobj = StringIO.StringIO() else: fileobj = open(filename, 'w') for frame in traj.frames: traj.apply_frame(frame) fileobj.write(tempmol.write(format=format)) if filename is None: fileobj.seek(0) return fileobj else: fileobj.close()
def read_pdb(f, assign_ccd_bonds=True): """ Read a PDB file and return a molecule. This uses the biopython parser to get the molecular structure, but uses internal parsers to create bonds and biomolecular assembly data. Note: Users won't typically use this routine; instead, they'll use ``moldesign.read``, which will delegate to this routine when appropriate. Args: f (filelike): filelike object giving access to the PDB file (must implement seek) assign_ccd_bonds (bool): Use the PDB Chemical Component Dictionary (CCD) to create bond topology (note that bonds from CONECT records will always be created as well) Returns: moldesign.Molecule: the parsed molecule """ assemblies = pdb.get_pdb_assemblies(f) f.seek(0) mol = biopython_interface.parse_pdb(f) mol.properties.bioassemblies = assemblies f.seek(0) conect_graph = pdb.get_conect_records(f) # Assign bonds from residue templates if assign_ccd_bonds: pdb.assign_biopolymer_bonds(mol) # Create bonds from CONECT records serials = {atom.pdbindex: atom for atom in mol.atoms} for atomserial, nbrs in conect_graph.iteritems(): atom = serials[atomserial] for nbrserial, order in nbrs.iteritems(): nbr = serials[nbrserial] if nbr not in atom.bond_graph: # we already got it from CCD mol.newbond(atom, nbr, order) if assemblies: pdb.warn_assemblies(mol, assemblies) return mol def read_mmcif(f): """ Read an mmCIF file and return a molecule. This uses OpenBabel's basic structure parser along with biopython's mmCIF bioassembly parser Note: Users won't typically use this routine; instead, they'll use ``moldesign.read``, which will delegate to this routine when appropriate. Args: f (filelike): file-like object that accesses the mmCIF file (must implement seek) Returns: moldesign.Molecule: the parsed molecular structure """ mol = openbabel_interface.read_stream(f, 'cif') f.seek(0) assemblies = biopython_interface.get_mmcif_assemblies(f) if assemblies: pdb.warn_assemblies(mol, assemblies) mol.properties.bioassemblies = assemblies return mol def read_xyz(f): tempmol = openbabel_interface.read_stream(f, 'xyz') for atom in tempmol.atoms: atom.residue = None atom.chain = None return mdt.Molecule(tempmol.atoms) @exports
[docs]def mol_to_openmm_sim(mol): try: return mol.energy_model.get_openmm_simulation() except AttributeError: raise AttributeError("Can't create an OpenMM object - no OpenMM energy_model present")
@exports
[docs]def from_pdb(pdbcode, usecif=False): """ Import the given molecular geometry from PDB.org See Also: http://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/introduction Args: pdbcode (str): 4-character PDB code (e.g. 3AID, 1BNA, etc.) usecif (bool): If False (the default), use the PDB-formatted file (default). If True, use the mmCIF-format file from RCSB.org. Returns: moldesign.Molecule: molecule object """ import requests assert len(pdbcode) == 4, "%s is not a valid PDB ID." % pdbcode fileext = 'cif' if usecif else 'pdb' request = requests.get('http://www.rcsb.org/pdb/files/%s.%s' % (pdbcode, fileext)) if request.status_code == 404 and not usecif: # if not found, try the cif-format version print 'WARNING: %s.pdb not found in rcsb.org database. Trying %s.cif...' % ( pdbcode, pdbcode), retval = from_pdb(pdbcode, usecif=True) print 'success.' return retval elif request.status_code != 200: raise ValueError('Failed to download %s.%s from rcsb.org: %s %s' % ( pdbcode, fileext, request.status_code, request.reason)) if usecif: filestring = request.text else: filestring = request.text.encode('ascii') mol = read(filestring, format=fileext) mol.name = pdbcode return mol
@exports
[docs]def from_name(name): """Attempt to convert an IUPAC or common name to a molecular geometry. Args: name (str): molecular name (generally IUPAC - some common names are also recognized) Returns: moldesign.Molecule: molecule object """ from moldesign.interfaces.opsin_interface import name_to_smiles # TODO: fallback to http://cactus.nci.nih.gov/chemical/structure smi = name_to_smiles(name) mol = from_smiles(smi, name) return mol
def _get_format(filename, format): """ Determine the requested file format and optional compression library Args: filename (str or None): requested filename, if present format (str or None): requested format, if present Returns: (str, str): (file format, compression format or ``None`` for no compression) Examples: >>> _get_format('mymol.pdb', None) ('pdb', None) >>> _get_format('smallmol.xyz.bz2', None) ('xyz','bz2') >>> _get_format('mymol.t.gz', 'sdf') ('sdf','gz') """ if filename is None and format is None: raise ValueError('No filename or file format specified') elif filename is None: return format, None fname, extn = os.path.splitext(filename) suffix = extn[1:].lower() compressor = None if suffix in COMPRESSION: compressor = suffix suffix = os.path.splitext(fname)[1][1:].lower() if format is None: format = suffix return format, compressor #################################### # FILE EXTENSION HANDLERS # #################################### # All extensions MUST be lower case READERS = {'json': chemjson.reader, 'pdb': read_pdb, 'cif': read_mmcif, 'mmcif': read_mmcif, 'xyz': read_xyz} WRITERS = {'json': chemjson.writer} PICKLE_EXTENSIONS = set("p pkl pickle mdt".split()) COMPRESSION = {'gz': gzip.open, 'gzip': gzip.open, 'bz2': bz2.BZ2File, 'bzip2': bz2.BZ2File, None: open} for ext in PICKLE_EXTENSIONS: READERS[ext] = pkl.load WRITERS[ext] = functools.partial(pkl.dump, protocol=2)