diff --git a/modules/mmcif/.imp_info.py b/modules/mmcif/.imp_info.py
new file mode 100644
index 0000000000..b00919eb57
--- /dev/null
+++ b/modules/mmcif/.imp_info.py
@@ -0,0 +1,3 @@
+{
+    "name": "IMP.mmcif"
+}
diff --git a/modules/mmcif/README.md b/modules/mmcif/README.md
new file mode 100644
index 0000000000..fa2f4b334f
--- /dev/null
+++ b/modules/mmcif/README.md
@@ -0,0 +1,19 @@
+\brief Support for output of models in mmCIF format.
+
+This module adds basic support for outputting IMP models in mmCIF format,
+using the [integrative/hybrid modeling extensions](https://github.com/ihmwg/IHM-dictionary).
+This allows IMP models to be deposited in [PDB-dev](https://pdb-dev.wwpdb.org/).
+
+Note that this module is still under heavy development, and not yet
+completely functional.
+
+# Info
+
+_Author(s)_: Benjamin Webb
+
+_Maintainer_: benmwebb
+
+_License_: LGPL
+
+_Publications_:
+- None
diff --git a/modules/mmcif/dependencies.py b/modules/mmcif/dependencies.py
new file mode 100644
index 0000000000..4541488945
--- /dev/null
+++ b/modules/mmcif/dependencies.py
@@ -0,0 +1,4 @@
+required_modules = 'kernel'
+lib_only_required_modules = 'atom:core:algebra:container:rmf'
+required_dependencies = ''
+optional_dependencies = ''
diff --git a/modules/mmcif/pyext/IMP_mmcif.init.i b/modules/mmcif/pyext/IMP_mmcif.init.i
new file mode 100644
index 0000000000..f11bb63f11
--- /dev/null
+++ b/modules/mmcif/pyext/IMP_mmcif.init.i
@@ -0,0 +1,6 @@
+%pythoncode %{
+
+# Temporary hack to simplify development
+from IMP.mmcif.util import *
+
+%}
diff --git a/modules/mmcif/pyext/src/_compat_collections.py b/modules/mmcif/pyext/src/_compat_collections.py
new file mode 100644
index 0000000000..4200390387
--- /dev/null
+++ b/modules/mmcif/pyext/src/_compat_collections.py
@@ -0,0 +1,259 @@
+"""Compatibility code for Python 2.6 (classes that are present
+   in the 'collections' module in Python 2.7 or later)."""
+
+try:
+    from thread import get_ident as _get_ident
+except ImportError:
+    from dummy_thread import get_ident as _get_ident
+
+try:
+    from _abcoll import KeysView, ValuesView, ItemsView
+except ImportError:
+    pass
+
+class OrderedDict(dict):
+    """Dictionary that remembers insertion order.
+
+    An inherited dict maps keys to values.
+    The inherited dict provides __getitem__, __len__, __contains__, and get.
+    The remaining methods are order-aware.
+    Big-O running times for all methods are the same as for regular
+    dictionaries.
+
+    The internal self.__map dictionary maps keys to links in a doubly
+    linked list. The circular doubly linked list starts and ends with a
+    sentinel element. The sentinel element never gets deleted (this
+    simplifies the algorithm). Each link is stored as a list of length
+    three: [PREV, NEXT, KEY].
+
+    source: http://code.activestate.com/recipes/576693/
+    """
+
+    def __init__(self, *args, **kwds):
+        '''Initialize an ordered dictionary. Signature is the same as for
+        regular dictionaries, but keyword arguments are not recommended
+        because their insertion order is arbitrary.
+
+        '''
+        if len(args) > 1:
+            raise TypeError('expected at most 1 argument, got %d'
+                            % len(args))
+        try:
+            self.__root
+        except AttributeError:
+            self.__root = root = []                     # sentinel node
+            root[:] = [root, root, None]
+            self.__map = {}
+        self.__update(*args, **kwds)
+
+    def __setitem__(self, key, value, dict_setitem=dict.__setitem__):
+        'od.__setitem__(i, y) <==> od[i]=y'
+        # Setting a new item creates a new link which goes at the end
+        # of the linked list, and the inherited dictionary is updated
+        # with the new key/value pair.
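+        # Illustrative sketch (comments only, not executed): after
+        # od['a'] = 1 and od['b'] = 2, the circular list of
+        # [PREV, NEXT, KEY] links is
+        #     root <-> [root, b_link, 'a'] <-> [a_link, root, 'b']
+        # so root[1] is always the oldest link and root[0] the newest.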
+ if key not in self: + root = self.__root + last = root[0] + last[1] = root[0] = self.__map[key] = [last, root, key] + dict_setitem(self, key, value) + + def __delitem__(self, key, dict_delitem=dict.__delitem__): + 'od.__delitem__(y) <==> del od[y]' + # Deleting an existing item uses self.__map to find the link which is + # then removed by updating the links in the predecessor and successor nodes. + dict_delitem(self, key) + link_prev, link_next, key = self.__map.pop(key) + link_prev[1] = link_next + link_next[0] = link_prev + + def __iter__(self): + 'od.__iter__() <==> iter(od)' + root = self.__root + curr = root[1] + while curr is not root: + yield curr[2] + curr = curr[1] + + def __reversed__(self): + 'od.__reversed__() <==> reversed(od)' + root = self.__root + curr = root[0] + while curr is not root: + yield curr[2] + curr = curr[0] + + def clear(self): + 'od.clear() -> None. Remove all items from od.' + try: + for node in self.__map.itervalues(): + del node[:] + root = self.__root + root[:] = [root, root, None] + self.__map.clear() + except AttributeError: + pass + dict.clear(self) + + def popitem(self, last=True): + '''od.popitem() -> (k, v), return and remove a (key, value) pair. + Pairs are returned in LIFO order if last is true or FIFO order if false. + + ''' + if not self: + raise KeyError('dictionary is empty') + root = self.__root + if last: + link = root[0] + link_prev = link[0] + link_prev[1] = root + root[0] = link_prev + else: + link = root[1] + link_next = link[1] + root[1] = link_next + link_next[0] = root + key = link[2] + del self.__map[key] + value = dict.pop(self, key) + return key, value + + # -- the following methods do not depend on the internal structure -- + + def keys(self): + 'od.keys() -> list of keys in od' + return list(self) + + def values(self): + 'od.values() -> list of values in od' + return [self[key] for key in self] + + def items(self): + 'od.items() -> list of (key, value) pairs in od' + return [(key, self[key]) for key in self] + + def iterkeys(self): + 'od.iterkeys() -> an iterator over the keys in od' + return iter(self) + + def itervalues(self): + 'od.itervalues -> an iterator over the values in od' + for k in self: + yield self[k] + + def iteritems(self): + 'od.iteritems -> an iterator over the (key, value) items in od' + for k in self: + yield (k, self[k]) + + def update(*args, **kwds): + '''od.update(E, **F) -> None. Update od from dict/iterable E and F. + + If E is a dict instance, does: for k in E: od[k] = E[k] + If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] + Or if E is an iterable of items, does: for k, v in E: od[k] = v + In either case, this is followed by: for k, v in F.items(): od[k] = v + + ''' + if len(args) > 2: + raise TypeError('update() takes at most 2 positional ' + 'arguments (%d given)' % (len(args),)) + elif not args: + raise TypeError('update() takes at least 1 argument (0 given)') + self = args[0] + # Make progressively weaker assumptions about "other" + other = () + if len(args) == 2: + other = args[1] + if isinstance(other, dict): + for key in other: + self[key] = other[key] + elif hasattr(other, 'keys'): + for key in other.keys(): + self[key] = other[key] + else: + for key, value in other: + self[key] = value + for key, value in kwds.items(): + self[key] = value + + __update = update # let subclasses override update without breaking __init__ + + __marker = object() + + def pop(self, key, default=__marker): + '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. 
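+
+        A minimal illustration:
+
+            >>> od = OrderedDict([('a', 1), ('b', 2)])
+            >>> od.pop('a')
+            1
+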
+ If key is not found, d is returned if given, otherwise KeyError is raised. + + ''' + if key in self: + result = self[key] + del self[key] + return result + if default is self.__marker: + raise KeyError(key) + return default + + def setdefault(self, key, default=None): + 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' + if key in self: + return self[key] + self[key] = default + return default + + def __repr__(self, _repr_running={}): + 'od.__repr__() <==> repr(od)' + call_key = id(self), _get_ident() + if call_key in _repr_running: + return '...' + _repr_running[call_key] = 1 + try: + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, self.items()) + finally: + del _repr_running[call_key] + + def __reduce__(self): + 'Return state information for pickling' + items = [[k, self[k]] for k in self] + inst_dict = vars(self).copy() + for k in vars(OrderedDict()): + inst_dict.pop(k, None) + if inst_dict: + return (self.__class__, (items,), inst_dict) + return self.__class__, (items,) + + def copy(self): + 'od.copy() -> a shallow copy of od' + return self.__class__(self) + + @classmethod + def fromkeys(cls, iterable, value=None): + '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S + and values equal to v (which defaults to None). + + ''' + d = cls() + for key in iterable: + d[key] = value + return d + + def __eq__(self, other): + '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive + while comparison to a regular mapping is order-insensitive. + + ''' + if isinstance(other, OrderedDict): + return len(self)==len(other) and self.items() == other.items() + return dict.__eq__(self, other) + + def __ne__(self, other): + return not self == other + + # -- the following methods are only used in Python 2.7 -- + + def viewkeys(self): + "od.viewkeys() -> a set-like object providing a view on od's keys" + return KeysView(self) + + def viewvalues(self): + "od.viewvalues() -> an object providing a view on od's values" + return ValuesView(self) + + def viewitems(self): + "od.viewitems() -> a set-like object providing a view on od's items" + return ItemsView(self) diff --git a/modules/mmcif/pyext/src/data.py b/modules/mmcif/pyext/src/data.py new file mode 100644 index 0000000000..9880235135 --- /dev/null +++ b/modules/mmcif/pyext/src/data.py @@ -0,0 +1,540 @@ +"""@namespace IMP.mmcif.data + @brief Classes to represent data structures used in mmCIF. +""" + +from __future__ import print_function +import IMP.atom +import IMP.mmcif.dataset +import IMP.mmcif.metadata + +def get_molecule(h): + """Given a Hierarchy, walk up and find the parent Molecule""" + while h: + if IMP.atom.Molecule.get_is_setup(h): + return IMP.atom.Molecule(h) + h = h.get_parent() + return None + +class _Entity(object): + """Represent a CIF entity (a component with a unique sequence)""" + def __init__(self, seq): + self.sequence = seq + self.first_component = None + # Use the name of the first component as the description of the entity + description = property(lambda self: self.first_component.name) + + +class _EntityMapper(dict): + """Handle mapping from IMP chains to CIF entities. 
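+       Entities are keyed on the string returned by Chain.get_sequence().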
+ Multiple components may map to the same entity if they share sequence.""" + def __init__(self): + super(_EntityMapper, self).__init__() + self._sequence_dict = {} + self._entities = [] + + def add(self, chain): + # todo: handle non-protein sequences + sequence = chain.get_sequence() + if sequence not in self._sequence_dict: + entity = _Entity(sequence) + self._entities.append(entity) + entity.id = len(self._entities) + self._sequence_dict[sequence] = entity + self[chain] = self._sequence_dict[sequence] + return self[chain] + + def get_all(self): + """Yield all entities""" + return self._entities + + +def _assign_id(obj, seen_objs, obj_by_id): + """Assign a unique ID to obj, and track all ids in obj_by_id.""" + if obj not in seen_objs: + if not hasattr(obj, 'id'): + obj_by_id.append(obj) + obj.id = len(obj_by_id) + seen_objs[obj] = obj.id + else: + obj.id = seen_objs[obj] + + +class _Component(object): + """An mmCIF component. This is an instance of an _Entity. Multiple + _Components may map to the same _Entity but must have unique + asym_ids. A _Component is similar to an IMP Chain but multiple + Chains may map to the same _Component (the Chains represent the + same structure, just in different states, and potentially in + different IMP Models). A _Component may also represent something + that is described by an experiment but was not modeled by IMP, and + so no Chains map to it but a string name does.""" + def __init__(self, entity, asym_id, name): + self.entity, self.asym_id, self.name = entity, asym_id, name + + +class _ComponentMapper(dict): + """Handle mapping from chains to CIF components.""" + def __init__(self): + super(_ComponentMapper, self).__init__() + self._all_components = [] + self._all_modeled_components = [] + self._map = {} + + def add(self, chain, entity): + """Add a chain (either an IMP Chain object for a modeled component, + or a NonModeledChain object for a non-modeled component)""" + if isinstance(chain, IMP.atom.Chain): + modeled = True + mol = get_molecule(chain) + asym_id = map_key = chain.get_id() + name = mol.get_name() if mol else None + else: + modeled = False + asym_id = None + name = map_key = chain.name + if map_key not in self._map: + component = _Component(entity, asym_id, name) + if entity.first_component is None: + entity.first_component = component + self._all_components.append(component) + if modeled: + self._all_modeled_components.append(component) + self._map[map_key] = component + else: + component = self._map[map_key] + if component.entity != entity: + raise ValueError("Two chains have the same ID (%s) but " + "different sequences - rename one of the " + "chains" % map_key) + return component + + def get_all(self): + """Get all components""" + return self._all_components + + def get_all_modeled(self): + """Get all modeled components""" + return self._all_modeled_components + + +class _Assembly(list): + """A collection of components. Currently simply implemented as a list of + the _Component objects. These must be in creation order.""" + def __hash__(self): + # allow putting assemblies in a dict. 'list' isn't hashable + # but 'tuple' is + return hash(tuple(self)) + + +class _Assemblies(object): + """Track all assemblies used in the modeling.""" + def __init__(self): + self._assemblies = [] + + def add(self, a): + """Add a new assembly. The first such assembly is assumed to contain + all components. 
Duplicate assemblies will be pruned at the end.""" + self._assemblies.append(a) + return a + + def get_subassembly(self, compdict): + """Get an _Assembly consisting of the given components.""" + # Put components in creation order + newa = _Assembly(c for c in self._assemblies[0] if c in compdict) + return self.add(newa) + + def get_all(self): + """Get all assemblies""" + return self._assemblies + + +class _Representation(object): + """Group a set of contiguous particles with the same representation""" + def __init__(self): + self.particles = [] + self.residue_range = () # inclusive range + self.starting_model = None + + def add(self, particle, starting_model): + """Potentially add a new particle to this representation. + Iff the particle could be added, return True.""" + resrange, rigid_body, primitive = self._get_particle_info(particle) + if not self.particles: + self.particles.append(particle) + self.residue_range = resrange + self.rigid_body = rigid_body + self.primitive = primitive + self.starting_model = starting_model + return True + elif type(particle) == type(self.particles[0]) \ + and resrange[0] == self.residue_range[1] + 1 \ + and starting_model == self.starting_model \ + and self._same_rigid_body(rigid_body): + self.particles.append(particle) + self.residue_range = (self.residue_range[0], resrange[1]) + return True + + def _same_rigid_body(self, rigid_body): + # Note: can't just use self.rigid_body == rigid_body as IMP may + # crash when comparing a RigidBody object against None + if self.rigid_body is None and rigid_body is None: + return True + elif self.rigid_body is None or rigid_body is None: + return False + else: + return self.rigid_body == rigid_body + + def _get_particle_info(self, p): + # Note that we consider nonrigid members to not be rigid here + if IMP.core.RigidMember.get_is_setup(p): + rigid_body = IMP.core.RigidMember(p).get_rigid_body() + else: + rigid_body = None + if isinstance(p, IMP.atom.Residue): + return (p.get_index(), p.get_index()), rigid_body, 'sphere' + elif isinstance(p, IMP.atom.Fragment): + resinds = p.get_residue_indexes() + return (resinds[0], resinds[-1]), rigid_body, 'sphere' + raise TypeError("Unknown particle ", p) + + def __bool__(self): + return len(self.particles) > 0 + + __nonzero__ = __bool__ # Python 2 compatibility + + +def _get_all_structure_provenance(p): + """Yield all StructureProvenance decorators for the given particle.""" + return IMP.core.get_all_provenance(p, types=[IMP.core.StructureProvenance]) + +class _StartingModel(object): + _eq_keys = ['filename', 'chain_id', 'offset'] + + def __init__(self, struc_prov): + self.filename = struc_prov[0].get_filename() + self.chain_id = struc_prov[0].get_chain_id() + self.offset = struc_prov[0].get_residue_offset() + + def _add_residue(self, resind): + self.seq_id_end = resind + self.offset + if not hasattr(self, 'seq_id_begin'): + self.seq_id_begin = self.seq_id_end + + def get_seq_id_range_all_sources(self): + source0 = self.sources[0] + # Where there are multiple sources (to date, this can only + # mean multiple templates for a comparative model) consolidate + # them; template info is given in starting_comparative_models. 
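+        # For example (illustrative): template segments covering residues
+        # 1-50 and 40-120 consolidate to the overall range (1, 120).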
+ seq_id_begin, seq_id_end = source0.get_seq_id_range(self) + for source in self.sources[1:]: + this_begin, this_end = source.get_seq_id_range(self) + seq_id_begin = min(seq_id_begin, this_begin) + seq_id_end = max(seq_id_end, this_end) + return seq_id_begin, seq_id_end + + # Two starting models with same filename, chain ID, and offset + # compare identical + # note: this results in separate starting models if only the offset differs; + # maybe consolidate into one? + def _eq_vals(self): + return tuple([self.__class__] + + [getattr(self, x) for x in self._eq_keys]) + def __eq__(self, other): + return other is not None and self._eq_vals() == other._eq_vals() + def __hash__(self): + return hash(self._eq_vals()) + + def _set_sources_datasets(self, system): + # Attempt to identify PDB file vs. comparative model + p = IMP.mmcif.metadata._PDBMetadataParser() + p.parse_file(self.filename, self.chain_id, system) + self.dataset = p.dataset + self.sources = p.sources + self.alignment_file = p.alignment_file + + +class _StartingModelFinder(object): + """Map IMP particles to starting model objects""" + def __init__(self, existing_starting_models): + self._seen_particles = {} + self._seen_starting_models = dict.fromkeys(existing_starting_models) + + def find(self, particle, system): + """Return a StartingModel object, or None, for this particle""" + def _get_starting_model(sp, resind): + s = _StartingModel(sp) + if s not in self._seen_starting_models: + self._seen_starting_models[s] = s + s._set_sources_datasets(system) + s = self._seen_starting_models[s] + if s: + s._add_residue(resind) + return s + resind = None + if IMP.atom.Residue.get_is_setup(particle): + resind = IMP.atom.Residue(particle).get_index() + sp = list(_get_all_structure_provenance(particle)) + if sp: + return _get_starting_model(sp, resind) + elif IMP.atom.Hierarchy.get_is_setup(particle): + h = IMP.atom.Hierarchy(particle).get_parent() + # Remember all nodes we inspect + seen_parents = [] + while h: + if IMP.atom.Residue.get_is_setup(h): + resind = IMP.atom.Residue(h).get_index() + pi = h.get_particle_index() + seen_parents.append(pi) + # If we inspected this node before, return the cached result + if pi in self._seen_particles: + sp = self._seen_particles[pi] + if sp and sp[0] and resind is not None: + sp[0]._add_residue(resind) + return sp[0] if sp else None + else: + sp = list(_get_all_structure_provenance(h)) + self._seen_particles[pi] = [] + if sp: + s = _get_starting_model(sp, resind) + # Set cache for this node and all the children we + # inspected on the way up + for spi in seen_parents: + self._seen_particles[spi].append(s) + return s + h = h.get_parent() + + +class _Datasets(object): + """Store all datasets used.""" + def __init__(self, external_files): + super(_Datasets, self).__init__() + self._datasets = {} + self._external_files = external_files + + def add(self, d): + """Add and return a new dataset.""" + if d not in self._datasets: + self._datasets[d] = d + d.id = len(self._datasets) + self._external_files.add_input(d.location) + return self._datasets[d] + + def get_all(self): + """Yield all datasets""" + return self._datasets.keys() + + +class _Citation(object): + """A publication that describes the modeling.""" + def __init__(self, pmid, title, journal, volume, page_range, year, authors, + doi): + self.title, self.journal, self.volume = title, journal, volume + self.page_range, self.year = page_range, year + self.pmid, self.authors, self.doi = pmid, authors, doi + + +class _Software(object): + """Software (other 
than IMP) used as part of the modeling protocol.""" + def __init__(self, name, classification, description, url, + type='program', version=None): + self.name = name + self.classification = classification + self.description = description + self.url = url + self.type = type + self.version = version + + +class _AllSoftware(list): + """Keep track of all _Software objects.""" + def __init__(self): + super(_AllSoftware, self).__init__() + self.modeller_used = self.phyre2_used = False + + def set_modeller_used(self, version, date): + if self.modeller_used: + return + self.modeller_used = True + self.append(_Software( + name='MODELLER', classification='comparative modeling', + description='Comparative modeling by satisfaction ' + 'of spatial restraints, build ' + date, + url='https://salilab.org/modeller/', + version=version)) + + def set_phyre2_used(self): + if self.phyre2_used: + return + self.phyre2_used = True + self.append(_Software( + name='Phyre2', classification='protein homology modeling', + description='Protein Homology/analogY Recognition ' + 'Engine V 2.0', + version='2.0', url='http://www.sbg.bio.ic.ac.uk/~phyre2/')) + + def add_hierarchy(self, h): + for p in IMP.core.get_all_provenance(h, + types=[IMP.core.SoftwareProvenance]): + self.append(_Software(name=p.get_software_name(), + classification='integrative model building', + description=None, + version=p.get_version(), + url=p.get_location())) + + +class _ExternalFile(object): + """A single externally-referenced file""" + + # All valid content types + INPUT_DATA = "Input data or restraints" + MODELING_OUTPUT = "Modeling or post-processing output" + WORKFLOW = "Modeling workflow or script" + + def __init__(self, location, content_type): + self.location, self.content_type = location, content_type + + # Pass id through to location + def __set_id(self, i): + self.location.id = i + id = property(lambda x: x.location.id, __set_id) + file_size = property(lambda x: x.location.file_size) + + def __eq__(self, other): + return self.location == other.location + def __hash__(self): + return hash(self.location) + + +class _ExternalFiles(object): + """Track all externally-referenced files + (i.e. anything that refers to a Location that isn't + a DatabaseLocation).""" + def __init__(self): + self._refs = [] + self._repos = [] + + def add_repo(self, repo): + """Add a repository containing modeling files.""" + self._repos.append(repo) + + def _add(self, location, content_type): + """Add a new externally-referenced file. 
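+           `content_type` should be one of the _ExternalFile constants
+           (INPUT_DATA, MODELING_OUTPUT, or WORKFLOW).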
+ Note that ids are assigned later.""" + self._refs.append(_ExternalFile(location, content_type)) + + def add_input(self, location): + """Add a new externally-referenced file used as input.""" + return self._add(location, _ExternalFile.INPUT_DATA) + + def add_output(self, location): + """Add a new externally-referenced file produced as output.""" + return self._add(location, _ExternalFile.MODELING_OUTPUT) + + def add_workflow(self, location): + """Add a new externally-referenced file that's part of the workflow.""" + return self._add(location, _ExternalFile.WORKFLOW) + + def add_hierarchy(self, h): + # Add all Python scripts that were used in the modeling + for p in IMP.core.get_all_provenance(h, + types=[IMP.core.ScriptProvenance]): + # todo: set details + l = IMP.mmcif.dataset.FileLocation(path=p.get_filename(), + details='Integrative modeling Python script') + self.add_workflow(l) + + def get_all_nondb(self): + """Yield all external files that are not database hosted""" + for x in self._refs: + if not isinstance(x.location, IMP.mmcif.dataset.DatabaseLocation): + yield x + + +class _ProtocolStep(object): + """A single step (e.g. sampling, refinement) in a protocol.""" + def __init__(self, prov, num_models_begin): + self.num_models_begin = num_models_begin + self._prov = [prov] + self.num_models_end = prov.get_number_of_frames() + + def add_combine(self, prov): + self._prov.append(prov) + self.num_models_end = prov.get_number_of_frames() + return self.num_models_end + + +class _PostProcessing(object): + """A single postprocessing step (e.g. filtering) in a protocol.""" + def __init__(self, prov, num_models_begin): + self.num_models_begin = num_models_begin + self._prov = prov + if isinstance(prov, IMP.core.FilterProvenance): + self.num_models_end = prov.get_number_of_frames() + elif isinstance(prov, IMP.core.ClusterProvenance): + # Assume clustering uses all models + self.num_models_end = self.num_models_begin + else: + raise ValueError("Unhandled provenance", prov) + + +class _Protocol(object): + """A modeling protocol. + Each protocol consists of a number of protocol steps (e.g. sampling, + refinement) followed by a number of postprocessing steps (e.g. 
+ filtering, rescoring, clustering)""" + def __init__(self): + self._steps = [] + self._postprocs = [] + + def add_step(self, prov, num_models): + if isinstance(prov, IMP.core.CombineProvenance): + # Fold CombineProvenance into a previous sampling step + if len(self._steps) == 0: + raise ValueError("CombineProvenance with no previous sampling") + return self._steps[-1].add_combine(prov) + else: + ps = _ProtocolStep(prov, num_models) + self._steps.append(ps) + ps.id = len(self._steps) + return ps.num_models_end + + def add_postproc(self, prov, num_models): + pp = _PostProcessing(prov, num_models) + self._postprocs.append(pp) + pp.id = len(self._postprocs) + return pp.num_models_end + + +class _Protocols(object): + """Track all modeling protocols used.""" + def __init__(self): + self._protocols = [] + + def get_all(self): + return self._protocols + + def _add_protocol(self, prot, modeled_assembly): + prot.modeled_assembly = modeled_assembly + self._protocols.append(prot) + prot.id = len(self._protocols) + + def _add_hierarchy(self, h, modeled_assembly): + num_models = 0 # assume we always start with no models + prot_types = (IMP.core.SampleProvenance, IMP.core.CombineProvenance) + pp_types = (IMP.core.FilterProvenance, IMP.core.ClusterProvenance) + in_postproc = False + prot = _Protocol() + for p in reversed(list(IMP.core.get_all_provenance( + h, types=prot_types + pp_types))): + if isinstance(p, pp_types): + num_models = prot.add_postproc(p, num_models) + in_postproc = True + else: + if in_postproc: + # Start a new protocol + self._add_protocol(prot, modeled_assembly) + prot = _Protocol() + num_models = prot.add_step(p, num_models) + in_postproc = False + if len(prot._steps) > 0: + self._add_protocol(prot, modeled_assembly) diff --git a/modules/mmcif/pyext/src/dataset.py b/modules/mmcif/pyext/src/dataset.py new file mode 100644 index 0000000000..e3d5642a9f --- /dev/null +++ b/modules/mmcif/pyext/src/dataset.py @@ -0,0 +1,180 @@ +"""@namespace IMP.mmcif.dataset + + Classes for tracking data used by mmCIF models. +""" + +import os +try: + from collections import OrderedDict +except ImportError: + from IMP.mmcif._compat_collections import OrderedDict + + +class Dataset(object): + """A set of input data, for example, a crystal structure or EM map.""" + + _eq_keys = ['location'] + + # Datasets compare equal iff they are the same class and have the + # same attributes + def _eq_vals(self): + return tuple([self.__class__] + + [getattr(self, x) for x in self._eq_keys]) + def __eq__(self, other): + return self._eq_vals() == other._eq_vals() + def __hash__(self): + return hash(self._eq_vals()) + + _data_type = 'unspecified' + def __init__(self, location): + self.location = location + self._parents = OrderedDict() + + def add_parent(self, dataset): + """Add another Dataset from which this one was derived. 
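+           Parent links are written to the _ihm_related_datasets mmCIF
+           category by _DatasetDumper.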
+           For example, a 3D EM map may be derived from a set of
+           2D images."""
+        self._parents[dataset] = None
+
+
+class CXMSDataset(Dataset):
+    """Processed crosslinks from a CX-MS experiment"""
+    _data_type = 'CX-MS data'
+
+
+class PDBDataset(Dataset):
+    """An experimentally-determined 3D structure as a set of coordinates,
+       usually in a PDB file"""
+    _data_type = 'Experimental model'
+
+
+class ComparativeModelDataset(Dataset):
+    """A 3D structure determined by comparative modeling"""
+    _data_type = 'Comparative model'
+
+
+class Location(object):
+    """Identifies the location where a resource can be found."""
+
+    # 'details' can differ without affecting dataset equality
+    _eq_keys = []
+    _allow_duplicates = False
+
+    def __init__(self, details=None):
+        self.details = details
+
+    # Locations compare equal iff they are the same class, have the
+    # same attributes, and allow_duplicates=False
+    def _eq_vals(self):
+        if self._allow_duplicates:
+            return id(self)
+        else:
+            return tuple([self.__class__]
+                         + [getattr(self, x) for x in self._eq_keys])
+    def __eq__(self, other):
+        return self._eq_vals() == other._eq_vals()
+    def __hash__(self):
+        return hash(self._eq_vals())
+
+
+class DatabaseLocation(Location):
+    """A dataset stored in an official database (PDB, EMDB, PRIDE, etc.)"""
+
+    _eq_keys = Location._eq_keys + ['db_name', 'access_code', 'version']
+
+    def __init__(self, db_name, db_code, version=None, details=None):
+        super(DatabaseLocation, self).__init__(details)
+        self.db_name = db_name
+        self.access_code = db_code
+        self.version = version
+
+
+class PDBLocation(DatabaseLocation):
+    """Something stored in the PDB database."""
+    def __init__(self, db_code, version=None, details=None):
+        super(PDBLocation, self).__init__('PDB', db_code, version, details)
+
+
+class FileLocation(Location):
+    """An individual file or directory.
+       This may be in a repository (if `repo` is not None) or only on the
+       local disk (if `repo` is None)."""
+
+    _eq_keys = Location._eq_keys + ['repo', 'path']
+
+    def __init__(self, path, repo=None, details=None):
+        """Constructor.
+           @param path the location of the file or directory.
+           @param repo a Repository object that describes the repository
+                  containing the file (if any).
+        """
+        super(FileLocation, self).__init__(details)
+        self.repo = repo
+        if repo:
+            self.path = path
+            # Cannot determine file size if non-local
+            self.file_size = None
+        else:
+            if not os.path.exists(path):
+                raise ValueError("%s does not exist" % path)
+            self.file_size = os.stat(path).st_size
+            # Store absolute path in case the working directory
+            # changes later
+            self.path = os.path.abspath(path)
+
+
+class Repository(object):
+    """A repository containing modeling files.
+       This can be used if the script plus input files are part of a
+       repository, which has been archived somewhere with a DOI.
+       This will be used to construct permanent references to files
+       used in this modeling, even if they haven't been uploaded to
+       a database such as PDB or EMDB.
+
+       @see FileLocation."""
+
+    # Two repositories compare equal if their DOIs and URLs are the same
+    def __eq__(self, other):
+        return self.doi == other.doi and self.url == other.url
+    def __hash__(self):
+        return hash((self.doi, self.url))
+
+    def __init__(self, doi, root=None, url=None,
+                 top_directory=None):
+        """Constructor.
+           @param doi the Digital Object Identifier for the repository
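+               (e.g. "10.5281/zenodo.1234567"; an illustrative,
+               made-up DOI).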
+ @param root the relative path to the top-level directory + of the repository from the working directory of the script, + or None if files in this repository aren't checked out. + @param url If given, a location that this repository can be + downloaded from. + @param top_directory If given, prefix all paths for files in this + repository with this value. This is useful when the archived + version of the repository is found in a subdirectory at the + URL or DOI (for example, GitHub repositories archived at + Zenodo get placed in a subdirectory named for the repository + and git hash). + """ + # todo: DOI should be optional (could also use URL, local path) + self.doi = doi + self.url, self.top_directory = url, top_directory + if root is not None: + # Store absolute path in case the working directory changes later + self._root = os.path.abspath(root) + + @staticmethod + def update_in_repos(fileloc, repos): + """If the given FileLocation maps to somewhere within one of the + passed repositories, update it to reflect that.""" + if fileloc.repo: + return + orig_path = fileloc.path + for repo in repos: + relpath = os.path.relpath(orig_path, repo._root) + if not relpath.startswith('..'): + # Prefer the shortest paths if multiple repositories can match + if fileloc.repo is None or len(fileloc.path) > len(relpath): + fileloc.repo = repo + fileloc.path = relpath + + def _get_full_path(self, path): + """Prefix the given path with our top-level directory""" + return os.path.join(self.top_directory or "", path) diff --git a/modules/mmcif/pyext/src/dumper.py b/modules/mmcif/pyext/src/dumper.py new file mode 100644 index 0000000000..bbecf9d7e0 --- /dev/null +++ b/modules/mmcif/pyext/src/dumper.py @@ -0,0 +1,637 @@ +"""@namespace IMP.mmcif.dumper + @brief Utility classes to dump out information in mmCIF format. +""" + +from __future__ import print_function +import IMP.atom +import IMP.mmcif.data +import IMP.mmcif.dataset +from IMP.mmcif.format import _CifWriter +import operator +import os + +class _Dumper(object): + """Base class for helpers to dump output to mmCIF""" + def __init__(self): + pass + def finalize_metadata(self, system): + pass + def finalize(self, system): + pass + +class _EntryDumper(_Dumper): + def dump(self, system, writer): + entry_id = 'imp_model' + # Write CIF header (so this dumper should always be first) + writer.fh.write("data_%s\n" % entry_id) + with writer.category("_entry") as l: + l.write(id=entry_id) + +class _ChemCompDumper(_Dumper): + def dump(self, system, writer): + seen = {} + std = dict.fromkeys(('ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', + 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', + 'THR', 'VAL', 'TRP', 'TYR')) + with writer.loop("_chem_comp", ["id", "type"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + for num, one_letter_code in enumerate(seq): + restyp = IMP.atom.get_residue_type(one_letter_code) + resid = restyp.get_string() + if resid not in seen: + seen[resid] = None + l.write(id=resid, + type='L-peptide linking' if resid in std \ + else 'other') + + +class _EntityDumper(_Dumper): + # todo: we currently only support amino acid sequences here (and + # then only standard amino acids; need to add support for MSE etc.) 
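+    # A sketch of the loop this dumper emits (values illustrative):
+    #     loop_
+    #     _entity.id
+    #     _entity.type
+    #     ...
+    #     1 polymer man Nup84 ? 1 ?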
+ def dump(self, system, writer): + with writer.loop("_entity", + ["id", "type", "src_method", "pdbx_description", + "formula_weight", "pdbx_number_of_molecules", + "details"]) as l: + for entity in system.entities.get_all(): + l.write(id=entity.id, type='polymer', src_method='man', + pdbx_description=entity.description, + formula_weight=writer.unknown, + pdbx_number_of_molecules=1, details=writer.unknown) + + +class _EntityPolyDumper(_Dumper): + # todo: we currently only support amino acid sequences here + def dump(self, system, writer): + with writer.loop("_entity_poly", + ["entity_id", "type", "nstd_linkage", + "nstd_monomer", "pdbx_strand_id", + "pdbx_seq_one_letter_code", + "pdbx_seq_one_letter_code_can"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + # Split into lines to get tidier CIF output + seq = "\n".join(seq[i:i+70] for i in range(0, len(seq), 70)) + comp = entity.first_component + l.write(entity_id=entity.id, type='polypeptide(L)', + nstd_linkage='no', nstd_monomer='no', + pdbx_strand_id=comp.asym_id, + pdbx_seq_one_letter_code=seq, + pdbx_seq_one_letter_code_can=seq) + +class _EntityPolySeqDumper(_Dumper): + def dump(self, system, writer): + with writer.loop("_entity_poly_seq", + ["entity_id", "num", "mon_id", "hetero"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + for num, one_letter_code in enumerate(seq): + restyp = IMP.atom.get_residue_type(one_letter_code) + l.write(entity_id=entity.id, num=num + 1, + mon_id=restyp.get_string(), + hetero=writer.omitted) + + +class _StructAsymDumper(_Dumper): + def dump(self, system, writer): + with writer.loop("_struct_asym", + ["id", "entity_id", "details"]) as l: + for comp in system.components.get_all_modeled(): + l.write(id=comp.asym_id, + entity_id=comp.entity.id, + details=comp.name if comp.name else writer.omitted) + + +class _AssemblyDumper(_Dumper): + def finalize(self, system): + seen_assemblies = {} + # Assign IDs to all assemblies + self._assembly_by_id = [] + for a in system._assemblies.get_all(): + IMP.mmcif.data._assign_id(a, seen_assemblies, self._assembly_by_id) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_struct_assembly", + ["ordinal_id", "assembly_id", "parent_assembly_id", + "entity_description", + "entity_id", "asym_id", "seq_id_begin", + "seq_id_end"]) as l: + for a in self._assembly_by_id: + for comp in a: + entity = comp.entity + l.write(ordinal_id=ordinal, assembly_id=a.id, + # Currently all assemblies are not hierarchical, + # so each assembly is a self-parent + parent_assembly_id=a.id, + entity_description=entity.description, + entity_id=entity.id, + asym_id=comp.asym_id if comp.asym_id + else writer.omitted, + seq_id_begin=1, + seq_id_end=len(entity.sequence)) + ordinal += 1 + + +class _ModelRepresentationDumper(_Dumper): + def _get_granularity(self, r): + if isinstance(r.particles[0], IMP.atom.Residue): + return 'by-residue' + else: + return 'by-feature' + + def dump(self, system, writer): + ordinal_id = 1 + segment_id = 1 + with writer.loop("_ihm_model_representation", + ["ordinal_id", "representation_id", + "segment_id", "entity_id", "entity_description", + "entity_asym_id", + "seq_id_begin", "seq_id_end", + "model_object_primitive", "starting_model_id", + "model_mode", "model_granularity", + "model_object_count"]) as l: + for comp in system.components.get_all_modeled(): + # For now, assume that representation of the same-named + # component is the same in all states, so just take the first + state = 
list(system._states.keys())[0]
+            for r in state.representation[comp]:
+                # todo: handle multiple representations
+                l.write(ordinal_id=ordinal_id, representation_id=1,
+                        segment_id=segment_id, entity_id=comp.entity.id,
+                        entity_description=comp.entity.description,
+                        entity_asym_id=comp.asym_id,
+                        seq_id_begin=r.residue_range[0],
+                        seq_id_end=r.residue_range[1],
+                        model_object_primitive=r.primitive,
+                        starting_model_id=r.starting_model.id
+                                          if r.starting_model
+                                          else writer.omitted,
+                        model_mode='rigid' if r.rigid_body else 'flexible',
+                        model_granularity=self._get_granularity(r),
+                        model_object_count=len(r.particles))
+                ordinal_id += 1
+                segment_id += 1
+
+class _MSESeqDif(object):
+    """Track an MSE -> MET mutation in the starting model sequence"""
+    comp_id = 'MET'
+    db_comp_id = 'MSE'
+    details = 'Conversion of modified residue MSE to MET'
+    def __init__(self, res, comp, source, model):
+        self.res, self.comp, self.source = res, comp, source
+        self.model = model
+
+
+class _StartingModelDumper(_Dumper):
+    def finalize(self, system):
+        for comp in system.components.get_all_modeled():
+            for n, starting_model in enumerate(self._all_models(system,
+                                                                comp)):
+                starting_model.id = '%s-m%d' % (comp.name, n + 1)
+
+    def _all_models(self, system, comp):
+        """Get the set of all starting models for a component"""
+        for state in system._states.keys():
+            seen_models = {}
+            for rep in state.representation.get(comp, []):
+                sm = rep.starting_model
+                if sm and sm not in seen_models:
+                    seen_models[sm] = None
+                    yield sm
+
+    def dump(self, system, writer):
+        self.dump_details(system, writer)
+        self.dump_comparative(system, writer)
+        seq_dif = self.dump_coords(system, writer)
+        self.dump_seq_dif(writer, seq_dif)
+
+    def dump_details(self, system, writer):
+        with writer.loop("_ihm_starting_model_details",
+                         ["starting_model_id", "entity_id",
+                          "entity_description", "asym_id", "seq_id_begin",
+                          "seq_id_end", "starting_model_source",
+                          "starting_model_auth_asym_id",
+                          "starting_model_sequence_offset",
+                          "dataset_list_id"]) as l:
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    seq_id_begin, seq_id_end = \
+                            sm.get_seq_id_range_all_sources()
+                    l.write(starting_model_id=sm.id,
+                            entity_id=comp.entity.id,
+                            entity_description=comp.entity.description,
+                            asym_id=comp.asym_id,
+                            seq_id_begin=seq_id_begin,
+                            seq_id_end=seq_id_end,
+                            starting_model_source=sm.sources[0].source,
+                            starting_model_auth_asym_id=sm.chain_id,
+                            dataset_list_id=sm.dataset.id,
+                            starting_model_sequence_offset=sm.offset)
+
+    def dump_comparative(self, system, writer):
+        """Dump details on comparative models."""
+        with writer.loop("_ihm_starting_comparative_models",
+                         ["ordinal_id", "starting_model_id",
+                          "starting_model_auth_asym_id",
+                          "starting_model_seq_id_begin",
+                          "starting_model_seq_id_end",
+                          "template_auth_asym_id", "template_seq_id_begin",
+                          "template_seq_id_end",
+                          "template_sequence_identity",
+                          "template_sequence_identity_denominator",
+                          "template_dataset_list_id",
+                          "alignment_file_id"]) as l:
+            ordinal = 1
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    for template in [s for s in sm.sources
+                                     if isinstance(s,
+                                       IMP.mmcif.metadata._TemplateSource)]:
+                        seq_id_begin, seq_id_end = \
+                                template.get_seq_id_range(sm)
+                        l.write(ordinal_id=ordinal,
+                                starting_model_id=sm.id,
+                                starting_model_auth_asym_id=template.chain_id,
+                                starting_model_seq_id_begin=seq_id_begin,
+                                starting_model_seq_id_end=seq_id_end,
+                                template_auth_asym_id=template.tm_chain_id,
+                                template_seq_id_begin=template.tm_seq_id_begin,
+                                template_seq_id_end=template.tm_seq_id_end,
+                                template_sequence_identity=template.sequence_identity,
+                                # Assume Modeller-style sequence identity
+                                # for now
+                                template_sequence_identity_denominator=1,
+                                template_dataset_list_id=template.tm_dataset.id
+                                        if template.tm_dataset
+                                        else writer.unknown,
+                                alignment_file_id=sm.alignment_file.id
+                                        if hasattr(sm, 'alignment_file')
+                                        else writer.unknown)
+                        ordinal += 1
+
+    def dump_coords(self, system, writer):
+        seq_dif = []
+        ordinal = 1
+        with writer.loop("_ihm_starting_model_coord",
+                         ["starting_model_id", "group_PDB", "id",
+                          "type_symbol", "atom_id", "comp_id", "entity_id",
+                          "asym_id", "seq_id", "Cartn_x", "Cartn_y",
+                          "Cartn_z", "B_iso_or_equiv", "ordinal_id"]) as l:
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    m, sel = self._read_coords(sm)
+                    last_res_index = None
+                    for a in sel.get_selected_particles():
+                        coord = IMP.core.XYZ(a).get_coordinates()
+                        atom = IMP.atom.Atom(a)
+                        element = atom.get_element()
+                        element = IMP.atom.get_element_table().get_name(element)
+                        atom_name = atom.get_atom_type().get_string().strip()
+                        group_pdb = 'ATOM'
+                        if atom_name.startswith('HET:'):
+                            group_pdb = 'HETATM'
+                            atom_name = atom_name[4:]
+                        res = IMP.atom.get_residue(atom)
+                        res_name = res.get_residue_type().get_string()
+                        # MSE in the original PDB is automatically mutated
+                        # by IMP to MET, so reflect that in the output,
+                        # and pass back to populate the seq_dif category.
+                        if res_name == 'MSE':
+                            res_name = 'MET'
+                            # Only add one seq_dif record per residue
+                            ind = res.get_index()
+                            if ind != last_res_index:
+                                last_res_index = ind
+                                # This should only happen when we're using
+                                # a crystal structure as the source (a
+                                # comparative model would use MET in
+                                # the sequence)
+                                assert(len(sm.sources) == 1)
+                                seq_dif.append(_MSESeqDif(res, comp,
+                                                          sm.sources[0], sm))
+                        l.write(starting_model_id=sm.id,
+                                group_PDB=group_pdb,
+                                id=atom.get_input_index(),
+                                type_symbol=element,
+                                atom_id=atom_name, comp_id=res_name,
+                                entity_id=comp.entity.id,
+                                asym_id=sm.chain_id,
+                                seq_id=res.get_index() + sm.offset,
+                                Cartn_x=coord[0],
+                                Cartn_y=coord[1], Cartn_z=coord[2],
+                                B_iso_or_equiv=atom.get_temperature_factor(),
+                                ordinal_id=ordinal)
+                        ordinal += 1
+        return seq_dif
+
+    def dump_seq_dif(self, writer, seq_dif):
+        ordinal = 1
+        with writer.loop("_ihm_starting_model_seq_dif",
+                         ["ordinal_id", "entity_id", "asym_id",
+                          "seq_id", "comp_id", "starting_model_id",
+                          "db_asym_id", "db_seq_id", "db_comp_id",
+                          "details"]) as l:
+            for sd in seq_dif:
+                l.write(ordinal_id=ordinal, entity_id=sd.comp.entity.id,
+                        asym_id=sd.model.chain_id,
+                        seq_id=sd.res.get_index(),
+                        comp_id=sd.comp_id,
+                        db_asym_id=sd.source.chain_id,
+                        db_seq_id=sd.res.get_index() - sd.model.offset,
+                        db_comp_id=sd.db_comp_id,
+                        starting_model_id=sd.model.id,
+                        details=sd.details)
+                ordinal += 1
+
+    def _read_coords(self, sm):
+        """Read the coordinates for a starting model"""
+        m = IMP.Model()
+        # todo: support reading other subsets of the atoms (e.g.
CA/CB) + slt = IMP.atom.ChainPDBSelector(sm.chain_id) \ + & IMP.atom.NonWaterNonHydrogenPDBSelector() + hier = IMP.atom.read_pdb(sm.filename, m, slt) + sel = IMP.atom.Selection(hier, + residue_indexes=list(range(sm.seq_id_begin - sm.offset, + sm.seq_id_end + 1 - sm.offset))) + return m, sel + + +class _DatasetDumper(_Dumper): + def _dataset_by_id(self, system): + return sorted(system.datasets.get_all(), key=operator.attrgetter('id')) + + def dump(self, system, writer): + with writer.loop("_ihm_dataset_list", + ["id", "data_type", "database_hosted"]) as l: + for d in self._dataset_by_id(system): + l.write(id=d.id, data_type=d._data_type, + database_hosted=isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)) + self.dump_other((d for d in self._dataset_by_id(system) + if not isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)), + writer) + self.dump_rel_dbs((d for d in self._dataset_by_id(system) + if isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)), + writer) + self.dump_related(system, writer) + + def dump_other(self, datasets, writer): + ordinal = 1 + with writer.loop("_ihm_dataset_external_reference", + ["id", "dataset_list_id", "file_id"]) as l: + for d in datasets: + l.write(id=ordinal, dataset_list_id=d.id, file_id=d.location.id) + ordinal += 1 + + def dump_rel_dbs(self, datasets, writer): + ordinal = 1 + with writer.loop("_ihm_dataset_related_db_reference", + ["id", "dataset_list_id", "db_name", + "accession_code", "version", "details"]) as l: + for d in datasets: + l.write(id=ordinal, dataset_list_id=d.id, + db_name=d.location.db_name, + accession_code=d.location.access_code, + version=d.location.version if d.location.version + else writer.omitted, + details=d.location.details if d.location.details + else writer.omitted) + ordinal += 1 + + def dump_related(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_related_datasets", + ["ordinal_id", "dataset_list_id_derived", + "dataset_list_id_primary"]) as l: + for derived in self._dataset_by_id(system): + for parent in sorted(derived._parents.keys(), + key=operator.attrgetter('id')): + l.write(ordinal_id=ordinal, + dataset_list_id_derived=derived.id, + dataset_list_id_primary=parent.id) + ordinal += 1 + + +class _CitationDumper(_Dumper): + def dump(self, system, writer): + self.dump_citation(system, writer) + self.dump_author(system, writer) + + def dump_citation(self, system, writer): + with writer.loop("_citation", + ["id", "title", "journal_abbrev", "journal_volume", + "page_first", "page_last", "year", + "pdbx_database_id_PubMed", + "pdbx_database_id_DOI"]) as l: + for n, c in enumerate(system._citations): + if isinstance(c.page_range, (tuple, list)): + page_first, page_last = c.page_range + else: + page_first = c.page_range + page_last = writer.omitted + l.write(id=n+1, title=c.title, journal_abbrev=c.journal, + journal_volume=c.volume, page_first=page_first, + page_last=page_last, year=c.year, + pdbx_database_id_PubMed=c.pmid, + pdbx_database_id_DOI=c.doi) + + def dump_author(self, system, writer): + with writer.loop("_citation_author", + ["citation_id", "name", "ordinal"]) as l: + ordinal = 1 + for n, c in enumerate(system._citations): + for a in c.authors: + l.write(citation_id=n+1, name=a, ordinal=ordinal) + ordinal += 1 + + +class _SoftwareDumper(_Dumper): + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_software", + ["pdbx_ordinal", "name", "classification", "version", + "type", "location"]) as l: + for s in system._software: + l.write(pdbx_ordinal=ordinal, 
name=s.name, + classification=s.classification, version=s.version, + type=s.type, location=s.url) + ordinal += 1 + + +class _ExternalReferenceDumper(_Dumper): + """Output information on externally referenced files + (i.e. anything that refers to a Location that isn't + a DatabaseLocation).""" + + class _LocalFiles(object): + reference_provider = _CifWriter.omitted + reference_type = 'Supplementary Files' + reference = _CifWriter.omitted + refers_to = 'Other' + associated_url = _CifWriter.omitted + + def __init__(self, top_directory): + self.top_directory = top_directory + + def _get_full_path(self, path): + return os.path.relpath(path, start=self.top_directory) + + class _Repository(object): + reference_provider = _CifWriter.omitted + reference_type = 'DOI' + refers_to = 'Other' + associated_url = _CifWriter.omitted + + def __init__(self, repo): + self.id = repo.id + self.reference = repo.doi + if 'zenodo' in self.reference: + self.reference_provider = 'Zenodo' + if repo.url: + self.associated_url = repo.url + if repo.url.endswith(".zip"): + self.refers_to = 'Archive' + else: + self.refers_to = 'File' + + def finalize(self, system): + # Keep only locations that don't point into databases (these are + # handled elsewhere) + self._refs = list(system._external_files.get_all_nondb()) + # Assign IDs to all locations and repos (including the None repo, which + # is for local files) + seen_refs = {} + seen_repos = {} + self._ref_by_id = [] + self._repo_by_id = [] + # Special dummy repo for repo=None (local files) + self._local_files = self._LocalFiles(os.getcwd()) + for r in self._refs: + # Update location to point to parent repository, if any + # todo: this could probably happen when the location is first made + system._update_location(r.location) + # Assign a unique ID to the reference + IMP.mmcif.data._assign_id(r, seen_refs, self._ref_by_id) + # Assign a unique ID to the repository + IMP.mmcif.data._assign_id(r.location.repo or self._local_files, + seen_repos, self._repo_by_id) + + def dump(self, system, writer): + self.dump_repos(writer) + self.dump_refs(writer) + + def dump_repos(self, writer): + def map_repo(r): + return r if isinstance(r, self._LocalFiles) else self._Repository(r) + with writer.loop("_ihm_external_reference_info", + ["reference_id", "reference_provider", + "reference_type", "reference", "refers_to", + "associated_url"]) as l: + for repo in [map_repo(r) for r in self._repo_by_id]: + l.write(reference_id=repo.id, + reference_provider=repo.reference_provider, + reference_type=repo.reference_type, + reference=repo.reference, refers_to=repo.refers_to, + associated_url=repo.associated_url) + + def dump_refs(self, writer): + with writer.loop("_ihm_external_files", + ["id", "reference_id", "file_path", "content_type", + "file_size_bytes", "details"]) as l: + for r in self._ref_by_id: + loc = r.location + repo = loc.repo or self._local_files + file_path=self._posix_path(repo._get_full_path(loc.path)) + if r.file_size is None: + file_size = writer.omitted + else: + file_size = r.file_size + l.write(id=loc.id, reference_id=repo.id, + file_path=file_path, + content_type=r.content_type, + file_size_bytes=file_size, + details=loc.details or writer.omitted) + + # On Windows systems, convert native paths to POSIX-like (/-separated) paths + if os.sep == '/': + def _posix_path(self, path): + return path + else: + def _posix_path(self, path): + return path.replace(os.sep, '/') + + +class _ProtocolDumper(_Dumper): + class _Step(object): + name = 'Sampling' # todo: support 'refinement' too + 
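+        # For example (illustrative): a SampleProvenance recording
+        # 'Monte Carlo' with 8 replicas is reported by __get_method
+        # below as 'Replica exchange Monte Carlo'.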
+ def __init__(self, s): + self.orig_s = s + def __get_method(self): + first_prov = self.orig_s._prov[0] + meth = first_prov.get_method() + if first_prov.get_number_of_replicas() > 1: + meth = "Replica exchange " + meth + return meth + + num_models_begin = property(lambda self: self.orig_s.num_models_begin) + num_models_end = property(lambda self: self.orig_s.num_models_end) + id = property(lambda self: self.orig_s.id) + method = property(__get_method) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_modeling_protocol", + ["ordinal_id", "protocol_id", "step_id", + "struct_assembly_id", "dataset_group_id", + "struct_assembly_description", "protocol_name", + "step_name", "step_method", "num_models_begin", + "num_models_end", "multi_scale_flag", + "multi_state_flag", "time_ordered_flag"]) as l: + for p in system.protocols.get_all(): + for s in p._steps: + step = self._Step(s) + l.write(ordinal_id=ordinal, protocol_id=p.id, + step_id=step.id, step_method=step.method, + step_name=step.name, + struct_assembly_id=p.modeled_assembly.id, + num_models_begin=step.num_models_begin, + num_models_end=step.num_models_end, + # todo: support multiple states, time ordered + multi_state_flag=False, time_ordered_flag=False, + # todo: revisit assumption all models are multiscale + multi_scale_flag=True) + ordinal += 1 + + +class _PostProcessDumper(_Dumper): + class _PostProc(object): + def __init__(self, pp): + self._pp = pp + def __get_type(self): + type_map = {IMP.core.ClusterProvenance: 'cluster', + IMP.core.FilterProvenance: 'filter'} + return type_map[type(self._pp._prov)] + def __get_feature(self): + type_map = {IMP.core.ClusterProvenance: 'RMSD', + IMP.core.FilterProvenance: 'energy/score'} + return type_map[type(self._pp._prov)] + + num_models_begin = property(lambda self: self._pp.num_models_begin) + num_models_end = property(lambda self: self._pp.num_models_end) + id = property(lambda self: self._pp.id) + type = property(__get_type) + feature = property(__get_feature) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_modeling_post_process", + ["id", "protocol_id", "analysis_id", "step_id", + "type", "feature", "num_models_begin", + "num_models_end"]) as l: + # todo: handle multiple analyses + for prot in system.protocols.get_all(): + for pp in prot._postprocs: + p = self._PostProc(pp) + l.write(id=ordinal, protocol_id=prot.id, analysis_id=1, + step_id=p.id, type=p.type, feature=p.feature, + num_models_begin=p.num_models_begin, + num_models_end=p.num_models_end) + ordinal += 1 diff --git a/modules/mmcif/pyext/src/format.py b/modules/mmcif/pyext/src/format.py new file mode 100644 index 0000000000..ab373b9ea4 --- /dev/null +++ b/modules/mmcif/pyext/src/format.py @@ -0,0 +1,109 @@ +"""@namespace IMP.mmcif.format + @brief Utility classes to handle CIF format. 
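+
+   For example (illustrative), _CifWriter.category() emits a single
+   "_entry.id imp_model" line, while _CifWriter.loop() emits a
+   multi-row loop_ construct terminated by '#'.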
+""" + +from __future__ import print_function +import sys + +# Python 3 has no 'long' type, so use 'int' instead +if sys.version_info[0] >= 3: + _long_type = int +else: + _long_type = long + +class _LineWriter(object): + def __init__(self, writer, line_len=80): + self.writer = writer + self.line_len = line_len + self.column = 0 + def write(self, val): + if isinstance(val, str) and '\n' in val: + self.writer.fh.write("\n;") + self.writer.fh.write(val) + if not val.endswith('\n'): + self.writer.fh.write("\n") + self.writer.fh.write(";\n") + self.column = 0 + return + val = self.writer._repr(val) + if self.column > 0: + if self.column + len(val) + 1 > self.line_len: + self.writer.fh.write("\n") + self.column = 0 + else: + self.writer.fh.write(" ") + self.column += 1 + self.writer.fh.write(val) + self.column += len(val) + + +class _CifCategoryWriter(object): + def __init__(self, writer, category): + self.writer = writer + self.category = category + def write(self, **kwargs): + self.writer._write(self.category, kwargs) + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + pass + + +class _CifLoopWriter(object): + def __init__(self, writer, category, keys): + self.writer = writer + self.category = category + self.keys = keys + # Remove characters that we can't use in Python identifiers + self.python_keys = [k.replace('[', '').replace(']', '') for k in keys] + self._empty_loop = True + def write(self, **kwargs): + if self._empty_loop: + f = self.writer.fh + f.write("#\nloop_\n") + for k in self.keys: + f.write("%s.%s\n" % (self.category, k)) + self._empty_loop = False + l = _LineWriter(self.writer) + for k in self.python_keys: + l.write(kwargs.get(k, self.writer.omitted)) + self.writer.fh.write("\n") + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + if not self._empty_loop: + self.writer.fh.write("#\n") + + +class _CifWriter(object): + omitted = '.' + unknown = '?' + _boolmap = {False: 'NO', True: 'YES'} + + def __init__(self, fh): + self.fh = fh + def category(self, category): + return _CifCategoryWriter(self, category) + def loop(self, category, keys): + return _CifLoopWriter(self, category, keys) + def write_comment(self, comment): + for line in textwrap.wrap(comment, 78): + self.fh.write('# ' + line + '\n') + def _write(self, category, kwargs): + for key in kwargs: + self.fh.write("%s.%s %s\n" % (category, key, + self._repr(kwargs[key]))) + def _repr(self, obj): + if isinstance(obj, str) and '"' not in obj \ + and "'" not in obj and " " not in obj: + return obj + elif isinstance(obj, float): + return "%.3f" % obj + elif isinstance(obj, bool): + return self._boolmap[obj] + # Don't use repr(x) if type(x) == long since that adds an 'L' suffix, + # which isn't valid mmCIF syntax. _long_type = long only on Python 2. + elif isinstance(obj, _long_type): + return "%d" % obj + else: + return repr(obj) diff --git a/modules/mmcif/pyext/src/metadata.py b/modules/mmcif/pyext/src/metadata.py new file mode 100644 index 0000000000..782ed36abf --- /dev/null +++ b/modules/mmcif/pyext/src/metadata.py @@ -0,0 +1,273 @@ +"""@namespace IMP.mmcif.metadata + + Classes to extract metadata for various input files. +""" + +import IMP.mmcif.dataset +import re + +class _MetadataParser(object): + """Base class for all metadata parsers. 
+ Call parse_file() to extract metadata into self.dataset, etc.""" + def parse_file(self, filename, system): + pass + + +class _PDBHelix(object): + """Represent a HELIX record from a PDB file.""" + def __init__(self, line): + self.helix_id = line[11:14].strip() + self.start_resnam = line[14:18].strip() + self.start_asym = line[19] + self.start_resnum = int(line[21:25]) + self.end_resnam = line[27:30].strip() + self.end_asym = line[31] + self.end_resnum = int(line[33:37]) + self.helix_class = int(line[38:40]) + self.length = int(line[71:76]) + + +class _StartingModelSource(object): + """Base class for all sources for starting models.""" + pass + + +class _PDBSource(_StartingModelSource): + """An experimental PDB file used as part of a starting model""" + source = 'experimental model' + db_name = 'PDB' + sequence_identity = 100.0 + + def __init__(self, db_code, chain_id, metadata): + self.db_code = db_code + self.chain_id = chain_id + self.metadata = metadata + + def get_seq_id_range(self, starting_model): + # Assume the structure covers the entire sequence + return (starting_model.seq_id_begin, starting_model.seq_id_end) + + +class _TemplateSource(_StartingModelSource): + """A PDB file used as a template for a comparative starting model""" + source = 'comparative model' + db_name = db_code = None + tm_dataset = None + + def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin, + chain_id, seq_id_end, seq_id): + # Assume a code of 1abcX refers to a real PDB structure + if len(tm_code) == 5: + self._orig_tm_code = None + self.tm_db_code = tm_code[:4].upper() + self.tm_chain_id = tm_code[4] + else: + # Otherwise, will need to look up in TEMPLATE PATH remarks + self._orig_tm_code = tm_code + self.tm_db_code = None + self.tm_chain_id = tm_code[-1] + self.sequence_identity = seq_id + self.tm_seq_id_begin = tm_seq_id_begin + self.tm_seq_id_end = tm_seq_id_end + self.chain_id = chain_id + self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end + + def get_seq_id_range(self, starting_model): + # The template may cover more than the current starting model + seq_id_begin = max(starting_model.seq_id_begin, self._seq_id_begin) + seq_id_end = min(starting_model.seq_id_end, self._seq_id_end) + return (seq_id_begin, seq_id_end) + + +class _UnknownSource(_StartingModelSource): + """Part of a starting model from an unknown source""" + db_code = None + db_name = None + chain_id = None + sequence_identity = None + # Map dataset types to starting model sources + _source_map = {'Comparative model': 'comparative model', + 'Experimental model': 'experimental model'} + + def __init__(self, dataset, chain): + self.source = self._source_map[dataset._data_type] + self.chain_id = chain + + def get_seq_id_range(self, starting_model): + return (starting_model.seq_id_begin, starting_model.seq_id_end) + + +class _PDBMetadataParser(_MetadataParser): + """Extract metadata (e.g. PDB ID, comparative modeling templates) from a + PDB file.""" + + def parse_file(self, filename, chain, system): + """Extract metadata from `filename`. + Sets self.dataset to point to the PDB file, self.sources to + a list of sources (e.g. 
comparative modeling templates), and + self.alignment_file to the comparative modeling alignment, + if available.""" + self.alignment_file = None + fh = open(filename) + first_line = fh.readline() + local_file = IMP.mmcif.dataset.FileLocation(filename, + details="Starting model structure") + if first_line.startswith('HEADER'): + self._parse_official_pdb(fh, chain, first_line, system) + elif first_line.startswith('EXPDTA DERIVED FROM PDB:'): + self._parse_derived_from_pdb(fh, chain, first_line, local_file, + system) + elif first_line.startswith('EXPDTA DERIVED FROM COMPARATIVE ' + 'MODEL, DOI:'): + self._parse_derived_from_model(fh, chain, first_line, local_file, + system) + elif first_line.startswith('EXPDTA THEORETICAL MODEL, MODELLER'): + self._parse_modeller_model(fh, chain, first_line, local_file, + filename, system) + elif first_line.startswith('REMARK 99 Chain ID :'): + self._parse_phyre_model(fh, chain, first_line, local_file, + filename, system) + else: + self._parse_unknown_model(fh, chain, first_line, local_file, + filename, system) + + def _parse_official_pdb(self, fh, chain, first_line, system): + """Handle a file that's from the official PDB database.""" + version, details, metadata = self._parse_pdb_records(fh, first_line) + source = _PDBSource(first_line[62:66].strip(), chain, metadata) + l = IMP.mmcif.dataset.PDBLocation(source.db_code, version, details) + d = IMP.mmcif.dataset.PDBDataset(l) + self.dataset = system.datasets.add(d) + self.sources = [source] + + def _parse_derived_from_pdb(self, fh, chain, first_line, local_file, + system): + # Model derived from a PDB structure; treat as a local experimental + # model with the official PDB as a parent + local_file.details = self._parse_details(fh) + db_code = first_line[27:].strip() + d = IMP.mmcif.dataset.PDBDataset(local_file) + pdb_loc = IMP.mmcif.dataset.PDBLocation(db_code) + parent = IMP.mmcif.dataset.PDBDataset(pdb_loc) + d.add_parent(parent) + self.dataset = system.datasets.add(d) + self.sources = [_UnknownSource(self.dataset, chain)] + + def _parse_derived_from_model(self, fh, chain, first_line, local_file, + system): + # Model derived from a comparative model; link back to the original + # model as a parent + local_file.details = self._parse_details(fh) + d = IMP.mmcif.dataset.ComparativeModelDataset(local_file) + repo = IMP.mmcif.dataset.Repository(doi=first_line[46:].strip()) + # todo: better specify an unknown path + orig_loc = IMP.mmcif.dataset.FileLocation(repo=repo, path='.', + details="Starting comparative model structure") + parent = IMP.mmcif.dataset.ComparativeModelDataset(orig_loc) + d.add_parent(parent) + self.dataset = system.datasets.add(d) + self.sources = [_UnknownSource(self.dataset, chain)] + + def _parse_modeller_model(self, fh, chain, first_line, local_file, + filename, system): + system._software.set_modeller_used(*first_line[38:].split(' ', 1)) + self._handle_comparative_model(local_file, filename, chain, system) + + def _parse_phyre_model(self, fh, chain, first_line, local_file, + filename, system): + # Model generated by Phyre2 + # todo: extract Modeller-like template info for Phyre models + system._software.set_phyre2_used() + self._handle_comparative_model(local_file, filename, chain, system) + + def _parse_unknown_model(self, fh, chain, first_line, local_file, filename, + system): + # todo: revisit assumption that all unknown source PDBs are + # comparative models + self._handle_comparative_model(local_file, filename, chain, system) + + def _handle_comparative_model(self, local_file, 
pdbname, chain, system):
+        d = IMP.mmcif.dataset.ComparativeModelDataset(local_file)
+        self.dataset = system.datasets.add(d)
+        templates, alnfile = self.get_templates(pdbname, system)
+        if alnfile:
+            self.alignment_file = IMP.mmcif.dataset.FileLocation(alnfile,
+                                         details="Alignment for starting "
+                                                 "comparative model")
+            system._external_files.add_input(self.alignment_file)
+
+        if templates:
+            self.sources = templates
+        else:
+            self.sources = [_UnknownSource(self.dataset, chain)]
+
+    def get_templates(self, pdbname, system):
+        template_path_map = {}
+        templates = []
+        alnfile = None
+        alnfilere = re.compile(r'REMARK 6 ALIGNMENT: (\S+)')
+        tmppathre = re.compile(r'REMARK 6 TEMPLATE PATH (\S+) (\S+)')
+        tmpre = re.compile(r'REMARK 6 TEMPLATE: '
+                           r'(\S+) (\S+):\S+ \- (\S+):\S+ '
+                           r'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')
+
+        with open(pdbname) as fh:
+            for line in fh:
+                if line.startswith('ATOM'):  # Read only the header
+                    break
+                m = tmppathre.match(line)
+                if m:
+                    template_path_map[m.group(1)] = \
+                            IMP.get_relative_path(pdbname, m.group(2))
+                m = alnfilere.match(line)
+                if m:
+                    # Path to alignment is relative to that of the PDB file
+                    alnfile = IMP.get_relative_path(pdbname, m.group(1))
+                m = tmpre.match(line)
+                if m:
+                    templates.append(_TemplateSource(m.group(1),
+                                                     int(m.group(2)),
+                                                     int(m.group(3)),
+                                                     int(m.group(4)),
+                                                     m.group(5),
+                                                     int(m.group(6)),
+                                                     m.group(7)))
+        # Add datasets for templates
+        for t in templates:
+            if t._orig_tm_code:
+                fname = template_path_map[t._orig_tm_code]
+                l = IMP.mmcif.dataset.FileLocation(fname,
+                                 details="Template for comparative modeling")
+            else:
+                l = IMP.mmcif.dataset.PDBLocation(t.tm_db_code)
+            d = IMP.mmcif.dataset.PDBDataset(l)
+            d = system.datasets.add(d)
+            t.tm_dataset = d
+            self.dataset.add_parent(d)
+
+        # Sort by starting residue, then ending residue
+        return (sorted(templates,
+                       key=lambda x: (x._seq_id_begin, x._seq_id_end)),
+                alnfile)
+
+    def _parse_pdb_records(self, fh, first_line):
+        """Extract information from an official PDB"""
+        metadata = []
+        details = ''
+        for line in fh:
+            if line.startswith('TITLE'):
+                details += line[10:].rstrip()
+            elif line.startswith('HELIX'):
+                metadata.append(_PDBHelix(line))
+        return (first_line[50:59].strip(),
+                details if details else None, metadata)
+
+    def _parse_details(self, fh):
+        """Extract TITLE records from a PDB file"""
+        details = ''
+        for line in fh:
+            if line.startswith('TITLE'):
+                details += line[10:].rstrip()
+            elif line.startswith('ATOM'):
+                break
+        return details
diff --git a/modules/mmcif/pyext/src/util.py b/modules/mmcif/pyext/src/util.py
new file mode 100644
index 0000000000..ba7cac21ab
--- /dev/null
+++ b/modules/mmcif/pyext/src/util.py
@@ -0,0 +1,205 @@
+import IMP.mmcif.dumper
+import IMP.mmcif.dataset
+import IMP.mmcif.data
+import IMP.mmcif.format
+import IMP.rmf
+import IMP.atom
+import RMF
+import string
+import weakref
+import operator
+
+class _NonModeledChain(object):
+    """Represent a chain that was experimentally characterized but not
+       modeled. Such a chain resembles an IMP.atom.Chain, but has no
+       associated structure, and belongs to no state."""
+    def __init__(self, name, sequence, chain_type):
+        self.name = name
+        self.sequence = sequence
+        self.chain_type = chain_type
+
+    get_sequence = lambda self: self.sequence
+
+
+class System(object):
+    def __init__(self):
+        self._states = {}
+        self._assemblies = IMP.mmcif.data._Assemblies()
+        # The assembly of all known components.
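+        # (an _Assembly behaves like an ordered list of components; the
+        # complete assembly is grown as chains are added in _add_chain())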
+ self.complete_assembly = IMP.mmcif.data._Assembly() + self._assemblies.add(self.complete_assembly) + + self._dumpers = [IMP.mmcif.dumper._EntryDumper(), # must be first + IMP.mmcif.dumper._SoftwareDumper(), + IMP.mmcif.dumper._CitationDumper(), + IMP.mmcif.dumper._ChemCompDumper(), + IMP.mmcif.dumper._EntityDumper(), + IMP.mmcif.dumper._EntityPolyDumper(), + IMP.mmcif.dumper._EntityPolySeqDumper(), + IMP.mmcif.dumper._StructAsymDumper(), + IMP.mmcif.dumper._AssemblyDumper(), + IMP.mmcif.dumper._ModelRepresentationDumper(), + IMP.mmcif.dumper._ExternalReferenceDumper(), + IMP.mmcif.dumper._DatasetDumper(), + IMP.mmcif.dumper._StartingModelDumper(), + IMP.mmcif.dumper._ProtocolDumper(), + IMP.mmcif.dumper._PostProcessDumper()] + self.entities = IMP.mmcif.data._EntityMapper() + self.components = IMP.mmcif.data._ComponentMapper() + self._citations = [] + self._software = IMP.mmcif.data._AllSoftware() + self._external_files = IMP.mmcif.data._ExternalFiles() + self.datasets = IMP.mmcif.data._Datasets(self._external_files) + # All modeling protocols + self.protocols = IMP.mmcif.data._Protocols() + + def _update_location(self, fileloc): + """Update FileLocation to point to a parent repository, if any""" + IMP.mmcif.dataset.Repository.update_in_repos(fileloc, + self._external_files._repos) + + def add_repository(self, doi, root=None, url=None, top_directory=None): + """Add a repository containing one or more modeling files.""" + self._external_files.add_repo(IMP.mmcif.dataset.Repository( + doi, root, url, top_directory)) + + def add_citation(self, pmid, title, journal, volume, page_range, year, + authors, doi): + """Add a publication that describes the modeling""" + self._citations.append(IMP.mmcif.data._Citation( + pmid, title, journal, volume, page_range, year, + authors, doi)) + + def add_software(self, name, classification, description, url, + type='program', version=None): + "Add software (other than IMP) used as part of the modeling protocol." 
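+        # (software recorded in the hierarchy's provenance is picked up
+        # separately, in _add_hierarchy() via _AllSoftware.add_hierarchy)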
+        self._software.append(IMP.mmcif.data._Software(
+                    name, classification, description, url, type, version))
+
+    def _add_state(self, state):
+        self._states[state] = None
+        state.id = len(self._states)
+
+    def _add_hierarchy(self, h, state):
+        chains = [IMP.atom.Chain(c)
+                  for c in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)]
+        self._remove_duplicate_chain_ids(chains)
+        # todo: handle same chain in multiple states
+        for c in chains:
+            component = self._add_chain(c)
+            state._all_modeled_components.append(component)
+            state.modeled_assembly.append(component)
+            state.representation[component] \
+                = list(self._get_representation(c,
+                                self._get_all_starting_models(component)))
+        self.protocols._add_hierarchy(h, state.modeled_assembly)
+        self._external_files.add_hierarchy(h)
+        self._software.add_hierarchy(h)
+
+    def _get_all_starting_models(self, comp):
+        """Get all starting models (in all states) for the given component"""
+        for state in self._states:
+            for rep in state.representation.get(comp, []):
+                if rep.starting_model:
+                    yield rep.starting_model
+
+    def _get_representation(self, chain, existing_starting_models):
+        """Yield groups of particles under chain with same representation"""
+        smf = IMP.mmcif.data._StartingModelFinder(existing_starting_models)
+        rep = IMP.mmcif.data._Representation()
+        for sp in self._get_structure_particles(chain):
+            starting_model = smf.find(sp, self)
+            if not rep.add(sp, starting_model):
+                yield rep
+                rep = IMP.mmcif.data._Representation()
+                rep.add(sp, starting_model)
+        if rep:
+            yield rep
+
+    def _get_structure_particles(self, chain):
+        """Yield all particles under chain with coordinates.
+           They are sorted by residue index."""
+        # todo: handle Representation decorators for non-PMI-1 models
+        ps = IMP.atom.get_leaves(chain)
+        resind_dict = {}
+        for p in ps:
+            if IMP.atom.Residue.get_is_setup(p):
+                residue = IMP.atom.Residue(p)
+                resind = residue.get_index()
+                if resind in resind_dict:
+                    continue
+                resind_dict[resind] = residue
+            elif IMP.atom.Fragment.get_is_setup(p):
+                fragment = IMP.atom.Fragment(p)
+                # todo: handle non-contiguous fragments
+                resinds = fragment.get_residue_indexes()
+                resind = resinds[len(resinds) // 2]
+                if resind in resind_dict:
+                    continue
+                resind_dict[resind] = fragment
+        # Return values sorted by key (residue index)
+        for item in sorted(resind_dict.items(), key=operator.itemgetter(0)):
+            yield item[1]
+
+    def add_non_modeled_chain(self, name, sequence,
+                              chain_type=IMP.atom.UnknownChainType):
+        """Add a chain that wasn't modeled by IMP."""
+        c = _NonModeledChain(name, sequence, chain_type)
+        self._add_chain(c)
+
+    def _add_chain(self, c):
+        entity = self.entities.add(c)
+        component = self.components.add(c, entity)
+        if component not in self.complete_assembly:
+            self.complete_assembly.append(component)
+        return component
+
+    def _remove_duplicate_chain_ids(self, chains):
+        chain_ids = [c.get_id() for c in chains]
+        if len(set(chain_ids)) < len(chain_ids):
+            print("Duplicate chain IDs detected - reassigning as A-Z, a-z, 0-9")
+            # todo: handle > 62 chains
+            # string.uppercase/lowercase exist only on Python 2; use the
+            # ascii_* forms, which work on both Python 2 and 3
+            for chain, cid in zip(chains, string.ascii_uppercase +
+                                  string.ascii_lowercase + string.digits):
+                chain.set_id(cid)
+
+    def write(self, fname):
+        with open(fname, 'w') as fh:
+            writer = IMP.mmcif.format._CifWriter(fh)
+            for dumper in self._dumpers:
+                dumper.finalize_metadata(self)
+            for dumper in self._dumpers:
+                dumper.finalize(self)
+            for dumper in self._dumpers:
+                dumper.dump(self, writer)
+
+class State(object):
+    """Represent a single IMP state."""
+    def __init__(self, system):
+        self.system =
weakref.proxy(system) + system._add_state(self) + self.model = IMP.Model() + self.hiers = None + # The assembly of all components modeled by IMP in this state. + # This may be smaller than the complete assembly. + self.modeled_assembly = IMP.mmcif.data._Assembly() + system._assemblies.add(self.modeled_assembly) + # A list of Representation objects for each Component + self.representation = {} + + self._all_modeled_components = [] + + def add_rmf_files(self, rmfs): + for fname in rmfs: + rmf = RMF.open_rmf_file_read_only(fname) + if self.hiers is None: + self.hiers = IMP.rmf.create_hierarchies(rmf, self.model) + else: + IMP.rmf.link_hierarchies(rmf, self.hiers) + # todo: allow reading other frames + IMP.rmf.load_frame(rmf, RMF.FrameID(0)) + for h in self.hiers: + self.add_hierarchy(h) + + def add_hierarchy(self, h): + self.system._add_hierarchy(h, self) diff --git a/modules/mmcif/pyext/swig.i-in b/modules/mmcif/pyext/swig.i-in new file mode 100644 index 0000000000..8c009e01d2 --- /dev/null +++ b/modules/mmcif/pyext/swig.i-in @@ -0,0 +1,2 @@ +/* Include top-level Python interface */ +%include "IMP_mmcif.init.i" diff --git a/modules/mmcif/test/input/15133C.pdb b/modules/mmcif/test/input/15133C.pdb new file mode 100644 index 0000000000..32710de946 --- /dev/null +++ b/modules/mmcif/test/input/15133C.pdb @@ -0,0 +1 @@ +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/derived_model.pdb b/modules/mmcif/test/input/derived_model.pdb new file mode 100644 index 0000000000..49137698b2 --- /dev/null +++ b/modules/mmcif/test/input/derived_model.pdb @@ -0,0 +1,6 @@ +EXPDTA DERIVED FROM COMPARATIVE MODEL, DOI:10.1093/nar/gkt704 +TITLE MED4 AND MED9 STRUCTURE TAKEN FROM LARIVIERE ET AL, +TITLE 1 NUCLEIC ACIDS RESEARCH. 2013;41:9266-9273. DOI: 10.1093/nar/gkt704. +TITLE 2 THE MED10 STRUCTURE ALSO PROPOSED IN THAT WORK IS NOT USED +TITLE 3 IN THIS STUDY. 
+ATOM 2 CA LEU A 37 33.176 69.138 -65.108 1.00268.38 C diff --git a/modules/mmcif/test/input/derived_pdb.pdb b/modules/mmcif/test/input/derived_pdb.pdb new file mode 100644 index 0000000000..bcf36b4b85 --- /dev/null +++ b/modules/mmcif/test/input/derived_pdb.pdb @@ -0,0 +1,4 @@ +EXPDTA DERIVED FROM PDB:1YKH +TITLE MED7C AND MED21 STRUCTURES FROM PDB ENTRY 1YKH, ROTATED AND +TITLE 1 TRANSLATED TO ALIGN WITH THE MED4-MED9 MODEL +ATOM 1341 CA TYR A 112 49.530 67.604 46.768 1.00 40.17 C diff --git a/modules/mmcif/test/input/modeller_model.ali b/modules/mmcif/test/input/modeller_model.ali new file mode 100644 index 0000000000..8d88241bb8 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model.ali @@ -0,0 +1,12 @@ +>P1;5fd1 +structureX:5fd1:1 :A:106 :A:ferredoxin:Azotobacter vinelandii: 1.90: 0.19 +W* + +>P1;1fdx +sequence:1fdx:1 : :54 : :ferredoxin:Peptococcus aerogenes: 2.00:-1.00 +Y* +~ +~ +~ +~ + diff --git a/modules/mmcif/test/input/modeller_model.pdb b/modules/mmcif/test/input/modeller_model.pdb new file mode 100644 index 0000000000..c62651a1b7 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model.pdb @@ -0,0 +1,5 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/modeller_model_local.pdb b/modules/mmcif/test/input/modeller_model_local.pdb new file mode 100644 index 0000000000..6b039d1213 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model_local.pdb @@ -0,0 +1,5 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE PATH 15133C ./15133C.pdb +REMARK 6 TEMPLATE: 15133C 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/modeller_model_no_aln.pdb b/modules/mmcif/test/input/modeller_model_no_aln.pdb new file mode 100644 index 0000000000..9f8a66f6aa --- /dev/null +++ b/modules/mmcif/test/input/modeller_model_no_aln.pdb @@ -0,0 +1,4 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/official.pdb b/modules/mmcif/test/input/official.pdb new file mode 100644 index 0000000000..5261967206 --- /dev/null +++ b/modules/mmcif/test/input/official.pdb @@ -0,0 +1,5 @@ +HEADER HYDROLASE, GENE REGULATION 14-JUN-06 2HBJ +TITLE STRUCTURE OF THE YEAST NUCLEAR EXOSOME COMPONENT, RRP6P, +TITLE 2 REVEALS AN INTERPLAY BETWEEN THE ACTIVE SITE AND THE HRDC +TITLE 3 DOMAIN +ATOM 2 CA GLY A 127 57.062 21.781 -5.354 1.00 88.40 C diff --git a/modules/mmcif/test/input/phyre2_model.pdb b/modules/mmcif/test/input/phyre2_model.pdb new file mode 100644 index 0000000000..cde9c3af42 --- /dev/null +++ b/modules/mmcif/test/input/phyre2_model.pdb @@ -0,0 +1,6 @@ +REMARK 99 Chain ID : 1 +REMARK 99 Residues : 361 +REMARK 99 Atoms : 3263 +REMARK 99 File : c4bzkA_.1.pdb.sc +REMARK 6 TEMPLATE: 4bzkA 13:A - 334:A MODELS 8:A - 538:A AT 11% +ATOM 2 CA MET A 8 221.364 50.140 158.235 1.00 0.00 C diff --git a/modules/mmcif/test/input/test.nup84.pdb b/modules/mmcif/test/input/test.nup84.pdb new file mode 100644 index 0000000000..5c3f2e13c0 --- /dev/null +++ 
b/modules/mmcif/test/input/test.nup84.pdb @@ -0,0 +1,6 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 1 CA MET A 1 -8.986 11.688 -5.817 1.00 91.82 C +ATOM 2 CA GLU A 2 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/test.nup85.pdb b/modules/mmcif/test/input/test.nup85.pdb new file mode 100644 index 0000000000..bbc2ca07a2 --- /dev/null +++ b/modules/mmcif/test/input/test.nup85.pdb @@ -0,0 +1,6 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 1 CA GLY A 1 -8.986 11.688 -5.817 1.00 91.82 C +ATOM 2 CA GLU A 2 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/standards_exceptions b/modules/mmcif/test/standards_exceptions new file mode 100644 index 0000000000..1d3fa89304 --- /dev/null +++ b/modules/mmcif/test/standards_exceptions @@ -0,0 +1,4 @@ +value_object_exceptions=[] +function_name_exceptions=[] +show_exceptions=[] +spelling_exceptions=[] diff --git a/modules/mmcif/test/test_datasets.py b/modules/mmcif/test/test_datasets.py new file mode 100644 index 0000000000..b453a7ad8e --- /dev/null +++ b/modules/mmcif/test/test_datasets.py @@ -0,0 +1,35 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif.dataset + +class Tests(IMP.test.TestCase): + def test_duplicate_datasets_details(self): + """Datasets with differing details should be considered duplicates""" + fname = self.get_input_file_name('test.nup84.pdb') + l1 = IMP.mmcif.dataset.FileLocation(fname, details='test details') + d1 = IMP.mmcif.dataset.PDBDataset(l1) + + l2 = IMP.mmcif.dataset.FileLocation(fname, details='other details') + d2 = IMP.mmcif.dataset.PDBDataset(l2) + self.assertEqual(l1, l2) + + def test_duplicate_locations(self): + """Datasets with same location should be considered duplicates""" + fname1 = self.get_input_file_name('test.nup84.pdb') + fname2 = self.get_input_file_name('test.nup85.pdb') + loc1 = IMP.mmcif.dataset.FileLocation(fname1) + loc2 = IMP.mmcif.dataset.FileLocation(fname2) + + # Identical datasets in the same location aren't duplicated + pdb1 = IMP.mmcif.dataset.PDBDataset(loc1) + pdb2 = IMP.mmcif.dataset.PDBDataset(loc1) + self.assertEqual(pdb1, pdb2) + + # Datasets in different locations are OK + pdb1 = IMP.mmcif.dataset.PDBDataset(loc1) + pdb2 = IMP.mmcif.dataset.PDBDataset(loc2) + self.assertNotEqual(pdb1, pdb2) + + +if __name__ == '__main__': + IMP.test.main() diff --git a/modules/mmcif/test/test_dumper.py b/modules/mmcif/test/test_dumper.py new file mode 100644 index 0000000000..74ef23deaf --- /dev/null +++ b/modules/mmcif/test/test_dumper.py @@ -0,0 +1,741 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif +import IMP.mmcif.dumper +import IMP.mmcif.format +import IMP.mmcif.dataset +import io +import sys +import os +if sys.version_info[0] >= 3: + from io import StringIO +else: + from io import BytesIO as StringIO + +def _get_dumper_output(dumper, system): + fh = StringIO() + writer = IMP.mmcif.format._CifWriter(fh) + dumper.dump(system, writer) + return fh.getvalue() + +class Tests(IMP.test.TestCase): + def make_model(self, system, chains=None): + if chains is None: + chains = (('foo', 'ACGT', 'A'), ('bar', 'ACGT', 'B'), + 
('baz', 'ACC', 'C')) + s = IMP.mmcif.State(system) + m = s.model + top = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + for name, seq, cid in chains: + h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + mol = IMP.atom.Molecule.setup_particle(h) + mol.set_name(name) + top.add_child(mol) + + h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + chain = IMP.atom.Chain.setup_particle(h, cid) + chain.set_sequence(seq) + mol.add_child(chain) + return top, s + + def make_model_with_protocol(self, system, chains=None): + top, s = self.make_model(system, chains) + m = s.model + prov = IMP.core.SampleProvenance.setup_particle(m, IMP.Particle(m), + "Monte Carlo", 100, 5) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.CombineProvenance.setup_particle(m, IMP.Particle(m), + 5, 500) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.FilterProvenance.setup_particle(m, IMP.Particle(m), + "Total score", 100.5, 400) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.SampleProvenance.setup_particle(m, IMP.Particle(m), + "Molecular Dynamics", 2000, 5, 16) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.ClusterProvenance.setup_particle(m, IMP.Particle(m), 10) + IMP.core.add_provenance(m, top, prov) + return top, s + + def test_citation_dumper(self): + """Test CitationDumper""" + system = IMP.mmcif.System() + system.add_citation( + pmid='25161197', + title="Structural characterization by cross-linking reveals the\n" + "detailed architecture of a coatomer-related heptameric\n" + "module from the nuclear pore complex.", + journal="Mol Cell Proteomics", volume=13, page_range=(2927,2943), + year=2014, + authors=['Shi Y', 'Fernandez-Martinez J', 'Tjioe E', 'Pellarin R', + 'Kim SJ', 'Williams R', 'Schneidman-Duhovny D', 'Sali A', + 'Rout MP', 'Chait BT'], + doi='10.1074/mcp.M114.041673') + dumper = IMP.mmcif.dumper._CitationDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_citation.id +_citation.title +_citation.journal_abbrev +_citation.journal_volume +_citation.page_first +_citation.page_last +_citation.year +_citation.pdbx_database_id_PubMed +_citation.pdbx_database_id_DOI +1 +;Structural characterization by cross-linking reveals the +detailed architecture of a coatomer-related heptameric +module from the nuclear pore complex. +; +'Mol Cell Proteomics' 13 2927 2943 2014 25161197 10.1074/mcp.M114.041673 +# +# +loop_ +_citation_author.citation_id +_citation_author.name +_citation_author.ordinal +1 'Shi Y' 1 +1 'Fernandez-Martinez J' 2 +1 'Tjioe E' 3 +1 'Pellarin R' 4 +1 'Kim SJ' 5 +1 'Williams R' 6 +1 'Schneidman-Duhovny D' 7 +1 'Sali A' 8 +1 'Rout MP' 9 +1 'Chait BT' 10 +# +""") + # Handle no last page + system._citations[0].page_range = 'e1637' + out = _get_dumper_output(dumper, system) + self.assertTrue("'Mol Cell Proteomics' 13 e1637 . 
2014 " in out) + + def test_software_dumper(self): + """Test SoftwareDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + m = state.model + IMP.core.add_imp_provenance(h) + state.add_hierarchy(h) + + system.add_software(name='test', classification='test code', + description='Some test program', + version=1, url='http://salilab.org') + dumper = IMP.mmcif.dumper._SoftwareDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_software.pdbx_ordinal +_software.name +_software.classification +_software.version +_software.type +_software.location +1 'Integrative Modeling Platform (IMP)' 'integrative model building' +%s program https://integrativemodeling.org +2 test 'test code' 1 program http://salilab.org +# +""" % IMP.get_module_version()) + + def test_entry_dumper(self): + """Test EntryDumper""" + system = IMP.mmcif.System() + dumper = IMP.mmcif.dumper._EntryDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, "data_imp_model\n_entry.id imp_model\n") + + def test_chem_comp_dumper(self): + """Test ChemCompDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._ChemCompDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_chem_comp.id +_chem_comp.type +ALA 'L-peptide linking' +CYS 'L-peptide linking' +GLY 'L-peptide linking' +THR 'L-peptide linking' +# +""") + + def test_entity_dumper(self): + """Test EntityDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity.id +_entity.type +_entity.src_method +_entity.pdbx_description +_entity.formula_weight +_entity.pdbx_number_of_molecules +_entity.details +1 polymer man foo ? 1 ? +2 polymer man baz ? 1 ? +# +""") + + def test_entity_poly_dumper(self): + """Test EntityPolyDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityPolyDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity_poly.entity_id +_entity_poly.type +_entity_poly.nstd_linkage +_entity_poly.nstd_monomer +_entity_poly.pdbx_strand_id +_entity_poly.pdbx_seq_one_letter_code +_entity_poly.pdbx_seq_one_letter_code_can +1 polypeptide(L) no no A ACGT ACGT +2 polypeptide(L) no no C ACC ACC +# +""") + + def test_entity_poly_seq_dumper(self): + """Test EntityPolySeqDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityPolySeqDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity_poly_seq.entity_id +_entity_poly_seq.num +_entity_poly_seq.mon_id +_entity_poly_seq.hetero +1 1 ALA . +1 2 CYS . +1 3 GLY . +1 4 THR . +2 1 ALA . +2 2 CYS . +2 3 CYS . 
+# +""") + + def test_struct_asym_dumper(self): + """Test StructAsymDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._StructAsymDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_struct_asym.id +_struct_asym.entity_id +_struct_asym.details +A 1 foo +B 1 bar +C 2 baz +# +""") + + def test_assembly_dumper_get_subassembly(self): + """Test AssemblyDumper.get_subassembly()""" + system = IMP.mmcif.System() + a = system._assemblies + system.complete_assembly.extend(['a', 'b', 'c']) + x1 = a.get_subassembly({'a':None, 'b':None}) + x2 = a.get_subassembly({'a':None, 'b':None, 'c':None}) + + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) # assign IDs to all assemblies + self.assertEqual(system.complete_assembly.id, 1) + self.assertEqual(x1.id, 2) + self.assertEqual(x1, ['a', 'b']) + self.assertEqual(x2.id, 1) + + def test_assembly_all_modeled(self): + """Test AssemblyDumper, all components modeled""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAA", 'A'), + ("bar", "AAA", 'B'), + ("baz", "AA", 'C'))) + state.add_hierarchy(h) + foo, bar, baz = state._all_modeled_components + + system._assemblies.add(IMP.mmcif.data._Assembly((foo, bar))) + system._assemblies.add(IMP.mmcif.data._Assembly((bar, baz))) + + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_struct_assembly.ordinal_id +_ihm_struct_assembly.assembly_id +_ihm_struct_assembly.parent_assembly_id +_ihm_struct_assembly.entity_description +_ihm_struct_assembly.entity_id +_ihm_struct_assembly.asym_id +_ihm_struct_assembly.seq_id_begin +_ihm_struct_assembly.seq_id_end +1 1 1 foo 1 A 1 3 +2 1 1 foo 1 B 1 3 +3 1 1 baz 2 C 1 2 +4 2 2 foo 1 A 1 3 +5 2 2 foo 1 B 1 3 +6 3 3 foo 1 B 1 3 +7 3 3 baz 2 C 1 2 +# +""") + + def test_assembly_subset_modeled(self): + """Test AssemblyDumper, subset of components modeled""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAA", 'A'),)) + state.add_hierarchy(h) + system.add_non_modeled_chain(name="bar", sequence="AA") + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) # assign IDs + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_struct_assembly.ordinal_id +_ihm_struct_assembly.assembly_id +_ihm_struct_assembly.parent_assembly_id +_ihm_struct_assembly.entity_description +_ihm_struct_assembly.entity_id +_ihm_struct_assembly.asym_id +_ihm_struct_assembly.seq_id_begin +_ihm_struct_assembly.seq_id_end +1 1 1 foo 1 A 1 3 +2 1 1 bar 2 . 
1 2 +3 2 2 foo 1 A 1 3 +# +""") + + def test_model_representation_dumper(self): + """Test ModelRepresentationDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAAA", 'A'),)) + chain = IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)[0] + m = state.model + # Add starting model information for residues 1-2 + ress = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + sp = IMP.core.StructureProvenance.setup_particle(IMP.Particle(m), + self.get_input_file_name("test.nup84.pdb"), "A") + IMP.core.Provenanced.setup_particle(ress, sp) + chain.add_child(ress) + res1 = IMP.atom.Residue.setup_particle(IMP.Particle(m), + IMP.atom.ALA, 1) + ress.add_child(res1) + res2 = IMP.atom.Residue.setup_particle(IMP.Particle(m), + IMP.atom.ALA, 2) + ress.add_child(res2) + # First matching object (res1 and res2) will be used + frag1 = IMP.atom.Fragment.setup_particle(IMP.Particle(m),[1,2]) + chain.add_child(frag1) + frag2 = IMP.atom.Fragment.setup_particle(IMP.Particle(m),[3,4]) + chain.add_child(frag2) + state.add_hierarchy(h) + # Assign starting model IDs + d = IMP.mmcif.dumper._StartingModelDumper() + d.finalize(system) + d = IMP.mmcif.dumper._ModelRepresentationDumper() + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_model_representation.ordinal_id +_ihm_model_representation.representation_id +_ihm_model_representation.segment_id +_ihm_model_representation.entity_id +_ihm_model_representation.entity_description +_ihm_model_representation.entity_asym_id +_ihm_model_representation.seq_id_begin +_ihm_model_representation.seq_id_end +_ihm_model_representation.model_object_primitive +_ihm_model_representation.starting_model_id +_ihm_model_representation.model_mode +_ihm_model_representation.model_granularity +_ihm_model_representation.model_object_count +1 1 1 1 foo A 1 2 sphere foo-m1 flexible by-residue 2 +2 1 2 1 foo A 3 4 sphere . 
flexible by-feature 1 +# +""") + + def _make_residue_chain(self, name, chain_id, model): + if name == 'Nup84': + fname = 'test.nup84.pdb' + seq = 'ME' + else: + fname = 'test.nup85.pdb' + seq = 'GE' + h = IMP.atom.read_pdb(self.get_input_file_name(fname), model) + for hchain in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE): + chain = IMP.atom.Chain(hchain) + chain.set_sequence(seq) + chain.set_id(chain_id) + chain.set_name(name) + for hres in IMP.atom.get_by_type(h, IMP.atom.RESIDUE_TYPE): + res = IMP.atom.Residue(hres) + while res.get_number_of_children() > 0: + res.remove_child(0) + return h + + def test_starting_model_dumper(self): + """Test StartingModelDumper""" + m = IMP.Model() + system = IMP.mmcif.System() + + state1h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + h1 = self._make_residue_chain('Nup84', 'A', m) + state1h.add_child(h1) + + # Test multiple states: components that are the same in both states + # (Nup84) should not be duplicated in the mmCIF output + state2h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + h1 = self._make_residue_chain('Nup84', 'A', m) + state2h.add_child(h1) + h2 = self._make_residue_chain('Nup85', 'B', m) + state2h.add_child(h2) + + state1 = IMP.mmcif.State(system) + state1.add_hierarchy(state1h) + state2 = IMP.mmcif.State(system) + state2.add_hierarchy(state2h) + + d = IMP.mmcif.dumper._ExternalReferenceDumper() + d.finalize(system) # assign file IDs (nup84 pdb = 1, alignment file = 2) + d = IMP.mmcif.dumper._StartingModelDumper() + d.finalize(system) + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_starting_model_details.starting_model_id +_ihm_starting_model_details.entity_id +_ihm_starting_model_details.entity_description +_ihm_starting_model_details.asym_id +_ihm_starting_model_details.seq_id_begin +_ihm_starting_model_details.seq_id_end +_ihm_starting_model_details.starting_model_source +_ihm_starting_model_details.starting_model_auth_asym_id +_ihm_starting_model_details.starting_model_sequence_offset +_ihm_starting_model_details.dataset_list_id +Nup84-m1 1 Nup84 A 33 2 'comparative model' A 0 1 +Nup85-m1 2 Nup85 B 33 2 'comparative model' A 0 4 +# +# +loop_ +_ihm_starting_comparative_models.ordinal_id +_ihm_starting_comparative_models.starting_model_id +_ihm_starting_comparative_models.starting_model_auth_asym_id +_ihm_starting_comparative_models.starting_model_seq_id_begin +_ihm_starting_comparative_models.starting_model_seq_id_end +_ihm_starting_comparative_models.template_auth_asym_id +_ihm_starting_comparative_models.template_seq_id_begin +_ihm_starting_comparative_models.template_seq_id_end +_ihm_starting_comparative_models.template_sequence_identity +_ihm_starting_comparative_models.template_sequence_identity_denominator +_ihm_starting_comparative_models.template_dataset_list_id +_ihm_starting_comparative_models.alignment_file_id +1 Nup84-m1 A 33 2 C 33 424 100.0 1 2 2 +2 Nup84-m1 A 429 2 G 482 551 10.0 1 3 2 +3 Nup85-m1 A 33 2 C 33 424 100.0 1 2 2 +4 Nup85-m1 A 429 2 G 482 551 10.0 1 3 2 +# +# +loop_ +_ihm_starting_model_coord.starting_model_id +_ihm_starting_model_coord.group_PDB +_ihm_starting_model_coord.id +_ihm_starting_model_coord.type_symbol +_ihm_starting_model_coord.atom_id +_ihm_starting_model_coord.comp_id +_ihm_starting_model_coord.entity_id +_ihm_starting_model_coord.asym_id +_ihm_starting_model_coord.seq_id +_ihm_starting_model_coord.Cartn_x +_ihm_starting_model_coord.Cartn_y +_ihm_starting_model_coord.Cartn_z +_ihm_starting_model_coord.B_iso_or_equiv 
+_ihm_starting_model_coord.ordinal_id +Nup84-m1 ATOM 1 C CA MET 1 A 1 -8.986 11.688 -5.817 91.820 1 +Nup84-m1 ATOM 2 C CA GLU 1 A 2 -8.986 11.688 -5.817 91.820 2 +Nup85-m1 ATOM 1 C CA GLY 2 A 1 -8.986 11.688 -5.817 91.820 3 +Nup85-m1 ATOM 2 C CA GLU 2 A 2 -8.986 11.688 -5.817 91.820 4 +# +""") + + def test_dataset_dumper(self): + """Test DatasetDumper""" + m = IMP.Model() + system = IMP.mmcif.System() + + l = IMP.mmcif.dataset.FileLocation(repo="foo", path="bar") + l.id = 97 + d = IMP.mmcif.dataset.CXMSDataset(l) + pds = system.datasets.add(d) + + l = IMP.mmcif.dataset.PDBLocation("1abc", "1.0", "test details") + d = IMP.mmcif.dataset.PDBDataset(l) + system.datasets.add(d) + + l = IMP.mmcif.dataset.FileLocation(repo="foo2", path="bar2") + l.id = 98 + d = IMP.mmcif.dataset.PDBDataset(l) + d.add_parent(pds) + system.datasets.add(d) + + d = IMP.mmcif.dumper._DatasetDumper() + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_dataset_list.id +_ihm_dataset_list.data_type +_ihm_dataset_list.database_hosted +1 'CX-MS data' NO +2 'Experimental model' YES +3 'Experimental model' NO +# +# +loop_ +_ihm_dataset_external_reference.id +_ihm_dataset_external_reference.dataset_list_id +_ihm_dataset_external_reference.file_id +1 1 97 +2 3 98 +# +# +loop_ +_ihm_dataset_related_db_reference.id +_ihm_dataset_related_db_reference.dataset_list_id +_ihm_dataset_related_db_reference.db_name +_ihm_dataset_related_db_reference.accession_code +_ihm_dataset_related_db_reference.version +_ihm_dataset_related_db_reference.details +1 2 PDB 1abc 1.0 'test details' +# +# +loop_ +_ihm_related_datasets.ordinal_id +_ihm_related_datasets.dataset_list_id_derived +_ihm_related_datasets.dataset_list_id_primary +1 3 1 +# +""") + + def test_seq_dif(self): + """Test StartingModelDumper.dump_seq_dif""" + class MockEntity(object): + id = 4 + class MockRes(object): + def get_index(self): + return 42 + class MockComponent(object): + entity = MockEntity() + class MockSource(object): + chain_id = 'X' + class MockModel(object): + name = 'dummy-m1' + chain_id = 'H' + offset = 2 + fh = StringIO() + writer = IMP.mmcif.format._CifWriter(fh) + dumper = IMP.mmcif.dumper._StartingModelDumper() + sd = IMP.mmcif.dumper._MSESeqDif(MockRes(), MockComponent(), + MockSource(), MockModel()) + dumper.dump_seq_dif(writer, [sd]) + out = fh.getvalue() + self.assertEqual(out, """# +loop_ +_ihm_starting_model_seq_dif.ordinal_id +_ihm_starting_model_seq_dif.entity_id +_ihm_starting_model_seq_dif.asym_id +_ihm_starting_model_seq_dif.seq_id +_ihm_starting_model_seq_dif.comp_id +_ihm_starting_model_seq_dif.starting_model_id +_ihm_starting_model_seq_dif.db_asym_id +_ihm_starting_model_seq_dif.db_seq_id +_ihm_starting_model_seq_dif.db_comp_id +_ihm_starting_model_seq_dif.details +1 4 H 42 MET dummy-m1 X 40 MSE 'Conversion of modified residue MSE to MET' +# +""") + + def test_external_reference_dumper_dump(self): + """Test ExternalReferenceDumper.dump()""" + system = IMP.mmcif.System() + exfil = system._external_files + + repo1 = IMP.mmcif.dataset.Repository(doi="foo") + repo2 = IMP.mmcif.dataset.Repository(doi="10.5281/zenodo.46266", + url='nup84-v1.0.zip', + top_directory=os.path.join('foo', 'bar')) + repo3 = IMP.mmcif.dataset.Repository(doi="10.5281/zenodo.58025", + url='foo.spd') + l = IMP.mmcif.dataset.FileLocation(repo=repo1, path='bar') + exfil.add_input(l) + # Duplicates should be ignored + l = IMP.mmcif.dataset.FileLocation(repo=repo1, path='bar') + exfil.add_input(l) + # Different file, same repository + l = 
IMP.mmcif.dataset.FileLocation(repo=repo1, path='baz') + exfil.add_input(l) + # Different repository + l = IMP.mmcif.dataset.FileLocation(repo=repo2, path='baz') + exfil.add_output(l) + # Repository containing a single file (not an archive) + l = IMP.mmcif.dataset.FileLocation(repo=repo3, path='foo.spd', + details='EM micrographs') + exfil.add_input(l) + bar = 'test_mmcif_extref.tmp' + with open(bar, 'w') as f: + f.write("abcd") + # Local file + l = IMP.mmcif.dataset.FileLocation(bar) + exfil.add_workflow(l) + # DatabaseLocations should be ignored + l = IMP.mmcif.dataset.PDBLocation('1abc', '1.0', 'test details') + exfil.add_workflow(l) + + dump = IMP.mmcif.dumper._ExternalReferenceDumper() + dump.finalize(system) # assign IDs + out = _get_dumper_output(dump, system) + self.assertEqual(out, """# +loop_ +_ihm_external_reference_info.reference_id +_ihm_external_reference_info.reference_provider +_ihm_external_reference_info.reference_type +_ihm_external_reference_info.reference +_ihm_external_reference_info.refers_to +_ihm_external_reference_info.associated_url +1 . DOI foo Other . +2 Zenodo DOI 10.5281/zenodo.46266 Archive nup84-v1.0.zip +3 Zenodo DOI 10.5281/zenodo.58025 File foo.spd +4 . 'Supplementary Files' . Other . +# +# +loop_ +_ihm_external_files.id +_ihm_external_files.reference_id +_ihm_external_files.file_path +_ihm_external_files.content_type +_ihm_external_files.file_size_bytes +_ihm_external_files.details +1 1 bar 'Input data or restraints' . . +2 1 baz 'Input data or restraints' . . +3 2 foo/bar/baz 'Modeling or post-processing output' . . +4 3 foo.spd 'Input data or restraints' . 'EM micrographs' +5 4 %s 'Modeling workflow or script' 4 . +# +""" % bar) + os.unlink(bar) + + def test_workflow(self): + """Test output of workflow files""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + m = state.model + prov = IMP.core.ScriptProvenance.setup_particle(m, IMP.Particle(m), + __file__) + IMP.core.add_provenance(m, h, prov) + state.add_hierarchy(h) + + root = os.path.dirname(__file__) + system.add_repository(doi="foo", root=root) + dump = IMP.mmcif.dumper._ExternalReferenceDumper() + dump.finalize(system) # assign IDs + out = _get_dumper_output(dump, system) + self.assertEqual(out, """# +loop_ +_ihm_external_reference_info.reference_id +_ihm_external_reference_info.reference_provider +_ihm_external_reference_info.reference_type +_ihm_external_reference_info.reference +_ihm_external_reference_info.refers_to +_ihm_external_reference_info.associated_url +1 . DOI foo Other . 
+# +# +loop_ +_ihm_external_files.id +_ihm_external_files.reference_id +_ihm_external_files.file_path +_ihm_external_files.content_type +_ihm_external_files.file_size_bytes +_ihm_external_files.details +1 1 %s 'Modeling workflow or script' %d +'Integrative modeling Python script' +# +""" % (os.path.basename(__file__), os.stat(__file__).st_size)) + + def test_modeling_protocol(self): + """Test ProtocolDumper""" + system = IMP.mmcif.System() + h, state = self.make_model_with_protocol(system) + state.add_hierarchy(h) + + dumper = IMP.mmcif.dumper._AssemblyDumper() + dumper.finalize(system) # Assign assembly IDs + + dumper = IMP.mmcif.dumper._ProtocolDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_ihm_modeling_protocol.ordinal_id +_ihm_modeling_protocol.protocol_id +_ihm_modeling_protocol.step_id +_ihm_modeling_protocol.struct_assembly_id +_ihm_modeling_protocol.dataset_group_id +_ihm_modeling_protocol.struct_assembly_description +_ihm_modeling_protocol.protocol_name +_ihm_modeling_protocol.step_name +_ihm_modeling_protocol.step_method +_ihm_modeling_protocol.num_models_begin +_ihm_modeling_protocol.num_models_end +_ihm_modeling_protocol.multi_scale_flag +_ihm_modeling_protocol.multi_state_flag +_ihm_modeling_protocol.time_ordered_flag +1 1 1 1 . . . Sampling 'Monte Carlo' 0 500 YES NO NO +2 2 1 1 . . . Sampling 'Replica exchange Molecular Dynamics' 400 2000 YES NO NO +# +""") + + def test_post_process(self): + """Test PostProcessDumper""" + system = IMP.mmcif.System() + h, state = self.make_model_with_protocol(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._PostProcessDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_ihm_modeling_post_process.id +_ihm_modeling_post_process.protocol_id +_ihm_modeling_post_process.analysis_id +_ihm_modeling_post_process.step_id +_ihm_modeling_post_process.type +_ihm_modeling_post_process.feature +_ihm_modeling_post_process.num_models_begin +_ihm_modeling_post_process.num_models_end +1 1 1 1 filter energy/score 500 400 +2 2 1 1 cluster RMSD 2000 2000 +# +""") + + +if __name__ == '__main__': + IMP.test.main() diff --git a/modules/mmcif/test/test_metadata.py b/modules/mmcif/test/test_metadata.py new file mode 100644 index 0000000000..e8046b2826 --- /dev/null +++ b/modules/mmcif/test/test_metadata.py @@ -0,0 +1,155 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif.metadata +import IMP.mmcif.data + +class MockSystem(object): + def __init__(self): + self._external_files = IMP.mmcif.data._ExternalFiles() + self.datasets = IMP.mmcif.data._Datasets(self._external_files) + self._software = IMP.mmcif.data._AllSoftware() + + +class Tests(IMP.test.TestCase): + def _parse_pdb(self, fname, chain): + system = MockSystem() + p = IMP.mmcif.metadata._PDBMetadataParser() + p.parse_file(fname, chain, system) + return p + + def test_official_pdb(self): + """Test PDBMetadataParser when given an official PDB""" + p = self._parse_pdb(self.get_input_file_name('official.pdb'), 'A') + s, = p.sources + self.assertEqual(s.db_code, '2HBJ') + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Experimental model') + self.assertEqual(p.dataset.location.db_name, 'PDB') + self.assertEqual(p.dataset.location.access_code, '2HBJ') + self.assertEqual(p.dataset.location.version, '14-JUN-06') + self.assertEqual(p.dataset.location.details, + 'STRUCTURE OF THE YEAST NUCLEAR EXOSOME COMPONENT, ' + 'RRP6P, REVEALS AN INTERPLAY BETWEEN THE ACTIVE ' + 'SITE AND 
THE HRDC DOMAIN') + + def test_derived_pdb(self): + """Test PDBMetadataParser when given a file derived from a PDB""" + pdbname = self.get_input_file_name('derived_pdb.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Experimental model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'MED7C AND MED21 STRUCTURES FROM PDB ENTRY 1YKH, ' + 'ROTATED AND TRANSLATED TO ALIGN WITH THE ' + 'MED4-MED9 MODEL') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.db_name, 'PDB') + self.assertEqual(parent.location.access_code, '1YKH') + self.assertEqual(parent.location.version, None) + self.assertEqual(parent.location.details, None) + + def test_derived_model(self): + """Test PDBMetadataParser when given a file derived from a model""" + pdbname = self.get_input_file_name('derived_model.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'MED4 AND MED9 STRUCTURE TAKEN FROM LARIVIERE ' + 'ET AL, NUCLEIC ACIDS RESEARCH. 2013;41:9266-9273. ' + 'DOI: 10.1093/nar/gkt704. THE MED10 STRUCTURE ALSO ' + 'PROPOSED IN THAT WORK IS NOT USED IN THIS STUDY.') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Comparative model') + self.assertEqual(parent.location.path, '.') + self.assertEqual(parent.location.repo.doi, '10.1093/nar/gkt704') + self.assertEqual(parent.location.details, + 'Starting comparative model structure') + + def test_modeller_model_aln(self): + """Test PDBMetadataParser when given a Modeller model with alignment""" + pdbname = self.get_input_file_name('modeller_model.pdb') + p = self.check_modeller_model(pdbname) + self.assertEqual(p.alignment_file.path, + self.get_input_file_name('modeller_model.ali')) + + def test_modeller_model_no_aln(self): + "Test PDBMetadataParser when given a Modeller model with no alignment" + pdbname = self.get_input_file_name('modeller_model_no_aln.pdb') + p = self.check_modeller_model(pdbname) + + def check_modeller_model(self, pdbname): + p = self._parse_pdb(pdbname, 'A') + s1, s2 = p.sources + self.assertIs(s1.db_code, None) + self.assertEqual(s1.chain_id, 'A') + self.assertEqual(s1.tm_db_code, '3JRO') + self.assertEqual(s1.tm_chain_id, 'C') + self.assertIs(s2.db_code, None) + self.assertEqual(s2.chain_id, 'A') + self.assertEqual(s2.tm_db_code, '3F3F') + self.assertEqual(s2.tm_chain_id, 'G') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'Starting model structure') + p1, p2 = p.dataset._parents + self.assertEqual(p1._data_type, 'Experimental model') + self.assertEqual(p1.location.db_name, 'PDB') + self.assertEqual(p1.location.access_code, '3JRO') + self.assertEqual(p1.location.version, None) + self.assertEqual(p1.location.details, None) + self.assertEqual(p2.location.access_code, '3F3F') + return p + + def test_modeller_local(self): + "Test PDBMetadataParser when given a Modeller model with local template" + 
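+        # "Local template" here means a template referenced by a
+        # REMARK 6 TEMPLATE PATH record rather than by a PDB code
+        # (see modeller_model_local.pdb)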
pdbname = self.get_input_file_name('modeller_model_local.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertIs(s.tm_db_code, None) + self.assertEqual(s.tm_chain_id, 'C') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.details, + 'Template for comparative modeling') + self.assertEqual(parent.location.path, + self.get_input_file_name('15133C.pdb')) + + def test_phyre2_model(self): + """Test PDBMetadataParser when given a Phyre2 model.""" + pdbname = self.get_input_file_name('phyre2_model.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(s.tm_db_code, '4BZK') + self.assertEqual(s.tm_chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertIs(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'Starting model structure') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.db_name, 'PDB') + self.assertEqual(parent.location.access_code, '4BZK') + self.assertIs(parent.location.version, None) + self.assertIs(parent.location.details, None) + + + +if __name__ == '__main__': + IMP.test.main()
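Taken together, util.py above defines the module's top-level API: System and
State, which IMP_mmcif.init.i re-exports as IMP.mmcif.System and
IMP.mmcif.State. A minimal usage sketch of that API, as exercised by the
tests; the file names and the citation/software details below are
placeholders, not part of this patch:

    import IMP.mmcif

    system = IMP.mmcif.System()

    # Each State owns its own IMP.Model; coordinates are loaded from RMF
    # (only frame 0 of each file is read at present)
    state = IMP.mmcif.State(system)
    state.add_rmf_files(["cluster0.rmf3"])  # placeholder file name

    # Optional metadata, written out by _CitationDumper and _SoftwareDumper
    system.add_citation(pmid='12345678', title='Example study',
                        journal='J Mol Biol', volume=1, page_range=(1, 10),
                        year=2017, authors=['Smith J', 'Jones A'],
                        doi='10.1000/example')
    system.add_software(name='example-tool', classification='sampling',
                        description='Example sampling program',
                        url='http://example.com')

    # Run every dumper in order and write a single mmCIF file
    system.write('output.cif')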