diff --git a/modules/mmcif/.imp_info.py b/modules/mmcif/.imp_info.py
new file mode 100644
index 0000000000..b00919eb57
--- /dev/null
+++ b/modules/mmcif/.imp_info.py
@@ -0,0 +1,3 @@
+{
+    "name": "IMP.mmcif"
+}
diff --git a/modules/mmcif/README.md b/modules/mmcif/README.md
new file mode 100644
index 0000000000..fa2f4b334f
--- /dev/null
+++ b/modules/mmcif/README.md
@@ -0,0 +1,19 @@
+\brief Support for output of models in mmCIF format.
+
+This module adds basic support for outputting IMP models in mmCIF format,
+using the [integrative/hybrid modeling extensions](https://github.com/ihmwg/IHM-dictionary).
+This allows IMP models to be deposited in [PDB-dev](https://pdb-dev.wwpdb.org/).
+
+Note that this module is still under heavy development, and not yet
+completely functional.
+
+# Info
+
+_Author(s)_: Benjamin Webb
+
+_Maintainer_: benmwebb
+
+_License_: LGPL
+
+_Publications_:
+- None
diff --git a/modules/mmcif/dependencies.py b/modules/mmcif/dependencies.py
new file mode 100644
index 0000000000..4541488945
--- /dev/null
+++ b/modules/mmcif/dependencies.py
@@ -0,0 +1,4 @@
+required_modules = 'kernel'
+lib_only_required_modules = 'atom:core:algebra:container:rmf'
+required_dependencies = ''
+optional_dependencies = ''
diff --git a/modules/mmcif/pyext/IMP_mmcif.init.i b/modules/mmcif/pyext/IMP_mmcif.init.i
new file mode 100644
index 0000000000..f11bb63f11
--- /dev/null
+++ b/modules/mmcif/pyext/IMP_mmcif.init.i
@@ -0,0 +1,6 @@
+%pythoncode %{
+
+# Temporary hack to simplify development
+from IMP.mmcif.util import *
+
+%}
diff --git a/modules/mmcif/pyext/src/_compat_collections.py b/modules/mmcif/pyext/src/_compat_collections.py
new file mode 100644
index 0000000000..4200390387
--- /dev/null
+++ b/modules/mmcif/pyext/src/_compat_collections.py
@@ -0,0 +1,259 @@
+"""Compatibility code for Python 2.6 (classes that are present
+   in the 'collections' module in Python 2.7 or later)."""
+
+try:
+    from thread import get_ident as _get_ident
+except ImportError:
+    from dummy_thread import get_ident as _get_ident
+
+try:
+    from _abcoll import KeysView, ValuesView, ItemsView
+except ImportError:
+    pass
+
+class OrderedDict(dict):
+    """Dictionary that remembers insertion order.
+
+    An inherited dict maps keys to values.
+    The inherited dict provides __getitem__, __len__, __contains__, and get.
+    The remaining methods are order-aware.
+    Big-O running times for all methods are the same as for regular
+    dictionaries.
+
+    The internal self.__map dictionary maps keys to links in a doubly
+    linked list. The circular doubly linked list starts and ends with a
+    sentinel element. The sentinel element never gets deleted (this
+    simplifies the algorithm). Each link is stored as a list of length
+    three: [PREV, NEXT, KEY].
+
+    source: http://code.activestate.com/recipes/576693/
+    """
+
+    def __init__(self, *args, **kwds):
+        '''Initialize an ordered dictionary. Signature is the same as for
+        regular dictionaries, but keyword arguments are not recommended
+        because their insertion order is arbitrary.
+
+        '''
+        if len(args) > 1:
+            raise TypeError('expected at most 1 argument, got %d'
+                            % len(args))
+        try:
+            self.__root
+        except AttributeError:
+            self.__root = root = []                     # sentinel node
+            root[:] = [root, root, None]
+            self.__map = {}
+        self.__update(*args, **kwds)
+
+    def __setitem__(self, key, value, dict_setitem=dict.__setitem__):
+        'od.__setitem__(i, y) <==> od[i]=y'
+        # Setting a new item creates a new link which goes at the end
+        # of the linked list, and the inherited dictionary is updated
+        # with the new key/value pair.
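+        # Illustrative sketch (comments only, not executed): after
+        # od['a'] = 1 and od['b'] = 2, the circular list of
+        # [PREV, NEXT, KEY] links is
+        #     root <-> [root, b_link, 'a'] <-> [a_link, root, 'b']
+        # so root[1] is always the oldest link and root[0] the newest.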
+ if key not in self: + root = self.__root + last = root[0] + last[1] = root[0] = self.__map[key] = [last, root, key] + dict_setitem(self, key, value) + + def __delitem__(self, key, dict_delitem=dict.__delitem__): + 'od.__delitem__(y) <==> del od[y]' + # Deleting an existing item uses self.__map to find the link which is + # then removed by updating the links in the predecessor and successor nodes. + dict_delitem(self, key) + link_prev, link_next, key = self.__map.pop(key) + link_prev[1] = link_next + link_next[0] = link_prev + + def __iter__(self): + 'od.__iter__() <==> iter(od)' + root = self.__root + curr = root[1] + while curr is not root: + yield curr[2] + curr = curr[1] + + def __reversed__(self): + 'od.__reversed__() <==> reversed(od)' + root = self.__root + curr = root[0] + while curr is not root: + yield curr[2] + curr = curr[0] + + def clear(self): + 'od.clear() -> None. Remove all items from od.' + try: + for node in self.__map.itervalues(): + del node[:] + root = self.__root + root[:] = [root, root, None] + self.__map.clear() + except AttributeError: + pass + dict.clear(self) + + def popitem(self, last=True): + '''od.popitem() -> (k, v), return and remove a (key, value) pair. + Pairs are returned in LIFO order if last is true or FIFO order if false. + + ''' + if not self: + raise KeyError('dictionary is empty') + root = self.__root + if last: + link = root[0] + link_prev = link[0] + link_prev[1] = root + root[0] = link_prev + else: + link = root[1] + link_next = link[1] + root[1] = link_next + link_next[0] = root + key = link[2] + del self.__map[key] + value = dict.pop(self, key) + return key, value + + # -- the following methods do not depend on the internal structure -- + + def keys(self): + 'od.keys() -> list of keys in od' + return list(self) + + def values(self): + 'od.values() -> list of values in od' + return [self[key] for key in self] + + def items(self): + 'od.items() -> list of (key, value) pairs in od' + return [(key, self[key]) for key in self] + + def iterkeys(self): + 'od.iterkeys() -> an iterator over the keys in od' + return iter(self) + + def itervalues(self): + 'od.itervalues -> an iterator over the values in od' + for k in self: + yield self[k] + + def iteritems(self): + 'od.iteritems -> an iterator over the (key, value) items in od' + for k in self: + yield (k, self[k]) + + def update(*args, **kwds): + '''od.update(E, **F) -> None. Update od from dict/iterable E and F. + + If E is a dict instance, does: for k in E: od[k] = E[k] + If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] + Or if E is an iterable of items, does: for k, v in E: od[k] = v + In either case, this is followed by: for k, v in F.items(): od[k] = v + + ''' + if len(args) > 2: + raise TypeError('update() takes at most 2 positional ' + 'arguments (%d given)' % (len(args),)) + elif not args: + raise TypeError('update() takes at least 1 argument (0 given)') + self = args[0] + # Make progressively weaker assumptions about "other" + other = () + if len(args) == 2: + other = args[1] + if isinstance(other, dict): + for key in other: + self[key] = other[key] + elif hasattr(other, 'keys'): + for key in other.keys(): + self[key] = other[key] + else: + for key, value in other: + self[key] = value + for key, value in kwds.items(): + self[key] = value + + __update = update # let subclasses override update without breaking __init__ + + __marker = object() + + def pop(self, key, default=__marker): + '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. 
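+
+        A minimal illustration:
+
+            >>> od = OrderedDict([('a', 1), ('b', 2)])
+            >>> od.pop('a')
+            1
+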
+ If key is not found, d is returned if given, otherwise KeyError is raised. + + ''' + if key in self: + result = self[key] + del self[key] + return result + if default is self.__marker: + raise KeyError(key) + return default + + def setdefault(self, key, default=None): + 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' + if key in self: + return self[key] + self[key] = default + return default + + def __repr__(self, _repr_running={}): + 'od.__repr__() <==> repr(od)' + call_key = id(self), _get_ident() + if call_key in _repr_running: + return '...' + _repr_running[call_key] = 1 + try: + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, self.items()) + finally: + del _repr_running[call_key] + + def __reduce__(self): + 'Return state information for pickling' + items = [[k, self[k]] for k in self] + inst_dict = vars(self).copy() + for k in vars(OrderedDict()): + inst_dict.pop(k, None) + if inst_dict: + return (self.__class__, (items,), inst_dict) + return self.__class__, (items,) + + def copy(self): + 'od.copy() -> a shallow copy of od' + return self.__class__(self) + + @classmethod + def fromkeys(cls, iterable, value=None): + '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S + and values equal to v (which defaults to None). + + ''' + d = cls() + for key in iterable: + d[key] = value + return d + + def __eq__(self, other): + '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive + while comparison to a regular mapping is order-insensitive. + + ''' + if isinstance(other, OrderedDict): + return len(self)==len(other) and self.items() == other.items() + return dict.__eq__(self, other) + + def __ne__(self, other): + return not self == other + + # -- the following methods are only used in Python 2.7 -- + + def viewkeys(self): + "od.viewkeys() -> a set-like object providing a view on od's keys" + return KeysView(self) + + def viewvalues(self): + "od.viewvalues() -> an object providing a view on od's values" + return ValuesView(self) + + def viewitems(self): + "od.viewitems() -> a set-like object providing a view on od's items" + return ItemsView(self) diff --git a/modules/mmcif/pyext/src/data.py b/modules/mmcif/pyext/src/data.py new file mode 100644 index 0000000000..9880235135 --- /dev/null +++ b/modules/mmcif/pyext/src/data.py @@ -0,0 +1,540 @@ +"""@namespace IMP.mmcif.data + @brief Classes to represent data structures used in mmCIF. +""" + +from __future__ import print_function +import IMP.atom +import IMP.mmcif.dataset +import IMP.mmcif.metadata + +def get_molecule(h): + """Given a Hierarchy, walk up and find the parent Molecule""" + while h: + if IMP.atom.Molecule.get_is_setup(h): + return IMP.atom.Molecule(h) + h = h.get_parent() + return None + +class _Entity(object): + """Represent a CIF entity (a component with a unique sequence)""" + def __init__(self, seq): + self.sequence = seq + self.first_component = None + # Use the name of the first component as the description of the entity + description = property(lambda self: self.first_component.name) + + +class _EntityMapper(dict): + """Handle mapping from IMP chains to CIF entities. 
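+       Entities are keyed on the string returned by Chain.get_sequence().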
+ Multiple components may map to the same entity if they share sequence.""" + def __init__(self): + super(_EntityMapper, self).__init__() + self._sequence_dict = {} + self._entities = [] + + def add(self, chain): + # todo: handle non-protein sequences + sequence = chain.get_sequence() + if sequence not in self._sequence_dict: + entity = _Entity(sequence) + self._entities.append(entity) + entity.id = len(self._entities) + self._sequence_dict[sequence] = entity + self[chain] = self._sequence_dict[sequence] + return self[chain] + + def get_all(self): + """Yield all entities""" + return self._entities + + +def _assign_id(obj, seen_objs, obj_by_id): + """Assign a unique ID to obj, and track all ids in obj_by_id.""" + if obj not in seen_objs: + if not hasattr(obj, 'id'): + obj_by_id.append(obj) + obj.id = len(obj_by_id) + seen_objs[obj] = obj.id + else: + obj.id = seen_objs[obj] + + +class _Component(object): + """An mmCIF component. This is an instance of an _Entity. Multiple + _Components may map to the same _Entity but must have unique + asym_ids. A _Component is similar to an IMP Chain but multiple + Chains may map to the same _Component (the Chains represent the + same structure, just in different states, and potentially in + different IMP Models). A _Component may also represent something + that is described by an experiment but was not modeled by IMP, and + so no Chains map to it but a string name does.""" + def __init__(self, entity, asym_id, name): + self.entity, self.asym_id, self.name = entity, asym_id, name + + +class _ComponentMapper(dict): + """Handle mapping from chains to CIF components.""" + def __init__(self): + super(_ComponentMapper, self).__init__() + self._all_components = [] + self._all_modeled_components = [] + self._map = {} + + def add(self, chain, entity): + """Add a chain (either an IMP Chain object for a modeled component, + or a NonModeledChain object for a non-modeled component)""" + if isinstance(chain, IMP.atom.Chain): + modeled = True + mol = get_molecule(chain) + asym_id = map_key = chain.get_id() + name = mol.get_name() if mol else None + else: + modeled = False + asym_id = None + name = map_key = chain.name + if map_key not in self._map: + component = _Component(entity, asym_id, name) + if entity.first_component is None: + entity.first_component = component + self._all_components.append(component) + if modeled: + self._all_modeled_components.append(component) + self._map[map_key] = component + else: + component = self._map[map_key] + if component.entity != entity: + raise ValueError("Two chains have the same ID (%s) but " + "different sequences - rename one of the " + "chains" % map_key) + return component + + def get_all(self): + """Get all components""" + return self._all_components + + def get_all_modeled(self): + """Get all modeled components""" + return self._all_modeled_components + + +class _Assembly(list): + """A collection of components. Currently simply implemented as a list of + the _Component objects. These must be in creation order.""" + def __hash__(self): + # allow putting assemblies in a dict. 'list' isn't hashable + # but 'tuple' is + return hash(tuple(self)) + + +class _Assemblies(object): + """Track all assemblies used in the modeling.""" + def __init__(self): + self._assemblies = [] + + def add(self, a): + """Add a new assembly. The first such assembly is assumed to contain + all components. 
Duplicate assemblies will be pruned at the end.""" + self._assemblies.append(a) + return a + + def get_subassembly(self, compdict): + """Get an _Assembly consisting of the given components.""" + # Put components in creation order + newa = _Assembly(c for c in self._assemblies[0] if c in compdict) + return self.add(newa) + + def get_all(self): + """Get all assemblies""" + return self._assemblies + + +class _Representation(object): + """Group a set of contiguous particles with the same representation""" + def __init__(self): + self.particles = [] + self.residue_range = () # inclusive range + self.starting_model = None + + def add(self, particle, starting_model): + """Potentially add a new particle to this representation. + Iff the particle could be added, return True.""" + resrange, rigid_body, primitive = self._get_particle_info(particle) + if not self.particles: + self.particles.append(particle) + self.residue_range = resrange + self.rigid_body = rigid_body + self.primitive = primitive + self.starting_model = starting_model + return True + elif type(particle) == type(self.particles[0]) \ + and resrange[0] == self.residue_range[1] + 1 \ + and starting_model == self.starting_model \ + and self._same_rigid_body(rigid_body): + self.particles.append(particle) + self.residue_range = (self.residue_range[0], resrange[1]) + return True + + def _same_rigid_body(self, rigid_body): + # Note: can't just use self.rigid_body == rigid_body as IMP may + # crash when comparing a RigidBody object against None + if self.rigid_body is None and rigid_body is None: + return True + elif self.rigid_body is None or rigid_body is None: + return False + else: + return self.rigid_body == rigid_body + + def _get_particle_info(self, p): + # Note that we consider nonrigid members to not be rigid here + if IMP.core.RigidMember.get_is_setup(p): + rigid_body = IMP.core.RigidMember(p).get_rigid_body() + else: + rigid_body = None + if isinstance(p, IMP.atom.Residue): + return (p.get_index(), p.get_index()), rigid_body, 'sphere' + elif isinstance(p, IMP.atom.Fragment): + resinds = p.get_residue_indexes() + return (resinds[0], resinds[-1]), rigid_body, 'sphere' + raise TypeError("Unknown particle ", p) + + def __bool__(self): + return len(self.particles) > 0 + + __nonzero__ = __bool__ # Python 2 compatibility + + +def _get_all_structure_provenance(p): + """Yield all StructureProvenance decorators for the given particle.""" + return IMP.core.get_all_provenance(p, types=[IMP.core.StructureProvenance]) + +class _StartingModel(object): + _eq_keys = ['filename', 'chain_id', 'offset'] + + def __init__(self, struc_prov): + self.filename = struc_prov[0].get_filename() + self.chain_id = struc_prov[0].get_chain_id() + self.offset = struc_prov[0].get_residue_offset() + + def _add_residue(self, resind): + self.seq_id_end = resind + self.offset + if not hasattr(self, 'seq_id_begin'): + self.seq_id_begin = self.seq_id_end + + def get_seq_id_range_all_sources(self): + source0 = self.sources[0] + # Where there are multiple sources (to date, this can only + # mean multiple templates for a comparative model) consolidate + # them; template info is given in starting_comparative_models. 
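+        # For example (illustrative): template segments covering residues
+        # 1-50 and 40-120 consolidate to the overall range (1, 120).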
+ seq_id_begin, seq_id_end = source0.get_seq_id_range(self) + for source in self.sources[1:]: + this_begin, this_end = source.get_seq_id_range(self) + seq_id_begin = min(seq_id_begin, this_begin) + seq_id_end = max(seq_id_end, this_end) + return seq_id_begin, seq_id_end + + # Two starting models with same filename, chain ID, and offset + # compare identical + # note: this results in separate starting models if only the offset differs; + # maybe consolidate into one? + def _eq_vals(self): + return tuple([self.__class__] + + [getattr(self, x) for x in self._eq_keys]) + def __eq__(self, other): + return other is not None and self._eq_vals() == other._eq_vals() + def __hash__(self): + return hash(self._eq_vals()) + + def _set_sources_datasets(self, system): + # Attempt to identify PDB file vs. comparative model + p = IMP.mmcif.metadata._PDBMetadataParser() + p.parse_file(self.filename, self.chain_id, system) + self.dataset = p.dataset + self.sources = p.sources + self.alignment_file = p.alignment_file + + +class _StartingModelFinder(object): + """Map IMP particles to starting model objects""" + def __init__(self, existing_starting_models): + self._seen_particles = {} + self._seen_starting_models = dict.fromkeys(existing_starting_models) + + def find(self, particle, system): + """Return a StartingModel object, or None, for this particle""" + def _get_starting_model(sp, resind): + s = _StartingModel(sp) + if s not in self._seen_starting_models: + self._seen_starting_models[s] = s + s._set_sources_datasets(system) + s = self._seen_starting_models[s] + if s: + s._add_residue(resind) + return s + resind = None + if IMP.atom.Residue.get_is_setup(particle): + resind = IMP.atom.Residue(particle).get_index() + sp = list(_get_all_structure_provenance(particle)) + if sp: + return _get_starting_model(sp, resind) + elif IMP.atom.Hierarchy.get_is_setup(particle): + h = IMP.atom.Hierarchy(particle).get_parent() + # Remember all nodes we inspect + seen_parents = [] + while h: + if IMP.atom.Residue.get_is_setup(h): + resind = IMP.atom.Residue(h).get_index() + pi = h.get_particle_index() + seen_parents.append(pi) + # If we inspected this node before, return the cached result + if pi in self._seen_particles: + sp = self._seen_particles[pi] + if sp and sp[0] and resind is not None: + sp[0]._add_residue(resind) + return sp[0] if sp else None + else: + sp = list(_get_all_structure_provenance(h)) + self._seen_particles[pi] = [] + if sp: + s = _get_starting_model(sp, resind) + # Set cache for this node and all the children we + # inspected on the way up + for spi in seen_parents: + self._seen_particles[spi].append(s) + return s + h = h.get_parent() + + +class _Datasets(object): + """Store all datasets used.""" + def __init__(self, external_files): + super(_Datasets, self).__init__() + self._datasets = {} + self._external_files = external_files + + def add(self, d): + """Add and return a new dataset.""" + if d not in self._datasets: + self._datasets[d] = d + d.id = len(self._datasets) + self._external_files.add_input(d.location) + return self._datasets[d] + + def get_all(self): + """Yield all datasets""" + return self._datasets.keys() + + +class _Citation(object): + """A publication that describes the modeling.""" + def __init__(self, pmid, title, journal, volume, page_range, year, authors, + doi): + self.title, self.journal, self.volume = title, journal, volume + self.page_range, self.year = page_range, year + self.pmid, self.authors, self.doi = pmid, authors, doi + + +class _Software(object): + """Software (other 
than IMP) used as part of the modeling protocol.""" + def __init__(self, name, classification, description, url, + type='program', version=None): + self.name = name + self.classification = classification + self.description = description + self.url = url + self.type = type + self.version = version + + +class _AllSoftware(list): + """Keep track of all _Software objects.""" + def __init__(self): + super(_AllSoftware, self).__init__() + self.modeller_used = self.phyre2_used = False + + def set_modeller_used(self, version, date): + if self.modeller_used: + return + self.modeller_used = True + self.append(_Software( + name='MODELLER', classification='comparative modeling', + description='Comparative modeling by satisfaction ' + 'of spatial restraints, build ' + date, + url='https://salilab.org/modeller/', + version=version)) + + def set_phyre2_used(self): + if self.phyre2_used: + return + self.phyre2_used = True + self.append(_Software( + name='Phyre2', classification='protein homology modeling', + description='Protein Homology/analogY Recognition ' + 'Engine V 2.0', + version='2.0', url='http://www.sbg.bio.ic.ac.uk/~phyre2/')) + + def add_hierarchy(self, h): + for p in IMP.core.get_all_provenance(h, + types=[IMP.core.SoftwareProvenance]): + self.append(_Software(name=p.get_software_name(), + classification='integrative model building', + description=None, + version=p.get_version(), + url=p.get_location())) + + +class _ExternalFile(object): + """A single externally-referenced file""" + + # All valid content types + INPUT_DATA = "Input data or restraints" + MODELING_OUTPUT = "Modeling or post-processing output" + WORKFLOW = "Modeling workflow or script" + + def __init__(self, location, content_type): + self.location, self.content_type = location, content_type + + # Pass id through to location + def __set_id(self, i): + self.location.id = i + id = property(lambda x: x.location.id, __set_id) + file_size = property(lambda x: x.location.file_size) + + def __eq__(self, other): + return self.location == other.location + def __hash__(self): + return hash(self.location) + + +class _ExternalFiles(object): + """Track all externally-referenced files + (i.e. anything that refers to a Location that isn't + a DatabaseLocation).""" + def __init__(self): + self._refs = [] + self._repos = [] + + def add_repo(self, repo): + """Add a repository containing modeling files.""" + self._repos.append(repo) + + def _add(self, location, content_type): + """Add a new externally-referenced file. 
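+           `content_type` should be one of the _ExternalFile constants
+           (INPUT_DATA, MODELING_OUTPUT, or WORKFLOW).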
+ Note that ids are assigned later.""" + self._refs.append(_ExternalFile(location, content_type)) + + def add_input(self, location): + """Add a new externally-referenced file used as input.""" + return self._add(location, _ExternalFile.INPUT_DATA) + + def add_output(self, location): + """Add a new externally-referenced file produced as output.""" + return self._add(location, _ExternalFile.MODELING_OUTPUT) + + def add_workflow(self, location): + """Add a new externally-referenced file that's part of the workflow.""" + return self._add(location, _ExternalFile.WORKFLOW) + + def add_hierarchy(self, h): + # Add all Python scripts that were used in the modeling + for p in IMP.core.get_all_provenance(h, + types=[IMP.core.ScriptProvenance]): + # todo: set details + l = IMP.mmcif.dataset.FileLocation(path=p.get_filename(), + details='Integrative modeling Python script') + self.add_workflow(l) + + def get_all_nondb(self): + """Yield all external files that are not database hosted""" + for x in self._refs: + if not isinstance(x.location, IMP.mmcif.dataset.DatabaseLocation): + yield x + + +class _ProtocolStep(object): + """A single step (e.g. sampling, refinement) in a protocol.""" + def __init__(self, prov, num_models_begin): + self.num_models_begin = num_models_begin + self._prov = [prov] + self.num_models_end = prov.get_number_of_frames() + + def add_combine(self, prov): + self._prov.append(prov) + self.num_models_end = prov.get_number_of_frames() + return self.num_models_end + + +class _PostProcessing(object): + """A single postprocessing step (e.g. filtering) in a protocol.""" + def __init__(self, prov, num_models_begin): + self.num_models_begin = num_models_begin + self._prov = prov + if isinstance(prov, IMP.core.FilterProvenance): + self.num_models_end = prov.get_number_of_frames() + elif isinstance(prov, IMP.core.ClusterProvenance): + # Assume clustering uses all models + self.num_models_end = self.num_models_begin + else: + raise ValueError("Unhandled provenance", prov) + + +class _Protocol(object): + """A modeling protocol. + Each protocol consists of a number of protocol steps (e.g. sampling, + refinement) followed by a number of postprocessing steps (e.g. 
+ filtering, rescoring, clustering)""" + def __init__(self): + self._steps = [] + self._postprocs = [] + + def add_step(self, prov, num_models): + if isinstance(prov, IMP.core.CombineProvenance): + # Fold CombineProvenance into a previous sampling step + if len(self._steps) == 0: + raise ValueError("CombineProvenance with no previous sampling") + return self._steps[-1].add_combine(prov) + else: + ps = _ProtocolStep(prov, num_models) + self._steps.append(ps) + ps.id = len(self._steps) + return ps.num_models_end + + def add_postproc(self, prov, num_models): + pp = _PostProcessing(prov, num_models) + self._postprocs.append(pp) + pp.id = len(self._postprocs) + return pp.num_models_end + + +class _Protocols(object): + """Track all modeling protocols used.""" + def __init__(self): + self._protocols = [] + + def get_all(self): + return self._protocols + + def _add_protocol(self, prot, modeled_assembly): + prot.modeled_assembly = modeled_assembly + self._protocols.append(prot) + prot.id = len(self._protocols) + + def _add_hierarchy(self, h, modeled_assembly): + num_models = 0 # assume we always start with no models + prot_types = (IMP.core.SampleProvenance, IMP.core.CombineProvenance) + pp_types = (IMP.core.FilterProvenance, IMP.core.ClusterProvenance) + in_postproc = False + prot = _Protocol() + for p in reversed(list(IMP.core.get_all_provenance( + h, types=prot_types + pp_types))): + if isinstance(p, pp_types): + num_models = prot.add_postproc(p, num_models) + in_postproc = True + else: + if in_postproc: + # Start a new protocol + self._add_protocol(prot, modeled_assembly) + prot = _Protocol() + num_models = prot.add_step(p, num_models) + in_postproc = False + if len(prot._steps) > 0: + self._add_protocol(prot, modeled_assembly) diff --git a/modules/mmcif/pyext/src/dataset.py b/modules/mmcif/pyext/src/dataset.py new file mode 100644 index 0000000000..e3d5642a9f --- /dev/null +++ b/modules/mmcif/pyext/src/dataset.py @@ -0,0 +1,180 @@ +"""@namespace IMP.mmcif.dataset + + Classes for tracking data used by mmCIF models. +""" + +import os +try: + from collections import OrderedDict +except ImportError: + from IMP.mmcif._compat_collections import OrderedDict + + +class Dataset(object): + """A set of input data, for example, a crystal structure or EM map.""" + + _eq_keys = ['location'] + + # Datasets compare equal iff they are the same class and have the + # same attributes + def _eq_vals(self): + return tuple([self.__class__] + + [getattr(self, x) for x in self._eq_keys]) + def __eq__(self, other): + return self._eq_vals() == other._eq_vals() + def __hash__(self): + return hash(self._eq_vals()) + + _data_type = 'unspecified' + def __init__(self, location): + self.location = location + self._parents = OrderedDict() + + def add_parent(self, dataset): + """Add another Dataset from which this one was derived. 
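+           Parent links are written to the _ihm_related_datasets mmCIF
+           category by _DatasetDumper.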
+           For example, a 3D EM map may be derived from a set of
+           2D images."""
+        self._parents[dataset] = None
+
+
+class CXMSDataset(Dataset):
+    """Processed crosslinks from a CX-MS experiment"""
+    _data_type = 'CX-MS data'
+
+
+class PDBDataset(Dataset):
+    """An experimentally-determined 3D structure as a set of coordinates,
+       usually in a PDB file"""
+    _data_type = 'Experimental model'
+
+
+class ComparativeModelDataset(Dataset):
+    """A 3D structure determined by comparative modeling"""
+    _data_type = 'Comparative model'
+
+
+class Location(object):
+    """Identifies the location where a resource can be found."""
+
+    # 'details' can differ without affecting dataset equality
+    _eq_keys = []
+    _allow_duplicates = False
+
+    def __init__(self, details=None):
+        self.details = details
+
+    # Locations compare equal iff they are the same class, have the
+    # same attributes, and allow_duplicates=False
+    def _eq_vals(self):
+        if self._allow_duplicates:
+            return id(self)
+        else:
+            return tuple([self.__class__]
+                         + [getattr(self, x) for x in self._eq_keys])
+    def __eq__(self, other):
+        return self._eq_vals() == other._eq_vals()
+    def __hash__(self):
+        return hash(self._eq_vals())
+
+
+class DatabaseLocation(Location):
+    """A dataset stored in an official database (PDB, EMDB, PRIDE, etc.)"""
+
+    _eq_keys = Location._eq_keys + ['db_name', 'access_code', 'version']
+
+    def __init__(self, db_name, db_code, version=None, details=None):
+        super(DatabaseLocation, self).__init__(details)
+        self.db_name = db_name
+        self.access_code = db_code
+        self.version = version
+
+
+class PDBLocation(DatabaseLocation):
+    """Something stored in the PDB database."""
+    def __init__(self, db_code, version=None, details=None):
+        super(PDBLocation, self).__init__('PDB', db_code, version, details)
+
+
+class FileLocation(Location):
+    """An individual file or directory.
+       This may be in a repository (if `repo` is not None) or only on the
+       local disk (if `repo` is None)."""
+
+    _eq_keys = Location._eq_keys + ['repo', 'path']
+
+    def __init__(self, path, repo=None, details=None):
+        """Constructor.
+           @param path the location of the file or directory.
+           @param repo a Repository object that describes the repository
+                  containing the file (if any).
+        """
+        super(FileLocation, self).__init__(details)
+        self.repo = repo
+        if repo:
+            self.path = path
+            # Cannot determine file size if non-local
+            self.file_size = None
+        else:
+            if not os.path.exists(path):
+                raise ValueError("%s does not exist" % path)
+            self.file_size = os.stat(path).st_size
+            # Store absolute path in case the working directory
+            # changes later
+            self.path = os.path.abspath(path)
+
+
+class Repository(object):
+    """A repository containing modeling files.
+       This can be used if the script plus input files are part of a
+       repository, which has been archived somewhere with a DOI.
+       This will be used to construct permanent references to files
+       used in this modeling, even if they haven't been uploaded to
+       a database such as PDB or EMDB.
+
+       @see FileLocation."""
+
+    # Two repositories compare equal if their DOIs and URLs are the same
+    def __eq__(self, other):
+        return self.doi == other.doi and self.url == other.url
+    def __hash__(self):
+        return hash((self.doi, self.url))
+
+    def __init__(self, doi, root=None, url=None,
+                 top_directory=None):
+        """Constructor.
+           @param doi the Digital Object Identifier for the repository
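+               (e.g. "10.5281/zenodo.1234567"; an illustrative,
+               made-up DOI).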
+ @param root the relative path to the top-level directory + of the repository from the working directory of the script, + or None if files in this repository aren't checked out. + @param url If given, a location that this repository can be + downloaded from. + @param top_directory If given, prefix all paths for files in this + repository with this value. This is useful when the archived + version of the repository is found in a subdirectory at the + URL or DOI (for example, GitHub repositories archived at + Zenodo get placed in a subdirectory named for the repository + and git hash). + """ + # todo: DOI should be optional (could also use URL, local path) + self.doi = doi + self.url, self.top_directory = url, top_directory + if root is not None: + # Store absolute path in case the working directory changes later + self._root = os.path.abspath(root) + + @staticmethod + def update_in_repos(fileloc, repos): + """If the given FileLocation maps to somewhere within one of the + passed repositories, update it to reflect that.""" + if fileloc.repo: + return + orig_path = fileloc.path + for repo in repos: + relpath = os.path.relpath(orig_path, repo._root) + if not relpath.startswith('..'): + # Prefer the shortest paths if multiple repositories can match + if fileloc.repo is None or len(fileloc.path) > len(relpath): + fileloc.repo = repo + fileloc.path = relpath + + def _get_full_path(self, path): + """Prefix the given path with our top-level directory""" + return os.path.join(self.top_directory or "", path) diff --git a/modules/mmcif/pyext/src/dumper.py b/modules/mmcif/pyext/src/dumper.py new file mode 100644 index 0000000000..bbecf9d7e0 --- /dev/null +++ b/modules/mmcif/pyext/src/dumper.py @@ -0,0 +1,637 @@ +"""@namespace IMP.mmcif.dumper + @brief Utility classes to dump out information in mmCIF format. +""" + +from __future__ import print_function +import IMP.atom +import IMP.mmcif.data +import IMP.mmcif.dataset +from IMP.mmcif.format import _CifWriter +import operator +import os + +class _Dumper(object): + """Base class for helpers to dump output to mmCIF""" + def __init__(self): + pass + def finalize_metadata(self, system): + pass + def finalize(self, system): + pass + +class _EntryDumper(_Dumper): + def dump(self, system, writer): + entry_id = 'imp_model' + # Write CIF header (so this dumper should always be first) + writer.fh.write("data_%s\n" % entry_id) + with writer.category("_entry") as l: + l.write(id=entry_id) + +class _ChemCompDumper(_Dumper): + def dump(self, system, writer): + seen = {} + std = dict.fromkeys(('ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', + 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', + 'THR', 'VAL', 'TRP', 'TYR')) + with writer.loop("_chem_comp", ["id", "type"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + for num, one_letter_code in enumerate(seq): + restyp = IMP.atom.get_residue_type(one_letter_code) + resid = restyp.get_string() + if resid not in seen: + seen[resid] = None + l.write(id=resid, + type='L-peptide linking' if resid in std \ + else 'other') + + +class _EntityDumper(_Dumper): + # todo: we currently only support amino acid sequences here (and + # then only standard amino acids; need to add support for MSE etc.) 
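+    # A sketch of the loop this dumper emits (values illustrative):
+    #     loop_
+    #     _entity.id
+    #     _entity.type
+    #     ...
+    #     1 polymer man Nup84 ? 1 ?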
+ def dump(self, system, writer): + with writer.loop("_entity", + ["id", "type", "src_method", "pdbx_description", + "formula_weight", "pdbx_number_of_molecules", + "details"]) as l: + for entity in system.entities.get_all(): + l.write(id=entity.id, type='polymer', src_method='man', + pdbx_description=entity.description, + formula_weight=writer.unknown, + pdbx_number_of_molecules=1, details=writer.unknown) + + +class _EntityPolyDumper(_Dumper): + # todo: we currently only support amino acid sequences here + def dump(self, system, writer): + with writer.loop("_entity_poly", + ["entity_id", "type", "nstd_linkage", + "nstd_monomer", "pdbx_strand_id", + "pdbx_seq_one_letter_code", + "pdbx_seq_one_letter_code_can"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + # Split into lines to get tidier CIF output + seq = "\n".join(seq[i:i+70] for i in range(0, len(seq), 70)) + comp = entity.first_component + l.write(entity_id=entity.id, type='polypeptide(L)', + nstd_linkage='no', nstd_monomer='no', + pdbx_strand_id=comp.asym_id, + pdbx_seq_one_letter_code=seq, + pdbx_seq_one_letter_code_can=seq) + +class _EntityPolySeqDumper(_Dumper): + def dump(self, system, writer): + with writer.loop("_entity_poly_seq", + ["entity_id", "num", "mon_id", "hetero"]) as l: + for entity in system.entities.get_all(): + seq = entity.sequence + for num, one_letter_code in enumerate(seq): + restyp = IMP.atom.get_residue_type(one_letter_code) + l.write(entity_id=entity.id, num=num + 1, + mon_id=restyp.get_string(), + hetero=writer.omitted) + + +class _StructAsymDumper(_Dumper): + def dump(self, system, writer): + with writer.loop("_struct_asym", + ["id", "entity_id", "details"]) as l: + for comp in system.components.get_all_modeled(): + l.write(id=comp.asym_id, + entity_id=comp.entity.id, + details=comp.name if comp.name else writer.omitted) + + +class _AssemblyDumper(_Dumper): + def finalize(self, system): + seen_assemblies = {} + # Assign IDs to all assemblies + self._assembly_by_id = [] + for a in system._assemblies.get_all(): + IMP.mmcif.data._assign_id(a, seen_assemblies, self._assembly_by_id) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_struct_assembly", + ["ordinal_id", "assembly_id", "parent_assembly_id", + "entity_description", + "entity_id", "asym_id", "seq_id_begin", + "seq_id_end"]) as l: + for a in self._assembly_by_id: + for comp in a: + entity = comp.entity + l.write(ordinal_id=ordinal, assembly_id=a.id, + # Currently all assemblies are not hierarchical, + # so each assembly is a self-parent + parent_assembly_id=a.id, + entity_description=entity.description, + entity_id=entity.id, + asym_id=comp.asym_id if comp.asym_id + else writer.omitted, + seq_id_begin=1, + seq_id_end=len(entity.sequence)) + ordinal += 1 + + +class _ModelRepresentationDumper(_Dumper): + def _get_granularity(self, r): + if isinstance(r.particles[0], IMP.atom.Residue): + return 'by-residue' + else: + return 'by-feature' + + def dump(self, system, writer): + ordinal_id = 1 + segment_id = 1 + with writer.loop("_ihm_model_representation", + ["ordinal_id", "representation_id", + "segment_id", "entity_id", "entity_description", + "entity_asym_id", + "seq_id_begin", "seq_id_end", + "model_object_primitive", "starting_model_id", + "model_mode", "model_granularity", + "model_object_count"]) as l: + for comp in system.components.get_all_modeled(): + # For now, assume that representation of the same-named + # component is the same in all states, so just take the first + state = 
list(system._states.keys())[0]
+            for r in state.representation[comp]:
+                # todo: handle multiple representations
+                l.write(ordinal_id=ordinal_id, representation_id=1,
+                        segment_id=segment_id, entity_id=comp.entity.id,
+                        entity_description=comp.entity.description,
+                        entity_asym_id=comp.asym_id,
+                        seq_id_begin=r.residue_range[0],
+                        seq_id_end=r.residue_range[1],
+                        model_object_primitive=r.primitive,
+                        starting_model_id=r.starting_model.id
+                                          if r.starting_model
+                                          else writer.omitted,
+                        model_mode='rigid' if r.rigid_body else 'flexible',
+                        model_granularity=self._get_granularity(r),
+                        model_object_count=len(r.particles))
+                ordinal_id += 1
+                segment_id += 1
+
+class _MSESeqDif(object):
+    """Track an MSE -> MET mutation in the starting model sequence"""
+    comp_id = 'MET'
+    db_comp_id = 'MSE'
+    details = 'Conversion of modified residue MSE to MET'
+    def __init__(self, res, comp, source, model):
+        self.res, self.comp, self.source = res, comp, source
+        self.model = model
+
+
+class _StartingModelDumper(_Dumper):
+    def finalize(self, system):
+        for comp in system.components.get_all_modeled():
+            for n, starting_model in enumerate(self._all_models(system,
+                                                                comp)):
+                starting_model.id = '%s-m%d' % (comp.name, n + 1)
+
+    def _all_models(self, system, comp):
+        """Get the set of all starting models for a component"""
+        for state in system._states.keys():
+            seen_models = {}
+            for rep in state.representation.get(comp, []):
+                sm = rep.starting_model
+                if sm and sm not in seen_models:
+                    seen_models[sm] = None
+                    yield sm
+
+    def dump(self, system, writer):
+        self.dump_details(system, writer)
+        self.dump_comparative(system, writer)
+        seq_dif = self.dump_coords(system, writer)
+        self.dump_seq_dif(writer, seq_dif)
+
+    def dump_details(self, system, writer):
+        with writer.loop("_ihm_starting_model_details",
+                         ["starting_model_id", "entity_id",
+                          "entity_description", "asym_id", "seq_id_begin",
+                          "seq_id_end", "starting_model_source",
+                          "starting_model_auth_asym_id",
+                          "starting_model_sequence_offset",
+                          "dataset_list_id"]) as l:
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    seq_id_begin, seq_id_end = \
+                            sm.get_seq_id_range_all_sources()
+                    l.write(starting_model_id=sm.id,
+                            entity_id=comp.entity.id,
+                            entity_description=comp.entity.description,
+                            asym_id=comp.asym_id,
+                            seq_id_begin=seq_id_begin,
+                            seq_id_end=seq_id_end,
+                            starting_model_source=sm.sources[0].source,
+                            starting_model_auth_asym_id=sm.chain_id,
+                            dataset_list_id=sm.dataset.id,
+                            starting_model_sequence_offset=sm.offset)
+
+    def dump_comparative(self, system, writer):
+        """Dump details on comparative models."""
+        with writer.loop("_ihm_starting_comparative_models",
+                         ["ordinal_id", "starting_model_id",
+                          "starting_model_auth_asym_id",
+                          "starting_model_seq_id_begin",
+                          "starting_model_seq_id_end",
+                          "template_auth_asym_id", "template_seq_id_begin",
+                          "template_seq_id_end",
+                          "template_sequence_identity",
+                          "template_sequence_identity_denominator",
+                          "template_dataset_list_id",
+                          "alignment_file_id"]) as l:
+            ordinal = 1
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    for template in [s for s in sm.sources
+                                     if isinstance(s,
+                                       IMP.mmcif.metadata._TemplateSource)]:
+                        seq_id_begin, seq_id_end = \
+                                template.get_seq_id_range(sm)
+                        l.write(ordinal_id=ordinal,
+                                starting_model_id=sm.id,
+                                starting_model_auth_asym_id=template.chain_id,
+                                starting_model_seq_id_begin=seq_id_begin,
+                                starting_model_seq_id_end=seq_id_end,
+                                template_auth_asym_id=template.tm_chain_id,
+                                template_seq_id_begin=template.tm_seq_id_begin,
+                                template_seq_id_end=template.tm_seq_id_end,
+                                template_sequence_identity=template.sequence_identity,
+                                # Assume Modeller-style sequence identity
+                                # for now
+                                template_sequence_identity_denominator=1,
+                                template_dataset_list_id=template.tm_dataset.id
+                                        if template.tm_dataset
+                                        else writer.unknown,
+                                alignment_file_id=sm.alignment_file.id
+                                        if hasattr(sm, 'alignment_file')
+                                        else writer.unknown)
+                        ordinal += 1
+
+    def dump_coords(self, system, writer):
+        seq_dif = []
+        ordinal = 1
+        with writer.loop("_ihm_starting_model_coord",
+                         ["starting_model_id", "group_PDB", "id",
+                          "type_symbol", "atom_id", "comp_id", "entity_id",
+                          "asym_id", "seq_id", "Cartn_x", "Cartn_y",
+                          "Cartn_z", "B_iso_or_equiv", "ordinal_id"]) as l:
+            for comp in system.components.get_all_modeled():
+                for sm in self._all_models(system, comp):
+                    m, sel = self._read_coords(sm)
+                    last_res_index = None
+                    for a in sel.get_selected_particles():
+                        coord = IMP.core.XYZ(a).get_coordinates()
+                        atom = IMP.atom.Atom(a)
+                        element = atom.get_element()
+                        element = IMP.atom.get_element_table().get_name(element)
+                        atom_name = atom.get_atom_type().get_string().strip()
+                        group_pdb = 'ATOM'
+                        if atom_name.startswith('HET:'):
+                            group_pdb = 'HETATM'
+                            atom_name = atom_name[4:]
+                        res = IMP.atom.get_residue(atom)
+                        res_name = res.get_residue_type().get_string()
+                        # MSE in the original PDB is automatically mutated
+                        # by IMP to MET, so reflect that in the output,
+                        # and pass back to populate the seq_dif category.
+                        if res_name == 'MSE':
+                            res_name = 'MET'
+                            # Only add one seq_dif record per residue
+                            ind = res.get_index()
+                            if ind != last_res_index:
+                                last_res_index = ind
+                                # This should only happen when we're using
+                                # a crystal structure as the source (a
+                                # comparative model would use MET in
+                                # the sequence)
+                                assert(len(sm.sources) == 1)
+                                seq_dif.append(_MSESeqDif(res, comp,
+                                                          sm.sources[0], sm))
+                        l.write(starting_model_id=sm.id,
+                                group_PDB=group_pdb,
+                                id=atom.get_input_index(),
+                                type_symbol=element,
+                                atom_id=atom_name, comp_id=res_name,
+                                entity_id=comp.entity.id,
+                                asym_id=sm.chain_id,
+                                seq_id=res.get_index() + sm.offset,
+                                Cartn_x=coord[0],
+                                Cartn_y=coord[1], Cartn_z=coord[2],
+                                B_iso_or_equiv=atom.get_temperature_factor(),
+                                ordinal_id=ordinal)
+                        ordinal += 1
+        return seq_dif
+
+    def dump_seq_dif(self, writer, seq_dif):
+        ordinal = 1
+        with writer.loop("_ihm_starting_model_seq_dif",
+                         ["ordinal_id", "entity_id", "asym_id",
+                          "seq_id", "comp_id", "starting_model_id",
+                          "db_asym_id", "db_seq_id", "db_comp_id",
+                          "details"]) as l:
+            for sd in seq_dif:
+                l.write(ordinal_id=ordinal, entity_id=sd.comp.entity.id,
+                        asym_id=sd.model.chain_id,
+                        seq_id=sd.res.get_index(),
+                        comp_id=sd.comp_id,
+                        db_asym_id=sd.source.chain_id,
+                        db_seq_id=sd.res.get_index() - sd.model.offset,
+                        db_comp_id=sd.db_comp_id,
+                        starting_model_id=sd.model.id,
+                        details=sd.details)
+                ordinal += 1
+
+    def _read_coords(self, sm):
+        """Read the coordinates for a starting model"""
+        m = IMP.Model()
+        # todo: support reading other subsets of the atoms (e.g.
CA/CB) + slt = IMP.atom.ChainPDBSelector(sm.chain_id) \ + & IMP.atom.NonWaterNonHydrogenPDBSelector() + hier = IMP.atom.read_pdb(sm.filename, m, slt) + sel = IMP.atom.Selection(hier, + residue_indexes=list(range(sm.seq_id_begin - sm.offset, + sm.seq_id_end + 1 - sm.offset))) + return m, sel + + +class _DatasetDumper(_Dumper): + def _dataset_by_id(self, system): + return sorted(system.datasets.get_all(), key=operator.attrgetter('id')) + + def dump(self, system, writer): + with writer.loop("_ihm_dataset_list", + ["id", "data_type", "database_hosted"]) as l: + for d in self._dataset_by_id(system): + l.write(id=d.id, data_type=d._data_type, + database_hosted=isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)) + self.dump_other((d for d in self._dataset_by_id(system) + if not isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)), + writer) + self.dump_rel_dbs((d for d in self._dataset_by_id(system) + if isinstance(d.location, + IMP.mmcif.dataset.DatabaseLocation)), + writer) + self.dump_related(system, writer) + + def dump_other(self, datasets, writer): + ordinal = 1 + with writer.loop("_ihm_dataset_external_reference", + ["id", "dataset_list_id", "file_id"]) as l: + for d in datasets: + l.write(id=ordinal, dataset_list_id=d.id, file_id=d.location.id) + ordinal += 1 + + def dump_rel_dbs(self, datasets, writer): + ordinal = 1 + with writer.loop("_ihm_dataset_related_db_reference", + ["id", "dataset_list_id", "db_name", + "accession_code", "version", "details"]) as l: + for d in datasets: + l.write(id=ordinal, dataset_list_id=d.id, + db_name=d.location.db_name, + accession_code=d.location.access_code, + version=d.location.version if d.location.version + else writer.omitted, + details=d.location.details if d.location.details + else writer.omitted) + ordinal += 1 + + def dump_related(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_related_datasets", + ["ordinal_id", "dataset_list_id_derived", + "dataset_list_id_primary"]) as l: + for derived in self._dataset_by_id(system): + for parent in sorted(derived._parents.keys(), + key=operator.attrgetter('id')): + l.write(ordinal_id=ordinal, + dataset_list_id_derived=derived.id, + dataset_list_id_primary=parent.id) + ordinal += 1 + + +class _CitationDumper(_Dumper): + def dump(self, system, writer): + self.dump_citation(system, writer) + self.dump_author(system, writer) + + def dump_citation(self, system, writer): + with writer.loop("_citation", + ["id", "title", "journal_abbrev", "journal_volume", + "page_first", "page_last", "year", + "pdbx_database_id_PubMed", + "pdbx_database_id_DOI"]) as l: + for n, c in enumerate(system._citations): + if isinstance(c.page_range, (tuple, list)): + page_first, page_last = c.page_range + else: + page_first = c.page_range + page_last = writer.omitted + l.write(id=n+1, title=c.title, journal_abbrev=c.journal, + journal_volume=c.volume, page_first=page_first, + page_last=page_last, year=c.year, + pdbx_database_id_PubMed=c.pmid, + pdbx_database_id_DOI=c.doi) + + def dump_author(self, system, writer): + with writer.loop("_citation_author", + ["citation_id", "name", "ordinal"]) as l: + ordinal = 1 + for n, c in enumerate(system._citations): + for a in c.authors: + l.write(citation_id=n+1, name=a, ordinal=ordinal) + ordinal += 1 + + +class _SoftwareDumper(_Dumper): + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_software", + ["pdbx_ordinal", "name", "classification", "version", + "type", "location"]) as l: + for s in system._software: + l.write(pdbx_ordinal=ordinal, 
name=s.name, + classification=s.classification, version=s.version, + type=s.type, location=s.url) + ordinal += 1 + + +class _ExternalReferenceDumper(_Dumper): + """Output information on externally referenced files + (i.e. anything that refers to a Location that isn't + a DatabaseLocation).""" + + class _LocalFiles(object): + reference_provider = _CifWriter.omitted + reference_type = 'Supplementary Files' + reference = _CifWriter.omitted + refers_to = 'Other' + associated_url = _CifWriter.omitted + + def __init__(self, top_directory): + self.top_directory = top_directory + + def _get_full_path(self, path): + return os.path.relpath(path, start=self.top_directory) + + class _Repository(object): + reference_provider = _CifWriter.omitted + reference_type = 'DOI' + refers_to = 'Other' + associated_url = _CifWriter.omitted + + def __init__(self, repo): + self.id = repo.id + self.reference = repo.doi + if 'zenodo' in self.reference: + self.reference_provider = 'Zenodo' + if repo.url: + self.associated_url = repo.url + if repo.url.endswith(".zip"): + self.refers_to = 'Archive' + else: + self.refers_to = 'File' + + def finalize(self, system): + # Keep only locations that don't point into databases (these are + # handled elsewhere) + self._refs = list(system._external_files.get_all_nondb()) + # Assign IDs to all locations and repos (including the None repo, which + # is for local files) + seen_refs = {} + seen_repos = {} + self._ref_by_id = [] + self._repo_by_id = [] + # Special dummy repo for repo=None (local files) + self._local_files = self._LocalFiles(os.getcwd()) + for r in self._refs: + # Update location to point to parent repository, if any + # todo: this could probably happen when the location is first made + system._update_location(r.location) + # Assign a unique ID to the reference + IMP.mmcif.data._assign_id(r, seen_refs, self._ref_by_id) + # Assign a unique ID to the repository + IMP.mmcif.data._assign_id(r.location.repo or self._local_files, + seen_repos, self._repo_by_id) + + def dump(self, system, writer): + self.dump_repos(writer) + self.dump_refs(writer) + + def dump_repos(self, writer): + def map_repo(r): + return r if isinstance(r, self._LocalFiles) else self._Repository(r) + with writer.loop("_ihm_external_reference_info", + ["reference_id", "reference_provider", + "reference_type", "reference", "refers_to", + "associated_url"]) as l: + for repo in [map_repo(r) for r in self._repo_by_id]: + l.write(reference_id=repo.id, + reference_provider=repo.reference_provider, + reference_type=repo.reference_type, + reference=repo.reference, refers_to=repo.refers_to, + associated_url=repo.associated_url) + + def dump_refs(self, writer): + with writer.loop("_ihm_external_files", + ["id", "reference_id", "file_path", "content_type", + "file_size_bytes", "details"]) as l: + for r in self._ref_by_id: + loc = r.location + repo = loc.repo or self._local_files + file_path=self._posix_path(repo._get_full_path(loc.path)) + if r.file_size is None: + file_size = writer.omitted + else: + file_size = r.file_size + l.write(id=loc.id, reference_id=repo.id, + file_path=file_path, + content_type=r.content_type, + file_size_bytes=file_size, + details=loc.details or writer.omitted) + + # On Windows systems, convert native paths to POSIX-like (/-separated) paths + if os.sep == '/': + def _posix_path(self, path): + return path + else: + def _posix_path(self, path): + return path.replace(os.sep, '/') + + +class _ProtocolDumper(_Dumper): + class _Step(object): + name = 'Sampling' # todo: support 'refinement' too + 
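+        # For example (illustrative): a SampleProvenance recording
+        # 'Monte Carlo' with 8 replicas is reported by __get_method
+        # below as 'Replica exchange Monte Carlo'.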
+ def __init__(self, s): + self.orig_s = s + def __get_method(self): + first_prov = self.orig_s._prov[0] + meth = first_prov.get_method() + if first_prov.get_number_of_replicas() > 1: + meth = "Replica exchange " + meth + return meth + + num_models_begin = property(lambda self: self.orig_s.num_models_begin) + num_models_end = property(lambda self: self.orig_s.num_models_end) + id = property(lambda self: self.orig_s.id) + method = property(__get_method) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_modeling_protocol", + ["ordinal_id", "protocol_id", "step_id", + "struct_assembly_id", "dataset_group_id", + "struct_assembly_description", "protocol_name", + "step_name", "step_method", "num_models_begin", + "num_models_end", "multi_scale_flag", + "multi_state_flag", "time_ordered_flag"]) as l: + for p in system.protocols.get_all(): + for s in p._steps: + step = self._Step(s) + l.write(ordinal_id=ordinal, protocol_id=p.id, + step_id=step.id, step_method=step.method, + step_name=step.name, + struct_assembly_id=p.modeled_assembly.id, + num_models_begin=step.num_models_begin, + num_models_end=step.num_models_end, + # todo: support multiple states, time ordered + multi_state_flag=False, time_ordered_flag=False, + # todo: revisit assumption all models are multiscale + multi_scale_flag=True) + ordinal += 1 + + +class _PostProcessDumper(_Dumper): + class _PostProc(object): + def __init__(self, pp): + self._pp = pp + def __get_type(self): + type_map = {IMP.core.ClusterProvenance: 'cluster', + IMP.core.FilterProvenance: 'filter'} + return type_map[type(self._pp._prov)] + def __get_feature(self): + type_map = {IMP.core.ClusterProvenance: 'RMSD', + IMP.core.FilterProvenance: 'energy/score'} + return type_map[type(self._pp._prov)] + + num_models_begin = property(lambda self: self._pp.num_models_begin) + num_models_end = property(lambda self: self._pp.num_models_end) + id = property(lambda self: self._pp.id) + type = property(__get_type) + feature = property(__get_feature) + + def dump(self, system, writer): + ordinal = 1 + with writer.loop("_ihm_modeling_post_process", + ["id", "protocol_id", "analysis_id", "step_id", + "type", "feature", "num_models_begin", + "num_models_end"]) as l: + # todo: handle multiple analyses + for prot in system.protocols.get_all(): + for pp in prot._postprocs: + p = self._PostProc(pp) + l.write(id=ordinal, protocol_id=prot.id, analysis_id=1, + step_id=p.id, type=p.type, feature=p.feature, + num_models_begin=p.num_models_begin, + num_models_end=p.num_models_end) + ordinal += 1 diff --git a/modules/mmcif/pyext/src/format.py b/modules/mmcif/pyext/src/format.py new file mode 100644 index 0000000000..ab373b9ea4 --- /dev/null +++ b/modules/mmcif/pyext/src/format.py @@ -0,0 +1,109 @@ +"""@namespace IMP.mmcif.format + @brief Utility classes to handle CIF format. 
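+
+   For example (illustrative), _CifWriter.category() emits a single
+   "_entry.id imp_model" line, while _CifWriter.loop() emits a
+   multi-row loop_ construct terminated by '#'.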
+""" + +from __future__ import print_function +import sys + +# Python 3 has no 'long' type, so use 'int' instead +if sys.version_info[0] >= 3: + _long_type = int +else: + _long_type = long + +class _LineWriter(object): + def __init__(self, writer, line_len=80): + self.writer = writer + self.line_len = line_len + self.column = 0 + def write(self, val): + if isinstance(val, str) and '\n' in val: + self.writer.fh.write("\n;") + self.writer.fh.write(val) + if not val.endswith('\n'): + self.writer.fh.write("\n") + self.writer.fh.write(";\n") + self.column = 0 + return + val = self.writer._repr(val) + if self.column > 0: + if self.column + len(val) + 1 > self.line_len: + self.writer.fh.write("\n") + self.column = 0 + else: + self.writer.fh.write(" ") + self.column += 1 + self.writer.fh.write(val) + self.column += len(val) + + +class _CifCategoryWriter(object): + def __init__(self, writer, category): + self.writer = writer + self.category = category + def write(self, **kwargs): + self.writer._write(self.category, kwargs) + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + pass + + +class _CifLoopWriter(object): + def __init__(self, writer, category, keys): + self.writer = writer + self.category = category + self.keys = keys + # Remove characters that we can't use in Python identifiers + self.python_keys = [k.replace('[', '').replace(']', '') for k in keys] + self._empty_loop = True + def write(self, **kwargs): + if self._empty_loop: + f = self.writer.fh + f.write("#\nloop_\n") + for k in self.keys: + f.write("%s.%s\n" % (self.category, k)) + self._empty_loop = False + l = _LineWriter(self.writer) + for k in self.python_keys: + l.write(kwargs.get(k, self.writer.omitted)) + self.writer.fh.write("\n") + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + if not self._empty_loop: + self.writer.fh.write("#\n") + + +class _CifWriter(object): + omitted = '.' + unknown = '?' + _boolmap = {False: 'NO', True: 'YES'} + + def __init__(self, fh): + self.fh = fh + def category(self, category): + return _CifCategoryWriter(self, category) + def loop(self, category, keys): + return _CifLoopWriter(self, category, keys) + def write_comment(self, comment): + for line in textwrap.wrap(comment, 78): + self.fh.write('# ' + line + '\n') + def _write(self, category, kwargs): + for key in kwargs: + self.fh.write("%s.%s %s\n" % (category, key, + self._repr(kwargs[key]))) + def _repr(self, obj): + if isinstance(obj, str) and '"' not in obj \ + and "'" not in obj and " " not in obj: + return obj + elif isinstance(obj, float): + return "%.3f" % obj + elif isinstance(obj, bool): + return self._boolmap[obj] + # Don't use repr(x) if type(x) == long since that adds an 'L' suffix, + # which isn't valid mmCIF syntax. _long_type = long only on Python 2. + elif isinstance(obj, _long_type): + return "%d" % obj + else: + return repr(obj) diff --git a/modules/mmcif/pyext/src/metadata.py b/modules/mmcif/pyext/src/metadata.py new file mode 100644 index 0000000000..782ed36abf --- /dev/null +++ b/modules/mmcif/pyext/src/metadata.py @@ -0,0 +1,273 @@ +"""@namespace IMP.mmcif.metadata + + Classes to extract metadata for various input files. +""" + +import IMP.mmcif.dataset +import re + +class _MetadataParser(object): + """Base class for all metadata parsers. 
+ Call parse_file() to extract metadata into self.dataset, etc.""" + def parse_file(self, filename, system): + pass + + +class _PDBHelix(object): + """Represent a HELIX record from a PDB file.""" + def __init__(self, line): + self.helix_id = line[11:14].strip() + self.start_resnam = line[14:18].strip() + self.start_asym = line[19] + self.start_resnum = int(line[21:25]) + self.end_resnam = line[27:30].strip() + self.end_asym = line[31] + self.end_resnum = int(line[33:37]) + self.helix_class = int(line[38:40]) + self.length = int(line[71:76]) + + +class _StartingModelSource(object): + """Base class for all sources for starting models.""" + pass + + +class _PDBSource(_StartingModelSource): + """An experimental PDB file used as part of a starting model""" + source = 'experimental model' + db_name = 'PDB' + sequence_identity = 100.0 + + def __init__(self, db_code, chain_id, metadata): + self.db_code = db_code + self.chain_id = chain_id + self.metadata = metadata + + def get_seq_id_range(self, starting_model): + # Assume the structure covers the entire sequence + return (starting_model.seq_id_begin, starting_model.seq_id_end) + + +class _TemplateSource(_StartingModelSource): + """A PDB file used as a template for a comparative starting model""" + source = 'comparative model' + db_name = db_code = None + tm_dataset = None + + def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin, + chain_id, seq_id_end, seq_id): + # Assume a code of 1abcX refers to a real PDB structure + if len(tm_code) == 5: + self._orig_tm_code = None + self.tm_db_code = tm_code[:4].upper() + self.tm_chain_id = tm_code[4] + else: + # Otherwise, will need to look up in TEMPLATE PATH remarks + self._orig_tm_code = tm_code + self.tm_db_code = None + self.tm_chain_id = tm_code[-1] + self.sequence_identity = seq_id + self.tm_seq_id_begin = tm_seq_id_begin + self.tm_seq_id_end = tm_seq_id_end + self.chain_id = chain_id + self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end + + def get_seq_id_range(self, starting_model): + # The template may cover more than the current starting model + seq_id_begin = max(starting_model.seq_id_begin, self._seq_id_begin) + seq_id_end = min(starting_model.seq_id_end, self._seq_id_end) + return (seq_id_begin, seq_id_end) + + +class _UnknownSource(_StartingModelSource): + """Part of a starting model from an unknown source""" + db_code = None + db_name = None + chain_id = None + sequence_identity = None + # Map dataset types to starting model sources + _source_map = {'Comparative model': 'comparative model', + 'Experimental model': 'experimental model'} + + def __init__(self, dataset, chain): + self.source = self._source_map[dataset._data_type] + self.chain_id = chain + + def get_seq_id_range(self, starting_model): + return (starting_model.seq_id_begin, starting_model.seq_id_end) + + +class _PDBMetadataParser(_MetadataParser): + """Extract metadata (e.g. PDB ID, comparative modeling templates) from a + PDB file.""" + + def parse_file(self, filename, chain, system): + """Extract metadata from `filename`. + Sets self.dataset to point to the PDB file, self.sources to + a list of sources (e.g. 
comparative modeling templates), and + self.alignment_file to the comparative modeling alignment, + if available.""" + self.alignment_file = None + fh = open(filename) + first_line = fh.readline() + local_file = IMP.mmcif.dataset.FileLocation(filename, + details="Starting model structure") + if first_line.startswith('HEADER'): + self._parse_official_pdb(fh, chain, first_line, system) + elif first_line.startswith('EXPDTA DERIVED FROM PDB:'): + self._parse_derived_from_pdb(fh, chain, first_line, local_file, + system) + elif first_line.startswith('EXPDTA DERIVED FROM COMPARATIVE ' + 'MODEL, DOI:'): + self._parse_derived_from_model(fh, chain, first_line, local_file, + system) + elif first_line.startswith('EXPDTA THEORETICAL MODEL, MODELLER'): + self._parse_modeller_model(fh, chain, first_line, local_file, + filename, system) + elif first_line.startswith('REMARK 99 Chain ID :'): + self._parse_phyre_model(fh, chain, first_line, local_file, + filename, system) + else: + self._parse_unknown_model(fh, chain, first_line, local_file, + filename, system) + + def _parse_official_pdb(self, fh, chain, first_line, system): + """Handle a file that's from the official PDB database.""" + version, details, metadata = self._parse_pdb_records(fh, first_line) + source = _PDBSource(first_line[62:66].strip(), chain, metadata) + l = IMP.mmcif.dataset.PDBLocation(source.db_code, version, details) + d = IMP.mmcif.dataset.PDBDataset(l) + self.dataset = system.datasets.add(d) + self.sources = [source] + + def _parse_derived_from_pdb(self, fh, chain, first_line, local_file, + system): + # Model derived from a PDB structure; treat as a local experimental + # model with the official PDB as a parent + local_file.details = self._parse_details(fh) + db_code = first_line[27:].strip() + d = IMP.mmcif.dataset.PDBDataset(local_file) + pdb_loc = IMP.mmcif.dataset.PDBLocation(db_code) + parent = IMP.mmcif.dataset.PDBDataset(pdb_loc) + d.add_parent(parent) + self.dataset = system.datasets.add(d) + self.sources = [_UnknownSource(self.dataset, chain)] + + def _parse_derived_from_model(self, fh, chain, first_line, local_file, + system): + # Model derived from a comparative model; link back to the original + # model as a parent + local_file.details = self._parse_details(fh) + d = IMP.mmcif.dataset.ComparativeModelDataset(local_file) + repo = IMP.mmcif.dataset.Repository(doi=first_line[46:].strip()) + # todo: better specify an unknown path + orig_loc = IMP.mmcif.dataset.FileLocation(repo=repo, path='.', + details="Starting comparative model structure") + parent = IMP.mmcif.dataset.ComparativeModelDataset(orig_loc) + d.add_parent(parent) + self.dataset = system.datasets.add(d) + self.sources = [_UnknownSource(self.dataset, chain)] + + def _parse_modeller_model(self, fh, chain, first_line, local_file, + filename, system): + system._software.set_modeller_used(*first_line[38:].split(' ', 1)) + self._handle_comparative_model(local_file, filename, chain, system) + + def _parse_phyre_model(self, fh, chain, first_line, local_file, + filename, system): + # Model generated by Phyre2 + # todo: extract Modeller-like template info for Phyre models + system._software.set_phyre2_used() + self._handle_comparative_model(local_file, filename, chain, system) + + def _parse_unknown_model(self, fh, chain, first_line, local_file, filename, + system): + # todo: revisit assumption that all unknown source PDBs are + # comparative models + self._handle_comparative_model(local_file, filename, chain, system) + + def _handle_comparative_model(self, local_file, 
pdbname, chain, system):
+        d = IMP.mmcif.dataset.ComparativeModelDataset(local_file)
+        self.dataset = system.datasets.add(d)
+        templates, alnfile = self.get_templates(pdbname, system)
+        if alnfile:
+            self.alignment_file = IMP.mmcif.dataset.FileLocation(alnfile,
+                                         details="Alignment for starting "
+                                                 "comparative model")
+            system._external_files.add_input(self.alignment_file)
+
+        if templates:
+            self.sources = templates
+        else:
+            self.sources = [_UnknownSource(self.dataset, chain)]
+
+    def get_templates(self, pdbname, system):
+        template_path_map = {}
+        templates = []
+        alnfile = None
+        alnfilere = re.compile(r'REMARK 6 ALIGNMENT: (\S+)')
+        tmppathre = re.compile(r'REMARK 6 TEMPLATE PATH (\S+) (\S+)')
+        tmpre = re.compile(r'REMARK 6 TEMPLATE: '
+                           r'(\S+) (\S+):\S+ \- (\S+):\S+ '
+                           r'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')
+
+        with open(pdbname) as fh:
+            for line in fh:
+                if line.startswith('ATOM'):  # Read only the header
+                    break
+                m = tmppathre.match(line)
+                if m:
+                    template_path_map[m.group(1)] = \
+                            IMP.get_relative_path(pdbname, m.group(2))
+                m = alnfilere.match(line)
+                if m:
+                    # Path to alignment is relative to that of the PDB file
+                    alnfile = IMP.get_relative_path(pdbname, m.group(1))
+                m = tmpre.match(line)
+                if m:
+                    templates.append(_TemplateSource(m.group(1),
+                                                     int(m.group(2)),
+                                                     int(m.group(3)),
+                                                     int(m.group(4)),
+                                                     m.group(5),
+                                                     int(m.group(6)),
+                                                     m.group(7)))
+        # Add datasets for templates
+        for t in templates:
+            if t._orig_tm_code:
+                fname = template_path_map[t._orig_tm_code]
+                l = IMP.mmcif.dataset.FileLocation(fname,
+                                 details="Template for comparative modeling")
+            else:
+                l = IMP.mmcif.dataset.PDBLocation(t.tm_db_code)
+            d = IMP.mmcif.dataset.PDBDataset(l)
+            d = system.datasets.add(d)
+            t.tm_dataset = d
+            self.dataset.add_parent(d)
+
+        # Sort by starting residue, then ending residue
+        return (sorted(templates,
+                       key=lambda x: (x._seq_id_begin, x._seq_id_end)),
+                alnfile)
+
+    def _parse_pdb_records(self, fh, first_line):
+        """Extract information from an official PDB"""
+        metadata = []
+        details = ''
+        for line in fh:
+            if line.startswith('TITLE'):
+                details += line[10:].rstrip()
+            elif line.startswith('HELIX'):
+                metadata.append(_PDBHelix(line))
+        return (first_line[50:59].strip(),
+                details if details else None, metadata)
+
+    def _parse_details(self, fh):
+        """Extract TITLE records from a PDB file"""
+        details = ''
+        for line in fh:
+            if line.startswith('TITLE'):
+                details += line[10:].rstrip()
+            elif line.startswith('ATOM'):
+                break
+        return details
diff --git a/modules/mmcif/pyext/src/util.py b/modules/mmcif/pyext/src/util.py
new file mode 100644
index 0000000000..ba7cac21ab
--- /dev/null
+++ b/modules/mmcif/pyext/src/util.py
@@ -0,0 +1,205 @@
+import IMP.mmcif.dumper
+import IMP.mmcif.dataset
+import IMP.mmcif.data
+import IMP.mmcif.format
+import IMP.rmf
+import IMP.atom
+import RMF
+import string
+import weakref
+import operator
+
+class _NonModeledChain(object):
+    """Represent a chain that was experimentally characterized but not
+       modeled. Such a chain resembles an IMP.atom.Chain, but has no
+       associated structure, and belongs to no state."""
+    def __init__(self, name, sequence, chain_type):
+        self.name = name
+        self.sequence = sequence
+        self.chain_type = chain_type
+
+    get_sequence = lambda self: self.sequence
+
+
+class System(object):
+    def __init__(self):
+        self._states = {}
+        self._assemblies = IMP.mmcif.data._Assemblies()
+        # The assembly of all known components.
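+        # (an _Assembly behaves like an ordered list of components; the
+        # complete assembly is grown as chains are added in _add_chain())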
+ self.complete_assembly = IMP.mmcif.data._Assembly() + self._assemblies.add(self.complete_assembly) + + self._dumpers = [IMP.mmcif.dumper._EntryDumper(), # must be first + IMP.mmcif.dumper._SoftwareDumper(), + IMP.mmcif.dumper._CitationDumper(), + IMP.mmcif.dumper._ChemCompDumper(), + IMP.mmcif.dumper._EntityDumper(), + IMP.mmcif.dumper._EntityPolyDumper(), + IMP.mmcif.dumper._EntityPolySeqDumper(), + IMP.mmcif.dumper._StructAsymDumper(), + IMP.mmcif.dumper._AssemblyDumper(), + IMP.mmcif.dumper._ModelRepresentationDumper(), + IMP.mmcif.dumper._ExternalReferenceDumper(), + IMP.mmcif.dumper._DatasetDumper(), + IMP.mmcif.dumper._StartingModelDumper(), + IMP.mmcif.dumper._ProtocolDumper(), + IMP.mmcif.dumper._PostProcessDumper()] + self.entities = IMP.mmcif.data._EntityMapper() + self.components = IMP.mmcif.data._ComponentMapper() + self._citations = [] + self._software = IMP.mmcif.data._AllSoftware() + self._external_files = IMP.mmcif.data._ExternalFiles() + self.datasets = IMP.mmcif.data._Datasets(self._external_files) + # All modeling protocols + self.protocols = IMP.mmcif.data._Protocols() + + def _update_location(self, fileloc): + """Update FileLocation to point to a parent repository, if any""" + IMP.mmcif.dataset.Repository.update_in_repos(fileloc, + self._external_files._repos) + + def add_repository(self, doi, root=None, url=None, top_directory=None): + """Add a repository containing one or more modeling files.""" + self._external_files.add_repo(IMP.mmcif.dataset.Repository( + doi, root, url, top_directory)) + + def add_citation(self, pmid, title, journal, volume, page_range, year, + authors, doi): + """Add a publication that describes the modeling""" + self._citations.append(IMP.mmcif.data._Citation( + pmid, title, journal, volume, page_range, year, + authors, doi)) + + def add_software(self, name, classification, description, url, + type='program', version=None): + "Add software (other than IMP) used as part of the modeling protocol." 
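+        # (software recorded in the hierarchy's provenance is picked up
+        # separately, in _add_hierarchy() via _AllSoftware.add_hierarchy)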
+        self._software.append(IMP.mmcif.data._Software(
+                    name, classification, description, url, type, version))
+
+    def _add_state(self, state):
+        self._states[state] = None
+        state.id = len(self._states)
+
+    def _add_hierarchy(self, h, state):
+        chains = [IMP.atom.Chain(c)
+                  for c in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)]
+        self._remove_duplicate_chain_ids(chains)
+        # todo: handle same chain in multiple states
+        for c in chains:
+            component = self._add_chain(c)
+            state._all_modeled_components.append(component)
+            state.modeled_assembly.append(component)
+            state.representation[component] \
+                = list(self._get_representation(c,
+                                self._get_all_starting_models(component)))
+        self.protocols._add_hierarchy(h, state.modeled_assembly)
+        self._external_files.add_hierarchy(h)
+        self._software.add_hierarchy(h)
+
+    def _get_all_starting_models(self, comp):
+        """Get all starting models (in all states) for the given component"""
+        for state in self._states:
+            for rep in state.representation.get(comp, []):
+                if rep.starting_model:
+                    yield rep.starting_model
+
+    def _get_representation(self, chain, existing_starting_models):
+        """Yield groups of particles under chain with same representation"""
+        smf = IMP.mmcif.data._StartingModelFinder(existing_starting_models)
+        rep = IMP.mmcif.data._Representation()
+        for sp in self._get_structure_particles(chain):
+            starting_model = smf.find(sp, self)
+            if not rep.add(sp, starting_model):
+                yield rep
+                rep = IMP.mmcif.data._Representation()
+                rep.add(sp, starting_model)
+        if rep:
+            yield rep
+
+    def _get_structure_particles(self, chain):
+        """Yield all particles under chain with coordinates.
+           They are sorted by residue index."""
+        # todo: handle Representation decorators for non-PMI-1 models
+        ps = IMP.atom.get_leaves(chain)
+        resind_dict = {}
+        for p in ps:
+            if IMP.atom.Residue.get_is_setup(p):
+                residue = IMP.atom.Residue(p)
+                resind = residue.get_index()
+                if resind in resind_dict:
+                    continue
+                resind_dict[resind] = residue
+            elif IMP.atom.Fragment.get_is_setup(p):
+                fragment = IMP.atom.Fragment(p)
+                # todo: handle non-contiguous fragments
+                resinds = fragment.get_residue_indexes()
+                resind = resinds[len(resinds) // 2]
+                if resind in resind_dict:
+                    continue
+                resind_dict[resind] = fragment
+        # Return values sorted by key (residue index)
+        for item in sorted(resind_dict.items(), key=operator.itemgetter(0)):
+            yield item[1]
+
+    def add_non_modeled_chain(self, name, sequence,
+                              chain_type=IMP.atom.UnknownChainType):
+        """Add a chain that wasn't modeled by IMP."""
+        c = _NonModeledChain(name, sequence, chain_type)
+        self._add_chain(c)
+
+    def _add_chain(self, c):
+        entity = self.entities.add(c)
+        component = self.components.add(c, entity)
+        if component not in self.complete_assembly:
+            self.complete_assembly.append(component)
+        return component
+
+    def _remove_duplicate_chain_ids(self, chains):
+        chain_ids = [c.get_id() for c in chains]
+        if len(set(chain_ids)) < len(chain_ids):
+            print("Duplicate chain IDs detected - reassigning as A-Z, a-z, 0-9")
+            # todo: handle > 62 chains
+            # string.uppercase/lowercase exist only on Python 2; use the
+            # ascii_* forms, which work on both Python 2 and 3
+            for chain, cid in zip(chains, string.ascii_uppercase +
+                                  string.ascii_lowercase + string.digits):
+                chain.set_id(cid)
+
+    def write(self, fname):
+        with open(fname, 'w') as fh:
+            writer = IMP.mmcif.format._CifWriter(fh)
+            for dumper in self._dumpers:
+                dumper.finalize_metadata(self)
+            for dumper in self._dumpers:
+                dumper.finalize(self)
+            for dumper in self._dumpers:
+                dumper.dump(self, writer)
+
+class State(object):
+    """Represent a single IMP state."""
+    def __init__(self, system):
+        self.system =
weakref.proxy(system) + system._add_state(self) + self.model = IMP.Model() + self.hiers = None + # The assembly of all components modeled by IMP in this state. + # This may be smaller than the complete assembly. + self.modeled_assembly = IMP.mmcif.data._Assembly() + system._assemblies.add(self.modeled_assembly) + # A list of Representation objects for each Component + self.representation = {} + + self._all_modeled_components = [] + + def add_rmf_files(self, rmfs): + for fname in rmfs: + rmf = RMF.open_rmf_file_read_only(fname) + if self.hiers is None: + self.hiers = IMP.rmf.create_hierarchies(rmf, self.model) + else: + IMP.rmf.link_hierarchies(rmf, self.hiers) + # todo: allow reading other frames + IMP.rmf.load_frame(rmf, RMF.FrameID(0)) + for h in self.hiers: + self.add_hierarchy(h) + + def add_hierarchy(self, h): + self.system._add_hierarchy(h, self) diff --git a/modules/mmcif/pyext/swig.i-in b/modules/mmcif/pyext/swig.i-in new file mode 100644 index 0000000000..8c009e01d2 --- /dev/null +++ b/modules/mmcif/pyext/swig.i-in @@ -0,0 +1,2 @@ +/* Include top-level Python interface */ +%include "IMP_mmcif.init.i" diff --git a/modules/mmcif/test/input/15133C.pdb b/modules/mmcif/test/input/15133C.pdb new file mode 100644 index 0000000000..32710de946 --- /dev/null +++ b/modules/mmcif/test/input/15133C.pdb @@ -0,0 +1 @@ +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/derived_model.pdb b/modules/mmcif/test/input/derived_model.pdb new file mode 100644 index 0000000000..49137698b2 --- /dev/null +++ b/modules/mmcif/test/input/derived_model.pdb @@ -0,0 +1,6 @@ +EXPDTA DERIVED FROM COMPARATIVE MODEL, DOI:10.1093/nar/gkt704 +TITLE MED4 AND MED9 STRUCTURE TAKEN FROM LARIVIERE ET AL, +TITLE 1 NUCLEIC ACIDS RESEARCH. 2013;41:9266-9273. DOI: 10.1093/nar/gkt704. +TITLE 2 THE MED10 STRUCTURE ALSO PROPOSED IN THAT WORK IS NOT USED +TITLE 3 IN THIS STUDY. 
+ATOM 2 CA LEU A 37 33.176 69.138 -65.108 1.00268.38 C diff --git a/modules/mmcif/test/input/derived_pdb.pdb b/modules/mmcif/test/input/derived_pdb.pdb new file mode 100644 index 0000000000..bcf36b4b85 --- /dev/null +++ b/modules/mmcif/test/input/derived_pdb.pdb @@ -0,0 +1,4 @@ +EXPDTA DERIVED FROM PDB:1YKH +TITLE MED7C AND MED21 STRUCTURES FROM PDB ENTRY 1YKH, ROTATED AND +TITLE 1 TRANSLATED TO ALIGN WITH THE MED4-MED9 MODEL +ATOM 1341 CA TYR A 112 49.530 67.604 46.768 1.00 40.17 C diff --git a/modules/mmcif/test/input/modeller_model.ali b/modules/mmcif/test/input/modeller_model.ali new file mode 100644 index 0000000000..8d88241bb8 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model.ali @@ -0,0 +1,12 @@ +>P1;5fd1 +structureX:5fd1:1 :A:106 :A:ferredoxin:Azotobacter vinelandii: 1.90: 0.19 +W* + +>P1;1fdx +sequence:1fdx:1 : :54 : :ferredoxin:Peptococcus aerogenes: 2.00:-1.00 +Y* +~ +~ +~ +~ + diff --git a/modules/mmcif/test/input/modeller_model.pdb b/modules/mmcif/test/input/modeller_model.pdb new file mode 100644 index 0000000000..c62651a1b7 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model.pdb @@ -0,0 +1,5 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/modeller_model_local.pdb b/modules/mmcif/test/input/modeller_model_local.pdb new file mode 100644 index 0000000000..6b039d1213 --- /dev/null +++ b/modules/mmcif/test/input/modeller_model_local.pdb @@ -0,0 +1,5 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE PATH 15133C ./15133C.pdb +REMARK 6 TEMPLATE: 15133C 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/modeller_model_no_aln.pdb b/modules/mmcif/test/input/modeller_model_no_aln.pdb new file mode 100644 index 0000000000..9f8a66f6aa --- /dev/null +++ b/modules/mmcif/test/input/modeller_model_no_aln.pdb @@ -0,0 +1,4 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 2 CA TYR A 7 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/official.pdb b/modules/mmcif/test/input/official.pdb new file mode 100644 index 0000000000..5261967206 --- /dev/null +++ b/modules/mmcif/test/input/official.pdb @@ -0,0 +1,5 @@ +HEADER HYDROLASE, GENE REGULATION 14-JUN-06 2HBJ +TITLE STRUCTURE OF THE YEAST NUCLEAR EXOSOME COMPONENT, RRP6P, +TITLE 2 REVEALS AN INTERPLAY BETWEEN THE ACTIVE SITE AND THE HRDC +TITLE 3 DOMAIN +ATOM 2 CA GLY A 127 57.062 21.781 -5.354 1.00 88.40 C diff --git a/modules/mmcif/test/input/phyre2_model.pdb b/modules/mmcif/test/input/phyre2_model.pdb new file mode 100644 index 0000000000..cde9c3af42 --- /dev/null +++ b/modules/mmcif/test/input/phyre2_model.pdb @@ -0,0 +1,6 @@ +REMARK 99 Chain ID : 1 +REMARK 99 Residues : 361 +REMARK 99 Atoms : 3263 +REMARK 99 File : c4bzkA_.1.pdb.sc +REMARK 6 TEMPLATE: 4bzkA 13:A - 334:A MODELS 8:A - 538:A AT 11% +ATOM 2 CA MET A 8 221.364 50.140 158.235 1.00 0.00 C diff --git a/modules/mmcif/test/input/test.nup84.pdb b/modules/mmcif/test/input/test.nup84.pdb new file mode 100644 index 0000000000..5c3f2e13c0 --- /dev/null +++ 
b/modules/mmcif/test/input/test.nup84.pdb @@ -0,0 +1,6 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 1 CA MET A 1 -8.986 11.688 -5.817 1.00 91.82 C +ATOM 2 CA GLU A 2 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/input/test.nup85.pdb b/modules/mmcif/test/input/test.nup85.pdb new file mode 100644 index 0000000000..bbc2ca07a2 --- /dev/null +++ b/modules/mmcif/test/input/test.nup85.pdb @@ -0,0 +1,6 @@ +EXPDTA THEORETICAL MODEL, MODELLER 9.18 2017/02/10 22:21:34 +REMARK 6 ALIGNMENT: modeller_model.ali +REMARK 6 TEMPLATE: 3jroC 33:C - 424:C MODELS 33:A - 424:A AT 100.0% +REMARK 6 TEMPLATE: 3f3fG 482:G - 551:G MODELS 429:A - 488:A AT 10.0% +ATOM 1 CA GLY A 1 -8.986 11.688 -5.817 1.00 91.82 C +ATOM 2 CA GLU A 2 -8.986 11.688 -5.817 1.00 91.82 C diff --git a/modules/mmcif/test/standards_exceptions b/modules/mmcif/test/standards_exceptions new file mode 100644 index 0000000000..1d3fa89304 --- /dev/null +++ b/modules/mmcif/test/standards_exceptions @@ -0,0 +1,4 @@ +value_object_exceptions=[] +function_name_exceptions=[] +show_exceptions=[] +spelling_exceptions=[] diff --git a/modules/mmcif/test/test_datasets.py b/modules/mmcif/test/test_datasets.py new file mode 100644 index 0000000000..b453a7ad8e --- /dev/null +++ b/modules/mmcif/test/test_datasets.py @@ -0,0 +1,35 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif.dataset + +class Tests(IMP.test.TestCase): + def test_duplicate_datasets_details(self): + """Datasets with differing details should be considered duplicates""" + fname = self.get_input_file_name('test.nup84.pdb') + l1 = IMP.mmcif.dataset.FileLocation(fname, details='test details') + d1 = IMP.mmcif.dataset.PDBDataset(l1) + + l2 = IMP.mmcif.dataset.FileLocation(fname, details='other details') + d2 = IMP.mmcif.dataset.PDBDataset(l2) + self.assertEqual(l1, l2) + + def test_duplicate_locations(self): + """Datasets with same location should be considered duplicates""" + fname1 = self.get_input_file_name('test.nup84.pdb') + fname2 = self.get_input_file_name('test.nup85.pdb') + loc1 = IMP.mmcif.dataset.FileLocation(fname1) + loc2 = IMP.mmcif.dataset.FileLocation(fname2) + + # Identical datasets in the same location aren't duplicated + pdb1 = IMP.mmcif.dataset.PDBDataset(loc1) + pdb2 = IMP.mmcif.dataset.PDBDataset(loc1) + self.assertEqual(pdb1, pdb2) + + # Datasets in different locations are OK + pdb1 = IMP.mmcif.dataset.PDBDataset(loc1) + pdb2 = IMP.mmcif.dataset.PDBDataset(loc2) + self.assertNotEqual(pdb1, pdb2) + + +if __name__ == '__main__': + IMP.test.main() diff --git a/modules/mmcif/test/test_dumper.py b/modules/mmcif/test/test_dumper.py new file mode 100644 index 0000000000..74ef23deaf --- /dev/null +++ b/modules/mmcif/test/test_dumper.py @@ -0,0 +1,741 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif +import IMP.mmcif.dumper +import IMP.mmcif.format +import IMP.mmcif.dataset +import io +import sys +import os +if sys.version_info[0] >= 3: + from io import StringIO +else: + from io import BytesIO as StringIO + +def _get_dumper_output(dumper, system): + fh = StringIO() + writer = IMP.mmcif.format._CifWriter(fh) + dumper.dump(system, writer) + return fh.getvalue() + +class Tests(IMP.test.TestCase): + def make_model(self, system, chains=None): + if chains is None: + chains = (('foo', 'ACGT', 'A'), ('bar', 'ACGT', 'B'), + 
('baz', 'ACC', 'C')) + s = IMP.mmcif.State(system) + m = s.model + top = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + for name, seq, cid in chains: + h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + mol = IMP.atom.Molecule.setup_particle(h) + mol.set_name(name) + top.add_child(mol) + + h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + chain = IMP.atom.Chain.setup_particle(h, cid) + chain.set_sequence(seq) + mol.add_child(chain) + return top, s + + def make_model_with_protocol(self, system, chains=None): + top, s = self.make_model(system, chains) + m = s.model + prov = IMP.core.SampleProvenance.setup_particle(m, IMP.Particle(m), + "Monte Carlo", 100, 5) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.CombineProvenance.setup_particle(m, IMP.Particle(m), + 5, 500) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.FilterProvenance.setup_particle(m, IMP.Particle(m), + "Total score", 100.5, 400) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.SampleProvenance.setup_particle(m, IMP.Particle(m), + "Molecular Dynamics", 2000, 5, 16) + IMP.core.add_provenance(m, top, prov) + + prov = IMP.core.ClusterProvenance.setup_particle(m, IMP.Particle(m), 10) + IMP.core.add_provenance(m, top, prov) + return top, s + + def test_citation_dumper(self): + """Test CitationDumper""" + system = IMP.mmcif.System() + system.add_citation( + pmid='25161197', + title="Structural characterization by cross-linking reveals the\n" + "detailed architecture of a coatomer-related heptameric\n" + "module from the nuclear pore complex.", + journal="Mol Cell Proteomics", volume=13, page_range=(2927,2943), + year=2014, + authors=['Shi Y', 'Fernandez-Martinez J', 'Tjioe E', 'Pellarin R', + 'Kim SJ', 'Williams R', 'Schneidman-Duhovny D', 'Sali A', + 'Rout MP', 'Chait BT'], + doi='10.1074/mcp.M114.041673') + dumper = IMP.mmcif.dumper._CitationDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_citation.id +_citation.title +_citation.journal_abbrev +_citation.journal_volume +_citation.page_first +_citation.page_last +_citation.year +_citation.pdbx_database_id_PubMed +_citation.pdbx_database_id_DOI +1 +;Structural characterization by cross-linking reveals the +detailed architecture of a coatomer-related heptameric +module from the nuclear pore complex. +; +'Mol Cell Proteomics' 13 2927 2943 2014 25161197 10.1074/mcp.M114.041673 +# +# +loop_ +_citation_author.citation_id +_citation_author.name +_citation_author.ordinal +1 'Shi Y' 1 +1 'Fernandez-Martinez J' 2 +1 'Tjioe E' 3 +1 'Pellarin R' 4 +1 'Kim SJ' 5 +1 'Williams R' 6 +1 'Schneidman-Duhovny D' 7 +1 'Sali A' 8 +1 'Rout MP' 9 +1 'Chait BT' 10 +# +""") + # Handle no last page + system._citations[0].page_range = 'e1637' + out = _get_dumper_output(dumper, system) + self.assertTrue("'Mol Cell Proteomics' 13 e1637 . 
2014 " in out) + + def test_software_dumper(self): + """Test SoftwareDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + m = state.model + IMP.core.add_imp_provenance(h) + state.add_hierarchy(h) + + system.add_software(name='test', classification='test code', + description='Some test program', + version=1, url='http://salilab.org') + dumper = IMP.mmcif.dumper._SoftwareDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_software.pdbx_ordinal +_software.name +_software.classification +_software.version +_software.type +_software.location +1 'Integrative Modeling Platform (IMP)' 'integrative model building' +%s program https://integrativemodeling.org +2 test 'test code' 1 program http://salilab.org +# +""" % IMP.get_module_version()) + + def test_entry_dumper(self): + """Test EntryDumper""" + system = IMP.mmcif.System() + dumper = IMP.mmcif.dumper._EntryDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, "data_imp_model\n_entry.id imp_model\n") + + def test_chem_comp_dumper(self): + """Test ChemCompDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._ChemCompDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_chem_comp.id +_chem_comp.type +ALA 'L-peptide linking' +CYS 'L-peptide linking' +GLY 'L-peptide linking' +THR 'L-peptide linking' +# +""") + + def test_entity_dumper(self): + """Test EntityDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity.id +_entity.type +_entity.src_method +_entity.pdbx_description +_entity.formula_weight +_entity.pdbx_number_of_molecules +_entity.details +1 polymer man foo ? 1 ? +2 polymer man baz ? 1 ? +# +""") + + def test_entity_poly_dumper(self): + """Test EntityPolyDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityPolyDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity_poly.entity_id +_entity_poly.type +_entity_poly.nstd_linkage +_entity_poly.nstd_monomer +_entity_poly.pdbx_strand_id +_entity_poly.pdbx_seq_one_letter_code +_entity_poly.pdbx_seq_one_letter_code_can +1 polypeptide(L) no no A ACGT ACGT +2 polypeptide(L) no no C ACC ACC +# +""") + + def test_entity_poly_seq_dumper(self): + """Test EntityPolySeqDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._EntityPolySeqDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_entity_poly_seq.entity_id +_entity_poly_seq.num +_entity_poly_seq.mon_id +_entity_poly_seq.hetero +1 1 ALA . +1 2 CYS . +1 3 GLY . +1 4 THR . +2 1 ALA . +2 2 CYS . +2 3 CYS . 
+# +""") + + def test_struct_asym_dumper(self): + """Test StructAsymDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._StructAsymDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_struct_asym.id +_struct_asym.entity_id +_struct_asym.details +A 1 foo +B 1 bar +C 2 baz +# +""") + + def test_assembly_dumper_get_subassembly(self): + """Test AssemblyDumper.get_subassembly()""" + system = IMP.mmcif.System() + a = system._assemblies + system.complete_assembly.extend(['a', 'b', 'c']) + x1 = a.get_subassembly({'a':None, 'b':None}) + x2 = a.get_subassembly({'a':None, 'b':None, 'c':None}) + + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) # assign IDs to all assemblies + self.assertEqual(system.complete_assembly.id, 1) + self.assertEqual(x1.id, 2) + self.assertEqual(x1, ['a', 'b']) + self.assertEqual(x2.id, 1) + + def test_assembly_all_modeled(self): + """Test AssemblyDumper, all components modeled""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAA", 'A'), + ("bar", "AAA", 'B'), + ("baz", "AA", 'C'))) + state.add_hierarchy(h) + foo, bar, baz = state._all_modeled_components + + system._assemblies.add(IMP.mmcif.data._Assembly((foo, bar))) + system._assemblies.add(IMP.mmcif.data._Assembly((bar, baz))) + + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_struct_assembly.ordinal_id +_ihm_struct_assembly.assembly_id +_ihm_struct_assembly.parent_assembly_id +_ihm_struct_assembly.entity_description +_ihm_struct_assembly.entity_id +_ihm_struct_assembly.asym_id +_ihm_struct_assembly.seq_id_begin +_ihm_struct_assembly.seq_id_end +1 1 1 foo 1 A 1 3 +2 1 1 foo 1 B 1 3 +3 1 1 baz 2 C 1 2 +4 2 2 foo 1 A 1 3 +5 2 2 foo 1 B 1 3 +6 3 3 foo 1 B 1 3 +7 3 3 baz 2 C 1 2 +# +""") + + def test_assembly_subset_modeled(self): + """Test AssemblyDumper, subset of components modeled""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAA", 'A'),)) + state.add_hierarchy(h) + system.add_non_modeled_chain(name="bar", sequence="AA") + d = IMP.mmcif.dumper._AssemblyDumper() + d.finalize(system) # assign IDs + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_struct_assembly.ordinal_id +_ihm_struct_assembly.assembly_id +_ihm_struct_assembly.parent_assembly_id +_ihm_struct_assembly.entity_description +_ihm_struct_assembly.entity_id +_ihm_struct_assembly.asym_id +_ihm_struct_assembly.seq_id_begin +_ihm_struct_assembly.seq_id_end +1 1 1 foo 1 A 1 3 +2 1 1 bar 2 . 
1 2 +3 2 2 foo 1 A 1 3 +# +""") + + def test_model_representation_dumper(self): + """Test ModelRepresentationDumper""" + system = IMP.mmcif.System() + h, state = self.make_model(system, (("foo", "AAAA", 'A'),)) + chain = IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)[0] + m = state.model + # Add starting model information for residues 1-2 + ress = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + sp = IMP.core.StructureProvenance.setup_particle(IMP.Particle(m), + self.get_input_file_name("test.nup84.pdb"), "A") + IMP.core.Provenanced.setup_particle(ress, sp) + chain.add_child(ress) + res1 = IMP.atom.Residue.setup_particle(IMP.Particle(m), + IMP.atom.ALA, 1) + ress.add_child(res1) + res2 = IMP.atom.Residue.setup_particle(IMP.Particle(m), + IMP.atom.ALA, 2) + ress.add_child(res2) + # First matching object (res1 and res2) will be used + frag1 = IMP.atom.Fragment.setup_particle(IMP.Particle(m),[1,2]) + chain.add_child(frag1) + frag2 = IMP.atom.Fragment.setup_particle(IMP.Particle(m),[3,4]) + chain.add_child(frag2) + state.add_hierarchy(h) + # Assign starting model IDs + d = IMP.mmcif.dumper._StartingModelDumper() + d.finalize(system) + d = IMP.mmcif.dumper._ModelRepresentationDumper() + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_model_representation.ordinal_id +_ihm_model_representation.representation_id +_ihm_model_representation.segment_id +_ihm_model_representation.entity_id +_ihm_model_representation.entity_description +_ihm_model_representation.entity_asym_id +_ihm_model_representation.seq_id_begin +_ihm_model_representation.seq_id_end +_ihm_model_representation.model_object_primitive +_ihm_model_representation.starting_model_id +_ihm_model_representation.model_mode +_ihm_model_representation.model_granularity +_ihm_model_representation.model_object_count +1 1 1 1 foo A 1 2 sphere foo-m1 flexible by-residue 2 +2 1 2 1 foo A 3 4 sphere . 
flexible by-feature 1 +# +""") + + def _make_residue_chain(self, name, chain_id, model): + if name == 'Nup84': + fname = 'test.nup84.pdb' + seq = 'ME' + else: + fname = 'test.nup85.pdb' + seq = 'GE' + h = IMP.atom.read_pdb(self.get_input_file_name(fname), model) + for hchain in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE): + chain = IMP.atom.Chain(hchain) + chain.set_sequence(seq) + chain.set_id(chain_id) + chain.set_name(name) + for hres in IMP.atom.get_by_type(h, IMP.atom.RESIDUE_TYPE): + res = IMP.atom.Residue(hres) + while res.get_number_of_children() > 0: + res.remove_child(0) + return h + + def test_starting_model_dumper(self): + """Test StartingModelDumper""" + m = IMP.Model() + system = IMP.mmcif.System() + + state1h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + h1 = self._make_residue_chain('Nup84', 'A', m) + state1h.add_child(h1) + + # Test multiple states: components that are the same in both states + # (Nup84) should not be duplicated in the mmCIF output + state2h = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m)) + h1 = self._make_residue_chain('Nup84', 'A', m) + state2h.add_child(h1) + h2 = self._make_residue_chain('Nup85', 'B', m) + state2h.add_child(h2) + + state1 = IMP.mmcif.State(system) + state1.add_hierarchy(state1h) + state2 = IMP.mmcif.State(system) + state2.add_hierarchy(state2h) + + d = IMP.mmcif.dumper._ExternalReferenceDumper() + d.finalize(system) # assign file IDs (nup84 pdb = 1, alignment file = 2) + d = IMP.mmcif.dumper._StartingModelDumper() + d.finalize(system) + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_starting_model_details.starting_model_id +_ihm_starting_model_details.entity_id +_ihm_starting_model_details.entity_description +_ihm_starting_model_details.asym_id +_ihm_starting_model_details.seq_id_begin +_ihm_starting_model_details.seq_id_end +_ihm_starting_model_details.starting_model_source +_ihm_starting_model_details.starting_model_auth_asym_id +_ihm_starting_model_details.starting_model_sequence_offset +_ihm_starting_model_details.dataset_list_id +Nup84-m1 1 Nup84 A 33 2 'comparative model' A 0 1 +Nup85-m1 2 Nup85 B 33 2 'comparative model' A 0 4 +# +# +loop_ +_ihm_starting_comparative_models.ordinal_id +_ihm_starting_comparative_models.starting_model_id +_ihm_starting_comparative_models.starting_model_auth_asym_id +_ihm_starting_comparative_models.starting_model_seq_id_begin +_ihm_starting_comparative_models.starting_model_seq_id_end +_ihm_starting_comparative_models.template_auth_asym_id +_ihm_starting_comparative_models.template_seq_id_begin +_ihm_starting_comparative_models.template_seq_id_end +_ihm_starting_comparative_models.template_sequence_identity +_ihm_starting_comparative_models.template_sequence_identity_denominator +_ihm_starting_comparative_models.template_dataset_list_id +_ihm_starting_comparative_models.alignment_file_id +1 Nup84-m1 A 33 2 C 33 424 100.0 1 2 2 +2 Nup84-m1 A 429 2 G 482 551 10.0 1 3 2 +3 Nup85-m1 A 33 2 C 33 424 100.0 1 2 2 +4 Nup85-m1 A 429 2 G 482 551 10.0 1 3 2 +# +# +loop_ +_ihm_starting_model_coord.starting_model_id +_ihm_starting_model_coord.group_PDB +_ihm_starting_model_coord.id +_ihm_starting_model_coord.type_symbol +_ihm_starting_model_coord.atom_id +_ihm_starting_model_coord.comp_id +_ihm_starting_model_coord.entity_id +_ihm_starting_model_coord.asym_id +_ihm_starting_model_coord.seq_id +_ihm_starting_model_coord.Cartn_x +_ihm_starting_model_coord.Cartn_y +_ihm_starting_model_coord.Cartn_z +_ihm_starting_model_coord.B_iso_or_equiv 
+_ihm_starting_model_coord.ordinal_id +Nup84-m1 ATOM 1 C CA MET 1 A 1 -8.986 11.688 -5.817 91.820 1 +Nup84-m1 ATOM 2 C CA GLU 1 A 2 -8.986 11.688 -5.817 91.820 2 +Nup85-m1 ATOM 1 C CA GLY 2 A 1 -8.986 11.688 -5.817 91.820 3 +Nup85-m1 ATOM 2 C CA GLU 2 A 2 -8.986 11.688 -5.817 91.820 4 +# +""") + + def test_dataset_dumper(self): + """Test DatasetDumper""" + m = IMP.Model() + system = IMP.mmcif.System() + + l = IMP.mmcif.dataset.FileLocation(repo="foo", path="bar") + l.id = 97 + d = IMP.mmcif.dataset.CXMSDataset(l) + pds = system.datasets.add(d) + + l = IMP.mmcif.dataset.PDBLocation("1abc", "1.0", "test details") + d = IMP.mmcif.dataset.PDBDataset(l) + system.datasets.add(d) + + l = IMP.mmcif.dataset.FileLocation(repo="foo2", path="bar2") + l.id = 98 + d = IMP.mmcif.dataset.PDBDataset(l) + d.add_parent(pds) + system.datasets.add(d) + + d = IMP.mmcif.dumper._DatasetDumper() + out = _get_dumper_output(d, system) + self.assertEqual(out, """# +loop_ +_ihm_dataset_list.id +_ihm_dataset_list.data_type +_ihm_dataset_list.database_hosted +1 'CX-MS data' NO +2 'Experimental model' YES +3 'Experimental model' NO +# +# +loop_ +_ihm_dataset_external_reference.id +_ihm_dataset_external_reference.dataset_list_id +_ihm_dataset_external_reference.file_id +1 1 97 +2 3 98 +# +# +loop_ +_ihm_dataset_related_db_reference.id +_ihm_dataset_related_db_reference.dataset_list_id +_ihm_dataset_related_db_reference.db_name +_ihm_dataset_related_db_reference.accession_code +_ihm_dataset_related_db_reference.version +_ihm_dataset_related_db_reference.details +1 2 PDB 1abc 1.0 'test details' +# +# +loop_ +_ihm_related_datasets.ordinal_id +_ihm_related_datasets.dataset_list_id_derived +_ihm_related_datasets.dataset_list_id_primary +1 3 1 +# +""") + + def test_seq_dif(self): + """Test StartingModelDumper.dump_seq_dif""" + class MockEntity(object): + id = 4 + class MockRes(object): + def get_index(self): + return 42 + class MockComponent(object): + entity = MockEntity() + class MockSource(object): + chain_id = 'X' + class MockModel(object): + name = 'dummy-m1' + chain_id = 'H' + offset = 2 + fh = StringIO() + writer = IMP.mmcif.format._CifWriter(fh) + dumper = IMP.mmcif.dumper._StartingModelDumper() + sd = IMP.mmcif.dumper._MSESeqDif(MockRes(), MockComponent(), + MockSource(), MockModel()) + dumper.dump_seq_dif(writer, [sd]) + out = fh.getvalue() + self.assertEqual(out, """# +loop_ +_ihm_starting_model_seq_dif.ordinal_id +_ihm_starting_model_seq_dif.entity_id +_ihm_starting_model_seq_dif.asym_id +_ihm_starting_model_seq_dif.seq_id +_ihm_starting_model_seq_dif.comp_id +_ihm_starting_model_seq_dif.starting_model_id +_ihm_starting_model_seq_dif.db_asym_id +_ihm_starting_model_seq_dif.db_seq_id +_ihm_starting_model_seq_dif.db_comp_id +_ihm_starting_model_seq_dif.details +1 4 H 42 MET dummy-m1 X 40 MSE 'Conversion of modified residue MSE to MET' +# +""") + + def test_external_reference_dumper_dump(self): + """Test ExternalReferenceDumper.dump()""" + system = IMP.mmcif.System() + exfil = system._external_files + + repo1 = IMP.mmcif.dataset.Repository(doi="foo") + repo2 = IMP.mmcif.dataset.Repository(doi="10.5281/zenodo.46266", + url='nup84-v1.0.zip', + top_directory=os.path.join('foo', 'bar')) + repo3 = IMP.mmcif.dataset.Repository(doi="10.5281/zenodo.58025", + url='foo.spd') + l = IMP.mmcif.dataset.FileLocation(repo=repo1, path='bar') + exfil.add_input(l) + # Duplicates should be ignored + l = IMP.mmcif.dataset.FileLocation(repo=repo1, path='bar') + exfil.add_input(l) + # Different file, same repository + l = 
IMP.mmcif.dataset.FileLocation(repo=repo1, path='baz') + exfil.add_input(l) + # Different repository + l = IMP.mmcif.dataset.FileLocation(repo=repo2, path='baz') + exfil.add_output(l) + # Repository containing a single file (not an archive) + l = IMP.mmcif.dataset.FileLocation(repo=repo3, path='foo.spd', + details='EM micrographs') + exfil.add_input(l) + bar = 'test_mmcif_extref.tmp' + with open(bar, 'w') as f: + f.write("abcd") + # Local file + l = IMP.mmcif.dataset.FileLocation(bar) + exfil.add_workflow(l) + # DatabaseLocations should be ignored + l = IMP.mmcif.dataset.PDBLocation('1abc', '1.0', 'test details') + exfil.add_workflow(l) + + dump = IMP.mmcif.dumper._ExternalReferenceDumper() + dump.finalize(system) # assign IDs + out = _get_dumper_output(dump, system) + self.assertEqual(out, """# +loop_ +_ihm_external_reference_info.reference_id +_ihm_external_reference_info.reference_provider +_ihm_external_reference_info.reference_type +_ihm_external_reference_info.reference +_ihm_external_reference_info.refers_to +_ihm_external_reference_info.associated_url +1 . DOI foo Other . +2 Zenodo DOI 10.5281/zenodo.46266 Archive nup84-v1.0.zip +3 Zenodo DOI 10.5281/zenodo.58025 File foo.spd +4 . 'Supplementary Files' . Other . +# +# +loop_ +_ihm_external_files.id +_ihm_external_files.reference_id +_ihm_external_files.file_path +_ihm_external_files.content_type +_ihm_external_files.file_size_bytes +_ihm_external_files.details +1 1 bar 'Input data or restraints' . . +2 1 baz 'Input data or restraints' . . +3 2 foo/bar/baz 'Modeling or post-processing output' . . +4 3 foo.spd 'Input data or restraints' . 'EM micrographs' +5 4 %s 'Modeling workflow or script' 4 . +# +""" % bar) + os.unlink(bar) + + def test_workflow(self): + """Test output of workflow files""" + system = IMP.mmcif.System() + h, state = self.make_model(system) + m = state.model + prov = IMP.core.ScriptProvenance.setup_particle(m, IMP.Particle(m), + __file__) + IMP.core.add_provenance(m, h, prov) + state.add_hierarchy(h) + + root = os.path.dirname(__file__) + system.add_repository(doi="foo", root=root) + dump = IMP.mmcif.dumper._ExternalReferenceDumper() + dump.finalize(system) # assign IDs + out = _get_dumper_output(dump, system) + self.assertEqual(out, """# +loop_ +_ihm_external_reference_info.reference_id +_ihm_external_reference_info.reference_provider +_ihm_external_reference_info.reference_type +_ihm_external_reference_info.reference +_ihm_external_reference_info.refers_to +_ihm_external_reference_info.associated_url +1 . DOI foo Other . 
+# +# +loop_ +_ihm_external_files.id +_ihm_external_files.reference_id +_ihm_external_files.file_path +_ihm_external_files.content_type +_ihm_external_files.file_size_bytes +_ihm_external_files.details +1 1 %s 'Modeling workflow or script' %d +'Integrative modeling Python script' +# +""" % (os.path.basename(__file__), os.stat(__file__).st_size)) + + def test_modeling_protocol(self): + """Test ProtocolDumper""" + system = IMP.mmcif.System() + h, state = self.make_model_with_protocol(system) + state.add_hierarchy(h) + + dumper = IMP.mmcif.dumper._AssemblyDumper() + dumper.finalize(system) # Assign assembly IDs + + dumper = IMP.mmcif.dumper._ProtocolDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_ihm_modeling_protocol.ordinal_id +_ihm_modeling_protocol.protocol_id +_ihm_modeling_protocol.step_id +_ihm_modeling_protocol.struct_assembly_id +_ihm_modeling_protocol.dataset_group_id +_ihm_modeling_protocol.struct_assembly_description +_ihm_modeling_protocol.protocol_name +_ihm_modeling_protocol.step_name +_ihm_modeling_protocol.step_method +_ihm_modeling_protocol.num_models_begin +_ihm_modeling_protocol.num_models_end +_ihm_modeling_protocol.multi_scale_flag +_ihm_modeling_protocol.multi_state_flag +_ihm_modeling_protocol.time_ordered_flag +1 1 1 1 . . . Sampling 'Monte Carlo' 0 500 YES NO NO +2 2 1 1 . . . Sampling 'Replica exchange Molecular Dynamics' 400 2000 YES NO NO +# +""") + + def test_post_process(self): + """Test PostProcessDumper""" + system = IMP.mmcif.System() + h, state = self.make_model_with_protocol(system) + state.add_hierarchy(h) + dumper = IMP.mmcif.dumper._PostProcessDumper() + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_ihm_modeling_post_process.id +_ihm_modeling_post_process.protocol_id +_ihm_modeling_post_process.analysis_id +_ihm_modeling_post_process.step_id +_ihm_modeling_post_process.type +_ihm_modeling_post_process.feature +_ihm_modeling_post_process.num_models_begin +_ihm_modeling_post_process.num_models_end +1 1 1 1 filter energy/score 500 400 +2 2 1 1 cluster RMSD 2000 2000 +# +""") + + +if __name__ == '__main__': + IMP.test.main() diff --git a/modules/mmcif/test/test_metadata.py b/modules/mmcif/test/test_metadata.py new file mode 100644 index 0000000000..e8046b2826 --- /dev/null +++ b/modules/mmcif/test/test_metadata.py @@ -0,0 +1,155 @@ +from __future__ import print_function +import IMP.test +import IMP.mmcif.metadata +import IMP.mmcif.data + +class MockSystem(object): + def __init__(self): + self._external_files = IMP.mmcif.data._ExternalFiles() + self.datasets = IMP.mmcif.data._Datasets(self._external_files) + self._software = IMP.mmcif.data._AllSoftware() + + +class Tests(IMP.test.TestCase): + def _parse_pdb(self, fname, chain): + system = MockSystem() + p = IMP.mmcif.metadata._PDBMetadataParser() + p.parse_file(fname, chain, system) + return p + + def test_official_pdb(self): + """Test PDBMetadataParser when given an official PDB""" + p = self._parse_pdb(self.get_input_file_name('official.pdb'), 'A') + s, = p.sources + self.assertEqual(s.db_code, '2HBJ') + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Experimental model') + self.assertEqual(p.dataset.location.db_name, 'PDB') + self.assertEqual(p.dataset.location.access_code, '2HBJ') + self.assertEqual(p.dataset.location.version, '14-JUN-06') + self.assertEqual(p.dataset.location.details, + 'STRUCTURE OF THE YEAST NUCLEAR EXOSOME COMPONENT, ' + 'RRP6P, REVEALS AN INTERPLAY BETWEEN THE ACTIVE ' + 'SITE AND 
THE HRDC DOMAIN') + + def test_derived_pdb(self): + """Test PDBMetadataParser when given a file derived from a PDB""" + pdbname = self.get_input_file_name('derived_pdb.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Experimental model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'MED7C AND MED21 STRUCTURES FROM PDB ENTRY 1YKH, ' + 'ROTATED AND TRANSLATED TO ALIGN WITH THE ' + 'MED4-MED9 MODEL') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.db_name, 'PDB') + self.assertEqual(parent.location.access_code, '1YKH') + self.assertEqual(parent.location.version, None) + self.assertEqual(parent.location.details, None) + + def test_derived_model(self): + """Test PDBMetadataParser when given a file derived from a model""" + pdbname = self.get_input_file_name('derived_model.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'MED4 AND MED9 STRUCTURE TAKEN FROM LARIVIERE ' + 'ET AL, NUCLEIC ACIDS RESEARCH. 2013;41:9266-9273. ' + 'DOI: 10.1093/nar/gkt704. THE MED10 STRUCTURE ALSO ' + 'PROPOSED IN THAT WORK IS NOT USED IN THIS STUDY.') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Comparative model') + self.assertEqual(parent.location.path, '.') + self.assertEqual(parent.location.repo.doi, '10.1093/nar/gkt704') + self.assertEqual(parent.location.details, + 'Starting comparative model structure') + + def test_modeller_model_aln(self): + """Test PDBMetadataParser when given a Modeller model with alignment""" + pdbname = self.get_input_file_name('modeller_model.pdb') + p = self.check_modeller_model(pdbname) + self.assertEqual(p.alignment_file.path, + self.get_input_file_name('modeller_model.ali')) + + def test_modeller_model_no_aln(self): + "Test PDBMetadataParser when given a Modeller model with no alignment" + pdbname = self.get_input_file_name('modeller_model_no_aln.pdb') + p = self.check_modeller_model(pdbname) + + def check_modeller_model(self, pdbname): + p = self._parse_pdb(pdbname, 'A') + s1, s2 = p.sources + self.assertIs(s1.db_code, None) + self.assertEqual(s1.chain_id, 'A') + self.assertEqual(s1.tm_db_code, '3JRO') + self.assertEqual(s1.tm_chain_id, 'C') + self.assertIs(s2.db_code, None) + self.assertEqual(s2.chain_id, 'A') + self.assertEqual(s2.tm_db_code, '3F3F') + self.assertEqual(s2.tm_chain_id, 'G') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertEqual(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'Starting model structure') + p1, p2 = p.dataset._parents + self.assertEqual(p1._data_type, 'Experimental model') + self.assertEqual(p1.location.db_name, 'PDB') + self.assertEqual(p1.location.access_code, '3JRO') + self.assertEqual(p1.location.version, None) + self.assertEqual(p1.location.details, None) + self.assertEqual(p2.location.access_code, '3F3F') + return p + + def test_modeller_local(self): + "Test PDBMetadataParser when given a Modeller model with local template" + 
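+        # "Local template" here means a template referenced by a
+        # REMARK 6 TEMPLATE PATH record rather than by a PDB code
+        # (see modeller_model_local.pdb)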
pdbname = self.get_input_file_name('modeller_model_local.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertIs(s.tm_db_code, None) + self.assertEqual(s.tm_chain_id, 'C') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.details, + 'Template for comparative modeling') + self.assertEqual(parent.location.path, + self.get_input_file_name('15133C.pdb')) + + def test_phyre2_model(self): + """Test PDBMetadataParser when given a Phyre2 model.""" + pdbname = self.get_input_file_name('phyre2_model.pdb') + p = self._parse_pdb(pdbname, 'A') + s, = p.sources + self.assertIs(s.db_code, None) + self.assertEqual(s.chain_id, 'A') + self.assertEqual(s.tm_db_code, '4BZK') + self.assertEqual(s.tm_chain_id, 'A') + self.assertEqual(p.dataset._data_type, 'Comparative model') + self.assertEqual(p.dataset.location.path, pdbname) + self.assertIs(p.dataset.location.repo, None) + self.assertEqual(p.dataset.location.details, + 'Starting model structure') + parent, = p.dataset._parents + self.assertEqual(parent._data_type, 'Experimental model') + self.assertEqual(parent.location.db_name, 'PDB') + self.assertEqual(parent.location.access_code, '4BZK') + self.assertIs(parent.location.version, None) + self.assertIs(parent.location.details, None) + + + +if __name__ == '__main__': + IMP.test.main()
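Taken together, util.py above defines the module's top-level API: System and
State, which IMP_mmcif.init.i re-exports as IMP.mmcif.System and
IMP.mmcif.State. A minimal usage sketch of that API, as exercised by the
tests; the file names and the citation/software details below are
placeholders, not part of this patch:

    import IMP.mmcif

    system = IMP.mmcif.System()

    # Each State owns its own IMP.Model; coordinates are loaded from RMF
    # (only frame 0 of each file is read at present)
    state = IMP.mmcif.State(system)
    state.add_rmf_files(["cluster0.rmf3"])  # placeholder file name

    # Optional metadata, written out by _CitationDumper and _SoftwareDumper
    system.add_citation(pmid='12345678', title='Example study',
                        journal='J Mol Biol', volume=1, page_range=(1, 10),
                        year=2017, authors=['Smith J', 'Jones A'],
                        doi='10.1000/example')
    system.add_software(name='example-tool', classification='sampling',
                        description='Example sampling program',
                        url='http://example.com')

    # Run every dumper in order and write a single mmCIF file
    system.write('output.cif')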