diff --git a/pyext/src/mmcif.py b/pyext/src/mmcif.py index 25651769..519ba8b4 100644 --- a/pyext/src/mmcif.py +++ b/pyext/src/mmcif.py @@ -38,6 +38,7 @@ import ihm.representation import ihm.geometry import ihm.cross_linkers +import ihm.reference def _assign_id(obj, seen_objs, obj_by_id): @@ -1082,7 +1083,7 @@ def _get_alphabet(self, alphabet): else: raise TypeError("Don't know how to handle %s" % alphabet) - def add(self, component_name, sequence, offset, alphabet): + def add(self, component_name, sequence, offset, alphabet, uniprot): def entity_seq(sequence): # Map X to UNK if 'X' in sequence: @@ -1095,7 +1096,8 @@ def entity_seq(sequence): d = component_name.split("@")[0].split(".")[0] entity = Entity(entity_seq(sequence), description=d, pmi_offset=offset, - alphabet=self._get_alphabet(alphabet)) + alphabet=self._get_alphabet(alphabet), + uniprot=uniprot) self.system.entities.append(entity) self._sequence_dict[sequence] = entity self[component_name] = self._sequence_dict[sequence] @@ -1198,11 +1200,18 @@ class Entity(ihm.Entity): removed). The actual offset (which is the integer to be added to the IHM numbering to get PMI numbering, or equivalently the number of not-represented N-terminal residues in the PMI sequence) is - available in the `pmi_offset` member.""" - def __init__(self, sequence, pmi_offset, *args, **keys): + available in the `pmi_offset` member. + + If a UniProt accession was provided for the sequence (either when + State.create_molecule() was called, or in the FASTA alignment file + header) then that is available in the `uniprot` member, and can be + added to the IHM system with the add_uniprot_reference method. + """ + def __init__(self, sequence, pmi_offset, uniprot, *args, **keys): # Offset between PMI numbering and IHM; = + pmi_offset # (pmi_offset is also the number of N-terminal gaps in the FASTA file) self.pmi_offset = pmi_offset + self.uniprot = uniprot super().__init__(sequence, *args, **keys) def pmi_residue(self, res_id): @@ -1214,6 +1223,24 @@ def pmi_range(self, res_id_begin, res_id_end): off = self.pmi_offset return self(res_id_begin - off, res_id_end - off) + def add_uniprot_reference(self): + """Add UniProt accession (if available) to the IHM system. + If a UniProt accession was provided for the sequence (either when + State.create_molecule() was called, or in the FASTA alignment file + header), then look this up at the UniProt web site (requires + network access) to get full information, and add it to the IHM + system. The resulting reference object is returned. If the IMP + and UniProt sequences are not identical, then this object may + need to be modified by specifying an alignment and/or + single-point mutations. + """ + if self.uniprot: + print('Adding UniProt accession %s reference for entity %s' + % (self.uniprot, self.description)) + ref = ihm.reference.UniProtSequence.from_accession(self.uniprot) + self.references.append(ref) + return ref + class AsymUnit(ihm.AsymUnit): """A single asymmetric unit in the system. This roughly corresponds to @@ -1397,7 +1424,7 @@ def create_component(self, state, name, modeled, asym_name=None): self.all_modeled_components.append(name) def add_component_sequence(self, state, name, seq, asym_name=None, - alphabet=None): + alphabet=None, uniprot=None): if asym_name is None: asym_name = name @@ -1409,7 +1436,7 @@ def add_component_sequence(self, state, name, seq, asym_name=None, # Offset is always zero to start with; this may be modified # later in finalize_build() if any non-modeled N-terminal # residues are removed - self.entities.add(name, seq, 0, alphabet) + self.entities.add(name, seq, 0, alphabet, uniprot) if asym_name in self.asym_units: if self.asym_units[asym_name] is None: # Set up a new asymmetric unit for this component diff --git a/pyext/src/topology/__init__.py b/pyext/src/topology/__init__.py index 80c24b2f..2ced305e 100644 --- a/pyext/src/topology/__init__.py +++ b/pyext/src/topology/__init__.py @@ -809,7 +809,8 @@ def _build_protocol_output(self): asym_name=self._name_with_copy) po.add_component_sequence(state, name, self.sequence, asym_name=self._name_with_copy, - alphabet=self.alphabet) + alphabet=self.alphabet, + uniprot=self.uniprot) def _finalize_build(self): # For clones, pass the representation of the original molecule diff --git a/test/test_mmcif.py b/test/test_mmcif.py index 31a755fd..94f55501 100644 --- a/test/test_mmcif.py +++ b/test/test_mmcif.py @@ -315,21 +315,43 @@ def test_cif_entities(self): """Test _EntityMapper class""" system = ihm.System() c = IMP.pmi.mmcif._EntityMapper(system) - c.add('foo', 'MELS', 0, alphabet=None) - c.add('bar', 'SELM', 0, alphabet=IMP.pmi.alphabets.amino_acid) - c.add('foo_2', 'MELS', 0, alphabet=None) + c.add('foo', 'MELS', 0, alphabet=None, uniprot=None) + c.add('bar', 'SELM', 0, alphabet=IMP.pmi.alphabets.amino_acid, + uniprot='baracc') + c.add('foo_2', 'MELS', 0, alphabet=None, uniprot=None) self.assertRaises(TypeError, c.add, 'baz', 'MELSXX', 0, - alphabet='garbage') + alphabet='garbage', uniprot=None) self.assertEqual(len(system.entities), 2) self.assertIs(c['foo'], c['foo_2']) self.assertIsNot(c['foo'], c['bar']) a = system.entities self.assertEqual(len(a), 2) self.assertEqual(a[0].description, 'foo') + self.assertIsNone(a[0].uniprot) self.assertEqual(''.join(x.code for x in a[0].sequence), 'MELS') self.assertEqual(a[1].description, 'bar') + self.assertEqual(a[1].uniprot, 'baracc') self.assertEqual(''.join(x.code for x in a[1].sequence), 'SELM') + def test_entity_add_uniprot_reference(self): + """Test Entity.add_uniprot_reference()""" + system = ihm.System() + c = IMP.pmi.mmcif._EntityMapper(system) + c.add('foo', 'MELS', 0, alphabet=None, uniprot=None) + c.add('bar', 'SELM', 0, alphabet=None, uniprot='baracc') + # Mock out UniProtSequence.from_accession + orig = ihm.reference.UniProtSequence.from_accession + def mock_from_acc(acc): + return "mock+" + acc + try: + ihm.reference.UniProtSequence.from_accession = mock_from_acc + ref = c['foo'].add_uniprot_reference() + self.assertIsNone(ref) + ref = c['bar'].add_uniprot_reference() + self.assertEqual(ref, 'mock+baracc') + finally: + ihm.reference.UniProtSequence.from_accession = orig + def test_all_datasets_all_group(self): """Test AllDatasets.get_all_group()""" s = ihm.System()