pckroon · pckroon · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/pysmiles/read_smiles.py b/pysmiles/read_smiles.py
@@ -55,14 +55,17 @@ def _tokenize(smiles):
 
     Yields
     ------
-    tuple(TokenType, str)
-        A tuple describing the type of token and the associated data
+    tuple(TokenType, int, str)
+        A tuple describing the type of token, its position, and the associated
+        data.
     """
     organic_subset = 'B C N O P S F Cl Br I * b c n o s p'.split()
     smiles = iter(smiles)
     token = ''
+    idx = -1
     peek = None
     while True:
+        idx += 1
         char = peek if peek else next(smiles, '')
         peek = None
         if not char:
@@ -73,86 +76,82 @@ def _tokenize(smiles):
                 token += char
                 if char == ']':
                     break
-            yield TokenType.ATOM, token
+            yield TokenType.ATOM, idx, token
         elif char in organic_subset:
             peek = next(smiles, '')
             if char + peek in organic_subset:
-                yield TokenType.ATOM, char + peek
+                yield TokenType.ATOM, idx, char + peek
                 peek = None
             else:
-                yield TokenType.ATOM, char
+                yield TokenType.ATOM, idx, char
         elif char in '-=#$:.':
-            yield TokenType.BOND_TYPE, char
+            yield TokenType.BOND_TYPE, idx, char
         elif char == '(':
-            yield TokenType.BRANCH_START, '('
+            yield TokenType.BRANCH_START, idx, '('
         elif char == ')':
-            yield TokenType.BRANCH_END, ')'
+            yield TokenType.BRANCH_END, idx, ')'
         elif char == '%':
             # If smiles is too short this will raise a ValueError, which is
             # (slightly) prettier than a StopIteration.
-            yield TokenType.RING_NUM, int(next(smiles, '') + next(smiles, ''))
+            yield TokenType.RING_NUM, idx, int(next(smiles, '') + next(smiles, ''))
         elif char in '/\\':
-            yield TokenType.EZSTEREO, char
+            yield TokenType.EZSTEREO, idx, char
         elif char.isdigit():  # pragma: no branch
-            yield TokenType.RING_NUM, int(char)
+            yield TokenType.RING_NUM, idx, int(char)
 
 
-def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, 
-                reinterpret_aromatic=True, strict=True):
+def base_smiles_parser(smiles, strict=True, node_attr='desc', edge_attr='desc'):
     """
-    Parses a SMILES string.
+    Parse but do not interpret a SMILES string to a graph. Nodes and edges will
+    be annotated with their constructing string description as ``node_attr`` and
+    ``edge_attr``. The indices of these strings are annotated in the ``'_pos'``
+    attribute. Finally, the smiles string itself is annotated as a graph
+    attribute under ``'smiles'``.
+
+    This functions also returns stereochemical information that cannot be
+    directly annotated in the graph, since it depends on interpreting nodes and
+    edges.
 
     Parameters
     ----------
     smiles : iterable
-        The SMILES string to parse. Should conform to the OpenSMILES
-        specification.
-    explicit_hydrogen : bool
-        Whether hydrogens should be explicit nodes in the output graph, or be
-        implicit in 'hcount' attributes.
-    zero_order_bonds : bool
-        Whether zero-order bonds (".") should be added as edges with an order of
-        0.
-    reinterpret_aromatic : bool
-        Whether aromaticity should be determined from the created molecule,
-        instead of taken from the SMILES string.
     strict : bool
-        Whether to be more strict in accepting what is a valid SMILES string and
-        what is not.
+    node_attr : collections.abc.Hashable
+    edge_attr : collections.abc.Hashable
 
     Returns
     -------
     nx.Graph
-        A graph describing a molecule. Nodes will have an 'element', 'aromatic'
-        and a 'charge', and if `explicit_hydrogen` is False a 'hcount'.
-        Depending on the input, they will also have 'isotope' and 'class'
-        information.
-        Edges will have an 'order'.
+        The created graph. Nodes are annotated with the string/token that define
+        their attributes in the ``node_attr`` key, edges in the ``edge_attr``
+        key. The indices at which these strings can be found is annotated as
+        ``_pos``.
+    List[Tuple[Tuple[int, int, str], Tuple[int, int, str]]]
+        A list of E/Z isomer pairs, where the integers are node indices and last
+        element is the directional token from the smiles string (*i.e.* / or \).
+    List[Tuple[int, int, {edge_attr: str, '_pos': int}]
+        A list of created ring bonds, to be used for R/S isomer annotation. It
+        contains tuples consisting of 2 node indices, and a dictionary
+        containing the edge attributes.
     """
-    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
-    mol = nx.Graph()
+    mol = nx.Graph(smiles=smiles)
     anchor = None
     idx = 0
-    default_bond = 1
-    default_aromatic_bond = 1.5
     next_bond = None
     branches = []
     ring_nums = {}
     current_ez = None
+    ez_isomer_pairs = []
+    created_ring_bonds = []
     prev_token = None
     prev_type = None
-    ez_isomer_pairs = []
-    for tokentype, token in _tokenize(smiles):
+    for tokentype, token_idx, token in _tokenize(smiles):
         if tokentype == TokenType.ATOM:
-            mol.add_node(idx, **parse_atom(token))
+            mol.add_node(idx, **{node_attr: token, '_pos': token_idx})
             if anchor is not None:
                 if next_bond is None:
-                    if mol.nodes[anchor].get('aromatic') and mol.nodes[idx].get('aromatic'):
-                        next_bond = default_aromatic_bond
-                    else:
-                        next_bond = default_bond
-                if next_bond or zero_order_bonds:
-                    mol.add_edge(anchor, idx, order=next_bond)
+                    next_bond = ""
+                mol.add_edge(anchor, idx, **{edge_attr: next_bond, '_pos': token_idx})
                 next_bond = None
             anchor = idx
             idx += 1
@@ -164,15 +163,12 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
             if next_bond is not None:
                 raise ValueError('Previous bond (order {}) not used. '
                                  'Overwritten by "{}"'.format(next_bond, token))
-            next_bond = bond_to_order[token]
+            next_bond = token
         elif tokentype == TokenType.RING_NUM:
             if token in ring_nums:
                 jdx, order = ring_nums[token]
                 if next_bond is None and order is None:
-                    if mol.nodes[idx-1].get('aromatic') and mol.nodes[jdx].get('aromatic'):
-                        next_bond = default_aromatic_bond
-                    else:
-                        next_bond = default_bond
+                    next_bond = ""
                 elif order is None:  # Note that the check is needed,
                     next_bond = next_bond  # But this could be pass.
                 elif next_bond is None:
@@ -181,23 +177,18 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
                     raise ValueError('Conflicting bond orders for ring '
                                      'between indices {}'.format(token))
                 # idx is the index of the *next* atom we're adding. So: -1.
-                if mol.has_edge(idx-1, jdx):
+                if mol.has_edge(idx - 1, jdx):
                     raise ValueError('Edge specified by marker {} already '
                                      'exists'.format(token))
-                if idx-1 == jdx:
+                if idx - 1 == jdx:
                     raise ValueError('Marker {} specifies a bond between an '
                                      'atom and itself'.format(token))
-                if next_bond or zero_order_bonds:
-                    mol.add_edge(idx - 1, jdx, order=next_bond)
-                    # we need to keep track of ring bonds here for the
-                    # chirality assignment
-                    if mol.nodes[idx-1].get('rs_isomer', False):
-                        mol.nodes[idx-1]['rs_isomer'][1].append(jdx)
-                    if mol.nodes[jdx].get('rs_isomer', False):
-                        mol.nodes[jdx]['rs_isomer'][1].append(idx-1)
-
+                mol.add_edge(idx - 1, jdx, **{edge_attr: next_bond, '_pos': token_idx})
                 next_bond = None
                 del ring_nums[token]
+                # we need to keep track of ring bonds here for the
+                # chirality assignment
+                created_ring_bonds.append((idx-1, jdx, {edge_attr: next_bond, '_pos': token_idx}))
             else:
                 if idx == 0:
                     raise ValueError("Can't have a marker ({}) before an atom"
@@ -229,6 +220,64 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
     if current_ez and strict:
         raise ValueError('There is an unmatched stereochemical token.')
 
+    return mol, ez_isomer_pairs, created_ring_bonds
+
+def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, 
+                reinterpret_aromatic=True, strict=True):
+    """
+    Parses a SMILES string.
+
+    Parameters
+    ----------
+    smiles : iterable
+        The SMILES string to parse. Should conform to the OpenSMILES
+        specification.
+    explicit_hydrogen : bool
+        Whether hydrogens should be explicit nodes in the output graph, or be
+        implicit in 'hcount' attributes.
+    zero_order_bonds : bool
+        Whether zero-order bonds (".") should be added as edges with an order of
+        0.
+    reinterpret_aromatic : bool
+        Whether aromaticity should be determined from the created molecule,
+        instead of taken from the SMILES string.
+    strict : bool
+        Whether to be more strict in accepting what is a valid SMILES string and
+        what is not.
+
+    Returns
+    -------
+    nx.Graph
+        A graph describing a molecule. Nodes will have an 'element', 'aromatic'
+        and a 'charge', and if `explicit_hydrogen` is False a 'hcount'.
+        Depending on the input, they will also have 'isotope' and 'class'
+        information.
+        Edges will have an 'order'.
+    """
+    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
+    default_bond = 1
+    default_aromatic_bond = 1.5
+    mol, ez_isomer_pairs, ring_bonds = base_smiles_parser(smiles, strict=strict,
+                                                          node_attr='_atom_str', edge_attr='_bond_str')
+    for node in mol:
+        mol.nodes[node].update(parse_atom(mol.nodes[node]['_atom_str']))
+    for idx, jdx, attrs in ring_bonds:
+        if mol.nodes[idx].get('rs_isomer'):
+            mol.nodes[idx]['rs_isomer'][1].append(jdx)
+        if mol.nodes[jdx].get('rs_isomer'):
+            mol.nodes[jdx]['rs_isomer'][1].append(idx)
+    for edge in mol.edges:
+        if mol.edges[edge]['_bond_str']:
+            order = bond_to_order[mol.edges[edge]['_bond_str']]
+            if not order and not zero_order_bonds:
+                mol.remove_edge(*edge)
+                continue
+            mol.edges[edge]['order'] = order
+        elif mol.nodes[edge[0]].get('aromatic') and mol.nodes[edge[1]].get('aromatic'):
+            mol.edges[edge]['order'] = default_aromatic_bond
+        else:
+            mol.edges[edge]['order'] = default_bond
+
     if reinterpret_aromatic:
         correct_aromatic_rings(mol, strict=strict)
 
@@ -246,9 +295,26 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
             else:
                 LOGGER.warning(msg)
         elif element != '*' and bonds_missing(mol, node):
-            debug_smiles = write_smiles_component(nx.ego_graph(mol, node))
+            attrs = mol.nodes[node]
+            # Grab some of the smiles string for debugging purposes
+            surrounding_characters = 5
+            node_position = (attrs['_pos'] - surrounding_characters,
+                             attrs['_pos'] + len(attrs['_atom_str']) + surrounding_characters)
+            if node_position[0] < 0:
+                left_idx = 0
+                prefix = ''
+            else:
+                left_idx = node_position[0]
+                prefix = '...'
+            if node_position[1] >= len(smiles):
+                right_idx = len(smiles) - 1
+                suffix = ''
+            else:
+                right_idx = node_position[1]
+                suffix = '...'
+            debug_smiles = prefix + smiles[left_idx:right_idx] + suffix
             msg = (f'Node {node} ({format_atom(mol, node)}) has non-standard'
-                   f' valence: ...{debug_smiles}...')
+                   f' valence: {debug_smiles}')
             if strict:
                 raise KeyError(msg)
             else:

diff --git a/pysmiles/testhelper.py b/pysmiles/testhelper.py
@@ -15,7 +15,7 @@
 """
 Contains a unittest.TestCase subclass that has an `assertEqualGraphs` method.
 """
-
+from functools import partial
 import operator
 import unittest
 
@@ -29,12 +29,19 @@ def make_mol(node_data, edge_data):
     return mol
 
 
-def assertEqualGraphs(graph1, graph2):  # pylint: disable=invalid-name
+def _equal_dicts(dict1, dict2, excluded_keys=[]):
+    dict1_keys = set(dict1.keys()) - set(excluded_keys)
+    dict2_keys = set(dict2.keys()) - set(excluded_keys)
+    return dict1_keys == dict2_keys and {k1: dict1[k1] for k1 in dict1_keys} == {k2: dict2[k2] for k2 in dict1_keys}
+
+
+def assertEqualGraphs(graph1, graph2, excluded_node_attrs=[], excluded_edge_attrs=[]):  # pylint: disable=invalid-name
     """
     Asserts that `graph1` and `graph2` are equal up to isomorphism.
     """
     out = nx.is_isomorphic(graph1, graph2,
-                           node_match=operator.eq, edge_match=operator.eq)
+                           node_match=partial(_equal_dicts, excluded_keys=excluded_node_attrs),
+                           edge_match=partial(_equal_dicts, excluded_keys=excluded_edge_attrs))
     if out:
         return
 
@@ -51,15 +58,15 @@ def assertEqualGraphs(graph1, graph2):  # pylint: disable=invalid-name
         for node_idx, node_jdx in match.items():
             node1 = graph1.nodes[node_idx]
             node2 = graph2.nodes[node_idx]
-            for attr in set(node1) | set(node2):
+            for attr in (set(node1) | set(node2)) - set(excluded_node_attrs):
                 if node1.get(attr) == node2.get(attr):
                     score += 1
 
         for idx1, jdx1 in graph1.edges:
             idx2, jdx2 = match[idx1], match[jdx1]
             edge1 = graph1.edges[idx1, jdx1]
             edge2 = graph2.edges[idx2, jdx2]
-            for attr in set(edge1) | set(edge2):
+            for attr in (set(edge1) | set(edge2)) - set(excluded_edge_attrs):
                 if edge1.get(attr) == edge2.get(attr):
                     score += 1
         scores.append((score, m_idx))

diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py
@@ -161,6 +161,6 @@ def write_read_cycle(self):
         correct_aromatic_rings(reference)
         # note(found.nodes(data=True))
         # note(found.edges(data=True))
-        assertEqualGraphs(reference, found)
+        assertEqualGraphs(reference, found, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])
 
 Tester = SMILESTest.TestCase
diff --git a/tests/test_read_smiles.py b/tests/test_read_smiles.py
@@ -911,7 +911,7 @@ def test_read_smiles(smiles, node_data, edge_data, explicit_h):
     print(found.edges(data=True))
     print(smiles)
     expected = make_mol(node_data, edge_data)
-    assertEqualGraphs(found, expected)
+    assertEqualGraphs(found, expected, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])
 
 @pytest.mark.parametrize('smiles, error_type', (
     ('[CL-]', ValueError),
@@ -927,6 +927,8 @@ def test_read_smiles(smiles, node_data, edge_data, explicit_h):
     ('F/C=CF', ValueError),
     ('C=.C', ValueError),
     ('C[O@]C', ValueError),
+    ('CC(C)(=C)C', KeyError),
+    ('CCCCCCCC$C', KeyError),
 ))
 def test_invalid_smiles(smiles, error_type):
     with pytest.raises(error_type):

diff --git a/tests/test_write_smiles.py b/tests/test_write_smiles.py
@@ -157,4 +157,4 @@ def test_write_smiles(node_data, edge_data, kwargs):
     print(smiles)
     print(kwargs)
     found = read_smiles(smiles, **kwargs)
-    assertEqualGraphs(mol, found)
+    assertEqualGraphs(mol, found, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])