Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split off a base parser from read_smiles #49

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 129 additions & 63 deletions pysmiles/read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,17 @@ def _tokenize(smiles):

Yields
------
tuple(TokenType, str)
A tuple describing the type of token and the associated data
tuple(TokenType, int, str)
A tuple describing the type of token, its position, and the associated
data.
"""
organic_subset = 'B C N O P S F Cl Br I * b c n o s p'.split()
smiles = iter(smiles)
token = ''
idx = -1
peek = None
while True:
idx += 1
char = peek if peek else next(smiles, '')
peek = None
if not char:
Expand All @@ -73,86 +76,82 @@ def _tokenize(smiles):
token += char
if char == ']':
break
yield TokenType.ATOM, token
yield TokenType.ATOM, idx, token
elif char in organic_subset:
peek = next(smiles, '')
if char + peek in organic_subset:
yield TokenType.ATOM, char + peek
yield TokenType.ATOM, idx, char + peek
peek = None
else:
yield TokenType.ATOM, char
yield TokenType.ATOM, idx, char
elif char in '-=#$:.':
yield TokenType.BOND_TYPE, char
yield TokenType.BOND_TYPE, idx, char
elif char == '(':
yield TokenType.BRANCH_START, '('
yield TokenType.BRANCH_START, idx, '('
elif char == ')':
yield TokenType.BRANCH_END, ')'
yield TokenType.BRANCH_END, idx, ')'
elif char == '%':
# If smiles is too short this will raise a ValueError, which is
# (slightly) prettier than a StopIteration.
yield TokenType.RING_NUM, int(next(smiles, '') + next(smiles, ''))
yield TokenType.RING_NUM, idx, int(next(smiles, '') + next(smiles, ''))
elif char in '/\\':
yield TokenType.EZSTEREO, char
yield TokenType.EZSTEREO, idx, char
elif char.isdigit(): # pragma: no branch
yield TokenType.RING_NUM, int(char)
yield TokenType.RING_NUM, idx, int(char)


def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
reinterpret_aromatic=True, strict=True):
def base_smiles_parser(smiles, strict=True, node_attr='desc', edge_attr='desc'):
"""
Parses a SMILES string.
Parse but do not interpret a SMILES string to a graph. Nodes and edges will
be annotated with their constructing string description as ``node_attr`` and
``edge_attr``. The indices of these strings are annotated in the ``'_pos'``
attribute. Finally, the smiles string itself is annotated as a graph
attribute under ``'smiles'``.

This functions also returns stereochemical information that cannot be
directly annotated in the graph, since it depends on interpreting nodes and
edges.

Parameters
----------
smiles : iterable
The SMILES string to parse. Should conform to the OpenSMILES
specification.
explicit_hydrogen : bool
Whether hydrogens should be explicit nodes in the output graph, or be
implicit in 'hcount' attributes.
zero_order_bonds : bool
Whether zero-order bonds (".") should be added as edges with an order of
0.
reinterpret_aromatic : bool
Whether aromaticity should be determined from the created molecule,
instead of taken from the SMILES string.
strict : bool
pckroon marked this conversation as resolved.
Show resolved Hide resolved
Whether to be more strict in accepting what is a valid SMILES string and
what is not.
node_attr : collections.abc.Hashable
edge_attr : collections.abc.Hashable

Returns
-------
nx.Graph
A graph describing a molecule. Nodes will have an 'element', 'aromatic'
and a 'charge', and if `explicit_hydrogen` is False a 'hcount'.
Depending on the input, they will also have 'isotope' and 'class'
information.
Edges will have an 'order'.
The created graph. Nodes are annotated with the string/token that define
their attributes in the ``node_attr`` key, edges in the ``edge_attr``
key. The indices at which these strings can be found is annotated as
``_pos``.
List[Tuple[Tuple[int, int, str], Tuple[int, int, str]]]
A list of E/Z isomer pairs, where the integers are node indices and last
element is the directional token from the smiles string (*i.e.* / or \).
List[Tuple[int, int, {edge_attr: str, '_pos': int}]
A list of created ring bonds, to be used for R/S isomer annotation. It
contains tuples consisting of 2 node indices, and a dictionary
containing the edge attributes.
"""
bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
mol = nx.Graph()
mol = nx.Graph(smiles=smiles)
anchor = None
idx = 0
default_bond = 1
default_aromatic_bond = 1.5
next_bond = None
branches = []
ring_nums = {}
current_ez = None
ez_isomer_pairs = []
created_ring_bonds = []
prev_token = None
prev_type = None
ez_isomer_pairs = []
for tokentype, token in _tokenize(smiles):
for tokentype, token_idx, token in _tokenize(smiles):
if tokentype == TokenType.ATOM:
mol.add_node(idx, **parse_atom(token))
mol.add_node(idx, **{node_attr: token, '_pos': token_idx})
if anchor is not None:
if next_bond is None:
if mol.nodes[anchor].get('aromatic') and mol.nodes[idx].get('aromatic'):
next_bond = default_aromatic_bond
else:
next_bond = default_bond
if next_bond or zero_order_bonds:
mol.add_edge(anchor, idx, order=next_bond)
next_bond = ""
mol.add_edge(anchor, idx, **{edge_attr: next_bond, '_pos': token_idx})
next_bond = None
anchor = idx
idx += 1
Expand All @@ -164,15 +163,12 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
if next_bond is not None:
raise ValueError('Previous bond (order {}) not used. '
'Overwritten by "{}"'.format(next_bond, token))
next_bond = bond_to_order[token]
next_bond = token
elif tokentype == TokenType.RING_NUM:
if token in ring_nums:
jdx, order = ring_nums[token]
if next_bond is None and order is None:
if mol.nodes[idx-1].get('aromatic') and mol.nodes[jdx].get('aromatic'):
next_bond = default_aromatic_bond
else:
next_bond = default_bond
next_bond = ""
elif order is None: # Note that the check is needed,
next_bond = next_bond # But this could be pass.
elif next_bond is None:
Expand All @@ -181,23 +177,18 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
raise ValueError('Conflicting bond orders for ring '
'between indices {}'.format(token))
# idx is the index of the *next* atom we're adding. So: -1.
if mol.has_edge(idx-1, jdx):
if mol.has_edge(idx - 1, jdx):
raise ValueError('Edge specified by marker {} already '
'exists'.format(token))
if idx-1 == jdx:
if idx - 1 == jdx:
raise ValueError('Marker {} specifies a bond between an '
'atom and itself'.format(token))
if next_bond or zero_order_bonds:
mol.add_edge(idx - 1, jdx, order=next_bond)
# we need to keep track of ring bonds here for the
# chirality assignment
if mol.nodes[idx-1].get('rs_isomer', False):
mol.nodes[idx-1]['rs_isomer'][1].append(jdx)
if mol.nodes[jdx].get('rs_isomer', False):
mol.nodes[jdx]['rs_isomer'][1].append(idx-1)

mol.add_edge(idx - 1, jdx, **{edge_attr: next_bond, '_pos': token_idx})
next_bond = None
del ring_nums[token]
# we need to keep track of ring bonds here for the
# chirality assignment
created_ring_bonds.append((idx-1, jdx, {edge_attr: next_bond, '_pos': token_idx}))
else:
if idx == 0:
raise ValueError("Can't have a marker ({}) before an atom"
Expand Down Expand Up @@ -229,6 +220,64 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
if current_ez and strict:
raise ValueError('There is an unmatched stereochemical token.')

return mol, ez_isomer_pairs, created_ring_bonds

def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
reinterpret_aromatic=True, strict=True):
"""
Parses a SMILES string.

Parameters
----------
smiles : iterable
The SMILES string to parse. Should conform to the OpenSMILES
specification.
explicit_hydrogen : bool
Whether hydrogens should be explicit nodes in the output graph, or be
implicit in 'hcount' attributes.
zero_order_bonds : bool
Whether zero-order bonds (".") should be added as edges with an order of
0.
reinterpret_aromatic : bool
Whether aromaticity should be determined from the created molecule,
instead of taken from the SMILES string.
strict : bool
Whether to be more strict in accepting what is a valid SMILES string and
what is not.

Returns
-------
nx.Graph
A graph describing a molecule. Nodes will have an 'element', 'aromatic'
and a 'charge', and if `explicit_hydrogen` is False a 'hcount'.
Depending on the input, they will also have 'isotope' and 'class'
information.
Edges will have an 'order'.
"""
bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
default_bond = 1
default_aromatic_bond = 1.5
mol, ez_isomer_pairs, ring_bonds = base_smiles_parser(smiles, strict=strict,
node_attr='_atom_str', edge_attr='_bond_str')
for node in mol:
mol.nodes[node].update(parse_atom(mol.nodes[node]['_atom_str']))
for idx, jdx, attrs in ring_bonds:
if mol.nodes[idx].get('rs_isomer'):
mol.nodes[idx]['rs_isomer'][1].append(jdx)
if mol.nodes[jdx].get('rs_isomer'):
mol.nodes[jdx]['rs_isomer'][1].append(idx)
for edge in mol.edges:
if mol.edges[edge]['_bond_str']:
order = bond_to_order[mol.edges[edge]['_bond_str']]
if not order and not zero_order_bonds:
mol.remove_edge(*edge)
continue
mol.edges[edge]['order'] = order
elif mol.nodes[edge[0]].get('aromatic') and mol.nodes[edge[1]].get('aromatic'):
mol.edges[edge]['order'] = default_aromatic_bond
else:
mol.edges[edge]['order'] = default_bond

Comment on lines +262 to +280
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If one were to make a bold suggestion: These could be put into functions as well such that one can mix and match a read_smiles interpreter function. Maybe even decorators? Something like an abstract interpret_graph method.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea, but I need to think on it a little bit. How to organise the code and such. I suggest leaving that for a next PR

if reinterpret_aromatic:
correct_aromatic_rings(mol, strict=strict)

Expand All @@ -246,9 +295,26 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
else:
LOGGER.warning(msg)
elif element != '*' and bonds_missing(mol, node):
debug_smiles = write_smiles_component(nx.ego_graph(mol, node))
attrs = mol.nodes[node]
# Grab some of the smiles string for debugging purposes
surrounding_characters = 5
node_position = (attrs['_pos'] - surrounding_characters,
attrs['_pos'] + len(attrs['_atom_str']) + surrounding_characters)
if node_position[0] < 0:
left_idx = 0
prefix = ''
else:
left_idx = node_position[0]
prefix = '...'
if node_position[1] >= len(smiles):
right_idx = len(smiles) - 1
suffix = ''
else:
right_idx = node_position[1]
suffix = '...'
debug_smiles = prefix + smiles[left_idx:right_idx] + suffix
msg = (f'Node {node} ({format_atom(mol, node)}) has non-standard'
f' valence: ...{debug_smiles}...')
f' valence: {debug_smiles}')
if strict:
raise KeyError(msg)
else:
Expand Down
17 changes: 12 additions & 5 deletions pysmiles/testhelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"""
Contains a unittest.TestCase subclass that has an `assertEqualGraphs` method.
"""

from functools import partial
import operator
import unittest

Expand All @@ -29,12 +29,19 @@ def make_mol(node_data, edge_data):
return mol


def assertEqualGraphs(graph1, graph2): # pylint: disable=invalid-name
def _equal_dicts(dict1, dict2, excluded_keys=[]):
dict1_keys = set(dict1.keys()) - set(excluded_keys)
dict2_keys = set(dict2.keys()) - set(excluded_keys)
return dict1_keys == dict2_keys and {k1: dict1[k1] for k1 in dict1_keys} == {k2: dict2[k2] for k2 in dict1_keys}


def assertEqualGraphs(graph1, graph2, excluded_node_attrs=[], excluded_edge_attrs=[]): # pylint: disable=invalid-name
"""
Asserts that `graph1` and `graph2` are equal up to isomorphism.
"""
out = nx.is_isomorphic(graph1, graph2,
node_match=operator.eq, edge_match=operator.eq)
node_match=partial(_equal_dicts, excluded_keys=excluded_node_attrs),
edge_match=partial(_equal_dicts, excluded_keys=excluded_edge_attrs))
if out:
return

Expand All @@ -51,15 +58,15 @@ def assertEqualGraphs(graph1, graph2): # pylint: disable=invalid-name
for node_idx, node_jdx in match.items():
node1 = graph1.nodes[node_idx]
node2 = graph2.nodes[node_idx]
for attr in set(node1) | set(node2):
for attr in (set(node1) | set(node2)) - set(excluded_node_attrs):
if node1.get(attr) == node2.get(attr):
score += 1

for idx1, jdx1 in graph1.edges:
idx2, jdx2 = match[idx1], match[jdx1]
edge1 = graph1.edges[idx1, jdx1]
edge2 = graph2.edges[idx2, jdx2]
for attr in set(edge1) | set(edge2):
for attr in (set(edge1) | set(edge2)) - set(excluded_edge_attrs):
if edge1.get(attr) == edge2.get(attr):
score += 1
scores.append((score, m_idx))
Expand Down
2 changes: 1 addition & 1 deletion tests/test_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,6 @@ def write_read_cycle(self):
correct_aromatic_rings(reference)
# note(found.nodes(data=True))
# note(found.edges(data=True))
assertEqualGraphs(reference, found)
assertEqualGraphs(reference, found, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])

Tester = SMILESTest.TestCase
4 changes: 3 additions & 1 deletion tests/test_read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ def test_read_smiles(smiles, node_data, edge_data, explicit_h):
print(found.edges(data=True))
print(smiles)
expected = make_mol(node_data, edge_data)
assertEqualGraphs(found, expected)
assertEqualGraphs(found, expected, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])

@pytest.mark.parametrize('smiles, error_type', (
('[CL-]', ValueError),
Expand All @@ -927,6 +927,8 @@ def test_read_smiles(smiles, node_data, edge_data, explicit_h):
('F/C=CF', ValueError),
('C=.C', ValueError),
('C[O@]C', ValueError),
('CC(C)(=C)C', KeyError),
('CCCCCCCC$C', KeyError),
))
def test_invalid_smiles(smiles, error_type):
with pytest.raises(error_type):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_write_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,4 @@ def test_write_smiles(node_data, edge_data, kwargs):
print(smiles)
print(kwargs)
found = read_smiles(smiles, **kwargs)
assertEqualGraphs(mol, found)
assertEqualGraphs(mol, found, excluded_node_attrs=['_pos', '_atom_str'], excluded_edge_attrs=['_pos', '_bond_str'])
Loading