Skip to content

Commit

Permalink
Use repr() to properly handle MOLBLOCK identifiers (#544)
Browse files Browse the repository at this point in the history
  • Loading branch information
skearnes authored Jan 22, 2021
1 parent 30d1af7 commit 3e51ed5
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 21 deletions.
24 changes: 6 additions & 18 deletions ord_schema/templating.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pandas as pd
from google.protobuf import text_format # pytype: disable=import-error

import ord_schema
from ord_schema import validations
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2
Expand All @@ -50,29 +51,16 @@ def read_spreadsheet(file_name_or_buffer: Union[str, BinaryIO],
return pd.read_csv(file_name_or_buffer)


def _escape(string: str) -> str:
"""Converts single backslashes to double backslashes.
Note that we do not do a full re.escape because only backslashes are
problematic.
Args:
string: String to escape.
Returns:
Updated string with escaped backslashes.
"""
return string.replace('\\', '\\\\')


def _is_null(value: Union[float, str]) -> bool:
"""Returns whether a value is null."""
return pd.isnull(value) or (isinstance(value, str) and
(value == 'nan' or not value.strip()))


def _fill_template(string: str,
substitutions: Mapping[str, str]) -> reaction_pb2.Reaction:
def _fill_template(
string: str,
substitutions: Mapping[str,
ord_schema.ScalarType]) -> reaction_pb2.Reaction:
"""Performs substring substitutions according to a dictionary.
If any pattern has a null replacement value (i.e. this is an empty cell in
Expand Down Expand Up @@ -104,7 +92,7 @@ def _fill_template(string: str,
for pattern, value in substitutions.items():
if pd.isnull(value):
check_null = True
string = string.replace(pattern, _escape(str(value)))
string = string.replace(pattern, repr(value).strip('\''))
try:
reaction = text_format.Parse(string, reaction_pb2.Reaction())
except text_format.ParseError as error:
Expand Down
25 changes: 22 additions & 3 deletions ord_schema/templating_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from absl.testing import parameterized
from google.protobuf import text_format
import pandas as pd
from rdkit import Chem

from ord_schema import templating
from ord_schema.proto import reaction_pb2
Expand All @@ -47,11 +48,11 @@ def setUp(self):

def test_valid_templating(self):
template_string = self.template_string.replace('value: "CCO"',
'value: "$my_smiles$"')
'value: "$smiles$"')
template_string = template_string.replace('value: 75',
'value: $conversion$')
df = pd.DataFrame.from_dict({
'$my_smiles$': ['CCO', 'CCCO', 'CCCCO'],
'$smiles$': ['CCO', 'CCCO', 'CCCCO'],
'$conversion$': [75, 50, 30],
})
dataset = templating.generate_dataset(template_string, df)
Expand All @@ -67,12 +68,30 @@ def test_valid_templating(self):

# Test without "$" in column names
df = pd.DataFrame.from_dict({
'my_smiles': ['CCO', 'CCCO', 'CCCCO'],
'smiles': ['CCO', 'CCCO', 'CCCCO'],
'conversion': [75, 50, 30],
})
dataset = templating.generate_dataset(template_string, df)
self.assertEqual(dataset, expected_dataset)

def test_valid_templating_escapes(self):
smiles = ['CCO', 'CCCO', 'CCCCO']
mols = [Chem.MolFromSmiles(this_smiles) for this_smiles in smiles]
molblocks = [Chem.MolToMolBlock(mol) for mol in mols]
self.valid_reaction.inputs['in'].components[0].identifiers.add(
type='MOLBLOCK', value='$molblock$')
template_string = text_format.MessageToString(self.valid_reaction)
df = pd.DataFrame.from_dict({'molblock': molblocks})
dataset = templating.generate_dataset(template_string, df)
expected_reactions = []
for molblock in molblocks:
reaction = reaction_pb2.Reaction()
reaction.CopyFrom(self.valid_reaction)
reaction.inputs['in'].components[0].identifiers[1].value = molblock
expected_reactions.append(reaction)
expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
self.assertEqual(dataset, expected_dataset)

@parameterized.parameters(['.csv', '.xls', '.xlsx'])
def test_read_spreadsheet(self, suffix):
df = pd.DataFrame.from_dict({
Expand Down

0 comments on commit 3e51ed5

Please sign in to comment.