Use repr() to properly handle MOLBLOCK identifiers (#544)

open-reaction-database · Jan 22, 2021 · commit 3e51ed5
1 parent 30d1af7

Showing 2 changed files with 28 additions and 21 deletions.
diff --git a/ord_schema/templating.py b/ord_schema/templating.py
@@ -26,6 +26,7 @@
 import pandas as pd
 from google.protobuf import text_format  # pytype: disable=import-error
 
+import ord_schema
 from ord_schema import validations
 from ord_schema.proto import dataset_pb2
 from ord_schema.proto import reaction_pb2
@@ -50,29 +51,16 @@ def read_spreadsheet(file_name_or_buffer: Union[str, BinaryIO],
     return pd.read_csv(file_name_or_buffer)
 
 
-def _escape(string: str) -> str:
-    """Converts single backslashes to double backslashes.
-
-    Note that we do not do a full re.escape because only backslashes are
-    problematic.
-
-    Args:
-        string: String to escape.
-
-    Returns:
-        Updated string with escaped backslashes.
-    """
-    return string.replace('\\', '\\\\')
-
-
 def _is_null(value: Union[float, str]) -> bool:
     """Returns whether a value is null."""
     return pd.isnull(value) or (isinstance(value, str) and
                                 (value == 'nan' or not value.strip()))
 
 
-def _fill_template(string: str,
-                   substitutions: Mapping[str, str]) -> reaction_pb2.Reaction:
+def _fill_template(
+    string: str,
+    substitutions: Mapping[str,
+                           ord_schema.ScalarType]) -> reaction_pb2.Reaction:
     """Performs substring substitutions according to a dictionary.
 
     If any pattern has a null replacement value (i.e. this is an empty cell in
@@ -104,7 +92,7 @@ def _fill_template(string: str,
     for pattern, value in substitutions.items():
         if pd.isnull(value):
             check_null = True
-        string = string.replace(pattern, _escape(str(value)))
+        string = string.replace(pattern, repr(value).strip('\''))
     try:
         reaction = text_format.Parse(string, reaction_pb2.Reaction())
     except text_format.ParseError as error:

diff --git a/ord_schema/templating_test.py b/ord_schema/templating_test.py
@@ -21,6 +21,7 @@
 from absl.testing import parameterized
 from google.protobuf import text_format
 import pandas as pd
+from rdkit import Chem
 
 from ord_schema import templating
 from ord_schema.proto import reaction_pb2
@@ -47,11 +48,11 @@ def setUp(self):
 
     def test_valid_templating(self):
         template_string = self.template_string.replace('value: "CCO"',
-                                                       'value: "$my_smiles$"')
+                                                       'value: "$smiles$"')
         template_string = template_string.replace('value: 75',
                                                   'value: $conversion$')
         df = pd.DataFrame.from_dict({
-            '$my_smiles$': ['CCO', 'CCCO', 'CCCCO'],
+            '$smiles$': ['CCO', 'CCCO', 'CCCCO'],
             '$conversion$': [75, 50, 30],
         })
         dataset = templating.generate_dataset(template_string, df)
@@ -67,12 +68,30 @@ def test_valid_templating(self):
 
         # Test without "$" in column names
         df = pd.DataFrame.from_dict({
-            'my_smiles': ['CCO', 'CCCO', 'CCCCO'],
+            'smiles': ['CCO', 'CCCO', 'CCCCO'],
             'conversion': [75, 50, 30],
         })
         dataset = templating.generate_dataset(template_string, df)
         self.assertEqual(dataset, expected_dataset)
 
+    def test_valid_templating_escapes(self):
+        smiles = ['CCO', 'CCCO', 'CCCCO']
+        mols = [Chem.MolFromSmiles(this_smiles) for this_smiles in smiles]
+        molblocks = [Chem.MolToMolBlock(mol) for mol in mols]
+        self.valid_reaction.inputs['in'].components[0].identifiers.add(
+            type='MOLBLOCK', value='$molblock$')
+        template_string = text_format.MessageToString(self.valid_reaction)
+        df = pd.DataFrame.from_dict({'molblock': molblocks})
+        dataset = templating.generate_dataset(template_string, df)
+        expected_reactions = []
+        for molblock in molblocks:
+            reaction = reaction_pb2.Reaction()
+            reaction.CopyFrom(self.valid_reaction)
+            reaction.inputs['in'].components[0].identifiers[1].value = molblock
+            expected_reactions.append(reaction)
+        expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
+        self.assertEqual(dataset, expected_dataset)
+
     @parameterized.parameters(['.csv', '.xls', '.xlsx'])
     def test_read_spreadsheet(self, suffix):
         df = pd.DataFrame.from_dict({