From 820c9c8aabc04a3c664ca81918350cac624a6e3d Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 11 Aug 2020 17:23:04 -0400 Subject: [PATCH 1/7] Add hashes, molecule registration, and update create-write documentation --- ...ng and Writing to MongoDB-checkpoint.ipynb | 528 ++++++++++++++++-- .../Creating and Writing to MongoDB.ipynb | 521 +++++++++++++++-- mongordkit/Database/registration.py | 93 +++ mongordkit/Database/tests/test_write.py | 70 ++- mongordkit/Database/utils.py | 2 +- mongordkit/Database/write.py | 117 ++-- mongordkit/Search/tests/test_similarity.py | 12 +- mongordkit/Search/tests/test_substructure.py | 4 +- 8 files changed, 1171 insertions(+), 176 deletions(-) create mode 100644 mongordkit/Database/registration.py diff --git a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb index 0dd6819..cdd4ad1 100644 --- a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb @@ -6,7 +6,7 @@ "source": [ "# Creating and Writing to MongoDB\n", "\n", - "Last updated: 7/12/20\n", + "Last updated: 8/10/20\n", "\n", "Methods that directly modify MongoDB database instances are included in the `mongordkit.Database` module.\n", "\n", @@ -16,11 +16,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "from mongordkit.Database import create, write, utils\n", + "from mongordkit.Database import create, write, utils, registration\n", + "from rdkit import Chem\n", "import pymongo" ] }, @@ -29,26 +30,25 @@ "metadata": {}, "source": [ "## Reset Cells\n", - "Run the contents of this cell to reset the local MongoDB database used in this notebook." + "Run the contents of this cell to reset the local MongoDB database, `demo_db`, used in this notebook." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "client = pymongo.MongoClient()\n", - "print(client.list_database_names())\n", - "client.drop_database('TestDatabase')\n", - "print(client.list_database_names())" + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Creating Databases\n", + "## Creating Databases (DEPRECATED for now)\n", "Users can opt to bring their own database instances, but `Database.create` provides methods that will create ready-made MongoDB instances, defaulting to your local MongoDB:" ] }, @@ -58,11 +58,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Return a database using a host port, such as the local port:\n", - "TestDB = create.createFromHostPort('TestDatabase', host='localhost', port=27017)\n", + "# # Return a database using a host port, such as the local port:\n", + "# db = create.createFromHostPort('demo_db', host='localhost', port=27017)\n", "\n", - "# Return a database using a MongoDB URI, such as that provided by Atlas:\n", - "TestDB = create.createFromURL('TestDatabase', url=None)" + "# # Return a database using a MongoDB URI, such as that provided by Atlas:\n", + "# TestDB = create.createFromURL('demo_db', url=None)" ] }, { @@ -78,7 +78,134 @@ "metadata": {}, "outputs": [], "source": [ - "print(utils.STANDARD_SETTING)" + "# print(utils.STANDARD_SETTING)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Registration\n", + "`Database.registration` constructs document representations of molecules according to configurable schemes and handles data registration settings.\n", + "\n", + "It does this in two parts. First, it defines the global variables `RDKIT_HASH_FUNCTIONS` and `HASH_FUNCTIONS` as dictionaries that map hash function names to methods. 
It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", + "\n", + "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'rdmol': Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", + " 'index': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'smiles': 'Cc1ccccc1',\n", + " 'scheme': 'default',\n", + " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", + " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'cx_smiles': 'Cc1ccccc1'},\n", + " 'rdkit_hashes': 
{'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'Regioisomer': '*C.c1ccccc1',\n", + " 'MurckoScaffold': 'c1ccccc1',\n", + " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'ExtendedMurcko': '*c1ccccc1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'MolFormula': 'C7H8',\n", + " 'AtomBondCounts': '7,7',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'NetCharge': '0',\n", + " 'AnonymousGraph': '**1*****1'},\n", + " 'fingerprints': {},\n", + " 'value_data': {}}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rdmol = Chem.MolFromSmiles('Cc1ccccc1')\n", + "scheme = registration.MolDocScheme()\n", + "scheme.generate_mol_doc(rdmol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `MolDocScheme` class also defines a series of instance methods, such as `MolDocScheme.set_index` and `MolDocScheme.remove_field`, that can be used to modify document schemes:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'rdmol': 
Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", + " 'index': 'C7H8',\n", + " 'smiles': 'Cc1ccccc1',\n", + " 'scheme': 'default',\n", + " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", + " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'cx_smiles': 'Cc1ccccc1'},\n", + " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'Regioisomer': '*C.c1ccccc1',\n", + " 'MurckoScaffold': 'c1ccccc1',\n", + " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'ExtendedMurcko': '*c1ccccc1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'MolFormula': 'C7H8',\n", + " 'AtomBondCounts': '7,7',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'NetCharge': '0'},\n", + " 'fingerprints': {},\n", + " 'value_data': {}}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"scheme.remove_field('AnonymousGraph')\n", + "scheme.set_index('MolFormula')\n", + "scheme.generate_mol_doc(rdmol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because `MolDocScheme` objects contain no functions—only references to functions—they can be pickled. In fact, the methods in `write` can save `MolDocSchemes` so that custom schemes are retrievable for later use." ] }, { @@ -86,47 +213,289 @@ "metadata": {}, "source": [ "## Writing to a Database\n", - "`Database.write` provides write functionality. Its core method is `writeFromSDF`, which relies on rdkit's `ForwardSDMolSupplier` to write data from an SDF file into a specified database.\n", + "`Database.write` provides write functionality. Its core method is `WriteFromSDF`, which relies on rdkit's `ForwardSDMolSupplier` to write data from an SDF file into a specified database.\n", "\n", - "For each molecule in the SDF, `writeFromSDF` inserts a document containing at the minimum a unique identifying index, that molecule's SMILES, a pickle of the molecule's rdmol, and a field that specifies the registration option used to store the molecule." + "For each molecule in the SDF, `WriteFromSDF` inserts a document whose fields are specified by the `MolDocScheme` object passed into the function (one with default settings is created if the `scheme` argument is left blank)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: 
[22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit 
WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined 
stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n" + ] + }, + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write the contents of first_200_props.sdf, a test dataset, into the TestDatabase created above. \n", + "# Write the contents of first_200_props.sdf, a test dataset, into the collection demo_db.molecules.\n", "# The index will default to the molecule's inchikey.\n", "# Return the number of molecules succesfully imported.\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test')" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The above call is the most basic version of `writeFromSDF`. For additional flexibility, `writeFromSDF` takes several optional arguments that allow users to specify how inbound molecules should be standardized, a field relating to the data's origin, customize the index, and change how many molecules are inserted into the database at a time. " + "The above call is the most basic version of `WriteFromSDF`. For additional flexibility, `WriteFromSDF` takes several optional arguments—users can specify a custom scheme object, a registration collection to write scheme objects to, how many molecules are inserted at a time (this can affect performance), and a limit on the number of molecules written." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + 
"RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit 
WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100 molecules successfully imported\n", + "0 duplicates skipped\n" + ] + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write the contents of first_200_props.sdf, a test dataset, into the TestDatabase created above. \n", + "# Write the first 100 molecules of first_200_props.sdf, a test dataset, into demo_db.molecules\n", "# This write will use canonical SMILES as the identifying index and thus does not conflict with the above write. \n", "# If we had used inchikey again, the write would have imported 0 molecules.\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test', reg_option='standard_setting', index_option='canonical_smiles', chunk_size=100, limit=None)" + "scheme = registration.MolDocScheme()\n", + "scheme.set_index('CanonicalSmiles')\n", + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In order to maintain consistency, the registration options and index options are drawn from a set of predetermined options specified in `Database.utils`." + "In the case that users aren't working with an SDF, `.write` also provides `WriteFromMolList`, which will take a Python list of rdmol objects in place of the SDF argument in `WriteFromSDF`." 
] }, { @@ -135,44 +504,121 @@ "source": [ "## `.create` Module Contents\n", "\n", - "mongordkit.Database.create.**createFromHostPort**(database, host=None (*string*), port=None (*string*))\n", + "mongordkit.Database.create.**createFromHostPort**(database_name, host=None (*string*), port=None (*string*)) --> *a MongoDB database instance named database_name*\n", "\n", - "mongordkit.Database.create.**createFromURL**(database, url=None (*string*))" + "mongordkit.Database.create.**createFromURL**(database_name, url=None (*string*)) --> *a MongoDB database instance named database_name*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## `.write` Module Contents" + "## `.registration` Module Contents" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MoleculeHashString': ,\n", + " 'inchi_standard': ,\n", + " 'inchikey_standard': ,\n", + " 'inchi_KET_15T': (rdmol)>,\n", + " 'inchikey_KET_15T': (rdmol)>,\n", + " 'noiso_smiles': (rdmol)>,\n", + " 'cx_smiles': }" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "registration.HASH_FUNCTIONS" + ] + }, + { + "cell_type": "code", + "execution_count": 32, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AnonymousGraph': rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph,\n", + " 'ElementGraph': rdkit.Chem.rdMolHash.HashFunction.ElementGraph,\n", + " 'CanonicalSmiles': rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles,\n", + " 'MurckoScaffold': rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold,\n", + " 'ExtendedMurcko': rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko,\n", + " 'MolFormula': rdkit.Chem.rdMolHash.HashFunction.MolFormula,\n", + " 'AtomBondCounts': rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts,\n", + " 'DegreeVector': rdkit.Chem.rdMolHash.HashFunction.DegreeVector,\n", + " 'Mesomer': 
rdkit.Chem.rdMolHash.HashFunction.Mesomer,\n", + " 'HetAtomTautomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer,\n", + " 'HetAtomProtomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer,\n", + " 'RedoxPair': rdkit.Chem.rdMolHash.HashFunction.RedoxPair,\n", + " 'Regioisomer': rdkit.Chem.rdMolHash.HashFunction.Regioisomer,\n", + " 'NetCharge': rdkit.Chem.rdMolHash.HashFunction.NetCharge,\n", + " 'SmallWorldIndexBR': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR,\n", + " 'SmallWorldIndexBRL': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL,\n", + " 'ArthorSubstructureOrder': rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "mongordkit.Database.write.**writeFromSDF**(database, source_sdf, source_name *(string)*, reg_option=\"standard_setting\", index_option=\"inchikey\", chunk_size=100, limit=None)" + "registration.RDKIT_HASH_FUNCTIONS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As of 7/15/20, `writeFromSDF` supports the following registration options: \n", - "* 'standard_setting'\n", + "**Class** mongordkit.Database.registration.**MolDocScheme()**\n", "\n", - "And the following index options: \n", - "* 'inchikey'\n", - "* 'canonical_smiles'\n", - "* 'het_atom_tautomer'" + "**Instance variables**:\n", + "```\n", + "self.scheme_name = DEFAULT_SCHEME_NAME\n", + "self.author = DEFAULT_AUTHOR\n", + "self.pre_processed = DEFAULT_PREPROCESS\n", + "self.index_option = DEFAULT_INDEX\n", + "self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys())\n", + "self.hashes = set(HASH_FUNCTIONS.keys())\n", + "self.fingerprints = {}\n", + "self.value_fields = {}\n", + "```\n", + "**Instance methods**:\n", + "- set_index(self, new_index) --> *None*\n", + "- get_index_value(self, rdmol) --> *calculated index value*\n", + "- add_hash_field(self, field_name, field_method) --> *None*\n", + "- add_value_field(self, field_name, 
field_value) --> *None*\n", + "- remove_field(self, field_name) --> *None*\n", + "- generate_mol_doc(self, rdmol) --> *Dict: document representing molecule according to scheme*" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## `.write` Module Contents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*\n", + "\n", + "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" + ] } ], "metadata": { diff --git a/docs/notebooks/Creating and Writing to MongoDB.ipynb b/docs/notebooks/Creating and Writing to MongoDB.ipynb index 91e557e..cdd4ad1 100644 --- a/docs/notebooks/Creating and Writing to MongoDB.ipynb +++ b/docs/notebooks/Creating and Writing to MongoDB.ipynb @@ -6,7 +6,7 @@ "source": [ "# Creating and Writing to MongoDB\n", "\n", - "Last updated: 7/12/20\n", + "Last updated: 8/10/20\n", "\n", "Methods that directly modify MongoDB database instances are included in the `mongordkit.Database` module.\n", "\n", @@ -16,11 +16,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "from mongordkit.Database import create, write, utils\n", + "from mongordkit.Database import create, write, utils, registration\n", + "from rdkit import Chem\n", "import pymongo" ] }, @@ -29,26 +30,25 @@ "metadata": {}, "source": [ "## Reset Cells\n", - "Run the contents of this cell to reset the local MongoDB database used in this notebook." + "Run the contents of this cell to reset the local MongoDB database, `demo_db`, used in this notebook." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "client = pymongo.MongoClient()\n", - "print(client.list_database_names())\n", - "client.drop_database('TestDatabase')\n", - "print(client.list_database_names())" + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Creating Databases\n", + "## Creating Databases (DEPRECATED for now)\n", "Users can opt to bring their own database instances, but `Database.create` provides methods that will create ready-made MongoDB instances, defaulting to your local MongoDB:" ] }, @@ -58,11 +58,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Return a database using a host port, such as the local port:\n", - "TestDB = create.createFromHostPort('TestDatabase', host='localhost', port=27017)\n", + "# # Return a database using a host port, such as the local port:\n", + "# db = create.createFromHostPort('demo_db', host='localhost', port=27017)\n", "\n", - "# Return a database using a MongoDB URI, such as that provided by Atlas:\n", - "TestDB = create.createFromURL('TestDatabase', url=None)" + "# # Return a database using a MongoDB URI, such as that provided by Atlas:\n", + "# TestDB = create.createFromURL('demo_db', url=None)" ] }, { @@ -78,7 +78,134 @@ "metadata": {}, "outputs": [], "source": [ - "print(utils.STANDARD_SETTING)" + "# print(utils.STANDARD_SETTING)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Registration\n", + "`Database.registration` constructs document representations of molecules according to configurable schemes and handles data registration settings.\n", + "\n", + "It does this in two parts. First, it defines the global variables `RDKIT_HASH_FUNCTIONS` and `HASH_FUNCTIONS` as dictionaries that hold map hash function names to methods. 
It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", + "\n", + "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'rdmol': Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", + " 'index': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'smiles': 'Cc1ccccc1',\n", + " 'scheme': 'default',\n", + " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", + " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'cx_smiles': 'Cc1ccccc1'},\n", + " 'rdkit_hashes': 
{'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'Regioisomer': '*C.c1ccccc1',\n", + " 'MurckoScaffold': 'c1ccccc1',\n", + " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'ExtendedMurcko': '*c1ccccc1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'MolFormula': 'C7H8',\n", + " 'AtomBondCounts': '7,7',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'NetCharge': '0',\n", + " 'AnonymousGraph': '**1*****1'},\n", + " 'fingerprints': {},\n", + " 'value_data': {}}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rdmol = Chem.MolFromSmiles('Cc1ccccc1')\n", + "scheme = registration.MolDocScheme()\n", + "scheme.generate_mol_doc(rdmol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `MolDocScheme` class also defines a series of instance methods, such as `MolDocScheme.set_index` and `MolDocScheme.remove_field`, that can be used to modify document schemes:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'rdmol': 
Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", + " 'index': 'C7H8',\n", + " 'smiles': 'Cc1ccccc1',\n", + " 'scheme': 'default',\n", + " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", + " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", + " 'cx_smiles': 'Cc1ccccc1'},\n", + " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'Regioisomer': '*C.c1ccccc1',\n", + " 'MurckoScaffold': 'c1ccccc1',\n", + " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'ExtendedMurcko': '*c1ccccc1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'MolFormula': 'C7H8',\n", + " 'AtomBondCounts': '7,7',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'NetCharge': '0'},\n", + " 'fingerprints': {},\n", + " 'value_data': {}}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"scheme.remove_field('AnonymousGraph')\n", + "scheme.set_index('MolFormula')\n", + "scheme.generate_mol_doc(rdmol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because `MolDocScheme` objects contain no functions—only references to functions—they can be pickled. In fact, the methods in `write` can save `MolDocSchemes` so that custom schemes are retrievable for later use." ] }, { @@ -86,47 +213,289 @@ "metadata": {}, "source": [ "## Writing to a Database\n", - "`Database.write` provides write functionality. Its core method is `writeFromSDF`, which relies on rdkit's `ForwardSDMolSupplier` to write data from an SDF file into a specified database.\n", + "`Database.write` provides write functionality. Its core method is `WriteFromSDF`, which relies on rdkit's `ForwardSDMolSupplier` to write data from an SDF file into a specified database.\n", "\n", - "For each molecule in the SDF, `writeFromSDF` inserts a document containing at the minimum a unique identifying index, that molecule's SMILES, a pickle of the molecule's rdmol, and a field that specifies the registration option used to store the molecule." + "For each molecule in the SDF, `WriteFromSDF` inserts a document whose fields are specified by the `MolDocScheme` object passed into the function (one with default settings is created if the `scheme` argument is left blank)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: 
[22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit 
WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined 
stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n" + ] + }, + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write the contents of first_200_props.sdf, a test dataset, into the TestDatabase created above. \n", + "# Write the contents of first_200_props.sdf, a test dataset, into the collection demo_db.molecules.\n", "# The index will default to the molecule's inchikey.\n", "# Return the number of molecules succesfully imported.\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test')" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The above call is the most basic version of `writeFromSDF`. For additional flexibility, `writeFromSDF` takes several optional arguments that allow users to specify how inbound molecules should be standardized, a field relating to the data's origin, customize the index, and change how many molecules are inserted into the database at a time. " + "The above call is the most basic version of `writeFromSDF`. For additional flexibility, `writeFromSDF` takes several optional arguments—users can specify a custom scheme object, a registration collection to write scheme objects to, how many molecules are inserted at a time (this can affect performance), and limit the number of molecules written in." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", + 
"RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit 
WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100 molecules successfully imported\n", + "0 duplicates skipped\n" + ] + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write the contents of first_200_props.sdf, a test dataset, into the TestDatabase created above. \n", + "# Write the first 100 molecules of first_200_props.sdf, a test dataset, into demo_db.molecules\n", "# This write will use canonical SMILES as the identifying index and thus does not conflict with the above write. \n", "# If we had used inchikey again, the write would have imported 0 molecules.\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test', reg_option='standard_setting', index_option='canonical_smiles', chunk_size=100, limit=None)" + "scheme = registration.MolDocScheme()\n", + "scheme.set_index('CanonicalSmiles')\n", + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In order to maintain consistency, the registration options and index options are drawn from a set of predetermined options specified in `Database.utils`." + "In the case that users aren't working with an SDF, `.write` also provides `WriteFromMolList`, which will take a Python list of rdmol objects in place of the SDF argument in `WriteFromSDF`." 
] }, { @@ -144,27 +513,111 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `.write` Module Contents" + "## `.registration` Module Contents" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MoleculeHashString': ,\n", + " 'inchi_standard': ,\n", + " 'inchikey_standard': ,\n", + " 'inchi_KET_15T': (rdmol)>,\n", + " 'inchikey_KET_15T': (rdmol)>,\n", + " 'noiso_smiles': (rdmol)>,\n", + " 'cx_smiles': }" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "registration.HASH_FUNCTIONS" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AnonymousGraph': rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph,\n", + " 'ElementGraph': rdkit.Chem.rdMolHash.HashFunction.ElementGraph,\n", + " 'CanonicalSmiles': rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles,\n", + " 'MurckoScaffold': rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold,\n", + " 'ExtendedMurcko': rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko,\n", + " 'MolFormula': rdkit.Chem.rdMolHash.HashFunction.MolFormula,\n", + " 'AtomBondCounts': rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts,\n", + " 'DegreeVector': rdkit.Chem.rdMolHash.HashFunction.DegreeVector,\n", + " 'Mesomer': rdkit.Chem.rdMolHash.HashFunction.Mesomer,\n", + " 'HetAtomTautomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer,\n", + " 'HetAtomProtomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer,\n", + " 'RedoxPair': rdkit.Chem.rdMolHash.HashFunction.RedoxPair,\n", + " 'Regioisomer': rdkit.Chem.rdMolHash.HashFunction.Regioisomer,\n", + " 'NetCharge': rdkit.Chem.rdMolHash.HashFunction.NetCharge,\n", + " 'SmallWorldIndexBR': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR,\n", + " 'SmallWorldIndexBRL': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL,\n", + " 'ArthorSubstructureOrder': 
rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "registration.RDKIT_HASH_FUNCTIONS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Class** mongordkit.Database.registration.**MolDocScheme()**\n", + "\n", + "**Instance variables**:\n", + "```\n", + "self.scheme_name = DEFAULT_SCHEME_NAME\n", + "self.author = DEFAULT_AUTHOR\n", + "self.pre_processed = DEFAULT_PREPROCESS\n", + "self.index_option = DEFAULT_INDEX\n", + "self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys())\n", + "self.hashes = set(HASH_FUNCTIONS.keys())\n", + "self.fingerprints = {}\n", + "self.value_fields = {}\n", + "```\n", + "**Instance methods**:\n", + "- set_index(self, new_index) --> *None*\n", + "- get_index_value(self, rdmol) --> *calculated index value*\n", + "- add_hash_field(self, field_name, field_method) --> *None*\n", + "- add_value_field(self, field_name, field_value) --> *None*\n", + "- remove_field(self, field_name) --> *None*\n", + "- generate_mol_doc(self, rdmol) --> *Dict: document representing molecule according to scheme*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "mongordkit.Database.write.**writeFromSDF**(database, source_sdf, source_name *(string)*, reg_option=\"standard_setting\", index_option=\"inchikey\", chunk_size=100, limit=None) --> *int: number of molecules imported*" + "## `.write` Module Contents" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As of 7/15/20, `writeFromSDF` supports the following registration options: \n", - "* 'standard_setting'\n", + "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*\n", "\n", - "And the following index options: \n", - "* 'inchikey'\n", - "* 'canonical_smiles'\n", - "* 'het_atom_tautomer'" + 
"mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" ] } ], diff --git a/mongordkit/Database/registration.py b/mongordkit/Database/registration.py new file mode 100644 index 0000000..876321d --- /dev/null +++ b/mongordkit/Database/registration.py @@ -0,0 +1,93 @@ +""" +Constructs document representations of molecules and +handles data registration settings. +""" +import rdkit +from bson import Binary +from rdkit import Chem +from rdkit.Chem import rdMolHash +import pickle + + +DEFAULT_SCHEME_NAME = 'default' +DEFAULT_AUTHOR = 'package-native' +DEFAULT_PREPROCESS = False +DEFAULT_INDEX = 'inchikey_standard' + + +RDKIT_HASH_FUNCTIONS = rdkit.Chem.rdMolHash.HashFunction.names +HASH_FUNCTIONS = {} +HASH_FUNCTIONS['MoleculeHashString'] = rdMolHash.GenerateMoleculeHashString +HASH_FUNCTIONS['inchi_standard'] = Chem.MolToInchi +HASH_FUNCTIONS['inchikey_standard'] = Chem.MolToInchiKey +HASH_FUNCTIONS['inchi_KET_15T'] = lambda rdmol: Chem.MolToInchi(rdmol, options='-KET -15T') +HASH_FUNCTIONS['inchikey_KET_15T'] = lambda rdmol: Chem.MolToInchiKey(rdmol, options='-KET -15T') +HASH_FUNCTIONS['noiso_smiles'] = lambda rdmol: Chem.MolToSmiles(rdmol, isomericSmiles=False) +HASH_FUNCTIONS['cx_smiles'] = Chem.MolToCXSmiles + + +class MolDocScheme(): + + def __init__(self): + self.scheme_name = DEFAULT_SCHEME_NAME + self.author = DEFAULT_AUTHOR + self.pre_processed = DEFAULT_PREPROCESS + self.index_option = DEFAULT_INDEX + self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys()) + self.hashes = set(HASH_FUNCTIONS.keys()) + self.fingerprints = {} + self.value_fields = {} + + def __repr__(self): + return 'Molecule document representation schema. 
' \ + '{Name: ' + str(self.scheme_name) + \ + ', author: ' + str(self.author) + \ + ', index' + str(self.index_option) + \ + '}' + + def set_index(self, new_index): + if new_index not in HASH_FUNCTIONS.keys() and new_index not in RDKIT_HASH_FUNCTIONS.keys(): + raise Exception("Please add this hash first.") + else: + self.index_option = new_index + + def get_index_value(self, rdmol): + if self.index_option in HASH_FUNCTIONS.keys(): + return HASH_FUNCTIONS[self.index_option](rdmol) + elif self.index_option in RDKIT_HASH_FUNCTIONS.keys(): + return rdMolHash.MolHash(rdmol, RDKIT_HASH_FUNCTIONS[self.index_option]) + else: + raise Exception("Specified index option does not exist.") + + def add_hash_field(self, field_name, field_method): + self.hashes.add(field_name) + HASH_FUNCTIONS[field_name] = field_method + + def add_value_field(self, field_name, field_value): + self.value_fields[field_name] = field_value + + def remove_field(self, field_name): + if field_name in self.hashes: + self.hashes.remove(field_name) + print(f'removed {field_name} from scheme') + if field_name in self.value_fields.keys(): + self.value_fields.pop(field_name) + print(f'removed {field_name} from scheme') + if field_name in self.rdkit_hashes: + self.rdkit_hashes.remove(field_name) + print(f'removed {field_name} from scheme') + + def generate_mol_doc(self, rdmol): + molDoc = { + 'rdmol': Binary(rdmol.ToBinary()), + 'index': self.get_index_value(rdmol), + 'smiles': Chem.MolToSmiles(rdmol), + 'scheme': self.scheme_name, + 'hashes': {hash_name: HASH_FUNCTIONS[hash_name](rdmol) for hash_name in self.hashes}, + 'rdkit_hashes': {hash_name: rdMolHash.MolHash(rdmol, RDKIT_HASH_FUNCTIONS[hash_name]) + for hash_name in self.rdkit_hashes}, + 'fingerprints': {fp: fp_method(rdmol) for fp, fp_method in self.fingerprints.items()}, + 'value_data': {field_name: value for field_name, value in self.value_fields.items()} + } + return molDoc + diff --git a/mongordkit/Database/tests/test_write.py 
b/mongordkit/Database/tests/test_write.py index 2ef0818..1acbe63 100644 --- a/mongordkit/Database/tests/test_write.py +++ b/mongordkit/Database/tests/test_write.py @@ -8,41 +8,49 @@ sys.path.append(Path('.').resolve().parent.parent) from mongordkit.Database import write +from mongordkit.Database import registration def setupDB(): client = mongomock.MongoClient() return client.db -def test_writeCount(): - db = setupDB() - assert 200 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test') +class TestWrite: -def test_invalidIndex(): - with pytest.raises(ValueError): + def test_writeCount(self): + data_scheme = registration.MolDocScheme() db = setupDB() - write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test', reg_option='standard_setting', index_option='canonica_smiles') - -def test_hashes(): - db = setupDB() - assert 200 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test', 'standard_setting', 'canonical_smiles') - db = setupDB() - assert 200 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test', 'standard_setting', 'het_atom_tautomer') - -def test_uniqueInsertion(): - db = setupDB() - write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test') - assert 0 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test') - assert 200 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test', reg_option='standard_setting', index_option='canonical_smiles') - -def test_writeLimit(): - db = setupDB() - assert 100 == write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test', limit=100) - -def test_WriteMolListCount(): - db = setupDB() - f = open('data/zinc.frags.500.q.smi') - frags = [Chem.MolFromSmiles(line.split()[0]) for line in f] - f.close() - frag_smiles = [Chem.MolToSmiles(rdmol) for rdmol in frags] - write.WriteMolList(db.molecules, frags, 'test', chunk_size=100) - assert 499 == 
db.molecules.count_documents({}) + assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) + + def test_invalidIndex(self): + db = setupDB() + data_scheme = registration.MolDocScheme() + with pytest.raises(Exception): + data_scheme.set_index('moo') + + def test_hashes(self): + db = setupDB() + data_scheme = registration.MolDocScheme() + data_scheme.set_index("CanonicalSmiles") + assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) + data_scheme.set_index("MoleculeHashString") + assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) + + def test_uniqueInsertion(self): + db = setupDB() + data_scheme = registration.MolDocScheme() + assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) + assert 0 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) + + def test_writeLimit(self): + db = setupDB() + data_scheme = registration.MolDocScheme() + assert 10 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme, limit=10) + + def test_WriteMolListCount(self): + db = setupDB() + data_scheme = registration.MolDocScheme() + f = open('data/zinc.frags.500.q.smi') + frags = [Chem.MolFromSmiles(line.split()[0]) for line in f] + f.close() + write.WriteFromMolList(db.molecules, frags, data_scheme) + assert 499 == db.molecules.count_documents({}) \ No newline at end of file diff --git a/mongordkit/Database/utils.py b/mongordkit/Database/utils.py index 38126b9..66ec0d4 100644 --- a/mongordkit/Database/utils.py +++ b/mongordkit/Database/utils.py @@ -20,7 +20,7 @@ def canonicalHash(m): 'fragments': 'removed', } -VALID_IDS = {'inchikey', 'canonical_smiles', 'het_atom_tautomer'} +VALID_HASHES = {'inchikey', 'canonical_smiles', 'het_atom_tautomer'} HASH_FUNCTIONS = {'inchikey': rdinchi.MolToInchiKey, 'canonical_smiles': canonicalHash, diff --git 
a/mongordkit/Database/write.py b/mongordkit/Database/write.py index d0e813b..02dad52 100644 --- a/mongordkit/Database/write.py +++ b/mongordkit/Database/write.py @@ -1,57 +1,50 @@ -import pymongo +import pymongo, pickle from bson import Binary from rdkit import Chem from rdkit.Chem import rdMolHash from rdkit.Chem import rdinchi -from .utils import * +from .registration import MolDocScheme, HASH_FUNCTIONS, RDKIT_HASH_FUNCTIONS -def writeFromSDF(mol_collection, sdf, src, - reg_collection=None, reg_option="standard_setting", index_option="inchikey", - chunk_size=100, limit=None): +def WriteFromSDF(mol_collection, sdf, scheme=MolDocScheme(), + reg_collection=None, chunk_size=100, limit=None): """ - Writes to MOL_COLLECTION the - contents of an SDF file SDF from SRC in a collection called molecules. - To limit the number of calls to the database, - chunks the molecules into groups of CHUNK_SIZE, - which defaults to 100. Returns the number of molecules - inserted into the collection during its call. + Writes the contents of SDF to MOL_COLLECTION and creates + an index on the index specificed in SCHEME. Optional parameters: + - customize document structure by specifying a SCHEME MolDocScheme object. + - write the scheme object into a separate collection by specifying REG_COLLECTION. + - customize how many molecules are inserted at a time by setting CHUNK_SIZE. + - limit the number of molecules written in by setting LIMIT. :param mol_collection: A MongoDB collection. :param sdf: A Python File object of the desired SDF file. - :param src: For the user's utility, a string that indicates where the file originates. - :param reg_option: Allows control over the inserted document structure. Defaults to a standard - setting that includes an index, SMILES, an rdkit Molecule, and a registration setting. - :param index_option: Allows control over indexing settings. Defaults to generating an Inchikey for - each molecule. 
- :param chunk_size: How many documents are inserted into the database instance at a time. - :return: The total number of molecules inserted into the collection. + :param scheme: A registration.MolDocScheme() object. + :param reg_collection: A MongoDB collection. + :param chunk_size: Integer indicating how many molecules inserted at a time. + :param limit: Integer indicating how many molecules to insert. """ molecules = mol_collection - print('populating mongodb collection with compounds from chembl...') + print('populating mongodb collection with compounds from SDF...') # This is placeholder code for when more registration options exist. - if index_option not in VALID_IDS: - options = ', '.join(VALID_IDS) - raise ValueError("id_option must be one of {}".format(options)) - else: - hash = HASH_FUNCTIONS[index_option] + # if index_option not in VALID_HASHES: + # options = ', '.join(VALID_HASHES) + # raise ValueError("index_option must be one of {}".format(options)) + # else: + # hash = HASH_FUNCTIONS[index_option] chunk = [] inserted = 0 + duplicates = 0 + if reg_collection: + reg_collection.insert_one({scheme.scheme_name: pickle.dumps(scheme)}) for rdmol in Chem.ForwardSDMolSupplier(sdf): if limit is not None and inserted >= limit: break if rdmol is None: continue - index = hash(rdmol) + index = scheme.get_index_value(rdmol) if mol_collection.count_documents({"index": index}) != 0: + duplicates += 1 continue - # Placeholder for where molecule standardization might take place. - document = { - 'index': hash(rdmol), - 'smiles': Chem.MolToSmiles(rdmol), - 'rdmol': Binary(rdmol.ToBinary()), - 'registration_setting': reg_option - } - # Placeholder for adding setting specific fields to the document. 
+ document = scheme.generate_mol_doc(rdmol) chunk.append(document) inserted += 1 if len(chunk) == chunk_size: @@ -65,50 +58,50 @@ def writeFromSDF(mol_collection, sdf, src, molecules.insert_one(i) inserted += 1 print("{} molecules successfully imported".format(inserted)) + print("{} duplicates skipped".format(duplicates)) + mol_collection.create_index('index') return inserted -def WriteMolList(mol_collection, list, src, - reg_collection=None, reg_option="standard_setting", index_option="inchikey", - chunk_size=100, limit=None): +def WriteFromMolList(mol_collection, list, scheme=MolDocScheme(), + reg_collection=None, chunk_size=100, limit=None): """ - Writes to database instance DB a list of rdmols in the standard format into a molecules collection. + Writes the contents of LIST to MOL_COLLECTION and creates + an index on the index specificed in SCHEME. Optional parameters: + - customize document structure by specifying a SCHEME MolDocScheme object. + - write the scheme object into a separate collection by specifying REG_COLLECTION. + - customize how many molecules are inserted at a time by setting CHUNK_SIZE. + - limit the number of molecules written in by setting LIMIT. :param mol_collection: A MongoDB collection. - :param sdf: A Python list of rdmol objects. - :param src: For the user's utility, a string that indicates where the file originates. - :param reg_option: Allows control over the inserted document structure. Defaults to a standard - setting that includes an index, SMILES, an rdkit Molecule, and a registration setting. - :param index_option: Allows control over indexing settings. Defaults to generating an Inchikey for - each molecule. - :param chunk_size: How many documents are inserted into the database instance at a time. - :return: The total number of molecules inserted into the collection. + :param list: A Python list of rdmol objects. + :param scheme: A registration.MolDocScheme() object. + :param reg_collection: A MongoDB collection. 
+ :param chunk_size: Integer indicating how many molecules inserted at a time. + :param limit: Integer indicating how many molecules to insert. """ molecules = mol_collection - print('populating mongodb collection with compounds from chembl...') + print('populating mongodb collection with compounds from list...') # This is placeholder code for when more registration options exist. - if index_option not in VALID_IDS: - options = ', '.join(VALID_IDS) - raise ValueError("id_option must be one of {}".format(options)) - else: - hash = HASH_FUNCTIONS[index_option] + # if index_option not in VALID_HASHES: + # options = ', '.join(VALID_HASHES) + # raise ValueError("index_option must be one of {}".format(options)) + # else: + # hash = HASH_FUNCTIONS[index_option] chunk = [] inserted = 0 + duplicates = 0 + if reg_collection: + reg_collection.insert_one({scheme.scheme_name: pickle.dumps(scheme)}) for rdmol in list: if limit is not None and inserted >= limit: break if rdmol is None: continue - index = hash(rdmol) - if molecules.count_documents({"index": index}) != 0: + index = scheme.get_index_value(rdmol) + if mol_collection.count_documents({"index": index}) != 0: + duplicates += 1 continue - # Placeholder for where molecule standardization might take place. - document = { - 'index': hash(rdmol), - 'smiles': Chem.MolToSmiles(rdmol), - 'rdmol': Binary(rdmol.ToBinary()), - 'registration_setting': reg_option - } - # Placeholder for adding setting specific fields to the document. 
+ document = scheme.generate_mol_doc(rdmol) chunk.append(document) inserted += 1 if len(chunk) == chunk_size: @@ -122,6 +115,8 @@ def WriteMolList(mol_collection, list, src, molecules.insert_one(i) inserted += 1 print("{} molecules successfully imported".format(inserted)) + print("{} duplicates skipped".format(duplicates)) + mol_collection.create_index('index') return inserted diff --git a/mongordkit/Search/tests/test_similarity.py b/mongordkit/Search/tests/test_similarity.py index 5ff1df2..07f5f64 100644 --- a/mongordkit/Search/tests/test_similarity.py +++ b/mongordkit/Search/tests/test_similarity.py @@ -22,7 +22,7 @@ def test_zeroThreshold(): """ db_python = utils.setupPythonDB('data/test_data/first_200.props.sdf') db_mongo = utils.setupMockDB() - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) mol = Chem.Mol(db_python[0]['rdmol']) @@ -38,7 +38,7 @@ def test_similarityAccuracy(): """ db_python = utils.setupPythonDB('data/test_data/first_200.props.sdf') db_mongo = utils.setupMockDB() - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) thresholds = [0.2, 0.4, 0.6, 0.8, 1] for t in thresholds: @@ -61,7 +61,7 @@ def test_similarityAccuracyAggregate(mongoURI): db_mongo = utils.setupMongoDB() else: db_mongo = utils.setupMongoDB(mongoURI) - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) thresholds = [0.2, 0.4, 0.6, 0.8, 1] counter = 0 @@ -82,7 +82,7 @@ def test_similarityProgression(): """ db_python = 
utils.setupPythonDB('data/test_data/first_200.props.sdf') db_mongo = utils.setupMockDB() - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) thresholds = [1, 0.8, 0.6, 0.4, 0.2] for i in range(200): @@ -106,7 +106,7 @@ def test_similarityAggregateProgression(mongoURI): db_mongo = utils.setupMongoDB() else: db_mongo = utils.setupMongoDB(mongoURI) - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) thresholds = [1, 0.8, 0.6, 0.4, 0.2] for i in range(200): @@ -126,7 +126,7 @@ def test_similarity_accuracy_LSH(mongoURI): db_mongo = utils.setupMongoDB() else: db_mongo = utils.setupMongoDB(mongoURI) - write.writeFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mongo.molecules, 'data/test_data/first_200.props.sdf') similarity.AddMorganFingerprints(db_mongo.molecules, db_mongo.mfp_counts) similarity.AddRandPermutations(db_mongo.permutations) similarity.AddLocalityHashes(db_mongo.molecules, db_mongo.permutations, 25) diff --git a/mongordkit/Search/tests/test_substructure.py b/mongordkit/Search/tests/test_substructure.py index 38c2f77..2b29a7d 100644 --- a/mongordkit/Search/tests/test_substructure.py +++ b/mongordkit/Search/tests/test_substructure.py @@ -6,7 +6,7 @@ def test_addPatternFingerprints(): db = utils.setupMockDB() - write.writeFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf') substructure.AddPatternFingerprints(db.molecules) counter = 0 assert db.molecules.count_documents({"pattern_fp": {"$exists": True}}) == 200 @@ -14,7 +14,7 @@ def 
test_addPatternFingerprints(): def test_SubSearchAccuracy(): db_mock = utils.setupMockDB() - write.writeFromSDF(db_mock.molecules, 'data/test_data/first_200.props.sdf', 'test') + write.WriteFromSDF(db_mock.molecules, 'data/test_data/first_200.props.sdf') substructure.AddPatternFingerprints(db_mock.molecules) db_python = utils.setupPythonDB('data/test_data/first_200.props.sdf') for i in range(200): From 7d1b19585a5ea973fa037eb608856d5eadf6997e Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Wed, 12 Aug 2020 23:09:54 -0400 Subject: [PATCH 2/7] Update documentation and added hashes --- ...ng and Writing to MongoDB-checkpoint.ipynb | 3 +- ...y and Substructure Search-checkpoint.ipynb | 395 ++++++++---- .../Creating and Writing to MongoDB.ipynb | 3 +- .../Similarity and Substructure Search.ipynb | 562 +++++++++++++++--- .../__pycache__/create.cpython-37.pyc | Bin 1389 -> 1389 bytes .../Database/__pycache__/utils.cpython-37.pyc | Bin 933 -> 936 bytes .../Database/__pycache__/write.cpython-37.pyc | Bin 4963 -> 4377 bytes .../test_create.cpython-37-pytest-5.4.3.pyc | Bin 1499 -> 1288 bytes .../test_write.cpython-37-pytest-5.4.3.pyc | Bin 4115 -> 6323 bytes mongordkit/Search/__init__.py | 12 + .../__pycache__/__init__.cpython-37.pyc | Bin 155 -> 684 bytes .../__pycache__/similarity.cpython-37.pyc | Bin 5159 -> 12138 bytes mongordkit/Search/similarity.py | 128 ++-- mongordkit/Search/substructure.py | 16 +- mongordkit/Search/tests/test_similarity.py | 1 + mongordkit/Search/tests/test_substructure.py | 2 +- mongordkit/Search/tests/utils.py | 4 +- 17 files changed, 855 insertions(+), 271 deletions(-) diff --git a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb index cdd4ad1..aec730f 100644 --- a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to 
MongoDB-checkpoint.ipynb @@ -488,7 +488,8 @@ "# If we had used inchikey again, the write would have imported 0 molecules.\n", "scheme = registration.MolDocScheme()\n", "scheme.set_index('CanonicalSmiles')\n", - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', \n", + " scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" ] }, { diff --git a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb index 9e5d205..34d092f 100644 --- a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb @@ -6,18 +6,19 @@ "source": [ "# Similarity and Substructure Search\n", "\n", - "Last updated: 7/11/20\n", + "Last updated: 7/27/20\n", "\n", "Methods for similarity and substructure search are included in the `mongordkit.Search` module." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from mongordkit.Search import similarity, substructure, utils\n", + "from mongordkit import Search\n", "from mongordkit.Database import create, write\n", "from rdkit import Chem\n", "import pymongo" @@ -29,28 +30,164 @@ "source": [ "## Reset Cells\n", "\n", - "Run these cells to reset the local MongoDB instance used in this notebook." + "Run these cells to reset the MongoDB database used in this notebook." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "client = pymongo.MongoClient()\n", + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing for Search\n", + "Adequately preparing the database for searching requires adding a variety of fingerprints and hashes. You can easily perform all of the setup work required for similarity and substructure search by calling the method `Search.PrepareForSearch`. Generally, workflow will follow straight from the following two lines into search calls:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['TestDatabase', 'admin', 'config', 'db', 'local']\n", - "['admin', 'config', 'db', 'local']\n" + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit 
WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + 
"RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: 
[22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n", + "Preparing database and collections for search...\n", + "Added pattern fps, morgan fps, and support for LSH.\n" ] } ], "source": [ - "client = pymongo.MongoClient()\n", - "print(client.list_database_names())\n", - "client.drop_database('TestDatabase')\n", - "print(client.list_database_names())" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')\n", + "Search.PrepareForSearch(demo_db, demo_db.molecules, demo_db.mfp_counts, demo_db.permutations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, the rest of this notebook will explicitly note the addition of fingerprints and hashes in an effort to better communicate how the code actually works. 
Let's reset the database again so that we can insert the hashes step by step without any issues." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" ] }, { @@ -59,177 +196,158 @@ "source": [ "## Similarity Search\n", "\n", - "`mongordkit.Search.similarity` supports similarity search best on a database prepared by `mongordkit.Database.write`. Users can also use any database that has a `molecules` collection where each document in that collection has the following fields:\n", + "`mongordkit.Search.similarity` supports similarity search best on a MongoDB collection prepared by `mongordkit.Database.write`. For the general level of similarity search, users can also use any collection that has documents with the following fields:\n", "- `'rdmol': binary pickle object`\n", - "- `'smiles': some SMILES string`" + "- `'index': a unique identifier for each molecule`\n", + "- `'fingerprints': {a nested document that can be blank at the start}'`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's run through an example of similarity search. First, we'll have to set up our database:" + "Let's run through an example of similarity search. First, we'll write into the database 200 molecules from a data file included in the `mongordkit` package. We will use default write settings." 
] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "200 molecules successfully imported\n" - ] - }, - { - "data": { - "text/plain": [ - "200" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ - "TestDB = create.createFromHostPort('TestDatabase', host='localhost', port=27017)\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test')" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`similarity.SimSearchNaive` will directly loop through the database and display results. However, this implementation is extremely slow for any decently-sized database. Instead, `similarity` supports precalculating the following kinds of fingerprints for screening: \n", - "- Morgan (length 1048)\n", + "`similarity.SimSearchNaive` will directly loop through the database and display results. This is good for purposes of verifying accuracy. However, this implementation is extremely slow for any decently-sized database. Instead, `similarity` supports precalculating the following kinds of fingerprints for screening: \n", + "- Morgan (default radius 2, length 2048)\n", "\n", - "through `similarity.addMorganFingerprints`. For each document in a passed in database's `molecules` collection, this method creates a nested field that contains `{morgan_fp: {bits: }, {count: }}`. Note that `addMorganFingerprints` also creates indices on `morgan_fp[bits]` and `morgan_fp[count]` to speed search. " + "through `similarity.AddMorganFingerprints`. 
For each document in a passed-in collection, this method adds the nested field `{morgan_fp: {bits: }, {count: }}` to the document's `fingerprints` field. `AddMorganFingerprints` also creates indices on `morgan_fp[bits]` and `morgan_fp[count]` to speed search. " ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "similarity.addMorganFingerprints(TestDB, radius=2, length=1024)" + "similarity.AddMorganFingerprints(demo_db.molecules, demo_db.mfp_counts)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'bits': [33,\n", - " 56,\n", - " 84,\n", - " 130,\n", - " 313,\n", - " 314,\n", - " 356,\n", - " 547,\n", - " 650,\n", - " 698,\n", - " 744,\n", - " 747,\n", - " 849,\n", - " 853,\n", - " 967],\n", - " 'count': 15}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "TestDB.molecules.find_one()['morgan_fp']" + "demo_db.molecules.find_one()['fingerprints']['morgan_fp']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From here, we can directly perform similarity search. `similarity` provides two methods that take advantage of fingerprint screening: `similaritySearch` and `similaritySearchAggregate`. The latter shifts much of the computation into the MongoDB server by using an aggregation pipeline and may improve performance when working with performant or sharded MongoDB servers. " + "From here, we can directly perform similarity search. `similarity` provides two methods that take advantage of fingerprint screening: `SimSearch` and `SimSearchAggregate`. The latter shifts much of the computation into the MongoDB server by using an aggregation pipeline and can dramatically improve performance when working with sharded MongoDB servers." 
] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "similaritySearch: [[0.35294117647058826, 'c1ccc(P(c2ccccc2)c2ccccc2)cc1'], [0.4117647058823529, 'Cc1ccc(S)cc1'], [0.35, 'CC(O)(c1ccccc1)c1ccccc1']]\n", - "\n", - "\n", - "similaritySearchAggregate: [[0.35294117647058826, 'c1ccc(P(c2ccccc2)c2ccccc2)cc1'], [0.4117647058823529, 'Cc1ccc(S)cc1'], [0.35, 'CC(O)(c1ccccc1)c1ccccc1']]\n" - ] - } - ], + "outputs": [], "source": [ "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", "# Perform a similarity search on TestDB for q_mol with a Tanimoto threshold of 0.4. \n", - "results1 = similarity.similaritySearch(q_mol, TestDB, 0.35)\n", + "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", "\n", "# Do the same thing, but use the MongoDB Aggregation Pipeline. \n", - "results2 = similarity.similaritySearchAggregate(q_mol, TestDB, 0.35)\n", + "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", "\n", "print('similaritySearch: {}'.format(results1))\n", "print('\\n')\n", "print('similaritySearchAggregate: {}'.format(results2))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the search returns only the index for the molecule, which in this case is the InChIKey; users should find it easy to go from the index to the full molecule document by way of a quick search. This also makes it easier for users to retrieve molecules when indices represent multiple tautomers or isomers in the collection.\n", + "\n", + "`SimSearch` and `SimSearchAggregate` both make use of the conventional fingerprint screening method. `similarity` also supports searching using Locality Sensitive Hashing, as developed by ChEMBL in an excellent [blog post](http://chembl.blogspot.com/2015/08/lsh-based-similarity-search-in-mongodb.html). 
The method here is called `SimSearchLSH` and requires a little bit more setup work:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate 100 different permutations of length 2048 and save them in demo_db.permutations as separate documents.\n", + "similarity.AddRandPermutations(demo_db.permutations)\n", + "\n", + "# Add locality-sensitive hash values to each document in demo_db.molecules by splitting the 100 different permutations\n", + "# in demo_db.permutations into 25 different buckets. \n", + "similarity.AddLocalityHashes(demo_db.molecules, demo_db.permutations, 25)\n", + "\n", + "# Create 25 different collections in demo_db that each store a subset of hash values for molecules in demo_db.molecules.\n", + "similarity.AddHashCollections(demo_db, demo_db.molecules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try a search using the query molecule from earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", + "\n", + "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, demo_db.permutations, threshold=0.8)\n", + "\n", + "print('similaritySearchLSH: {}'.format(results3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The LSH algorithm relies on random permutations using the `numpy` module, so it yields non-deterministic results. This means that LSH is well-suited for *scanning* datasets (its performance on large datasets is faster than either similarity search method), but is less accurate than regular similarity search, especially below thresholds of 0.7. 
Specific notes on benchmarks can be found in \"Benchmarking Similarity Search.\"" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Substructure Search\n", "\n", - "Likewise, `mongordkit.Search.substructure` supports substructure search best on databases prepared by `write`. Database requirements are identical to those for similarity search: a `molecules` collection whose documents have `rdmol` and `smiles` fields. \n", + "`mongordkit.Search.substructure` supports substructure search best on collections prepared by `write`. Requirements are identical to those for similarity search: a `molecules` collection whose documents have `rdmol` and `index` fields. \n", "\n", "`substructure.SubSearchNaive` provides a fingerprint-less, slower implementation of substructure search suitable for very small databases:" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['c1ccc(-c2ccccc2OCCOc2ccccc2-c2ccccc2)cc1',\n", - " 'COc1ccc(Cc2ccc(OC)cc2)cc1',\n", - " 'COc1cc([N+](=O)[O-])c(N)c([N+](=O)[O-])c1',\n", - " 'COc1ccc(/C=N/O)cc1',\n", - " 'Cc1nc2ccccc2c(Oc2ccccc2)c1-c1ccccc1',\n", - " 'O/N=C/c1ccc2c(c1)OCO2',\n", - " 'COc1ccc(CC#N)cc1',\n", - " 'COc1ccc(C(C)(C)C#N)cc1']" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "q_mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')\n", "\n", "# Perform a substructure search for q_mol on TestDB. 
\n", - "substructure.SubSearchNaive(q_mol, TestDB, chirality=False)" + "substructure.SubSearchNaive(q_mol, demo_db.molecules, chirality=False)" ] }, { @@ -241,24 +359,26 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "substructure.AddPatternFingerprints(demo_db.molecules)\n", + "substructure.SubSearch(q_mol, demo_db.molecules, chirality=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `.Search` contents" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'substructure' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msubstructure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAddPatternFingerprints\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTestDB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmolecules\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTestDB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmorgan_fp_counts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlength\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msubstructure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSubSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_mol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTestDB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchirality\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'substructure' is not defined" - ] - } - ], "source": [ - "substructure.AddPatternFingerprints(TestDB.molecules, TestDB.mfp_counts, 
length=None)\n", - "substructure.SubSearch(q_mol, TestDB, chirality=False)" + "mongordkit.Search.**PrepareForSearch**(db (*MongoDB database for hash information*), mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), perm_collection (*MongoDB collection*)) --> None" ] }, { @@ -267,17 +387,30 @@ "source": [ "## `.similarity` Contents\n", "\n", + "### Constants:\n", + "- DEFAULT_THRESHOLD = 0.8\n", + "- DEFAULT_MORGAN_RADIUS = 2\n", + "- DEFAULT_MORGAN_LENGTH = 2048\n", + "- DEFAULT_BIT_N = 2048\n", + "- DEFAULT_BUCKET_N = 25\n", + "- DEFAULT_PERM_LEN = 2048\n", + "- DEFAULT_PERM_N = 100\n", + "\n", "mongordkit.Search.similarity.**AddMorganFingerprints**(mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), radius=2 (*int: radius of Morgan fingerprint*), length=2048 (*int: length of Morgan fingerprint bit vector*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchNaive**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**SimSearchNaive**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", + "\n", + "mongordkit.Search.similarity.**SimSearch**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", + "\n", + "mongordkit.Search.similarity.**SimSearchAggregate**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", "\n", - "mongordkit.Search.similarity.**SimSearch**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> 
*list: results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**AddRandPermutations**(perm_collection (*MongoDB collection*), len=2048 (*int: length corresponding to length of fingerprint bit vectors*), num=100 (*int: number of permutations*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchAggregate**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**AddLocalityHashes**(mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), nBuckets=25 (*int: number of hash buckets. The number of permutations (mod NBuckets) must be 0*)) --> None\n", "\n", - "mongordkit.Search.**AddRandPermutations**(perm_collection (*MongoDB collection), len=2048, num=100) --> None\n", + "mongordkit.Search.similarity.**AddHashCollections**(db (*MongoDB database*), mol_collection (*MongoDB collection*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*" + "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" ] }, { @@ -286,7 +419,7 @@ "source": [ "## `.substructure` Contents\n", "\n", - "mongordkit.Search.substructure.**AddPatternFingerprints**(db, length=2048 (*int: length of Pattern fingerprint bit vector*)) --> None\n", + "mongordkit.Search.substructure.**AddPatternFingerprints**(mol_collection (MongoDB collection), length=2048 (*int: length of 
Pattern fingerprint bit vector*)) --> None\n", "\n", "mongordkit.Search.similarity.**SubSearchNaive**(pattern (*rdmol object*), db, chirality=False (*boolean: include chirality in search or not*)) --> *list: results with format [smiles]*\n", "\n", diff --git a/docs/notebooks/Creating and Writing to MongoDB.ipynb b/docs/notebooks/Creating and Writing to MongoDB.ipynb index cdd4ad1..aec730f 100644 --- a/docs/notebooks/Creating and Writing to MongoDB.ipynb +++ b/docs/notebooks/Creating and Writing to MongoDB.ipynb @@ -488,7 +488,8 @@ "# If we had used inchikey again, the write would have imported 0 molecules.\n", "scheme = registration.MolDocScheme()\n", "scheme.set_index('CanonicalSmiles')\n", - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', \n", + " scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" ] }, { diff --git a/docs/notebooks/Similarity and Substructure Search.ipynb b/docs/notebooks/Similarity and Substructure Search.ipynb index c4263b5..837c466 100644 --- a/docs/notebooks/Similarity and Substructure Search.ipynb +++ b/docs/notebooks/Similarity and Substructure Search.ipynb @@ -6,18 +6,19 @@ "source": [ "# Similarity and Substructure Search\n", "\n", - "Last updated: 7/27/20\n", + "Last updated: 8/11/20\n", "\n", "Methods for similarity and substructure search are included in the `mongordkit.Search` module." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from mongordkit.Search import similarity, substructure, utils\n", + "from mongordkit import Search\n", "from mongordkit.Database import create, write\n", "from rdkit import Chem\n", "import pymongo" @@ -29,28 +30,255 @@ "source": [ "## Reset Cells\n", "\n", - "Run these cells to reset the local MongoDB instance used in this notebook." 
+ "Run these cells to reset the MongoDB database used in this notebook." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, + "outputs": [], + "source": [ + "client = pymongo.MongoClient()\n", + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing for Search\n", + "Adequately preparing the database for searching requires adding a variety of fingerprints and hashes. You can easily perform all of the setup work required for similarity and substructure search by calling the method `Search.PrepareForSearch`. Generally, workflow will follow straight from the following two lines into search calls:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['TestDatabase', 'admin', 'config', 'db', 'local']\n", - "['admin', 'config', 'db', 'local']\n" + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted 
undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: 
Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [23:01:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted 
undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were 
rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: 
Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n", + "Preparing database and collections for search...\n", + "Added pattern fps, morgan fps, and support for LSH.\n" ] } ], "source": [ - "client = pymongo.MongoClient()\n", - "print(client.list_database_names())\n", - "client.drop_database('TestDatabase')\n", - "print(client.list_database_names())" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')\n", + "Search.PrepareForSearch(demo_db, demo_db.molecules, demo_db.mfp_counts, demo_db.permutations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, the rest of this notebook will explicitly note the addition 
of fingerprints and hashes in an effort to better communicate how the code actually works. Let's reset the database again so that we can insert the hashes step by step without any issues." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "client.drop_database('demo_db')\n", + "demo_db = client.demo_db" ] }, { @@ -59,29 +287,130 @@ "source": [ "## Similarity Search\n", "\n", - "`mongordkit.Search.similarity` supports similarity search best on a database prepared by `mongordkit.Database.write`. Users can also use any database that has a `molecules` collection where each document in that collection has the following fields:\n", + "`mongordkit.Search.similarity` supports similarity search best on a MongoDB collection prepared by `mongordkit.Database.write`. For the general level of similarity search, users can also use any collection that has documents with the following fields:\n", "- `'rdmol': binary pickle object`\n", - "- `'smiles': some SMILES string`" + "- `'index': a unique identifier for each molecule`\n", + "- `'fingerprints': {a nested document that can be blank at the start}`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's run through an example of similarity search. First, we'll have to set up our database:" + "Let's run through an example of similarity search. First, we'll write into the database 200 molecules from a data file included in the `mongordkit` package. We will use default write settings." 
] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 8, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "populating mongodb collection with compounds from chembl...\n", - "200 molecules successfully imported\n" + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:44] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:44] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged; 
Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] 
WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] 
WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n" ] }, { @@ -90,90 +419,90 @@ "200" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "TestDB = create.createFromHostPort('TestDatabase', host='localhost', port=27017)\n", - "write.writeFromSDF(TestDB, '../../data/test_data/first_200.props.sdf', 'test')" + "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`similarity.SimSearchNaive` will directly loop through the database and display results. However, this implementation is extremely slow for any decently-sized database. Instead, `similarity` supports precalculating the following kinds of fingerprints for screening: \n", - "- Morgan (length 1048)\n", + "`similarity.SimSearchNaive` will directly loop through the database and display results. This is good for purposes of verifying accuracy. However, this implementation is extremely slow for any decently-sized database. Instead, `similarity` supports precalculating the following kinds of fingerprints for screening: \n", + "- Morgan (default radius 2, length 2048)\n", "\n", - "through `similarity.addMorganFingerprints`. For each document in a passed in database's `molecules` collection, this method creates a nested field that contains `{morgan_fp: {bits: }, {count: }}`. 
Note that `addMorganFingerprints` also creates indices on `morgan_fp[bits]` and `morgan_fp[count]` to speed search. " + "through `similarity.AddMorganFingerprints`. For each document in a passed in collection, this method adds the nested field `{morgan_fp: {bits: }, {count: }}` to the document's `fingerprint` field. `AddMorganFingerprints` also creates indices on `morgan_fp[bits]` and `morgan_fp[count]` to speed search. " ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "similarity.addMorganFingerprints(TestDB, radius=2, length=1024)" + "similarity.AddMorganFingerprints(demo_db.molecules, demo_db.mfp_counts)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'bits': [33,\n", - " 56,\n", - " 84,\n", - " 130,\n", - " 313,\n", + "{'bits': [84,\n", " 314,\n", " 356,\n", " 547,\n", " 650,\n", - " 698,\n", - " 744,\n", " 747,\n", - " 849,\n", - " 853,\n", - " 967],\n", - " 'count': 15}" + " 967,\n", + " 1057,\n", + " 1080,\n", + " 1154,\n", + " 1337,\n", + " 1380,\n", + " 1722,\n", + " 1768,\n", + " 1873,\n", + " 1877],\n", + " 'count': 16}" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "TestDB.molecules.find_one()['morgan_fp']" + "demo_db.molecules.find_one()['fingerprints']['morgan_fp']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From here, we can directly perform similarity search. `similarity` provides two methods that take advantage of fingerprint screening: `similaritySearch` and `similaritySearchAggregate`. The latter shifts much of the computation into the MongoDB server by using an aggregation pipeline and may improve performance when working with performant or sharded MongoDB servers. " + "From here, we can directly perform similarity search. 
`similarity` provides two methods that take advantage of fingerprint screening: `SimSearch` and `SimSearchAggregate`. The latter shifts much of the computation into the MongoDB server by using an aggregation pipeline and can dramatically improve performance when working with sharded MongoDB servers." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "similaritySearch: [[0.35294117647058826, 'c1ccc(P(c2ccccc2)c2ccccc2)cc1'], [0.4117647058823529, 'Cc1ccc(S)cc1'], [0.35, 'CC(O)(c1ccccc1)c1ccccc1']]\n", + "similaritySearch: []\n", "\n", "\n", - "similaritySearchAggregate: [[0.35294117647058826, 'c1ccc(P(c2ccccc2)c2ccccc2)cc1'], [0.4117647058823529, 'Cc1ccc(S)cc1'], [0.35, 'CC(O)(c1ccccc1)c1ccccc1']]\n" + "similaritySearchAggregate: []\n" ] } ], @@ -181,46 +510,107 @@ "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", "# Perform a similarity search on TestDB for q_mol with a Tanimoto threshold of 0.4. \n", - "results1 = similarity.similaritySearch(q_mol, TestDB, 0.35)\n", + "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", "\n", "# Do the same thing, but use the MongoDB Aggregation Pipeline. \n", - "results2 = similarity.similaritySearchAggregate(q_mol, TestDB, 0.35)\n", + "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", "\n", "print('similaritySearch: {}'.format(results1))\n", "print('\\n')\n", "print('similaritySearchAggregate: {}'.format(results2))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the search returns only the index for the molecule, which in this case is the InChIKey; users should find it easy to go from the index to the full molecule document by way of a quick search.
This also makes it easier for users to retrieve molecules when indices represent multiple tautomers or isomers in the collection.\n", + "\n", + "`SimSearch` and `SimSearchAggregate` both make use of the conventional fingerprint screening method. `similarity` also supports searching using Locality Sensitive Hashing, as developed by ChEMBL in an excellent [blog post](http://chembl.blogspot.com/2015/08/lsh-based-similarity-search-in-mongodb.html). The method here is called `SimSearchLSH` and requires a little bit more setup work:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate 100 different permutations of length 2048 and save them in demo_db.permutations as separate documents.\n", + "similarity.AddRandPermutations(demo_db.permutations)\n", + "\n", + "# Add locality-sensitive hash values to each document in demo_db.molecules by splitting the 100 different permutations\n", + "# in demo_db.permutations into 25 different buckets.
\n", + "similarity.AddLocalityHashes(demo_db.molecules, demo_db.permutations, 25)\n", + "\n", + "# Create 25 different collections in demo_db, each storing a subset of the hash values for molecules in demo_db.molecules.\n", + "similarity.AddHashCollections(demo_db, demo_db.molecules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try a search using the query molecule from earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "similaritySearchLSH: []\n" + ] + } + ], + "source": [ + "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", + "\n", + "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, demo_db.permutations, threshold=0.8)\n", + "\n", + "print('similaritySearchLSH: {}'.format(results3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The LSH algorithm relies on random permutations using the `numpy` module, so it yields non-deterministic results. This means that LSH is well-suited for *scanning* datasets (its performance on large datasets is faster than either similarity search method), but is less accurate than regular similarity search, especially below thresholds of 0.7. Specific notes on benchmarks can be found in \"Benchmarking Similarity Search.\"" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Substructure Search\n", "\n", - "Likewise, `mongordkit.Search.substructure` supports substructure search best on databases prepared by `write`. Database requirements are identical to those for similarity search: a `molecules` collection whose documents have `rdmol` and `smiles` fields. \n", + "`mongordkit.Search.substructure` supports substructure search best on collections prepared by `write`. Requirements are identical to those for similarity search: a `molecules` collection whose documents have `rdmol` and `index` fields.
\n", "\n", "`substructure.SubSearchNaive` provides a fingerprint-less, slower implementation of substructure search suitable for very small databases:" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['c1ccc(-c2ccccc2OCCOc2ccccc2-c2ccccc2)cc1',\n", - " 'COc1ccc(Cc2ccc(OC)cc2)cc1',\n", - " 'COc1cc([N+](=O)[O-])c(N)c([N+](=O)[O-])c1',\n", - " 'COc1ccc(/C=N/O)cc1',\n", - " 'Cc1nc2ccccc2c(Oc2ccccc2)c1-c1ccccc1',\n", - " 'O/N=C/c1ccc2c(c1)OCO2',\n", - " 'COc1ccc(CC#N)cc1',\n", - " 'COc1ccc(C(C)(C)C#N)cc1']" + "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", + " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", + " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", + " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", + " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", + " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", + " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", + " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" ] }, - "execution_count": 27, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -229,7 +619,7 @@ "q_mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')\n", "\n", "# Perform a substructure search for q_mol on TestDB. 
\n", - "substructure.SubSearchNaive(q_mol, TestDB, chirality=False)" + "substructure.SubSearchNaive(q_mol, demo_db.molecules, chirality=False)" ] }, { @@ -241,31 +631,44 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'substructure' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msubstructure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAddPatternFingerprints\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTestDB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmolecules\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTestDB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmorgan_fp_counts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlength\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msubstructure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSubSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_mol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTestDB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchirality\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'substructure' is not defined" - ] + "data": { + "text/plain": [ + "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", + " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", + " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", + " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", + " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", + " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", + " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", + " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": 
"execute_result" } ], "source": [ - "substructure.AddPatternFingerprints(TestDB.molecules, TestDB.mfp_counts, length=None)\n", - "substructure.SubSearch(q_mol, TestDB, chirality=False)" + "substructure.AddPatternFingerprints(demo_db.molecules)\n", + "substructure.SubSearch(q_mol, demo_db.molecules, chirality=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Substructure Searching using Locality Sensitive Hashing" + "## `.Search` Contents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mongordkit.Search.**PrepareForSearch**(db (*MongoDB database for hash information*), mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), perm_collection (*MongoDB collection*)) --> None" ] }, { @@ -274,13 +677,22 @@ "source": [ "## `.similarity` Contents\n", "\n", + "### Constants:\n", + "- DEFAULT_THRESHOLD = 0.8\n", + "- DEFAULT_MORGAN_RADIUS = 2\n", + "- DEFAULT_MORGAN_LEN = 2048\n", + "- DEFAULT_BIT_N = 2048\n", + "- DEFAULT_BUCKET_N = 25\n", + "- DEFAULT_PERM_LEN = 2048\n", + "- DEFAULT_PERM_N = 100\n", + "\n", "mongordkit.Search.similarity.**AddMorganFingerprints**(mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), radius=2 (*int: radius of Morgan fingerprint*), length=2048 (*int: length of Morgan fingerprint bit vector*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchNaive**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**SimSearchNaive**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", "\n", - "mongordkit.Search.similarity.**SimSearch**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list:
results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**SimSearch**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", "\n", - "mongordkit.Search.similarity.**SimSearchAggregate**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*\n", + "mongordkit.Search.similarity.**SimSearchAggregate**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", "\n", "mongordkit.Search.similarity.**AddRandPermutations**(perm_collection (*MongoDB collection*), len=2048 (*int: length corresponding to length of fingerprint bit vectors*), num=100 (*int: number of permutations*)) --> None\n", "\n", @@ -288,7 +700,7 @@ "\n", "mongordkit.Search.similarity.**AddHashCollections**(db (*MongoDB database*), mol_collection (*MongoDB collection*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, smiles]*" + "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" ] }, { @@ -297,7 +709,7 @@ "source": [ "## `.substructure` Contents\n", "\n", - "mongordkit.Search.substructure.**AddPatternFingerprints**(db, length=2048 (*int: length of Pattern fingerprint bit vector*)) --> None\n", + 
"mongordkit.Search.substructure.**AddPatternFingerprints**(mol_collection (MongoDB collection), length=2048 (*int: length of Pattern fingerprint bit vector*)) --> None\n", "\n", "mongordkit.Search.similarity.**SubSearchNaive**(pattern (*rdmol object*), db, chirality=False (*boolean: include chirality in search or not*)) --> *list: results with format [smiles]*\n", "\n", diff --git a/mongordkit/Database/__pycache__/create.cpython-37.pyc b/mongordkit/Database/__pycache__/create.cpython-37.pyc index 495d11c6db7158328a7c29037a5f64c008dd1bbf..3366231c901096cf8174c21a55643a775d2f57be 100644 GIT binary patch delta 20 acmaFM^_Gj($ZN*T!xQG{RZbjFac?YyogX)7EEm8BX-mNaQm1zaqYP3pgceU;k zU7~e(*Qwso8jm~xtF(4Q7lk#)xA+7~x%DCQ;9cT$H*mMCa0rzmr;cwAod-JfZ)8OU zKD7e7V?$piYFpPjTfNPj?e)Fw>+Oy>_kQcW?Y-V&>jMfy?E+2+R}CI4ufr2w0-$&d1U?}nC`&|>h953VGc<6>go(~ruDc5KjGQbrv#Gs^P2Nr~p)(vC{9 zS4sA&B*kMhso*S$A-@_?IVs-JliVo7uOx+I=z(9+(t|4>XzHuf_v`I9lOkC3>QObd z=zeO^{TtfB_b^sZs`Hu;zK8MP+v93t+>`5vS`lg`RRfD_G^a*@&kXDJtfrX+t`E2q zv5-VVM(n_km>-2C7?4hDlSBd8y}r|1zrM4xA-zH@pAwrh3r5JnmQNg?vfCu^r3O_A z$INyHj?E~1C7t!HjopnVxjv4Zz_(m7wz%c7i1Bb$D!EMTIE(_%Ib?(ecI<%-B#d}$ zM=@ulKOsut#PRoumFgq&!9#-nC(MqTsk&RZBxz45V=Mg(#=wd!5;C*^df0*M!Z5Jv z)Fkh3ygRq*soE1Ohruo4S^k800Vs%Fz|R6p9Y16|f>m1)OcObtGM+^dTE4z@v;AJL zv)$d8Ds&ysiKLx=?DZK(+)p&(D!?5GAwrO9+1cLR-ct%zC6G9xce0XIat)yI_k-5j z-1-&NmPCHcA&`Zstm_3n6J-_WJs~l?=+YIa;zmu>76VjdA^t=`SuARcQ6bC|r;EJw zmngw_waG+Yeig>l&%!)GNaQ(%p1h3fHKt1ty7RThFGAZd>&rJ`dxp#QkVDu8<00eA zEf$WTxGWudnbVOIsj@1ztjOwHAzPLwN^?9B6?t54f_K5WB~Tk)1)3yO>V90W z!=2F$cuf7#^lies0(B+pdM;x^y#}|b^9JZRo7OJ;JzBe)ED~BfrH}TE&^|m^0Yb~q z2(7R{XrIpr4SOltLqao75n5?RXe&TU<{Y7|03pe*lF)ws1EKwxTJ$3j+IMFV+IK0T zl@|!DJV$7i1wtbb=zm6NJKLST{~1m3xH{72dH0ON^k2kiE^O#IO2aC0sXP!;Dhd~v zK%GASJraBS;Ykd^oUxcZUH>tMRe1%@J6=VBIXquNQAhD8ipNl#Ljiw68h;$c6DXcU z@f3(gjXw=9kwfeFGuZqrisw)~kAk3h0mTIrFQRw}1@Z%b8O19okQ$V@1SL_Qo$tGW zy93JCp!Li%-h9|I{x#5KXF%YEZ_EqU;R*5oPX0<|4u6wJ*bUdR*)VV^WIT3283Ae~ 
z78%68%2uNxWSEkZAn}}~hPDOiER_m68+&`(?RW9KQ;Gd7(Y-jIwOz#IQf7Q9(UI2L z!akw>RhVF!Pts{G;NQsd?Nf=N%206txY5Q5l2j;APZ7yqbbWx{F&z%q&$y3D)S)lBDNbSFddSa s498HH?vS(1*(i6L+-Sx-UO?f`=?Xe2#4TQjbV%LjvN^rJf{$VT4i~5T3IG5A literal 4963 zcmeHL&2JmW72oCO@=Ky*xoMEJowi9_r3&R5{Q%T7s4PmAP@6GAIR*q2#BygTuDRT$ zXJ;czSsq#kJ_J4lMGr;M19j{*r(XMS?6oH&NKQHR(*E8omm+N^F>+{o$P)W@xbx=C zn>X*{H+#KSD{Htu{$!=&d|T80Nq^EShsLjQ2YnE(aoyF9=$Uc#Vdf~KQ#AU-8(8r&m;EreYoyR)(@n3&Et_RZexG8ui zu%&do-#Yq8cv^4DpwHkD_u^d!Ub(NW)2um!l<` zpBsa$_fA<%ulnLR%W>F2uIohlRo37G`nVCGsmOdvTWA8&Sc+KY?kjy+miwFd(8A$ySc}D zj%z0k;SX8+ogEBhCbN9kwJquR9%nzDlscP z=yj^gMlyLn^akid$4nL`gU-jo4@XL=o&DRb-*(yuN4v?GlfGm18Mk|8=t6Dq{^aV_ z27AYrq3{w}7jw^~s_C$PTSzFbrl48p*2F^r_hk-blE%rBwnnBfhm3c(*fv!3j(wOe zUBk?t3%mjE5%aB6*@va}$d-QhE(}MuP$sYgM?i_x)Qh8qwZ-;)p*TaRdnI6@bX-TC zZeR}rsg#s`Q(|`RSSIc9N#7Q>8lkq%7tS%_7(EN3g?-%dN3?&?&vpG1EEJnBe3$v} zVXH{DnT5D-g+urS%@|rRt=(YFIh&IzN#HBXPJu+-oXrL3<_WOZ8VAw1v~*!J@{;wr7)kFhGj~%40}U ztmg~00^5XFrS7sYvqetY37j)~i|zX-Y-oC?EVU073DeKaHgl|}e1yR;CX>$4o(dPr zVYk>J%r1Rtx^p3u>JdCc@`cMLg3f+9@<*YYtW3Em@6P-6i6i^?KOFg?#{<@*n6{AI zyKbY=5C+n5T%EW6S^gEz9NK65q*ySrEvM(eThkrpJ~r0#CA05i!y4?4P<%1JBEs|{Swi7M$-2!;rqAkGU8Ys72w zDyFsO><90UFXmxEXjyg;^g@@k!GbB)*Vp3G%$#w4g!qy1Wf!`4E-cxh6$he16B|9~ z(SdSt{Yi$n^zQbJgJ$QT*^YB}eaDN-@0xCC?+W3IxByognlfJ6-)`@B_HMU!4iDaL zwMCYst|6vRP$t{Wo4(r)M|&Hhm2J1ziFa@xBuGNybY#c)9MSD*B>D(Z8q&-faa!h%r{i*C+(!! 
zoR+4|6RK8?(P>;&tvz@t@*{a?&;-#|3%Ze6)9bj6jG>p&GV~mNb=}Z2xYzVd7Ch)N zvgpOTDz))j#j~m}>EbHnJR6Vn{vYtjAEQMi@~3+b7m!G00f~H^_R8u6*ab)gcx1Bt z1QMw)Ad%0eNaQmhk(C(|c?fg?T=EnWd3a~aMaTyzWMAPK<0CzI3kXGxpCOd}c^OIH z7b27l0z#$SuRtg!&;&9CQiMtYlnYgjN(&FAZp0g20|S_IdEg>!7xMVmFX9V&Osn}~ zZAH|5Q?|b;+bJIYuas?7uKrimT9trltrj&T)}$~jqxD=>`I4$TVuePo5~12FE)iKH za+%1th`c}q2W3sXNaQ6VR8qyuM5soJ?-2Pek?#>qK5B@?#l0}Q7i=Kp$c5nyujX)jSGiU!+Y4nzBT6W)eIWWBCbDQ!a@?kIZlEa$9 zX+lsNHp!;7=b6gDfxTWUAfGs#vLr(>Snr?Sa;JDKkHWI$!r zqWCG=7Fm@pdOpj-|9E~2lcT}F@0pHkcHQJ~J#W6$@~Vzu)#;)9YZ);;JZ#TuoA8XS_M|IX*eOMY|PH S<}d*6lAn<&|Em0^QTh*p?7RK| diff --git a/mongordkit/Database/tests/__pycache__/test_create.cpython-37-pytest-5.4.3.pyc b/mongordkit/Database/tests/__pycache__/test_create.cpython-37-pytest-5.4.3.pyc index 4f112e0f7fb42e88f22be710a73f34798c594f07..c238be580ad04120a5942de9566b559feb85afb8 100644 GIT binary patch delta 87 zcmcc3-NB{o#LLU&dR{Aj3nK%=V+JI^0%SV?adFv1rURxO^*J5=#=3 z5{pwcnQpOU4C_mQ&s3riYB8oS; z$Q>pa8swwN6vdp8k`%?9n37Z^1Jnc|WPz+(+$E{SCGik5gn$CYAb&A13NRLdoT@1@ jS(jxoqtfIvENT)WKmiU0AqE~s4lv|ll3?Os;$Q~=S5`ii diff --git a/mongordkit/Database/tests/__pycache__/test_write.cpython-37-pytest-5.4.3.pyc b/mongordkit/Database/tests/__pycache__/test_write.cpython-37-pytest-5.4.3.pyc index 59da7eb7b6893695a763777d2f789f9293ac1050..f537de466671d49eaa17a005b84e7072ed302dac 100644 GIT binary patch literal 6323 zcmeHMO^h5z74E<3znT5>U+e_J5(s#LGrQ}bwT)~m#2bf5$qFziXe87cPxtQjx@US^ z)objfM?$eBP6%@e67pe|6GWVH;Kr6GLbS-i>rw22U6Iyrl!R(HQ}XT^&w`ct=9Fz`c^C3pz`uY z)G71^*vbz@oJC-2B3h4SCR$mXcH7j0emWil&t*Jv8YI&O`i|Dua&6PV*!pVSEWA$I zT~9k{`+8A*kJ!se9Cfq82(MvayUlOE(L@#fAFVEYPewv6v^NAkm-aRyv9J=!>tGjn zoYzHoJLKewPG7Wgzjo$6Tkx{nQTX}PeW=NZ&>8>v5ty~0E(jwl# zBQJnNyj4vj)I$SGGea}9P+DOnv{6=u%Cwa#n@(5_Yp8d_DJ*%q@V<+Ayia;-lYZz& zOQVPN^df49deSrFj@HMA`etTsT74@wHY@!~X!Pye$}72@>vzn)lbgA7-RQfyD?iQb zffJg!yQYU0)vp;l7V6yG+|k2I<_-KE?Yb=!)KxRoiJR0)T-QK5Nm`J!OUVyOGB+SO 
z;8t*uw3)B$=v><9;gmfda1R|l{=lu^(8FQgfuwuiJ)-qJQcv#1I(R>;p>-zjC7JQkh`9=q>^8lA`0hYz5t?N>tVtJ+QG)?y*E;N;?BqbJgyY{+n}u0Jv^ zUaafeKY8_d@Agt%&R@JZ&+MtX^nbL=&8x~YyLDiORS;|NPF)dc=jzH8zki%Mo!gZJ zGn8D0WG79c_Er)}zkfn~X0a}Z96nz6`=WxG3lF5I(xd~~hABmLw=gk3BBzL)A+k*5 zTw$SMk?9iCQ`<8w2@}ZF7cf_@K2x6-HjP_*FHKg`_EjF34>EtWsEw|JY;HK$!hfT8 zn=B(}iDE8jw~{0fQBSnfPOl}R!V6?;EuxyD2Fc*sR-9yUR~B}XLRc0@1G$lIB_a0; z)|2#FD+#)-PE<^PgC`0=cuR;>6bFN~RCHQd5Z&q}tu8}Fy;@i@O4h{Oo@VGXYK@CU zb?@c$=*!vBm-A|Bs?XBN(WaBCNquSzXJG zfgQqLbnSg$|F^a8Xx~SPQ*W#`%}$!u9kr#j)#4?P!dAOi*rFB7NEY5(x7tyUP`L1b zZ9&`(qg!=Te1&?@ER?1RZAMl;$8(RnH(N;@zC&G(LO{A88dmP=HGTV~iIt8@munqE zvMYei@wH-ordl5`%*-sHnU(9CHZaXBG0hmqw0pYrGiTre&j{5jC8{|Ck5SFe{6Upb z?H1%}8SqPd0L~+hv5F)K)tplDS0tGeM*Wr~iDQf+o=}ZSLN&MS@kj1KIQR*-Djal~ zCsgx>sOE>&64j>AI{hqEW1ZA!&;Jdb81glYy$_v0d-ov8TDhl-U($R(TQ89QJxP;3 zApL-(pRF78`Um%dPSER9?)BXC!{>i$wV1iw>ZaWopmVho17*bw4(I-^0Wk~F|H*rR zt+DQk1DK#7yb*_p942yv$O}Zs7{xJ=$G~3@JT(sh^!e)fW=n2d%|zT?ug^>%g*Zin z@CEca;yzF0Wg=f7GEd}-L|!5CB_dxY@+wHZI*A`&rKaOV>O@{6@--qSh`dguL1Y4F zs*E!mm}MmC+wwGsLKuGxU+p2j9@Qu7f2VNw@B|vEF`tY>i;v;Zh)1PQcvKVU^B?o5 z2z!)ArLgDO9u;Aa@~D(;Jr9qnT6&hx(8E1L9)gdy58o1KunkG}E`bJj$?yAY{}R%_ zCg~yKd`!|q#CeKF(CZWKr4Z+@-0S&ig>Ky1>i+=--X#N!pg;v4CdEbUX0q?`-v*OI z3?^IM_{LWBPM3UQcs=Cj$VCu^Do^s+j!q!W5K&&9K$MArOK2fiksfyE7CBfOb?2T0 zEvR#b0h^J-gnnuvSVNG8pv}uY9OhXZU65kBR@1NWhlzQ z*$warTG*(>M3@JZEhL15wxg4?@!!AnUl*r92_oPc62-n*+O!3d=B5L!|<&- z5x#k&;Tw3TkK_fL)XN;SnrOhIEvbB$&5F_T+#4` z#K*wefpAtCAVsoCwrA{^aImd@h29Zr*qd(OiIHN+o!mtqxc#{Yd-QW({)KD;frbxj zw5gu0(Svosau7t+22&hvd3|eP%gaKs&gpwXUB|gY<@VH2ss07ING0u2w$K7^gO|;f!;|$UsHyW}NmwJQC z13hc#fQ=wmYaL0q9~?p0i4aD6cnu%Q*bCb=fR$KQIWj_ zCTJ8!1G$JK?nV>)|H!|PARk9bgG*=Vi|X^+rzbF#lMopDT?liC-MFCA2$#U@4iNtx z+q`G*Hz(~Kxs+k@hEqD62!X+&U04R!3w(siG zNFdCXm#)j{lQo3^^UpdUt97q%15PLfLE#5MCk>Hf178b*8(Xb}QcXc17VtSi(S%qg za+b)~iM&l@Z{p0+^(`nLO<5;NZ?551T-$Z=RJf)7LgDV_x;QQ>js+^!(oon4U!x%+ zP8JnjSyed0I^MX#=^vM^F9$UhPImw;AOI7h9F&LZK$ZuM|hLK;Y> z$;}8E9^~P=S@R(0w^_ciGR$c!7mTw(?3>Y=p0wD)y`c1Vi99(;@uaIA&}!Hi^;a{F 
L8a2IUxYoY_RC*P@ literal 4115 zcmc(iO^hQ)700XE?f$Smo|)avX2VAqLO!;m#PhK`n~lVxkX<6>zzB#)kPoqXrh0Zf z)9yA^?XbJFFBuLTkk|u?A{3!9;=+w{jyWNY9Q%aC0fj>@+;ZaoUbh{4NQOZoiLH9| zy6RPR)vKysy?%LVsqv4${OipRY$5(G=6-kpwP;HtTx_x^J>{co6CJJCQkpy3YMPuhmgMEA>*n6K3IVXrlEjGMQo7<6(A& z_Y2oV*=Ts@(h>{x zWNE?Domu{hEOP&WC@-tRFm~;KsBBi+Uv9;BQqbNnmU;5*lH2Jh$(B)Q zu0ROsVBf!D1IbzH^=}sF%=tv0}0?QK42noNp||-R|;Un50R6FYK8?KaNb` z6i&IMg`>m1iA>RWJ&Z>Yufiw$KtGE90?GbA?UYq^NIzz>uS-wk_43*z=Z3z{^L_Pl$sBpPcLB<7S8t?f4i%)k&?lWW> z6=WKxAmhzL26`RI8M|kCW9ck!JhTj9@*MtnD9zijWRE&<`hBt{ z9kBfZ{lc5Sy`yRCocjdm0M^CIbkoPvT`x6V!(elX;W;*gt`(c1{h!!e;(8OFAGP=B0bM?KEs-5 zr5_{l6%t=1@ih`(C-FFmZ$O|S-6VRA#5YMiLE>8^o`hHcYDLWouiN^mnL&ls7Z$Mk z;%S03nIH$DIf$61<|aCrM=D`;g|YfBV|8r~x6?XDoRrAjM%x)}h=ViQv}d%*enncaR4p@Un`%X^=1nGTt0Had6loX2xr(&@7~cfAPA{DQI@NT2;|{FQ@nwW{ z=fK1w8$cT724j@qR%HzP!X4tnH16+e`UWj(=DsPc)aYmNr0Y~< zlR@Ex!(o&tYHZT@tq6f3V)`WOUgR%I{)+5OzaqX^d(J)VD2|=I iZx2#6ilgt)F*0=dc;bR+;X{-E))lK|c{cua&-pi%TJb&r diff --git a/mongordkit/Search/__init__.py b/mongordkit/Search/__init__.py index e69de29..73fd660 100644 --- a/mongordkit/Search/__init__.py +++ b/mongordkit/Search/__init__.py @@ -0,0 +1,12 @@ +from .substructure import * +from .similarity import * + + +def PrepareForSearch(db, mol_collection, count_collection, perm_collection): + print("Preparing database and collections for search...") + AddPatternFingerprints(mol_collection) + AddMorganFingerprints(mol_collection, count_collection) + similarity.AddRandPermutations(perm_collection) + similarity.AddLocalityHashes(mol_collection, perm_collection, DEFAULT_BUCKET_N) + similarity.AddHashCollections(db, mol_collection) + print("Added pattern fps, morgan fps, and support for LSH.") \ No newline at end of file diff --git a/mongordkit/Search/__pycache__/__init__.cpython-37.pyc b/mongordkit/Search/__pycache__/__init__.cpython-37.pyc index cb98211bbf5b72a3988c7e30aafcf4d72e542376..49f5b05457114b55f7679e56ecd89c46ca4c68d4 100644 GIT binary patch literal 684 
zcmYjPO>fgM7`EeftE#P(Avkf!kswHr*b&gScGU!>s%U%3C30i0jh4ibU+1tXxA8;P z8~mkQIq?@b@j3ymw%)Jj{q)Dh@o}Hv`uO8CdgT!E%NqX|+T;ugULq4j(3C7&Nl`k2 zeJ7zKIt$`;e^U(cXiZ7+ST`sc7ZgJBYA{PQ`ZC#~!+f zGIOwFjdfU&itcO-cC^`oU_UmydpFmz;5x}SuHc5RIY^i1!i`m$%Ge|-2lrZO7bMs5 z*7yCp_vb=L;TGH&sdM+bfH!WY^oIY_Z8nrep|oi-r;97!>($)?o0#wgX3yKy2{x6w zfgya3_}8Y&K3w-9$&wW3Gq>S04EcuD=Tc{-;mvH}Qv|0f<|#&8ao9=-hvUo1`D(h1 zE>@#2m&<4to?wu5jvf!?b!tbf!$GFf=<%%KFji%5_K_zAw%Y@K5`BQL@Mj#XKxzo$ ztxmwGVk>npmT-e=kg0s5-fD4^m{xDqMRSKi6eW3LqR209>tTESa-wwWwNDKLWQ6u` y_i+#C5!F3JW*)1Lpj?B|Wo$|@g1FCPR--&!|0; zT0Y%!W<={!75f&Uid0>eP^n^u1y$6giVYMS7A&aZFJRLU6O*F&OsSp{D(-4xN|bPSKE9e3Gq_r? zXY5%#n-z1oTC}H8`;?eRen~8dMbv(YTU-(^;ofO+TAabXGxoGyLanpn9PXZt+jv=+ zxH>0Z5$93nWue|z%CA1BnW>c3FneeFkzMm|3FM9Ib-RiG%vz%nXS3^;Z*5ziz59oM zK%eK|d#<6TqhuB3`4V!^1(F)gFMV*Af7kKv{T`BCrKkIfuRhWO#n*j9sGsYHsUzj7 z+B5ba1?pk?NLg1t`nZ>hO8J??Y>*P#l5(Vulu!G)zzEV$wO%I31lnFoUI_HVywLrL zAoEmRPzlXkRLmV$noYN1*E$V6{KWC==BCwhny&A1JsHG| zZToS@YS6>YoA++qzjHjsqs~Ftz}Aqoayr9j~3Xfxeau|cia}rnz!B7 zuDgECJibdW^UbDh`<^NBLPy$7yXAXB4f%Cxdv&)VR+Ay1UVM)SdOVmFbKTytIt{dk zp1-y7Zq#LI`yI)BXgHp4x;y6m+qX7u+>hSB(`oED4UDzl=o_ZhKwqP2^Xs;QS&1jl z^lC0%9d!^QmTFydd@oG#G<5U*Vyk91U&DNbr`B)0zxH5bbL;kjaibFE(9)f# z6UGkSBuRur{kGc(Cu>%tw$<-an6}z&yCp&$ohWBR9qSerkqoR-m`mm$%wU)_k)i&0 zryZtIaNVt;7A~^=Hvd$S=kU5O-@o+0v!!>bR+ku}+pgR4(z@;KA$zGwtNx}GdyXIF zqiiI&mj;%ueGnG!JI&}p#d4n53uxP$N225mHLn)&ox?A$X4R7Vk}6F+sgWAfhd?uj zAFJ#?BeVa*Ksi)cE${X;U-h*|P~5{*pb3@LGVN!C1~tyI8tYI?ozxk%po}P`FDVXG zQKSMrFhu$b%~2io2*~?5{}9O0OYL7x-tcx$+kXRfGE0ic;+I<@I9TkZgVfS6In>E-aQ zDKSZRIaj2dm;xM>tbgdk!I}_&0v0S2($JSIbKCLFCy*>xdZyJ9P{9t=EE4cYxac-3 z3xw)}&6`w%b;yEZ2)EW@y)>QH2+*vUw^~?wl%-efkP_Ri+5rjzftnVnKDP?-cFpT| z9#l35P-;lchb8d@ynt{)Q+AJ`ZKw$z)qpsEO(Vy>1t8FPhZ>WXa5~#$giuEOLCgpDNsU~CnqT(bdoca6e*daq=Y2YF-2j%(-ts7TW-q^^hEU>RzZm2~~@+ zEJ|LcrcObdUI1zDO+4~w&Xjynol!LaQWmf@qZ-J~t7nj#!<{_7@-j-*Xz=u*@snS{ z0QCrS4WGgYG=eC24Gd54Eet=!z6JaVfKfWog|?^3C;RJwfeb^%yFJCv0<6;T5CBoa zz>hoy?Www}!ly6}^FhYX;%S=vhs1lh3@9<~EATARXgTW_{K>;2pe;KG?*eyn2PMA{ 
zWCQ&)y%`YKI%6pQ)LvfxwW`q9FA4Z3z5K5$E$u@EC3A}ay9K2;5oq#DkrVl63dWNm z|1d~1SgME9@Dj5}+Tlzvaip$O%Y_!6iefPFV|Yis!v4oJvLF}a#ncxXY8C*xC4xTy z=zAFJn31yQr0=DEqRR8pcz4woNBk4@@%d=X;xmx|9ebu?gq^6c#kP^@*=-! zYsax00#LYS_T4AA5P)GA2=d>ce@+c*3yVYL$n%OS63|&fIEfX&69UoAzB8408^hS? zwi*X!&1$(VwA(N}xT<}ZYxQU5$}z4o4$5U{sr(KljExh>g*q&nyntMdP?A0nD1~(U zK7QUDl3f_)j|x3CKqR)L{J!=W+6KI4I3+g;h^(!=#Q-z{HDBhh(9G7{X8Y<6HAiiy zmG0TZA6mK6a4{F&Rb=`lx#h0D?6kPeN?C(N*!69s4d2GqS0eORf_r&ZzJV@`qW3Un zM*x3{%FR+jb09Awp~>5GQai0@Tj{-m?t%C0rNP0fJQ&_0?@C-K%Wm%fZL2e z;TPal>Om$3`YeDRp=J#9xftk+0Q$d(fqsgCJ{JQ${LvWb^8osPO@KblK%a_%z7zxf zbdZmMegZ(B0?_9f=qJP!nGAwI2H?y#kEMOi(yddy^mv$RJIU(SahKxBd!C@N+T&n{~%S zoa@(JVRk%3?5Tf2z7ciB@m+fLhVI4yg~cZ6dM=>5MQX@505%U^`5&! zAXDJ+#FAd9IW0sJ1lN*akaLlR8nU4&JYySr1{t=YbA1~MONvl^SBq`wU}KR%nMhkQdzCReHBze33sO1?|URZ6}`$$Ln`TwjCja`9yQm6$HMMm4Wd zLWV)EQ*winA0P>{pk3N7g*(F{7|y&XavmypB(q9B;zz#5R%vWT*=$0 zn~ZKfc0;H_dj{gu*8(-rD99i@W$uBfNW-f3Q56&rIfQTnJ1yq?Ohj;7 zZCf@wKEoS(hP!slmgFo}9^5u1NC&rR4whknj^l9(x}(V11S`SBg=am%TWqUVC+Bj2 z@U#e1J)8ViG~Q<{hJf^#cN|X)3>C(ESIrF$+o>CaDnkx#GK|5#6B#+wNUPIC%-?Bl zgI7f(V}?s@(U{_Deebq{2$*Lg(G*&oL{msKSX1Uf8>u~B8j2%@@5 z!ZPi<a!uPORcL$bp9O6~VdN>hB>rJb55aw)5 z0ouZcyP&Tpuh&m%Z3TMak;E$seo+ zFksJQw;we9o1WeB5J^3;&AR2)%_ml)L;B`M%Fq{h(u(1&02U+K`4Fbt8d5bXd-uk@ z+k=dpIq0dxl)RUylgEv{SdF?-hZI8vFAj*?#V8_V#qD>H*e`BnxR--1FqaKpfOmh&vsQ9wnUSeU!9U$fg36-O}>={A#MVS!Wx!ORw|aX3ZkmhW!G?aS9t5X}+M zB_ANEW?R>ySBL2cD8s_Q{Da@Zv@>^)f3{6wHfeo=Ti=y#r~N`iG!5C?pnwyScN3kk zy+6n6JPOB@e2J;95+WW0u@6NHAlG zl~{LBJ#Pcbj2^{08f=Y08(&Mxp$;Y<5eBpHF(Xf^bEHFMOt9#nsYC{LDBJxTma6Ma z2|{9>ogG_ZFKJ-6iHTS2KQU>+CWPo@5?XjR@RA6kA!5;B$fBZ|ApWs)3xvs9!* zO>9S!u|-cTYAg*Lh3yFq>h>t0&R~t{Ro9bEI`8jL$s^=cG8FhyU@Pmms*&ZP4+#MI zawM_dW!MIwXuRY3!oFx)V8IZ8rJ3~u9ts0Y=8yK@|4ez59>Zk<6^XH=EtHtcMCc1H zlQ5JwRR*}ss|ayZ3746RxlAL-#UXB~l|{2zwg(^&@bbAGcR;S41=_{nl%odu zU_vY)7%v8s6mWlmAew@14!N=O9;B0C93i%N1)=+S@v1QUfjIq(We&pg;ye65FD{6S zfsq8`^I(IQf_YfP1@Rj9WPy4DbuC-3_O0A?wnuH@2uZh#pC0M9J|Y%1~$d&c5To6q>eD1{55H|Tp#7^0SJjq 
zS$vcrGI`s!4YMlA^8_#;76Sk`4n|R7mGLCrI~UtSmq}0KyD>6wjd;H&aRbR68%sKp zd-GI{0wvaT)TH1AZXiVnq1qbJe|GWyk@T3_1ledN0S2MkIKB}}ri>#;kE3r%G2GX8 z_GE^XJ9}aX{x8Sf9VI`%^-IaRM-$bbq4u~L`uF2kCa)m$Bj1u(Pwy=x?2Eq^`Qo3! z3y&6q4HzQWfHVbINSnuTBx}eTs__`4pxgx{SSrSI@)2VHT;4@KRO?twn~HHvNSjl< zdjEk;yn5V9(!p;fB~&<(lKpXW@Sn%;NnSy7FeV=aeVcza#0VcgT-VHdBLkUmO>JJ2eTP>?;d(}M7m_*FIcTBU$DmD5W zl#u6xQ!KX6j7fPhB3)QN63=*G=YW`(YL@m3nEU4`U88ocQF5JjZ;LIMk}y;2NNiSkp$eBe%2)G|KZwnEk2@nbs48h`Ut>vG!@?l6{ypw^_mK2i z8*;TuWA_+npMmxmW_W+Wi1!(aYFb-V&uFEX%`wz7eLlxStBDUz5Jf*4qDMY>%;gLo zW79up?;8$=24_Uyzs%~pg3$OiKSf?Rl=%#RErqRUEr5F_FXONpPPLG;aaVbaP36b0 z!5L#NsFI+=w*sI6GbJ5#bDT8J9!q1Fd8K|BGtN-OMzfqbS!_v$SVU2X_ndg)NmQcM z|5fC$jV1OOfk%5pvN%IMqLT~?7O%+LK{HrL-uI{H2@^q~5c$-En4XUW$OtbeQ0K9~ z<-r7B5h&sOp$|Wr{YPCG&)_9v zs|bs>ppXZ86dfSC7B5b!(p^2Kv--0a1+?2B%y@5ZyJ4H#2WAvCP&gC^o+}ZxPOC~E z`llc1xktom01x>p9FPT!qO4XT1<+&zk3#hTtWy?EM(USXurPb2VKujfb@c*f^50NY z>AphhG!XC(E^bt|oQ)Ww6rY%5Xo*gD#NlrW92-tO&C#24OhmeF!wb{!%{n!^oaWW7 zPH=ak$eA}5s+oAC)hOnM4;KgBp}O7I=O<`XvhgJtI(NMO1rCJ`$N_B!zOFJ%C2H8$ z1zvxWHDBR1BVsj+@E0rkRKvXK-3K4gK}ZmzTSUt|*j%HdocG7lhFF_ul{{LeXz)am z4%eh0cO_9ubeM%OSvFsJ1p4bU1P;7xpyRX_j`~G3CDR{RI&}+432zvpawCo4$@m1# zzJa#oJ~G|&qa@#$HPmp7&}z%yLV4J@Pw`hR(TWUG7yU`6r_rQuBKJB?I!&meNxDuZ znI@Oq5yrZ=k)eH7Qlvwi{G%!6BQBbcbA3TMddULZuk4wabmL50D~Mc z*~vkLfykuh+B-W=%^_}K+x9;p_uu~nEH1Cz8b=6~YbyxTFQOaLMG_Xk5+xfjynjG* zU9qZKWxJ|ZtVqNL=9HG`5{;F7eMLhUPIt;?m}wvIfl4}YiW8N*HGy+p9;hUmmgw|n zo}aDI(dX#6X`bsuHE5P_Y_-#DAIRUPDglz}bpL2+a@zO!fL3*~|M1$a&8}K zQL|T?uE5#mt0dGOE%+joFZyQHtZwLr%0B?Jp%><}XN%?HrQ&2^F|KOu)szqcSIN@+65Ob(qXICxkrcHi#4 zec$)q^2YSE!*KoR@3UK_G4pEOpB?pTTm~j=CF)%K@?V4%Q)nm?Y;bfr6$&3dw=DOIWDdZ&)=7>-{}Uj zBg4c?{3vvz9XIwne&8uTdE~~@Q_c1(?k9fIj(Umi=7o>k9Y1VI)m46&xN%cS8KT1d zr0pefB=@DN>y}r$p7J_w*Gm$q!mI9T=qk~P0yo-zD4U6!puMYPSIHRT$FAoJ8T(2K zH|}l6iRv|zo|5DKCv=MKtM2`_bj8T%ZiG=W0l(RHlV~*Wy1U+s6SwVQnVyS*WwRH^ z*lk7uinQDlI+Hdg(vAWF)n4d#A}Bhlb9W*&VVSzSCU?AEfLWu}mHHd`f|bOKX{B!9 zV`La}bN$ZUTbub(cY48&9|XrrUva%4BE!MFsFl8Qlkv!LGs1N82|hF6IX<%(gU=Ph 
z8-5aJ1#QaAjxgG`Cbkhr(TjTj;w%6^-P}EA9i9VSjwzKk)b+RJnO4i8Cy`65uMvJwmiIzLv ztTa~f7p7LA{mZv6zaL8#Uv9P)W*l|fQe9q?@h*y&J1|G|suH_?l9%)1ruM_j$DUd5 zKFVe`{Z9TtdvY1N<2eXcw)iriIw3DFFmSWzOIBRSwE80u(zZS8bxMrP4^vVxw_Gl7=s1ro3@dq$#WsEbtLLiA^jde*61fA;uYE7+*b zs%dIHLxQ5B2DKu0)$mlaRH{H^W+20c)9b?Kaw`gDX0|Z5f}hAvoH;<>R&L;i;Gu~-9nBLYt2qRK@=9!%<%baGr7w&Gw*rd!v zqm1und|!h*7tfK#GoBE+#V1H8zJO2h^AM~!$6bxj1#aOt#~1ho+-G?OKXn-`P9T*w zKqK`YEc#6WG&O{^Yp9P7UYiJ{)d9la&`yfLC8Mw?>=pKy!Y?p43Xxax5IIJ5Qcj%V zRBEJ!B_JO-w;#qA$N;q46O@OM96Seg`Gt zqwJR3qIVoF5l{8NSCBMNoCO0R1wiV6oZ)EfT)(FKQN&b~ducE#a=aXy zMEsfQhdb&L)t!TA5>)A;7>Gc@HgU&G5G@es$CUx-U$7PSM`O0G0*3?}MczfI~$x6S(1V;kq|tzQcw1l;&K3gZrXfWBGvL#=wFW`KNJiG&Oi ziMP|#ahh9d3#a*mSYv`+=INDUrg{;A?2%+v#{>SYru2hsz9Ch#=I{G4lBHWpMJg-o z1d*3yXK0pSmvJVw6?z>RtCvWP6-Npz6oAGP@XG?}UnD`cRLJg`It!7Nkt^f`mDwnD z{4jrd2icsGd-z2BJ!&yKQA50`mPrp;e}Z%7PMh_0e2?o8%%-HE1e&RWY)T*+1LYa~ zh;AzUl2IWFG8d3EEOYf1dK}RY8I9QYChmCu|CWBlye1yd&4_pwDZRm&>I}|M=Wu3P zNv)Pncu@PCI4>?B1-dAn7w4&Ve2W)!ud;CUb6)&FTuMj1S|$jKPCpB&bI1o%;zdoB zQ$&V9T($r2{}qzbIYUR{R3zM9tkaP(33{!TlCAtS)%CkF@I#rC(l;`j`+c#Rx&k-- zeppTYm{hqWP6AE6hPsEi$9(qcY2dAntRTM)vGwgLNT=EU&IH#lRl6#p!z6JcQXix! 
z#2Nhp2(~Z#bCj#y332?4^8;UhwtAdy>GZ2u72r1^p{hFSY*mrE^vhM8LKEdd_uND$ zodZ$Mx9_PqGyD+Aou(p+D7uuB^oeLvz_(2@7o|Tsnk( z#?^n@f|PXTA*HtXow#281YH%;qxva{pON@Ei8n|PkE>shxC)V#$4)J4m9OP<^-F4h zjl{1=Tqp4+iCYj^38_FgqBQiRFJ&uV>pLHyUrclS7VK>}U~z*lan~%F3t)5g4*DH& zxQQ2v@0!CYxC?AZjc@45)XdY1IyJ?`(^I^=dHYi+(Y9w?QD|?{KF~!&(T3}x2q^Ry zP6`Se8h9d;)Y&A?mZ|ZWCrB-{L_JP=#xgsfPpoE*VltgJDhSL+KCHK?W;Tos5BGM% z-0-$f*;L2$#~3=Mr7}7MQ(rH+cGmfi9Uq+9atk``79JQhb{%62pJogk@ zVZ+fCb^6;&A50pKZkRWt;qvn~CGa|o>V41_$}^2))9R8wE0nHvB7y&AuG1F9lpK^0 Sqf2>gHl0Oh*}3G@9P2*_lOGiT diff --git a/mongordkit/Search/similarity.py b/mongordkit/Search/similarity.py index 83e1e22..b31ac43 100644 --- a/mongordkit/Search/similarity.py +++ b/mongordkit/Search/similarity.py @@ -1,7 +1,4 @@ -import pymongo -import rdkit -import math - +import pymongo, rdkit, math from bson import ObjectId from rdkit import Chem from rdkit.Chem import AllChem @@ -9,8 +6,21 @@ import numpy as np import functools +# Default configurations for a variety of constants. +DEFAULT_THRESHOLD = 0.8 + +# Morgan fingerprints +DEFAULT_MORGAN_RADIUS = 2 +DEFAULT_MORGAN_LEN = 2048 + +# LSH constants +DEFAULT_BIT_N = 2048 +DEFAULT_BUCKET_N = 25 +DEFAULT_PERM_LEN = 2048 +DEFAULT_PERM_N = 100 + -def SimSearchNaive(mol, mol_collection, threshold=0.8): +def SimSearchNaive(mol, mol_collection, threshold=DEFAULT_THRESHOLD): """ Searches MOL_COLLECTION for molecules with Tanimoto similarity to MOL greater than or equal to THRESHOLD. @@ -20,16 +30,19 @@ def SimSearchNaive(mol, mol_collection, threshold=0.8): :return: A list of SMILES that fulfill threshold, along with their tanimoto scores. 
""" results = [] - qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).GetOnBits()) + qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits()) for molDoc in mol_collection.find(): - mfp = list((AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(molDoc['rdmol']), 2, nBits=2048).GetOnBits())) + mfp = list((AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(molDoc['rdmol']), DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits())) tanimoto = calc_tanimoto(qfp, mfp) if tanimoto >= threshold: - results.append([tanimoto, molDoc['smiles']]) + results.append(["tanimoto: " + str(tanimoto)] + + ["index" + ": " + str(mol["index"])]) return results -def AddMorganFingerprints(mol_collection, count_collection, radius=2, length=2048): +def AddMorganFingerprints(mol_collection, count_collection): """ Adds Morgan fingerprint bit vectors and counts with RADIUS and LENGTH bits to all documents in MOL_COLLECTION. Inserts bit frequency information @@ -40,38 +53,43 @@ def AddMorganFingerprints(mol_collection, count_collection, radius=2, length=204 :param length: NBits in desired Morgan fingerprints. Defaults to 2048. 
""" for m in mol_collection.find(): - bit_vector = list((AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(m['rdmol']), radius, length).GetOnBits())) + bit_vector = list((AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(m['rdmol']), + DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits())) count = len(bit_vector) - mol_collection.update_one({'_id': m['_id']}, {'$set': {'morgan_fp': {'bits': bit_vector, + mol_collection.update_one({'_id': m['_id']}, {'$set': {'fingerprints.morgan_fp': {'bits': bit_vector, 'count': count}}}) counts = {} - chunk_size = 100 for m in mol_collection.find(): - for bit in m['morgan_fp']['bits']: + for bit in m['fingerprints']['morgan_fp']['bits']: counts[bit] = counts.get(bit, 0) + 1 for k, v in counts.items(): count_collection.insert_one({'_id': k, 'count': v}) a = count_collection.find_one() - mol_collection.create_index('morgan_fp.bits') - mol_collection.create_index('morgan_fp.count') + mol_collection.create_index('fingerprints.morgan_fp.bits') + mol_collection.create_index('fingerprints.morgan_fp.count') return None -def SimSearch(mol, mol_collection, count_collection=None, threshold=0.8): +def SimSearch(mol, mol_collection, count_collection=None, threshold=DEFAULT_THRESHOLD): """ Searches MOL_COLLECTION for molecules with Tanimoto similarity to MOL greater than or equal to THRESHOLD. :param mol: An rdmol object. :param mol_collection: A MongoDB collection that meets requirements. :param threshold: Tanimoto threshold for similarity. Defaults to 0.8 + :param return_fields: A list of strings that indicate fields to return in addition + to the Tanimoto threshold. Defaults to only canonical smiles. :return: A list of SMILES that fulfill threshold, along with their tanimoto scores. 
""" results = [] a = count_collection.find_one() - qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).GetOnBits()) + qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits()) if threshold == 0: for mol in mol_collection.find(): - results.append([calc_tanimoto(qfp, mol['morgan_fp']['bits']), mol['smiles']]) + tanimoto = [str(calc_tanimoto(qfp, mol['fingerprints']['morgan_fp']['bits']))] + results.append(tanimoto + [str(mol['index'])]) return results qfp_count = len(qfp) fp_min = int(math.ceil((threshold * qfp_count))) @@ -85,17 +103,16 @@ def SimSearch(mol, mol_collection, count_collection=None, threshold=0.8): {'_id': {'$in': qfp}}).sort('count', 1).limit(req_common_count)] else: req_common_bits = qfp[:req_common_count] - for mol in mol_collection.find({'morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, - 'morgan_fp.bits': {'$in': req_common_bits}}): - - - tanimoto = calc_tanimoto(qfp, mol['morgan_fp']['bits']) + for mol in mol_collection.find({'fingerprints.morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, + 'fingerprints.morgan_fp.bits': {'$in': req_common_bits}}): + tanimoto = calc_tanimoto(qfp, mol['fingerprints']['morgan_fp']['bits']) if tanimoto >= threshold: - results.append([tanimoto, mol['smiles']]) + results.append([tanimoto] + + [str(mol["index"])]) return results -def SimSearchAggregate(mol, mol_collection, count_collection=None, threshold=0.8): +def SimSearchAggregate(mol, mol_collection, count_collection=None, threshold=DEFAULT_THRESHOLD): """ Searches MOL_COLLECTION for molecules with Tanimoto similarity to MOL greater than or equal to THRESHOLD. @@ -106,10 +123,12 @@ def SimSearchAggregate(mol, mol_collection, count_collection=None, threshold=0.8 :return: A list of SMILES that fulfill threshold, along with their tanimoto scores. 
""" results = [] - qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).GetOnBits()) + qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits()) if threshold == 0: for mol in mol_collection.find(): - results.append([calc_tanimoto(qfp, mol['morgan_fp']['bits']), mol['smiles']]) + tanimoto = [str(calc_tanimoto(qfp, mol['fingerprints']['morgan_fp']['bits']))] + results.append(tanimoto + [str(mol['index'])]) return results qfp_count = len(qfp) fp_min = int(math.ceil((threshold * qfp_count))) @@ -124,23 +143,23 @@ def SimSearchAggregate(mol, mol_collection, count_collection=None, threshold=0.8 else: req_common_bits = qfp[:req_common_count] aggregate = [ - {'$match': {'morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, - 'morgan_fp.bits': {'$in': req_common_bits}}}, + {'$match': {'fingerprints.morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, + 'fingerprints.morgan_fp.bits': {'$in': req_common_bits}}}, {'$project': - {'tanimoto': {'$let': {'vars': {'common': {'$size': {'$setIntersection': ['$morgan_fp.bits', qfp]}}}, - 'in': {'$divide': ['$$common', {'$add': [qfp_count, {'$subtract': ['$morgan_fp.count', '$$common']}]}]} + {'tanimoto': {'$let': {'vars': {'common': {'$size': {'$setIntersection': ['$fingerprints.morgan_fp.bits', qfp]}}}, + 'in': {'$divide': ['$$common', {'$add': [qfp_count, {'$subtract': ['$fingerprints.morgan_fp.count', '$$common']}]}]} } }, - 'smiles': 1 + 'index': 1 } }, {'$match': {'tanimoto': {'$gte': threshold}}} ] response = mol_collection.aggregate(aggregate) - return [[r['tanimoto'], r['smiles']] for r in response] + return [[r['tanimoto'], r['index']] for r in response] -def AddRandPermutations(perm_collection, len=2048, num=100): +def AddRandPermutations(perm_collection, len=DEFAULT_PERM_LEN, num=DEFAULT_PERM_N): """ Uses the function get_permutations to generate NUM random permutations of bits of length LEN and saves each in COLLECTION as a separate document. 
@@ -153,7 +172,7 @@ def AddRandPermutations(perm_collection, len=2048, num=100): for i, perm in enumerate(get_permutations(len, num))]) -def AddLocalityHashes(mol_collection, perm_collection, nBuckets=25): +def AddLocalityHashes(mol_collection, perm_collection, nBuckets=DEFAULT_BUCKET_N): """ Adds locality-sensitive hash values to each document in MOL_COLLECTION based on permutations in PERM_COLLECTION. This method requires documents @@ -191,7 +210,7 @@ def AddHashCollections(db, mol_collection): {'$push': {'molecules': moldoc['_id']}}, True) -def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=0.8): +def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=DEFAULT_THRESHOLD): """ Conducts a similarity search for query molecule MOL in MOL_COLLECTION with Tanimoto threshold THRESHOLD. @@ -202,11 +221,13 @@ def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=0.8): :return: """ results = [] - qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).GetOnBits()) + qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN).GetOnBits()) qfp_bits = [int(n) for n in qfp] if threshold == 0: for mol in db.molecules.find(): - results.append([calc_tanimoto(qfp, mol['morgan_fp']['bits']), mol['smiles']]) + tanimoto = [str(calc_tanimoto(qfp, mol['fingerprints']['morgan_fp']['bits']))] + results.append(tanimoto + [str(mol['index'])]) return results qfp_count = len(qfp) fp_min = int(math.ceil((threshold * qfp_count))) @@ -223,29 +244,36 @@ def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=0.8): permutations = [p['permutation'] for p in perm_collection.find()] min_hash = get_min_hash(mol, permutations) hash_groups = hash_to_buckets(min_hash) - nested_res = [list(i)[0]['molecules'] for i in - [db['LSHash_' + str(i)].find({'_id': h}, {'molecules': 1}) for i, h in enumerate(hash_groups)]] + nested_res = [] + cursors = [db['LSHash_' + str(i)].find({'_id': h}, 
{'molecules': 1}) for i, h in enumerate(hash_groups)] + for c in cursors: + cursor = list(c) + if len(cursor) == 0: + continue + else: + nested_res.append(cursor[0]['molecules']) hashed_ids = [ObjectId(x) for x in (set([str(item) for sublist in nested_res for item in sublist]))] aggregate = [ - {'$match': {'_id':{'$in': hashed_ids}, 'morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, - 'morgan_fp.bits': {'$in': req_common_bits}}}, + {'$match': {'_id':{'$in': hashed_ids}, 'fingerprints.morgan_fp.count': {'$gte': fp_min, '$lte': fp_max}, + 'fingerprints.morgan_fp.bits': {'$in': req_common_bits}}}, {'$project': - {'tanimoto': {'$let': {'vars': {'common': {'$size': {'$setIntersection': ['$morgan_fp.bits', qfp]}}}, + {'tanimoto': {'$let': {'vars': {'common': {'$size': {'$setIntersection': ['$fingerprints.morgan_fp.bits', qfp]}}}, 'in': {'$divide': ['$$common', { - '$add': [qfp_count, {'$subtract': ['$morgan_fp.count', '$$common']}]}]} + '$add': [qfp_count, {'$subtract': ['$fingerprints.morgan_fp.count', '$$common']}]}]} } }, - 'smiles': 1 + 'index': 1 } }, {'$match': {'tanimoto': {'$gte': threshold}}} ] - results = mol_collection.aggregate(aggregate) - return [[r['tanimoto'], r['smiles']] for r in results] + response = mol_collection.aggregate(aggregate) + return [[r['tanimoto'], r['index']] for r in response] def get_min_hash(mol, permutations): - qfp = [int(n) for n in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)] + qfp = [int(n) for n in list(AllChem.GetMorganFingerprintAsBitVect(mol, DEFAULT_MORGAN_RADIUS, + nBits=DEFAULT_MORGAN_LEN))] min_hash = [] for perm in permutations: for idx, i in enumerate(perm): @@ -255,7 +283,7 @@ def get_min_hash(mol, permutations): return min_hash -def hash_to_buckets(min_hash, num_buckets=25, nBits=2048): +def hash_to_buckets(min_hash, num_buckets=DEFAULT_BUCKET_N, nBits=DEFAULT_BIT_N): if len(min_hash) % num_buckets: print('Length of min_hash must be divisible by number of buckets.') return @@ -267,7 +295,7 @@ def 
hash_to_buckets(min_hash, num_buckets=25, nBits=2048): return buckets -def get_permutations(len_permutations=2048, num_permutations=100): +def get_permutations(len_permutations=DEFAULT_PERM_LEN, num_permutations=DEFAULT_PERM_N): """Gets NUM_PERMUTATIONS random permutations of numbers of length LEN_PERMUTATIONS each.""" return map(lambda _: np.random.permutation(len_permutations), range(num_permutations)) diff --git a/mongordkit/Search/substructure.py b/mongordkit/Search/substructure.py index 3d1c593..b198442 100644 --- a/mongordkit/Search/substructure.py +++ b/mongordkit/Search/substructure.py @@ -20,7 +20,7 @@ def SubSearchNaive(pattern, mol_collection, chirality=False): for molDoc in mol_collection.find(): rdmol = Chem.Mol(molDoc['rdmol']) if rdmol.HasSubstructMatch(pattern, useChirality=chirality): - results.append(molDoc['smiles']) + results.append(molDoc['index']) return results @@ -33,7 +33,7 @@ def AddPatternFingerprints(mol_collection): mol = Chem.Mol(moldoc['rdmol']) bit_vector = list(PatternFingerprint(mol).GetOnBits()) count = len(bit_vector) - mol_collection.update_one({'_id': moldoc['_id']}, {'$set': {'pattern_fp': {'bits': bit_vector, 'count': count}}}) + mol_collection.update_one({'_id': moldoc['_id']}, {'$set': {'fingerprints.pattern_fp': {'bits': bit_vector, 'count': count}}}) return @@ -49,14 +49,10 @@ def SubSearch(pattern, mol_collection, chirality=False): results = [] query_fp = list(PatternFingerprint(pattern).GetOnBits()) qfp_len = len(query_fp) - for molDoc in mol_collection.find({'pattern_fp.count': {'$gte': qfp_len}, - 'pattern_fp.bits': {'$all': query_fp} + for molDoc in mol_collection.find({'fingerprints.pattern_fp.count': {'$gte': qfp_len}, + 'fingerprints.pattern_fp.bits': {'$all': query_fp} }): rdmol = Chem.Mol(molDoc['rdmol']) if rdmol.HasSubstructMatch(pattern, useChirality=chirality): - results.append(molDoc['smiles']) - return results - - -def SubSearchAggregate(pattern, db, chirality=False): - return \ No newline at end of file + 
results.append(molDoc['index']) + return results \ No newline at end of file diff --git a/mongordkit/Search/tests/test_similarity.py b/mongordkit/Search/tests/test_similarity.py index 07f5f64..52bc7dc 100644 --- a/mongordkit/Search/tests/test_similarity.py +++ b/mongordkit/Search/tests/test_similarity.py @@ -136,6 +136,7 @@ def test_similarity_accuracy_LSH(mongoURI): for t in thresholds: for i in range(200): mol = Chem.Mol(db_python[i]['rdmol']) + smiles = Chem.MolToSmiles(mol) search_python = [result[1] for result in utils.similaritySearchPython(mol, db_python, t)] search_mongo_LSH = [result[1] for result in similarity.SimSearchLSH(mol, db_mongo, db_mongo.molecules, db_mongo.permutations, t)] diff --git a/mongordkit/Search/tests/test_substructure.py b/mongordkit/Search/tests/test_substructure.py index 2b29a7d..22a2425 100644 --- a/mongordkit/Search/tests/test_substructure.py +++ b/mongordkit/Search/tests/test_substructure.py @@ -9,7 +9,7 @@ def test_addPatternFingerprints(): write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf') substructure.AddPatternFingerprints(db.molecules) counter = 0 - assert db.molecules.count_documents({"pattern_fp": {"$exists": True}}) == 200 + assert db.molecules.count_documents({"fingerprints.pattern_fp": {"$exists": True}}) == 200 def test_SubSearchAccuracy(): diff --git a/mongordkit/Search/tests/utils.py b/mongordkit/Search/tests/utils.py index 8521df3..c38936c 100644 --- a/mongordkit/Search/tests/utils.py +++ b/mongordkit/Search/tests/utils.py @@ -25,7 +25,7 @@ def similaritySearchPython(query_mol, molecules, threshold): mfp = list(AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(mol['rdmol']), 2, nBits=2048).GetOnBits()) tanimoto = calc_tanimoto(qfp, mfp) if calc_tanimoto(qfp, mfp) >= threshold: - results.append([tanimoto, mol['smiles']]) + results.append([tanimoto, str(mol['index'])]) return results @@ -38,7 +38,7 @@ def SubSearchPython(pattern, molecules): for moldoc in molecules: mol = Chem.Mol(moldoc['rdmol']) if 
mol.HasSubstructMatch(pattern): - results.append(moldoc['smiles']) + results.append(moldoc['index']) return results From 8ef571426cf44a69e078e02bd38bd2ef899c7d84 Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 18 Aug 2020 14:41:20 -0400 Subject: [PATCH 3/7] Update hashes, update README, polish documentation and benchmarks --- README.md | 41 +- ...ng and Writing to MongoDB-checkpoint.ipynb | 519 +++++++++-------- .../Similarity Testing-checkpoint.ipynb | 244 -------- ...y and Substructure Search-checkpoint.ipynb | 430 ++++++++++---- .../Creating and Writing to MongoDB.ipynb | 519 +++++++++-------- docs/notebooks/Explore LSH.ipynb | 284 ---------- .../notebooks/Exploring Multiprocessing.ipynb | 340 ----------- docs/notebooks/Similarity Benchmarking.ipynb | 428 ++++++++++++++ docs/notebooks/Similarity Testing.ipynb | 532 ------------------ .../Similarity and Substructure Search.ipynb | 496 +++++++--------- .../notebooks/Substructure Benchmarking.ipynb | 211 ++++--- docs/testing.md | 11 +- .../Database/__pycache__/write.cpython-37.pyc | Bin 4377 -> 0 bytes mongordkit/Database/registration.py | 16 +- mongordkit/Database/write.py | 19 +- mongordkit/Search/similarity.py | 6 +- mongordkit/Search/tests/test_similarity.py | 2 +- 17 files changed, 1650 insertions(+), 2448 deletions(-) delete mode 100644 docs/notebooks/.ipynb_checkpoints/Similarity Testing-checkpoint.ipynb delete mode 100644 docs/notebooks/Explore LSH.ipynb delete mode 100644 docs/notebooks/Exploring Multiprocessing.ipynb create mode 100644 docs/notebooks/Similarity Benchmarking.ipynb delete mode 100644 docs/notebooks/Similarity Testing.ipynb delete mode 100644 mongordkit/Database/__pycache__/write.cpython-37.pyc diff --git a/README.md b/README.md index 4471e86..fadbe91 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,18 @@ [![Build 
Status](https://dev.azure.com/cwzou/mongo-rdkit/_apis/build/status/rdkit.mongo-rdkit?branchName=master)](https://dev.azure.com/cwzou/mongo-rdkit/_build/latest?definitionId=1&branchName=master) Mongo-rdkit is an integration between MongoDB, -a NoSQL database platform, and RDKit, a collection of chemoinformatics and machine-learning software. +a NoSQL database platform, and RDKit, a collection of cheminformatics and machine-learning software. This package contains tools to create and manipulate a chemically-intelligent database, as well as methods for high-performance searches on the database that leverage native MongoDB features. Useful links: * [BSD License](https://github.com/rdkit/mongo-rdkit/blob/master/LICENSE) - a business friendly license for open-source. -* [Jupyter Notebooks](https://github.com/rdkit/mongo-rdkit/tree/master/docs) - resources for getting started. +* [Jupyter Notebooks](https://github.com/rdkit/mongo-rdkit/tree/master/docs) - walkthroughs for main functionality. +* [Testing Guide](https://github.com/rdkit/mongo-rdkit/blob/master/docs/testing.md) - walkthrough of running `mongordkit` tests. ## Documentation Jupyter Notebooks and resources for getting started in the [docs](https://github.com/rdkit/mongo-rdkit/tree/master/docs) -folder on GitHub +folder on GitHub. ## Installation As the package is not officially configured with a setup.py file or pushed onto PyPi, these are working install instructions. @@ -43,9 +44,43 @@ echo $PYTHONPATH You can now `import mongordkit` in your Python interpreter or run all tests using the `pytest` command. ### Windows: +Similarly, ensure that `conda` has been added to `PATH`. +Clone the repository into your desired directory and navigate into it. 
+Create a conda environment called mongo_rdkit that includes dependencies: +``` +conda env create --quiet --force --file env.yml +``` +Activate this conda environment: +``` +call activate mongo_rdkit +``` +Check that you are able to import mongordkit: +``` +python -c "import mongordkit" +``` +If this fails, you may need to add the current directory manually to `PYTHONPATH`: +``` +set PYTHONPATH=%PYTHONPATH%;C:. +``` +You can now use `mongordkit` in your interpreter and run tests using `python -m pytest`. +## Package Contents +### Modules +`mongordkit` contains two main modules, each of which contains a variety of importable methods and classes. +`Database` contains functionality for writing and registering data. `Search` contains functionality for setting up and performing +substructure and similarity search. Detailed walkthroughs can be found in the notebooks, listed below. + +### Notebooks +- **Creating and Writing to MongoDB**: documentation and demos for creating and modifying mongo-rdkit databases. +- **Similarity and Substructure Search**: documentation and demos for similarity and substructure search. +- **Similarity Benchmarking**: documentation for reproducing similarity benchmarking. +- **Substructure Benchmarking**: documentation for reproducing substructure benchmarking. +### Configuration +- **azure_pipelines.yml**: CI/CD pipeline configurations. +- **conftest.py**: `pytest` configurations. +- **env.yml**: required dependencies. ## License Code released under the BSD License. 
diff --git a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb index aec730f..c581d97 100644 --- a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,14 +88,14 @@ "## Data Registration\n", "`Database.registration` constructs document representations of molecules according to configurable schemes and handles data registration settings.\n", "\n", - "It does this in two parts. First, it defines the global variables `RDKIT_HASH_FUNCTIONS` and `HASH_FUNCTIONS` as dictionaries that hold map hash function names to methods. It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", + "It does this in two parts. First, it defines the global variable `HASH_FUNCTIONS` as a dictionary that maps hash function names to methods. It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", "\n", "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. 
All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -105,35 +105,34 @@ " 'index': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", " 'smiles': 'Cc1ccccc1',\n", " 'scheme': 'default',\n", - " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'hashes': {'MolFormula': 'C7H8',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'AtomBondCounts': '7,7',\n", + " 'cx_smiles': 'Cc1ccccc1',\n", + " 'NetCharge': '0',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'cx_smiles': 'Cc1ccccc1'},\n", - " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'AnonymousGraph': '**1*****1',\n", + " 'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", " 'Regioisomer': '*C.c1ccccc1',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", " 'MurckoScaffold': 'c1ccccc1',\n", " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", " 'ExtendedMurcko': '*c1ccccc1',\n", - " 'DegreeVector': '0,1,5,1',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'MolFormula': 'C7H8',\n", - " 
'AtomBondCounts': '7,7',\n", - " 'ElementGraph': 'CC1CCCCC1',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'NetCharge': '0',\n", - " 'AnonymousGraph': '**1*****1'},\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", " 'fingerprints': {},\n", " 'value_data': {}}" ] }, - "execution_count": 23, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -153,9 +152,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "removed AnonymousGraph from scheme\n" + ] + }, { "data": { "text/plain": [ @@ -163,34 +169,33 @@ " 'index': 'C7H8',\n", " 'smiles': 'Cc1ccccc1',\n", " 'scheme': 'default',\n", - " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'hashes': {'MolFormula': 'C7H8',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'AtomBondCounts': '7,7',\n", + " 'cx_smiles': 'Cc1ccccc1',\n", + " 'NetCharge': '0',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'cx_smiles': 'Cc1ccccc1'},\n", - " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'Mesomer': 
'C[C]1[CH][CH][CH][CH][CH]1_0',\n", " 'Regioisomer': '*C.c1ccccc1',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", " 'MurckoScaffold': 'c1ccccc1',\n", " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", " 'ExtendedMurcko': '*c1ccccc1',\n", - " 'DegreeVector': '0,1,5,1',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'MolFormula': 'C7H8',\n", - " 'AtomBondCounts': '7,7',\n", - " 'ElementGraph': 'CC1CCCCC1',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'NetCharge': '0'},\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", " 'fingerprints': {},\n", " 'value_data': {}}" ] }, - "execution_count": 24, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -220,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -234,91 +239,91 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit 
WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [22:05:23] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n" + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were 
rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted 
undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged; Omitted undefined stereo\n", + 
"RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n" ] }, { @@ -335,7 +340,7 @@ "200" ] }, - "execution_count": 28, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -356,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -370,97 +375,95 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined 
stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted 
undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges 
were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n" + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted 
undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were 
rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:50] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n" ] }, { @@ -477,7 +480,7 @@ "100" ] }, - "execution_count": 30, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -519,13 +522,29 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'MoleculeHashString': ,\n", + "{'AnonymousGraph': (rdmol, 
f=rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph)>,\n", + " 'ElementGraph': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ElementGraph)>,\n", + " 'CanonicalSmiles': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles)>,\n", + " 'MurckoScaffold': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold)>,\n", + " 'ExtendedMurcko': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko)>,\n", + " 'MolFormula': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MolFormula)>,\n", + " 'AtomBondCounts': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts)>,\n", + " 'DegreeVector': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.DegreeVector)>,\n", + " 'Mesomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Mesomer)>,\n", + " 'HetAtomTautomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer)>,\n", + " 'HetAtomProtomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer)>,\n", + " 'RedoxPair': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.RedoxPair)>,\n", + " 'Regioisomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Regioisomer)>,\n", + " 'NetCharge': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.NetCharge)>,\n", + " 'SmallWorldIndexBR': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR)>,\n", + " 'SmallWorldIndexBRL': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL)>,\n", + " 'ArthorSubstructureOrder': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder)>,\n", " 'inchi_standard': ,\n", " 'inchikey_standard': ,\n", " 'inchi_KET_15T': (rdmol)>,\n", @@ -534,7 +553,7 @@ " 'cx_smiles': }" ] }, - "execution_count": 31, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -543,42 +562,6 @@ "registration.HASH_FUNCTIONS" ] }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'AnonymousGraph': rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph,\n", - " 'ElementGraph': 
rdkit.Chem.rdMolHash.HashFunction.ElementGraph,\n", - " 'CanonicalSmiles': rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles,\n", - " 'MurckoScaffold': rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold,\n", - " 'ExtendedMurcko': rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko,\n", - " 'MolFormula': rdkit.Chem.rdMolHash.HashFunction.MolFormula,\n", - " 'AtomBondCounts': rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts,\n", - " 'DegreeVector': rdkit.Chem.rdMolHash.HashFunction.DegreeVector,\n", - " 'Mesomer': rdkit.Chem.rdMolHash.HashFunction.Mesomer,\n", - " 'HetAtomTautomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer,\n", - " 'HetAtomProtomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer,\n", - " 'RedoxPair': rdkit.Chem.rdMolHash.HashFunction.RedoxPair,\n", - " 'Regioisomer': rdkit.Chem.rdMolHash.HashFunction.Regioisomer,\n", - " 'NetCharge': rdkit.Chem.rdMolHash.HashFunction.NetCharge,\n", - " 'SmallWorldIndexBR': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR,\n", - " 'SmallWorldIndexBRL': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL,\n", - " 'ArthorSubstructureOrder': rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder}" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "registration.RDKIT_HASH_FUNCTIONS" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -591,7 +574,6 @@ "self.author = DEFAULT_AUTHOR\n", "self.pre_processed = DEFAULT_PREPROCESS\n", "self.index_option = DEFAULT_INDEX\n", - "self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys())\n", "self.hashes = set(HASH_FUNCTIONS.keys())\n", "self.fingerprints = {}\n", "self.value_fields = {}\n", @@ -616,10 +598,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*\n", + "mongordkit.Database.write.**WriteFromSDF**(database, sdf, 
scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None, warnings=False (*Make this true to turn on rdkit warnings*) --> *int: number of molecules imported*\n", "\n", "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/notebooks/.ipynb_checkpoints/Similarity Testing-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Similarity Testing-checkpoint.ipynb deleted file mode 100644 index c2a351f..0000000 --- a/docs/notebooks/.ipynb_checkpoints/Similarity Testing-checkpoint.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Similarity Search Benchmarking" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mongordkit\n", - "import time\n", - "import pymongo\n", - "import rdkit\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "from rdkit import Chem\n", - "from statistics import mean\n", - "import mongomock\n", - "from rdkit.Chem import AllChem\n", - "from mongordkit.Database import write\n", - "from mongordkit.Search import similarity" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "inserted chunk...\n", - "inserted chunk...\n", - "200 molecules successfully imported\n" - ] - }, - { - "data": { - "text/plain": [ - "200" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Create a mongomock database instance and write to it. 
\n", - "client = mongomock.MongoClient()\n", - "db = client.db\n", - "\n", - "#Write 200 molecules into the database\n", - "write.writeFromSDF(db, '../../data/test_data/first_200.props.sdf', 'test')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "#Add Morgan fingerprints into the database\n", - "similarity.addMorganFingerprints(db)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[1.0, 'CC1=CC(=O)C=CC1=O']]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Check that similarity search is working, at least for one molecule. \n", - "doc = db.molecules.find_one()\n", - "m = Chem.Mol(doc['rdmol'])\n", - "results = similarity.similaritySearch(m, db, .8)\n", - "results" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "The specified setting does not exist. Will only insert default molecules\n", - "inserted chunk...\n", - "inserted chunk...\n", - "1000 molecules successfully imported\n" - ] - }, - { - "data": { - "text/plain": [ - "1000" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Create a regular mongoDB database instance and write the first 1000 molecules to it. 
\n", - "client = pymongo.MongoClient()\n", - "db = client.db\n", - "db.molecules.drop()\n", - "db.mfp_counts.drop()\n", - "write.writeFromSDF(db, '../../../chembl_27.sdf', 'test', reg_option='inchikey', index_option='inchikey', chunk_size=500, limit=500)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "similarity.addMorganFingerprints(db)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n" - ] - } - ], - "source": [ - "#Run benchmarks for similarity search with and without aggregation parameters. \n", - "thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]\n", - "times = []\n", - "repetitions = 5\n", - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(repetitions):\n", - " start = time.time()\n", - " for m in db.molecules.find():\n", - " mol = Chem.Mol(m['rdmol'])\n", - " similarity.similaritySearchAggregate(mol, db, t)\n", - " print('working')\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZEAAAEGCAYAAACkQqisAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de3iU9Z338fc3CeEgh3BICAIBFOQQVNQUPB9ASWy1WFv7KLbS1qde27rd7rq7rX3arV27vdZu99o+22e33drWrXZFq7Zb7a5yKHiqq2BAwXCOIBAgBAgE5BSSfJ8/5h4cQxKGyUzuOXxe1zXXzPzmvmd+PyX55Hf/7rm/5u6IiIgkIi/sDoiISOZSiIiISMIUIiIikjCFiIiIJEwhIiIiCSsIuwM9bdiwYT527NiwuyEiklFWrFix192L27fnXIiMHTuW6urqsLshIpJRzGxrR+06nCUiIglTiIiISMIUIiIikjCFiIiIJEwhIiIiCVOIiIhIwhQiIiKSMIVIHNydX72xlf9avTPsroiIpJWc+7JhIsyMp6u3Y2bcdMHZYXdHRCRtaCYSp8ryUlZtP0B907GwuyIikjYUInGqLB8OwKK19SH3REQkfShE4jS+ZADnFJ/FojW7w+6KiEjaUIicgcryUt7YvI+mIyfC7oqISFpQiJyByvJSWtqcJes1GxERAYXIGblg5CCGD+zNwjVaFxERAYXIGcnLM2ZPKeXljXs42twadndEREKnEDlDleWlHDvRxiub9oTdFRGR0ClEztCMc4YwqG8vHdISEUEhcsZ65ecxa1IJS9Y10NLaFnZ3RERCpRBJwOzyUpqOnmD5lsawuyIiEiqFSAKuOa+YPr3ydEhLRHKeQiQBfQvzuWpCMYvW7sbdw+6OiEhoFCIJqiwvZVfTMVbXNYXdFRGR0ChEEnT95BLy80yHtEQkpylEElTUr5AZ44awaK0ugSIiuStlIWJmj5hZg5nVxLTdZmZrzKzNzCrabf8NM6s1sw1mVhnTXhW01ZrZ/THt48xsmZltMrNfm1lhqsbSmcryUmob3ufdPe/39EeLiKSFVM5EfglUtWurAW4FXoltNLMpwO1AebDPj80s38zygX8FbgSmAHcE2wJ8H/ihu08A9gN3p2gcnZod1BjRIS0RyVUpCxF3fwVobNe2zt03dLD5HOBJdz/u7luAWmB6cKt1983u3gw8CcwxMwNmAs8E+z8K3JKioXRqxKC+XDhqEAtVY0REclS6rImMBLbHPK8L2jprHwoccPeWdu0dMrN7zKzazKr37EnuNa9mq2yuiOSwdAkR66DNE2jvkLs/7O4V7l5RXFycYBc7prK5IpLL0iVE6oDRMc9HATu7aN8LFJlZQbv2Hhctm6t1ERHJRekSIs8Bt5tZbzMbB0wAlgNvAhOCM7EKiSy+P+eRr4m/CHwq2H8e8GwI/QaiZXMbVTZXRHJOKk/xfQJ4HZhoZnVmdreZfcLM6oDLgP82s4UA7r4GeApYCywA7nX31mDN40+BhcA64KlgW4CvA/eZWS2RNZJfpGosp1NZXkqryuaKSA6yXLv2U0VFhVdXVyf1PdvanMsfWsqFowfx089WnH4HEZEMY2Yr3P2UX3Dpcjgro+XlGTdMGa6yuSKScxQiSaKyuSKSixQiSaKyuSKSixQiSaKyuSKSixQiSaSyuSKSaxQiSaSyuSKSaxQiSdS3MJ+rVTZXRHKIQiTJZqtsrojkEIVIkqlsrojkEoVIkkXL5ipERCQXKERSoLK8lHf3HKa2QWVzRSS7KURSYLZqjIhIjlCIpIDK5opIrlCIpIjK5opILlCIpIjK5opILlCIpIjK5opILlCIpFC0bO6BI81hd0VEJCUUIikULZu7dH1D2F0REUkJhUgKXTByEKUD++iQlohkLYVICuXlGbPLVTZXRLKXQiTFZk9R2VwRyV4KkRRT2VwRyWYKkRSLLZt7QmVzRSTLKER6QLR
s7psqmysiWUYh0gNUNldEslXKQsTMHjGzBjOriWkbYmaLzWxTcD84aDcz+5GZ1ZrZajO7OGafecH2m8xsXkz7JWb2TrDPj8zMUjWW7lLZXBHJVqmcifwSqGrXdj+wxN0nAEuC5wA3AhOC2z3ATyASOsADwAxgOvBANHiCbe6J2a/9Z6WVSpXNFZEslLIQcfdXgPaLAHOAR4PHjwK3xLQ/5hFvAEVmNgKoBBa7e6O77wcWA1XBawPd/XWP/Gn/WMx7paVZKpsrIlmop9dEhrv7LoDgviRoHwlsj9muLmjrqr2ug/a0pbK5IpKN0mVhvaP1DE+gveM3N7vHzKrNrHrPnvC+9KeyuSKSbXo6RHYHh6II7qNXJqwDRsdsNwrYeZr2UR20d8jdH3b3CnevKC4u7vYgEqWyuSKSbXo6RJ4DomdYzQOejWm/KzhL61KgKTjctRCYbWaDgwX12cDC4LVDZnZpcFbWXTHvlbZUNldEsk0qT/F9AngdmGhmdWZ2N/AQcIOZbQJuCJ4DPA9sBmqBnwFfBnD3RuC7wJvB7cGgDeBLwM+Dfd4FXkjVWJJJZXNFJJtYrn1voaKiwqurq0P7/NqGQ1z/T6/w4Jxy7rpsbGj9EBE5E2a2wt0r2reny8J6zlDZXBHJJgqREKhsrohkC4VICKJlc5esU9lcEclsCpEQRMvm6lRfEcl0CpEQqGyuiGQLhUhIKstVNldEMp9CJCTTx6lsrohkPoVISFQ2V0SygUIkRNGyuctVNldEMpRCJETRsrmLdEhLRDKUQiREKpsrIplOIRIylc0VkUymEAmZyuaKSCZTiIRMZXNFJJMpRNKAyuaKSKZSiKSBaNlczUZEJNMoRNJAtGzuorUqmysimSWuEDGzEjP7hJnda2ZfMLPpZqYASiKVzRWRTNRlEJjZdWa2EPhv4EZgBDAF+Bbwjpn9rZkNTH03s19leSmALg8vIhml4DSvfxT4ortva/+CmRUANwE3AL9JQd9yyviS/ifL5qr2uohkii5nIu7+1x0FSPBai7v/zt0VIEmisrkikmniXRP5qpkNtIhfmNlKM5ud6s7lGpXNFZFME+/i+Bfc/SAwGygGPg88lLJe5SiVzRWRTBNviFhw/1Hg3919VUybJInK5opIpok3RFaY2SIiIbLQzAYAqqSUAiqbKyKZJN4QuRu4H/iIux8BCokc0pIkU9lcEckkcYWIu7cBLcDVZnYrcA0wPtEPDRbqa8xsjZn9edA2xMwWm9mm4H5w0G5m9iMzqzWz1WZ2ccz7zAu232Rm8xLtTzpR2VwRySTxnp31CPAI8Eng5uB2UyIfaGZTgS8C04ELgZvMbAKRmc4Sd58ALAmeQ+RLjhOC2z3AT4L3GQI8AMwI3uuBaPBkOpXNFZFMcbovG0Zd6u5TkvSZk4E3gsNimNnLwCeAOcC1wTaPAi8BXw/aH/NI6b83zKzIzEYE2y5298bgfRYDVcATSepnaKJlcxeuqeeK8cPC7o6ISKfiXRN53cySFSI1RA6LDTWzfkQW60cDw919F0BwXxJsPxLYHrN/XdDWWfspzOweM6s2s+o9e9J/wfpk2dw1KpsrIukt3hB5lEiQbAjWJd4xs9WJfKC7rwO+DywGFgCriKy3dKajU4m9i/aOPvNhd69w94ri4uIz7HE4KstLqT+osrkikt7iDZFHgM8SOVwUXQ+5OdEPdfdfuPvF7n410AhsAnYHh6kI7qNf264jMlOJGgXs7KI9K6hsrohkgnhDZJu7P+fuW9x9a/SW6IeaWUlwXwbcSmQd4zkgeobVPODZ4PFzwF3BWVqXAk3B4a6FwGwzGxwsqM8O2rKCyuaKSCaId2F9vZnNB34PHI82uvtvE/zc35jZUOAEcK+77zezh4CnzOxuYBtwW7Dt80TWTWqBIwTfT3H3RjP7LvBmsN2D0UX2bFFZXsoDz62htuF9xpf0D7s7IiKniDdE+hIJj9iLLjqQUIi4+1UdtO0DZnXQ7sC9nbx
P9NTjrDS7fDgPPLeGhWvqGV+S8NdyRERSJq4QcXd9Oz0EJ8vmrqnn3usUIiKSfk5X2fBbwZf6Ont9ppkl9KVDic/s8lJW1TWxq+lo2F0RETnF6RbW3wF+b2ZLzOwHZvY1M/u2mf3KzN4hcobWstR3M3dFy+YuXrs75J6IiJzqdJUNn3X3K4A/AdYA+cBB4D+A6e7+F+6e/t/ey2DjS/pzblA2V0Qk3cS7JrKJyHc5JASzy0t5+JXNHDjSTFG/wrC7IyJyUrzfE5EQqWyuiKQrhUgGiJbN1SEtEUk3CpEMEC2b+8omlc0VkfQSbz2R84IztGqC5xeY2bdS2zWJpbK5IpKO4p2J/Az4BpHLlODuq4HbU9UpOZXK5opIOoo3RPq5+/J2bV1dvl2STGVzRSQdxRsie83sXIJ6HWb2KWBXynolHVLZXBFJN/GGyL3AT4FJZrYD+HPgSynrlXQotmyuiEg6iCtE3H2zu18PFAOT3P1Kd38vpT2TU8SWzW1rU9lcEQlfXN9YN7Mi4C5gLFBgFqlM6+5/lrKeSYcqy0tZtHY37+xo4sLRRWF3R0RyXLz1RJ4H3iByQUat6oYotmyuQkREwhZviPRx9/tS2hOJS1G/Qi49J1I292tVk8LujojkuHgX1n9lZl80sxFmNiR6S2nPpFOzp5Ty7p7D1Da8H3ZXRCTHxRsizcAPgNeBFcGtOlWdkq7NLh8OoLO0RCR08YbIfcB4dx/r7uOC2zmp7Jh0LrZsrohImOINkTXAkVR2RM6MyuaKSDqIN0RagbfN7Kdm9qPoLZUdk66pbK6IpIN4z876XXCTNBFbNveuy8aG3R0RyVHxlsd9NNUdkTNXWV7KT1U2V0RC1OXhLDN7Krh/x8xWt7/1TBelM7NVNldEQna6NZGvBvc3ATd3cEuImf2Fma0xsxoze8LM+pjZODNbZmabzOzXZlYYbNs7eF4bvD425n2+EbRvMLPKRPuTqVQ2V0TC1mWIuHv0cu9fdvetsTfgy4l8oJmNBP4MqHD3qUA+kQJX3wd+6O4TgP3A3cEudwP73X088MNgO8xsSrBfOVAF/NjM8hPpU6ZS2VwRCVu8Z2fd0EHbjd343AKgr5kVAP2I1CaZCTwTvP4ocEvweE7wnOD1WRa5AuQc4El3P+7uW4BaYHo3+pSRVDZXRMJ0ujWRL5nZO8DEdushW4CE1kTcfQfwj8A2IuHRROQb8AfcPVotsQ4YGTweCWwP9m0Jth8a297BPu3HcY+ZVZtZ9Z492fXLVmVzRSRMpzs7az7wAvD3wP0x7YfcPaHyemY2mMgsYhxwAHiajmc10YIZ1slrnbWf2uj+MPAwQEVFRVYV4uiVn8esyR+Uze2VH+/kUkSk+063JtLk7u+5+x3t1kS6U5/1emCLu+9x9xPAb4HLgaLg8BbAKGBn8LgOGA0QvD4IaIxt72CfnDJ7isrmikg4wvizdRtwqZn1C9Y2ZgFrgReBTwXbzAOeDR4/FzwneH2pu3vQfntw9tY4YAKwvIfGkFZUNldEwtLjIeLuy4gskK8kUuQqj8ihpq8D95lZLZE1j18Eu/wCGBq030dwWM3d1wBPEQmgBcC97p6TpyipbK6IhCXey54klbs/ADzQrnkzHZxd5e7HgNs6eZ/vAd9LegczULRs7uodTUxTxUMR6SFahc0S0bK5ujy8iPQkhUiWiC2bKyLSUxQiWaSyXGVzRaRnKUSyyA1TVDZXRHqWQiSLqGyuiPQ0hUiWUdlcEelJCpEso7K5ItKTFCJZJrZsrohIqilEslBleSlvbG7kwJHmsLsiIllOIZKFVDZXRHqKQiQLqWyuiPQUhUgWUtlcEekpCpEsFS2b+/LG7KrkKCLpRSGSpaJlcxet1SEtEUkdhUiWal82V0QkFRQiWayyXGVzRSS1FCJZ7OoJKpsrIqmlEMliKpsrIqmmEMlyleWl1B88xuodTWF3RUSykEIky0XL5uqQloikgkIky0XL5qrGiIikgkI
kB6hsroikikIkB6hsroikikIkB6hsroikikIkR6hsroikQo+HiJlNNLO3Y24HzezPzWyImS02s03B/eBgezOzH5lZrZmtNrOLY95rXrD9JjOb19NjySTRsrmL1qhsrogkT4+HiLtvcPdp7j4NuAQ4AvwncD+wxN0nAEuC5wA3AhOC2z3ATwDMbAjwADADmA48EA0eOVW0bK4uyCgiyRT24axZwLvuvhWYAzwatD8K3BI8ngM85hFvAEVmNgKoBBa7e6O77wcWA1U92/3MorK5IpJsYYfI7cATwePh7r4LILgvCdpHAttj9qkL2jprP4WZ3WNm1WZWvWdP7tbXqFTZXBFJstBCxMwKgY8DT59u0w7avIv2UxvdH3b3CnevKC4uPrOOZpHzVTZXRJIszJnIjcBKd4+u9O4ODlMR3Ef/XK4DRsfsNwrY2UW7dEJlc0Uk2cIMkTv44FAWwHNA9AyrecCzMe13BWdpXQo0BYe7FgKzzWxwsKA+O2iTLqhsrogkUyghYmb9gBuA38Y0PwTcYGabgtceCtqfBzYDtcDPgC8DuHsj8F3gzeD2YNAmXThZNleHtEQkCQrC+FB3PwIMbde2j8jZWu23deDeTt7nEeCRVPQxW7Uvm9srP+xzK0Qkk+k3SA5S2VwRSRaFSA5S2VwRSRaFSA5S2VwRSRaFSI5S2VwRSQaFSI5S2VwRSQaFSI6Kls1ViIhIdyhEclhleSmbVTZXRLpBIZLDVDZXRLpLIZLDVDZXRLpLIZLjVDb3A+8fb+GlDQ00HDwWdldEMkYolz2R9FFZXsoPFm5g0ZrdzLt8bNjd6XH7Dzfzh3W7WVBTz6u1e2luaaMgz7hhynDmzijjinOHkZfXUdUBEQGFSM6Lls1duKY+Z0Kk4eAxFq7dzcKael7fvI/WNmdkUV8+M2MMV00Yxuub9/F09XZeqKmnbEg/7phexm0VoxjWv3fYXRdJOxa5vmHuqKio8Orq6rC7kVb+YcF6fvrKZlZ863qK+hWG3Z2U2N54hIVr6llQU8+Kbftxh3OGnUXV1FKqppZy/shBmH0w4zje0sqCmnoeX7aN5Vsa6ZVvzC4v5c7pZVx27tAPbSuSC8xshbtXtG/XTESoLC/lxy+9y5J1DXzyklFhdydpahveZ0HNLhasqadmx0EAJo8YyF9cfx5VU0uZUNK/0zDoXZDPnGkjmTNtJLUNh5i/bDu/WVnHf6/exbhhZzF3ehmfvGQUQ87KztCVzHWitY2jJ1o52hzcTkRux5pbU/IHkGYigrtz2d8v5YJRg3j4rlP+0MgY7s6anQdZUFPPgjX1J7//clFZEVXlkRnHmKFnJfz+x0608vw7u5i/bBvVW/dTmJ/HjeeXMnd6GdPHDdHsRE6rq1/wR2KfnwieN0ceHz3xwevHmj94Hn0t9v1aurge3vrvVtGnV35CfddMRDplFimb+1T1do42t9K3MLF/ZGFoa3Pe2r7/ZHBsbzxKnsGMcUP57KVjqCwvpXRQn6R8Vp9e+dx68ShuvXgUG+oP8cTybfxmZR3Pvr2T8SX9uWN6GZ+8eGTWHhLMdmH/gu9M74I8+hXm07dXPn2C+36F+QzoU0DJgN70DdpO3kcfxzzvU5hPv175FKTgJBHNRASA12r3cufPl/Fvn7mEqqmlYXenSy2tbSzb0siCmnoWrqmn4dBxeuUbV44fRtXUUq6fPJyhPbQIfrS5ld+v3sn8Zdt4e/sBehfk8bHzRzB3RhmXjBms2UkaOXaildc37+PF9Q1Uv7efw80tKfsF3yfml/mHnnfxCz7a1id4j76F+fQpyE+bswM1E5EuxZbNTccQOd7Syh837WVBTT2L1+3mwJET9OmVx7XnlXDj+aVcN6mEgX169Xi/+hbm8+mK0Xy6YjRrdx5k/vKt/O6tnfz2rR1MHD6AuTPKuOWikQzq2/N9E6hvOsbS9Q0sXd/Aa7V7OXqilb698qkYO5gJZ/XPml/wYdJMRE6676m3+cPa3az
4mxvSomzu4eMtvLRhDwvW1PPi+gbeP97CgN4FzJpcQtXUEVxzXnFaHno7fLyF36/ayfzl21hd10SfXnncfMHZzJ1RxrTRRZqdpFBbm7Oq7gBL1zewZF0Da3dFTqgYWdSXWZNLmDmphEvPGZrwukAu00xETquyvJTfrtzB8i2NXDF+WCh9aDpygiXrd/NCTT2vbNzD8ZY2hpxVyE0XjKBqaimXnzuMwoLwA64rZ/Uu4PbpZdw+vYyaHU08vmwbz769g6dX1DF5xMDI7GTa2QwIYeaUjQ4eO8GrG/eydH0DL21oYN/hZvIMKsYM4etVk5g1uaTLM/GkezQTkZOONrdy0XcX8emK0Tw4Z2qPfe6eQ8dZvHY3L9Ts4vV399HS5pQO7EPV1FIqy0v5yNjBFKTBzKg7Dh07wbNvR9ZO1u46SL/CfOZMO5u508dw/qhBYXcv42ze8/7Jw1TLtzTS0uYM6tuLaycWM3NSCdecV6wTHJKss5mIQkQ+5J7Hqlld18T/3D8zpcd7dxw4ysLgjKo332vEHcYM7Rf58l95KReOKsrK483uzqq6JuYv28pzq3Zy7EQb548cxNwZZXz8wrM5q7cODnSkuaWNN99rZMm6Bl7c0MCWvYcBmDh8ANdNKmHW5BIuGl2U8X9spDOFSEAh0rXfrKjjL59exe/uvYJpo4uS+t5b9h7mhZpdLKypZ1VdpCzvxOEDTn5rfFLpgJw65HDw2Al+99YO5i/bxvr6Q/TvXRCZncwoo/xszU72HDrOSxsis41XN+3l/eMtFBbkcfm5Q5k5qYTrJpYweki/sLuZM7QmInGJLZvb3RBxd9bXH+KFmnoW1tSzYfchAC4cNYivVU2kqryUc4r7J6PbGWlgn17cddlYPnvpGFZu28/jy7bxzIo6Hl+2jQtHF3Hn9DJuunAE/Qpz48c0+mXRpesbWLK+gVXbDwAwfGBvbr7wbGZOKuGK8UNz5r9HptBMRE5x58/fYFfTMZb+5bVnvG/07Jjol/+27juCGXxk7BCqykupnFrKyKK+ye90lmg6coLfrKxj/vJt1Da8z4A+Bdx60UjmzhjDxNIBYXcv6Y40t/DHTZFF8Rc3NLD74HHM4MJRRcyaVMLMySVMGTEwp2ao6UqHswIKkdN77PX3+Paza/jDfdcwvuT0M4WW1jbefG//yQsc1h88RkGecfn4YVSVl3LDlOEUD9AVcM+Eu/Pme/uZv2wrz9fU09zSxiVjBjN3ehkfu2BERp+iur3xyMnZxhub99Hc0saA3gVcdd4wZk4azrUTi3XF5DSUViFiZkXAz4GpgANfADYAvwbGAu8Bn3b3/Rb5E+SfgY8CR4DPufvK4H3mAd8K3vbv3P3R0322QuT06puOcenfL+GvKydy73XjO9ymuaWN197dy8Kaehat3U3j4WZ6F+RxzXnFVE0tZdak4Qzqp1NYk2H/4ebI7GTZNjbvPcygvr249eKR3DmjjPEl6T87aWltY8XW/Szd0MDSdQ1sCq5pds6ws5g5KfLdjYqxQ9L+1O1cl24h8ijwqrv/3MwKgX7A/wEa3f0hM7sfGOzuXzezjwJfIRIiM4B/dvcZZjYEqAYqiATRCuASd9/f1WcrROIz519fA3ee/dMrT7YdbW7l5Y0NLKipZ8m6Bg4db+GswnxmTh7OjVNLuea8Yp1dlELuzuub9zF/2TYWrqnnRKszfdwQ7pxRRtXUUnoXpM/sZP/hZl7euIel6xt4eeMemo6eoFe+MX3cEGZOGs7MSSWMG5b4xTCl56XNwrqZDQSuBj4H4O7NQLOZzQGuDTZ7FHgJ+DowB3jMI2n3hpkVmdmIYNvF7t4YvO9ioAp4oqfGks1mTxnODxZuYOPuQ6wNroz70sYGjp1oo6hfL6qmlnLj+ZEv/2XyoZVMYmZcfu4wLj93GHvfP84zK+p4Yvk2vvrk2wzu14tPXTKKO6aXhXKygruzcff7LFm/m6XrGli5bT9tDsP6F3LDlOHMmlTClRO
G6QuWWajHZyJmNg14GFgLXEhkBvFVYIe7F8Vst9/dB5vZfwEPufsfg/YlRMLlWqCPu/9d0P43wFF3/8cOPvMe4B6AsrKyS7Zu3ZrCEWaH2ob3uf6fXj75vGRAbyrLS7lxainTxw3R+fhpoq3N+Z939zF/+VYWrdlNS5tz2TlDmTujjMry0pQeIope0HDpushpuDsOHAVg6siBzJxYwszJw7lg5KCs/L5PLkqbmUjwmRcDX3H3ZWb2z8D9XWzf0b9A76L91Eb3h4kEFxUVFbl1JkGCxpf054tXjcMdbjy/lItGD9YvgzSUl2dcOWEYV04YRsOhYzxdHZmdfOWJtxh6ViG3VYzmjumju1VHJdaupqORM6nWN/DH2r0cO9FG3175XDlhGF+ZOZ7rJpUwfGByLr0vmSGMEKkD6tx9WfD8GSIhstvMRrj7ruBwVUPM9qNj9h8F7Azar23X/lIK+51zvvmxKWF3Qc5AyYA+3HvdeL50zbm8smkP85dt42evbubfXn6XqyYMY+70Mq6fMvyMLq7ZGr2gYTDbiF7QcNTgvvyvitHMnDycGeOG6JBmDgtrYf1V4H+7+wYz+w4Q/TNpX8zC+hB3/5qZfQz4Uz5YWP+Ru08PFtZXEJnVAKwksrDe2NVna2Fdckl90zGeqt7Ok8u3sbPpGMUDevPpilHc/pGyTr/tHb2g4ZL1u3lpwx4aDzeTn2dcMmYwMyeVMGtSCeN1QcOck25nZ00jcopvIbAZ+DyQBzwFlAHbgNvcvTE4xfdfiCyaHwE+7+7Vwft8gchZXQDfc/d/P91nK0QkF7W2OS9vbGD+sm0sXd+AA1dPKGbujDJmTSpha+MRXgwun/7me5ELGhb168W15xVznS5oKKRZiIRJISK5bueBozz55nZ+/eY2dh88Tt9e+Rw90QpErmU2c3JktjFNFzSUGAqRgEJEJKKlte3k5dTLzx7IdZNKGDVYFzSUjqXT2VkikgYK8vOYXV7K7PL0K4csmUNzVRERSZhCREREEqYQERGRhClEREQkYQoRERFJmEJEREQSphAREZGEKURERCRhOfeNdTPbAyRaUGQYsDeJ3ckEGnNuyLUx59p4oftjHuPuxe0bcy5EurkUHMMAAAZNSURBVMPMqjv62n8205hzQ66NOdfGC6kbsw5niYhIwhQiIiKSMIXImXk47A6EQGPODbk25lwbL6RozFoTERGRhGkmIiIiCVOIiIhIwhQigJlVmdkGM6s1s/s7eP2HZvZ2cNtoZgdiXptnZpuC27ye7Xniujnm1pjXnuvZnicujjGXmdmLZvaWma02s4/GvPaNYL8NZlbZsz1PXKJjNrOxZnY05v/zv/V87xMTx5jHmNmSYLwvmdmomNey9ee5qzF37+fZ3XP6BuQD7wLnAIXAKmBKF9t/BXgkeDwE2BzcDw4eDw57TKkcc/D8/bDHkIoxE1l4/FLweArwXszjVUBvYFzwPvlhjynFYx4L1IQ9hhSN+WlgXvB4JvCr4HHW/jx3Nubgebd+njUTgelArbtvdvdm4ElgThfb3wE8ETyuBBa7e6O77wcWA1Up7W1ydGfMmSqeMTswMHg8CNgZPJ4DPOnux919C1AbvF+6686YM1U8Y54CLAkevxjzejb/PHc25m5TiMBIYHvM87qg7RRmNobIX6JLz3TfNNOdMQP0MbNqM3vDzG5JXTeTKp4xfwf4jJnVAc8TmYHFu2866s6YAcYFh7leNrOrUtrT5IlnzKuATwaPPwEMMLOhce6bjrozZujmz7NCBKyDts7Oe74deMbdWxPYN510Z8wAZR65fMJc4P+a2bnJ7mAKxDPmO4Bfuvso4KPAr8wsL85901F3xryLyP/ni4D7gPlmNpD0F8+Y/wq4xszeAq4BdgAtce6bjrozZujmz7NCJJLao2Oej6LzKf3tfPiwzpnsm066M2bcfWdwvxl4Cbgo+V1MunjGfDfwFIC7vw70IXLRumz+/9zhmINDd/uC9hVEjrm
fl/Ied99px+zuO9391iAgvxm0NcWzb5rqzpi7//Mc9qJQ2DeggMgC2jg+WJQq72C7icB7BF/QDNqGAFuILMINDh4PCXtMKR7zYKB38HgYsIkuFuXT5RbPmIEXgM8FjycHP4gGlPPhhfXNZMbCenfGXBwdI5EF2x3Z8m87+HebFzz+HvBg8Dhrf567GHO3f55D/w+QDjci0/iNRP7a+mbQ9iDw8ZhtvgM81MG+XyCy0FoLfD7ssaR6zMDlwDvBP9R3gLvDHkuyxkxk8fG1YGxvA7Nj9v1msN8G4Mawx5LqMRM5fr4maF8J3Bz2WJI45k8Fvyw3Aj+P/hINXsvKn+fOxpyMn2dd9kRERBKmNREREUmYQkRERBKmEBERkYQpREREJGEKERERSZhCROQ0zKzIzL4cPL7WzP4rBZ/xOTP7lzPc5z0zG9ZB+3fM7K+S1zuRzilERE6vCPjymexgZvkp6otIWlGIiJzeQ8C5ZvY28AOgv5k9Y2brzexxMzM4OTP4tpn9EbjNzM41swVmtsLMXjWzScF2t5lZjZmtMrNXYj7n7GD7TWb2D9FGM7vDzN4J9vl+Rx00s28G9ST+QORKA9H2PzOztUEdiSeT/59Gcl1B2B0QyQD3A1PdfZqZXQs8S+RSKDuJfNv7CuCPwbbH3P1KADNbAvyJu28ysxnAj4nUcvg2UOnuO8ysKOZzphG5btFxYIOZ/T+gFfg+cAmwH1hkZre4+++iO5nZJUSucXYRkZ/plcCKmL6Pc/fj7T5LJCk0ExE5c8vdvc7d24hcKmRszGu/BjCz/kQuKfF0MIP5KTAi2OY14Jdm9kUiBYWilrh7k7sfA9YCY4CPAC+5+x53bwEeB65u15+rgP909yPufhCIrU63GnjczD7DB1dtFUkazUREztzxmMetfPjn6HBwnwcccPdp7Xd29z8JZiYfA942s+g2Hb1vR5f57khn1y/6GJHQ+TjwN2ZWHoSRSFJoJiJyeoeAAWeyQzAj2GJmtwFYxIXB43PdfZm7fxvYy4cv493eMiJ1IIYFi/V3AC+32+YV4BNm1tfMBgA3B5+TB4x29xeBrxE5QaD/mYxD5HQ0ExE5DXffZ2avmVkNcBTYHeeudwI/MbNvAb2IlC1dBfzAzCYQmWUsCdpOmbEEn73LzL5BpKSpAc+7+7PttllpZr8mcmhtK/Bq8FI+8B9mNijY94fufiDecYvEQ1fxFRGRhOlwloiIJEwhIiIiCVOIiIhIwhQiIiKSMIWIiIgkTCEiIiIJU4iIiEjC/j9IssaYfBN36gAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "x_list = [v[0] for v in times]\n", - "y_list = [v[1]*1000 for v in times]\n", - "plt.xlabel('thresholds')\n", - "plt.ylabel('time (ms)')\n", - "plt.plot(x_list, y_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb index 34d092f..82c3f12 100644 --- a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb @@ -6,7 +6,7 @@ "source": [ "# Similarity and Substructure Search\n", "\n", - "Last updated: 7/27/20\n", + "Last updated: 8/11/20\n", "\n", "Methods for similarity and substructure search are included in the `mongordkit.Search` module." 
] @@ -63,104 +63,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "populating mongodb collection with compounds from SDF...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit 
WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit 
WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: 
[22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "populating mongodb collection with compounds from SDF...\n", "200 molecules successfully imported\n", "0 duplicates skipped\n", "Preparing database and collections for search...\n", @@ -182,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -211,11 +114,219 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted 
undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: 
Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + 
"RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined 
stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 molecules successfully imported\n", + "0 duplicates skipped\n" + ] + }, + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" ] @@ -232,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -241,9 +352,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'bits': [84,\n", + " 314,\n", + " 356,\n", + " 547,\n", + " 650,\n", + " 747,\n", + " 967,\n", + " 1057,\n", + " 1080,\n", + " 1154,\n", + " 1337,\n", + " 1380,\n", + " 1722,\n", + " 1768,\n", + " 1873,\n", + " 1877],\n", + " 'count': 16}" + ] + }, + "execution_count": 7, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "demo_db.molecules.find_one()['fingerprints']['morgan_fp']" ] @@ -257,17 +395,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "similaritySearch: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n", + "\n", + "\n", + "similaritySearchAggregate: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n" + ] + } + ], "source": [ "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", "# Perform a similarity search on TestDB for q_mol with a Tanimoto threshold of 0.4. \n", - "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", + "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", "\n", "# Do the same thing, but use the MongoDB Aggregation Pipeline. \n", - "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", + "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", "\n", "print('similaritySearch: {}'.format(results1))\n", "print('\\n')\n", @@ -285,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -309,13 +458,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "similaritySearchLSH: []\n" + ] + } + ], "source": [ "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", - "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, demo_db.permutations, threshold=0.8)\n", + "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, \n", + " demo_db.permutations, demo_db.mfp_counts, threshold=0.8)\n", "\n", "print('similaritySearchLSH: {}'.format(results3))" ] @@ 
-340,9 +498,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", + " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", + " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", + " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", + " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", + " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", + " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", + " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "q_mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')\n", "\n", @@ -359,9 +535,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", + " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", + " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", + " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", + " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", + " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", + " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", + " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "substructure.AddPatternFingerprints(demo_db.molecules)\n", "substructure.SubSearch(q_mol, demo_db.molecules, chirality=False)" @@ -410,7 +604,7 @@ "\n", "mongordkit.Search.similarity.**AddHashCollections**(db (*MongoDB database*), mol_collection (*MongoDB collection*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" + "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB 
collection*), perm_collection (*MongoDB collection*), count_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" ] }, { diff --git a/docs/notebooks/Creating and Writing to MongoDB.ipynb b/docs/notebooks/Creating and Writing to MongoDB.ipynb index aec730f..c581d97 100644 --- a/docs/notebooks/Creating and Writing to MongoDB.ipynb +++ b/docs/notebooks/Creating and Writing to MongoDB.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,14 +88,14 @@ "## Data Registration\n", "`Database.registration` constructs document representations of molecules according to configurable schemes and handles data registration settings.\n", "\n", - "It does this in two parts. First, it defines the global variables `RDKIT_HASH_FUNCTIONS` and `HASH_FUNCTIONS` as dictionaries that hold map hash function names to methods. It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", + "It does this in two parts. First, it defines the global variable `HASH_FUNCTIONS` as a dictionary that maps hash function names to methods. It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", "\n", "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. 
By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -105,35 +105,34 @@ " 'index': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", " 'smiles': 'Cc1ccccc1',\n", " 'scheme': 'default',\n", - " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'hashes': {'MolFormula': 'C7H8',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'AtomBondCounts': '7,7',\n", + " 'cx_smiles': 'Cc1ccccc1',\n", + " 'NetCharge': '0',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'cx_smiles': 'Cc1ccccc1'},\n", - " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", + " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'AnonymousGraph': '**1*****1',\n", + " 'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", " 'Regioisomer': '*C.c1ccccc1',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", " 'MurckoScaffold': 'c1ccccc1',\n", " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", " 'ExtendedMurcko': '*c1ccccc1',\n", 
- " 'DegreeVector': '0,1,5,1',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'MolFormula': 'C7H8',\n", - " 'AtomBondCounts': '7,7',\n", - " 'ElementGraph': 'CC1CCCCC1',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'NetCharge': '0',\n", - " 'AnonymousGraph': '**1*****1'},\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", " 'fingerprints': {},\n", " 'value_data': {}}" ] }, - "execution_count": 23, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -153,9 +152,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "removed AnonymousGraph from scheme\n" + ] + }, { "data": { "text/plain": [ @@ -163,34 +169,33 @@ " 'index': 'C7H8',\n", " 'smiles': 'Cc1ccccc1',\n", " 'scheme': 'default',\n", - " 'hashes': {'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'MoleculeHashString': '100-7-7-SaZjmQ-zcSDYw-aXeP/g-122pug-haQS5A-qxXe4Q',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'hashes': {'MolFormula': 'C7H8',\n", + " 'SmallWorldIndexBRL': 'B7R1L5',\n", + " 'AtomBondCounts': '7,7',\n", + " 'cx_smiles': 'Cc1ccccc1',\n", + " 'NetCharge': '0',\n", + " 'CanonicalSmiles': 'Cc1ccccc1',\n", " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'cx_smiles': 'Cc1ccccc1'},\n", - " 'rdkit_hashes': {'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", + " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", + " 'SmallWorldIndexBR': 'B7R1',\n", + " 'DegreeVector': '0,1,5,1',\n", + " 'ElementGraph': 'CC1CCCCC1',\n", + " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", 
+ " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", + " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", + " 'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", " 'Regioisomer': '*C.c1ccccc1',\n", + " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", " 'MurckoScaffold': 'c1ccccc1',\n", " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", + " 'noiso_smiles': 'Cc1ccccc1',\n", " 'ExtendedMurcko': '*c1ccccc1',\n", - " 'DegreeVector': '0,1,5,1',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'MolFormula': 'C7H8',\n", - " 'AtomBondCounts': '7,7',\n", - " 'ElementGraph': 'CC1CCCCC1',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'NetCharge': '0'},\n", + " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", " 'fingerprints': {},\n", " 'value_data': {}}" ] }, - "execution_count": 24, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -220,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -234,91 +239,91 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:03:51] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:51] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit 
WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:03:52] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was 
disconnected\n", - "RDKit WARNING: [22:05:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: 
[22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n" + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges 
were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: 
Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges 
were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n" ] }, { @@ -335,7 +340,7 @@ "200" ] }, - "execution_count": 28, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -356,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -370,97 +375,95 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Charges were 
rearranged\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: 
Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:05:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] 
WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:20] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:12:21] WARNING: Charges were rearranged\n" + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted 
undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: 
Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were 
rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:39:50] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:50] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n" ] }, { @@ -477,7 +480,7 @@ "100" ] }, - "execution_count": 30, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -519,13 +522,29 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 7, "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ - "{'MoleculeHashString': ,\n", + "{'AnonymousGraph': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph)>,\n", + " 'ElementGraph': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ElementGraph)>,\n", + " 'CanonicalSmiles': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles)>,\n", + " 'MurckoScaffold': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold)>,\n", + " 'ExtendedMurcko': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko)>,\n", + " 'MolFormula': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MolFormula)>,\n", + " 'AtomBondCounts': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts)>,\n", + " 'DegreeVector': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.DegreeVector)>,\n", + " 'Mesomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Mesomer)>,\n", + " 'HetAtomTautomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer)>,\n", + " 'HetAtomProtomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer)>,\n", + " 'RedoxPair': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.RedoxPair)>,\n", + " 'Regioisomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Regioisomer)>,\n", + " 'NetCharge': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.NetCharge)>,\n", + " 'SmallWorldIndexBR': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR)>,\n", + " 'SmallWorldIndexBRL': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL)>,\n", + " 'ArthorSubstructureOrder': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder)>,\n", " 'inchi_standard': ,\n", " 'inchikey_standard': ,\n", " 'inchi_KET_15T': (rdmol)>,\n", @@ -534,7 +553,7 @@ " 'cx_smiles': }" ] }, - "execution_count": 31, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -543,42 +562,6 @@ "registration.HASH_FUNCTIONS" ] }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'AnonymousGraph': 
rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph,\n", - " 'ElementGraph': rdkit.Chem.rdMolHash.HashFunction.ElementGraph,\n", - " 'CanonicalSmiles': rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles,\n", - " 'MurckoScaffold': rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold,\n", - " 'ExtendedMurcko': rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko,\n", - " 'MolFormula': rdkit.Chem.rdMolHash.HashFunction.MolFormula,\n", - " 'AtomBondCounts': rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts,\n", - " 'DegreeVector': rdkit.Chem.rdMolHash.HashFunction.DegreeVector,\n", - " 'Mesomer': rdkit.Chem.rdMolHash.HashFunction.Mesomer,\n", - " 'HetAtomTautomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer,\n", - " 'HetAtomProtomer': rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer,\n", - " 'RedoxPair': rdkit.Chem.rdMolHash.HashFunction.RedoxPair,\n", - " 'Regioisomer': rdkit.Chem.rdMolHash.HashFunction.Regioisomer,\n", - " 'NetCharge': rdkit.Chem.rdMolHash.HashFunction.NetCharge,\n", - " 'SmallWorldIndexBR': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR,\n", - " 'SmallWorldIndexBRL': rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL,\n", - " 'ArthorSubstructureOrder': rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder}" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "registration.RDKIT_HASH_FUNCTIONS" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -591,7 +574,6 @@ "self.author = DEFAULT_AUTHOR\n", "self.pre_processed = DEFAULT_PREPROCESS\n", "self.index_option = DEFAULT_INDEX\n", - "self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys())\n", "self.hashes = set(HASH_FUNCTIONS.keys())\n", "self.fingerprints = {}\n", "self.value_fields = {}\n", @@ -616,10 +598,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*\n", 
+ "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None, warnings=False) (*set `warnings` to True to turn on RDKit warnings*) --> *int: number of molecules imported*\n", "\n", "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/notebooks/Explore LSH.ipynb b/docs/notebooks/Explore LSH.ipynb deleted file mode 100644 index ed142b5..0000000 --- a/docs/notebooks/Explore LSH.ipynb +++ /dev/null @@ -1,284 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring Locality Sensitive Hashing" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "\n", - "import numpy as np\n", - "from rdkit import Chem\n", - "from rdkit.Chem import AllChem\n", - "import sys\n", - "import functools\n", - "import mongomock" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Permutation function\n", - "\n", - "def get_permutations(len_permutations=2048, num_permutations=100):\n", - " return map(lambda _: np.random.permutation(2048), range(num_permutations))" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "permutations = get_permutations()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "def get_min_hash(mol, permutations):\n", - " qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))\n", - " min_hash = []\n", - " for perm in permutations:\n", - " for idx, i in enumerate(perm):\n", - " if qfp_bits[i]:\n", - " min_hash.append(idx)\n", - " 
break \n", - " return min_hash" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')\n", - "min_hash = get_min_hash(mol, permutations)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "def hash_to_buckets(min_hash, num_buckets=25, nBits=2048):\n", - " if len(min_hash) % num_buckets:\n", - " raise Exception('number of buckets must be divisiable by the hash length')\n", - " buckets = []\n", - " hash_per_bucket = int(len(min_hash) / num_buckets)\n", - " num_bits = (nBits-1).bit_length()\n", - "# if num_bits * hash_per_bucket > sys.maxint.bit_length():\n", - "# raise Exception('numbers are too large to produce valid buckets')\n", - " for b in range(num_buckets):\n", - " buckets.append(functools.reduce(lambda x,y: (x << num_bits) + y, min_hash[b:(b + hash_per_bucket)]))\n", - " return buckets" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[250056202389,\n", - " 1941707205052,\n", - " 782309908621,\n", - " 1281762813978,\n", - " 3814522409145,\n", - " 1211290208280,\n", - " 224114294943,\n", - " 1589238888575,\n", - " 206825584784,\n", - " 1366332571687,\n", - " 1091525753125,\n", - " 1237114759205,\n", - " 336236456125,\n", - " 2517006411838,\n", - " 318620430363,\n", - " 1623757740190,\n", - " 532689514536,\n", - " 232591015954,\n", - " 1357377474657,\n", - " 343673079808,\n", - " 155025670508,\n", - " 833224401069,\n", - " 1527081060,\n", - " 3127462010894,\n", - " 1486478143488]" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hash_to_buckets(min_hash)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "client = mongomock.MongoClient()" - ] - }, - { - "cell_type": "code", - "execution_count": 
58, - "metadata": {}, - "outputs": [], - "source": [ - "db = client.db" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.list_collection_names()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.molecules.insert_one({'_id': 1, 'molecule': 'boom'})" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['molecules']" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.list_collection_names()" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.molecules.find()" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_id': 1, 'molecule': 'boom'}" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.molecules.find_one()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/Exploring Multiprocessing.ipynb b/docs/notebooks/Exploring Multiprocessing.ipynb deleted file mode 100644 index 300afb8..0000000 --- a/docs/notebooks/Exploring Multiprocessing.ipynb +++ /dev/null @@ -1,340 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import pymongo\n", - "import rdkit\n", - "import math\n", - "from rdkit import Chem\n", - "from rdkit.Chem import AllChem\n", - "from rdkit.Chem.rdmolops import PatternFingerprint\n", - "from mongordkit.Database import write\n", - "from mongordkit import Search" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "client = pymongo.MongoClient()\n", - "db = client['multip']\n", - "molecules = db.molecules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "write.writeFromSDF(db.molecules, '../../data/test_data/first_200.props.sdf', 'test')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[ObjectId('5f22bce9ad3e9621e4119c64'),\n", - " ObjectId('5f22bce9ad3e9621e4119c65'),\n", - " ObjectId('5f22bce9ad3e9621e4119c66'),\n", - " ObjectId('5f22bce9ad3e9621e4119c67'),\n", - " ObjectId('5f22bce9ad3e9621e4119c68'),\n", - " ObjectId('5f22bce9ad3e9621e4119c69'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6a'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6b'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6c'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6d'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6e'),\n", - " ObjectId('5f22bce9ad3e9621e4119c6f'),\n", - " ObjectId('5f22bce9ad3e9621e4119c70'),\n", - " ObjectId('5f22bce9ad3e9621e4119c71'),\n", - " ObjectId('5f22bce9ad3e9621e4119c72'),\n", - " ObjectId('5f22bce9ad3e9621e4119c73'),\n", - " ObjectId('5f22bce9ad3e9621e4119c74'),\n", 
- " ObjectId('5f22bce9ad3e9621e4119c75'),\n", - " ObjectId('5f22bce9ad3e9621e4119c76'),\n", - " ObjectId('5f22bce9ad3e9621e4119c77'),\n", - " ObjectId('5f22bce9ad3e9621e4119c78'),\n", - " ObjectId('5f22bce9ad3e9621e4119c79'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7a'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7b'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7c'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7d'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7e'),\n", - " ObjectId('5f22bce9ad3e9621e4119c7f'),\n", - " ObjectId('5f22bce9ad3e9621e4119c80'),\n", - " ObjectId('5f22bce9ad3e9621e4119c81'),\n", - " ObjectId('5f22bce9ad3e9621e4119c82'),\n", - " ObjectId('5f22bce9ad3e9621e4119c83'),\n", - " ObjectId('5f22bce9ad3e9621e4119c84'),\n", - " ObjectId('5f22bce9ad3e9621e4119c85'),\n", - " ObjectId('5f22bce9ad3e9621e4119c86'),\n", - " ObjectId('5f22bce9ad3e9621e4119c87'),\n", - " ObjectId('5f22bce9ad3e9621e4119c88'),\n", - " ObjectId('5f22bce9ad3e9621e4119c89'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8a'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8b'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8c'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8d'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8e'),\n", - " ObjectId('5f22bce9ad3e9621e4119c8f'),\n", - " ObjectId('5f22bce9ad3e9621e4119c90'),\n", - " ObjectId('5f22bce9ad3e9621e4119c91'),\n", - " ObjectId('5f22bce9ad3e9621e4119c92'),\n", - " ObjectId('5f22bce9ad3e9621e4119c93'),\n", - " ObjectId('5f22bce9ad3e9621e4119c94'),\n", - " ObjectId('5f22bce9ad3e9621e4119c95'),\n", - " ObjectId('5f22bce9ad3e9621e4119c96'),\n", - " ObjectId('5f22bce9ad3e9621e4119c97'),\n", - " ObjectId('5f22bce9ad3e9621e4119c98'),\n", - " ObjectId('5f22bce9ad3e9621e4119c99'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9a'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9b'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9c'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9d'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9e'),\n", - " ObjectId('5f22bce9ad3e9621e4119c9f'),\n", - " 
ObjectId('5f22bce9ad3e9621e4119ca0'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca1'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca2'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca3'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca4'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca5'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca6'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca7'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca8'),\n", - " ObjectId('5f22bce9ad3e9621e4119ca9'),\n", - " ObjectId('5f22bce9ad3e9621e4119caa'),\n", - " ObjectId('5f22bce9ad3e9621e4119cab'),\n", - " ObjectId('5f22bce9ad3e9621e4119cac'),\n", - " ObjectId('5f22bce9ad3e9621e4119cad'),\n", - " ObjectId('5f22bce9ad3e9621e4119cae'),\n", - " ObjectId('5f22bce9ad3e9621e4119caf'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb0'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb1'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb2'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb3'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb4'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb5'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb6'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb7'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb8'),\n", - " ObjectId('5f22bce9ad3e9621e4119cb9'),\n", - " ObjectId('5f22bce9ad3e9621e4119cba'),\n", - " ObjectId('5f22bce9ad3e9621e4119cbb'),\n", - " ObjectId('5f22bce9ad3e9621e4119cbc'),\n", - " ObjectId('5f22bce9ad3e9621e4119cbd'),\n", - " ObjectId('5f22bce9ad3e9621e4119cbe'),\n", - " ObjectId('5f22bce9ad3e9621e4119cbf'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc0'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc1'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc2'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc3'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc4'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc5'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc6'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc7'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc8'),\n", - " ObjectId('5f22bce9ad3e9621e4119cc9'),\n", - " ObjectId('5f22bce9ad3e9621e4119cca'),\n", - " 
ObjectId('5f22bce9ad3e9621e4119ccb'),\n", - " ObjectId('5f22bce9ad3e9621e4119ccc'),\n", - " ObjectId('5f22bce9ad3e9621e4119ccd'),\n", - " ObjectId('5f22bce9ad3e9621e4119cce'),\n", - " ObjectId('5f22bce9ad3e9621e4119ccf'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd0'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd1'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd2'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd3'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd4'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd5'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd6'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd7'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd8'),\n", - " ObjectId('5f22bce9ad3e9621e4119cd9'),\n", - " ObjectId('5f22bce9ad3e9621e4119cda'),\n", - " ObjectId('5f22bce9ad3e9621e4119cdb'),\n", - " ObjectId('5f22bce9ad3e9621e4119cdc'),\n", - " ObjectId('5f22bce9ad3e9621e4119cdd'),\n", - " ObjectId('5f22bce9ad3e9621e4119cde'),\n", - " ObjectId('5f22bce9ad3e9621e4119cdf'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce0'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce1'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce2'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce3'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce4'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce5'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce6'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce7'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce8'),\n", - " ObjectId('5f22bce9ad3e9621e4119ce9'),\n", - " ObjectId('5f22bce9ad3e9621e4119cea'),\n", - " ObjectId('5f22bce9ad3e9621e4119ceb'),\n", - " ObjectId('5f22bce9ad3e9621e4119cec'),\n", - " ObjectId('5f22bce9ad3e9621e4119ced'),\n", - " ObjectId('5f22bce9ad3e9621e4119cee'),\n", - " ObjectId('5f22bce9ad3e9621e4119cef'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf0'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf1'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf2'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf3'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf4'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf5'),\n", - " 
ObjectId('5f22bce9ad3e9621e4119cf6'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf7'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf8'),\n", - " ObjectId('5f22bce9ad3e9621e4119cf9'),\n", - " ObjectId('5f22bce9ad3e9621e4119cfa'),\n", - " ObjectId('5f22bce9ad3e9621e4119cfb'),\n", - " ObjectId('5f22bce9ad3e9621e4119cfc'),\n", - " ObjectId('5f22bce9ad3e9621e4119cfd'),\n", - " ObjectId('5f22bce9ad3e9621e4119cfe'),\n", - " ObjectId('5f22bce9ad3e9621e4119cff'),\n", - " ObjectId('5f22bce9ad3e9621e4119d00'),\n", - " ObjectId('5f22bce9ad3e9621e4119d01'),\n", - " ObjectId('5f22bce9ad3e9621e4119d02'),\n", - " ObjectId('5f22bce9ad3e9621e4119d03'),\n", - " ObjectId('5f22bce9ad3e9621e4119d04'),\n", - " ObjectId('5f22bce9ad3e9621e4119d05'),\n", - " ObjectId('5f22bce9ad3e9621e4119d06'),\n", - " ObjectId('5f22bce9ad3e9621e4119d07'),\n", - " ObjectId('5f22bce9ad3e9621e4119d08'),\n", - " ObjectId('5f22bce9ad3e9621e4119d09'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0a'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0b'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0c'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0d'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0e'),\n", - " ObjectId('5f22bce9ad3e9621e4119d0f'),\n", - " ObjectId('5f22bce9ad3e9621e4119d10'),\n", - " ObjectId('5f22bce9ad3e9621e4119d11'),\n", - " ObjectId('5f22bce9ad3e9621e4119d12'),\n", - " ObjectId('5f22bce9ad3e9621e4119d13'),\n", - " ObjectId('5f22bce9ad3e9621e4119d14'),\n", - " ObjectId('5f22bce9ad3e9621e4119d15'),\n", - " ObjectId('5f22bce9ad3e9621e4119d16'),\n", - " ObjectId('5f22bce9ad3e9621e4119d17'),\n", - " ObjectId('5f22bce9ad3e9621e4119d18'),\n", - " ObjectId('5f22bce9ad3e9621e4119d19'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1a'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1b'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1c'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1d'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1e'),\n", - " ObjectId('5f22bce9ad3e9621e4119d1f'),\n", - " ObjectId('5f22bce9ad3e9621e4119d20'),\n", - " 
ObjectId('5f22bce9ad3e9621e4119d21'),\n", - " ObjectId('5f22bce9ad3e9621e4119d22'),\n", - " ObjectId('5f22bce9ad3e9621e4119d23'),\n", - " ObjectId('5f22bce9ad3e9621e4119d24'),\n", - " ObjectId('5f22bce9ad3e9621e4119d25'),\n", - " ObjectId('5f22bce9ad3e9621e4119d26'),\n", - " ObjectId('5f22bce9ad3e9621e4119d27'),\n", - " ObjectId('5f22bce9ad3e9621e4119d28'),\n", - " ObjectId('5f22bce9ad3e9621e4119d29'),\n", - " ObjectId('5f22bce9ad3e9621e4119d2a'),\n", - " ObjectId('5f22bce9ad3e9621e4119d2b')]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "document_ids = molecules.find().distinct('_id')\n", - "document_ids" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def chunk(list, length):\n", - " for i in range(0, len(l), length):\n", - " yield l[i:i + n]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate(chunk, input):\n", - " client = pymongo.MongoClient('localhost', 27017, maxPoolSize=1000)\n", - " db = client['multip']\n", - " collection = db['molecules']\n", - " chunk_list = []\n", - " for id in chunk: \n", - " result = \n", - " chunk_list.append()\n", - " \n", - " \n", - "# # define client inside function\n", - " \n", - " \n", - " \n", - " \n", - "# client = pymongo.MongoClient('localhost', 27017, maxPoolSize=10000)\n", - "\n", - "# db = client['multip']\n", - "# collection = db['molecules']\n", - "# chunk_result_list = []\n", - "# # loop over the id's in the chunk and do the calculation with each\n", - "# # my problem right now is that I want to be able to chunk the CURSOR.\n", - "# for id in chunk:\n", - "# #do the calculation with document collection.find_one(id) \n", - "# chunk_result_list.append(result)\n", - "# return chunk_result_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are two ways that we can split the 
processing here. \n", - "\n", - "When the cursor query is expensive, we want to split up the cursor. \n", - "\n", - "When the cursor query is not, we simply want to keep the cursor together, and parallelize the subsequent operations. Let's start with the latter. We've got a cursor object that is iterable. \n", - "\n", - "What you could do is iterate through the cursor and get all the object ids. Then you could find all of them again and \n", - "For " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/Similarity Benchmarking.ipynb b/docs/notebooks/Similarity Benchmarking.ipynb new file mode 100644 index 0000000..7ebd8fb --- /dev/null +++ b/docs/notebooks/Similarity Benchmarking.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity Search Benchmarking\n", + "\n", + "These benchmarks were originally run on an early 2015 MacBook Pro with a 2.7 GHz dual-core i5 processor and 8GB of memory. \n", + "\n", + "They make use of a ChEMBL_27 dataset. 
\n", + "## Setup Work\n", + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import mongordkit\n", + "import time\n", + "import pymongo\n", + "import rdkit\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from os import sys\n", + "import pandas as pd\n", + "from rdkit import Chem\n", + "from statistics import mean, median\n", + "import mongomock\n", + "from rdkit.Chem import AllChem\n", + "from mongordkit.Database import write\n", + "from mongordkit.Search import similarity\n", + "from mongordkit import Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Database Setup\n", + "Here we set up a database called `test` that will hold our molecules. We will construct a collection called `molecules_100K` to hold the first 100,000 molecules in the ChEMBL_27 dataset and a collection called `molecules_1M` to hold the first 1,000,000 molecules in the ChEMBL_27 dataset. If you have already run benchmarks from `mongo-rdkit` on your local MongoDB instance, these should have been set up already." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the client that will connect to the database.\n", + "client = pymongo.MongoClient()\n", + "db = client.test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If necessary, write the first 100,000 compounds to molecules_100K.\n", + "if db.molecules_100K.count_documents({}) != 100000:\n", + " write.WriteFromSDF(db.molecules_100K, '../../../chembl_27.sdf', chunk_size=1000, limit=100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If necessary, write the first 1,000,000 compounds to molecules_1M.\n", + "if db.molecules_1M.count_documents({}) != 1000000:\n", + " write.writeFromSDF(db.molecules_1M, '../../../chembl_27.sdf', chunk_size=1000, limit=1000000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's ensure that there are actually 100,000 and 1M documents in these collections, respectively.\n", + "print(f\"In molecules_100K: {db.molecules_100K.count_documents({})} documents\")\n", + "print(f\"In molecules_1M: {db.molecules_1M.count_documents({})} documents\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Next, we have to prepare the database for search by adding in fingerprints and hash collections.\n", + "Search.PrepareForSearch(db, db.molecules_100K, db.molecules_100KCt, db.molecules_100KPm)\n", + "Search.PrepareForSearch(db, db.molecules_1M, db.molecules_1MCt, db.molecules_1MPm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query Set Setup\n", + "To benchmark, we'll use the first 200 compounds in ChEMBL. Let's get an rdmol for each of these and write them into a list. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "first_200 = []\n", + "for rdmol in Chem.ForwardMolSupplier('../../data/test_data/first_200.props.sdf'): \n", + " first_200.append(rdmol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmarks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will search each compound five times against the target database, taking the mean value as representative of that molecule. We'll then take the median and mean for all 200 compounds, repeating the entire process for thresholds 0.7, 0.75, 0.8, 0.85, 0.9, and 0.95. \n", + "\n", + "We will benchmark both the `SimSearchAggregate` and `SimSearchLSH` methods, keeping in mind that the LSH method does not return exact results. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]\n", + "repetitions = 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `SimSearchAggregate`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark against the first 100,000 molecules in ChEMBL. 
\n", + "aggregate_means_100K = []\n", + "aggregate_medians_100K = []\n", + "\n", + "for t in thresholds: \n", + " print(f\"Measuring performance for similarity threshold {t}...\")\n", + " query_times = []\n", + " for rdmol in first_200:\n", + " temp_times = []\n", + " for r in range(repetitions):\n", + " start = time.time()\n", + " _ = similarity.SimSearchAggregate(rdmol, db.molecules_100K, db.molecules_100KCt, threshold=t)\n", + " end = time.time()\n", + " temp_times.append(end - start)\n", + " query_times.append(mean(temp_times))\n", + " aggregate_means_100K.append([t, mean(query_times)])\n", + " aggregate_medians_100K.append([t, median(query_times)])\n", + "\n", + "print(f\"Aggregate means: {aggregate_means_100K}\")\n", + "print(f\"Aggregate medians: {aggregate_medians_100K}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we take a look at the 1M molecule dataset, let's graph these times to get a better idea of how similarity search increases in time required with lowered similarity thresholds: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_list = [v[0] for v in aggregate_medians_100K]\n", + "y_list = [v[1] for v in aggregate_medians_100K]\n", + "plt.xlabel('thresholds')\n", + "plt.ylabel('time (s)')\n", + "plt.title('SimSearchAggregate medians / 100K dataset')\n", + "plt.plot(x_list, y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And here are the equivalent benchmarks against a million-molecule dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark against the first 1M molecules in ChEMBL. 
\n", + "aggregate_means_1M = []\n", + "aggregate_medians_1M = []\n", + "\n", + "for t in thresholds: \n", + " print(f\"Measuring performance for similarity threshold {t}...\")\n", + " query_times = []\n", + " for rdmol in first_200:\n", + " temp_times = []\n", + " for r in range(repetitions):\n", + " start = time.time()\n", + " _ = similarity.SimSearchAggregate(rdmol, db.molecules_1M, db.molecules_1MCt, threshold=t)\n", + " end = time.time()\n", + " temp_times.append(end - start)\n", + " query_times.append(mean(temp_times))\n", + " aggregate_means_1M.append([t, mean(query_times)])\n", + " aggregate_medians_1M.append([t, median(query_times)])\n", + "\n", + "print(f\"Aggregate means: {aggregate_means_1M}\")\n", + "print(f\"Aggregate medians: {aggregate_medians_1M}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_list = [v[0] for v in aggregate_medians_1M]\n", + "y_list = [v[1] for v in aggregate_medians_1M]\n", + "plt.xlabel('thresholds')\n", + "plt.ylabel('time (s)')\n", + "plt.title('SimSearchAggregate medians / 1M dataset')\n", + "plt.plot(x_list, y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `SimSearchLSH`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will benchmark speed for LSH in the same way as we did for normal similarity search. As noted by the original ChEMBL authors of this approach, however, LSH also introduces an element of inaccuracy. Thus, we will also include a section on comparing results of `SimSearchAggregate` and `SimSearchLSH`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark against the first 100,000 molecules in ChEMBL. 
\n", + "LSH_means_100K = []\n", + "LSH_medians_100K = []\n", + "\n", + "for t in thresholds: \n", + " print(f\"Measuring performance for similarity threshold {t}...\")\n", + " query_times = []\n", + " for rdmol in first_200:\n", + " temp_times = []\n", + " for r in range(repetitions):\n", + " start = time.time()\n", + " _ = similarity.SimSearchLSH(rdmol, db, db.molecules_100K, \n", + " db.molecules_100KP, db.molecules_100KCt, threshold=t)\n", + " end = time.time()\n", + " temp_times.append(end - start)\n", + " query_times.append(mean(temp_times))\n", + " LSH_means_100K.append([t, mean(query_times)])\n", + " LSH_medians_100K.append([t, median(query_times)])\n", + "\n", + "print(f\"LSH means: {LSH_means_100K}\")\n", + "print(f\"LSH medians: {LSH_medians_100K}\")\n", + "\n", + "x_list = [v[0] for v in LSH_medians_100K]\n", + "y_list = [v[1] for v in LSH_medians_100K]\n", + "plt.xlabel('thresholds')\n", + "plt.ylabel('time (s)')\n", + "plt.title('SimSearchLSH medians / 100K dataset')\n", + "plt.plot(x_list, y_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark against the first 1M molecules in ChEMBL. 
\n", + "LSH_means_1M = []\n", + "LSH_medians_1M = []\n", + "\n", + "for t in thresholds: \n", + " print(f\"Measuring performance for similarity threshold {t}...\")\n", + " query_times = []\n", + " for rdmol in first_200:\n", + " temp_times = []\n", + " for r in range(repetitions):\n", + " start = time.time()\n", + " _ = similarity.SimSearchLSH(rdmol, db, db.molecules_1M, \n", + " db.molecules_1MP, db.molecules_1MCt, threshold=t)\n", + " end = time.time()\n", + " temp_times.append(end - start)\n", + " query_times.append(mean(temp_times))\n", + " LSH_means_1M.append([t, mean(query_times)])\n", + " LSH_medians_1M.append([t, median(query_times)])\n", + "\n", + "print(f\"LSH means: {LSH_means_1M}\")\n", + "print(f\"LSH medians: {LSH_medians_1M}\")\n", + "\n", + "x_list = [v[0] for v in LSH_medians_1M]\n", + "y_list = [v[1] for v in LSH_medians_1M]\n", + "plt.xlabel('thresholds')\n", + "plt.ylabel('time (s)')\n", + "plt.title('SimSearchLSH medians / 1M dataset')\n", + "plt.plot(x_list, y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to compare accuracy, we will use the approach written about in the ChEMBL blog post: finding the symmetric set difference between the two sets of results as a percentage of the size of the union of the two result sets. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "\n", + "for t in thresholds: \n", + " print(f\"Measuring accuracy for similarity threshold {t}...\")\n", + " nmols_w_discrepancies = 0\n", + " discrepancies_per_mol = []\n", + " discrepancy_percent_per_mol = []\n", + " for rdmol in first_200:\n", + " sim_lsh = similarity.SimSearchLSH(rdmol, db, db.molecules_100K, \n", + " db.molecules_100KP, db.molecules_100KCt, threshold=t)\n", + " sim_agg = similarity.SimSearchAggregate(rdmol, db.molecules_100K, db.molecules_100KCt, threshold=t)\n", + " if sim_lsh: \n", + " set_lsh = set(result[1] for result in sim_lsh)\n", + " else:\n", + " set_lsh = set()\n", + " if sim_agg: \n", + " set_agg = set(result[1] for result in sim_agg)\n", + " else: \n", + " set_agg = set()\n", + " sym_set_diff = (set_lsh ^ set_agg)\n", + " discrepancies = len(sym_set_diff)\n", + " total = len(set_lsh | set_agg)\n", + " if discrepancies:\n", + " nmols_w_discrepancies += 1\n", + " discrepancies_per_mol.append(discrepancies)\n", + " discrepancy_percent_per_mol.append(discrepancies / total * 100)\n", + " results.append([t, f'nmols_w_discrepancies: {nmols_w_discrepancies}', \n", + " np.mean(discrepancies_per_mol), np.mean(discrepancy_percent_per_mol)])\n", + "print(results)\n", + "x_list = [v[0] for v in results]\n", + "y_list = [v[3] for v in results]\n", + "plt.xlabel('thresholds')\n", + "plt.ylabel('discrepancy percent per molecule')\n", + "plt.title('LSH Accuracy / 100K dataset')\n", + "plt.plot(x_list, y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion\n", + "These times are already very reasonable for a similarity search. However, it is worth noting that these benchmarks were run on a local MongoDB instance, effectively making no distinction between the client and the server. 
A horizontally scaled MongoDB deployment could benefit greatly from the aggregation pipeline, speeding up search even further. \n", + "\n", + "Query time also increases greatly as the Tanimoto threshold decreases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py37_rdkit_beta", + "language": "python", + "name": "py37_rdkit_beta" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/notebooks/Similarity Testing.ipynb b/docs/notebooks/Similarity Testing.ipynb deleted file mode 100644 index 986a6c0..0000000 --- a/docs/notebooks/Similarity Testing.ipynb +++ /dev/null @@ -1,532 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Similarity Search Benchmarking\n", - "\n", - "These benchmarks were originally run on an early 2015 MacBook Pro with a 2.7 GHz dual-core i5 processor and 8GB of memory. \n", - "\n", - "They make use of a ChEMBL_27 dataset. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mongordkit\n", - "import time\n", - "import pymongo\n", - "import rdkit\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "from os import sys\n", - "import pandas as pd\n", - "from rdkit import Chem\n", - "from statistics import mean\n", - "import mongomock\n", - "from rdkit.Chem import AllChem\n", - "from mongordkit.Database import write\n", - "from mongordkit.Search import similarity" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "199 molecules successfully imported\n" - ] - } - ], - "source": [ - "#Create a mongomock database instance and write to it. \n", - "client = mongomock.MongoClient()\n", - "db = client.db\n", - "\n", - "#Write 200 molecules into the database\n", - "write.writeFromSDF(db.molecules, '../../data/test_data/first_200.props.sdf', 'test', chunk_size=100, limit=199)\n", - "doc = db.molecules.find_one()\n", - "m = Chem.Mol(doc['rdmol'])" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "#Add Morgan fingerprints into the database\n", - "similarity.addMorganFingerprints(db)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Check that similarity search is working, at least for one molecule. 
\n", - "doc = db.molecules.find_one()\n", - "m = Chem.Mol(doc['rdmol'])\n", - "results = similarity.similaritySearch(m, db, .8)\n", - "results" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "inserted chunk...\n", - "inserted chunk...\n", - "1000 molecules successfully imported\n" - ] - }, - { - "data": { - "text/plain": [ - "1000" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Create a regular mongoDB database instance and write the first 1000 molecules to it. \n", - "client = pymongo.MongoClient()\n", - "db = client.db\n", - "db.molecules.drop()\n", - "db.mfp_counts.drop()\n", - "write.writeFromSDF(db, '../../../chembl_27.sdf', 'test', reg_option='standard_setting', index_option='inchikey', chunk_size=500, limit=500)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_tanimoto(Na, Nb):\n", - " Nab = len(set(Na).intersection((set(Nb))))\n", - " return float(Nab) / (len(Na) + len(Nb) - Nab)\n", - "\n", - "def similarity_search_naive(query_mol, db, threshold): \n", - " results = []\n", - " qfp = list(AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=1024).GetOnBits())\n", - " for mol in db.molecules.find():\n", - " mfp = list(AllChem.GetMorganFingerprintAsBitVect(Chem.Mol(mol['rdmol']), 2, nBits=1024).GetOnBits())\n", - " tanimoto = calc_tanimoto(qfp, mfp)\n", - " if calc_tanimoto(qfp, mfp) >= threshold:\n", - " results.append([tanimoto, mol['smiles']])\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n", - "Measuring performance for similarity threshold 0.75.\n", - 
"Measuring performance for similarity threshold 0.8.\n", - "Measuring performance for similarity threshold 0.85.\n", - "Measuring performance for similarity threshold 0.9.\n", - "Measuring performance for similarity threshold 0.95.\n", - "[[0.7, 3.236401987075806], [0.75, 2.964214563369751], [0.8, 2.850223159790039], [0.85, 2.716036558151245], [0.9, 2.50888934135437], [0.95, 2.7822859287261963]]\n", - "Measuring performance for similarity threshold 0.7.\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'query_mol' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mm\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmolecules\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mmol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMol\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'rdmol'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msimilarity_search_naive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 26\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mtemp_times\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36msimilarity_search_naive\u001b[0;34m(mol, db, t)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msimilarity_search_naive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mqfp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAllChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetMorganFingerprintAsBitVect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_mol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnBits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetOnBits\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmolecules\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mmfp\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAllChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetMorganFingerprintAsBitVect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMol\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmol\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'rdmol'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnBits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetOnBits\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'query_mol' is not defined" - ] - } - ], - "source": [ - "#Run benchmarks for similarity search with and without aggregation parameters, then with LSH + aggregation. \n", - "thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]\n", - "times = []\n", - "repetitions = 5\n", - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(repetitions):\n", - " start = time.time()\n", - " for m in db.molecules.find():\n", - " mol = Chem.Mol(m['rdmol'])\n", - " _ = similarity.similaritySearch(mol, db, t)\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n", - "Measuring performance for similarity threshold 0.75.\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m 
Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mmol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMol\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'rdmol'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msimilarity_search_naive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mcounter\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36msimilarity_search_naive\u001b[0;34m(query_mol, db, threshold)\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mqfp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAllChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetMorganFingerprintAsBitVect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_mol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnBits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetOnBits\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0mmol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmolecules\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mmfp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAllChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetMorganFingerprintAsBitVect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mChem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMol\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmol\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'rdmol'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnBits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetOnBits\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mtanimoto\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_tanimoto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmfp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcalc_tanimoto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmfp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mthreshold\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(5):\n", - " start = time.time()\n", - " counter = 0\n", - " for m in db.molecules.find():\n", - " if counter > 100: \n", - " break\n", 
- " mol = Chem.Mol(m['rdmol'])\n", - " _ = similarity_search_naive(mol, db, t)\n", - " counter += 1\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "x_list = [v[0] for v in times]\n", - "y_list = [v[1]*1000 for v in times]\n", - "plt.xlabel('thresholds')\n", - "plt.ylabel('time (ms)')\n", - "plt.title('Without Aggregation')\n", - "plt.plot(x_list, y_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n", - "Measuring performance for similarity threshold 0.75.\n", - "Measuring performance for similarity threshold 0.8.\n", - "Measuring performance for similarity threshold 0.85.\n", - "Measuring performance for similarity threshold 0.9.\n", - "Measuring performance for similarity threshold 0.95.\n", - "[[0.7, 6.002911186218261], [0.75, 5.983159065246582], [0.8, 5.641262626647949], [0.85, 5.888340759277344], [0.9, 6.869273900985718], [0.95, 5.446581506729126]]\n" - ] - }, - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]\n", - "times = []\n", - "repetitions = 5\n", - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(repetitions):\n", - " start = time.time()\n", - " for m in db.molecules.find():\n", - " mol = Chem.Mol(m['rdmol'])\n", - " _ = similarity.similaritySearchAggregate(mol, db, t)\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)\n", - "x_list = [v[0] for v in times]\n", - "y_list = [v[1]*1000 for v in times]\n", - "plt.xlabel('thresholds')\n", - "plt.ylabel('time (ms)')\n", - "plt.title('With Aggregation')\n", - "plt.plot(x_list, y_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from chembl...\n", - "The specified setting does not exist. 
Will only insert default molecules\n", - "inserted chunk...\n", - "inserted chunk...\n", - "1000 molecules successfully imported\n" - ] - } - ], - "source": [ - "db.molecules.drop()\n", - "db.mfp_counts.drop()\n", - "write.writeFromSDF(db, '../../../chembl_27.sdf', 'test', reg_option='inchikey', index_option='inchikey', chunk_size=500, limit=500)\n", - "similarity.addMorganFingerprints(db)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n", - "Measuring performance for similarity threshold 0.75.\n", - "Measuring performance for similarity threshold 0.8.\n", - "Measuring performance for similarity threshold 0.85.\n", - "Measuring performance for similarity threshold 0.9.\n", - "Measuring performance for similarity threshold 0.95.\n", - "[[0.7, 0.0038346290588378907], [0.75, 0.003863954544067383], [0.8, 0.00497593879699707], [0.85, 0.00534672737121582], [0.9, 0.004187107086181641], [0.95, 0.006510639190673828]]\n" - ] - }, - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#Compute benchmarks with a fingerprint counts collection.\n", - "thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]\n", - "times = []\n", - "repetitions = 5\n", - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(repetitions):\n", - " start = time.time()\n", - " for m in db.molecules.find():\n", - " mol = Chem.Mol(m['rdmol'])\n", - " _ = similarity.similaritySearch(mol, db, t)\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)\n", - "x_list = [v[0] for v in times]\n", - "y_list = [v[1]*1000 for v in times]\n", - "plt.xlabel('thresholds')\n", - "plt.ylabel('time (ms)')\n", - "plt.title('Without Aggregation')\n", - "plt.plot(x_list, y_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Measuring performance for similarity threshold 0.7.\n", - "Measuring performance for similarity threshold 0.75.\n", - "Measuring performance for similarity threshold 0.8.\n", - "Measuring performance for similarity threshold 0.85.\n", - "Measuring performance for similarity threshold 0.9.\n", - "Measuring performance for similarity threshold 0.95.\n", - "[[0.7, 0.009987068176269532], [0.75, 0.0059474468231201175], [0.8, 0.005334234237670899], [0.85, 0.0038384437561035157], [0.9, 0.0040286540985107425], [0.95, 0.0037296295166015627]]\n" - ] - }, - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "times = []\n", - "for t in thresholds: \n", - " print(\"Measuring performance for similarity threshold {}.\".format(t))\n", - " temp_times = []\n", - " for r in range(repetitions):\n", - " start = time.time()\n", - " for m in db.molecules.find():\n", - " mol = Chem.Mol(m['rdmol'])\n", - " _ = similarity.similaritySearchAggregate(mol, db, t)\n", - " end = time.time()\n", - " temp_times.append(end - start)\n", - " times.append([t, mean(temp_times)])\n", - "print(times)\n", - "x_list = [v[0] for v in times]\n", - "y_list = [v[1]*1000 for v in times]\n", - "plt.xlabel('thresholds')\n", - "plt.ylabel('time (ms)')\n", - "plt.title('Without Aggregation')\n", - "plt.plot(x_list, y_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/Similarity and Substructure Search.ipynb b/docs/notebooks/Similarity and Substructure Search.ipynb index 837c466..82c3f12 100644 --- a/docs/notebooks/Similarity and Substructure Search.ipynb +++ b/docs/notebooks/Similarity and Substructure Search.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 
3, "metadata": { "scrolled": true }, @@ -63,195 +63,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "populating mongodb collection with compounds from SDF...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit 
WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Charges were rearranged\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [22:56:21] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit 
WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [23:01:23] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: 
[23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: 
[23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:24] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: 
[23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "populating mongodb collection with compounds from SDF...\n", "200 molecules successfully imported\n", "0 duplicates skipped\n", "Preparing database and collections for search...\n", @@ -273,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -302,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": { "scrolled": true }, @@ -318,91 +130,182 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:25] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: 
[23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:44] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:44] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:44] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:45] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [23:01:46] WARNING: Omitted undefined stereo\n" + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] 
WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: 
[15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] 
WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] 
WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:39] WARNING: 
Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: 
[15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", + "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n" ] }, { @@ -419,7 +322,7 @@ "200" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -449,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -474,7 +377,7 @@ " 'count': 16}" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -492,17 +395,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "similaritySearch: []\n", + "similaritySearch: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n", "\n", "\n", - "similaritySearchAggregate: []\n" + "similaritySearchAggregate: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n" ] } ], @@ -510,10 +413,10 @@ "q_mol = 
Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", "# Perform a similarity search on TestDB for q_mol with a Tanimoto threshold of 0.4. \n", - "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", + "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", "\n", "# Do the same thing, but use the MongoDB Aggregation Pipeline. \n", - "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.8)\n", + "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", "\n", "print('similaritySearch: {}'.format(results1))\n", "print('\\n')\n", @@ -531,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -555,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -569,7 +472,8 @@ "source": [ "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", "\n", - "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, demo_db.permutations, threshold=0.8)\n", + "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, \n", + " demo_db.permutations, demo_db.mfp_counts, threshold=0.8)\n", "\n", "print('similaritySearchLSH: {}'.format(results3))" ] @@ -594,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -610,7 +514,7 @@ " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -631,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -647,7 +551,7 @@ " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -700,7 +604,7 @@ "\n", "mongordkit.Search.similarity.**AddHashCollections**(db (*MongoDB 
database*), mol_collection (*MongoDB collection*)) --> None\n", "\n", - "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" + "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), count_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" ] }, { diff --git a/docs/notebooks/Substructure Benchmarking.ipynb b/docs/notebooks/Substructure Benchmarking.ipynb index 26f2eb8..ef27503 100644 --- a/docs/notebooks/Substructure Benchmarking.ipynb +++ b/docs/notebooks/Substructure Benchmarking.ipynb @@ -15,7 +15,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Imports" + "## Setup Work\n", + "### Imports" ] }, { @@ -38,27 +39,25 @@ "import numpy as np\n", "from os import sys\n", "import pandas as pd\n", - "from statistics import mean, median\n", "from IPython.display import display, HTML\n", "\n", "from mongordkit.Database import write\n", "from mongordkit.Search import similarity\n", - "from mongordkit.Search import substructure" + "from mongordkit.Search import substructure\n", + "from mongordkit import Search" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Database Setup\n", - "Here we set up a database called `test` that will hold our molecules. We will construct 1 collection called `molecules_100K` to hold the first 100,000 molecules in the ChEMBL_27 dataset and a collection called `molecules_1M` to hold the first 1,000,000 molecules in the ChEMBL_27 dataset." 
+ "### Database Setup\n", + "Here we set up a database called `test` that will hold our molecules. We will construct a collection called `molecules_100K` to hold the first 100,000 molecules in the ChEMBL_27 dataset and a collection called `molecules_1M` to hold the first 1,000,000 molecules in the ChEMBL_27 dataset. If you have already run benchmarks from `mongo-rdkit` on your local MongoDB instance, these should have been set up already." ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ "# Initialize the client that will connect to the database.\n", "client = pymongo.MongoClient()\n", @@ -67,76 +66,65 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "populating mongodb collection with compounds from chembl...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [15:22:20] Warning: conflicting stereochemistry at atom 11 ignored.\n", - "RDKit WARNING: [15:45:12] Warning: conflicting stereochemistry at atom 14 ignored.\n", - "RDKit WARNING: [16:15:11] Warning: conflicting stereochemistry at atom 10 ignored.\n", - "RDKit WARNING: [16:15:11] Warning: conflicting stereochemistry at atom 10 ignored.\n", - "RDKit WARNING: [16:15:40] Warning: conflicting stereochemistry at atom 10 ignored.\n", - "RDKit WARNING: [16:15:40] Warning: conflicting stereochemistry at atom 10 ignored.\n", - "RDKit WARNING: [16:26:44] Warning: conflicting stereochemistry at atom 6 ignored.\n", - "RDKit WARNING: [16:26:44] Warning: conflicting stereochemistry at atom 6 ignored.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "101001 molecules successfully imported\n" + "populating mongodb collection with compounds from SDF...\n", + "100000 molecules successfully imported\n", + "1 duplicates skipped\n" ] }, { "data": { "text/plain": [ - "101001" + 
"100000" ] }, - "execution_count": 15, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Write the first 100,000 compounds to molecules_100K. \n", - "write.writeFromSDF(db.molecules_100K, '../../../chembl_27.sdf', 'test', reg_option='standard_setting', \n", - " index_option='inchikey', chunk_size=1000, limit=100000)" + "# If necessary, write the first 100,000 compounds to molecules_100K.\n", + "if db.molecules_100K.count_documents({}) != 100000:\n", + " write.WriteFromSDF(db.molecules_100K, '../../../chembl_27.sdf', chunk_size=1000, limit=100000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "populating mongodb collection with compounds from SDF...\n" + ] + } + ], "source": [ - "# Write the first 1,000,000 compounds to molecules_1M.\n", - "write.writeFromSDF(db.molecules_1M, '../../../chembl27_sdf', 'test', reg_option='standard_setting', \n", - " index_option='inchikey', chunk_size=1000, limit=1000000)" + "# If necessary, write the first 1,000,000 compounds to molecules_1M.\n", + "if db.molecules_1M.count_documents({}) != 1000000:\n", + " write.WriteFromSDF(db.molecules_1M, '../../../chembl_27.sdf', chunk_size=1000, limit=1000000)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "In molecules_100K: 101000 documents\n", - "In molecules_1M: 0 documents\n" + "In molecules_100K: 100000 documents\n", + "In molecules_1M: 180512 documents\n" ] } ], @@ -146,11 +134,22 @@ "print(f\"In molecules_1M: {db.molecules_1M.count_documents({})} documents\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Next, we have to prepare all of the documents in our collections for search by adding in fingerprints.\n", + 
"substructure.AddPatternFingerprints(db.molecules_100K)\n", + "substructure.AddPatternFingerprints(db.molecules_1M)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Query Set Setup\n", + "### Query Set Setup\n", "For our queries, we'll use three sets of patterns identified by Greg Landrum in one of his [blog posts](http://rdkit.blogspot.com/2013/11/fingerprint-based-substructure.html) on substructure searching and discussed in this [mailing list](http://www.mail-archive.com/rdkit-discuss@lists.sourceforge.net/msg02066.html) and this [presentation](http://www.hinxton.wellcome.ac.uk/advancedcourses/MIOSS%20Greg%20Landrum.pdf). They are: \n", "- Fragments: 500 diverse molecules taken from the ZINC Fragments set\n", "- Leads: 500 diverse molecules taken from the ZINC Lead-like set\n", @@ -159,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -184,12 +183,12 @@ "### Naive Substructure Search\n", "`substructure.SubSearchNaive` is a search that simply loops through the dataset and checks for a substructure match on each molecule. This method is not directly benchmarked here because searching through a single molecule takes upward of 5 seconds; this means that it is far too slow to feel directly interactive.\n", "### Substructure Search with Fingerprint Screening\n", - "Instead, we will benchmark the standard `SubSearch`, which makes use of fingerprint screening to dramatically increase efficiency. First, we want to see what kinds of times we are dealing with. For each of our query sets, we will search all of their elements against `molecules_100K` and `molecules_1M`, then return the median and mean query times in seconds. " + "Instead, we will benchmark the standard `SubSearch`, which makes use of fingerprint screening to dramatically increase efficiency. 
For each of our query sets, we will search all of their elements against `molecules_100K` and `molecules_1M`, then return the median and mean query times in seconds. " ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -199,34 +198,118 @@ " start = time.time()\n", " substructure.SubSearch(pattern, dataset)\n", " end = time.time()\n", - " results.append(end - start)" + " results.append(end - start)\n", + " return results" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanmedian
fragments0.0627400.062074
leads0.0625920.062289
pieces0.0627390.061950
\n", + "
" + ], + "text/plain": [ + " mean median\n", + "fragments 0.062740 0.062074\n", + "leads 0.062592 0.062289\n", + "pieces 0.062739 0.061950" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Benchmark for search of all three query sets against 100K and 1M.\n", - "# This should take around five minutes; these calls can be split up if necessary.\n", + "# This should take around five minutes; these calls can be commented out if necessary.\n", "frag_times_100K = benchmark_query_set(fragments, db.molecules_100K)\n", - "frag_times_1M = benchmark_query_set(fragments, db.molecules_1M)\n", "lead_times_100K = benchmark_query_set(leads, db.molecules_100K)\n", - "lead_times_1M = benchmark_query_set(leads, db.molecules_1M)\n", "pieces_times_100K = benchmark_query_set(pieces, db.molecules_100K)\n", - "pieces_times_1M = benchmark_query_set(pieces, db.molecules_1M)\n", "\n", - "results = [frag_times_100K, frag_times_1M, lead_times_100K, lead_times_1M, pieces_times_100K, pieces_times_1M]\n", + "results = [frag_times_100K, lead_times_100K, pieces_times_100K]\n", + "means_100K = [np.mean(times) for times in results]\n", + "medians_100K = [np.median(times) for times in results]\n", "\n", - "means = [mean(times) for times in results]\n", - "medians = [median(times) for times in results]\n", + "data = {'mean (100K)': means, 'median (100K)': medians}\n", + "df = pd.DataFrame(data, index =['fragments', 'leads', 'pieces']) \n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark for search of all three query sets against 1M. 
\n", + "# This should take around five minutes; these calls can be commented out if necessary.\n", + "frag_times_1M = benchmark_query_set(fragments, db.molecules_1M)\n", + "lead_times_1M = benchmark_query_set(leads, db.molecules_1M)\n", + "pieces_times_1M = benchmark_query_set(pieces, db.molecules_1M)\n", + "\n", + "results = [frag_times_1M, lead_times_1M, pieces_times_1M]\n", + "means_1M = [np.mean(times) for times in results]\n", + "medians_1M = [np.median(times) for times in results]\n", "\n", - "data = {'mean': means, 'median': medians}\n", + "data = {'mean (1M)': means, 'median (1M)': medians}\n", "df = pd.DataFrame(data, index =['fragments', 'leads', 'pieces']) \n", "df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Discussion\n", + "\n", + "A median search time of less than 70ms indicates decent performance, certainly fast enough to have interactive search performance on large datasets with single molecules (the traditional UI benchmark for instant feedback being 100ms). 
" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -248,18 +331,6 @@ "display_name": "py37_rdkit_beta", "language": "python", "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" } }, "nbformat": 4, diff --git a/docs/testing.md b/docs/testing.md index a118563..ce75d74 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -7,10 +7,15 @@ pip install -U pytest ``` ## Running Tests While you are in the top-level directory (`/your path here/mongo-rdkit`), -you can run all tests simply by opening a command line and running: +you can run all tests on macOS/Linux systems simply by opening a command line and running: ``` pytest ``` +On Windows, you may have to run the following instead: +``` +python -m pytest +``` + Directory position is important because as of 7/10/20, the tests have explicit file paths to test data in the `/data` directory. @@ -21,6 +26,10 @@ a MongoDB URI to the testing framework: ``` pytest --server="YOUR_MONGO_URI" ``` +On Windows: +``` +python -m pytest --server="YOUR_MONGO_URI" +``` Passing in `"local"` will run the tests on your locally hosted MongoDB instance. 
diff --git a/mongordkit/Database/__pycache__/write.cpython-37.pyc b/mongordkit/Database/__pycache__/write.cpython-37.pyc deleted file mode 100644 index b0fdab4676615f4b04e22791685b637ac7ee2719..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4377 zcmeHKOLH4V5Z={uwR%`U!mD@;;gJZ4TyVj-s8nLh#x86ng`9#?6{uD_BWuw<RZbjFac?YyogX)7EEm8BX-mNaQm1zaqYP3pgceU;k zU7~e(*Qwso8jm~xtF(4Q7lk#)xA+7~x%DCQ;9cT$H*mMCa0rzmr;cwAod-JfZ)8OU zKD7e7V?$piYFpPjTfNPj?e)Fw>+Oy>_kQcW?Y-V&>jMfy?E+2+R}CI4ufr2w0-$&d1U?}nC`&|>h953VGc<6>go(~ruDc5KjGQbrv#Gs^P2Nr~p)(vC{9 zS4sA&B*kMhso*S$A-@_?IVs-JliVo7uOx+I=z(9+(t|4>XzHuf_v`I9lOkC3>QObd z=zeO^{TtfB_b^sZs`Hu;zK8MP+v93t+>`5vS`lg`RRfD_G^a*@&kXDJtfrX+t`E2q zv5-VVM(n_km>-2C7?4hDlSBd8y}r|1zrM4xA-zH@pAwrh3r5JnmQNg?vfCu^r3O_A z$INyHj?E~1C7t!HjopnVxjv4Zz_(m7wz%c7i1Bb$D!EMTIE(_%Ib?(ecI<%-B#d}$ zM=@ulKOsut#PRoumFgq&!9#-nC(MqTsk&RZBxz45V=Mg(#=wd!5;C*^df0*M!Z5Jv z)Fkh3ygRq*soE1Ohruo4S^k800Vs%Fz|R6p9Y16|f>m1)OcObtGM+^dTE4z@v;AJL zv)$d8Ds&ysiKLx=?DZK(+)p&(D!?5GAwrO9+1cLR-ct%zC6G9xce0XIat)yI_k-5j z-1-&NmPCHcA&`Zstm_3n6J-_WJs~l?=+YIa;zmu>76VjdA^t=`SuARcQ6bC|r;EJw zmngw_waG+Yeig>l&%!)GNaQ(%p1h3fHKt1ty7RThFGAZd>&rJ`dxp#QkVDu8<00eA zEf$WTxGWudnbVOIsj@1ztjOwHAzPLwN^?9B6?t54f_K5WB~Tk)1)3yO>V90W z!=2F$cuf7#^lies0(B+pdM;x^y#}|b^9JZRo7OJ;JzBe)ED~BfrH}TE&^|m^0Yb~q z2(7R{XrIpr4SOltLqao75n5?RXe&TU<{Y7|03pe*lF)ws1EKwxTJ$3j+IMFV+IK0T zl@|!DJV$7i1wtbb=zm6NJKLST{~1m3xH{72dH0ON^k2kiE^O#IO2aC0sXP!;Dhd~v zK%GASJraBS;Ykd^oUxcZUH>tMRe1%@J6=VBIXquNQAhD8ipNl#Ljiw68h;$c6DXcU z@f3(gjXw=9kwfeFGuZqrisw)~kAk3h0mTIrFQRw}1@Z%b8O19okQ$V@1SL_Qo$tGW zy93JCp!Li%-h9|I{x#5KXF%YEZ_EqU;R*5oPX0<|4u6wJ*bUdR*)VV^WIT3283Ae~ z78%68%2uNxWSEkZAn}}~hPDOiER_m68+&`(?RW9KQ;Gd7(Y-jIwOz#IQf7Q9(UI2L z!akw>RhVF!Pts{G;NQsd?Nf=N%206txY5Q5l2j;APZ7yqbbWx{F&z%q&$y3D)S)lBDNbSFddSa s498HH?vS(1*(i6L+-Sx-UO?f`=?Xe2#4TQjbV%LjvN^rJf{$VT4i~5T3IG5A diff --git a/mongordkit/Database/registration.py b/mongordkit/Database/registration.py index 876321d..acf9b26 100644 --- 
a/mongordkit/Database/registration.py +++ b/mongordkit/Database/registration.py @@ -14,10 +14,9 @@ DEFAULT_PREPROCESS = False DEFAULT_INDEX = 'inchikey_standard' - -RDKIT_HASH_FUNCTIONS = rdkit.Chem.rdMolHash.HashFunction.names HASH_FUNCTIONS = {} -HASH_FUNCTIONS['MoleculeHashString'] = rdMolHash.GenerateMoleculeHashString +for k, v in rdMolHash.HashFunction.names.items(): + HASH_FUNCTIONS[k] = lambda rdmol, f=v: rdMolHash.MolHash(rdmol, f) HASH_FUNCTIONS['inchi_standard'] = Chem.MolToInchi HASH_FUNCTIONS['inchikey_standard'] = Chem.MolToInchiKey HASH_FUNCTIONS['inchi_KET_15T'] = lambda rdmol: Chem.MolToInchi(rdmol, options='-KET -15T') @@ -33,7 +32,6 @@ def __init__(self): self.author = DEFAULT_AUTHOR self.pre_processed = DEFAULT_PREPROCESS self.index_option = DEFAULT_INDEX - self.rdkit_hashes = set(RDKIT_HASH_FUNCTIONS.keys()) self.hashes = set(HASH_FUNCTIONS.keys()) self.fingerprints = {} self.value_fields = {} @@ -46,16 +44,15 @@ def __repr__(self): '}' def set_index(self, new_index): - if new_index not in HASH_FUNCTIONS.keys() and new_index not in RDKIT_HASH_FUNCTIONS.keys(): + if new_index not in HASH_FUNCTIONS.keys(): raise Exception("Please add this hash first.") else: self.index_option = new_index + return def get_index_value(self, rdmol): if self.index_option in HASH_FUNCTIONS.keys(): return HASH_FUNCTIONS[self.index_option](rdmol) - elif self.index_option in RDKIT_HASH_FUNCTIONS.keys(): - return rdMolHash.MolHash(rdmol, RDKIT_HASH_FUNCTIONS[self.index_option]) else: raise Exception("Specified index option does not exist.") @@ -73,9 +70,6 @@ def remove_field(self, field_name): if field_name in self.value_fields.keys(): self.value_fields.pop(field_name) print(f'removed {field_name} from scheme') - if field_name in self.rdkit_hashes: - self.rdkit_hashes.remove(field_name) - print(f'removed {field_name} from scheme') def generate_mol_doc(self, rdmol): molDoc = { @@ -84,8 +78,6 @@ def generate_mol_doc(self, rdmol): 'smiles': Chem.MolToSmiles(rdmol), 'scheme': 
self.scheme_name, 'hashes': {hash_name: HASH_FUNCTIONS[hash_name](rdmol) for hash_name in self.hashes}, - 'rdkit_hashes': {hash_name: rdMolHash.MolHash(rdmol, RDKIT_HASH_FUNCTIONS[hash_name]) - for hash_name in self.rdkit_hashes}, 'fingerprints': {fp: fp_method(rdmol) for fp, fp_method in self.fingerprints.items()}, 'value_data': {field_name: value for field_name, value in self.value_fields.items()} } diff --git a/mongordkit/Database/write.py b/mongordkit/Database/write.py index 02dad52..50d37e2 100644 --- a/mongordkit/Database/write.py +++ b/mongordkit/Database/write.py @@ -1,13 +1,14 @@ import pymongo, pickle from bson import Binary from rdkit import Chem +import rdkit from rdkit.Chem import rdMolHash from rdkit.Chem import rdinchi -from .registration import MolDocScheme, HASH_FUNCTIONS, RDKIT_HASH_FUNCTIONS +from .registration import MolDocScheme, HASH_FUNCTIONS def WriteFromSDF(mol_collection, sdf, scheme=MolDocScheme(), - reg_collection=None, chunk_size=100, limit=None): + reg_collection=None, chunk_size=100, limit=None, warnings=False): """ Writes the contents of SDF to MOL_COLLECTION and creates an index on the index specificed in SCHEME. Optional parameters: @@ -22,14 +23,10 @@ def WriteFromSDF(mol_collection, sdf, scheme=MolDocScheme(), :param chunk_size: Integer indicating how many molecules inserted at a time. :param limit: Integer indicating how many molecules to insert. """ + if not warnings: + rdkit.RDLogger.DisableLog('rdApp.*') molecules = mol_collection print('populating mongodb collection with compounds from SDF...') - # This is placeholder code for when more registration options exist. 
- # if index_option not in VALID_HASHES: - # options = ', '.join(VALID_HASHES) - # raise ValueError("index_option must be one of {}".format(options)) - # else: - # hash = HASH_FUNCTIONS[index_option] chunk = [] inserted = 0 duplicates = 0 @@ -81,12 +78,6 @@ def WriteFromMolList(mol_collection, list, scheme=MolDocScheme(), """ molecules = mol_collection print('populating mongodb collection with compounds from list...') - # This is placeholder code for when more registration options exist. - # if index_option not in VALID_HASHES: - # options = ', '.join(VALID_HASHES) - # raise ValueError("index_option must be one of {}".format(options)) - # else: - # hash = HASH_FUNCTIONS[index_option] chunk = [] inserted = 0 duplicates = 0 diff --git a/mongordkit/Search/similarity.py b/mongordkit/Search/similarity.py index b31ac43..e31c495 100644 --- a/mongordkit/Search/similarity.py +++ b/mongordkit/Search/similarity.py @@ -210,7 +210,7 @@ def AddHashCollections(db, mol_collection): {'$push': {'molecules': moldoc['_id']}}, True) -def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=DEFAULT_THRESHOLD): +def SimSearchLSH(mol, db, mol_collection, perm_collection, count_collection, threshold=DEFAULT_THRESHOLD): """ Conducts a similarity search for query molecule MOL in MOL_COLLECTION with Tanimoto threshold THRESHOLD. 
@@ -236,8 +236,8 @@ def SimSearchLSH(mol, db, mol_collection, perm_collection, threshold=DEFAULT_THR except ZeroDivisionError: fp_max = float('inf') req_common_count = qfp_count - fp_min + 1 - if 'mfp' in db.list_collection_names(): - req_common_bits = [count['_id'] for count in db.mfp_counts.find( + if count_collection: + req_common_bits = [count['_id'] for count in count_collection.find( {'_id': {'$in': qfp}}).sort('count', 1).limit(req_common_count)] else: req_common_bits = qfp[:req_common_count] diff --git a/mongordkit/Search/tests/test_similarity.py b/mongordkit/Search/tests/test_similarity.py index 52bc7dc..ee59854 100644 --- a/mongordkit/Search/tests/test_similarity.py +++ b/mongordkit/Search/tests/test_similarity.py @@ -139,7 +139,7 @@ def test_similarity_accuracy_LSH(mongoURI): smiles = Chem.MolToSmiles(mol) search_python = [result[1] for result in utils.similaritySearchPython(mol, db_python, t)] search_mongo_LSH = [result[1] for result in - similarity.SimSearchLSH(mol, db_mongo, db_mongo.molecules, db_mongo.permutations, t)] + similarity.SimSearchLSH(mol, db_mongo, db_mongo.molecules, db_mongo.permutations, db_mongo.mfp_counts, t)] assert set(search_mongo_LSH).issubset(search_python) print(counter) counter += 1 \ No newline at end of file From be283f88865ab10780ce9a87b0cc662a8577b587 Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 18 Aug 2020 15:20:03 -0400 Subject: [PATCH 4/7] fix hash test --- mongordkit/Database/tests/test_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mongordkit/Database/tests/test_write.py b/mongordkit/Database/tests/test_write.py index 1acbe63..d09b240 100644 --- a/mongordkit/Database/tests/test_write.py +++ b/mongordkit/Database/tests/test_write.py @@ -32,7 +32,7 @@ def test_hashes(self): data_scheme = registration.MolDocScheme() data_scheme.set_index("CanonicalSmiles") assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) - 
data_scheme.set_index("MoleculeHashString") + data_scheme.set_index("inchi_standard") assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) def test_uniqueInsertion(self): From 6cc6321fde8685ae11847b6a775ff977d5eb9976 Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 25 Aug 2020 12:54:52 -0400 Subject: [PATCH 5/7] Push up write benchmarks --- ...ng and Writing to MongoDB-checkpoint.ipynb | 18 +- ...y and Substructure Search-checkpoint.ipynb | 5 +- .../Creating and Writing to MongoDB.ipynb | 18 +- docs/notebooks/Similarity Benchmarking.ipynb | 25 +- .../Similarity and Substructure Search.ipynb | 5 +- .../notebooks/Substructure Benchmarking.ipynb | 28 +- .../Write and Registration Benchmarking.ipynb | 427 ++++++++++++++++++ mongordkit/Database/registration.py | 15 +- mongordkit/Database/tests/test_write.py | 2 +- requirements.txt | 24 - setup.txt | 56 --- 11 files changed, 505 insertions(+), 118 deletions(-) create mode 100644 docs/notebooks/Write and Registration Benchmarking.ipynb delete mode 100644 requirements.txt delete mode 100644 setup.txt diff --git a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb index c581d97..fdbf379 100644 --- a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb @@ -41,7 +41,10 @@ "source": [ "client = pymongo.MongoClient()\n", "client.drop_database('demo_db')\n", - "demo_db = client.demo_db" + "demo_db = client.demo_db\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { @@ -90,7 +93,7 @@ "\n", "It does this in two parts. First, it defines the global variable `HASH_FUNCTIONS` as a dictionary that maps hash function names to methods. 
It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", "\n", - "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" + "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, two hashes, fingerprints, and value fields. 
All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" ] }, { @@ -201,7 +204,8 @@ } ], "source": [ - "scheme.remove_field('AnonymousGraph')\n", + "scheme.remove_field('CanonicalSmiles')\n", + "scheme.add_hash_field('MolFormula')\n", "scheme.set_index('MolFormula')\n", "scheme.generate_mol_doc(rdmol)" ] @@ -583,6 +587,7 @@ "- get_index_value(self, rdmol) --> *calculated index value*\n", "- add_hash_field(self, field_name, field_method) --> *None*\n", "- add_value_field(self, field_name, field_value) --> *None*\n", + "- add_all_hashes(self) --> *None*\n", "- remove_field(self, field_name) --> *None*\n", "- generate_mol_doc(self, rdmol) --> *Dict: document representing molecule according to scheme*" ] @@ -602,13 +607,6 @@ "\n", "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb index 82c3f12..33285bc 100644 --- a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb @@ -41,7 +41,10 @@ "source": [ "client = pymongo.MongoClient()\n", "client.drop_database('demo_db')\n", - "demo_db = client.demo_db" + "demo_db = client.demo_db\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { diff --git a/docs/notebooks/Creating and Writing to MongoDB.ipynb b/docs/notebooks/Creating and Writing to MongoDB.ipynb index c581d97..fdbf379 100644 --- a/docs/notebooks/Creating and Writing to MongoDB.ipynb +++ b/docs/notebooks/Creating 
and Writing to MongoDB.ipynb @@ -41,7 +41,10 @@ "source": [ "client = pymongo.MongoClient()\n", "client.drop_database('demo_db')\n", - "demo_db = client.demo_db" + "demo_db = client.demo_db\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { @@ -90,7 +93,7 @@ "\n", "It does this in two parts. First, it defines the global variable `HASH_FUNCTIONS` as a dictionary that maps hash function names to methods. It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", "\n", - "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" + "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, two hashes, fingerprints, and value fields. 
All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" ] }, { @@ -201,7 +204,8 @@ } ], "source": [ - "scheme.remove_field('AnonymousGraph')\n", + "scheme.remove_field('CanonicalSmiles')\n", + "scheme.add_hash_field('MolFormula')\n", "scheme.set_index('MolFormula')\n", "scheme.generate_mol_doc(rdmol)" ] @@ -583,6 +587,7 @@ "- get_index_value(self, rdmol) --> *calculated index value*\n", "- add_hash_field(self, field_name, field_method) --> *None*\n", "- add_value_field(self, field_name, field_value) --> *None*\n", + "- add_all_hashes(self) --> *None*\n", "- remove_field(self, field_name) --> *None*\n", "- generate_mol_doc(self, rdmol) --> *Dict: document representing molecule according to scheme*" ] @@ -602,13 +607,6 @@ "\n", "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None) --> *int: number of molecules imported*" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/notebooks/Similarity Benchmarking.ipynb b/docs/notebooks/Similarity Benchmarking.ipynb index 7ebd8fb..701d377 100644 --- a/docs/notebooks/Similarity Benchmarking.ipynb +++ b/docs/notebooks/Similarity Benchmarking.ipynb @@ -47,13 +47,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Initialize the client that will connect to the database.\n", "client = pymongo.MongoClient()\n", - "db = client.test" + "db = client.test\n", + "chembl = '../../../chembl_27.sdf'\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { @@ -64,7 +68,7 @@ "source": [ "# If necessary, write the first 100,000 compounds to molecules_100K.\n", "if db.molecules_100K.count_documents({}) != 100000:\n", - " write.WriteFromSDF(db.molecules_100K, '../../../chembl_27.sdf', 
chunk_size=1000, limit=100000)" + " write.WriteFromSDF(db.molecules_100K, chembl, chunk_size=1000, limit=100000)" ] }, { @@ -75,14 +79,23 @@ "source": [ "# If necessary, write the first 1,000,000 compounds to molecules_1M.\n", "if db.molecules_1M.count_documents({}) != 1000000:\n", - " write.writeFromSDF(db.molecules_1M, '../../../chembl_27.sdf', chunk_size=1000, limit=1000000)" + " write.writeFromSDF(db.molecules_1M, chembl, chunk_size=1000, limit=1000000)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In molecules_100K: 100000 documents\n", + "In molecules_1M: 629512 documents\n" + ] + } + ], "source": [ "# Let's ensure that there are actually 100,000 and 1M documents in these collections, respectively.\n", "print(f\"In molecules_100K: {db.molecules_100K.count_documents({})} documents\")\n", diff --git a/docs/notebooks/Similarity and Substructure Search.ipynb b/docs/notebooks/Similarity and Substructure Search.ipynb index 82c3f12..33285bc 100644 --- a/docs/notebooks/Similarity and Substructure Search.ipynb +++ b/docs/notebooks/Similarity and Substructure Search.ipynb @@ -41,7 +41,10 @@ "source": [ "client = pymongo.MongoClient()\n", "client.drop_database('demo_db')\n", - "demo_db = client.demo_db" + "demo_db = client.demo_db\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { diff --git a/docs/notebooks/Substructure Benchmarking.ipynb b/docs/notebooks/Substructure Benchmarking.ipynb index ef27503..53a5ef7 100644 --- a/docs/notebooks/Substructure Benchmarking.ipynb +++ b/docs/notebooks/Substructure Benchmarking.ipynb @@ -52,16 +52,22 @@ "metadata": {}, "source": [ "### Database Setup\n", - "Here we set up a database called `test` that will hold our molecules. 
We will construct a collection called `molecules_100K` to hold the first 100,000 molecules in the ChEMBL_27 dataset and a collection called `molecules_1M` to hold the first 1,000,000 molecules in the ChEMBL_27 dataset. If you have already run benchmarks from `mongo-rdkit` on your local MongoDB instance, these should have been set up already." + "Here we set up a database called `test` that will hold our molecules. We will construct a collection called `molecules_100K` to hold the first 100,000 molecules in the ChEMBL_27 dataset and a collection called `molecules_1M` to hold the first 1,000,000 molecules in the ChEMBL_27 dataset. If you have already run search or similarity benchmarks from `mongo-rdkit` on your local MongoDB instance, these should have been set up already." ] }, { - "cell_type": "raw", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Initialize the client that will connect to the database.\n", "client = pymongo.MongoClient()\n", - "db = client.test" + "db = client.test\n", + "chembl = '../../../chembl_27.sdf'\n", + "\n", + "# Disable rdkit warnings\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" ] }, { @@ -92,7 +98,7 @@ "source": [ "# If necessary, write the first 100,000 compounds to molecules_100K.\n", "if db.molecules_100K.count_documents({}) != 100000:\n", - " write.WriteFromSDF(db.molecules_100K, '../../../chembl_27.sdf', chunk_size=1000, limit=100000)" + " write.WriteFromSDF(db.molecules_100K, chembl, chunk_size=1000, limit=100000)" ] }, { @@ -111,7 +117,7 @@ "source": [ "# If necessary, write the first 1,000,000 compounds to molecules_1M.\n", "if db.molecules_1M.count_documents({}) != 1000000:\n", - " write.WriteFromSDF(db.molecules_1M, '../../../chembl_27.sdf', chunk_size=1000, limit=1000000)" + " write.WriteFromSDF(db.molecules_1M, chembl, chunk_size=1000, limit=1000000)" ] }, { @@ -331,6 +337,18 @@ "display_name": "py37_rdkit_beta", "language": "python", "name": "py37_rdkit_beta" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" } }, "nbformat": 4, diff --git a/docs/notebooks/Write and Registration Benchmarking.ipynb b/docs/notebooks/Write and Registration Benchmarking.ipynb new file mode 100644 index 0000000..468c5d0 --- /dev/null +++ b/docs/notebooks/Write and Registration Benchmarking.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Write and Registration Benchmarks\n", + "\n", + "These benchmarks were originally run on an early 2015 MacBook Pro with a 2.7 GHz dual-core i5 processor and 8GB of memory. All molecules are written into a data directory stored locally via `--dbpath`.\n", + "\n", + "They make use of molecules found in the data folder. \n", + "\n", + "Last updated: 8/24/20 by Christopher Zou\n", + "\n", + "## Setup Work\n", + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mongordkit.Database import write, registration\n", + "from rdkit import Chem\n", + "import rdkit\n", + "import numpy as np\n", + "import time\n", + "import pymongo\n", + "import mongomock\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Database Setup\n", + "Here we set up a database called `test` that will hold our molecules. We will construct a collection called `molecules_write_testing` to benchmark the speed of writing to a collection." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the client that will connect to the database.\n", + "client = pymongo.MongoClient()\n", + "db = client.test\n", + "db.molecules_write_testing.drop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Defining Some Useful Variables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "hash_functions = registration.HASH_FUNCTIONS\n", + "first_200_mols = '../../data/test_data/first_200.props.sdf'\n", + "chembl = '../../../chembl_27.sdf'\n", + "\n", + "# Disable RDLogger to reduce system output.\n", + "rdkit.RDLogger.DisableLog('rdApp.*')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmarking Write" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to know the performance of `write.WriteFromSDF`. To find out, let's write the first 1000-10000 (incrementing by 1000 every time) molecules of a ChEMBL dataset using a scheme that contains all 23 available hashes and take median write times:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "testing 1000\n", + "populating mongodb collection with compounds from SDF...\n", + "1000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "1000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "1000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "1000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "1000 molecules successfully 
imported\n", + "0 duplicates skipped\n", + "testing 2000\n", + "populating mongodb collection with compounds from SDF...\n", + "2000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "2000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "2000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "2000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "2000 molecules successfully imported\n", + "0 duplicates skipped\n", + "testing 3000\n", + "populating mongodb collection with compounds from SDF...\n", + "3000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n" + ] + } + ], + "source": [ + "repetitions = 5\n", + "scheme = registration.MolDocScheme()\n", + "scheme.add_all_hashes()\n", + "times = []\n", + "limits = [1000 + (i * 1000) for i in range(11)]\n", + "for number in limits:\n", + " temp_times = []\n", + " print(f'testing {number}')\n", + " for i in range(repetitions):\n", + " mol_collection = db.molecules_write_testing\n", + " start = time.time()\n", + " write.WriteFromSDF(mol_collection, chembl, scheme, limit=number)\n", + " end = time.time()\n", + " duration = end - start\n", + " mol_collection.drop()\n", + " temp_times.append(duration)\n", + " times.append([number, np.mean(temp_times)])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[2000, 14.187205600738526],\n", + " [2100, 14.940004205703735],\n", + " [2200, 15.965514373779296],\n", + " [2300, 16.910987186431885],\n", + " [2400, 23.27452983856201],\n", + " [2500, 21.29524955749512],\n", + " [2600, 23.414347171783447],\n", + " [2700, 
25.251486158370973],\n", + " [2800, 28.250892400741577],\n", + " [2900, 28.316519117355348],\n", + " [3000, 32.57432060241699]]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "x_values = [i[0] for i in times]\n", + "y_values = [i[1] for i in times]\n", + "plt.plot(x_values, y_values)\n", + "times" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also divide each mean value by the number of molecules written to get a measure of how long it takes to write each molecule:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "time_per_molecule = []\n", + "for i in range(times):\n", + " time_per_molecule.append(y_values[i] / x_values[i])\n", + " \n", + "plt.plot(x_values, time_per_molecule)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The amount of time required to insert each molecule appears to stay relatively constant between 0.02 and 0.025 seconds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Breaking Down Write Times\n", + "### Hashes\n", + "`mongordkit.registration` provides (as of time of writing) 23 different molecular hash options. Calculating all of these hash options when writing molecules into a database results in extremely long write times. Here, we calculate the median time required to calculate each hash for the first 200 molecules in a ChEMBL database. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "median_times = []\n", + "\n", + "for key in hash_functions: \n", + " times = []\n", + " for rdmol in Chem.ForwardSDMolSupplier(first_200_mols):\n", + " start = time.time()\n", + " _ = hash_functions[key](rdmol)\n", + " end = time.time()\n", + " times.append(end - start)\n", + " median_times.append([key, np.median(times)])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(40, 10))\n", + "x_list = [v[0] for v in median_times]\n", + "y_list = [v[1] for v in median_times]\n", + "plt.xlabel('hashes')\n", + "plt.ylabel('time (s)')\n", + "plt.bar(x_list, y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is evident that some of the hashes take significantly longer (those with the inchikeys being almost eight times as expensive as MolFormula or DegreeVector). With this in mind, let's take a look at how long it takes to generate a mol document. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MolDocuments\n", + "By default, `mongordkit.registration` includes only `CanonicalSmiles` and `inchikey_standard` in the hashes it generates. Let's look at time to generate mol documents for the first 200 mols in a ChEMBL dataset from above for this default scheme:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Median required time: 0.0010679960250854492, Mean required time: 0.0011445283889770508\n" + ] + } + ], + "source": [ + "scheme = registration.MolDocScheme()\n", + "times = []\n", + "for rdmol in Chem.ForwardSDMolSupplier(first_200_mols):\n", + " start = time.time()\n", + " _ = scheme.generate_mol_doc(rdmol)\n", + " end = time.time()\n", + " times.append(end - start)\n", + "print(f\"Median required time: {np.median(times)}, Mean required time: {np.mean(times)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we add all the hashes back in?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Median required time: 0.0047119855880737305, Mean required time: 0.00495597243309021\n" + ] + } + ], + "source": [ + "scheme.add_all_hashes()\n", + "times = []\n", + "for rdmol in Chem.ForwardSDMolSupplier(first_200_mols):\n", + " start = time.time()\n", + " _ = scheme.generate_mol_doc(rdmol)\n", + " end = time.time()\n", + " times.append(end - start)\n", + "print(f\"Median required time: {np.median(times)}, Mean required time: {np.mean(times)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding all the hashes increases the required time by a factor of between 4 and 5. However, this is still a negligible amount (around 5%) of time compared to the time that it takes to actually write the molecule into the database from above. In fact, the total time required to generate all of these mol documents for a million molecules would be less than two hours. These benchmarks were run because writing a million molecules had taken more than three days. \n", + "\n", + "Evidently, most of the work happens in the insertion step. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py37_rdkit_beta", + "language": "python", + "name": "py37_rdkit_beta" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/mongordkit/Database/registration.py b/mongordkit/Database/registration.py index acf9b26..3144990 100644 --- a/mongordkit/Database/registration.py +++ b/mongordkit/Database/registration.py @@ -32,7 +32,7 @@ def __init__(self): self.author = DEFAULT_AUTHOR self.pre_processed = DEFAULT_PREPROCESS self.index_option = DEFAULT_INDEX - self.hashes = set(HASH_FUNCTIONS.keys()) + self.hashes = {'CanonicalSmiles', 'inchikey_standard'} self.fingerprints = {} self.value_fields = {} @@ -56,9 +56,16 @@ def get_index_value(self, rdmol): else: raise Exception("Specified index option does not exist.") - def add_hash_field(self, field_name, field_method): - self.hashes.add(field_name) - HASH_FUNCTIONS[field_name] = field_method + def add_hash_field(self, field_name, field_method=None): + if field_name in HASH_FUNCTIONS: + self.hashes.add(field_name) + else: + HASH_FUNCTIONS[field_name] = field_method + self.hashes.add(field_name) + + def add_all_hashes(self): + for function in HASH_FUNCTIONS.keys(): + self.hashes.add(function) def add_value_field(self, field_name, field_value): self.value_fields[field_name] = field_value diff --git a/mongordkit/Database/tests/test_write.py b/mongordkit/Database/tests/test_write.py index d09b240..da0c9b9 100644 --- a/mongordkit/Database/tests/test_write.py +++ b/mongordkit/Database/tests/test_write.py @@ -32,7 +32,7 @@ def test_hashes(self): data_scheme = registration.MolDocScheme() 
data_scheme.set_index("CanonicalSmiles") assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) - data_scheme.set_index("inchi_standard") + data_scheme.set_index("inchikey_standard") assert 200 == write.WriteFromSDF(db.molecules, 'data/test_data/first_200.props.sdf', data_scheme) def test_uniqueInsertion(self): diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 40f2f73..0000000 --- a/requirements.txt +++ /dev/null @@ -1,24 +0,0 @@ -attrs==19.3.0 -certifi==2019.11.28 -importlib-metadata==1.6.1 -mkl-fft==1.0.15 -mkl-random==1.1.0 -mkl-service==2.3.0 -mongomock==3.19.0 -more-itertools==8.3.0 -numpy==1.18.1 -olefile==0.46 -packaging==20.4 -pandas==1.0.3 -Pillow==7.0.0 -pluggy==0.13.1 -py==1.8.1 -pymongo==3.10.1 -pyparsing==2.4.7 -pytest==5.4.3 -python-dateutil==2.8.1 -pytz==2019.3 -sentinels==1.0.0 -six==1.14.0 -wcwidth==0.2.4 -zipp==3.1.0 diff --git a/setup.txt b/setup.txt deleted file mode 100644 index 0cbad19..0000000 --- a/setup.txt +++ /dev/null @@ -1,56 +0,0 @@ -Basic requirements for running mongo-rdkit: - -Packages currently installed in my Conda environment: -blas 1.0 mkl -bzip2 1.0.8 h1de35cc_0 -ca-certificates 2020.1.1 0 -cairo 1.14.12 hc4e6be7_4 -certifi 2019.11.28 py37_1 -fontconfig 2.13.0 h5d5b041_1 -freetype 2.9.1 hb4e5f40_0 -gettext 0.19.8.1 h15daf44_3 -glib 2.63.1 hd977a24_0 -icu 58.2 h4b95b61_1 -intel-openmp 2019.4 233 -jpeg 9b he5867d9_2 -libboost 1.67.0 hebc422b_4 -libcxx 4.0.1 hcfea43d_1 -libcxxabi 4.0.1 hcfea43d_1 -libedit 3.1.20181209 hb402a30_0 -libffi 3.2.1 h475c297_4 -libgfortran 3.0.1 h93005f0_2 -libiconv 1.15 hdd342a3_7 -libpng 1.6.37 ha441bb4_0 -libtiff 4.1.0 hcb84e12_0 -libxml2 2.9.9 hf6e021a_1 -mkl 2019.4 233 -mkl-service 2.3.0 py37hfbe908c_0 -mkl_fft 1.0.15 py37h5e564d8_0 -mkl_random 1.1.0 py37ha771720_0 -ncurses 6.2 h0a44026_0 -numpy 1.18.1 py37h7241aed_0 -numpy-base 1.18.1 py37h6575580_1 -olefile 0.46 py37_0 -openssl 1.1.1e h1de35cc_0 -pandas 1.0.3 py37h6c726b0_0 
-pcre 8.43 h0a44026_0 -pillow 7.0.0 py37h4655f20_0 -pip 20.0.2 py37_1 -pixman 0.38.0 h1de35cc_0 -py-boost 1.67.0 py37h6440ff4_4 -pymongo 3.10.1 pypi_0 pypi -python 3.7.7 hc70fcce_0_cpython -python-dateutil 2.8.1 py_0 -pytz 2019.3 py_0 -rdkit 2020.03.1b1.0 py37h65625ec_1 rdkit/label/beta -readline 8.0 h1de35cc_0 -setuptools 46.1.1 py37_0 -six 1.14.0 py37_0 -sqlite 3.31.1 ha441bb4_0 -tk 8.6.8 ha441bb4_0 -wheel 0.34.2 py37_0 -xz 5.2.4 h1de35cc_4 -zlib 1.2.11 h1de35cc_3 -zstd 1.3.7 h5bba6e5_0 - -Most of these packages install by default when installing Anaconda. Will document for better reproducibility in the future. From d269d68127a37c02edace8ec93e52d222b19df1c Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 25 Aug 2020 13:00:01 -0400 Subject: [PATCH 6/7] Add write benchmarks and edit default registration --- .../Write and Registration Benchmarking.ipynb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/notebooks/Write and Registration Benchmarking.ipynb b/docs/notebooks/Write and Registration Benchmarking.ipynb index 468c5d0..cc4b856 100644 --- a/docs/notebooks/Write and Registration Benchmarking.ipynb +++ b/docs/notebooks/Write and Registration Benchmarking.ipynb @@ -133,6 +133,22 @@ "populating mongodb collection with compounds from SDF...\n", "3000 molecules successfully imported\n", "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "3000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "3000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "3000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "3000 molecules successfully imported\n", + "0 duplicates skipped\n", + "testing 4000\n", + "populating mongodb collection with compounds from SDF...\n", + 
"4000 molecules successfully imported\n", + "0 duplicates skipped\n", "populating mongodb collection with compounds from SDF...\n" ] } From 690eabb1c41f59946889ad0202693c03b6cadcf3 Mon Sep 17 00:00:00 2001 From: Christopher Zou Date: Tue, 25 Aug 2020 13:07:46 -0400 Subject: [PATCH 7/7] delete ipynb checkpoints --- .../.ipynb_checkpoints/.ipynb-checkpoint | 92 --- ...ng and Writing to MongoDB-checkpoint.ipynb | 633 ----------------- ...y and Substructure Search-checkpoint.ipynb | 648 ------------------ .../Write and Registration Benchmarking.ipynb | 16 + 4 files changed, 16 insertions(+), 1373 deletions(-) delete mode 100644 docs/notebooks/.ipynb_checkpoints/.ipynb-checkpoint delete mode 100644 docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb delete mode 100644 docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb diff --git a/docs/notebooks/.ipynb_checkpoints/.ipynb-checkpoint b/docs/notebooks/.ipynb_checkpoints/.ipynb-checkpoint deleted file mode 100644 index 80af95b..0000000 --- a/docs/notebooks/.ipynb_checkpoints/.ipynb-checkpoint +++ /dev/null @@ -1,92 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Creating and Writing to MongoDB\n", - "\n", - "Methods that directly modify MongoDB database instances are included in the `mongordkit.Database` module.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mongordkit.Database import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Databases\n", - "Users can opt to bring their own database instances, but `Database.create` provides a variety of ways to create a `mongordkit`-compatible MongoDB instance:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "ename": "DuplicateKeyError", - "evalue": "E11000 duplicate key error 
collection: MyDatabase.registration index: _id_ dup key: { _id: ObjectId('5f0f64b2eaae47671ad2fb9d') }", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mDuplicateKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDatabase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreateFromHostPort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'MyDatabase'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhost\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDatabase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreateFromURI\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'MyDatabase'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/mongo-rdkit/mongordkit/Database/create.py\u001b[0m in \u001b[0;36mcreateFromHostPort\u001b[0;34m(dbname, host, port)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdbname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mcollection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'registration'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m 
\u001b[0mcollection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minsert_one\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSTANDARD_SETTING\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/collection.py\u001b[0m in \u001b[0;36minsert_one\u001b[0;34m(self, document, bypass_document_validation, session)\u001b[0m\n\u001b[1;32m 696\u001b[0m \u001b[0mwrite_concern\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwrite_concern\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 697\u001b[0m \u001b[0mbypass_doc_val\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbypass_document_validation\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 698\u001b[0;31m session=session),\n\u001b[0m\u001b[1;32m 699\u001b[0m write_concern.acknowledged)\n\u001b[1;32m 700\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/collection.py\u001b[0m in \u001b[0;36m_insert\u001b[0;34m(self, docs, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val, session)\u001b[0m\n\u001b[1;32m 610\u001b[0m return self._insert_one(\n\u001b[1;32m 611\u001b[0m \u001b[0mdocs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_keys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmanipulate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrite_concern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 612\u001b[0;31m bypass_doc_val, session)\n\u001b[0m\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 614\u001b[0m 
\u001b[0mids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/collection.py\u001b[0m in \u001b[0;36m_insert_one\u001b[0;34m(self, doc, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val, session)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m self.__database.client._retryable_write(\n\u001b[0;32m--> 600\u001b[0;31m acknowledged, _insert_command, session)\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRawBSONDocument\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/mongo_client.py\u001b[0m in \u001b[0;36m_retryable_write\u001b[0;34m(self, retryable, func, session)\u001b[0m\n\u001b[1;32m 1489\u001b[0m \u001b[0;34m\"\"\"Internal retryable write helper.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1490\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tmp_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1491\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_retry_with_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mretryable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1492\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1493\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_reset_server\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maddress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/mongo_client.py\u001b[0m in \u001b[0;36m_retry_with_session\u001b[0;34m(self, retryable, func, session, bulk)\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mlast_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1383\u001b[0m \u001b[0mretryable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msock_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretryable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1385\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mServerSelectionTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_retrying\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/collection.py\u001b[0m in \u001b[0;36m_insert_command\u001b[0;34m(session, sock_info, retryable_write)\u001b[0m\n\u001b[1;32m 595\u001b[0m retryable_write=retryable_write)\n\u001b[1;32m 596\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m 
\u001b[0m_check_write_command_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m self.__database.client._retryable_write(\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/helpers.py\u001b[0m in \u001b[0;36m_check_write_command_response\u001b[0;34m(result)\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mwrite_errors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"writeErrors\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwrite_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 221\u001b[0;31m \u001b[0m_raise_last_write_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwrite_errors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"writeConcernError\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/py37_rdkit_beta/lib/python3.7/site-packages/pymongo/helpers.py\u001b[0m in \u001b[0;36m_raise_last_write_error\u001b[0;34m(write_errors)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwrite_errors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0merror\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"code\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m11000\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 202\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mDuplicateKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"errmsg\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m11000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 203\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mWriteError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"errmsg\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"code\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDuplicateKeyError\u001b[0m: E11000 duplicate key error collection: MyDatabase.registration index: _id_ dup key: { _id: ObjectId('5f0f64b2eaae47671ad2fb9d') }" - ] - } - ], - "source": [ - "db = Database.create.createFromHostPort('MyDatabase', host=None, port=None)\n", - "db = Database.create.createFromURI('MyDatabase', url=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - 
"mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb deleted file mode 100644 index fdbf379..0000000 --- a/docs/notebooks/.ipynb_checkpoints/Creating and Writing to MongoDB-checkpoint.ipynb +++ /dev/null @@ -1,633 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Creating and Writing to MongoDB\n", - "\n", - "Last updated: 8/10/20\n", - "\n", - "Methods that directly modify MongoDB database instances are included in the `mongordkit.Database` module.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mongordkit.Database import create, write, utils, registration\n", - "from rdkit import Chem\n", - "import pymongo" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reset Cells\n", - "Run the contents of this cell to reset the local MongoDB database, `demo_db`, used in this notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "client = pymongo.MongoClient()\n", - "client.drop_database('demo_db')\n", - "demo_db = client.demo_db\n", - "\n", - "# Disable rdkit warnings\n", - "rdkit.RDLogger.DisableLog('rdApp.*')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Databases (DEPRECATED for now)\n", - "Users can opt to bring their own database instances, but `Database.create` provides methods that will create ready-made MongoDB instances, defaulting to your local MongoDB:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # Return a database using a host port, such as the local port:\n", - "# db = create.createFromHostPort('demo_db', host='localhost', port=27017)\n", - "\n", - "# # Return a database using a MongoDB URI, such as that provided by Atlas:\n", - "# TestDB = create.createFromURL('demo_db', url=None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These databases are created with a `registration` collection. The registration collection includes several documents that consist of common pre-made settings, with the default being `STANDARD_SETTING`. All settings are documented in `Database.utils`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print(utils.STANDARD_SETTING)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Registration\n", - "`Database.registration` constructs document representations of molecules according to configurable schemes and handles data registration settings.\n", - "\n", - "It does this in two parts. First, it defines the global variable `HASH_FUNCTIONS` as a dictionary that maps hash function names to methods. 
It also defines the global variables `DEFAULT_SCHEME_NAME`, `DEFAULT_AUTHOR`, `DEFAULT_PREPROCESS`, and `DEFAULT_INDEX`, which are used in scheme creation and are thus defined for easy configuration. \n", - "\n", - "Second, the file defines the `MolDocScheme` object, which stores scheme information in its instance variables and is passed into `.write` methods in order to specify molecule document format. By default, `MolDocScheme` includes scheme name, author, whether or not the molecule has been pre-processed, an index option, two hashes, fingerprints, and value fields. All of the information contained in a `MolDocScheme` object can be used directly to generate documents for molecules:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'rdmol': Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", - " 'index': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'smiles': 'Cc1ccccc1',\n", - " 'scheme': 'default',\n", - " 'hashes': {'MolFormula': 'C7H8',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'AtomBondCounts': '7,7',\n", - " 'cx_smiles': 'Cc1ccccc1',\n", - " 'NetCharge': '0',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'DegreeVector': '0,1,5,1',\n", - " 'ElementGraph': 
'CC1CCCCC1',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'AnonymousGraph': '**1*****1',\n", - " 'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'Regioisomer': '*C.c1ccccc1',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'MurckoScaffold': 'c1ccccc1',\n", - " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'ExtendedMurcko': '*c1ccccc1',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", - " 'fingerprints': {},\n", - " 'value_data': {}}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rdmol = Chem.MolFromSmiles('Cc1ccccc1')\n", - "scheme = registration.MolDocScheme()\n", - "scheme.generate_mol_doc(rdmol)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `MolDocScheme` class also defines a series of instance methods, such as `MolDocScheme.set_index` and `MolDocScheme.remove_field`, that can be used to modify document schemes:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "removed AnonymousGraph from scheme\n" - ] - }, - { - "data": { - "text/plain": [ - "{'rdmol': 
Binary(b'\\xef\\xbe\\xad\\xde\\x00\\x00\\x00\\x00\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x07\\x00\\x00\\x00\\x80\\x01\\x06\\x00`\\x00\\x00\\x00\\x01\\x03\\x06@(\\x00\\x00\\x00\\x03\\x04\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x06@h\\x00\\x00\\x00\\x03\\x03\\x01\\x0b\\x00\\x01\\x00\\x01\\x02h\\x0c\\x02\\x03h\\x0c\\x03\\x04h\\x0c\\x04\\x05h\\x0c\\x05\\x06h\\x0c\\x06\\x01h\\x0c\\x14\\x01\\x06\\x01\\x06\\x05\\x04\\x03\\x02\\x17\\x00\\x00\\x00\\x00\\x16', 0),\n", - " 'index': 'C7H8',\n", - " 'smiles': 'Cc1ccccc1',\n", - " 'scheme': 'default',\n", - " 'hashes': {'MolFormula': 'C7H8',\n", - " 'SmallWorldIndexBRL': 'B7R1L5',\n", - " 'AtomBondCounts': '7,7',\n", - " 'cx_smiles': 'Cc1ccccc1',\n", - " 'NetCharge': '0',\n", - " 'CanonicalSmiles': 'Cc1ccccc1',\n", - " 'inchikey_standard': 'YXFVVABEGXRONW-UHFFFAOYSA-N',\n", - " 'inchikey_KET_15T': 'YXFVVABEGXRONW-UHFFFAOYNA-N',\n", - " 'SmallWorldIndexBR': 'B7R1',\n", - " 'DegreeVector': '0,1,5,1',\n", - " 'ElementGraph': 'CC1CCCCC1',\n", - " 'HetAtomTautomer': 'C[C]1[CH][CH][CH][CH][CH]1_0_0',\n", - " 'inchi_standard': 'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'RedoxPair': 'C[C]1[CH][CH][CH][CH][CH]1',\n", - " 'Mesomer': 'C[C]1[CH][CH][CH][CH][CH]1_0',\n", - " 'Regioisomer': '*C.c1ccccc1',\n", - " 'inchi_KET_15T': 'InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3',\n", - " 'MurckoScaffold': 'c1ccccc1',\n", - " 'ArthorSubstructureOrder': '00070007010007000000002a000000',\n", - " 'noiso_smiles': 'Cc1ccccc1',\n", - " 'ExtendedMurcko': '*c1ccccc1',\n", - " 'HetAtomProtomer': 'C[C]1[CH][CH][CH][CH][CH]1_0'},\n", - " 'fingerprints': {},\n", - " 'value_data': {}}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scheme.remove_field('CanonicalSmiles')\n", - "scheme.add_hash_field('MolFormula')\n", - 
"scheme.set_index('MolFormula')\n", - "scheme.generate_mol_doc(rdmol)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because `MolDocScheme` objects contain no functions—only references to functions—they can be pickled. In fact, the methods in `write` can save `MolDocSchemes` so that custom schemes are retrievable for later use." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Writing to a Database\n", - "`Database.write` provides write functionality. Its core method is `WriteFromSDF`, which relies on rdkit's `ForwardSDMolSupplier` to write data from an SDF file into a specified database.\n", - "\n", - "For each molecule in the SDF, `WriteFromSDF` inserts a document whose fields are specified by the `MolDocScheme` object passed into the function (one with default settings is created if the `scheme` argument is left blank)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from SDF...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] 
WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:39:46] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: 
[15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged; Omitted undefined stereo\n", 
- "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "200 molecules successfully imported\n", - "0 duplicates skipped\n" - ] - }, - { - "data": { - "text/plain": [ - "200" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Write the contents of first_200_props.sdf, a test dataset, into the collection demo_db.molecules.\n", - "# The index will default to the molecule's inchikey.\n", - "# Return the number of molecules succesfully imported.\n", - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above call is the most basic version of `writeFromSDF`. 
For additional flexibility, `writeFromSDF` takes several optional arguments—users can specify a custom scheme object, a registration collection to write scheme objects to, how many molecules are inserted at a time (this can affect performance), and limit the number of molecules written in." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from SDF...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit 
WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:47] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:48] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - 
"RDKit WARNING: [15:39:48] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:39:50] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:39:50] WARNING: Accepted unusual 
valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:39:50] WARNING: Omitted undefined stereo\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100 molecules successfully imported\n", - "0 duplicates skipped\n" - ] - }, - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Write the first 100 molecules of first_200_props.sdf, a test dataset, into demo_db.molecules\n", - "# This write will use canonical SMILES as the identifying index and thus does not conflict with the above write. \n", - "# If we had used inchikey again, the write would have imported 0 molecules.\n", - "scheme = registration.MolDocScheme()\n", - "scheme.set_index('CanonicalSmiles')\n", - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf', \n", - " scheme, reg_collection=demo_db.schema, chunk_size=50, limit=100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the case that users aren't working with an SDF, `.write` also provides `WriteFromMolList`, which will take a Python list of rdmol objects in place of the SDF argument in `WriteFromSDF`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.create` Module Contents\n", - "\n", - "mongordkit.Database.create.**createFromHostPort**(database_name, host=None (*string*), port=None (*string*)) --> *a MongoDB database instance named database_name*\n", - "\n", - "mongordkit.Database.create.**createFromURL**(database_name, url=None (*string*)) --> *a MongoDB database instance named database_name*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.registration` Module Contents" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'AnonymousGraph': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.AnonymousGraph)>,\n", - " 'ElementGraph': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ElementGraph)>,\n", - " 'CanonicalSmiles': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.CanonicalSmiles)>,\n", - " 'MurckoScaffold': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MurckoScaffold)>,\n", - " 'ExtendedMurcko': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ExtendedMurcko)>,\n", - " 'MolFormula': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.MolFormula)>,\n", - " 'AtomBondCounts': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.AtomBondCounts)>,\n", - " 'DegreeVector': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.DegreeVector)>,\n", - " 'Mesomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Mesomer)>,\n", - " 'HetAtomTautomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomTautomer)>,\n", - " 'HetAtomProtomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.HetAtomProtomer)>,\n", - " 'RedoxPair': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.RedoxPair)>,\n", - " 'Regioisomer': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.Regioisomer)>,\n", - " 'NetCharge': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.NetCharge)>,\n", - " 'SmallWorldIndexBR': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBR)>,\n", - " 'SmallWorldIndexBRL': (rdmol, 
f=rdkit.Chem.rdMolHash.HashFunction.SmallWorldIndexBRL)>,\n", - " 'ArthorSubstructureOrder': (rdmol, f=rdkit.Chem.rdMolHash.HashFunction.ArthorSubstructureOrder)>,\n", - " 'inchi_standard': ,\n", - " 'inchikey_standard': ,\n", - " 'inchi_KET_15T': (rdmol)>,\n", - " 'inchikey_KET_15T': (rdmol)>,\n", - " 'noiso_smiles': (rdmol)>,\n", - " 'cx_smiles': }" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "registration.HASH_FUNCTIONS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Class** mongordkit.Database.registration.**MolDocScheme()**\n", - "\n", - "**Instance variables**:\n", - "```\n", - "self.scheme_name = DEFAULT_SCHEME_NAME\n", - "self.author = DEFAULT_AUTHOR\n", - "self.pre_processed = DEFAULT_PREPROCESS\n", - "self.index_option = DEFAULT_INDEX\n", - "self.hashes = set(HASH_FUNCTIONS.keys())\n", - "self.fingerprints = {}\n", - "self.value_fields = {}\n", - "```\n", - "**Instance methods**:\n", - "- set_index(self, new_index) --> *None*\n", - "- get_index_value(self, rdmol) --> *calculated index value*\n", - "- add_hash_field(self, field_name, field_method) --> *None*\n", - "- add_value_field(self, field_name, field_value) --> *None*\n", - "- add_all_hashes(self) --> *None*\n", - "- remove_field(self, field_name) --> *None*\n", - "- generate_mol_doc(self, rdmol) --> *Dict: document representing molecule according to scheme*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.write` Module Contents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "mongordkit.Database.write.**WriteFromSDF**(database, sdf, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, limit=None, warnings=False (*Make this true to turn on rdkit warnings*) --> *int: number of molecules imported*\n", - "\n", - "mongordkit.Database.write.**WriteFromMolList**(database, list, scheme=MolDocScheme(), reg_collection=None, chunk_size=100, 
limit=None) --> *int: number of molecules imported*" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb deleted file mode 100644 index 33285bc..0000000 --- a/docs/notebooks/.ipynb_checkpoints/Similarity and Substructure Search-checkpoint.ipynb +++ /dev/null @@ -1,648 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Similarity and Substructure Search\n", - "\n", - "Last updated: 8/11/20\n", - "\n", - "Methods for similarity and substructure search are included in the `mongordkit.Search` module." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mongordkit.Search import similarity, substructure, utils\n", - "from mongordkit import Search\n", - "from mongordkit.Database import create, write\n", - "from rdkit import Chem\n", - "import pymongo" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reset Cells\n", - "\n", - "Run these cells to reset the MongoDB database used in this notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "client = pymongo.MongoClient()\n", - "client.drop_database('demo_db')\n", - "demo_db = client.demo_db\n", - "\n", - "# Disable rdkit warnings\n", - "rdkit.RDLogger.DisableLog('rdApp.*')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparing for Search\n", - "Adequately preparing the database for searching requires adding a variety of fingerprints and hashes. You can easily perform all of the setup work required for similarity and substructure search by calling the method `Search.PrepareForSearch`. Generally, workflow will follow straight from the following two lines into search calls:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from SDF...\n", - "200 molecules successfully imported\n", - "0 duplicates skipped\n", - "Preparing database and collections for search...\n", - "Added pattern fps, morgan fps, and support for LSH.\n" - ] - } - ], - "source": [ - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')\n", - "Search.PrepareForSearch(demo_db, demo_db.molecules, demo_db.mfp_counts, demo_db.permutations)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, the rest of this notebook will explicitly note the addition of fingerprints and hashes in an effort to better communicate how the code actually works. Let's reset the database again so that we can insert the hashes step by step without any issues." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "client.drop_database('demo_db')\n", - "demo_db = client.demo_db" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Similarity Search\n", - "\n", - "`mongordkit.Search.similarity` supports similarity search best on a MongoDB collection prepared by `mongordkit.Database.write`. For the general level of similarity search, users can also use any collection that has documents with the following fields:\n", - "- `'rdmol': binary pickle object`\n", - "- `'index': a unique identifier for each molecule`\n", - "- `'fingerprints': {a nested document that can be blank at the start}'`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's run through an example of similarity search. First, we'll write into the database 200 molecules from a data file included in the `mongordkit` package. We will use default write settings." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "populating mongodb collection with compounds from SDF...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: 
[15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:23] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: 
Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:43:38] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:38] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] 
WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Accepted unusual valence(s): Cu(4); Metal was disconnected; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged; Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted 
undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: 
Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:39] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Charges were rearranged\n", - "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n", - "RDKit WARNING: [15:43:40] WARNING: Omitted undefined stereo\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "200 molecules successfully imported\n", - "0 duplicates skipped\n" - ] - }, - { - "data": { - "text/plain": [ - "200" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "write.WriteFromSDF(demo_db.molecules, '../../data/test_data/first_200.props.sdf')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`similarity.SimSearchNaive` will directly loop through the database and display results. This is good for purposes of verifying accuracy. 
However, this implementation is extremely slow for any decently-sized database. Instead, `similarity` supports precalculating the following kinds of fingerprints for screening: \n", - "- Morgan (default radius 2, length 2048)\n", - "\n", - "through `similarity.AddMorganFingerprints`. For each document in a passed in collection, this method adds the nested field `{morgan_fp: {bits: }, {count: }}` to the document's `fingerprint` field. `AddMorganFingerprints` also creates indices on `morgan_fp[bits]` and `morgan_fp[count]` to speed search. " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "similarity.AddMorganFingerprints(demo_db.molecules, demo_db.mfp_counts)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'bits': [84,\n", - " 314,\n", - " 356,\n", - " 547,\n", - " 650,\n", - " 747,\n", - " 967,\n", - " 1057,\n", - " 1080,\n", - " 1154,\n", - " 1337,\n", - " 1380,\n", - " 1722,\n", - " 1768,\n", - " 1873,\n", - " 1877],\n", - " 'count': 16}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "demo_db.molecules.find_one()['fingerprints']['morgan_fp']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From here, we can directly perform similarity search. `similarity` provides two methods that take advantage of fingerprint screening: `similaritySearch` and `similaritySearchAggregate`. The latter shifts much of the computation into the MongoDB server by using an aggregation pipeline and can dramatically improve performance when working with sharded MongoDB servers." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "similaritySearch: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n", - "\n", - "\n", - "similaritySearchAggregate: [[0.4117647058823529, 'WLHCBQAPPJAULW-UHFFFAOYSA-N']]\n" - ] - } - ], - "source": [ - "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", - "\n", - "# Perform a similarity search on TestDB for q_mol with a Tanimoto threshold of 0.4. \n", - "results1 = similarity.SimSearch(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", - "\n", - "# Do the same thing, but use the MongoDB Aggregation Pipeline. \n", - "results2 = similarity.SimSearchAggregate(q_mol, demo_db.molecules, demo_db.mfp_counts, 0.4)\n", - "\n", - "print('similaritySearch: {}'.format(results1))\n", - "print('\\n')\n", - "print('similaritySearchAggregate: {}'.format(results2))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the search returns only the index for the molecule, which in this case is the inchikey; users should find it easy to go from the index to the full molecule document by way of a quick search. This also makes it easier for users to retrieve molecules when indices represent multiple tautomers or isomers in the collection.\n", - "\n", - "`SimSearch` and `SimSearchAggregate` both make use of the conventional fingerprint screening method. `similarity` also supports searching using Locality Sensitive Hashing, as developed by ChemBL in an excellent [blog post](http://chembl.blogspot.com/2015/08/lsh-based-similarity-search-in-mongodb.html). 
The method here is called `SimSearchLSH` and requires a little bit more setup work:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Generate 100 different permutations of length 2048 and save them in demo_db.permutations as separate documents.\n", - "similarity.AddRandPermutations(demo_db.permutations)\n", - "\n", - "# Add locality-sensitive hash values to each documents in demo_db.molecules by splitting the 100 different permutations\n", - "# in demo_db.permutations into 25 different buckets. \n", - "similarity.AddLocalityHashes(demo_db.molecules, demo_db.permutations, 25)\n", - "\n", - "# Create 25 different collections in db_demo each store a subset of hash values for molecules in demo_db.molecules.\n", - "similarity.AddHashCollections(demo_db, demo_db.molecules)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's try a search using the query molecule from earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "similaritySearchLSH: []\n" - ] - } - ], - "source": [ - "q_mol = Chem.MolFromSmiles('Cc1ccccc1')\n", - "\n", - "results3 = similarity.SimSearchLSH(q_mol, demo_db, demo_db.molecules, \n", - " demo_db.permutations, demo_db.mfp_counts, threshold=0.8)\n", - "\n", - "print('similaritySearchLSH: {}'.format(results3))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The LSH algorithm relies on random permutations using the `numpy` module, so it yields non-deterministic results. This means that LSH is well-suited for *scanning* datasets (its performance on large datasets is faster than either similarity search method), but is less accurate than regular similarity search, especially below thresholds of 0.7. 
Specific notes on benchmarks can be found in \"Benchmarking Similarity Search.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Substructure Search\n", - "\n", - "`mongordkit.Search.substructure` supports substructure search best on collections prepared by `write`. Requirements are identical to those for similarity search: a `molecules` collection whose documents have `rdmol` and `index` fields. \n", - "\n", - "`substructure.SubSearchNaive` provides a fingerprint-less, slower implementation of substructure search suitable for very small databases:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", - " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", - " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", - " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", - " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", - " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", - " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", - " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "q_mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')\n", - "\n", - "# Perform a substructure search for q_mol on TestDB. \n", - "substructure.SubSearchNaive(q_mol, demo_db.molecules, chirality=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By adding pattern fingerprints, which are optimized for substructure search, we can use `substructure.SubSearch`, which takes advantage of fingerprint screening to avoid as many expensive calls to `HasSubstructMatch` as possible. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['RUTYZGCHBCCSKD-UHFFFAOYSA-N',\n", - " 'WECJUPODCKXNQK-UHFFFAOYSA-N',\n", - " 'GZZJZWYIOOPHOV-UHFFFAOYSA-N',\n", - " 'FXOSHPAYNZBSFO-RMKNXTFCSA-N',\n", - " 'KWLUBKHLCNCFQI-UHFFFAOYSA-N',\n", - " 'VDAJDWUTRXNYMU-RUDMXATFSA-N',\n", - " 'PACGLQCRGWFBJH-UHFFFAOYSA-N',\n", - " 'CDCRUVGWQJYTFO-UHFFFAOYSA-N']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "substructure.AddPatternFingerprints(demo_db.molecules)\n", - "substructure.SubSearch(q_mol, demo_db.molecules, chirality=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.Search` contents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "mongordkit.Search.**PrepareForSearch**(db (*MongoDB database for hash information*), mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), perm_collection (*MongoDB collection*)) --> None" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.similarity` Contents\n", - "\n", - "### Constants:\n", - "- DEFAULT_THRESHOLD = 0.8\n", - "- DEFAULT_MORGAN_RADIUS = 2\n", - "- DEFAULT_MORGAN_LENGTH = 2048\n", - "- DEFAULT_BIT_N = 2048\n", - "- DEFAULT_BUCKET_N = 25\n", - "- DEFAULT_PERM_LEN = 2048\n", - "- DEFAULT_PERM_N = 100\n", - "\n", - "mongordkit.Search.similarity.**AddMorganFingerprints**(mol_collection (*MongoDB collection*), count_collection (*MongoDB collection*), radius=2 (*int: radius of Morgan fingerprint*), length=2048 (*int: length of Morgan fingerprint bit vector*)) --> None\n", - "\n", - "mongordkit.Search.similarity.**SimSearchNaive**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", - "\n", - "mongordkit.Search.similarity.**SimSearch**(mol 
(*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", - "\n", - "mongordkit.Search.similarity.**SimSearchAggregate**(mol (*rdmol object*), mol_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*\n", - "\n", - "mongordkit.Search.similarity.**AddRandPermutations**(perm_collection (*MongoDB collection*), len=2048 (*int: length corresponding to length of fingerprint bit vectors*), num=100 (*int: number of permutations*)) --> None\n", - "\n", - "mongordkit.Search.similarity.**AddLocalityHashes**(mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), nBuckets=25 (*int: number of hash buckets. The number of permutations (mod NBuckets) must be 0*)) --> None\n", - "\n", - "mongordkit.Search.similarity.**AddHashCollections**(db (*MongoDB database*), mol_collection (*MongoDB collection*)) --> None\n", - "\n", - "mongordkit.Search.similarity.**SimSearchLSH**(mol (*rdmol object*), db (*MongoDB database containing hash collections*), mol_collection (*MongoDB collection*), perm_collection (*MongoDB collection*), count_collection (*MongoDB collection*), threshold=0.8 (*Tanimoto threshold between 0 and 1, float*)) --> *list: results with format [tanimoto, index]*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `.substructure` Contents\n", - "\n", - "mongordkit.Search.substructure.**AddPatternFingerprints**(mol_collection (MongoDB collection), length=2048 (*int: length of Pattern fingerprint bit vector*)) --> None\n", - "\n", - "mongordkit.Search.similarity.**SubSearchNaive**(pattern (*rdmol object*), db, chirality=False (*boolean: include chirality in search or not*)) --> *list: results with format [smiles]*\n", - "\n", - "mongordkit.Search.similarity.**SubSearch**(pattern (*rdmol object*), db, chirality=False 
(*boolean: include chirality in search or not*)) --> *list: results with format [smiles]*" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py37_rdkit_beta", - "language": "python", - "name": "py37_rdkit_beta" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/notebooks/Write and Registration Benchmarking.ipynb b/docs/notebooks/Write and Registration Benchmarking.ipynb index cc4b856..a849c11 100644 --- a/docs/notebooks/Write and Registration Benchmarking.ipynb +++ b/docs/notebooks/Write and Registration Benchmarking.ipynb @@ -149,6 +149,22 @@ "populating mongodb collection with compounds from SDF...\n", "4000 molecules successfully imported\n", "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "4000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "4000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "4000 molecules successfully imported\n", + "0 duplicates skipped\n", + "populating mongodb collection with compounds from SDF...\n", + "4000 molecules successfully imported\n", + "0 duplicates skipped\n", + "testing 5000\n", + "populating mongodb collection with compounds from SDF...\n", + "5000 molecules successfully imported\n", + "0 duplicates skipped\n", "populating mongodb collection with compounds from SDF...\n" ] }