Fix tests, refactor updates (more parallelism too). See CHANGELOG.md

lotusnprod · Dec 17, 2023 · d0a4ec4 · d0a4ec4
1 parent dd15c5d
commit d0a4ec4
Show file tree

Hide file tree

Showing 27 changed files with 537 additions and 569 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -39,8 +39,35 @@ jobs:
           poetry install
           chmod +x ./update.py
           poetry run ./update.py
+      - name: Run tests
+        run: |
+          poetry run pytest
       - name: Clean up unnecessary files
         if: always()
         run: |
           rm -rf data/*.csv data/*.db
-        
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Set up python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Load cached Poetry installation
+        id: cached-poetry
+        uses: actions/cache@v3
+        with:
+          path: ~/.local
+          key: poetry-0
+      - name: Install Poetry
+        if: steps.cached-poetry.outputs.cache-hit != 'true'
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+      - name: Run tests
+        run: |
+          poetry install
+          poetry run pytest
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,12 @@
+2023-12-16
+    - Fix tests
+    - Refactor the download of CSVs
+    - Parallelize the download of CSVs
+    - Add parameters to the update script
+        You can run it with `--only xxx`, `--stop xxx` or `--skip xxx` to:
+        - only execute xxx
+        - stop before xxx
+        - skip xxx
+        where xxx can be a task or a group name.
+        Task and group descriptions can be listed with `--list`.
+    - Fix a potential abuse in `generate_database_taxo.py` when a taxon contains a comma
diff --git a/EXAMPLE_QUERIES.md → doc/EXAMPLE_QUERIES.md b/EXAMPLE_QUERIES.md → doc/EXAMPLE_QUERIES.md
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "lotus_search"
 version = "0.1.2"
 description = ""
-authors = ["Jonathan Bisson <[email protected]>"]
+authors = ["Jonathan Bisson <[email protected]>", "Adriano Rutz <[email protected]>"]
 packages = [{include = "*.py", from = ""}]
 readme = "README.md"
 
@@ -17,14 +17,16 @@ gunicorn = "^21.2.0"
 orjson = "^3.9.10"
 pandas = "^2.1.3"
 pydantic = "^2.5.1"
-pytest = "^6.0.0"
-pytest-mock = "*"
 requests = "^2.31.0"
-requests_mock = "*"
 rdkit = "^2023.9.2"
 sqlalchemy = "^2.0.23"
 uvicorn = "^0.23.2"
 
+[tool.poetry.dev-dependencies]
+pytest = "^6.0.0"
+pytest-mock = "*"
+requests_mock = "*"
+
 [tool.poetry.scripts]
 start = "app:main"
 

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -1,28 +1,27 @@
-import pytest
 import requests_mock
 
-from update.common import remove_wd_entity_prefix, wd_sparql_to_csv
+from update.common import remove_wd_entity_prefix, sparql_to_csv
 
 
-def test_wd_sparql_to_csv_returns_expected_csv():
-    with requests_mock.Mocker() as m:
-        m.get("https://query.wikidata.org/sparql", text="expected_csv")
-        result = wd_sparql_to_csv("query")
-        assert result == "expected_csv"
+class TestWdSparqlToCsv:
+    def test_returns_expected_csv(self):
+        with requests_mock.Mocker() as m:
+            m.get("https://query.wikidata.org/sparql", text="expected_csv")
+            result = sparql_to_csv("query")
+            assert result == "expected_csv"
 
+    def test_uses_provided_url(self):
+        with requests_mock.Mocker() as m:
+            m.get("https://other.url/sparql", text="expected_csv")
+            result = sparql_to_csv("query", "https://other.url/sparql")
+            assert result == "expected_csv"
 
-def test_wd_sparql_to_csv_uses_provided_url():
-    with requests_mock.Mocker() as m:
-        m.get("https://other.url/sparql", text="expected_csv")
-        result = wd_sparql_to_csv("query", "https://other.url/sparql")
-        assert result == "expected_csv"
 
+class TestRemoveWdEntityPrefix:
+    def test_removes_prefix(self):
+        result = remove_wd_entity_prefix("http://www.wikidata.org/entity/Q123")
+        assert result == "123"
 
-def test_remove_wd_entity_prefix_removes_prefix():
-    result = remove_wd_entity_prefix("http://www.wikidata.org/entity/Q123")
-    assert result == "123"
-
-
-def test_remove_wd_entity_prefix_does_not_remove_other_text():
-    result = remove_wd_entity_prefix("http://www.wikidata.org/entity/Q123/other")
-    assert result == "123/other"
+    def test_does_not_remove_other_text(self):
+        result = remove_wd_entity_prefix("http://www.wikidata.org/entity/Q123/other")
+        assert result == "123/other"
diff --git a/tests/test_download_couples_referenced.py b/tests/test_download_couples_referenced.py
diff --git a/tests/test_download_doi.py b/tests/test_download_doi.py
diff --git a/tests/test_download_query_as_csv.py b/tests/test_download_query_as_csv.py
@@ -0,0 +1,28 @@
+from unittest.mock import patch
+
+import pytest
+
+from update.download_query_as_csv import run
+
+
+class TestRunQueryToCSV:
+    @pytest.fixture(autouse=True)
+    def setup(self, tmp_path):
+        self.query_file = tmp_path / "query.sparql"
+        self.query_file.write_text("SELECT ?item WHERE {?item wdt:P31 wd:Q5.} LIMIT 1")
+        self.output_file = tmp_path / "output.csv"
+
+    def test_retries_on_timeout(self):
+        with patch('update.download_query_as_csv.sparql_to_csv') as mock_sparql_to_csv:
+            mock_sparql_to_csv.side_effect = ['java.util.concurrent.TimeoutException', 'valid result']
+            run(self.query_file, self.output_file)
+            assert mock_sparql_to_csv.call_count == 2
+            assert self.output_file.read_text() == 'valid result'
+
+    def test_writes_expected_result(self):
+        with patch('update.download_query_as_csv.sparql_to_csv') as mock_sparql_to_csv, \
+             patch('update.download_query_as_csv.remove_wd_entity_prefix') as mock_remove_wd_entity_prefix:
+            mock_sparql_to_csv.return_value = 'valid result'
+            mock_remove_wd_entity_prefix.return_value = 'expected result'
+            run(self.query_file, self.output_file)
+            assert self.output_file.read_text() == 'expected result'
diff --git a/tests/test_download_smiles.py b/tests/test_download_smiles.py
diff --git a/tests/test_download_taxonomy_parenting.py b/tests/test_download_taxonomy_parenting.py
diff --git a/tests/test_generate_database.py b/tests/test_generate_database.py
@@ -3,15 +3,15 @@
 
 import pytest
 
-from update.generate_database import run
+from update import generate_database
 
 
 @patch("update.generate_database.pickle.dump")
 @patch("update.generate_database.pickle.load")
 @patch("update.generate_database.open", new_callable=mock_open)
 def test_run_loads_and_dumps_database(mock_open, mock_pickle_load, mock_pickle_dump):
     mock_pickle_load.return_value = {"key": "value"}
-    run(Path("."))
+    generate_database.run()
     assert mock_pickle_load.call_count == 2
     assert mock_pickle_dump.call_count == 1
 
@@ -26,7 +26,7 @@ def test_run_updates_database_with_loaded_data(
         {"chemo_key": "chemo_value"},
         {"taxo_key": "taxo_value"},
     ]
-    run(Path("."))
+    generate_database.run()
     mock_pickle_dump.assert_called_once_with(
         {"chemo_key": "chemo_value", "taxo_key": "taxo_value"},
         mock_open.return_value.__enter__.return_value,

diff --git a/tests/test_generate_database_chemo.py b/tests/test_generate_database_chemo.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from update.generate_database_chemo import process_smiles, run
+from update import generate_database_chemo
 
 
 @patch("update.generate_database_chemo.ProcessPoolExecutor")
@@ -21,6 +21,7 @@ def test_run_generates_database(
             (
                 0,
                 "smiles",
+                "smol",
                 "smiles_clean",
                 "sim_fp",
                 "sub_fp",
@@ -30,7 +31,7 @@ def test_run_generates_database(
             )
         ]
     )
-    run(Path("."))
+    generate_database_chemo.run()
     assert mock_pickle_dump.call_count == 1
 
 
@@ -54,10 +55,11 @@ def test_process_smiles_returns_expected_result_on_success(
     mock_fingerprint.return_value = "sim_fp"
     mock_pattern_fp.return_value = "sub_fp"
     mock_mol.return_value.ToBinary.return_value = "mol_h"
-    result = process_smiles((0, "smiles"))
+    result = generate_database_chemo.process_smiles((0, "smiles"))
     assert result == (
         0,
         "smiles",
+        "smol",
         "smiles_clean",
         "sim_fp",
         "sub_fp",
@@ -68,5 +70,5 @@ def test_process_smiles_returns_expected_result_on_success(
 
 
 def test_process_smiles_returns_none_on_failure():
-    result = process_smiles((0, "invalid_smiles"))
+    result = generate_database_chemo.process_smiles((0, "invalid_smiles"))
     assert result is None