Biotools-API (#208)

* typo * first draft of bio.tools API classes * typo * replace testing conditional with ability to evaluate regex for spelling differences, capitalisation, ... * add biotools example, change to regex eval * change test name * switch to metabolomics as proteomics is in the API docs examples
biocypher · Nov 12, 2024 · e8fdc13 · e8fdc13
1 parent 000d046
commit e8fdc13
Show file tree

Hide file tree

Showing 11 changed files with 746 additions and 90 deletions.
diff --git a/benchmark/conftest.py b/benchmark/conftest.py
@@ -17,20 +17,20 @@
 from .benchmark_utils import benchmark_already_executed
 
 # how often should each benchmark be run?
-N_ITERATIONS = 3
+N_ITERATIONS = 1
 
 # which dataset should be used for benchmarking?
 BENCHMARK_DATASET = get_benchmark_dataset()
 
 # which models should be benchmarked?
 OPENAI_MODEL_NAMES = [
-    "gpt-3.5-turbo-0125",
-    "gpt-4-0613",
-    "gpt-4-0125-preview",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4o-2024-05-13",
+    # "gpt-3.5-turbo-0125",
+    # "gpt-4-0613",
+    # "gpt-4-0125-preview",
+    # "gpt-4-turbo-2024-04-09",
+    # "gpt-4o-2024-05-13",
     "gpt-4o-2024-08-06",
-    "gpt-4o-mini-2024-07-18",
+    # "gpt-4o-mini-2024-07-18",
 ]
 
 ANTHROPIC_MODEL_NAMES = [
@@ -128,28 +128,28 @@
     #         # "FP16",
     #     ],
     # },
-    "llama-2-chat": {
-        "model_size_in_billions": [
-            7,
-            # 13,
-            # 70,
-        ],
-        "model_format": "ggufv2",
-        "quantization": [
-            "Q2_K",
-            # "Q3_K_S",
-            "Q3_K_M",
-            # "Q3_K_L",
-            # "Q4_0",
-            # "Q4_K_S",
-            "Q4_K_M",
-            # "Q5_0",
-            # "Q5_K_S",
-            "Q5_K_M",
-            "Q6_K",
-            "Q8_0",
-        ],
-    },
+    # "llama-2-chat": {
+    #     "model_size_in_billions": [
+    #         7,
+    #         # 13,
+    #         # 70,
+    #     ],
+    #     "model_format": "ggufv2",
+    #     "quantization": [
+    #         "Q2_K",
+    #         # "Q3_K_S",
+    #         "Q3_K_M",
+    #         # "Q3_K_L",
+    #         # "Q4_0",
+    #         # "Q4_K_S",
+    #         "Q4_K_M",
+    #         # "Q5_0",
+    #         # "Q5_K_S",
+    #         "Q5_K_M",
+    #         "Q6_K",
+    #         "Q8_0",
+    #     ],
+    # },
     # "llama-3-instruct": {
     #     "model_size_in_billions": [
     #         8,
@@ -169,31 +169,31 @@
     #         # "Q4_K_M",
     #     ],
     # },
-    "llama-3.1-instruct": {
-        "model_size_in_billions": [
-            8,
-            # 70,
-        ],
-        "model_format": "ggufv2",
-        "quantization": [
-            # 8B model quantisations
-            "Q3_K_L",
-            "IQ4_XS",
-            "Q4_K_M",
-            # "Q5_K_M",
-            # "Q6_K",
-            "Q8_0",
-            # 70B model quantisations
-            # "IQ2_M",
-            # "Q2_K",
-            # "Q3_K_S",
-            # "IQ4_XS",
-            # "Q4_K_M",  # crazy slow on mbp m3 max
-            # "Q5_K_M",
-            # "Q6_K",
-            # "Q8_0",
-        ],
-    },
+    # "llama-3.1-instruct": {
+    #     "model_size_in_billions": [
+    #         8,
+    #         # 70,
+    #     ],
+    #     "model_format": "ggufv2",
+    #     "quantization": [
+    #         # 8B model quantisations
+    #         "Q3_K_L",
+    #         "IQ4_XS",
+    #         "Q4_K_M",
+    #         # "Q5_K_M",
+    #         # "Q6_K",
+    #         "Q8_0",
+    #         # 70B model quantisations
+    #         # "IQ2_M",
+    #         # "Q2_K",
+    #         # "Q3_K_S",
+    #         # "IQ4_XS",
+    #         # "Q4_K_M",  # crazy slow on mbp m3 max
+    #         # "Q5_K_M",
+    #         # "Q6_K",
+    #         # "Q8_0",
+    #     ],
+    # },
     # "mistral-instruct-v0.2": {
     #     "model_size_in_billions": [
     #         7,
@@ -239,26 +239,26 @@
     #         "none",
     #     ],
     # },
-    "openhermes-2.5": {
-        "model_size_in_billions": [
-            7,
-        ],
-        "model_format": "ggufv2",
-        "quantization": [
-            "Q2_K",
-            # "Q3_K_S",
-            "Q3_K_M",
-            # "Q3_K_L",
-            # "Q4_0",
-            # "Q4_K_S",
-            "Q4_K_M",
-            # "Q5_0",
-            # "Q5_K_S",
-            "Q5_K_M",
-            "Q6_K",
-            "Q8_0",
-        ],
-    },
+    # "openhermes-2.5": {
+    #     "model_size_in_billions": [
+    #         7,
+    #     ],
+    #     "model_format": "ggufv2",
+    #     "quantization": [
+    #         "Q2_K",
+    #         # "Q3_K_S",
+    #         "Q3_K_M",
+    #         # "Q3_K_L",
+    #         # "Q4_0",
+    #         # "Q4_K_S",
+    #         "Q4_K_M",
+    #         # "Q5_0",
+    #         # "Q5_K_S",
+    #         "Q5_K_M",
+    #         "Q6_K",
+    #         "Q8_0",
+    #     ],
+    # },
 }
 
 # create concrete benchmark list by concatenating all combinations of model

diff --git a/benchmark/data/benchmark_api_calling_data.yaml b/benchmark/data/benchmark_api_calling_data.yaml
@@ -3,11 +3,16 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test
 # cases, using the top-level key to create a concatenated test case purpose.
+#
+# The expected parts are evaluated as regular expressions to account for
+# variations in the output (e.g. whitespace, capitalization). Escape regex
+# special characters such as '?' and '.' with two backslashes: in a
+# double-quoted YAML string, '\\' yields the single backslash the regex needs.
 
 api_calling:
   - case: oncokb:braf:melanoma
@@ -17,7 +22,7 @@ api_calling:
     expected:
       parts_of_query:
         [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
           "hugoSymbol=BRAF",
           "alteration=V600E",
           "tumorType=Melanoma",
@@ -29,7 +34,8 @@ api_calling:
     expected:
       parts_of_query:
         [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?hugoSymbol=TP53",
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
+          "hugoSymbol=TP53",
           "alteration=R273C",
           "tumorType=Colon%20Adenocarcinoma",
         ]
@@ -41,7 +47,7 @@ api_calling:
     expected:
       parts_of_query:
         [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
           "hugoSymbol=BRAF",
           "alteration=N486_P490del",
           "tumorType=Histiocytosis",
@@ -53,10 +59,17 @@ api_calling:
     expected:
       parts_of_query:
         [
-          "https://demo.oncokb.org/api/v1/annotate/structuralVariants?",
+          "https://demo.oncokb.org/api/v1/annotate/structuralVariants\\?",
           "hugoSymbolA=CD74",
           "hugoSymbolB=ROS1",
           "structuralVariantType=FUSION",
           "isFunctionalFusion=true",
           "tumorType=Lung%20Adenocarcinoma",
         ]
+  - case: biotools:topic:metabolomics
+    input:
+      prompt:
+        fuzzy_search: "Which tools can I use for metabolomics?"
+    expected:
+      parts_of_query:
+        ["https://bio.tools/api/t/", "\\?topic=", "[mM]etabolomics"]
diff --git a/benchmark/data/benchmark_kg_schema_data.yaml b/benchmark/data/benchmark_kg_schema_data.yaml
@@ -3,7 +3,7 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test

diff --git a/benchmark/data/benchmark_med_qa_data.yaml b/benchmark/data/benchmark_med_qa_data.yaml
@@ -3,7 +3,7 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test

diff --git a/benchmark/data/benchmark_query_test_data.yaml b/benchmark/data/benchmark_query_test_data.yaml
@@ -3,7 +3,7 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test

diff --git a/benchmark/data/benchmark_rag_test_data.yaml b/benchmark/data/benchmark_rag_test_data.yaml
@@ -3,7 +3,7 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test

diff --git a/benchmark/data/benchmark_text_extract_data.yaml b/benchmark/data/benchmark_text_extract_data.yaml
@@ -3,7 +3,7 @@
 #
 # Test case keys:
 # - input (for creating the test)
-# - expected (for asserting ourcomes and generating a score)
+# - expected (for asserting outcomes and generating a score)
 # - case (for categorizing the test case)
 #
 # If any input is a dictionary itself, it will be expanded into separate test

diff --git a/benchmark/test_api_calling.py b/benchmark/test_api_calling.py
@@ -1,10 +1,11 @@
 from urllib.parse import urlencode
 import inspect
+import re
 
 import pytest
 
 from biochatter._misc import ensure_iterable
-from biochatter.api_agent.oncokb import OncoKBQueryBuilder
+from biochatter.api_agent import OncoKBQueryBuilder, BioToolsQueryBuilder
 from .conftest import calculate_bool_vector_score
 from .benchmark_utils import (
     skip_if_already_run,
@@ -31,24 +32,27 @@ def test_api_calling(
 
     def run_test():
         conversation.reset()  # needs to be reset for each test
-        builder = OncoKBQueryBuilder()
+        if "oncokb" in yaml_data["case"]:
+            builder = OncoKBQueryBuilder()
+        elif "biotools" in yaml_data["case"]:
+            builder = BioToolsQueryBuilder()
         parameters = builder.parameterise_query(
             question=yaml_data["input"]["prompt"],
             conversation=conversation,
         )
 
-        params = parameters.dict(exclude_unset=True)
+        params = parameters.dict(exclude_none=True)
         endpoint = params.pop("endpoint")
         base_url = params.pop("base_url")
         params.pop("question_uuid")
-        full_url = f"{base_url}/{endpoint}"
+        full_url = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}"
         api_query = f"{full_url}?{urlencode(params)}"
 
         score = []
         for expected_part in ensure_iterable(
             yaml_data["expected"]["parts_of_query"]
         ):
-            if expected_part in api_query:
+            if re.search(expected_part, api_query):
                 score.append(True)
             else:
                 score.append(False)

diff --git a/biochatter/api_agent/__init__.py b/biochatter/api_agent/__init__.py
@@ -6,4 +6,26 @@
     BlastQueryParameters,
 )
 from .oncokb import OncoKBFetcher, OncoKBInterpreter, OncoKBQueryBuilder
+from .bio_tools import (
+    BioToolsFetcher,
+    BioToolsInterpreter,
+    BioToolsQueryBuilder,
+)
 from .api_agent import APIAgent
+
+__all__ = [
+    "BaseFetcher",
+    "BaseInterpreter",
+    "BaseQueryBuilder",
+    "BlastFetcher",
+    "BlastInterpreter",
+    "BlastQueryBuilder",
+    "BlastQueryParameters",
+    "OncoKBFetcher",
+    "OncoKBInterpreter",
+    "OncoKBQueryBuilder",
+    "BioToolsFetcher",
+    "BioToolsInterpreter",
+    "BioToolsQueryBuilder",
+    "APIAgent",
+]