From e8fdc1384d934d9c8fb5b69dca96ce2a3dec790e Mon Sep 17 00:00:00 2001 From: Sebastian Lobentanzer Date: Tue, 12 Nov 2024 16:10:45 +0100 Subject: [PATCH] Biotools-API (#208) * typo * first draft of bio.tools API classes * typo * replace testing conditional with ability to evaluate regex for spelling differences, capitalisation, ... * add biotools example, change to regex eval * change test name * switch to metabolomics as proteomics is in the API docs examples --- benchmark/conftest.py | 148 ++--- .../data/benchmark_api_calling_data.yaml | 23 +- benchmark/data/benchmark_kg_schema_data.yaml | 2 +- benchmark/data/benchmark_med_qa_data.yaml | 2 +- benchmark/data/benchmark_query_test_data.yaml | 2 +- benchmark/data/benchmark_rag_test_data.yaml | 2 +- .../data/benchmark_text_extract_data.yaml | 2 +- benchmark/test_api_calling.py | 14 +- biochatter/api_agent/__init__.py | 22 + biochatter/api_agent/bio_tools.py | 617 ++++++++++++++++++ biochatter/api_agent/oncokb.py | 2 +- 11 files changed, 746 insertions(+), 90 deletions(-) create mode 100644 biochatter/api_agent/bio_tools.py diff --git a/benchmark/conftest.py b/benchmark/conftest.py index 5fa71139..a2af2127 100644 --- a/benchmark/conftest.py +++ b/benchmark/conftest.py @@ -17,20 +17,20 @@ from .benchmark_utils import benchmark_already_executed # how often should each benchmark be run? -N_ITERATIONS = 3 +N_ITERATIONS = 1 # which dataset should be used for benchmarking? BENCHMARK_DATASET = get_benchmark_dataset() # which models should be benchmarked? OPENAI_MODEL_NAMES = [ - "gpt-3.5-turbo-0125", - "gpt-4-0613", - "gpt-4-0125-preview", - "gpt-4-turbo-2024-04-09", - "gpt-4o-2024-05-13", + # "gpt-3.5-turbo-0125", + # "gpt-4-0613", + # "gpt-4-0125-preview", + # "gpt-4-turbo-2024-04-09", + # "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", - "gpt-4o-mini-2024-07-18", + # "gpt-4o-mini-2024-07-18", ] ANTHROPIC_MODEL_NAMES = [ @@ -128,28 +128,28 @@ # # "FP16", # ], # }, - "llama-2-chat": { - "model_size_in_billions": [ - 7, - # 13, - # 70, - ], - "model_format": "ggufv2", - "quantization": [ - "Q2_K", - # "Q3_K_S", - "Q3_K_M", - # "Q3_K_L", - # "Q4_0", - # "Q4_K_S", - "Q4_K_M", - # "Q5_0", - # "Q5_K_S", - "Q5_K_M", - "Q6_K", - "Q8_0", - ], - }, + # "llama-2-chat": { + # "model_size_in_billions": [ + # 7, + # # 13, + # # 70, + # ], + # "model_format": "ggufv2", + # "quantization": [ + # "Q2_K", + # # "Q3_K_S", + # "Q3_K_M", + # # "Q3_K_L", + # # "Q4_0", + # # "Q4_K_S", + # "Q4_K_M", + # # "Q5_0", + # # "Q5_K_S", + # "Q5_K_M", + # "Q6_K", + # "Q8_0", + # ], + # }, # "llama-3-instruct": { # "model_size_in_billions": [ # 8, @@ -169,31 +169,31 @@ # # "Q4_K_M", # ], # }, - "llama-3.1-instruct": { - "model_size_in_billions": [ - 8, - # 70, - ], - "model_format": "ggufv2", - "quantization": [ - # 8B model quantisations - "Q3_K_L", - "IQ4_XS", - "Q4_K_M", - # "Q5_K_M", - # "Q6_K", - "Q8_0", - # 70B model quantisations - # "IQ2_M", - # "Q2_K", - # "Q3_K_S", - # "IQ4_XS", - # "Q4_K_M", # crazy slow on mbp m3 max - # "Q5_K_M", - # "Q6_K", - # "Q8_0", - ], - }, + # "llama-3.1-instruct": { + # "model_size_in_billions": [ + # 8, + # # 70, + # ], + # "model_format": "ggufv2", + # "quantization": [ + # # 8B model quantisations + # "Q3_K_L", + # "IQ4_XS", + # "Q4_K_M", + # # "Q5_K_M", + # # "Q6_K", + # "Q8_0", + # # 70B model quantisations + # # "IQ2_M", + # # "Q2_K", + # # "Q3_K_S", + # # "IQ4_XS", + # # "Q4_K_M", # crazy slow on mbp m3 max + # # "Q5_K_M", + # # "Q6_K", + # # "Q8_0", + # ], + # }, # "mistral-instruct-v0.2": { # "model_size_in_billions": [ # 7, @@ -239,26 
+239,26 @@ # "none", # ], # }, - "openhermes-2.5": { - "model_size_in_billions": [ - 7, - ], - "model_format": "ggufv2", - "quantization": [ - "Q2_K", - # "Q3_K_S", - "Q3_K_M", - # "Q3_K_L", - # "Q4_0", - # "Q4_K_S", - "Q4_K_M", - # "Q5_0", - # "Q5_K_S", - "Q5_K_M", - "Q6_K", - "Q8_0", - ], - }, + # "openhermes-2.5": { + # "model_size_in_billions": [ + # 7, + # ], + # "model_format": "ggufv2", + # "quantization": [ + # "Q2_K", + # # "Q3_K_S", + # "Q3_K_M", + # # "Q3_K_L", + # # "Q4_0", + # # "Q4_K_S", + # "Q4_K_M", + # # "Q5_0", + # # "Q5_K_S", + # "Q5_K_M", + # "Q6_K", + # "Q8_0", + # ], + # }, } # create concrete benchmark list by concatenating all combinations of model diff --git a/benchmark/data/benchmark_api_calling_data.yaml b/benchmark/data/benchmark_api_calling_data.yaml index e69181bb..85340a81 100644 --- a/benchmark/data/benchmark_api_calling_data.yaml +++ b/benchmark/data/benchmark_api_calling_data.yaml @@ -3,11 +3,16 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test # cases, using the top-level key to create a concatenated test case purpose. +# +# We are using regular expressions to evaluate the expected parts, to be able to +# account for variations in the output (e.g. whitespace, capitalization). Make +# sure to escape special characters in the regular expressions, such as '?', +# '.', etc., by adding two backslashes before them. api_calling: - case: oncokb:braf:melanoma @@ -17,7 +22,7 @@ api_calling: expected: parts_of_query: [ - "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?", + "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?", "hugoSymbol=BRAF", "alteration=V600E", "tumorType=Melanoma", @@ -29,7 +34,8 @@ api_calling: expected: parts_of_query: [ - "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?hugoSymbol=TP53", + "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?", + "hugoSymbol=TP53", "alteration=R273C", "tumorType=Colon%20Adenocarcinoma", ] @@ -41,7 +47,7 @@ api_calling: expected: parts_of_query: [ - "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?", + "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?", "hugoSymbol=BRAF", "alteration=N486_P490del", "tumorType=Histiocytosis", @@ -53,10 +59,17 @@ api_calling: expected: parts_of_query: [ - "https://demo.oncokb.org/api/v1/annotate/structuralVariants?", + "https://demo.oncokb.org/api/v1/annotate/structuralVariants\\?", "hugoSymbolA=CD74", "hugoSymbolB=ROS1", "structuralVariantType=FUSION", "isFunctionalFusion=true", "tumorType=Lung%20Adenocarcinoma", ] + - case: biotools:topic:metabolomics + input: + prompt: + fuzzy_search: "Which tools can I use for metabolomics?" 
+ expected: + parts_of_query: + ["https://bio.tools/api/t/", "\\?topic=", "[mM]etabolomics"] diff --git a/benchmark/data/benchmark_kg_schema_data.yaml b/benchmark/data/benchmark_kg_schema_data.yaml index c19c98d3..930b8656 100644 --- a/benchmark/data/benchmark_kg_schema_data.yaml +++ b/benchmark/data/benchmark_kg_schema_data.yaml @@ -3,7 +3,7 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test diff --git a/benchmark/data/benchmark_med_qa_data.yaml b/benchmark/data/benchmark_med_qa_data.yaml index 5b924050..ccd58655 100644 --- a/benchmark/data/benchmark_med_qa_data.yaml +++ b/benchmark/data/benchmark_med_qa_data.yaml @@ -3,7 +3,7 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test diff --git a/benchmark/data/benchmark_query_test_data.yaml b/benchmark/data/benchmark_query_test_data.yaml index e73ccaec..bdaa6889 100644 --- a/benchmark/data/benchmark_query_test_data.yaml +++ b/benchmark/data/benchmark_query_test_data.yaml @@ -3,7 +3,7 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test diff --git a/benchmark/data/benchmark_rag_test_data.yaml b/benchmark/data/benchmark_rag_test_data.yaml index 2fd4a105..fca5baec 100644 --- a/benchmark/data/benchmark_rag_test_data.yaml +++ b/benchmark/data/benchmark_rag_test_data.yaml @@ -3,7 +3,7 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test diff --git a/benchmark/data/benchmark_text_extract_data.yaml b/benchmark/data/benchmark_text_extract_data.yaml index 8c85e5a6..6e4f9e14 100644 --- a/benchmark/data/benchmark_text_extract_data.yaml +++ b/benchmark/data/benchmark_text_extract_data.yaml @@ -3,7 +3,7 @@ # # Test case keys: # - input (for creating the test) -# - expected (for asserting ourcomes and generating a score) +# - expected (for asserting outcomes and generating a score) # - case (for categorizing the test case) # # If any input is a dictionary itself, it will be expanded into separate test diff --git a/benchmark/test_api_calling.py b/benchmark/test_api_calling.py index 6a4f52bc..c198ebd1 100644 --- a/benchmark/test_api_calling.py +++ b/benchmark/test_api_calling.py @@ -1,10 +1,11 @@ from urllib.parse import urlencode import inspect +import re import pytest from biochatter._misc import ensure_iterable -from biochatter.api_agent.oncokb import OncoKBQueryBuilder +from biochatter.api_agent import OncoKBQueryBuilder, BioToolsQueryBuilder from .conftest import calculate_bool_vector_score from .benchmark_utils import ( skip_if_already_run, @@ -31,24 +32,27 @@ def test_api_calling( def run_test(): conversation.reset() # needs to be reset for each test - builder = 
OncoKBQueryBuilder() + if "oncokb" in yaml_data["case"]: + builder = OncoKBQueryBuilder() + elif "biotools" in yaml_data["case"]: + builder = BioToolsQueryBuilder() parameters = builder.parameterise_query( question=yaml_data["input"]["prompt"], conversation=conversation, ) - params = parameters.dict(exclude_unset=True) + params = parameters.dict(exclude_none=True) endpoint = params.pop("endpoint") base_url = params.pop("base_url") params.pop("question_uuid") - full_url = f"{base_url}/{endpoint}" + full_url = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}" api_query = f"{full_url}?{urlencode(params)}" score = [] for expected_part in ensure_iterable( yaml_data["expected"]["parts_of_query"] ): - if expected_part in api_query: + if re.search(expected_part, api_query): score.append(True) else: score.append(False) diff --git a/biochatter/api_agent/__init__.py b/biochatter/api_agent/__init__.py index 959e3d63..4d9084ab 100644 --- a/biochatter/api_agent/__init__.py +++ b/biochatter/api_agent/__init__.py @@ -6,4 +6,26 @@ BlastQueryParameters, ) from .oncokb import OncoKBFetcher, OncoKBInterpreter, OncoKBQueryBuilder +from .bio_tools import ( + BioToolsFetcher, + BioToolsInterpreter, + BioToolsQueryBuilder, +) from .api_agent import APIAgent + +__all__ = [ + "BaseFetcher", + "BaseInterpreter", + "BaseQueryBuilder", + "BlastFetcher", + "BlastInterpreter", + "BlastQueryBuilder", + "BlastQueryParameters", + "OncoKBFetcher", + "OncoKBInterpreter", + "OncoKBQueryBuilder", + "BioToolsFetcher", + "BioToolsInterpreter", + "BioToolsQueryBuilder", + "APIAgent", +] diff --git a/biochatter/api_agent/bio_tools.py b/biochatter/api_agent/bio_tools.py new file mode 100644 index 00000000..6745acc2 --- /dev/null +++ b/biochatter/api_agent/bio_tools.py @@ -0,0 +1,617 @@ +from typing import Optional +from collections.abc import Callable +import uuid + +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.pydantic_v1 import Field, BaseModel +from langchain_core.output_parsers import StrOutputParser +from langchain.chains.openai_functions import create_structured_output_runnable +import requests + +from biochatter.llm_connect import Conversation +from .abc import BaseFetcher, BaseInterpreter, BaseQueryBuilder + +BIOTOOLS_QUERY_PROMPT = """ +You are a world class algorithm for creating queries in structured formats. Your task is to use the web API of bio.tools to answer questions about bioinformatics tools and their properties. + +You have to extract the appropriate information out of the examples: +1. To list information about the tools, use the endpoint https://bio.tools/api/t/ with parameters like name, description, homepage, etc. + +Use these formats to generate queries based on the question provided. Below is more information about the bio.tools API: + +Base URL + +https://bio.tools/api/ + +Endpoints and Parameters + +1. 
List tools + +GET /t/ + +================== ============================================================================================ +Parameter Search behaviour +================== ============================================================================================ +biotoolsID Search for bio.tools tool ID (usually quoted - to get exact match) + + `biotoolsID="signalp" `_ + +name Search for tool name (quoted as needed) + + `name=signalp `_ +homepage Exact search for tool homepage URL (**must** be quoted) + + `homepage="http://cbs.dtu.dk/services/SignalP/" `_ +description Search over tool description (quoted as needed) + + `description="peptide cleavage" `_ +version Exact search for tool version (**must** be quoted) + + `version="4.1" `_ +topic Search for EDAM Topic (term) (quoted as needed) + + `topic="Proteomics" `_ + +topicID Exact search for EDAM Topic (URI): **must** be quoted + + `topicID="topic_3510" `_ +function Fuzzy search over function (input, operation, output, note and command) + + `function="Sequence analysis" `_ +operation Fuzzy search for EDAM Operation (term) (quoted as needed) + + `operation="Sequence analysis" `_ +operationID Exact search for EDAM Operation (ID) (**must** be quoted) + + `operationID="operation_2403" `_ +dataType Fuzzy search over input and output for EDAM Data (term) (quoted as needed) + + `dataType="Protein sequence" `_ +dataTypeID Exact search over input and output for EDAM Data (ID) (**must** be quoted) + + `dataTypeID="data_2976" `_ +dataFormat Fuzzy search over input and output for EDAM Format (term) (quoted as needed) + + `dataFormat="FASTA" `_ +dataFormatID Exact search over input and output for EDAM Format (ID) (**must** be quoted) + + `dataFormatID="format_1929" `_ +input Fuzzy search over input for EDAM Data and Format (term) (quoted as needed) + + `input="Protein sequence" `_ +inputID Exact search over input for EDAM Data and Format (ID) (**must** be quoted) + + `inputID="data_2976" `_ +inputDataType Fuzzy search over input for EDAM Data (term) (quoted as needed) + + `inputDataType="Protein sequence" `_ +inputDataTypeID Exact search over input for EDAM Data (ID) (**must** be quoted) + + `inputDataTypeID="data_2976" `_ +inputDataFormat Fuzzy search over input for EDAM Format (term) (quoted as needed) + + `inputDataFormat="FASTA" `_ +inputDataFormatID Exact search over input for EDAM Format (ID) (**must** be quoted) + + `inputDataFormatID="format_1929" `_ +output Fuzzy search over output for EDAM Data and Format (term) (quoted as needed) + + `output="Sequence alignment" `_ +outputID Exact search over output for EDAM Data and Format (ID) (**must** be quoted) + + `outputID="data_0863" `_ +outputDataType Fuzzy search over output for EDAM Data (term) (quoted as needed) + + `outputDataType="Sequence alignment" `_ +outputDataTypeID Exact search over output for EDAM Data (ID) (**must** be quoted) + + `outputDataTypeID="data_0863" `_ +outputDataFormat Fuzzy search over output for EDAM Format (term) (quoted as needed) + + `outputDataFormat="ClustalW format" `_ +outputDataFormatID Exact search over output for EDAM Format (ID) (**must** be quoted) + + `outputDataFormatID="format_1982" `_ +toolType Exact search for tool type + + `toolType="Command-line tool" `_ +collectionID Exact search for tool collection (normally quoted) + + `collectionID="Rare Disease" `_ +maturity Exact search for tool maturity + + `maturity=Mature `_ +operatingSystem Exact search for tool operating system + + `operatingSystem=Linux `_ +language Exact search for programming 
language + + `language=Java `_ +cost Exact search for cost + + `cost="Free of charge" `_ +license Exact search for software or data usage license (quoted as needed) + + `license="GPL-3.0" `_ +accessibility Exact search for tool accessibility + + `accessibility="Open access" `_ +credit Fuzzy search over credit (name, email, URL, ORCID iD, type of entity, type of role and note) + + `credit="Henrik Nielsen" `_ +creditName Exact search for name of credited entity + + `creditName="Henrik Nielsen" `_ +creditTypeRole Exact search for role of credited entity + + `creditTypeRole=Developer `_ +creditTypeEntity Exact search for type of credited entity + + `creditTypeEntity="Funding agency" `_ +creditOrcidID Exact search for ORCID iD of credited entity (**must** be quoted) + + `creditOrcidID="0000-0001-5121-2036" `_ +publication Fuzzy search over publication (DOI, PMID, PMCID, publication type and tool version) (quoted as needed) + + `publication=10.12688/f1000research.12974.1 `_ +publicationID Exact search for publication ID (DOI, PMID or PMCID) (**must** be quoted) + + `publicationID="10.12688/f1000research.12974.1" `_ +publicationType Exact search for publication type + + `publicationType=Primary `_ +publicationVersion Exact search for tool version associated with a publication (**must** be quoted) + + `publicationVersion="1.0" `_ +link Fuzzy search over general link (URL, type and note) (quote as needed) + + `link="Issue tracker" `_ +linkType Exact search for type of information found at a link + + `linkType="Issue tracker" `_ +documentation Fuzzy search over documentation link (URL, type and note) (quote as needed) + + `documentation=Manual `_ +documentationType Exact search for type of documentation + + `documentationType=Manual `_ +download Fuzzy search over download link (URL, type, version and note) (quote as needed) + + `download=Binaries `_ +downloadType Exact search for type of download + + `downloadType=Binaries `_ +downloadVersion Exact search for tool version associated with a download (**must** be quoted) + + `downloadVersion="1.0" `_ +otherID Fuzzy search over alternate tool IDs (ID value, type of ID and version) + + `otherID="rrid:SCR_015644" `_ + +otherIDValue Exact search for value of alternate tool ID (**must** be quoted) + + `otherIDValue="rrid:SCR_015644" `_ +otherIDType Exact search for type of alternate tool ID + + `otherIDType=RRID `_ +otherIDVersion Exact search for tool version associated with an alternate ID (**must** be quoted) + + `otherIDVersion="1.0" `_ +================== ============================================================================================ + + +The parameters are (currently) case-sensitive, e.g. you must use &biotoolsID= and not &biotoolsid + +Values of the following parameters must be given in quotes to get sensible (or any) results: +homepage +version +topicID +operationID +dataTypeID +dataFormatID +inputID +inputDataTypeID +inputDataFormatID +outputID +outputDataTypeID +outputDataFormatID +creditOrcidID +publicationID +publicationVersion +downloadVersion +otherIDValue +otherIDVersion +e.g. +https://bio.tools/api/tool?topicID=”topic_3510” +Values of other parameters can be quoted or unquoted: +Unquoted values invoke a fuzzy word search: it will search for fuzzy matches of words in the search phrase, to the target field +Quoted values invoke an exact phrase search; it will search for an exact match of the full-length of the search phrase, to the target field (matches to target substrings are allowed) +e.g. 
+https://bio.tools/api/tool?biotoolsID=”blast” returns the tool with biotoolsID of “blast” (the “canonical” blast) +https://bio.tools/api/tool?biotoolsID=blast returns all tools with “blast” in their biotoolsID (all blast flavours) +""" + + +BIOTOOLS_SUMMARY_PROMPT = """ +You have to answer this question in a clear and concise manner: {question} Be factual!\n\ +You are a world leading bioinformatician who knows everything about bio.tools packages.\n\ +Do not make up information, only use the provided information and mention how relevant the found information is based on your knowledge about bio.tools.\n\ +Here is the information relevant to the question found on the bio.tools web API:\n\ +{context} +""" + + +class BioToolsQueryParameters(BaseModel): + base_url: str = Field( + default="https://bio.tools/api/", + description="Base URL for the BioTools API.", + ) + endpoint: str = Field( + ..., + description="Specific API endpoint to hit. Example: 't/' for listing tools.", + ) + biotoolsID: Optional[str] = Field( + None, + description="Search for bio.tools tool ID (usually quoted - to get exact match)", + ) + name: Optional[str] = Field( + None, + description="Search for tool name (quoted as needed: quoted for exact match, unquoted for fuzzy search)", + ) + homepage: Optional[str] = Field( + None, + description="Exact search for tool homepage URL (**must** be quoted)", + ) + description: Optional[str] = Field( + None, + description="Search over tool description (quoted as needed)", + ) + version: Optional[str] = Field( + None, + description="Exact search for tool version (**must** be quoted)", + ) + topic: Optional[str] = Field( + None, + description="Search for EDAM Topic (term) (quoted as needed)", + ) + topicID: Optional[str] = Field( + None, + description="Exact search for EDAM Topic (URI): **must** be quoted", + ) + function: Optional[str] = Field( + None, + description="Fuzzy search over function (input, operation, output, note and command)", + ) + operation: Optional[str] = Field( + None, + description="Fuzzy search for EDAM Operation (term) (quoted as needed)", + ) + operationID: Optional[str] = Field( + None, + description="Exact search for EDAM Operation (ID) (**must** be quoted)", + ) + dataType: Optional[str] = Field( + None, + description="Fuzzy search over input and output for EDAM Data (term) (quoted as needed)", + ) + dataTypeID: Optional[str] = Field( + None, + description="Exact search over input and output for EDAM Data (ID) (**must** be quoted)", + ) + dataFormat: Optional[str] = Field( + None, + description="Fuzzy search over input and output for EDAM Format (term) (quoted as needed)", + ) + dataFormatID: Optional[str] = Field( + None, + description="Exact search over input and output for EDAM Format (ID) (**must** be quoted)", + ) + input: Optional[str] = Field( + None, + description="Fuzzy search over input for EDAM Data and Format (term) (quoted as needed)", + ) + inputID: Optional[str] = Field( + None, + description="Exact search over input for EDAM Data and Format (ID) (**must** be quoted)", + ) + inputDataType: Optional[str] = Field( + None, + description="Fuzzy search over input for EDAM Data (term) (quoted as needed)", + ) + inputDataTypeID: Optional[str] = Field( + None, + description="Exact search over input for EDAM Data (ID) (**must** be quoted)", + ) + inputDataFormat: Optional[str] = Field( + None, + description="Fuzzy search over input for EDAM Format (term) (quoted as needed)", + ) + inputDataFormatID: Optional[str] = Field( + None, + description="Exact 
search over input for EDAM Format (ID) (**must** be quoted)", + ) + output: Optional[str] = Field( + None, + description="Fuzzy search over output for EDAM Data and Format (term) (quoted as needed)", + ) + outputID: Optional[str] = Field( + None, + description="Exact search over output for EDAM Data and Format (ID) (**must** be quoted)", + ) + outputDataType: Optional[str] = Field( + None, + description="Fuzzy search over output for EDAM Data (term) (quoted as needed)", + ) + outputDataTypeID: Optional[str] = Field( + None, + description="Exact search over output for EDAM Data (ID) (**must** be quoted)", + ) + outputDataFormat: Optional[str] = Field( + None, + description="Fuzzy search over output for EDAM Format (term) (quoted as needed)", + ) + outputDataFormatID: Optional[str] = Field( + None, + description="Exact search over output for EDAM Format (ID) (**must** be quoted)", + ) + toolType: Optional[str] = Field( + None, + description="Exact search for tool type", + ) + collectionID: Optional[str] = Field( + None, + description="Exact search for tool collection (normally quoted)", + ) + maturity: Optional[str] = Field( + None, + description="Exact search for tool maturity", + ) + operatingSystem: Optional[str] = Field( + None, + description="Exact search for tool operating system", + ) + language: Optional[str] = Field( + None, + description="Exact search for programming language", + ) + cost: Optional[str] = Field( + None, + description="Exact search for cost", + ) + license: Optional[str] = Field( + None, + description="Exact search for software or data usage license (quoted as needed)", + ) + accessibility: Optional[str] = Field( + None, + description="Exact search for tool accessibility", + ) + credit: Optional[str] = Field( + None, + description="Fuzzy search over credit (name, email, URL, ORCID iD, type of entity, type of role and note)", + ) + creditName: Optional[str] = Field( + None, + description="Exact search for name of credited entity", + ) + creditTypeRole: Optional[str] = Field( + None, + description="Exact search for role of credited entity", + ) + creditTypeEntity: Optional[str] = Field( + None, + description="Exact search for type of credited entity", + ) + creditOrcidID: Optional[str] = Field( + None, + description="Exact search for ORCID iD of credited entity (**must** be quoted)", + ) + publication: Optional[str] = Field( + None, + description="Fuzzy search over publication (DOI, PMID, PMCID, publication type and tool version) (quoted as needed)", + ) + publicationID: Optional[str] = Field( + None, + description="Exact search for publication ID (DOI, PMID or PMCID) (**must** be quoted)", + ) + publicationType: Optional[str] = Field( + None, + description="Exact search for publication type", + ) + publicationVersion: Optional[str] = Field( + None, + description="Exact search for tool version associated with a publication (**must** be quoted)", + ) + link: Optional[str] = Field( + None, + description="Fuzzy search over general link (URL, type and note) (quote as needed)", + ) + linkType: Optional[str] = Field( + None, + description="Exact search for type of information found at a link", + ) + documentation: Optional[str] = Field( + None, + description="Fuzzy search over documentation link (URL, type and note) (quote as needed)", + ) + documentationType: Optional[str] = Field( + None, + description="Exact search for type of documentation", + ) + download: Optional[str] = Field( + None, + description="Fuzzy search over download link (URL, type, version and note) (quote 
as needed)",
+    )
+    downloadType: Optional[str] = Field(
+        None,
+        description="Exact search for type of download",
+    )
+    downloadVersion: Optional[str] = Field(
+        None,
+        description="Exact search for tool version associated with a download (**must** be quoted)",
+    )
+    otherID: Optional[str] = Field(
+        None,
+        description="Fuzzy search over alternate tool IDs (ID value, type of ID and version)",
+    )
+    otherIDValue: Optional[str] = Field(
+        None,
+        description="Exact search for value of alternate tool ID (**must** be quoted)",
+    )
+    otherIDType: Optional[str] = Field(
+        None,
+        description="Exact search for type of alternate tool ID",
+    )
+    otherIDVersion: Optional[str] = Field(
+        None,
+        description="Exact search for tool version associated with an alternate ID (**must** be quoted)",
+    )
+    question_uuid: Optional[str] = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for the question.",
+    )
+
+
+class BioToolsQueryBuilder(BaseQueryBuilder):
+    """A class for building a BioToolsQuery object."""
+
+    def create_runnable(
+        self,
+        query_parameters: "BioToolsQueryParameters",
+        conversation: "Conversation",
+    ) -> Callable:
+        """
+        Creates a runnable object for executing queries using the LangChain
+        `create_structured_output_runnable` method.
+
+        Args:
+            query_parameters: A Pydantic data model that specifies the fields of
+                the API that should be queried.
+
+            conversation: A BioChatter conversation object.
+
+        Returns:
+            A Callable object that can execute the query.
+        """
+        return create_structured_output_runnable(
+            output_schema=query_parameters,
+            llm=conversation.chat,
+            prompt=self.structured_output_prompt,
+        )
+
+    def parameterise_query(
+        self,
+        question: str,
+        conversation: "Conversation",
+    ) -> BioToolsQueryParameters:
+        """
+
+        Generates a BioToolsQuery object based on the given question, prompt,
+        and BioChatter conversation. Uses a Pydantic model to define the API
+        fields. Creates a runnable that can be invoked on LLMs that are
+        qualified to parameterise functions.
+
+        Args:
+            question (str): The question to be answered.
+
+            conversation: The conversation object used for parameterising the
+                BioToolsQuery.
+
+        Returns:
+            BioToolsQueryParameters: the parameterised query object (Pydantic model)
+        """
+        runnable = self.create_runnable(
+            query_parameters=BioToolsQueryParameters,
+            conversation=conversation,
+        )
+        biotools_call_obj = runnable.invoke(
+            {
+                "input": f"Answer:\n{question} based on:\n {BIOTOOLS_QUERY_PROMPT}"
+            }
+        )
+        biotools_call_obj.question_uuid = str(uuid.uuid4())
+        return biotools_call_obj
+
+
+class BioToolsFetcher(BaseFetcher):
+    """
+    A class for retrieving API results from BioTools given a parameterized
+    BioToolsQuery.
+    """
+
+    def __init__(self, api_token="demo"):
+        self.headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Accept": "application/json",
+        }
+        self.base_url = "https://bio.tools/api"
+
+    def fetch_results(
+        self, request_data: BioToolsQueryParameters, retries: Optional[int] = 3
+    ) -> str:
+        """Function to submit the BioTools query and fetch the results directly.
+        No multi-step procedure, thus no wrapping of submission and retrieval in
+        this case.
+
+        Args:
+            request_data: BioToolsQuery object (Pydantic model) containing the
+                BioTools query parameters.
+
+        Returns:
+            str: The results of the BioTools query.
+        """
+        # Submit the query and get the URL
+        params = request_data.dict(exclude_unset=True)
+        endpoint = params.pop("endpoint")
+        params.pop("question_uuid")
+        full_url = f"{self.base_url}/{endpoint}"
+        response = requests.get(full_url, headers=self.headers, params=params)
+        response.raise_for_status()
+
+        # Fetch the results from the URL
+        results_response = requests.get(response.url, headers=self.headers)
+        results_response.raise_for_status()
+
+        return results_response.text
+
+
+class BioToolsInterpreter(BaseInterpreter):
+    def summarise_results(
+        self,
+        question: str,
+        conversation_factory: Callable,
+        response_text: str,
+    ) -> str:
+        """
+        Function to extract the answer from the bio.tools results.
+
+        Args:
+            question (str): The question to be answered.
+            conversation_factory: A callable that creates a BioChatter conversation.
+            response_text (str): The response.text returned by bio.tools.
+
+        Returns:
+            str: The extracted answer from the bio.tools results.
+
+        """
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                (
+                    "system",
+                    "You are a world class bioinformatician who knows "
+                    "everything about bio.tools packages and the "
+                    "bioinformatics ecosystem. Your task is to interpret "
+                    "results from BioTools API calls and summarise "
+                    "them for the user.",
+                ),
+                ("user", "{input}"),
+            ]
+        )
+        summary_prompt = BIOTOOLS_SUMMARY_PROMPT.format(
+            question=question, context=response_text
+        )
+        output_parser = StrOutputParser()
+        conversation = conversation_factory()
+        chain = prompt | conversation.chat | output_parser
+        answer = chain.invoke({"input": summary_prompt})
+        return answer
diff --git a/biochatter/api_agent/oncokb.py b/biochatter/api_agent/oncokb.py
index 321937cc..2d2646f6 100644
--- a/biochatter/api_agent/oncokb.py
+++ b/biochatter/api_agent/oncokb.py
@@ -103,7 +103,7 @@
 ONCOKB_SUMMARY_PROMPT = """
 You have to answer this question in a clear and concise manner: {question} Be factual!\n\
 You are a world leading oncologist and molecular biologist who knows everything about OncoKB results.\n\
-Do not make up information, only use the provided information and mention how relevant the found information is based on your knowledge about OncKB\n\
+Do not make up information, only use the provided information and mention how relevant the found information is based on your knowledge about OncoKB\n\
 Here is the information relevant to the question found on OncoKB:\n\
 {context}
 """
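Note: the new bio.tools agent follows the same three-step pattern as the existing BLAST and OncoKB agents: build a parameterised query, fetch the raw API response, and interpret it. The sketch below shows how the classes added in this patch could be wired together end to end; the `conversation` object and `conversation_factory` are assumed to be set up elsewhere (e.g. via biochatter.llm_connect), and the wrapper function name is illustrative, not part of the patch.

from biochatter.api_agent import (
    BioToolsFetcher,
    BioToolsInterpreter,
    BioToolsQueryBuilder,
)


def answer_with_biotools(question, conversation, conversation_factory):
    # 1. Let the LLM fill in the BioToolsQueryParameters fields from the question.
    parameters = BioToolsQueryBuilder().parameterise_query(
        question=question,
        conversation=conversation,
    )

    # 2. Execute the parameterised query against the bio.tools web API.
    response_text = BioToolsFetcher().fetch_results(parameters)

    # 3. Summarise the raw JSON response with respect to the original question.
    return BioToolsInterpreter().summarise_results(
        question=question,
        conversation_factory=conversation_factory,
        response_text=response_text,
    )


# Usage (assuming the conversation objects are configured):
# answer = answer_with_biotools(
#     "Which tools can I use for metabolomics?",
#     conversation,
#     conversation_factory,
# )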
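Note: the benchmark now scores API calls with regular expressions rather than exact substring matches, which is why the YAML data escapes metacharacters with two backslashes (e.g. '\\?') and can use character classes such as '[mM]etabolomics' to tolerate capitalisation differences. A minimal sketch of that scoring step, using a hand-written query string in place of a live model call:

import re
from urllib.parse import urlencode

# Hypothetical query a model might produce for the biotools:topic:metabolomics case.
base_url = "https://bio.tools/api/"
endpoint = "t/"
params = {"topic": "Metabolomics"}
api_query = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}?{urlencode(params)}"

# Expected parts from benchmark_api_calling_data.yaml; the YAML '\\?' arrives here as '\?'.
expected_parts = ["https://bio.tools/api/t/", r"\?topic=", "[mM]etabolomics"]

# One boolean per expected part, mirroring run_test() in benchmark/test_api_calling.py.
score = [bool(re.search(part, api_query)) for part in expected_parts]
print(score)  # [True, True, True] for this query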