Merge pull request #188 from valory-xyz/chore/fix-rag
Enhancements and New Feature Implementation: Fix rag and new tools
0xArdi authored Mar 26, 2024
2 parents 94de888 + bc25913 commit 443ffe9
Showing 37 changed files with 2,239 additions and 508 deletions.
@@ -12,4 +12,34 @@ fingerprint:
fingerprint_ignore_patterns: []
entry_point: prediction_sum_url_content.py
callable: run
-dependencies: {}
+dependencies:
+  tqdm:
+    version: ==4.56.0
+  google-api-python-client:
+    version: ==2.95.0
+  googlesearch-python:
+    version: ==1.2.3
+  requests: {}
+  pydantic:
+    version: '>=1.9.0,<3'
+  faiss-cpu:
+    version: ==1.7.4
+  tiktoken:
+    version: ==0.5.1
+  markdownify:
+    version: ==0.11.6
+  openai:
+    version: ==1.11.0
+  docstring-parser:
+    version: ==0.15
+  readability-lxml:
+    version: ==0.8.1
+  pypdf2:
+    version: ==3.0.1
+  pandas: {}
+  python-dateutil:
+    version: ==2.8.2
+  beautifulsoup4:
+    version: ==4.12.2
+  spacy:
+    version: ==3.7.2
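These pins use standard pip requirement-specifier syntax: "==" entries are exact pins, quoted ranges like '>=1.9.0,<3' constrain versions, and an empty mapping ({}) leaves the version floating. For reference, a minimal sketch of flattening such a dependencies block into pip-installable strings, assuming PyYAML and the layout shown above (the helper name and file path are illustrative, not part of this repository):

    import yaml  # PyYAML, an assumed extra dependency for this sketch

    def component_requirements(component_yaml_path: str) -> list:
        """Flatten a component.yaml dependencies block into pip requirement strings."""
        with open(component_yaml_path) as f:
            spec = yaml.safe_load(f)
        reqs = []
        for name, meta in spec.get("dependencies", {}).items():
            version = (meta or {}).get("version", "")
            reqs.append(f"{name}{version}")  # e.g. "tqdm==4.56.0"; bare "requests" floats
        return reqs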
27 changes: 25 additions & 2 deletions packages/napthaai/customs/prediction_request_rag/component.yaml
@@ -7,8 +7,31 @@ license: Apache-2.0
aea_version: '>=1.0.0, <2.0.0'
fingerprint:
   __init__.py: bafybeibt7f7crtwvmkg7spy3jhscmlqltvyblzp32g6gj44v7tlo5lycuq
-  prediction_request_rag.py: bafybeifxoo6pcmmwved7ffwra7saku5hosfrhsmyrwhqiy7nxcblh4luce
+  prediction_request_rag.py: bafybeihpyddw5tctvvsr6fbwlbenkgkdz4n5ieyngt246idtxjgfrwn2ke
fingerprint_ignore_patterns: []
entry_point: prediction_request_rag.py
callable: run
-dependencies: {}
+dependencies:
+  google-api-python-client:
+    version: ==2.95.0
+  googlesearch-python:
+    version: ==1.2.3
+  requests: {}
+  pydantic:
+    version: '>=1.9.0,<3'
+  faiss-cpu:
+    version: ==1.7.4
+  tiktoken:
+    version: ==0.5.1
+  markdownify:
+    version: ==0.11.6
+  openai:
+    version: ==1.11.0
+  docstring-parser:
+    version: ==0.15
+  readability-lxml:
+    version: ==0.8.1
+  pypdf2:
+    version: ==3.0.1
+  numpy:
+    version: '>=1.19.0'
packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py
@@ -19,6 +19,7 @@

"""This module implements a Mech tool for binary predictions."""

+import re
from collections import defaultdict
from concurrent.futures import Future, ThreadPoolExecutor
from docstring_parser import parse
@@ -35,7 +36,6 @@
from readability import Document as ReadabilityDocument
import requests
from requests.exceptions import RequestException, TooManyRedirects
-from requests.packages.urllib3.util.retry import Retry
from markdownify import markdownify as md
from typing import Any, Dict, Generator, List, Optional, Tuple, Callable
from tiktoken import encoding_for_model
@@ -65,13 +65,13 @@ def __exit__(self, exc_type, exc_value, traceback) -> None:
"temperature": 0,
}
MAX_TOKENS = {
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0125": 16385,
"gpt-4": 8192,
}
ALLOWED_TOOLS = [
"prediction-request-rag",
]
TOOL_TO_ENGINE = {tool: "gpt-3.5-turbo" for tool in ALLOWED_TOOLS}
TOOL_TO_ENGINE = {tool: "gpt-3.5-turbo-0125" for tool in ALLOWED_TOOLS}
DEFAULT_NUM_URLS = defaultdict(lambda: 3)
DEFAULT_NUM_QUERIES = defaultdict(lambda: 3)
NUM_URLS_PER_QUERY = 5
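The engine bump to gpt-3.5-turbo-0125 goes hand in hand with the MAX_TOKENS update: the -0125 snapshot exposes a 16,385-token context window versus 4,096 for the base alias, so the tool can pack in far more retrieved context. A minimal sketch of budgeting prompt size against such a map with tiktoken (the fits_context helper and its reserve parameter are illustrative, not part of the commit):

    from tiktoken import encoding_for_model

    MAX_TOKENS = {"gpt-3.5-turbo-0125": 16385, "gpt-4": 8192}  # mirrors the hunk above

    def fits_context(prompt: str, engine: str = "gpt-3.5-turbo-0125", reserve: int = 500) -> bool:
        """Check whether a prompt plus a completion reserve fits the model's window."""
        enc = encoding_for_model(engine)  # tiktoken prefix-matches snapshot names
        return len(enc.encode(prompt)) + reserve <= MAX_TOKENS[engine]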
@@ -106,15 +106,15 @@
"""

URL_QUERY_PROMPT = """
-You are an expert fact checker in a team tasked with determining whether an event will happen before a given date in the past.
-* Your role in the team to come up with search queries to be used to find relevant news articles that may help in determining whether the event occured.
+You are an expert fact checker in a team tasked with determining whether an event will happen before a given date.
+* Your role in the team to come up with search queries to be used to find relevant news articles that may help in determining whether the event will occur.
* You are provided with the input question about the event under the label "USER_PROMPT".
* You must follow the instructions under the label "INSTRUCTIONS".
INSTRUCTIONS
* Read the input under the label "USER_PROMPT" delimited by three backticks.
* The "USER_PROMPT" is a question about whether an event will happen before a given date.
-* The event will only have has two possible outcomes: either the event will happen or the event will not happen.
+* The event will only have two possible outcomes: either the event will happen or the event will not happen.
* If the event has more than two possible outcomes, you must ignore the rest of the instructions and output the response "Error".
* You should come up with {num_queries} diverse queries to search for relevant news articles that may help in determining whether the event will occur.
* Focus on capturing different aspects and interpretations of the question to ensure comprehensive coverage of the topic.
@@ -208,6 +208,8 @@ def multi_queries(
    engine: str,
    num_queries: int,
    counter_callback: Optional[Callable[[int, int, str], None]] = None,
+    temperature: Optional[float] = DEFAULT_OPENAI_SETTINGS["temperature"],
+    max_tokens: Optional[int] = DEFAULT_OPENAI_SETTINGS["max_tokens"],
) -> List[str]:
    """Generate multiple queries for fetching information from the web."""

@@ -223,8 +225,8 @@
    response = client.chat.completions.create(
        model=engine,
        messages=messages,
-        temperature=DEFAULT_OPENAI_SETTINGS["temperature"],
-        max_tokens=DEFAULT_OPENAI_SETTINGS["max_tokens"],
+        temperature=temperature,
+        max_tokens=max_tokens,
        n=1,
        timeout=150,
        stop=None,
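The two edits above are the heart of the temperature/max_tokens plumbing: callers of multi_queries can now override sampling settings instead of being pinned to the module defaults. A self-contained sketch of that pass-through pattern, assuming an OpenAI v1 client (the real tool parses a pydantic Queries schema rather than splitting lines, so this is illustrative only, and the default values are assumptions):

    from openai import OpenAI

    DEFAULT_OPENAI_SETTINGS = {"max_tokens": 500, "temperature": 0}  # assumed values

    def generate_queries(
        client: OpenAI,
        prompt: str,
        engine: str = "gpt-3.5-turbo-0125",
        num_queries: int = 3,
        temperature: float = DEFAULT_OPENAI_SETTINGS["temperature"],
        max_tokens: int = DEFAULT_OPENAI_SETTINGS["max_tokens"],
    ) -> list:
        """Ask the model for search queries, honoring caller-supplied settings."""
        response = client.chat.completions.create(
            model=engine,
            messages=[{
                "role": "user",
                "content": f"Write {num_queries} web search queries, one per line, for: {prompt}",
            }],
            temperature=temperature,  # forwarded, no longer hard-coded
            max_tokens=max_tokens,
            n=1,
            timeout=150,
        )
        return [q.strip() for q in response.choices[0].message.content.splitlines() if q.strip()]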
@@ -245,7 +247,13 @@
        return queries.queries, counter_callback
    return queries.queries, None

-def search_google(query: str, api_key: str, engine: str, num: int) -> List[str]:
+def search_google(
+    query: str,
+    api_key: str,
+    engine: str,
+    num: int
+) -> List[str]:
    """Search Google for the given query."""
    service = build("customsearch", "v1", developerKey=api_key)
    search = (
        service.cse()


def find_similar_chunks(
-    query: str, docs_with_embeddings: List[Document], k: int = 4
+    query: str,
+    docs_with_embeddings: List[Document],
+    k: int = 4
) -> List:
    """Similarity search to find similar chunks to a query"""

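find_similar_chunks is the retrieval step of the RAG pipeline this PR repairs, and faiss-cpu==1.7.4 is pinned in component.yaml for exactly this. A hedged sketch of what a k-nearest-chunk lookup typically looks like (the Document shape and the use of raw L2 distance are assumptions, not the tool's actual implementation):

    from dataclasses import dataclass

    import faiss
    import numpy as np

    @dataclass
    class Document:  # assumed shape
        text: str
        url: str = ""
        embedding: np.ndarray = None

    def find_similar_chunks_sketch(query_embedding: np.ndarray, docs: list, k: int = 4) -> list:
        """Return the k documents whose embeddings are closest to the query (L2)."""
        index = faiss.IndexFlatL2(len(query_embedding))
        index.add(np.stack([d.embedding for d in docs]).astype("float32"))
        _, idxs = index.search(query_embedding.reshape(1, -1).astype("float32"), k)
        return [docs[i] for i in idxs[0] if i != -1]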
@@ -439,6 +449,8 @@ def fetch_additional_information(
    num_words: Optional[int] = None,
    num_urls: Optional[int] = None,
    num_queries: Optional[int] = DEFAULT_NUM_QUERIES,
+    temperature: Optional[float] = DEFAULT_OPENAI_SETTINGS["temperature"],
+    max_tokens: Optional[int] = DEFAULT_OPENAI_SETTINGS["max_tokens"],
) -> Tuple:
    """Fetch additional information from the web."""

@@ -449,6 +461,8 @@
        engine=engine,
        num_queries=num_queries,
        counter_callback=counter_callback,
+        temperature=temperature,
+        max_tokens=max_tokens,
    )
    print(f"Queries: {queries}")

@@ -469,11 +483,11 @@
            urls=urls,
        )
    else:
-        texts = []
+        docs = []
        for url, content in islice(source_links.items(), num_urls or len(source_links)):
-            doc = {}
-            doc['text'], doc['url'] = extract_text(html=content, num_words=num_words), url
-            texts.append(doc)
+            doc = extract_text(html=content, num_words=num_words)
+            doc.url = url
+            docs.append(doc)

    # Remove None values from the list
    docs = [doc for doc in docs if doc]
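This hunk is the core of the RAG fix named in the PR title: the old loop accumulated plain dicts into a texts list while the very next cleanup line filtered a docs name, so the extracted text never survived. The new code keeps a single docs list of Document objects returned by extract_text and tags each with its source URL. A sketch of the repaired flow (the Document class and the extract_text stub are assumptions standing in for the tool's real readability/markdownify pipeline):

    from dataclasses import dataclass
    from itertools import islice

    @dataclass
    class Document:  # assumed shape
        text: str
        url: str = ""

    def extract_text(html: str, num_words: int = None) -> Document:
        """Stub: the real tool strips boilerplate with readability-lxml + markdownify."""
        words = html.split()
        return Document(text=" ".join(words[:num_words] if num_words else words))

    def collect_docs(source_links: dict, num_urls: int = None, num_words: int = None) -> list:
        docs = []
        for url, content in islice(source_links.items(), num_urls or len(source_links)):
            doc = extract_text(html=content, num_words=num_words)
            doc.url = url  # remember provenance for citations
            docs.append(doc)
        return [doc for doc in docs if doc]  # drop any None entries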
@@ -553,12 +567,23 @@ def adjust_additional_information(
    return additional_information


-def run(**kwargs) -> Tuple[str, Optional[Dict[str, Any]]]:
+def extract_question(prompt: str) -> str:
+    pattern = r'\"(.*?)\"'
+    try:
+        question = re.findall(pattern, prompt)[0]
+    except Exception as e:
+        print(f"Error extracting question: {e}")
+        question = prompt
+
+    return question
+
+
+def run(**kwargs) -> Tuple[str, Optional[str], Optional[Dict[str, Any]], Any]:
"""Run the task"""
with OpenAIClientManager(kwargs["api_keys"]["openai"]):

tool = kwargs["tool"]
prompt = kwargs["prompt"]
prompt = extract_question(kwargs["prompt"])
max_tokens = kwargs.get("max_tokens", DEFAULT_OPENAI_SETTINGS["max_tokens"])
temperature = kwargs.get("temperature", DEFAULT_OPENAI_SETTINGS["temperature"])
num_words = kwargs.get("num_words", None)
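The new extract_question helper, wired in through the prompt = extract_question(kwargs["prompt"]) change above, strips mech boilerplate by grabbing the first double-quoted span and falling back to the full prompt when nothing is quoted. For example (sample prompts invented for illustration):

    >>> extract_question('Please predict: "Will BTC close above $70k on 31 March 2024?"')
    'Will BTC close above $70k on 31 March 2024?'
    >>> extract_question('no quoted question in this prompt')
    'no quoted question in this prompt'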
@@ -572,7 +597,6 @@
raise ValueError(f"Tool {tool} is not supported.")

engine = TOOL_TO_ENGINE[tool]

additional_information, counter_callback = fetch_additional_information(
client=client,
prompt=prompt,
@@ -584,6 +608,8 @@
            num_words=num_words,
            num_urls=num_urls,
            num_queries=num_queries,
+            temperature=temperature,
+            max_tokens=max_tokens,
        )
        additional_information = adjust_additional_information(
            prompt,
@@ -592,7 +618,8 @@
            engine
        )
        prediction_prompt = PREDICTION_PROMPT.format(
-            user_prompt=prompt, additional_information=additional_information
+            user_prompt=prompt,
+            additional_information=additional_information
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
@@ -607,6 +634,7 @@
            timeout=150,
            stop=None,
            functions=[Results.openai_schema],
+            function_call={'name': 'Results'}
        )
        results = str(Results.from_response(response))

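Adding function_call={'name': 'Results'} forces the model to respond through the Results function schema rather than optionally falling back to free text, which is what makes Results.from_response safe to call unconditionally. A bare-API sketch of the same forcing with the OpenAI v1 client (the stand-in schema and its field names are assumptions about the prediction output, not the tool's actual Results model):

    import json
    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set
    results_schema = {  # stand-in for Results.openai_schema
        "name": "Results",
        "parameters": {
            "type": "object",
            "properties": {
                "p_yes": {"type": "number"},
                "p_no": {"type": "number"},
                "confidence": {"type": "number"},
            },
            "required": ["p_yes", "p_no", "confidence"],
        },
    }
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "user", "content": 'Estimate: "Will it rain in Paris tomorrow?"'}],
        functions=[results_schema],
        function_call={"name": "Results"},  # force the schema; no free-text fallback
    )
    results = json.loads(response.choices[0].message.function_call.arguments)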
@@ -623,7 +651,6 @@
                output_tokens=response.usage.completion_tokens,
                model=engine,
                token_counter=count_tokens,
-
            )
            return results, prediction_prompt, None, counter_callback

        return results, prediction_prompt, None, counter_callback
20 changes: 20 additions & 0 deletions packages/napthaai/customs/prediction_request_reasoning/__init__.py
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
#
# Copyright 2024 Valory AG
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ------------------------------------------------------------------------------

"""This module contains the resolve market reasoning tool."""
@@ -0,0 +1,37 @@
name: prediction_request_reasoning
author: napthaai
version: 0.1.0
type: custom
description: A tool that reasons over extracted information.
license: Apache-2.0
aea_version: '>=1.0.0, <2.0.0'
fingerprint:
  __init__.py: bafybeib36ew6vbztldut5xayk5553rylrq7yv4cpqyhwc5ktvd4cx67vwu
  prediction_request_reasoning.py: bafybeien7m2b5ejuvxtrkl32ws5tkrek2jfcksxxf7tawo2hh4lhbmagee
fingerprint_ignore_patterns: []
entry_point: prediction_request_reasoning.py
callable: run
dependencies:
  google-api-python-client:
    version: ==2.95.0
  googlesearch-python:
    version: ==1.2.3
  requests: {}
  pydantic:
    version: '>=1.9.0,<3'
  faiss-cpu:
    version: ==1.7.4
  tiktoken:
    version: ==0.5.1
  markdownify:
    version: ==0.11.6
  openai:
    version: ==1.11.0
  docstring-parser:
    version: ==0.15
  readability-lxml:
    version: ==0.8.1
  pypdf2:
    version: ==3.0.1
  numpy:
    version: '>=1.19.0'