Merge pull request #188 from valory-xyz/chore/fix-rag
Enhancements and New Feature Implementation: Fix rag and new tools
0xArdi authored Mar 26, 2024
2 parents 94de888 + bc25913 commit 443ffe9
Showing 37 changed files with 2,239 additions and 508 deletions.
@@ -12,4 +12,34 @@ fingerprint:
fingerprint_ignore_patterns: []
entry_point: prediction_sum_url_content.py
callable: run
-dependencies: {}
+dependencies:
+  tqdm:
+    version: ==4.56.0
+  google-api-python-client:
+    version: ==2.95.0
+  googlesearch-python:
+    version: ==1.2.3
+  requests: {}
+  pydantic:
+    version: '>=1.9.0,<3'
+  faiss-cpu:
+    version: ==1.7.4
+  tiktoken:
+    version: ==0.5.1
+  markdownify:
+    version: ==0.11.6
+  openai:
+    version: ==1.11.0
+  docstring-parser:
+    version: ==0.15
+  readability-lxml:
+    version: ==0.8.1
+  pypdf2:
+    version: ==3.0.1
+  pandas: {}
+  python-dateutil:
+    version: ==2.8.2
+  beautifulsoup4:
+    version: ==4.12.2
+  spacy:
+    version: ==3.7.2
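These pins use standard pip requirement-specifier syntax: "==" entries are exact pins, quoted ranges like '>=1.9.0,<3' constrain versions, and an empty mapping ({}) leaves the version floating. For reference, a minimal sketch of flattening such a dependencies block into pip-installable strings, assuming PyYAML and the layout shown above (the helper name and file path are illustrative, not part of this repository):

    import yaml  # PyYAML, an assumed extra dependency for this sketch

    def component_requirements(component_yaml_path: str) -> list:
        """Flatten a component.yaml dependencies block into pip requirement strings."""
        with open(component_yaml_path) as f:
            spec = yaml.safe_load(f)
        reqs = []
        for name, meta in spec.get("dependencies", {}).items():
            version = (meta or {}).get("version", "")
            reqs.append(f"{name}{version}")  # e.g. "tqdm==4.56.0"; bare "requests" floats
        return reqs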
27 changes: 25 additions & 2 deletions packages/napthaai/customs/prediction_request_rag/component.yaml
@@ -7,8 +7,31 @@ license: Apache-2.0
aea_version: '>=1.0.0, <2.0.0'
fingerprint:
   __init__.py: bafybeibt7f7crtwvmkg7spy3jhscmlqltvyblzp32g6gj44v7tlo5lycuq
-  prediction_request_rag.py: bafybeifxoo6pcmmwved7ffwra7saku5hosfrhsmyrwhqiy7nxcblh4luce
+  prediction_request_rag.py: bafybeihpyddw5tctvvsr6fbwlbenkgkdz4n5ieyngt246idtxjgfrwn2ke
fingerprint_ignore_patterns: []
entry_point: prediction_request_rag.py
callable: run
-dependencies: {}
+dependencies:
+  google-api-python-client:
+    version: ==2.95.0
+  googlesearch-python:
+    version: ==1.2.3
+  requests: {}
+  pydantic:
+    version: '>=1.9.0,<3'
+  faiss-cpu:
+    version: ==1.7.4
+  tiktoken:
+    version: ==0.5.1
+  markdownify:
+    version: ==0.11.6
+  openai:
+    version: ==1.11.0
+  docstring-parser:
+    version: ==0.15
+  readability-lxml:
+    version: ==0.8.1
+  pypdf2:
+    version: ==3.0.1
+  numpy:
+    version: '>=1.19.0'
packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py
@@ -19,6 +19,7 @@

"""This module implements a Mech tool for binary predictions."""

+import re
from collections import defaultdict
from concurrent.futures import Future, ThreadPoolExecutor
from docstring_parser import parse
@@ -35,7 +36,6 @@
from readability import Document as ReadabilityDocument
import requests
from requests.exceptions import RequestException, TooManyRedirects
-from requests.packages.urllib3.util.retry import Retry
from markdownify import markdownify as md
from typing import Any, Dict, Generator, List, Optional, Tuple, Callable
from tiktoken import encoding_for_model
@@ -65,13 +65,13 @@ def __exit__(self, exc_type, exc_value, traceback) -> None:
"temperature": 0,
}
MAX_TOKENS = {
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0125": 16385,
"gpt-4": 8192,
}
ALLOWED_TOOLS = [
"prediction-request-rag",
]
TOOL_TO_ENGINE = {tool: "gpt-3.5-turbo" for tool in ALLOWED_TOOLS}
TOOL_TO_ENGINE = {tool: "gpt-3.5-turbo-0125" for tool in ALLOWED_TOOLS}
DEFAULT_NUM_URLS = defaultdict(lambda: 3)
DEFAULT_NUM_QUERIES = defaultdict(lambda: 3)
NUM_URLS_PER_QUERY = 5
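The engine bump to gpt-3.5-turbo-0125 goes hand in hand with the MAX_TOKENS update: the -0125 snapshot exposes a 16,385-token context window versus 4,096 for the base alias, so the tool can pack in far more retrieved context. A minimal sketch of budgeting prompt size against such a map with tiktoken (the fits_context helper and its reserve parameter are illustrative, not part of the commit):

    from tiktoken import encoding_for_model

    MAX_TOKENS = {"gpt-3.5-turbo-0125": 16385, "gpt-4": 8192}  # mirrors the hunk above

    def fits_context(prompt: str, engine: str = "gpt-3.5-turbo-0125", reserve: int = 500) -> bool:
        """Check whether a prompt plus a completion reserve fits the model's window."""
        enc = encoding_for_model(engine)  # tiktoken prefix-matches snapshot names
        return len(enc.encode(prompt)) + reserve <= MAX_TOKENS[engine]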
@@ -106,15 +106,15 @@
"""

URL_QUERY_PROMPT = """
-You are an expert fact checker in a team tasked with determining whether an event will happen before a given date in the past.
-* Your role in the team to come up with search queries to be used to find relevant news articles that may help in determining whether the event occured.
+You are an expert fact checker in a team tasked with determining whether an event will happen before a given date.
+* Your role in the team to come up with search queries to be used to find relevant news articles that may help in determining whether the event will occur.
* You are provided with the input question about the event under the label "USER_PROMPT".
* You must follow the instructions under the label "INSTRUCTIONS".
INSTRUCTIONS
* Read the input under the label "USER_PROMPT" delimited by three backticks.
* The "USER_PROMPT" is a question about whether an event will happen before a given date.
-* The event will only have has two possible outcomes: either the event will happen or the event will not happen.
+* The event will only have two possible outcomes: either the event will happen or the event will not happen.
* If the event has more than two possible outcomes, you must ignore the rest of the instructions and output the response "Error".
* You should come up with {num_queries} diverse queries to search for relevant news articles that may help in determining whether the event will occur.
* Focus on capturing different aspects and interpretations of the question to ensure comprehensive coverage of the topic.
@@ -208,6 +208,8 @@ def multi_queries(
    engine: str,
    num_queries: int,
    counter_callback: Optional[Callable[[int, int, str], None]] = None,
+    temperature: Optional[float] = DEFAULT_OPENAI_SETTINGS["temperature"],
+    max_tokens: Optional[int] = DEFAULT_OPENAI_SETTINGS["max_tokens"],
) -> List[str]:
    """Generate multiple queries for fetching information from the web."""

@@ -223,8 +225,8 @@
    response = client.chat.completions.create(
        model=engine,
        messages=messages,
-        temperature=DEFAULT_OPENAI_SETTINGS["temperature"],
-        max_tokens=DEFAULT_OPENAI_SETTINGS["max_tokens"],
+        temperature=temperature,
+        max_tokens=max_tokens,
        n=1,
        timeout=150,
        stop=None,
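The two edits above are the heart of the temperature/max_tokens plumbing: callers of multi_queries can now override sampling settings instead of being pinned to the module defaults. A self-contained sketch of that pass-through pattern, assuming an OpenAI v1 client (the real tool parses a pydantic Queries schema rather than splitting lines, so this is illustrative only, and the default values are assumptions):

    from openai import OpenAI

    DEFAULT_OPENAI_SETTINGS = {"max_tokens": 500, "temperature": 0}  # assumed values

    def generate_queries(
        client: OpenAI,
        prompt: str,
        engine: str = "gpt-3.5-turbo-0125",
        num_queries: int = 3,
        temperature: float = DEFAULT_OPENAI_SETTINGS["temperature"],
        max_tokens: int = DEFAULT_OPENAI_SETTINGS["max_tokens"],
    ) -> list:
        """Ask the model for search queries, honoring caller-supplied settings."""
        response = client.chat.completions.create(
            model=engine,
            messages=[{
                "role": "user",
                "content": f"Write {num_queries} web search queries, one per line, for: {prompt}",
            }],
            temperature=temperature,  # forwarded, no longer hard-coded
            max_tokens=max_tokens,
            n=1,
            timeout=150,
        )
        return [q.strip() for q in response.choices[0].message.content.splitlines() if q.strip()]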
@@ -245,7 +247,13 @@
        return queries.queries, counter_callback
    return queries.queries, None

-def search_google(query: str, api_key: str, engine: str, num: int) -> List[str]:
+def search_google(
+    query: str,
+    api_key: str,
+    engine: str,
+    num: int
+) -> List[str]:
    """Search Google for the given query."""
    service = build("customsearch", "v1", developerKey=api_key)
    search = (
        service.cse()


def find_similar_chunks(
-    query: str, docs_with_embeddings: List[Document], k: int = 4
+    query: str,
+    docs_with_embeddings: List[Document],
+    k: int = 4
) -> List:
    """Similarity search to find similar chunks to a query"""

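find_similar_chunks is the retrieval step of the RAG pipeline this PR repairs, and faiss-cpu==1.7.4 is pinned in component.yaml for exactly this. A hedged sketch of what a k-nearest-chunk lookup typically looks like (the Document shape and the use of raw L2 distance are assumptions, not the tool's actual implementation):

    from dataclasses import dataclass

    import faiss
    import numpy as np

    @dataclass
    class Document:  # assumed shape
        text: str
        url: str = ""
        embedding: np.ndarray = None

    def find_similar_chunks_sketch(query_embedding: np.ndarray, docs: list, k: int = 4) -> list:
        """Return the k documents whose embeddings are closest to the query (L2)."""
        index = faiss.IndexFlatL2(len(query_embedding))
        index.add(np.stack([d.embedding for d in docs]).astype("float32"))
        _, idxs = index.search(query_embedding.reshape(1, -1).astype("float32"), k)
        return [docs[i] for i in idxs[0] if i != -1]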
@@ -439,6 +449,8 @@ def fetch_additional_information(
    num_words: Optional[int] = None,
    num_urls: Optional[int] = None,
    num_queries: Optional[int] = DEFAULT_NUM_QUERIES,
+    temperature: Optional[float] = DEFAULT_OPENAI_SETTINGS["temperature"],
+    max_tokens: Optional[int] = DEFAULT_OPENAI_SETTINGS["max_tokens"],
) -> Tuple:
    """Fetch additional information from the web."""

@@ -449,6 +461,8 @@
        engine=engine,
        num_queries=num_queries,
        counter_callback=counter_callback,
+        temperature=temperature,
+        max_tokens=max_tokens,
    )
    print(f"Queries: {queries}")

@@ -469,11 +483,11 @@
            urls=urls,
        )
    else:
-        texts = []
+        docs = []
        for url, content in islice(source_links.items(), num_urls or len(source_links)):
-            doc = {}
-            doc['text'], doc['url'] = extract_text(html=content, num_words=num_words), url
-            texts.append(doc)
+            doc = extract_text(html=content, num_words=num_words)
+            doc.url = url
+            docs.append(doc)

    # Remove None values from the list
    docs = [doc for doc in docs if doc]
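This hunk is the core of the RAG fix named in the PR title: the old loop accumulated plain dicts into a texts list while the very next cleanup line filtered a docs name, so the extracted text never survived. The new code keeps a single docs list of Document objects returned by extract_text and tags each with its source URL. A sketch of the repaired flow (the Document class and the extract_text stub are assumptions standing in for the tool's real readability/markdownify pipeline):

    from dataclasses import dataclass
    from itertools import islice

    @dataclass
    class Document:  # assumed shape
        text: str
        url: str = ""

    def extract_text(html: str, num_words: int = None) -> Document:
        """Stub: the real tool strips boilerplate with readability-lxml + markdownify."""
        words = html.split()
        return Document(text=" ".join(words[:num_words] if num_words else words))

    def collect_docs(source_links: dict, num_urls: int = None, num_words: int = None) -> list:
        docs = []
        for url, content in islice(source_links.items(), num_urls or len(source_links)):
            doc = extract_text(html=content, num_words=num_words)
            doc.url = url  # remember provenance for citations
            docs.append(doc)
        return [doc for doc in docs if doc]  # drop any None entries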
@@ -553,12 +567,23 @@ def adjust_additional_information(
    return additional_information


-def run(**kwargs) -> Tuple[str, Optional[Dict[str, Any]]]:
+def extract_question(prompt: str) -> str:
+    pattern = r'\"(.*?)\"'
+    try:
+        question = re.findall(pattern, prompt)[0]
+    except Exception as e:
+        print(f"Error extracting question: {e}")
+        question = prompt
+
+    return question
+
+
+def run(**kwargs) -> Tuple[str, Optional[str], Optional[Dict[str, Any]], Any]:
"""Run the task"""
with OpenAIClientManager(kwargs["api_keys"]["openai"]):

tool = kwargs["tool"]
prompt = kwargs["prompt"]
prompt = extract_question(kwargs["prompt"])
max_tokens = kwargs.get("max_tokens", DEFAULT_OPENAI_SETTINGS["max_tokens"])
temperature = kwargs.get("temperature", DEFAULT_OPENAI_SETTINGS["temperature"])
num_words = kwargs.get("num_words", None)
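The new extract_question helper, wired in through the prompt = extract_question(kwargs["prompt"]) change above, strips mech boilerplate by grabbing the first double-quoted span and falling back to the full prompt when nothing is quoted. For example (sample prompts invented for illustration):

    >>> extract_question('Please predict: "Will BTC close above $70k on 31 March 2024?"')
    'Will BTC close above $70k on 31 March 2024?'
    >>> extract_question('no quoted question in this prompt')
    'no quoted question in this prompt'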
@@ -572,7 +597,6 @@
raise ValueError(f"Tool {tool} is not supported.")

engine = TOOL_TO_ENGINE[tool]

additional_information, counter_callback = fetch_additional_information(
client=client,
prompt=prompt,
@@ -584,6 +608,8 @@
            num_words=num_words,
            num_urls=num_urls,
            num_queries=num_queries,
+            temperature=temperature,
+            max_tokens=max_tokens,
        )
        additional_information = adjust_additional_information(
            prompt,
@@ -592,7 +618,8 @@
            engine
        )
        prediction_prompt = PREDICTION_PROMPT.format(
-            user_prompt=prompt, additional_information=additional_information
+            user_prompt=prompt,
+            additional_information=additional_information
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
@@ -607,6 +634,7 @@
            timeout=150,
            stop=None,
            functions=[Results.openai_schema],
+            function_call={'name': 'Results'}
        )
        results = str(Results.from_response(response))

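Adding function_call={'name': 'Results'} forces the model to respond through the Results function schema rather than optionally falling back to free text, which is what makes Results.from_response safe to call unconditionally. A bare-API sketch of the same forcing with the OpenAI v1 client (the stand-in schema and its field names are assumptions about the prediction output, not the tool's actual Results model):

    import json
    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set
    results_schema = {  # stand-in for Results.openai_schema
        "name": "Results",
        "parameters": {
            "type": "object",
            "properties": {
                "p_yes": {"type": "number"},
                "p_no": {"type": "number"},
                "confidence": {"type": "number"},
            },
            "required": ["p_yes", "p_no", "confidence"],
        },
    }
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "user", "content": 'Estimate: "Will it rain in Paris tomorrow?"'}],
        functions=[results_schema],
        function_call={"name": "Results"},  # force the schema; no free-text fallback
    )
    results = json.loads(response.choices[0].message.function_call.arguments)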
@@ -623,7 +651,6 @@
                output_tokens=response.usage.completion_tokens,
                model=engine,
                token_counter=count_tokens,
-
            )
            return results, prediction_prompt, None, counter_callback

        return results, prediction_prompt, None, counter_callback
20 changes: 20 additions & 0 deletions packages/napthaai/customs/prediction_request_reasoning/__init__.py
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
#
# Copyright 2024 Valory AG
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ------------------------------------------------------------------------------

"""This module contains the resolve market reasoning tool."""
@@ -0,0 +1,37 @@
name: prediction_request_reasoning
author: napthaai
version: 0.1.0
type: custom
description: A tool that reasons over extracted information.
license: Apache-2.0
aea_version: '>=1.0.0, <2.0.0'
fingerprint:
  __init__.py: bafybeib36ew6vbztldut5xayk5553rylrq7yv4cpqyhwc5ktvd4cx67vwu
  prediction_request_reasoning.py: bafybeien7m2b5ejuvxtrkl32ws5tkrek2jfcksxxf7tawo2hh4lhbmagee
fingerprint_ignore_patterns: []
entry_point: prediction_request_reasoning.py
callable: run
dependencies:
  google-api-python-client:
    version: ==2.95.0
  googlesearch-python:
    version: ==1.2.3
  requests: {}
  pydantic:
    version: '>=1.9.0,<3'
  faiss-cpu:
    version: ==1.7.4
  tiktoken:
    version: ==0.5.1
  markdownify:
    version: ==0.11.6
  openai:
    version: ==1.11.0
  docstring-parser:
    version: ==0.15
  readability-lxml:
    version: ==0.8.1
  pypdf2:
    version: ==3.0.1
  numpy:
    version: '>=1.19.0'