diff --git a/.gitignore b/.gitignore
index 89e71145..757cf2f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,5 @@ leak_report
 agent/
 backup_mech/
 /packages/valory/skills/termination_abci/
+/pip
+/tool_test.py
diff --git a/tools/prediction_request/prediction_request.py b/tools/prediction_request/prediction_request.py
index 8025c221..c4b3fad5 100644
--- a/tools/prediction_request/prediction_request.py
+++ b/tools/prediction_request/prediction_request.py
@@ -156,6 +156,10 @@ def count_tokens(text: str, model: str) -> int:
      the probability that the event in "USER_PROMPT" occurs. You must provide original information in each query, and they should not overlap or lead to obtain the same set of results.
 * Output only the JSON object. Do not include any other contents in your response.
+* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
+* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
+* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
+* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
 """
diff --git a/tools/prediction_request_claude/prediction_request_claude.py b/tools/prediction_request_claude/prediction_request_claude.py
index a2461491..ca677f97 100644
--- a/tools/prediction_request_claude/prediction_request_claude.py
+++ b/tools/prediction_request_claude/prediction_request_claude.py
@@ -119,7 +119,7 @@
 * Never use Markdown syntax highlighting, such as ```json```. Only output the raw json string.
 * This is incorrect:"```json{{\n \"queries\": [\"term1\", \"term2\"]}}```"
 * This is incorrect:```json"{{\n \"queries\": [\"term1\", \"term2\"]}}"```
-* This is correct:"{{\n \"quries\": [\"term1\", \"term2\"]}}"
+* This is correct:"{{\n \"queries\": [\"term1\", \"term2\"]}}"
 """

 ASSISTANT_TEXT = "```json"
diff --git a/tools/prediction_request_embedding/prediction_sentence_embedding.py b/tools/prediction_request_embedding/prediction_sentence_embedding.py
index d4262eeb..1fa18ab5 100644
--- a/tools/prediction_request_embedding/prediction_sentence_embedding.py
+++ b/tools/prediction_request_embedding/prediction_sentence_embedding.py
@@ -144,6 +144,10 @@ def count_tokens(text: str, model: str) -> int:
    - "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).
 * The sum of "p_yes" and "p_no" must equal 1.
 * Output only the JSON object in your response. Do not include any other contents in your response.
+* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
+* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
+* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
+* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
 """

 URL_QUERY_PROMPT = """
diff --git a/tools/prediction_request_sme/prediction_request_sme.py b/tools/prediction_request_sme/prediction_request_sme.py
index 363da469..e62dbdf0 100644
--- a/tools/prediction_request_sme/prediction_request_sme.py
+++ b/tools/prediction_request_sme/prediction_request_sme.py
@@ -113,6 +113,10 @@ def count_tokens(text: str, model: str) -> int:
      0 indicates lowest utility; 1 maximum utility.
 * The sum of "p_yes" and "p_no" must equal 1.
 * Output only the JSON object. Do not include any other contents in your response.
+* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
+* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
+* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
+* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
 """

 URL_QUERY_PROMPT = """
diff --git a/tools/prediction_sum_url_content/prediction_sum_url_content.py b/tools/prediction_sum_url_content/prediction_sum_url_content.py
index d411584c..e110e839 100644
--- a/tools/prediction_sum_url_content/prediction_sum_url_content.py
+++ b/tools/prediction_sum_url_content/prediction_sum_url_content.py
@@ -132,6 +132,10 @@ def count_tokens(text: str, model: str) -> int:
    - "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the prediction ranging from 0 (lowest utility) to 1 (maximum utility).
 * The sum of "p_yes" and "p_no" must equal 1.
 * Output only the JSON object in your response. Do not include any other contents in your response.
+* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
+* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
+* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
+* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
 """

 URL_QUERY_PROMPT = """
diff --git a/tools/prediction_with_research_report/prediction_with_research_report.py b/tools/prediction_with_research_report/prediction_with_research_report.py
index 998232be..7e90818e 100644
--- a/tools/prediction_with_research_report/prediction_with_research_report.py
+++ b/tools/prediction_with_research_report/prediction_with_research_report.py
@@ -22,6 +22,8 @@
 from datetime import datetime, timezone
 from typing import Any, Callable, Dict, Optional, Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import openai
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from openai import OpenAI
 from pydantic import BaseModel
@@ -31,7 +33,7 @@
 import requests
 from bs4 import BeautifulSoup
 from requests import Response
-from chromadb import Collection, EphemeralClient
+from chromadb import Collection, EphemeralClient, Documents, Embeddings
 import chromadb.utils.embedding_functions as embedding_functions
 from tiktoken import encoding_for_model

@@ -112,6 +114,10 @@
    - "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).
 * The sum of "p_yes" and "p_no" must equal 1.
 * Output only the JSON object in your response. Do not include any other contents in your response.
+* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
+* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
+* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
+* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
 """
@@ -158,8 +164,32 @@
 Use markdown syntax. Include as much relevant information as possible and try not to summarize.
 """
+
+class CustomOpenAIEmbeddingFunction(embedding_functions.OpenAIEmbeddingFunction):
+    """Custom OpenAI embedding function"""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize custom OpenAI embedding function"""
+        super().__init__(*args, **kwargs)
+        # OpenAI@v1 compatible
+        self._client = openai.embeddings
+
+    def __call__(self, texts: Documents) -> Embeddings:
+        """Return embedding"""
+        # replace newlines, which can negatively affect performance.
+        texts = [t.replace("\n", " ") for t in texts]
+
+        # Call the OpenAI Embedding API
+        embeddings = self._client.create(input=texts, model=self._model_name).data
+
+        # Sort resulting embeddings by index
+        sorted_embeddings = sorted(embeddings, key=lambda e: e.index)  # type: ignore
+
+        # Return just the embeddings
+        return [result.embedding for result in sorted_embeddings]
+
+
 # MODELS
+MAX_TEXT_LENGTH = 7500
+
 class WebSearchResult(BaseModel):
     title: str
@@ -293,7 +323,7 @@ def search(queries: list[str], api_key: str, filter = lambda x: True) -> list[tu
 def create_embeddings_from_results(results: list[WebScrapeResult], text_splitter, api_key: str) -> Collection:
     client = EphemeralClient()
-    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+    openai_ef = CustomOpenAIEmbeddingFunction(
         api_key=api_key,
         model_name="text-embedding-ada-002"
     )
@@ -307,12 +337,13 @@ def create_embeddings_from_results(results: list[WebScrapeResult], text_splitter
     for scrape_result in results:
         text_splits = text_splitter.split_text(scrape_result.content)
-        texts += text_splits
+        if not len(texts + text_splits) > MAX_TEXT_LENGTH:
+            texts += text_splits
         metadatas += [scrape_result.dict() for _ in text_splits]

     collection.add(
         documents=texts,
-        metadatas=metadatas, # type: ignore
+        metadatas=metadatas,  # type: ignore
         ids=[f'id{i}' for i in range(len(texts))]
     )
     return collection
diff --git a/tox.ini b/tox.ini
index c77c82db..8f40a304 100644
--- a/tox.ini
+++ b/tox.ini
@@ -532,4 +532,7 @@ tqdm: >=4.56.0
 blspy: >=1.0.16
 ; sub-dep of cosmos
 hypothesis: ==6.21.6
-; we don't modify
\ No newline at end of file
+; sub-dep of chromadb, has Apache 2.0 Licence https://github.com/chroma-core/hnswlib/blob/master/LICENSE
+chroma-hnswlib: ==0.7.3
+; sub-dep of chromadb, has Apache 2.0 Licence https://github.com/apache/pulsar-client-python/blob/main/LICENSE
+pulsar-client: ==3.4.0
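
A minimal usage sketch for the CustomOpenAIEmbeddingFunction added in tools/prediction_with_research_report/prediction_with_research_report.py, not part of the patch itself. It assumes openai>=1.0 and chromadb are installed and OPENAI_API_KEY is set; the import path, collection name, and sample documents are illustrative only.

    import os

    from chromadb import EphemeralClient

    # Hypothetical import path; adjust to however the tool module is loaded.
    from tools.prediction_with_research_report.prediction_with_research_report import (
        CustomOpenAIEmbeddingFunction,
    )

    # Same construction that create_embeddings_from_results now uses.
    openai_ef = CustomOpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-ada-002",
    )

    # Embed two sample chunks into an in-memory collection.
    client = EphemeralClient()
    collection = client.create_collection(
        name="web_search_results",  # illustrative name
        embedding_function=openai_ef,
    )
    collection.add(
        documents=["First scraped chunk.", "Second scraped chunk."],
        ids=["id0", "id1"],
    )
    print(collection.count())  # expected: 2

The override matters because openai>=1.0 removed the legacy openai.Embedding module that chromadb's bundled OpenAIEmbeddingFunction calls into; pointing self._client at openai.embeddings keeps the same __call__ flow working against the v1 client.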