Fix/ tools #169 (Merged)
merged 5 commits into from Feb 20, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -52,3 +52,5 @@ leak_report
agent/
backup_mech/
/packages/valory/skills/termination_abci/
/pip
/tool_test.py
4 changes: 4 additions & 0 deletions tools/prediction_request/prediction_request.py
@@ -156,6 +156,10 @@ def count_tokens(text: str, model: str) -> int:
the probability that the event in "USER_PROMPT" occurs. You must provide original information in each query, and they should not overlap
or lead to obtaining the same set of results.
* Output only the JSON object. Do not include any other contents in your response.
* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
"""


@@ -119,7 +119,7 @@
* Never use Markdown syntax highlighting, such as ```json```. Only output the raw json string.
* This is incorrect:"```json{{\n \"queries\": [\"term1\", \"term2\"]}}```"
* This is incorrect:```json"{{\n \"queries\": [\"term1\", \"term2\"]}}"```
- * This is correct:"{{\n \"quries\": [\"term1\", \"term2\"]}}"
+ * This is correct:"{{\n \"queries\": [\"term1\", \"term2\"]}}"
Review comment (Collaborator, Author): Typo, which might've played a role in invalid responses from Claude.

"""

ASSISTANT_TEXT = "```json"
@@ -144,6 +144,10 @@ def count_tokens(text: str, model: str) -> int:
- "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).
* The sum of "p_yes" and "p_no" must equal 1.
* Output only the JSON object in your response. Do not include any other contents in your response.
* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
"""

URL_QUERY_PROMPT = """
Expand Down
4 changes: 4 additions & 0 deletions tools/prediction_request_sme/prediction_request_sme.py
@@ -113,6 +113,10 @@ def count_tokens(text: str, model: str) -> int:
0 indicates lowest utility; 1 maximum utility.
* The sum of "p_yes" and "p_no" must equal 1.
* Output only the JSON object. Do not include any other contents in your response.
* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
"""

URL_QUERY_PROMPT = """
@@ -132,6 +132,10 @@ def count_tokens(text: str, model: str) -> int:
- "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the prediction ranging from 0 (lowest utility) to 1 (maximum utility).
* The sum of "p_yes" and "p_no" must equal 1.
* Output only the JSON object in your response. Do not include any other contents in your response.
* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
"""

URL_QUERY_PROMPT = """
@@ -22,6 +22,8 @@
from datetime import datetime, timezone
from typing import Any, Callable, Dict, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pydantic import BaseModel
@@ -31,7 +33,7 @@
import requests
from bs4 import BeautifulSoup
from requests import Response
- from chromadb import Collection, EphemeralClient
+ from chromadb import Collection, EphemeralClient, Documents, Embeddings
import chromadb.utils.embedding_functions as embedding_functions
from tiktoken import encoding_for_model

@@ -112,6 +114,10 @@
- "info_utility": Utility of the information provided in "ADDITIONAL_INFORMATION" to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).
* The sum of "p_yes" and "p_no" must equal 1.
* Output only the JSON object in your response. Do not include any other contents in your response.
* Never use Markdown syntax highlighting, such as ```json``` to surround the output. Only output the raw json string.
* This is incorrect:"```json{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}```"
* This is incorrect:```json"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"```
* This is correct:"{{\n \"p_yes\": 0.2,\n \"p_no\": 0.8,\n \"confidence\": 0.7,\n \"info_utility\": 0.5\n}}"
"""


@@ -158,8 +164,32 @@
Use markdown syntax. Include as much relevant information as possible and try not to summarize.
"""

class CustomOpenAIEmbeddingFunction(embedding_functions.OpenAIEmbeddingFunction):
"""Custom OpenAI embedding function"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
"""Initialize custom OpenAI embedding function"""
super().__init__(*args, **kwargs)
# OpenAI@v1 compatible
self._client = openai.embeddings

def __call__(self, texts: Documents) -> Embeddings:
"""Return embedding"""
# replace newlines, which can negatively affect performance.
texts = [t.replace("\n", " ") for t in texts]

# Call the OpenAI Embedding API
embeddings = self._client.create(input=texts, model=self._model_name).data

# Sort resulting embeddings by index
sorted_embeddings = sorted(embeddings, key=lambda e: e.index) # type: ignore

# Return just the embeddings
return [result.embedding for result in sorted_embeddings]
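The sort-by-index step above matters because the embeddings endpoint does not guarantee that results come back in input order. A standalone sketch of that logic, using a stub client (an assumption, purely for illustration) in place of real API calls:

```python
from types import SimpleNamespace


class StubEmbeddingsClient:
    """Hypothetical stand-in for openai.embeddings; returns items out of order."""

    def create(self, input, model):
        # Simulate the API returning results in a different order than the inputs.
        data = [
            SimpleNamespace(index=1, embedding=[0.2, 0.2]),
            SimpleNamespace(index=0, embedding=[0.1, 0.1]),
        ]
        return SimpleNamespace(data=data)


client = StubEmbeddingsClient()
response = client.create(input=["first text", "second text"], model="text-embedding-ada-002")
# Sort by index so each embedding lines up with its input text, as the class above does.
sorted_embeddings = sorted(response.data, key=lambda e: e.index)
embeddings = [e.embedding for e in sorted_embeddings]
print(embeddings)  # [[0.1, 0.1], [0.2, 0.2]]
```

Without the sort, the first text would be paired with the second text's embedding.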
Review comment on lines +168 to +188 (Collaborator, Author): Required, as the default one is not compatible with openai@v1.

# MODELS

MAX_TEXT_LENGTH = 7500
Review comment (Collaborator, Author): @namesty, we need to make sure the length of the text doesn't exceed the limitations of the model. I've added this here for now; please propose a more appropriate number or algorithm if you think this isn't OK.
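One possible refinement of the cap discussed in the comment above, sketched here as an assumption rather than the PR's actual approach (`add_within_budget` is an illustrative name): accumulate chunks against a total character budget instead of skipping an entire batch of splits.

```python
MAX_TEXT_LENGTH = 7500  # same cap as in the diff above


def add_within_budget(texts: list[str], new_splits: list[str]) -> list[str]:
    """Append chunks until the total character budget would be exceeded."""
    used = sum(len(t) for t in texts)
    for chunk in new_splits:
        if used + len(chunk) > MAX_TEXT_LENGTH:
            break  # stop at the first chunk that would overflow the budget
        texts.append(chunk)
        used += len(chunk)
    return texts


print(len(add_within_budget([], ["a" * 5000, "b" * 3000, "c" * 100])))  # 1
```

A token-count budget (e.g. via the `encoding_for_model` import already in this file) would track the model's real limit more closely than characters, at the cost of an extra encoding pass.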



class WebSearchResult(BaseModel):
title: str
@@ -293,7 +323,7 @@ def search(queries: list[str], api_key: str, filter = lambda x: True) -> list[tu

def create_embeddings_from_results(results: list[WebScrapeResult], text_splitter, api_key: str) -> Collection:
client = EphemeralClient()
- openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+ openai_ef = CustomOpenAIEmbeddingFunction(
api_key=api_key,
model_name="text-embedding-ada-002"
)
@@ -307,12 +337,13 @@ def create_embeddings_from_results(

for scrape_result in results:
text_splits = text_splitter.split_text(scrape_result.content)
- texts += text_splits
+ if not len(texts + text_splits) > MAX_TEXT_LENGTH:
+     texts += text_splits
metadatas += [scrape_result.dict() for _ in text_splits]

collection.add(
documents=texts,
metadatas=metadatas, # type: ignore
ids=[f'id{i}' for i in range(len(texts))]
)
return collection
5 changes: 4 additions & 1 deletion tox.ini
@@ -532,4 +532,7 @@ tqdm: >=4.56.0
blspy: >=1.0.16
; sub-dep of cosmos
hypothesis: ==6.21.6
- ; we don't modify
+ ; sub-dep of chromadb, has Apache 2.0 Licence https://github.com/chroma-core/hnswlib/blob/master/LICENSE
+ chroma-hnswlib: ==0.7.3
+ ; sub-dep of chromadb, has Apache 2.0 Licence https://github.com/apache/pulsar-client-python/blob/main/LICENSE
+ pulsar-client: ==3.4.0