Merge pull request #155 from valory-xyz/test/benchmark

proposal changes for benchmark runs
valory-xyz · Feb 7, 2024 · 3e152f9 · 3e152f9
2 parents 0f36b71 + aba3a6f
commit 3e152f9
Show file tree

Hide file tree

Showing 17 changed files with 881 additions and 576 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,58 @@
+[MASTER]
+ignore-patterns=.*_pb2.py,contract_dispatcher.py,test_abci_messages.py,test_tendermint_messages.py
+ignore=packages/valory/protocols/abci,packages/valory/connections/abci/gogoproto
+
+[MESSAGES CONTROL]
+disable=C0103,R0801,C0301,C0201,C0204,C0209,W1203,C0302,R1735,R1729,W0511,E0611
+
+# See here for more options: https://www.codeac.io/documentation/pylint-configuration.html
+R1735: use-dict-literal
+R1729: use-a-generator
+C0103: invalid-name
+C0201: consider-iterating-dictionary
+W1203: logging-fstring-interpolation
+C0204: bad-mcs-classmethod-argument
+C0209: consider-using-f-string
+C0301: http://pylint-messages.wikidot.com/messages:c0301 > Line too long
+C0302: http://pylint-messages.wikidot.com/messages:c0302 > Too many lines in module
+R0801: similar lines
+E0611: no-name-in-module
+
+[IMPORTS]
+ignored-modules=pandas,numpy,aea_cli_ipfs,compose,multidict,gql,anthropic,tiktoken
+
+[DESIGN]
+# min-public-methods=1
+max-public-methods=58
+# max-returns=10
+# max-bool-expr=7
+max-args=6
+# max-locals=31
+# max-statements=80
+max-parents=10
+max-branches=36
+max-attributes=8
+
+[REFACTORING]
+# max-nested-blocks=6
+
+[SPELLING]
+# uncomment to enable
+# spelling-dict=en_US
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=nocover,pragma,params,noqa,kwargs,str,async,json,boolean,config,pytest,args,url,tx,jsonschema,traceback,api,nosec
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=10
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
diff --git a/packages/packages.json b/packages/packages.json
@@ -2,13 +2,13 @@
     "dev": {
         "connection/valory/websocket_client/0.1.0": "bafybeiflmystocxaqblhpzqlcop2vkhsknpzjx2jomohomaxamwskeokzm",
         "skill/valory/contract_subscription/0.1.0": "bafybeicyugrkx5glat4p4ezwf6i7oduh26eycfie6ftd4uxrknztzl3ik4",
-        "agent/valory/mech/0.1.0": "bafybeibhm744jats4jdpavgfrwxgetsolyg5pcy554qcqibx7fhrln4x5q",
+        "agent/valory/mech/0.1.0": "bafybeiepdstu23qid2pcrugkpwtibh7jhlshmnmclwwbgjmefio27cp4im",
         "skill/valory/mech_abci/0.1.0": "bafybeieimp7xzxcnbzsuunf2xkcy5juulhmzsmkq2v3sw3o3lgssb53cnu",
         "contract/valory/agent_mech/0.1.0": "bafybeiepxumywg6z2zapqzc3bg3iey23cmlgjzxisqox5j74o5i2texr5e",
-        "service/valory/mech/0.1.0": "bafybeiadmouelgxqwa6qmeosrsf7emcpxfa6m6qobg4oxp4ra73rfh5x6y",
+        "service/valory/mech/0.1.0": "bafybeifzkrmgejdce5nvp7s63dpvthde6wn6etolesh4dmf5pno7jplzcy",
         "protocol/valory/acn_data_share/0.1.0": "bafybeih5ydonnvrwvy2ygfqgfabkr47s4yw3uqxztmwyfprulwfsoe7ipq",
         "skill/valory/task_submission_abci/0.1.0": "bafybeib4m2bwgchloqss3wotsx4rz7qqkwydaesiqkls2zq7zbtp6jtpsi",
-        "skill/valory/task_execution/0.1.0": "bafybeib7zgltwvd5mzji7z4zcplhd4yg5nqc5ecnkg7ldgln4qvztue7aq",
+        "skill/valory/task_execution/0.1.0": "bafybeicthrgfdv6q56htrsradow445smojjk2zqqizm4cdxyfxfor22vyy",
         "contract/valory/agent_registry/0.1.0": "bafybeiargayav6yiztdnwzejoejstcx4idssch2h4f5arlgtzj3tgsgfmu",
         "protocol/valory/websocket_client/0.1.0": "bafybeih43mnztdv3v2hetr2k3gezg7d3yj4ur7cxdvcyaqhg65e52s5sf4",
         "skill/valory/websocket_client/0.1.0": "bafybeidwntmkk4b2ixq5454ycbkknclqx7a6vpn7aqpm2nw3duszqrxvta",

diff --git a/packages/valory/agents/mech/aea-config.yaml b/packages/valory/agents/mech/aea-config.yaml
@@ -42,7 +42,7 @@ skills:
 - valory/registration_abci:0.1.0:bafybeic2ynseiak7jpta7jfwuqwyp453b4p7lolr4wihxmpn633uekv5am
 - valory/reset_pause_abci:0.1.0:bafybeidzajbe3erygeh2xbd6lrjv7nsptznjuzrt24ykgvhgotdeyhfnba
 - valory/subscription_abci:0.1.0:bafybeigaxq7m2dqv2huhg5jvb4jx3rysqwvvjj2xhojow3t3zzuwq2k4ie
-- valory/task_execution:0.1.0:bafybeib7zgltwvd5mzji7z4zcplhd4yg5nqc5ecnkg7ldgln4qvztue7aq
+- valory/task_execution:0.1.0:bafybeicthrgfdv6q56htrsradow445smojjk2zqqizm4cdxyfxfor22vyy
 - valory/task_submission_abci:0.1.0:bafybeib4m2bwgchloqss3wotsx4rz7qqkwydaesiqkls2zq7zbtp6jtpsi
 - valory/termination_abci:0.1.0:bafybeie4zvjfxvdu7qrulmur3chpjz3kpj5m4bjsxvpk4gvj5zbyyayfaa
 - valory/transaction_settlement_abci:0.1.0:bafybeiaefgqbs7zsn5xe5kdwrujj7ivygkn3ujpw6crnvi3knvxw75qmja

diff --git a/packages/valory/services/mech/service.yaml b/packages/valory/services/mech/service.yaml
@@ -7,7 +7,7 @@ license: Apache-2.0
 fingerprint:
   README.md: bafybeif7ia4jdlazy6745ke2k2x5yoqlwsgwr6sbztbgqtwvs3ndm2p7ba
 fingerprint_ignore_patterns: []
-agent: valory/mech:0.1.0:bafybeibhm744jats4jdpavgfrwxgetsolyg5pcy554qcqibx7fhrln4x5q
+agent: valory/mech:0.1.0:bafybeiepdstu23qid2pcrugkpwtibh7jhlshmnmclwwbgjmefio27cp4im
 number_of_agents: 4
 deployment:
   agent:

diff --git a/packages/valory/skills/task_execution/behaviours.py b/packages/valory/skills/task_execution/behaviours.py
@@ -48,6 +48,7 @@
 from packages.valory.protocols.ipfs.dialogues import IpfsDialogue
 from packages.valory.protocols.ledger_api import LedgerApiMessage
 from packages.valory.skills.task_execution.models import Params
+from packages.valory.skills.task_execution.utils.benchmarks import TokenCounterCallback
 from packages.valory.skills.task_execution.utils.ipfs import (
     ComponentPackageLoader,
     get_ipfs_file_hash,
@@ -291,8 +292,16 @@ def _handle_done_task(self, task_result: Any) -> None:
         }
         if task_result is not None:
             # task succeeded
-            deliver_msg, prompt, transaction = task_result
-            response = {**response, "result": deliver_msg, "prompt": prompt}
+            deliver_msg, prompt, transaction, counter_callback = task_result
+            cost_dict = {}
+            if counter_callback is not None:
+                cost_dict = cast(TokenCounterCallback, counter_callback).cost_dict
+            response = {
+                **response,
+                "result": deliver_msg,
+                "prompt": prompt,
+                "cost_dict": cost_dict,
+            }
             self._done_task["transaction"] = transaction
 
         self.context.logger.info(f"Task result for request {req_id}: {task_result}")
@@ -371,6 +380,7 @@ def _prepare_task(self, task_data: Dict[str, Any]) -> None:
         task_data["tool_py"] = tool_py
         task_data["callable_method"] = callable_method
         task_data["api_keys"] = self.params.api_keys
+        task_data["counter_callback"] = TokenCounterCallback()
         future = self._submit_task(tool_task.execute, **task_data)
         executing_task = cast(Dict[str, Any], self._executing_task)
         executing_task["timeout_deadline"] = time.time() + self.params.task_deadline

diff --git a/packages/valory/skills/task_execution/skill.yaml b/packages/valory/skills/task_execution/skill.yaml
@@ -7,11 +7,12 @@ license: Apache-2.0
 aea_version: '>=1.0.0, <2.0.0'
 fingerprint:
   __init__.py: bafybeidqhvvlnthkbnmrdkdeyjyx2f2ab6z4xdgmagh7welqnh2v6wczx4
-  behaviours.py: bafybeifa72egwarcmfneqmo3ak6wfygjc3i7hplxl6ptafb263vuaey3fm
+  behaviours.py: bafybeihprwot27csugwpoimsqcwprxu5bstqpvanot3nkmfgvhbr66ww4y
   dialogues.py: bafybeid4zxalqdlo5mw4yfbuf34hx4jp5ay5z6chm4zviwu4cj7fudtwca
   handlers.py: bafybeidbt5ezj74cgfogk3w4uw4si2grlnk5g54veyumw7g5yh6gdscywu
   models.py: bafybeihgclxctyltuehj2f4fzj26edptqugrrm4phd6ovuulezrqot6qo4
   utils/__init__.py: bafybeiccdijaigu6e5p2iruwo5mkk224o7ywedc7nr6xeu5fpmhjqgk24e
+  utils/benchmarks.py: bafybeihdutp44ds4cupszbd34gsmcw6fsdda2tzkh5b27fpg65ejbpdvdm
   utils/ipfs.py: bafybeidinbdqkidix44ibz5hug7inkcbijooag53gr5mtbaa72tk335uqq
   utils/task.py: bafybeieuziu7owtk543z3umgmayhjh67klftk7vrhz24l6rlaii5lvkqh4
 fingerprint_ignore_patterns: []
@@ -106,4 +107,8 @@ dependencies:
     version: ==0.2.1
   pyyaml:
     version: <=6.0.1,>=3.10
+  tiktoken:
+    version: ==0.5.1
+  anthropic:
+    version: ==0.3.11
 is_abstract: false
diff --git a/packages/valory/skills/task_execution/utils/benchmarks.py b/packages/valory/skills/task_execution/utils/benchmarks.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# ------------------------------------------------------------------------------
+#
+#   Copyright 2024 Valory AG
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+#
+# ------------------------------------------------------------------------------
+"""Benchmarking for tools."""
+
+import logging
+from typing import Any, Dict, Union
+
+import anthropic
+import tiktoken
+from tiktoken import Encoding
+
+
+PRICE_NUM_TOKENS = 1000
+
+
+def encoding_for_model(model: str) -> Encoding:
+    """Get the encoding for a model."""
+    return tiktoken.encoding_for_model(model)
+
+
+def count_tokens(text: str, model: str) -> int:
+    """Count the number of tokens in a text."""
+    if "claude" in model:
+        return anthropic.Anthropic().count_tokens(text)
+
+    enc = encoding_for_model(model)
+    return len(enc.encode(text))
+
+
+class TokenCounterCallback:
+    """Callback to count the number of tokens used in a generation."""
+
+    TOKEN_PRICES = {
+        "gpt-3.5-turbo": {"input": 0.001, "output": 0.002},
+        "gpt-4": {"input": 0.03, "output": 0.06},
+        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
+        "claude-2": {"input": 0.008, "output": 0.024},
+    }
+
+    def __init__(self) -> None:
+        """Initialize the callback."""
+        self.cost_dict: Dict[str, Union[int, float]] = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "total_tokens": 0,
+            "input_cost": 0,
+            "output_cost": 0,
+            "total_cost": 0,
+        }
+
+    @staticmethod
+    def token_to_cost(tokens: int, model: str, tokens_type: str) -> float:
+        """Converts a number of tokens to a cost in dollars."""
+        return (
+            tokens
+            / PRICE_NUM_TOKENS
+            * TokenCounterCallback.TOKEN_PRICES[model][tokens_type]
+        )
+
+    def calculate_cost(self, tokens_type: str, model: str, **kwargs: Any) -> None:
+        """Calculate the cost of a generation."""
+        # Check if it its prompt or tokens are passed in
+        prompt_key = f"{tokens_type}_prompt"
+        token_key = f"{tokens_type}_tokens"
+        if prompt_key in kwargs:
+            tokens = count_tokens(kwargs[prompt_key], model)
+        elif token_key in kwargs:
+            tokens = kwargs[token_key]
+        else:
+            logging.warning(f"No {token_key}_tokens or {tokens_type}_prompt found.")
+        cost = self.token_to_cost(tokens, model, tokens_type)
+        self.cost_dict[token_key] += tokens
+        self.cost_dict[f"{tokens_type}_cost"] += cost
+
+    def __call__(self, model: str, **kwargs: Any) -> None:
+        """Callback to count the number of tokens used in a generation."""
+        if model not in list(TokenCounterCallback.TOKEN_PRICES.keys()):
+            raise ValueError(f"Model {model} not supported.")
+        try:
+            self.calculate_cost("input", model, **kwargs)
+            self.calculate_cost("output", model, **kwargs)
+            self.cost_dict["total_tokens"] = (
+                self.cost_dict["input_tokens"] + self.cost_dict["output_tokens"]
+            )
+            self.cost_dict["total_cost"] = (
+                self.cost_dict["input_cost"] + self.cost_dict["output_cost"]
+            )
+        except Exception as e:
+            logging.error(f"Error in TokenCounterCallback: {e}")
diff --git a/tools/native_transfer_request/native_transfer_request.py b/tools/native_transfer_request/native_transfer_request.py
@@ -29,20 +29,22 @@
 
 client: Optional[OpenAI] = None
 
+class OpenAIClientManager:
+    """Client context manager for OpenAI."""
+    def __init__(self, api_key: str):
+        self.api_key = api_key
 
-def init_openai_client(api_key: str) -> OpenAI:
-    """Initialize the OpenAI client"""
-    global client
-    if client is None:
-        client = OpenAI(api_key=api_key)
-    return client
+    def __enter__(self) -> OpenAI:
+        global client
+        if client is None:
+            client = OpenAI(api_key=self.api_key)
+        return client
 
-def close_openai_client() -> None:
-    """Close the OpenAI client"""
-    global client
-    if client is not None:
-        client.close()
-        client = None
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        global client
+        if client is not None:
+            client.close()
+            client = None
 
 
 
@@ -102,7 +104,7 @@ def make_request_openai_request(
 def native_transfer(
     prompt: str,
     api_key: str,
-) -> Tuple[str, Optional[str], Optional[Dict[str, Any]]]:
+) -> Tuple[str, Optional[str], Optional[Dict[str, Any]], Any]:
     """Perform native transfer."""
     tool_prompt = NATIVE_TRANSFER_PROMPT.format(user_prompt=prompt)
     response = make_request_openai_request(prompt=tool_prompt)
@@ -118,26 +120,25 @@ def native_transfer(
         "to": str(parsed_txs["to"]),
         "value": int(parsed_txs["wei_value"]),
     }
-    return response, prompt, transaction
+    return response, prompt, transaction, None
 
 
 AVAILABLE_TOOLS = {
     "native": native_transfer,
 }
 
 
-def run(**kwargs) -> Tuple[str, Optional[str], Optional[Dict[str, Any]]]:
+def run(**kwargs) -> Tuple[str, Optional[str], Optional[Dict[str, Any]], Any]:
     """Run the task"""
-    init_openai_client(kwargs["api_keys"]["openai"])
+    with OpenAIClientManager(kwargs["api_keys"]["openai"]):
 
-    prompt = kwargs["prompt"]
-    api_key = kwargs["api_keys"]["openai"]
-    tool = cast(str, kwargs["tool"]).replace(TOOL_PREFIX, "")
+        prompt = kwargs["prompt"]
+        api_key = kwargs["api_keys"]["openai"]
+        tool = cast(str, kwargs["tool"]).replace(TOOL_PREFIX, "")
 
-    if tool not in AVAILABLE_TOOLS:
-        return f"No tool named `{kwargs['tool']}`", None, None
+        if tool not in AVAILABLE_TOOLS:
+            return f"No tool named `{kwargs['tool']}`", None, None, None
 
-    transaction_builder = AVAILABLE_TOOLS[tool]
-    response = transaction_builder(prompt, api_key)
-    close_openai_client()
-    return response
+        transaction_builder = AVAILABLE_TOOLS[tool]
+        response = transaction_builder(prompt, api_key)
+        return response