From 72f405880db879953e75c3dc852be83615fc5ec4 Mon Sep 17 00:00:00 2001
From: Onur ULUSOY
Date: Tue, 10 Dec 2024 16:55:08 +0300
Subject: [PATCH] feat: Add screenshot tool for capturing and analyzing images

---
Reviewer note: the display_tools.py hunk was edited by hand (the undefined
`screenshot_path` check now tests `the_base64`), so the post-image blob id
on its index line no longer matches the patched file.

 gpt_computer_assistant/agent/agent.py   |  2 +-
 gpt_computer_assistant/display_tools.py | 71 +++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/gpt_computer_assistant/agent/agent.py b/gpt_computer_assistant/agent/agent.py
index 15612055c..4b0ec126b 100644
--- a/gpt_computer_assistant/agent/agent.py
+++ b/gpt_computer_assistant/agent/agent.py
@@ -74,7 +74,7 @@ def get_agent_executor(the_anthropic_model=False):
         print("Anthropic tools len", len(tools))
         return chat_agent_executor.create_tool_calling_executor(model_catch, tools)
     else:
-        tools += [mouse_scroll, click_to_text, click_to_icon, click_to_area] + mcp_tools() + get_standard_tools()
+        tools += [mouse_scroll, click_to_text, click_to_icon, click_to_area, screenshot] + mcp_tools() + get_standard_tools()
 
 
 
diff --git a/gpt_computer_assistant/display_tools.py b/gpt_computer_assistant/display_tools.py
index d95b696ba..270e08d43 100644
--- a/gpt_computer_assistant/display_tools.py
+++ b/gpt_computer_assistant/display_tools.py
@@ -160,3 +160,74 @@ def click_to_area_(
 
 
 click_to_area = tool(click_to_area_)
+
+
+def screenshot_(checking: str):
+    """
+    Returns the current screenshot. Explain what should we check on the screenshot.
+
+    Args:
+        checking: What the model should look for or verify on the screen.
+
+    Returns:
+        The text of the model's final reply, or "No response " when that
+        reply is empty.
+    """
+
+    from langchain_core.messages import HumanMessage
+
+    # Try the package-relative imports first and fall back to the plain
+    # module names when they cannot be resolved.
+    try:
+        from .cu.computer import screenshot_action
+        from .agent.agent import get_agent_executor
+    except ImportError:
+        from cu.computer import screenshot_action
+        from agent.agent import get_agent_executor
+
+    the_base64 = screenshot_action(direct_base64=True)
+
+    # The instruction text always goes first in the message content.
+    the_message = [
+        {
+            "type": "text",
+            "text": f"Explain the image and check '{checking}' on the image.",
+        }
+    ]
+
+    # Attach the image only when the capture actually produced data.
+    if the_base64:
+        the_message.append(
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{the_base64}"},
+            }
+        )
+
+    # NOTE(review): `screenshot` is also registered in the tool list built by
+    # get_agent_executor(), so the executor invoked here may be able to call
+    # this tool again -- confirm that nested calls are acceptable.
+    msg = get_agent_executor().invoke(
+        {"messages": [HumanMessage(content=the_message)]}
+    )
+
+    return_value = msg["messages"][-1].content
+
+    # The content may be a list of parts rather than a plain string. Join
+    # the parts into one string, using the "text" field of dict parts that
+    # have one instead of the dict's repr.
+    if isinstance(return_value, list):
+        return_value = "".join(
+            str(part["text"])
+            if isinstance(part, dict) and "text" in part
+            else str(part)
+            for part in return_value
+        )
+
+    if return_value == "":
+        return_value = "No response "
+
+    return return_value
+
+
+screenshot = tool(screenshot_)
\ No newline at end of file