Skip to content

Commit

Permalink
feat: Add screenshot tool for capturing and analyzing images
Browse files Browse the repository at this point in the history
  • Loading branch information
onuratakan committed Dec 10, 2024
1 parent 3285a65 commit 72f4058
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 1 deletion.
2 changes: 1 addition & 1 deletion gpt_computer_assistant/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_agent_executor(the_anthropic_model=False):
print("Anthropic tools len", len(tools))
return chat_agent_executor.create_tool_calling_executor(model_catch, tools)
else:
tools += [mouse_scroll, click_to_text, click_to_icon, click_to_area] + mcp_tools() + get_standard_tools()
tools += [mouse_scroll, click_to_text, click_to_icon, click_to_area, screenshot] + mcp_tools() + get_standard_tools()



Expand Down
98 changes: 98 additions & 0 deletions gpt_computer_assistant/display_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,101 @@ def click_to_area_(


# Public tool object: click_to_area_ passed through the `tool` wrapper.
click_to_area = tool(click_to_area_)




def screenshot_(checking: str) -> str:
    """
    Returns the current screenshot. Explain what should we check on the screenshot.
    """
    # NOTE: the docstring above is kept verbatim on purpose -- this function is
    # wrapped with `tool(...)`, which presumably uses the docstring as the
    # description shown to the model (TODO confirm). Maintainer notes live here:
    #
    #   checking -- free-text description of what to look for on the screen;
    #               it is interpolated into the prompt sent with the image.
    #   returns  -- the text of the last message produced by the agent
    #               executor, or "No response " when that text is empty.
    #
    # NOTE(review): get_agent_executor() registers the `screenshot` tool itself,
    # so the inner agent may be able to call this tool again -- confirm that
    # unbounded recursion is not possible.

    # Imported lazily inside the function; `.agent.agent` also refers to the
    # `screenshot` tool, so a module-level import would presumably be circular.
    from langchain_core.messages import HumanMessage

    try:
        from .cu.computer import screenshot_action
        from .agent.agent import get_agent_executor
    except ImportError:
        # Fallback for running the module outside the package context.
        from cu.computer import screenshot_action
        from agent.agent import get_agent_executor

    # Presumably a base64-encoded PNG string (it is embedded in a
    # data:image/png URL below) -- TODO confirm against screenshot_action.
    the_base64 = screenshot_action(direct_base64=True)

    content = [
        {"type": "text", "text": f"Explain the image and check '{checking}' on the image."},
    ]

    # Fix: the original tested an undefined name (`screenshot_path`), raising
    # NameError on every call. Attach the image whenever a capture is available.
    if the_base64:
        content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{the_base64}"},
            }
        )

    msg = get_agent_executor().invoke(
        {"messages": [HumanMessage(content=content)]}
    )

    return_value = msg["messages"][-1].content

    # Message content may be a list of parts rather than a plain string;
    # flatten it by concatenating the string form of each part.
    if isinstance(return_value, list):
        return_value = "".join(str(part) for part in return_value)

    if return_value == "":
        return_value = "No response "

    return return_value



# Public tool object: screenshot_ passed through the `tool` wrapper. This is
# the name listed among the agent's tools in agent/agent.py.
screenshot = tool(screenshot_)

0 comments on commit 72f4058

Please sign in to comment.