Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WebSurfer Documentation and Fixes #4624

Merged
merged 17 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion python/packages/autogen-ext/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ web-surfer = [
"autogen-agentchat==0.4.0.dev11",
"playwright>=1.48.0",
"pillow>=11.0.0",
"markitdown>=0.0.1a2",
]
magentic-one = [
"autogen-agentchat==0.4.0.dev11",
Expand Down Expand Up @@ -77,7 +78,11 @@ testpaths = ["tests"]
include = "../../shared_tasks.toml"

[tool.poe.tasks]
test = "pytest -n auto"
test.sequence = [
"playwright install",
"pytest -n auto",
]
test.default_item_type = "cmd"
mypy = "mypy --config-file ../../pyproject.toml --exclude src/autogen_ext/runtimes/grpc/protos --exclude tests/protos src tests"

[tool.mypy]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ._multimodal_web_surfer import MultimodalWebSurfer
from .playwright_controller import PlaywrightController

__all__ = ["MultimodalWebSurfer"]
__all__ = ["MultimodalWebSurfer", "PlaywrightController"]

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import asyncio
import base64
import io
import os
import random
from typing import Any, Callable, Dict, Optional, Tuple, Union, cast

# TODO: Fix unfollowed import
try:
from markitdown import MarkItDown # type: ignore
except ImportError:
MarkItDown = None
from playwright._impl._errors import Error as PlaywrightError
from playwright._impl._errors import TimeoutError
from playwright.async_api import Download, Page
Expand All @@ -17,24 +23,36 @@


class PlaywrightController:
"""
A helper class to allow Playwright to interact with web pages to perform actions such as clicking, filling, and scrolling.

Args:
downloads_folder (str | None): The folder to save downloads to. If None, downloads are not saved.
animate_actions (bool): Whether to animate the actions (create fake cursor to click).
viewport_width (int): The width of the viewport.
viewport_height (int): The height of the viewport.
_download_handler (Optional[Callable[[Download], None]]): A function to handle downloads.
to_resize_viewport (bool): Whether to resize the viewport.
"""

def __init__(
self,
downloads_folder: str | None = None,
animate_actions: bool = False,
downloads_folder: Optional[str] = None,
viewport_width: int = 1440,
viewport_height: int = 900,
_download_handler: Optional[Callable[[Download], None]] = None,
to_resize_viewport: bool = True,
) -> None:
"""
A controller for Playwright to interact with web pages.
animate_actions: If True, actions will be animated.
downloads_folder: The folder to save downloads to.
viewport_width: The width of the viewport.
viewport_height: The height of the viewport.
_download_handler: A handler for downloads.
to_resize_viewport: If True, the viewport will be resized.
Initialize the PlaywrightController.
"""
assert isinstance(animate_actions, bool)
assert isinstance(viewport_width, int)
assert isinstance(viewport_height, int)
assert viewport_height > 0
assert viewport_width > 0

self.animate_actions = animate_actions
self.downloads_folder = downloads_folder
self.viewport_width = viewport_width
Expand All @@ -43,16 +61,33 @@ def __init__(
self.to_resize_viewport = to_resize_viewport
self._page_script: str = ""
self.last_cursor_position: Tuple[float, float] = (0.0, 0.0)
self._markdown_converter: Optional[Any] | None = None

# Read page_script
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
self._page_script = fh.read()

async def sleep(self, page: Page, duration: Union[int, float]) -> None:
    """
    Pause the execution for a specified duration.

    Args:
        page (Page): The Playwright page object.
        duration (Union[int, float]): The duration to sleep in seconds.
    """
    assert page is not None
    # wait_for_timeout expects milliseconds, so convert from seconds.
    await page.wait_for_timeout(duration * 1000)

async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion]:
"""
Retrieve interactive regions from the web page.

Args:
page (Page): The Playwright page object.

Returns:
Dict[str, InteractiveRegion]: A dictionary of interactive regions.
"""
assert page is not None
# Read the regions from the DOM
try:
Expand All @@ -71,6 +106,15 @@ async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion
return typed_results

async def get_visual_viewport(self, page: Page) -> VisualViewport:
"""
Retrieve the visual viewport of the web page.

Args:
page (Page): The Playwright page object.

Returns:
VisualViewport: The visual viewport of the page.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -79,6 +123,15 @@ async def get_visual_viewport(self, page: Page) -> VisualViewport:
return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))

async def get_focused_rect_id(self, page: Page) -> str:
"""
Retrieve the ID of the currently focused element.

Args:
page (Page): The Playwright page object.

Returns:
str: The ID of the focused element.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -88,6 +141,15 @@ async def get_focused_rect_id(self, page: Page) -> str:
return str(result)

async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
"""
Retrieve metadata from the web page.

Args:
page (Page): The Playwright page object.

Returns:
Dict[str, Any]: A dictionary of page metadata.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -98,6 +160,12 @@ async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
return cast(Dict[str, Any], result)

async def on_new_page(self, page: Page) -> None:
"""
Handle actions to perform on a new page.

Args:
page (Page): The Playwright page object.
"""
assert page is not None
page.on("download", self._download_handler) # type: ignore
if self.to_resize_viewport and self.viewport_width and self.viewport_height:
Expand All @@ -107,10 +175,26 @@ async def on_new_page(self, page: Page) -> None:
await page.wait_for_load_state()

async def back(self, page: Page) -> None:
    """
    Go back one entry in the page's navigation history.

    Args:
        page (Page): The Playwright page object to navigate.
    """
    assert page is not None
    await page.go_back()

async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
"""
Visit a specified URL.

Args:
page (Page): The Playwright page object.
url (str): The URL to visit.

Returns:
Tuple[bool, bool]: A tuple indicating whether to reset prior metadata hash and last download.
"""
assert page is not None
reset_prior_metadata_hash = False
reset_last_download = False
Expand Down Expand Up @@ -143,16 +227,38 @@ async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
return reset_prior_metadata_hash, reset_last_download

async def page_down(self, page: Page) -> None:
    """
    Scroll the window down by the configured viewport height less 50 pixels.

    Args:
        page (Page): The Playwright page object to scroll.
    """
    assert page is not None
    scroll_amount = self.viewport_height - 50
    script = f"window.scrollBy(0, {scroll_amount});"
    await page.evaluate(script)

async def page_up(self, page: Page) -> None:
    """
    Scroll the window up by the configured viewport height less 50 pixels.

    Args:
        page (Page): The Playwright page object to scroll.
    """
    assert page is not None
    scroll_amount = self.viewport_height - 50
    script = f"window.scrollBy(0, -{scroll_amount});"
    await page.evaluate(script)

async def gradual_cursor_animation(
self, page: Page, start_x: float, start_y: float, end_x: float, end_y: float
) -> None:
"""
Animate the cursor movement gradually from start to end coordinates.

Args:
page (Page): The Playwright page object.
start_x (float): The starting x-coordinate.
start_y (float): The starting y-coordinate.
end_x (float): The ending x-coordinate.
end_y (float): The ending y-coordinate.
"""
# animation helper
steps = 20
for step in range(steps):
Expand All @@ -171,6 +277,13 @@ async def gradual_cursor_animation(
self.last_cursor_position = (end_x, end_y)

async def add_cursor_box(self, page: Page, identifier: str) -> None:
"""
Add a red cursor box around the element with the given identifier.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
# animation helper
await page.evaluate(f"""
(function() {{
Expand Down Expand Up @@ -199,6 +312,13 @@ async def add_cursor_box(self, page: Page, identifier: str) -> None:
""")

async def remove_cursor_box(self, page: Page, identifier: str) -> None:
"""
Remove the red cursor box around the element with the given identifier.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
# Remove the highlight and cursor
await page.evaluate(f"""
(function() {{
Expand All @@ -215,7 +335,14 @@ async def remove_cursor_box(self, page: Page, identifier: str) -> None:

async def click_id(self, page: Page, identifier: str) -> Page | None:
"""
Returns new page if a new page is opened, otherwise None.
Click the element with the given identifier.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.

Returns:
Page | None: The new page if a new page is opened, otherwise None.
"""
new_page: Page | None = None
assert page is not None
Expand Down Expand Up @@ -266,7 +393,11 @@ async def click_id(self, page: Page, identifier: str) -> Page | None:

async def hover_id(self, page: Page, identifier: str) -> None:
"""
Hovers the mouse over the target with the given id.
Hover the mouse over the element with the given identifier.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")
Expand Down Expand Up @@ -296,7 +427,15 @@ async def hover_id(self, page: Page, identifier: str) -> None:
else:
await page.mouse.move(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2)

async def fill_id(self, page: Page, identifier: str, value: str) -> None:
async def fill_id(self, page: Page, identifier: str, value: str, press_enter: bool = True) -> None:
"""
Fill the element with the given identifier with the specified value.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
value (str): The value to fill.
press_enter (bool): Whether to press Enter after filling the value. Defaults to True.
"""
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")

Expand Down Expand Up @@ -332,12 +471,21 @@ async def fill_id(self, page: Page, identifier: str, value: str) -> None:
await target.fill(value)
except PlaywrightError:
await target.press_sequentially(value)
await target.press("Enter")
if press_enter:
await target.press("Enter")

if self.animate_actions:
await self.remove_cursor_box(page, identifier)

async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
"""
Scroll the element with the given identifier in the specified direction.

Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
direction (str): The direction to scroll ("up" or "down").
"""
assert page is not None
await page.evaluate(
f"""
Expand All @@ -355,11 +503,16 @@ async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
"""
)

async def get_webpage_text(self, page: Page, n_lines: int = 100) -> str:
async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
"""
page: playwright page object
n_lines: number of lines to return from the page innertext
return: text in the first n_lines of the page
Retrieve the text content of the web page.

Args:
page (Page): The Playwright page object.
n_lines (int): The number of lines to return from the page inner text.

Returns:
str: The text content of the page.
"""
assert page is not None
try:
Expand All @@ -375,6 +528,22 @@ async def get_webpage_text(self, page: Page, n_lines: int = 100) -> str:
return ""

async def get_page_markdown(self, page: Page) -> str:
    """
    Retrieve the markdown content of the web page.

    The page's HTML is converted with MarkItDown when the optional
    ``markitdown`` package could be imported. Otherwise the method falls
    back to the page text returned by ``get_webpage_text``.

    Args:
        page (Page): The Playwright page object.

    Returns:
        str: The markdown content of the page, or the result of
            ``get_webpage_text(page, n_lines=200)`` when MarkItDown is unavailable.
    """
    assert page is not None

    # markitdown is an optional dependency; without it only plain text is available.
    if MarkItDown is None:
        return await self.get_webpage_text(page, n_lines=200)

    # Create the converter once and reuse it on every later call. Previously the
    # conversion only ran on the call that created the converter, so all
    # subsequent calls silently fell back to plain text.
    if self._markdown_converter is None:
        self._markdown_converter = MarkItDown()

    html = await page.evaluate("document.documentElement.outerHTML;")
    res = self._markdown_converter.convert_stream(io.StringIO(html), file_extension=".html", url=page.url)  # type: ignore
    assert hasattr(res, "text_content") and isinstance(res.text_content, str)
    return res.text_content
Loading
Loading