From c3e1792aee6e512121d2204ebd16fa41450bca3e Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 24 Apr 2025 13:32:39 -0400 Subject: [PATCH 1/4] adding a tag on screenshots for mouse_click coordinate actions --- src/agentlab/analyze/agent_xray.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index efac82f2..79fc61e4 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -14,7 +14,7 @@ from attr import dataclass from langchain.schema import BaseMessage, HumanMessage from openai import OpenAI -from PIL import Image +from PIL import Image, ImageDraw from agentlab.analyze import inspect_results from agentlab.experiments.exp_utils import RESULTS_DIR @@ -530,9 +530,29 @@ def wrapper(*args, **kwargs): return decorator +def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: + """If action is a coordinate action, try to render it on the screenshot. + + e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot + """ + if action.startswith("mouse_click"): + coords = action[action.index("(") + 1 : action.index(")")].split(",") + coords = [c.strip() for c in coords] + if coords[0].startswith("x="): + coords[0] = coords[0][2:] + if coords[1].startswith("y="): + coords[1] = coords[1][2:] + x, y = float(coords[0].strip()), float(coords[1].strip()) + draw = ImageDraw.Draw(screenshot) + radius = 5 + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="red", outline="red") + return screenshot + + def update_screenshot(som_or_not: str): global info - return get_screenshot(info, som_or_not=som_or_not) + action = info.exp_result.steps_info[info.step].action + return tag_screenshot_with_action(get_screenshot(info, som_or_not=som_or_not), action) def get_screenshot(info: Info, step: int = None, som_or_not: str = "Raw Screenshots"): @@ -549,6 +569,9 @@ def update_screenshot_pair(som_or_not: str): global info s1 = get_screenshot(info, info.step, som_or_not) s2 = get_screenshot(info, info.step + 1, som_or_not) + + if s1 is not None: + s1 = tag_screenshot_with_action(s1, info.exp_result.steps_info[info.step].action) return s1, s2 From 6569fb1a2d25c01e19f33a5f51a613dd2d1ef1a8 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 24 Apr 2025 13:38:25 -0400 Subject: [PATCH 2/4] darglint --- src/agentlab/analyze/agent_xray.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 79fc61e4..34782fed 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -531,9 +531,17 @@ def wrapper(*args, **kwargs): def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: - """If action is a coordinate action, try to render it on the screenshot. + """ + If action is a coordinate action, try to render it on the screenshot. e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot + + Args: + screenshot: The screenshot to tag. + action: The action to tag the screenshot with. + + Returns: + The tagged screenshot. """ if action.startswith("mouse_click"): coords = action[action.index("(") + 1 : action.index(")")].split(",") From c6e5cb046181e06af184446d028ec78aa971b169 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+TLSDC@users.noreply.github.com> Date: Thu, 24 Apr 2025 13:39:45 -0400 Subject: [PATCH 3/4] Update src/agentlab/analyze/agent_xray.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/agentlab/analyze/agent_xray.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 34782fed..a9d0449b 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -544,18 +544,21 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: The tagged screenshot. """ if action.startswith("mouse_click"): - coords = action[action.index("(") + 1 : action.index(")")].split(",") - coords = [c.strip() for c in coords] - if coords[0].startswith("x="): - coords[0] = coords[0][2:] - if coords[1].startswith("y="): - coords[1] = coords[1][2:] - x, y = float(coords[0].strip()), float(coords[1].strip()) - draw = ImageDraw.Draw(screenshot) - radius = 5 - draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="red", outline="red") - return screenshot - + try: + coords = action[action.index("(") + 1 : action.index(")")].split(",") + coords = [c.strip() for c in coords] + if len(coords) != 2: + raise ValueError(f"Invalid coordinate format: {coords}") + if coords[0].startswith("x="): + coords[0] = coords[0][2:] + if coords[1].startswith("y="): + coords[1] = coords[1][2:] + x, y = float(coords[0].strip()), float(coords[1].strip()) + draw = ImageDraw.Draw(screenshot) + radius = 5 + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="red", outline="red") + except (ValueError, IndexError) as e: + warning(f"Failed to parse action '{action}': {e}") def update_screenshot(som_or_not: str): global info From 2f893a9a2abf8952cda0f52bd1ec76d43b2554b4 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 24 Apr 2025 13:44:18 -0400 Subject: [PATCH 4/4] darglint --- src/agentlab/analyze/agent_xray.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index a9d0449b..8cd7961c 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -542,6 +542,9 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: Returns: The tagged screenshot. + + Raises: + ValueError: If the action parsing fails. """ if action.startswith("mouse_click"): try: @@ -556,9 +559,13 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: x, y = float(coords[0].strip()), float(coords[1].strip()) draw = ImageDraw.Draw(screenshot) radius = 5 - draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="red", outline="red") + draw.ellipse( + (x - radius, y - radius, x + radius, y + radius), fill="red", outline="red" + ) except (ValueError, IndexError) as e: warning(f"Failed to parse action '{action}': {e}") + return screenshot + def update_screenshot(som_or_not: str): global info