Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions src/agentlab/analyze/agent_xray.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from attr import dataclass
from langchain.schema import BaseMessage, HumanMessage
from openai import OpenAI
from PIL import Image
from PIL import Image, ImageDraw

from agentlab.analyze import inspect_results
from agentlab.experiments.exp_utils import RESULTS_DIR
Expand Down Expand Up @@ -530,9 +530,47 @@ def wrapper(*args, **kwargs):
return decorator


def _parse_click_coordinates(action: str) -> "tuple[float, float]":
    """Extract the (x, y) target from a ``mouse_click(...)`` action string.

    Accepts positional coordinates (``mouse_click(120, 130)``), keyword
    coordinates in any order (``mouse_click(y=130, x=120)``) and ignores any
    further arguments (e.g. ``button="left"``).

    Args:
        action: The raw action string, expected to contain ``(...)``.

    Returns:
        The ``(x, y)`` coordinates as floats.

    Raises:
        ValueError: If the parentheses are missing, a coordinate is missing,
            or a coordinate is not a valid number.
    """
    # str.index raises ValueError when a parenthesis is missing.
    start = action.index("(") + 1
    raw_args = action[start : action.index(")", start)].split(",")

    positional = []
    keywords = {}
    for raw_arg in raw_args:
        name, has_keyword, value = raw_arg.partition("=")
        if has_keyword:
            keywords[name.strip()] = value.strip()
        else:
            positional.append(raw_arg.strip())

    # Keyword form wins; otherwise x and y are the first two positionals.
    x_text = keywords.get("x", positional[0] if positional else None)
    y_text = keywords.get("y", positional[1] if len(positional) > 1 else None)
    if x_text is None or y_text is None:
        raise ValueError(f"Invalid coordinate format: {[arg.strip() for arg in raw_args]}")
    return float(x_text), float(y_text)


def tag_screenshot_with_action(
    screenshot: "Image.Image | None", action: "str | None"
) -> "Image.Image | None":
    """
    If action is a coordinate action, try to render it on the screenshot.

    e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot

    This is best-effort: when the action cannot be parsed, a warning is logged
    and the screenshot is returned untouched. No exception is propagated for a
    malformed action.

    Args:
        screenshot: The screenshot to tag. May be None (no screenshot
            available), in which case None is returned.
        action: The action to tag the screenshot with. May be None or a
            non-coordinate action, in which case nothing is drawn.

    Returns:
        A tagged copy of the screenshot when a click marker was drawn,
        otherwise the original screenshot object unchanged.
    """
    # Nothing to draw on, or nothing to draw: hand the input straight back.
    if screenshot is None or not isinstance(action, str):
        return screenshot
    if not action.startswith("mouse_click"):
        return screenshot

    try:
        x, y = _parse_click_coordinates(action)
        # Draw on a copy so the caller's image is not mutated; otherwise a
        # reused image object would accumulate markers from several calls.
        tagged = screenshot.copy()
        draw = ImageDraw.Draw(tagged)
        radius = 5
        draw.ellipse(
            (x - radius, y - radius, x + radius, y + radius), fill="red", outline="red"
        )
    except (ValueError, IndexError) as e:
        warning(f"Failed to parse action '{action}': {e}")
        return screenshot
    return tagged


def update_screenshot(som_or_not: str):
    """Return the current step's screenshot, tagged with that step's action.

    Reads the module-level ``info`` to find the current step, fetches the
    screenshot through ``get_screenshot`` and passes it to
    ``tag_screenshot_with_action`` together with the step's action.

    Args:
        som_or_not: Screenshot variant selector, forwarded unchanged to
            ``get_screenshot``.

    Returns:
        The screenshot after tagging, or None when ``get_screenshot`` returns
        None.
    """
    global info
    screenshot = get_screenshot(info, som_or_not=som_or_not)
    # Same guard as update_screenshot_pair: a missing screenshot cannot be tagged.
    if screenshot is None:
        return None
    action = info.exp_result.steps_info[info.step].action
    return tag_screenshot_with_action(screenshot, action)


def get_screenshot(info: Info, step: int = None, som_or_not: str = "Raw Screenshots"):
Expand All @@ -549,6 +587,9 @@ def update_screenshot_pair(som_or_not: str):
global info
s1 = get_screenshot(info, info.step, som_or_not)
s2 = get_screenshot(info, info.step + 1, som_or_not)

if s1 is not None:
s1 = tag_screenshot_with_action(s1, info.exp_result.steps_info[info.step].action)
Comment on lines +591 to +592
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent Screenshot Pair Annotation category Functionality

Tell me more
What is the issue?

The screenshot annotation logic in update_screenshot_pair() only annotates the first screenshot (s1) but not the second one (s2).

Why this matters

This inconsistency means users won't see click markers on the second screenshot, making it harder to track the progression of actions across screenshot pairs.

Suggested change ∙ Feature Preview
def update_screenshot_pair(som_or_not: str):
    global info
    s1 = get_screenshot(info, info.step, som_or_not)
    s2 = get_screenshot(info, info.step + 1, som_or_not)

    if s1 is not None:
        s1 = tag_screenshot_with_action(s1, info.exp_result.steps_info[info.step].action)
    if s2 is not None and info.step + 1 < len(info.exp_result.steps_info):
        s2 = tag_screenshot_with_action(s2, info.exp_result.steps_info[info.step + 1].action)
    return s1, s2
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

return s1, s2


Expand Down
Loading