From f112e150e332e214f9219df5035bbfb414629d69 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:15:54 -0400 Subject: [PATCH 1/8] enhance the action overlay mechanism --- src/agentlab/agents/agent_utils.py | 137 --------- src/agentlab/analyze/agent_xray.py | 95 +++--- src/agentlab/analyze/overlay_utils.py | 397 ++++++++++++++++++++++++++ tests/analyze/test_overlay_utils.py | 78 +++++ 4 files changed, 535 insertions(+), 172 deletions(-) create mode 100644 src/agentlab/analyze/overlay_utils.py create mode 100644 tests/analyze/test_overlay_utils.py diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py index 991e27e6..29219d2d 100644 --- a/src/agentlab/agents/agent_utils.py +++ b/src/agentlab/agents/agent_utils.py @@ -1,96 +1,6 @@ -from logging import warning -from typing import Optional, Tuple - -import numpy as np from PIL import Image, ImageDraw from playwright.sync_api import Page -""" -This module contains utility functions for handling observations and actions in the context of agent interactions. -""" - - -def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: - """ - If action is a coordinate action, try to render it on the screenshot. - - e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot - - Args: - screenshot: The screenshot to tag. - action: The action to tag the screenshot with. - - Returns: - The tagged screenshot. - - Raises: - ValueError: If the action parsing fails. 
- """ - if action.startswith("mouse_click"): - try: - coords = action[action.index("(") + 1 : action.index(")")].split(",") - coords = [c.strip() for c in coords] - if len(coords) not in [2, 3]: - raise ValueError(f"Invalid coordinate format: {coords}") - if coords[0].startswith("x="): - coords[0] = coords[0][2:] - if coords[1].startswith("y="): - coords[1] = coords[1][2:] - x, y = float(coords[0].strip()), float(coords[1].strip()) - draw = ImageDraw.Draw(screenshot) - radius = 5 - draw.ellipse( - (x - radius, y - radius, x + radius, y + radius), fill="blue", outline="blue" - ) - except (ValueError, IndexError) as e: - warning(f"Failed to parse action '{action}': {e}") - - elif action.startswith("mouse_drag_and_drop"): - try: - func_name, parsed_args = parse_func_call_string(action) - if func_name == "mouse_drag_and_drop" and parsed_args is not None: - args, kwargs = parsed_args - x1, y1, x2, y2 = None, None, None, None - - if args and len(args) >= 4: - # Positional arguments: mouse_drag_and_drop(x1, y1, x2, y2) - x1, y1, x2, y2 = map(float, args[:4]) - elif kwargs: - # Keyword arguments: mouse_drag_and_drop(from_x=x1, from_y=y1, to_x=x2, to_y=y2) - x1 = float(kwargs.get("from_x", 0)) - y1 = float(kwargs.get("from_y", 0)) - x2 = float(kwargs.get("to_x", 0)) - y2 = float(kwargs.get("to_y", 0)) - - if all(coord is not None for coord in [x1, y1, x2, y2]): - draw = ImageDraw.Draw(screenshot) - # Draw the main line - draw.line((x1, y1, x2, y2), fill="red", width=2) - # Draw arrowhead at the end point using the helper function - draw_arrowhead(draw, (x1, y1), (x2, y2)) - except (ValueError, IndexError) as e: - warning(f"Failed to parse action '{action}': {e}") - return screenshot - - -def add_mouse_pointer_from_action(screenshot: Image, action: str) -> Image.Image: - - if action.startswith("mouse_click"): - try: - coords = action[action.index("(") + 1 : action.index(")")].split(",") - coords = [c.strip() for c in coords] - if len(coords) not in [2, 3]: - raise 
ValueError(f"Invalid coordinate format: {coords}") - if coords[0].startswith("x="): - coords[0] = coords[0][2:] - if coords[1].startswith("y="): - coords[1] = coords[1][2:] - x, y = int(coords[0].strip()), int(coords[1].strip()) - screenshot = draw_mouse_pointer(screenshot, x, y) - except (ValueError, IndexError) as e: - warning(f"Failed to parse action '{action}': {e}") - return screenshot - def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image: """ @@ -218,50 +128,3 @@ def zoom_webpage(page: Page, zoom_factor: float = 1.5): page.evaluate(f"document.documentElement.style.zoom='{zoom_factor*100}%'") return page - - -def parse_func_call_string(call_str: str) -> Tuple[Optional[str], Optional[Tuple[list, dict]]]: - """ - Parse a function call string and extract the function name and arguments. - - Args: - call_str (str): A string like "mouse_click(100, 200)" or "mouse_drag_and_drop(x=10, y=20)" - - Returns: - Tuple (func_name, (args, kwargs)), or (None, None) if parsing fails - """ - import ast - - try: - tree = ast.parse(call_str.strip(), mode="eval") - if not isinstance(tree.body, ast.Call): - return None, None - - call_node = tree.body - - # Function name - if isinstance(call_node.func, ast.Name): - func_name = call_node.func.id - else: - return None, None - - # Positional arguments - args = [] - for arg in call_node.args: - try: - args.append(ast.literal_eval(arg)) - except (ValueError, TypeError): - return None, None - - # Keyword arguments - kwargs = {} - for kw in call_node.keywords: - try: - kwargs[kw.arg] = ast.literal_eval(kw.value) - except (ValueError, TypeError): - return None, None - - return func_name, (args, kwargs) - - except (SyntaxError, ValueError, TypeError): - return None, None diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index e09b4af8..439a99ee 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -14,10 +14,11 @@ from attr import dataclass from 
langchain.schema import BaseMessage, HumanMessage from openai import OpenAI +from openai.types.responses import ResponseFunctionToolCall from PIL import Image -from agentlab.agents import agent_utils from agentlab.analyze import inspect_results +from agentlab.analyze.overlay_utils import annotate_action from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.loop import ExpResult, StepInfo from agentlab.experiments.study import get_most_recent_study @@ -351,7 +352,7 @@ def run_gradio(results_dir: Path): pruned_html_code = gr.Code(language="html", **code_args) with gr.Tab("AXTree") as tab_axtree: - axtree_code = gr.Code(language=None, **code_args) + axtree_code = gr.Markdown() with gr.Tab("Chat Messages") as tab_chat: chat_messages = gr.Markdown() @@ -536,38 +537,46 @@ def wrapper(*args, **kwargs): def update_screenshot(som_or_not: str): global info - action = info.exp_result.steps_info[info.step].action - return agent_utils.tag_screenshot_with_action( - get_screenshot(info, som_or_not=som_or_not), action - ) + img, action_str = get_screenshot(info, som_or_not=som_or_not, annotate=True) + return img -def get_screenshot(info: Info, step: int = None, som_or_not: str = "Raw Screenshots"): +def get_screenshot( + info: Info, step: int = None, som_or_not: str = "Raw Screenshots", annotate: bool = False +): if step is None: step = info.step + step_info = info.exp_result.steps_info[step] try: is_som = som_or_not == "SOM Screenshots" - return info.exp_result.get_screenshot(step, som=is_som) + img = info.exp_result.get_screenshot(step, som=is_som) + if annotate: + action_str = step_info.action + properties = step_info.obs.get("extra_element_properties", None) + action_colored = annotate_action(img, action_string=action_str, properties=properties) + else: + action_colored = None + return img, action_colored except FileNotFoundError: - return None + return None, None def update_screenshot_pair(som_or_not: str): global info - s1 = get_screenshot(info, 
info.step, som_or_not) - s2 = get_screenshot(info, info.step + 1, som_or_not) - - if s1 is not None: - s1 = agent_utils.tag_screenshot_with_action( - s1, info.exp_result.steps_info[info.step].action - ) + s1, action_str = get_screenshot(info, info.step, som_or_not, annotate=True) + s2, action_str = get_screenshot(info, info.step + 1, som_or_not) return s1, s2 def update_screenshot_gallery(som_or_not: str): global info - screenshots = info.exp_result.get_screenshots(som=som_or_not == "SOM Screenshots") + max_steps = len(info.exp_result.steps_info) + som_or_not == "SOM Screenshots" + + screenshots = [get_screenshot(info, step=i, som_or_not=som_or_not)[0] for i in range(max_steps)] + screenshots_and_label = [(s, f"Step {i}") for i, s in enumerate(screenshots)] + gallery = gr.Gallery( value=screenshots_and_label, columns=2, @@ -595,7 +604,8 @@ def update_pruned_html(): def update_axtree(): - return get_obs(key="axtree_txt", default="No AXTree") + obs = get_obs(key="axtree_txt", default="No AXTree") + return f"```\n{obs}\n```" def dict_to_markdown(d: dict): @@ -645,7 +655,7 @@ def dict_msg_to_markdown(d: dict): case "text": parts.append(f"\n```\n{item['text']}\n```\n") case "tool_use": - tool_use = f"Tool Use: {item['name']} {item['input']} (id = {item['id']})" + tool_use = _format_tool_call(item["name"], item["input"], item["call_id"]) parts.append(f"\n```\n{tool_use}\n```\n") case _: parts.append(f"\n```\n{str(item)}\n```\n") @@ -655,6 +665,30 @@ def dict_msg_to_markdown(d: dict): return markdown +def _format_tool_call(name: str, input: str, call_id: str): + """ + Format a tool call to markdown. + """ + return f"Tool Call: {name} `{input}` (call_id: {call_id})" + + +def format_chat_message(message: BaseMessage | MessageBuilder | dict): + """ + Format a message to markdown. 
+ """ + if isinstance(message, BaseMessage): + return message.content + elif isinstance(message, MessageBuilder): + return message.to_markdown() + elif isinstance(message, dict): + return dict_msg_to_markdown(message) + elif isinstance(message, ResponseFunctionToolCall): # type: ignore[return] + too_use_str = _format_tool_call(message.name, message.arguments, message.call_id) + return f"### Tool Use\n```\n{too_use_str}\n```\n" + else: + return str(message) + + def update_chat_messages(): global info agent_info = info.exp_result.steps_info[info.step].agent_info @@ -662,20 +696,9 @@ def update_chat_messages(): if isinstance(chat_messages, Discussion): return chat_messages.to_markdown() - if isinstance(chat_messages, list) and isinstance(chat_messages[0], MessageBuilder): - chat_messages = [ - m.to_markdown() if isinstance(m, MessageBuilder) else dict_msg_to_markdown(m) - for m in chat_messages - ] + if isinstance(chat_messages, list): + chat_messages = [format_chat_message(m) for m in chat_messages] return "\n\n".join(chat_messages) - messages = [] # TODO(ThibaultLSDC) remove this at some point - for i, m in enumerate(chat_messages): - if isinstance(m, BaseMessage): # TODO remove once langchain is deprecated - m = m.content - elif isinstance(m, dict): - m = m.get("content", "No Content") - messages.append(f"""# Message {i}\n```\n{m}\n```\n\n""") - return "\n".join(messages) def update_task_error(): @@ -722,8 +745,8 @@ def update_agent_info_html(): global info # screenshots from current and next step try: - s1 = get_screenshot(info, info.step, False) - s2 = get_screenshot(info, info.step + 1, False) + s1, action_str = get_screenshot(info, info.step, False) + s2, action_str = get_screenshot(info, info.step + 1, False) agent_info = info.exp_result.steps_info[info.step].agent_info page = agent_info.get("html_page", ["No Agent Info"]) if page is None: @@ -854,6 +877,8 @@ def get_episode_info(info: Info): def get_action_info(info: Info): steps_info = 
info.exp_result.steps_info + img, action_str = get_screenshot(info, step=info.step, annotate=True) # to update click_mapper + if len(steps_info) == 0: return "No steps were taken" if len(steps_info) <= info.step: @@ -863,7 +888,7 @@ def get_action_info(info: Info): action_info = f"""\ **Action:** -{code(step_info.action)} +{action_str} """ think = step_info.agent_info.get("think", None) if think is not None: diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py new file mode 100644 index 00000000..4649a962 --- /dev/null +++ b/src/agentlab/analyze/overlay_utils.py @@ -0,0 +1,397 @@ +import ast +import inspect +from dataclasses import dataclass +from typing import Any, Union + +import matplotlib.pyplot as plt +from browsergym.core.action.highlevel import ACTION_SUBSETS +from PIL import Image, ImageDraw + +BGYM_FUNCTION_MAP = {} +for subset in ("bid", "coord"): + for func in ACTION_SUBSETS[subset]: + if func not in BGYM_FUNCTION_MAP: + BGYM_FUNCTION_MAP[func.__name__] = func + + +@dataclass +class ArgInfo: + function_name: str + name: str + value: Any + type: str + start_index: int + stop_index: int + + +def parse_function_calls(code_string: str) -> list[ArgInfo]: + """ + Parse a string containing multiple function calls and return a list of ArgInfo objects + for all arguments in all function calls. + + Args: + code_string: String containing function calls + + Returns: + List of ArgInfo objects containing detailed information about each argument + + Example: + >>> code = ''' + ... mouse_click(34, 59) + ... fill("a234", "test") + ... 
''' + >>> result = parse_function_calls(code) + >>> # Returns list of ArgInfo objects for each argument + """ + result = [] + + try: + # Parse the code string into an AST + tree = ast.parse(code_string) + + # Extract all function calls + for node in ast.walk(tree): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name): + func_name = node.func.id + + # Check if this function exists in our module + if func_name in BGYM_FUNCTION_MAP: + func = BGYM_FUNCTION_MAP[func_name] + + # Get function signature to map positional args to parameter names + try: + sig = inspect.signature(func) + param_names = list(sig.parameters.keys()) + + # Process positional arguments + for i, arg in enumerate(node.args): + if i < len(param_names): + param_name = param_names[i] + value = _extract_value(arg) + start_idx, stop_idx = _get_node_indices(code_string, arg) + + arg_info = ArgInfo( + function_name=func_name, + name=param_name, + value=value, + type=type(value).__name__, + start_index=start_idx, + stop_index=stop_idx, + ) + result.append(arg_info) + + # Process keyword arguments + for keyword in node.keywords: + value = _extract_value(keyword.value) + start_idx, stop_idx = _get_node_indices( + code_string, keyword.value, keyword + ) + + arg_info = ArgInfo( + function_name=func_name, + name=keyword.arg, + value=value, + type=type(value).__name__, + start_index=start_idx, + stop_index=stop_idx, + ) + result.append(arg_info) + + except Exception as e: + # If we can't inspect the function, skip it + print(f"Warning: Could not process function {func_name}: {e}") + continue + + except SyntaxError as e: + print(f"Syntax error in code string: {e}") + return [] + + return result + + +def _extract_value(node: ast.AST) -> Any: + """ + Extract the actual value from an AST node. 
+ + Args: + node: AST node representing a value + + Returns: + The extracted Python value + """ + if isinstance(node, ast.Constant): + # Python 3.8+ uses ast.Constant for all literals + return node.value + elif isinstance(node, ast.Str): + # Fallback for older Python versions + return node.s + elif isinstance(node, ast.Num): + # Fallback for older Python versions + return node.n + elif isinstance(node, ast.List): + # Handle list literals + return [_extract_value(item) for item in node.elts] + elif isinstance(node, ast.Name): + # Handle variable names (return as string identifier) + return node.id + else: + # For other node types, return a string representation + return ast.unparse(node) if hasattr(ast, "unparse") else str(node) + + +def _get_node_indices( + source: str, node: ast.AST, keyword_node: ast.keyword = None +) -> tuple[int, int]: + """ + Convert AST node line/column positions to absolute character indices. + + Args: + source: Original source code string + node: AST node (the value) + keyword_node: If provided, use this keyword node's position as start + + Returns: + Tuple of (start_index, stop_index) in the source string + """ + lines = source.splitlines(keepends=True) + + # For keyword arguments, start from the keyword name + if keyword_node is not None: + start_line = keyword_node.lineno + start_col = keyword_node.col_offset + else: + start_line = node.lineno + start_col = node.col_offset + + # Calculate start index + start_index = 0 + for i in range(start_line - 1): # lineno is 1-based + start_index += len(lines[i]) + start_index += start_col + + # End index always comes from the value node + if hasattr(node, "end_lineno") and hasattr(node, "end_col_offset"): + end_index = 0 + for i in range(node.end_lineno - 1): + end_index += len(lines[i]) + end_index += node.end_col_offset + else: + # Fallback estimation + if hasattr(ast, "get_source_segment"): + segment = ast.get_source_segment(source, node) + end_index = start_index + len(segment) if segment else 
start_index + 1 + else: + end_index = start_index + 1 + + return start_index, end_index + + +def find_bids_and_xy_pairs(args: list[ArgInfo]) -> list[ArgInfo]: + """ + Find bid arguments and x,y coordinate pairs from a list of ArgInfo objects. + + Args: + args: List of ArgInfo objects from parse_function_calls + + Returns: + List of ArgInfo objects containing: + - Original bid arguments (unchanged) + - Merged x,y pairs with joint names, tuple values, and combined indices + + Rules for x,y pairs: + - Must be consecutive arguments + - Must end with 'x' and 'y' respectively + - Must have the same prefix (everything before 'x'/'y') + - Merged name: prefix + "xy" (e.g. "from_x"/"from_y" -> "from_xy", "x"/"y" -> "xy") + - Merged value: (x_value, y_value) as tuple of floats (original values are kept if float conversion fails) + - Merged indices: start of x to stop of y + """ + result = [] + i = 0 + + while i < len(args): + current_arg = args[i] + + # Check if current arg name ends with 'bid' + if current_arg.name.endswith("bid"): + result.append(current_arg) + i += 1 + continue + + # Check for x,y pair + if i + 1 < len(args) and current_arg.name.endswith("x") and args[i + 1].name.endswith("y"): + + next_arg = args[i + 1] + + # Extract prefixes (everything before 'x' and 'y') + current_prefix = current_arg.name[:-1] # Remove 'x' + next_prefix = next_arg.name[:-1] # Remove 'y' + + # Check if they have the same prefix and are from the same function + if ( + current_prefix == next_prefix + and current_arg.function_name == next_arg.function_name + ): + + # Create merged ArgInfo for x,y pair + merged_name = f"{current_prefix}xy" + + # Convert values to floats and create tuple + try: + x_val = float(current_arg.value) + y_val = float(next_arg.value) + merged_value = (x_val, y_val) + except (ValueError, TypeError): + # If conversion fails, keep original values + merged_value = (current_arg.value, next_arg.value) + + merged_arg = ArgInfo( + function_name=current_arg.function_name, + name=merged_name, + value=merged_value, + type="tuple", + start_index=current_arg.start_index, + 
stop_index=next_arg.stop_index, + ) + + result.append(merged_arg) + i += 2 # Skip both x and y args + continue + + # If no special handling, skip this argument + i += 1 + + return result + + +def overlay_cross( + img: Image.Image, + coord: tuple[float, float], + color: Union[str, tuple[int, int, int]] = "red", + length: int = 7, + width: int = 1, +) -> Image.Image: + draw = ImageDraw.Draw(img) + + x, y = coord + half_len = length // 2 + + # Draw horizontal line + draw.line([x - half_len, y, x + half_len, y], fill=color, width=width) + # Draw vertical line + draw.line([x, y - half_len, x, y + half_len], fill=color, width=width) + + return img + + +def overlay_rectangle( + img: Image.Image, + bbox: tuple[float, float, float, float], + color: Union[str, tuple[int, int, int]] = "red", + width: int = 1, +) -> Image.Image: + draw = ImageDraw.Draw(img) + + x, y, w, h = bbox + + # Draw rectangle outline + draw.rectangle([x, y, x + w, y + h], outline=color, width=width) + + return img + + +def annotate_action( + img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10" +) -> str: + """ + Annotate an image with overlays for action arguments and return colored HTML. 
+ + Args: + img: PIL Image to modify in place + action_string: String containing function calls + properties: Dict mapping bid strings to property dicts whose "bbox" entry is (x, y, width, height) + colormap: Matplotlib colormap name for auto-color selection + + Returns: + HTML string with arguments colored to match overlays + """ + # Parse function calls to get all arguments + all_args = parse_function_calls(action_string) + + # Filter to get bids and xy pairs + filtered_args = find_bids_and_xy_pairs(all_args) + + # Get colormap + cmap = plt.get_cmap(colormap) + + # Track colors for each filtered argument + colors = [] + + # Add overlays to image + for i, arg_info in enumerate(filtered_args): + # Get color from colormap + color_rgb = cmap(i % cmap.N) + color_255 = tuple(int(c * 255) for c in color_rgb[:3]) # Convert to 0-255 range + + colors.append(color_rgb[:3]) # Store normalized RGB for HTML + + if arg_info.name.endswith("xy"): + # Handle x,y coordinate pairs + x, y = arg_info.value + overlay_cross(img, (x, y), color_255, length=9, width=3) + + elif arg_info.name.endswith("bid"): + # Handle bid arguments with bounding boxes + bid_value = arg_info.value + if bid_value in properties: + + bbox = properties[bid_value]["bbox"] + if bbox: + overlay_rectangle(img, bbox, color_255, width=3) + + # Generate colored HTML + html = create_colored_html(action_string, filtered_args, colors) + + return html + + +def create_colored_html(action_string: str, filtered_args: list, colors: list) -> str: + """ + Create HTML with colored arguments using start/stop indices. 
+ + Args: + action_string: Original action string + filtered_args: List of ArgInfo objects with start_index/stop_index + colors: List of RGB tuples, same length as filtered_args + + Returns: + HTML string with colored spans + """ + # Sort args by start position for sequential processing + sorted_pairs = sorted(zip(filtered_args, colors), key=lambda x: x[0].start_index) + + # Build HTML with colored spans + html_parts = [] + last_end = 0 + + for arg_info, color_rgb in sorted_pairs: + # Add uncolored text before this argument + html_parts.append(action_string[last_end : arg_info.start_index]) + + # Get the argument text + arg_text = action_string[arg_info.start_index : arg_info.stop_index] + + # Convert color to hex + color_hex = "#{:02x}{:02x}{:02x}".format( + int(color_rgb[0] * 255), int(color_rgb[1] * 255), int(color_rgb[2] * 255) + ) + + # Add colored span + html_parts.append(f'{arg_text}') + + last_end = arg_info.stop_index + + # Add remaining text + html_parts.append(action_string[last_end:]) + + return "".join(html_parts) diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py new file mode 100644 index 00000000..a3aabf51 --- /dev/null +++ b/tests/analyze/test_overlay_utils.py @@ -0,0 +1,78 @@ +from PIL import Image + +from agentlab.analyze import overlay_utils + + +def test_parse_function_calls(): + + test_code = """ +mouse_click(34, 59) +fill("a234", "test") +click('b123', button="right", modifiers=["Shift", "Control"]) +select_option("c456", ["option1", "option2"]) +""" + + result = overlay_utils.parse_function_calls(test_code) + + assert result[1].function_name == "mouse_click" + assert result[1].name == "y" + assert test_code[result[1].start_index : result[1].stop_index] == "59" + + assert result[8].function_name == "select_option" + assert result[8].name == "options" + assert test_code[result[8].start_index : result[8].stop_index] == '["option1", "option2"]' + + +def test_filtering_args(): + test_code = """ +mouse_click(34, 
59) +fill("a234", "test") +mouse_drag_and_drop(34, 59, to_x=100, to_y=200) +drag_and_drop("a123", "b456") +""" + result = overlay_utils.parse_function_calls(test_code) + args = overlay_utils.find_bids_and_xy_pairs(result) + + assert len(args) == 6 # 6 args: 1 click xy, 1 fill bid, 2 drag xy pairs, 2 drag bids + + assert args[0].function_name == "mouse_click" + assert args[0].name == "xy" + assert args[0].value == (34.0, 59.0) + assert test_code[args[0].start_index : args[0].stop_index] == "34, 59" + + assert args[2].name == "from_xy" + assert args[3].name == "to_xy" + assert test_code[args[3].start_index : args[3].stop_index] == "to_x=100, to_y=200" + + +def manual_test(): + """Manual test function that displays the resulting image.""" + import matplotlib.pyplot as plt + + # Create a white test image + img = Image.new("RGB", (400, 300), "white") + + # Test action string with multiple function calls + action_string = """mouse_click(100, 150) +fill("search_box", "hello world") +click("submit_btn")""" + + # Mock properties mapping bids to bounding boxes + properties = {"search_box": (200, 100, 350, 130), "submit_btn": (200, 200, 280, 230)} + + # Annotate the image and get colored HTML + html_result = overlay_utils.annotate_action(img, action_string, properties, colormap="tab10") + + # Display result + plt.figure(figsize=(10, 6)) + plt.imshow(img) + plt.axis("off") + plt.show() + + print("HTML with colored arguments:") + print(html_result) + print("\nManual test completed!") + + +if __name__ == "__main__": + manual_test() From 7222f4148cf04fb68b3ebf5f4ad1f845c5000ac0 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:16:09 -0400 Subject: [PATCH 2/8] ignore some local files I work with --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6339cccb..c878e3d1 100644 --- a/.gitignore +++ b/.gitignore @@ -171,4 +171,8 @@ results/ outputs/ miniwob-plusplus/ .miniwob-server.pid -debugging_results/ \ No 
newline at end of file +debugging_results/ + +# working files +main_miniwob_debug.py +main_workarena_debug.py From 022a03cca45f0632198f5403faa2693fabb1728d Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:17:01 -0400 Subject: [PATCH 3/8] for backward compatibility --- src/agentlab/agents/tool_use_agent/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py index e69de29b..de27826d 100644 --- a/src/agentlab/agents/tool_use_agent/__init__.py +++ b/src/agentlab/agents/tool_use_agent/__init__.py @@ -0,0 +1,6 @@ +import sys + +from .tool_use_agent import * + +# for backward compatibility of unpickling +sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__] From e33d66470c96033e0884e569f722031aa9369481 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:17:47 -0400 Subject: [PATCH 4/8] no explicit caching for OpenAI --- src/agentlab/llm/response_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 755886f8..2b9f696c 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -154,6 +154,9 @@ def handle_tool_call(self, content): output.append({"role": "user", "content": tail_content}) return output + def mark_all_previous_msg_for_caching(self) -> List[Message]: + pass + class AnthropicAPIMessageBuilder(MessageBuilder): From bb04218c0b5895a7cd0d3c4597a5bb22686646be Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:39:58 -0400 Subject: [PATCH 5/8] minor corrections --- src/agentlab/agents/tool_use_agent/__init__.py | 2 -- src/agentlab/analyze/agent_xray.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py index de27826d..b03b1169 100644 --- a/src/agentlab/agents/tool_use_agent/__init__.py +++ 
b/src/agentlab/agents/tool_use_agent/__init__.py @@ -1,6 +1,4 @@ import sys -from .tool_use_agent import * - # for backward compatibility of unpickling sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__] diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 439a99ee..08fc3d0f 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -571,7 +571,6 @@ def update_screenshot_pair(som_or_not: str): def update_screenshot_gallery(som_or_not: str): global info max_steps = len(info.exp_result.steps_info) - som_or_not == "SOM Screenshots" screenshots = [get_screenshot(info, step=i, som_or_not=som_or_not)[0] for i in range(max_steps)] From 3c38c2b8ef35ce9e388b3f0be3386ec06f6b77f2 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 15:42:06 -0400 Subject: [PATCH 6/8] rename manual_test function to manual_eval for clarity --- tests/analyze/test_overlay_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py index a3aabf51..cdff70d6 100644 --- a/tests/analyze/test_overlay_utils.py +++ b/tests/analyze/test_overlay_utils.py @@ -45,7 +45,7 @@ def test_filtering_args(): assert test_code[args[3].start_index : args[3].stop_index] == "to_x=100, to_y=200" -def manual_test(): +def manual_eval(): """Manual test function that displays the resulting image.""" import matplotlib.pyplot as plt @@ -75,4 +75,4 @@ def manual_test(): if __name__ == "__main__": - manual_test() + manual_eval() From b45877d78134dfc9a51f4382d31410b5cbce5de7 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 16:10:01 -0400 Subject: [PATCH 7/8] add check for empty summary DataFrame and improve error logging --- src/agentlab/analyze/agent_xray.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 08fc3d0f..d46b57f3 
100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1108,8 +1108,11 @@ def get_directory_contents(results_dir: Path): most_recent_summary = max(summary_files, key=os.path.getctime) summary_df = pd.read_csv(most_recent_summary) + if len(summary_df) == 0 or summary_df["avg_reward"].isna().all(): + continue # skip if the summary is empty or all avg_reward are NaN + # get row with max avg_reward - max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax()] + max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax(skipna=True)] reward = max_reward_row["avg_reward"] * 100 completed = max_reward_row["n_completed"] n_err = max_reward_row["n_err"] @@ -1117,7 +1120,7 @@ def get_directory_contents(results_dir: Path): f" - avg-reward: {reward:.1f}% - completed: {completed} - errors: {n_err}" ) except Exception as e: - print(f"Error while reading summary file: {e}") + print(f"Error while reading summary file {most_recent_summary}: {e}") exp_descriptions.append(exp_description) From 53c16f1095fdfcf856ea816b5429228d69922efa Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Jun 2025 16:47:03 -0400 Subject: [PATCH 8/8] refactor: remove antialiasing from plot_profiling function and update properties structure in tests --- src/agentlab/analyze/agent_xray.py | 1 - tests/analyze/test_overlay_utils.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index d46b57f3..24564dcc 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1246,7 +1246,6 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr horizontalalignment="left", rotation=0, clip_on=True, - antialiased=True, fontweight=1000, backgroundcolor=colors[12], ) diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py index cdff70d6..58f5915b 100644 --- a/tests/analyze/test_overlay_utils.py +++ 
b/tests/analyze/test_overlay_utils.py @@ -58,7 +58,10 @@ def manual_eval(): click("submit_btn")""" # Mock properties mapping bids to bounding boxes - properties = {"search_box": (200, 100, 350, 130), "submit_btn": (200, 200, 280, 230)} + properties = { + "search_box": {"bbox": (50, 50, 100, 50)}, + "submit_btn": {"bbox": (150, 100, 120, 30)}, + } # Annotate the image and get colored HTML html_result = overlay_utils.annotate_action(img, action_string, properties, colormap="tab10")