From f112e150e332e214f9219df5035bbfb414629d69 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:15:54 -0400
Subject: [PATCH 1/8] enhance the action overlay mechanism

---
 src/agentlab/agents/agent_utils.py    | 137 ---------
 src/agentlab/analyze/agent_xray.py    |  95 +++---
 src/agentlab/analyze/overlay_utils.py | 397 ++++++++++++++++++++++++++
 tests/analyze/test_overlay_utils.py   |  78 +++++
 4 files changed, 535 insertions(+), 172 deletions(-)
 create mode 100644 src/agentlab/analyze/overlay_utils.py
 create mode 100644 tests/analyze/test_overlay_utils.py

diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
index 991e27e6..29219d2d 100644
--- a/src/agentlab/agents/agent_utils.py
+++ b/src/agentlab/agents/agent_utils.py
@@ -1,96 +1,6 @@
-from logging import warning
-from typing import Optional, Tuple
-
-import numpy as np
 from PIL import Image, ImageDraw
 from playwright.sync_api import Page
 
-"""
-This module contains utility functions for handling observations and actions in the context of agent interactions.
-"""
-
-
-def tag_screenshot_with_action(screenshot: Image, action: str) -> Image:
-    """
-    If action is a coordinate action, try to render it on the screenshot.
-
-    e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot
-
-    Args:
-        screenshot: The screenshot to tag.
-        action: The action to tag the screenshot with.
-
-    Returns:
-        The tagged screenshot.
-
-    Raises:
-        ValueError: If the action parsing fails.
-    """
-    if action.startswith("mouse_click"):
-        try:
-            coords = action[action.index("(") + 1 : action.index(")")].split(",")
-            coords = [c.strip() for c in coords]
-            if len(coords) not in [2, 3]:
-                raise ValueError(f"Invalid coordinate format: {coords}")
-            if coords[0].startswith("x="):
-                coords[0] = coords[0][2:]
-            if coords[1].startswith("y="):
-                coords[1] = coords[1][2:]
-            x, y = float(coords[0].strip()), float(coords[1].strip())
-            draw = ImageDraw.Draw(screenshot)
-            radius = 5
-            draw.ellipse(
-                (x - radius, y - radius, x + radius, y + radius), fill="blue", outline="blue"
-            )
-        except (ValueError, IndexError) as e:
-            warning(f"Failed to parse action '{action}': {e}")
-
-    elif action.startswith("mouse_drag_and_drop"):
-        try:
-            func_name, parsed_args = parse_func_call_string(action)
-            if func_name == "mouse_drag_and_drop" and parsed_args is not None:
-                args, kwargs = parsed_args
-                x1, y1, x2, y2 = None, None, None, None
-
-                if args and len(args) >= 4:
-                    # Positional arguments: mouse_drag_and_drop(x1, y1, x2, y2)
-                    x1, y1, x2, y2 = map(float, args[:4])
-                elif kwargs:
-                    # Keyword arguments: mouse_drag_and_drop(from_x=x1, from_y=y1, to_x=x2, to_y=y2)
-                    x1 = float(kwargs.get("from_x", 0))
-                    y1 = float(kwargs.get("from_y", 0))
-                    x2 = float(kwargs.get("to_x", 0))
-                    y2 = float(kwargs.get("to_y", 0))
-
-                if all(coord is not None for coord in [x1, y1, x2, y2]):
-                    draw = ImageDraw.Draw(screenshot)
-                    # Draw the main line
-                    draw.line((x1, y1, x2, y2), fill="red", width=2)
-                    # Draw arrowhead at the end point using the helper function
-                    draw_arrowhead(draw, (x1, y1), (x2, y2))
-        except (ValueError, IndexError) as e:
-            warning(f"Failed to parse action '{action}': {e}")
-    return screenshot
-
-
-def add_mouse_pointer_from_action(screenshot: Image, action: str) -> Image.Image:
-
-    if action.startswith("mouse_click"):
-        try:
-            coords = action[action.index("(") + 1 : action.index(")")].split(",")
-            coords = [c.strip() for c in coords]
-            if len(coords) not in [2, 3]:
-                raise ValueError(f"Invalid coordinate format: {coords}")
-            if coords[0].startswith("x="):
-                coords[0] = coords[0][2:]
-            if coords[1].startswith("y="):
-                coords[1] = coords[1][2:]
-            x, y = int(coords[0].strip()), int(coords[1].strip())
-            screenshot = draw_mouse_pointer(screenshot, x, y)
-        except (ValueError, IndexError) as e:
-            warning(f"Failed to parse action '{action}': {e}")
-    return screenshot
-
 
 def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image:
     """
@@ -218,50 +128,3 @@ def zoom_webpage(page: Page, zoom_factor: float = 1.5):
 
     page.evaluate(f"document.documentElement.style.zoom='{zoom_factor*100}%'")
     return page
-
-
-def parse_func_call_string(call_str: str) -> Tuple[Optional[str], Optional[Tuple[list, dict]]]:
-    """
-    Parse a function call string and extract the function name and arguments.
-
-    Args:
-        call_str (str): A string like "mouse_click(100, 200)" or "mouse_drag_and_drop(x=10, y=20)"
-
-    Returns:
-        Tuple (func_name, (args, kwargs)), or (None, None) if parsing fails
-    """
-    import ast
-
-    try:
-        tree = ast.parse(call_str.strip(), mode="eval")
-        if not isinstance(tree.body, ast.Call):
-            return None, None
-
-        call_node = tree.body
-
-        # Function name
-        if isinstance(call_node.func, ast.Name):
-            func_name = call_node.func.id
-        else:
-            return None, None
-
-        # Positional arguments
-        args = []
-        for arg in call_node.args:
-            try:
-                args.append(ast.literal_eval(arg))
-            except (ValueError, TypeError):
-                return None, None
-
-        # Keyword arguments
-        kwargs = {}
-        for kw in call_node.keywords:
-            try:
-                kwargs[kw.arg] = ast.literal_eval(kw.value)
-            except (ValueError, TypeError):
-                return None, None
-
-        return func_name, (args, kwargs)
-
-    except (SyntaxError, ValueError, TypeError):
-        return None, None
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index e09b4af8..439a99ee 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -14,10 +14,11 @@
 from attr import dataclass
 from langchain.schema import BaseMessage, HumanMessage
 from openai import OpenAI
+from openai.types.responses import ResponseFunctionToolCall
 from PIL import Image
 
-from agentlab.agents import agent_utils
 from agentlab.analyze import inspect_results
+from agentlab.analyze.overlay_utils import annotate_action
 from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.loop import ExpResult, StepInfo
 from agentlab.experiments.study import get_most_recent_study
@@ -351,7 +352,7 @@ def run_gradio(results_dir: Path):
                 pruned_html_code = gr.Code(language="html", **code_args)
 
             with gr.Tab("AXTree") as tab_axtree:
-                axtree_code = gr.Code(language=None, **code_args)
+                axtree_code = gr.Markdown()
 
             with gr.Tab("Chat Messages") as tab_chat:
                 chat_messages = gr.Markdown()
@@ -536,38 +537,46 @@ def wrapper(*args, **kwargs):
 
 def update_screenshot(som_or_not: str):
     global info
-    action = info.exp_result.steps_info[info.step].action
-    return agent_utils.tag_screenshot_with_action(
-        get_screenshot(info, som_or_not=som_or_not), action
-    )
+    img, action_str = get_screenshot(info, som_or_not=som_or_not, annotate=True)
+    return img
 
 
-def get_screenshot(info: Info, step: int = None, som_or_not: str = "Raw Screenshots"):
+def get_screenshot(
+    info: Info, step: int = None, som_or_not: str = "Raw Screenshots", annotate: bool = False
+):
+    """Return (screenshot, colored action HTML) for a step; (None, None) if no screenshot file."""
     if step is None:
         step = info.step
     try:
         is_som = som_or_not == "SOM Screenshots"
-        return info.exp_result.get_screenshot(step, som=is_som)
+        img = info.exp_result.get_screenshot(step, som=is_som)
+        if not annotate:
+            return img, None
+        # Index steps_info only when annotating: callers also request info.step + 1, which is
+        # out of range on the last step and must not raise before the screenshot lookup.
+        step_info = info.exp_result.steps_info[step]
+        properties = step_info.obs.get("extra_element_properties") or {}
+        return img, annotate_action(img, step_info.action, properties=properties)
     except FileNotFoundError:
-        return None
+        return None, None
 
 
 def update_screenshot_pair(som_or_not: str):
     global info
-    s1 = get_screenshot(info, info.step, som_or_not)
-    s2 = get_screenshot(info, info.step + 1, som_or_not)
-
-    if s1 is not None:
-        s1 = agent_utils.tag_screenshot_with_action(
-            s1, info.exp_result.steps_info[info.step].action
-        )
+    s1, action_str = get_screenshot(info, info.step, som_or_not, annotate=True)
+    s2, action_str = get_screenshot(info, info.step + 1, som_or_not)
     return s1, s2
 
 
 def update_screenshot_gallery(som_or_not: str):
     global info
-    screenshots = info.exp_result.get_screenshots(som=som_or_not == "SOM Screenshots")
+    max_steps = len(info.exp_result.steps_info)
+    som_or_not == "SOM Screenshots"
+
+    screenshots = [get_screenshot(info, step=i, som_or_not=som_or_not)[0] for i in range(max_steps)]
+
     screenshots_and_label = [(s, f"Step {i}") for i, s in enumerate(screenshots)]
+
     gallery = gr.Gallery(
         value=screenshots_and_label,
         columns=2,
@@ -595,7 +604,8 @@ def update_pruned_html():
 
 
 def update_axtree():
-    return get_obs(key="axtree_txt", default="No AXTree")
+    obs = get_obs(key="axtree_txt", default="No AXTree")
+    return f"```\n{obs}\n```"
 
 
 def dict_to_markdown(d: dict):
@@ -645,7 +655,7 @@ def dict_msg_to_markdown(d: dict):
             case "text":
                 parts.append(f"\n```\n{item['text']}\n```\n")
             case "tool_use":
-                tool_use = f"Tool Use: {item['name']} {item['input']} (id = {item['id']})"
+                tool_use = _format_tool_call(item["name"], item["input"], item["call_id"])
                 parts.append(f"\n```\n{tool_use}\n```\n")
             case _:
                 parts.append(f"\n```\n{str(item)}\n```\n")
@@ -655,6 +665,30 @@ def dict_msg_to_markdown(d: dict):
     return markdown
 
 
+def _format_tool_call(name: str, input: str, call_id: str):
+    """Build the one-line markdown summary of a tool call."""
+    # The input is wrapped in backticks so markdown renders it as inline code.
+    summary = f"Tool Call: {name}  `{input}` (call_id: {call_id})"
+    return summary
+
+
+def format_chat_message(message: BaseMessage | MessageBuilder | dict):
+    """Render one chat message as markdown, dispatching on the message's type."""
+    if isinstance(message, BaseMessage):
+        return message.content
+    if isinstance(message, MessageBuilder):
+        return message.to_markdown()
+    if isinstance(message, dict):
+        return dict_msg_to_markdown(message)
+    if isinstance(message, ResponseFunctionToolCall):
+        # Tool call object: show its name, arguments and call id inside a code fence
+        call_summary = _format_tool_call(message.name, message.arguments, message.call_id)
+        return f"### Tool Use\n```\n{call_summary}\n```\n"
+
+    # Any other message type falls back to its string form
+    return str(message)
+
+
 def update_chat_messages():
     global info
     agent_info = info.exp_result.steps_info[info.step].agent_info
@@ -662,20 +696,9 @@ def update_chat_messages():
     if isinstance(chat_messages, Discussion):
         return chat_messages.to_markdown()
 
-    if isinstance(chat_messages, list) and isinstance(chat_messages[0], MessageBuilder):
-        chat_messages = [
-            m.to_markdown() if isinstance(m, MessageBuilder) else dict_msg_to_markdown(m)
-            for m in chat_messages
-        ]
+    if isinstance(chat_messages, list):
+        chat_messages = [format_chat_message(m) for m in chat_messages]
         return "\n\n".join(chat_messages)
-    messages = []  # TODO(ThibaultLSDC) remove this at some point
-    for i, m in enumerate(chat_messages):
-        if isinstance(m, BaseMessage):  # TODO remove once langchain is deprecated
-            m = m.content
-        elif isinstance(m, dict):
-            m = m.get("content", "No Content")
-        messages.append(f"""# Message {i}\n```\n{m}\n```\n\n""")
-    return "\n".join(messages)
 
 
 def update_task_error():
@@ -722,8 +745,8 @@ def update_agent_info_html():
     global info
     # screenshots from current and next step
     try:
-        s1 = get_screenshot(info, info.step, False)
-        s2 = get_screenshot(info, info.step + 1, False)
+        s1, action_str = get_screenshot(info, info.step, False)
+        s2, action_str = get_screenshot(info, info.step + 1, False)
         agent_info = info.exp_result.steps_info[info.step].agent_info
         page = agent_info.get("html_page", ["No Agent Info"])
         if page is None:
@@ -854,6 +877,8 @@ def get_episode_info(info: Info):
 
 def get_action_info(info: Info):
     steps_info = info.exp_result.steps_info
+    img, action_str = get_screenshot(info, step=info.step, annotate=True)  # to update click_mapper
+
     if len(steps_info) == 0:
         return "No steps were taken"
     if len(steps_info) <= info.step:
@@ -863,7 +888,7 @@ def get_action_info(info: Info):
     action_info = f"""\
 **Action:**
 
-{code(step_info.action)}
+{action_str}
 """
     think = step_info.agent_info.get("think", None)
     if think is not None:
diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py
new file mode 100644
index 00000000..4649a962
--- /dev/null
+++ b/src/agentlab/analyze/overlay_utils.py
@@ -0,0 +1,397 @@
+import ast
+import inspect
+from dataclasses import dataclass
+from typing import Any, Union
+
+import matplotlib.pyplot as plt
+from browsergym.core.action.highlevel import ACTION_SUBSETS
+from PIL import Image, ImageDraw
+
+BGYM_FUNCTION_MAP = {}  # action name -> action function, from the "bid" and "coord" subsets
+for subset in ("bid", "coord"):
+    for func in ACTION_SUBSETS[subset]:
+        if func.__name__ not in BGYM_FUNCTION_MAP:  # keys are names, so test the name
+            BGYM_FUNCTION_MAP[func.__name__] = func
+
+
+@dataclass
+class ArgInfo:  # one argument of one parsed function call
+    function_name: str  # name of the called function, e.g. "mouse_click"
+    name: str  # parameter name (from the signature for positional args, else the keyword)
+    value: Any  # value extracted from the argument's AST node by _extract_value
+    type: str  # type(value).__name__, or "tuple" for merged x,y pairs
+    start_index: int  # index in the parsed string where the argument text starts
+    stop_index: int  # index just past the end of the argument text
+
+
+def parse_function_calls(code_string: str) -> list[ArgInfo]:
+    """
+    Parse a string containing function calls into a flat list of ArgInfo objects.
+
+    Args:
+        code_string: String containing function calls
+
+    Returns:
+        List of ArgInfo objects, one per argument of every call to a function found in
+        BGYM_FUNCTION_MAP; an empty list if code_string cannot be parsed
+
+    Example:
+        >>> code = '''
+        ... mouse_click(34, 59)
+        ... fill("a234", "test")
+        ... '''
+        >>> result = parse_function_calls(code)
+        >>> # Returns list of ArgInfo objects for each argument
+    """
+    result = []
+
+    try:
+        tree = ast.parse(code_string)
+
+        # Extract all function calls
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
+                func_name = node.func.id
+
+                # Check if this function exists in our module
+                if func_name in BGYM_FUNCTION_MAP:
+                    func = BGYM_FUNCTION_MAP[func_name]
+
+                    # Get function signature to map positional args to parameter names
+                    try:
+                        sig = inspect.signature(func)
+                        param_names = list(sig.parameters.keys())
+
+                        # Process positional arguments
+                        for i, arg in enumerate(node.args):
+                            if i < len(param_names):
+                                param_name = param_names[i]
+                                value = _extract_value(arg)
+                                start_idx, stop_idx = _get_node_indices(code_string, arg)
+                                arg_info = ArgInfo(
+                                    function_name=func_name,
+                                    name=param_name,
+                                    value=value,
+                                    type=type(value).__name__,
+                                    start_index=start_idx,
+                                    stop_index=stop_idx,
+                                )
+                                result.append(arg_info)
+
+                        # Process keyword arguments
+                        for keyword in node.keywords:
+                            if keyword.arg is None:
+                                # **kwargs unpacking has no name to report; skip it
+                                continue
+                            value = _extract_value(keyword.value)
+                            start_idx, stop_idx = _get_node_indices(
+                                code_string, keyword.value, keyword
+                            )
+                            arg_info = ArgInfo(
+                                function_name=func_name,
+                                name=keyword.arg,
+                                value=value,
+                                type=type(value).__name__,
+                                start_index=start_idx,
+                                stop_index=stop_idx,
+                            )
+                            result.append(arg_info)
+
+                    except Exception as e:
+                        # If we can't inspect the function, skip it
+                        print(f"Warning: Could not process function {func_name}: {e}")
+                        continue
+
+    except (SyntaxError, ValueError) as e:
+        print(f"Syntax error in code string: {e}")
+        return []
+
+    return result
+
+
+def _extract_value(node: ast.AST) -> Any:
+    """
+    Extract the actual value from an AST node.
+
+    Args:
+        node: AST node representing a value
+
+    Returns:
+        The literal value for constants, a list for list literals (elements are
+        extracted recursively), the identifier as a string for bare names, and
+        the unparsed source text for any other expression.
+    """
+    # NOTE: the legacy ast.Str / ast.Num aliases are deliberately not referenced.
+    # They have been deprecated aliases of ast.Constant since Python 3.8 and are
+    # removed in Python 3.14, where merely touching them raises AttributeError
+    # for every non-constant argument (lists, names, ...).
+    if isinstance(node, ast.Constant):
+        # Python 3.8+ uses ast.Constant for all literals
+        return node.value
+    elif isinstance(node, ast.List):
+        # Handle list literals
+        return [_extract_value(item) for item in node.elts]
+    elif isinstance(node, ast.Name):
+        # Handle variable names (return as string identifier)
+        return node.id
+    else:
+        # For other node types, return a string representation
+        return ast.unparse(node) if hasattr(ast, "unparse") else str(node)
+
+
+def _get_node_indices(
+    source: str, node: ast.AST, keyword_node: ast.keyword = None
+) -> tuple[int, int]:
+    """
+    Convert AST node line/column positions to absolute character indices.
+
+    Args:
+        source: Original source code string
+        node: AST node (the value)
+        keyword_node: If provided, use this keyword node's position as start
+            (the span then covers "name=value")
+
+    Returns:
+        Tuple of (start_index, stop_index) such that source[start_index:stop_index]
+        is the text of the argument
+
+    Note:
+        ast reports column offsets in UTF-8 bytes; they are converted to character
+        counts here so that non-ASCII text does not shift the returned span.
+    """
+    lines = source.splitlines(keepends=True)
+
+    def _to_index(lineno: int, col_offset: int) -> int:
+        """Map a 1-based line number and a UTF-8 byte column to an index in source."""
+        line = lines[lineno - 1]
+        # Count the characters covered by the first col_offset bytes of the line;
+        # for pure ASCII lines this is col_offset itself.
+        n_chars = len(line.encode("utf-8")[:col_offset].decode("utf-8", errors="ignore"))
+        return sum(len(previous) for previous in lines[: lineno - 1]) + n_chars
+
+    # For keyword arguments, start from the keyword name
+    start_node = node if keyword_node is None else keyword_node
+    start_index = _to_index(start_node.lineno, start_node.col_offset)
+
+    # End index always comes from the value node
+    end_lineno = getattr(node, "end_lineno", None)
+    end_col_offset = getattr(node, "end_col_offset", None)
+    if end_lineno is not None and end_col_offset is not None:
+        end_index = _to_index(end_lineno, end_col_offset)
+    else:
+        # Fallback estimation when the node carries no end position
+        segment = ast.get_source_segment(source, node)
+        end_index = start_index + len(segment) if segment else start_index + 1
+
+    return start_index, end_index
+
+
+def find_bids_and_xy_pairs(args: list[ArgInfo]) -> list[ArgInfo]:
+    """
+    Find bid arguments and x,y coordinate pairs from a list of ArgInfo objects.
+
+    Args:
+        args: List of ArgInfo objects from parse_function_calls
+
+    Returns:
+        List of ArgInfo objects containing:
+        - Original bid arguments (unchanged)
+        - Merged x,y pairs with joint names, tuple values, and combined indices
+
+    Rules for x,y pairs:
+    - Must be consecutive arguments of the same function call name
+    - Must end with 'x' and 'y' respectively
+    - Must have the same prefix (everything before 'x'/'y')
+    - Merged name: prefix + "xy" (e.g. "to_x"/"to_y" -> "to_xy", "x"/"y" -> "xy")
+    - Merged value: (x_value, y_value) as floats, or the raw values if not convertible
+    - Merged indices: start of x to stop of y
+    """
+    result = []
+    i = 0
+
+    while i < len(args):
+        current_arg = args[i]
+
+        # Check if current arg name ends with 'bid'
+        if current_arg.name.endswith("bid"):
+            result.append(current_arg)
+            i += 1
+            continue
+
+        # Check for x,y pair: this arg ends with 'x' and the next one ends with 'y'
+        if i + 1 < len(args) and current_arg.name.endswith("x") and args[i + 1].name.endswith("y"):
+
+            next_arg = args[i + 1]
+
+            # Extract prefixes (everything before 'x' and 'y')
+            current_prefix = current_arg.name[:-1]  # Remove 'x'
+            next_prefix = next_arg.name[:-1]  # Remove 'y'
+
+            # Check if they have the same prefix and are from the same function
+            if (
+                current_prefix == next_prefix
+                and current_arg.function_name == next_arg.function_name
+            ):
+
+                # Create merged ArgInfo for x,y pair (the prefix keeps any trailing '_')
+                merged_name = f"{current_prefix}xy"
+
+                # Convert values to floats and create tuple
+                try:
+                    x_val = float(current_arg.value)
+                    y_val = float(next_arg.value)
+                    merged_value = (x_val, y_val)
+                except (ValueError, TypeError):
+                    # If conversion fails, keep original values
+                    merged_value = (current_arg.value, next_arg.value)
+
+                merged_arg = ArgInfo(
+                    function_name=current_arg.function_name,
+                    name=merged_name,
+                    value=merged_value,
+                    type="tuple",
+                    start_index=current_arg.start_index,
+                    stop_index=next_arg.stop_index,
+                )
+
+                result.append(merged_arg)
+                i += 2  # Skip both x and y args
+                continue
+
+        # Neither a bid nor the start of an x,y pair: drop this argument from the result
+        i += 1
+
+    return result
+
+
+def overlay_cross(
+    img: Image.Image,
+    coord: tuple[float, float],
+    color: Union[str, tuple[int, int, int]] = "red",
+    length: int = 7,
+    width: int = 1,
+) -> Image.Image:
+    """Draw a cross centered on coord onto img (modified in place) and return img."""
+    canvas = ImageDraw.Draw(img)
+    cx, cy = coord
+    arm = length // 2  # each arm extends length // 2 pixels from the center
+    segments = (
+        [cx - arm, cy, cx + arm, cy],  # horizontal stroke
+        [cx, cy - arm, cx, cy + arm],  # vertical stroke
+    )
+    for segment in segments:
+        canvas.line(segment, fill=color, width=width)
+    return img
+
+
+def overlay_rectangle(
+    img: Image.Image,
+    bbox: tuple[float, float, float, float],
+    color: Union[str, tuple[int, int, int]] = "red",
+    width: int = 1,
+) -> Image.Image:
+    """Outline bbox, given as (x, y, w, h), on img (modified in place) and return img."""
+    canvas = ImageDraw.Draw(img)
+    left, top, box_w, box_h = bbox
+    right = left + box_w
+    bottom = top + box_h
+    # The drawing call takes two opposite corners, so origin + size is converted first
+    canvas.rectangle([left, top, right, bottom], outline=color, width=width)
+    return img
+
+
+def annotate_action(
+    img: Image.Image, action_string: str, properties: dict[str, dict], colormap: str = "tab10"
+) -> str:
+    """
+    Annotate an image with overlays for action arguments and return colored HTML.
+
+    Args:
+        img: PIL Image to modify in place
+        action_string: String containing function calls (None is treated as empty)
+        properties: Dict mapping bid strings to a dict whose "bbox" entry is the element's
+            (x, y, width, height) box; None is treated as an empty dict
+        colormap: Matplotlib colormap name for auto-color selection
+
+    Returns:
+        HTML string with arguments colored to match overlays
+    """
+    # A step without an action or without element properties must not crash the caller
+    action_string = action_string or ""
+    properties = properties or {}
+
+    # Parse the calls, then keep only the bids and the merged x,y pairs
+    filtered_args = find_bids_and_xy_pairs(parse_function_calls(action_string))
+
+    # Get colormap
+    cmap = plt.get_cmap(colormap)
+
+    # Track colors for each filtered argument
+    colors = []
+
+    # Add overlays to image
+    for i, arg_info in enumerate(filtered_args):
+        # Get color from colormap
+        color_rgb = cmap(i % cmap.N)
+        color_255 = tuple(int(c * 255) for c in color_rgb[:3])  # Convert to 0-255 range
+
+        colors.append(color_rgb[:3])  # Store normalized RGB for HTML
+
+        if arg_info.name.endswith("xy"):
+            # Handle x,y coordinate pairs; values that are not numbers cannot be drawn
+            x, y = arg_info.value
+            if isinstance(x, float) and isinstance(y, float):
+                overlay_cross(img, (x, y), color_255, length=9, width=3)
+
+        elif arg_info.name.endswith("bid"):
+            # Handle bid arguments with bounding boxes; unknown bids get no overlay
+            bbox = (properties.get(arg_info.value) or {}).get("bbox")
+            if bbox:
+                overlay_rectangle(img, bbox, color_255, width=3)
+
+    # Generate colored HTML
+    html = create_colored_html(action_string, filtered_args, colors)
+
+    return html
+
+
+def create_colored_html(action_string: str, filtered_args: list, colors: list) -> str:
+    """
+    Create HTML with colored arguments using start/stop indices.
+
+    Args:
+        action_string: Original action string
+        filtered_args: List of ArgInfo objects with start_index/stop_index
+        colors: List of RGB tuples, same length as filtered_args
+
+    Returns:
+        HTML string with colored spans; the action text itself is HTML-escaped
+    """
+    from html import escape  # local import: keeps the module's import block untouched
+    # Sort args by start position for sequential processing
+    sorted_pairs = sorted(zip(filtered_args, colors), key=lambda x: x[0].start_index)
+
+    # Build HTML with colored spans
+    html_parts = []
+    last_end = 0
+
+    for arg_info, color_rgb in sorted_pairs:
+        # Add uncolored text before this argument; it comes from the agent, so "<", ">" and
+        # "&" are escaped to be shown literally instead of being interpreted as markup
+        html_parts.append(escape(action_string[last_end : arg_info.start_index], quote=False))
+
+        # Get the argument text
+        arg_text = escape(action_string[arg_info.start_index : arg_info.stop_index], quote=False)
+
+        # Convert color to hex
+        color_hex = "#{:02x}{:02x}{:02x}".format(
+            int(color_rgb[0] * 255), int(color_rgb[1] * 255), int(color_rgb[2] * 255)
+        )
+
+        # Add colored span
+        html_parts.append(f'<span style="color: {color_hex}; font-weight: bold;">{arg_text}</span>')
+        last_end = arg_info.stop_index
+
+    # Add remaining text
+    html_parts.append(escape(action_string[last_end:], quote=False))
+    return "".join(html_parts)
diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py
new file mode 100644
index 00000000..a3aabf51
--- /dev/null
+++ b/tests/analyze/test_overlay_utils.py
@@ -0,0 +1,78 @@
+from PIL import Image
+
+from agentlab.analyze import overlay_utils
+
+
+def test_parse_function_calls():
+    """Check parsed argument names and that their spans slice back to the argument text."""
+    test_code = """
+mouse_click(34, 59)
+fill("a234", "test")
+click('b123', button="right", modifiers=["Shift", "Control"])
+select_option("c456", ["option1", "option2"])
+"""
+
+    result = overlay_utils.parse_function_calls(test_code)
+    # result[1]: second positional argument of mouse_click, named from the function signature
+    assert result[1].function_name == "mouse_click"
+    assert result[1].name == "y"
+    assert test_code[result[1].start_index : result[1].stop_index] == "59"
+    # result[8]: last of the 9 parsed arguments (2 + 2 + 3 + 2 across the four calls)
+    assert result[8].function_name == "select_option"
+    assert result[8].name == "options"
+    assert test_code[result[8].start_index : result[8].stop_index] == '["option1", "option2"]'
+
+
+def test_filtering_args():
+    test_code = """
+mouse_click(34, 59)
+fill("a234", "test")
+mouse_drag_and_drop(34, 59, to_x=100, to_y=200)
+drag_and_drop("a123", "b456")
+"""
+    result = overlay_utils.parse_function_calls(test_code)
+    args = overlay_utils.find_bids_and_xy_pairs(result)
+
+    assert len(args) == 6  # mouse_click xy + fill bid + 2 drag xy pairs + 2 drag_and_drop bids
+
+    assert args[0].function_name == "mouse_click"
+    assert args[0].name == "xy"
+    assert args[0].value == (34.0, 59.0)
+    assert test_code[args[0].start_index : args[0].stop_index] == "34, 59"
+
+    assert args[2].name == "from_xy"
+    assert args[3].name == "to_xy"
+    assert test_code[args[3].start_index : args[3].stop_index] == "to_x=100, to_y=200"
+
+
+def manual_test():
+    """Manual test function that displays the resulting image."""
+    import matplotlib.pyplot as plt
+
+    # Create a white test image
+    img = Image.new("RGB", (400, 300), "white")
+
+    # Test action string with multiple function calls
+    action_string = """mouse_click(100, 150)
+fill("search_box", "hello world")
+click("submit_btn")"""
+
+    boxes = {"search_box": (200, 100, 150, 30), "submit_btn": (200, 200, 80, 30)}  # x, y, w, h
+    properties = {bid: {"bbox": box} for bid, box in boxes.items()}  # shape annotate_action reads
+
+    # Annotate the image and get colored HTML
+    html_result = overlay_utils.annotate_action(img, action_string, properties, colormap="tab10")
+
+    # Display result
+    plt.figure(figsize=(10, 6))
+    plt.imshow(img)
+    plt.axis("off")
+    plt.show()
+
+    print("HTML with colored arguments:")
+    print(html_result)
+    print("\nManual test completed!")
+
+
+if __name__ == "__main__":
+    manual_test()

From 7222f4148cf04fb68b3ebf5f4ad1f845c5000ac0 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:16:09 -0400
Subject: [PATCH 2/8] ignore some local files I work with

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6339cccb..c878e3d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,4 +171,8 @@ results/
 outputs/
 miniwob-plusplus/
 .miniwob-server.pid
-debugging_results/
\ No newline at end of file
+debugging_results/
+
+# working files
+main_miniwob_debug.py
+main_workarena_debug.py

From 022a03cca45f0632198f5403faa2693fabb1728d Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:17:01 -0400
Subject: [PATCH 3/8] for backward compatibility

---
 src/agentlab/agents/tool_use_agent/__init__.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py
index e69de29b..de27826d 100644
--- a/src/agentlab/agents/tool_use_agent/__init__.py
+++ b/src/agentlab/agents/tool_use_agent/__init__.py
@@ -0,0 +1,6 @@
+import sys
+
+from .tool_use_agent import *
+
+# for backward compatibility of unpickling
+sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__]

From e33d66470c96033e0884e569f722031aa9369481 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:17:47 -0400
Subject: [PATCH 4/8] no explicit caching for OpenAI

---
 src/agentlab/llm/response_api.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index 755886f8..2b9f696c 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -154,6 +154,9 @@ def handle_tool_call(self, content):
             output.append({"role": "user", "content": tail_content})
         return output
 
+    def mark_all_previous_msg_for_caching(self) -> List[Message]:
+        pass
+
 
 class AnthropicAPIMessageBuilder(MessageBuilder):
 

From bb04218c0b5895a7cd0d3c4597a5bb22686646be Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:39:58 -0400
Subject: [PATCH 5/8] minor corrections

---
 src/agentlab/agents/tool_use_agent/__init__.py | 2 --
 src/agentlab/analyze/agent_xray.py             | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py
index de27826d..b03b1169 100644
--- a/src/agentlab/agents/tool_use_agent/__init__.py
+++ b/src/agentlab/agents/tool_use_agent/__init__.py
@@ -1,6 +1,4 @@
 import sys
 
-from .tool_use_agent import *
-
 # for backward compatibility of unpickling
 sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__]
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 439a99ee..08fc3d0f 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -571,7 +571,6 @@ def update_screenshot_pair(som_or_not: str):
 def update_screenshot_gallery(som_or_not: str):
     global info
     max_steps = len(info.exp_result.steps_info)
-    som_or_not == "SOM Screenshots"
 
     screenshots = [get_screenshot(info, step=i, som_or_not=som_or_not)[0] for i in range(max_steps)]
 

From 3c38c2b8ef35ce9e388b3f0be3386ec06f6b77f2 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 15:42:06 -0400
Subject: [PATCH 6/8] rename manual_test function to manual_eval for clarity

---
 tests/analyze/test_overlay_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py
index a3aabf51..cdff70d6 100644
--- a/tests/analyze/test_overlay_utils.py
+++ b/tests/analyze/test_overlay_utils.py
@@ -45,7 +45,7 @@ def test_filtering_args():
     assert test_code[args[3].start_index : args[3].stop_index] == "to_x=100, to_y=200"
 
 
-def manual_test():
+def manual_eval():
     """Manual test function that displays the resulting image."""
     import matplotlib.pyplot as plt
 
@@ -75,4 +75,4 @@ def manual_test():
 
 
 if __name__ == "__main__":
-    manual_test()
+    manual_eval()

From b45877d78134dfc9a51f4382d31410b5cbce5de7 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 16:10:01 -0400
Subject: [PATCH 7/8] add check for empty summary DataFrame and improve error
 logging

---
 src/agentlab/analyze/agent_xray.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 08fc3d0f..d46b57f3 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -1108,8 +1108,11 @@ def get_directory_contents(results_dir: Path):
                 most_recent_summary = max(summary_files, key=os.path.getctime)
                 summary_df = pd.read_csv(most_recent_summary)
 
+                if len(summary_df) == 0 or summary_df["avg_reward"].isna().all():
+                    continue  # skip if the summary is empty or all avg_reward are NaN
+
                 # get row with max avg_reward
-                max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax()]
+                max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax(skipna=True)]
                 reward = max_reward_row["avg_reward"] * 100
                 completed = max_reward_row["n_completed"]
                 n_err = max_reward_row["n_err"]
@@ -1117,7 +1120,7 @@ def get_directory_contents(results_dir: Path):
                     f" - avg-reward: {reward:.1f}% - completed: {completed} - errors: {n_err}"
                 )
         except Exception as e:
-            print(f"Error while reading summary file: {e}")
+            print(f"Error while reading summary file {most_recent_summary}: {e}")
 
         exp_descriptions.append(exp_description)
 

From 53c16f1095fdfcf856ea816b5429228d69922efa Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 27 Jun 2025 16:47:03 -0400
Subject: [PATCH 8/8] refactor: remove antialiasing from plot_profiling
 function and update properties structure in tests

---
 src/agentlab/analyze/agent_xray.py  | 1 -
 tests/analyze/test_overlay_utils.py | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index d46b57f3..24564dcc 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -1246,7 +1246,6 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
                 horizontalalignment="left",
                 rotation=0,
                 clip_on=True,
-                antialiased=True,
                 fontweight=1000,
                 backgroundcolor=colors[12],
             )
diff --git a/tests/analyze/test_overlay_utils.py b/tests/analyze/test_overlay_utils.py
index cdff70d6..58f5915b 100644
--- a/tests/analyze/test_overlay_utils.py
+++ b/tests/analyze/test_overlay_utils.py
@@ -58,7 +58,10 @@ def manual_eval():
 click("submit_btn")"""
 
     # Mock properties mapping bids to bounding boxes
-    properties = {"search_box": (200, 100, 350, 130), "submit_btn": (200, 200, 280, 230)}
+    properties = {
+        "search_box": {"bbox": (50, 50, 100, 50)},
+        "submit_btn": {"bbox": (150, 100, 120, 30)},
+    }
 
     # Annotate the image and get colored HTML
     html_result = overlay_utils.annotate_action(img, action_string, properties, colormap="tab10")