From 7af2d152adeed3d48c7a074e175125823e76f203 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Wed, 13 Aug 2025 12:16:50 +0200
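Subject: [PATCH 01/23] Add LLM/embedding task-hint retrieval and new model configs; guard missing token details in usage tracking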

---
 .../agents/tool_use_agent/tool_use_agent.py   | 139 ++++++++++++++++--
 src/agentlab/analyze/agent_xray.py            |   2 +-
 src/agentlab/llm/tracking.py                  |  12 +-
 3 files changed, 137 insertions(+), 16 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 6ac61180..b1407a87 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -1,10 +1,12 @@
 import fnmatch
 import json
+import logging
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from copy import copy
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 import bgym
 import pandas as pd
@@ -16,6 +18,7 @@
     overlay_som,
     prune_html,
 )
+from sentence_transformers import SentenceTransformer
 
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
@@ -34,6 +37,8 @@
 )
 from agentlab.llm.tracking import cost_tracker_decorator
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class Block(ABC):
@@ -298,22 +303,45 @@ def apply_init(self, llm, discussion: StructuredDiscussion) -> dict:
 class TaskHint(Block):
     use_task_hint: bool = True
     hint_db_rel_path: str = "hint_db.csv"
+    hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct"
+    top_n: int = 4  # Number of top hints to return when using embedding retrieval
+    embedder_model: str = "Qwen/Qwen3-Embedding-0.6B"  # Model for embedding hints
+    llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n
+You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n
+Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1."""
 
     def _init(self):
         """Initialize the block."""
-        hint_db_path = Path(__file__).parent / self.hint_db_rel_path
+        if Path(self.hint_db_rel_path).is_absolute():
+            hint_db_path = Path(self.hint_db_rel_path)
+        else:
+            hint_db_path = Path(__file__).parent / self.hint_db_rel_path
         self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str)
+        if self.hint_retrieval_mode == "emb":
+            logger.info("Load sentence transformer model for hint embeddings.")
+            self.emb_model = SentenceTransformer(
+                "Qwen/Qwen3-Embedding-0.6B", model_kwargs={"torch_dtype": "bfloat16"}
+            )
+            self.encode_hints()
+
+    def encode_hints(self):
+        self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first")
+        logger.info(
+            f"Encoding {len(self.uniq_hints)} unique hints using {self.embedder_model} model."
+        )
+        self.hint_embeddings = self.emb_model.encode(
+            self.uniq_hints["hint"].tolist(), prompt="task hint"
+        )
 
     def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
         if not self.use_task_hint:
-            return
+            return {}
 
-        task_hints = self.hint_db[
-            self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
-        ]
+        goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content])
+        task_hints = self.choose_hints(llm, task_name, goal)
 
         hints = []
-        for hint in task_hints["hint"]:
+        for hint in task_hints:
             hint = hint.strip()
             if hint:
                 hints.append(f"- {hint}")
@@ -327,6 +355,58 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
 
             discussion.append(msg)
 
+    def choose_hints(self, llm, task_name: str, goal: str) -> list[str]:
+        """Choose hints based on the task name."""
+        if self.hint_retrieval_mode == "llm":
+            return self.choose_hints_llm(llm, goal)
+        elif self.hint_retrieval_mode == "direct":
+            return self.choose_hints_direct(task_name)
+        elif self.hint_retrieval_mode == "emb":
+            return self.choose_hints_emb(goal)
+        else:
+            raise ValueError(f"Unknown hint retrieval mode: {self.hint_retrieval_mode}")
+
+    def choose_hints_llm(self, llm, goal: str) -> list[str]:
+        """Choose hints using LLM to filter the hints."""
+        topic_to_hints = defaultdict(list)
+        for i, row in self.hint_db.iterrows():
+            topic_to_hints[row["semantic_keys"]].append(i)
+        hint_topics = list(topic_to_hints.keys())
+        topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)])
+        prompt = self.llm_prompt.format(goal=goal, topics=topics)
+        response = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)]))
+        try:
+            hint_topic_idx = json.loads(response.think)
+            if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics):
+                logger.error(f"Wrong LLM hint id response: {response.think}, no hints")
+                return []
+            hint_topic = hint_topics[hint_topic_idx]
+            hint_indices = topic_to_hints[hint_topic]
+            df = self.hint_db.iloc[hint_indices].copy()
+            df = df.drop_duplicates(subset=["hint"], keep="first")  # leave only unique hints
+            hints = df["hint"].tolist()
+            logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}")
+        except json.JSONDecodeError:
+            logger.error(f"Failed to parse LLM hint id response: {response.think}, no hints")
+            hints = []
+        return hints
+
+    def choose_hints_emb(self, goal: str) -> list[str]:
+        """Choose hints using embeddings to filter the hints."""
+        goal_embeddings = self.emb_model.encode([goal], prompt="task description")
+        similarities = self.emb_model.similarity(goal_embeddings, self.hint_embeddings)
+        top_indices = similarities.argsort()[0][-self.top_n :].tolist()
+        logger.info(f"Top hint indices based on embedding similarity: {top_indices}")
+        hints = self.uniq_hints.iloc[top_indices]
+        logger.info(f"Embedding-based hints chosen: {hints}")
+        return hints["hint"].tolist()
+
+    def choose_hints_direct(self, task_name: str) -> list[str]:
+        hints = self.hint_db[
+            self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
+        ]
+        return hints["hint"].tolist()
+
 
 @dataclass
 class PromptConfig:
@@ -510,6 +590,15 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )
 
+GPT_4_1_CC_API = OpenAIChatModelArgs(
+    model_name="gpt-4.1",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=0.1,
+    vision_support=True,
+)
+
 GPT_4_1_MINI = OpenAIResponseModelArgs(
     model_name="gpt-4.1-mini",
     max_total_tokens=200_000,
@@ -528,7 +617,7 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )
 
-CLAUDE_MODEL_CONFIG = ClaudeResponseModelArgs(
+CLAUDE_SONNET_37 = ClaudeResponseModelArgs(
     model_name="claude-3-7-sonnet-20250219",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
@@ -537,6 +626,15 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )
 
+CLAUDE_SONNET_4 = ClaudeResponseModelArgs(
+    model_name="claude-sonnet-4-20250514",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=0.1,
+    vision_support=True,
+)
+
 O3_RESPONSE_MODEL = OpenAIResponseModelArgs(
     model_name="o3-2025-04-16",
     max_total_tokens=200_000,
@@ -554,6 +652,25 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )
 
+GPT_5 = OpenAIChatModelArgs(
+    model_name="gpt-5",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,
+    vision_support=True,
+)
+
+
+GPT_5_MINI = OpenAIChatModelArgs(
+    model_name="gpt-5-mini-2025-08-07",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=1.0,
+    vision_support=True,
+)
+
 GPT4_1_OPENROUTER_MODEL = OpenRouterModelArgs(
     model_name="openai/gpt-4.1",
     max_total_tokens=200_000,
@@ -580,12 +697,12 @@ def get_action(self, obs: Any) -> float:
     keep_last_n_obs=None,
     multiaction=True,  # whether to use multi-action or not
     # action_subsets=("bid",),
-    action_subsets=("coord"),
+    action_subsets=("coord",),
     # action_subsets=("coord", "bid"),
 )
 
 AGENT_CONFIG = ToolUseAgentArgs(
-    model_args=CLAUDE_MODEL_CONFIG,
+    model_args=CLAUDE_SONNET_37,
     config=DEFAULT_PROMPT_CONFIG,
 )
 
@@ -605,7 +722,7 @@ def get_action(self, obs: Any) -> float:
 )
 
 OSWORLD_CLAUDE = ToolUseAgentArgs(
-    model_args=CLAUDE_MODEL_CONFIG,
+    model_args=CLAUDE_SONNET_37,
     config=PromptConfig(
         tag_screenshot=True,
         goal=Goal(goal_as_system_msg=True),
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 84dc423d..37ead1c3 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -735,7 +735,7 @@ def dict_msg_to_markdown(d: dict):
             case _:
                 parts.append(f"\n```\n{str(item)}\n```\n")
 
-    markdown = f"### {d["role"].capitalize()}\n"
+    markdown = f"### {d['role'].capitalize()}\n"
     markdown += "\n".join(parts)
     return markdown
 
diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py
index e761a7f6..afcf5e07 100644
--- a/src/agentlab/llm/tracking.py
+++ b/src/agentlab/llm/tracking.py
@@ -178,9 +178,9 @@ def __call__(self, *args, **kwargs):
         # 'self' here calls ._call_api() method of the subclass
         response = self._call_api(*args, **kwargs)
         usage = dict(getattr(response, "usage", {}))
-        if "prompt_tokens_details" in usage:
+        if "prompt_tokens_details" in usage and usage["prompt_tokens_details"]:
             usage["cached_tokens"] = usage["prompt_tokens_details"].cached_tokens
-        if "input_tokens_details" in usage:
+        if "input_tokens_details" in usage and usage["input_tokens_details"]:
             usage["cached_tokens"] = usage["input_tokens_details"].cached_tokens
         usage = {f"usage_{k}": v for k, v in usage.items() if isinstance(v, (int, float))}
         usage |= {"n_api_calls": 1}
@@ -332,12 +332,16 @@ def get_effective_cost_from_openai_api(self, response) -> float:
         if api_type == "chatcompletion":
             total_input_tokens = usage.prompt_tokens  # (cache read tokens + new input tokens)
             output_tokens = usage.completion_tokens
-            cached_input_tokens = usage.prompt_tokens_details.cached_tokens
+            cached_input_tokens = (
+                usage.prompt_tokens_details.cached_tokens if usage.prompt_tokens_details else 0
+            )
             new_input_tokens = total_input_tokens - cached_input_tokens
         elif api_type == "response":
             total_input_tokens = usage.input_tokens  # (cache read tokens + new input tokens)
             output_tokens = usage.output_tokens
-            cached_input_tokens = usage.input_tokens_details.cached_tokens
+            cached_input_tokens = (
+                usage.input_tokens_details.cached_tokens if usage.input_tokens_details else 0
+            )
             new_input_tokens = total_input_tokens - cached_input_tokens
         else:
             logging.warning(f"Unsupported API type: {api_type}. Defaulting cost to 0.0.")

From 3f9e4a2191f81d1a177e9be3d6eea734924754cd Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Wed, 13 Aug 2025 12:16:57 +0200
Subject: [PATCH 02/23] Add sentence-transformers and python-dotenv dependencies

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 6322ffd3..a2798f2e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,3 +27,5 @@ ray[default]
 python-slugify
 pillow
 gymnasium>=0.27
+sentence-transformers>=5.0.0
+python-dotenv>=1.1.1
\ No newline at end of file

From c88d7f3fd0f7942e700d5d79ee79555f45cf3f6b Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 19 Aug 2025 14:14:38 +0200
Subject: [PATCH 03/23] use external embedding service in task hints retrieval

---
 .../agents/tool_use_agent/tool_use_agent.py   | 75 +++++++++++++++----
 1 file changed, 61 insertions(+), 14 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index b1407a87..f6ace3a8 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -1,6 +1,9 @@
 import fnmatch
 import json
 import logging
+import os
+import random
+import time
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import copy
@@ -9,7 +12,9 @@
 from typing import Any, Literal
 
 import bgym
+import numpy as np
 import pandas as pd
+import requests
 from bgym import Benchmark as BgymBenchmark
 from browsergym.core.observation import extract_screenshot
 from browsergym.utils.obs import (
@@ -18,7 +23,6 @@
     overlay_som,
     prune_html,
 )
-from sentence_transformers import SentenceTransformer
 
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
@@ -181,7 +185,6 @@ class Obs(Block):
     def apply(
         self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput
     ) -> dict:
-
         obs_msg = llm.msg.user()
         tool_calls = last_llm_output.tool_calls
         if self.use_last_error:
@@ -306,6 +309,7 @@ class TaskHint(Block):
     hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct"
     top_n: int = 4  # Number of top hints to return when using embedding retrieval
     embedder_model: str = "Qwen/Qwen3-Embedding-0.6B"  # Model for embedding hints
+    embedder_server: str = "http://localhost:5000"
     llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n
 You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n
 Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1."""
@@ -318,20 +322,26 @@ def _init(self):
             hint_db_path = Path(__file__).parent / self.hint_db_rel_path
         self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str)
         if self.hint_retrieval_mode == "emb":
-            logger.info("Load sentence transformer model for hint embeddings.")
-            self.emb_model = SentenceTransformer(
-                "Qwen/Qwen3-Embedding-0.6B", model_kwargs={"torch_dtype": "bfloat16"}
-            )
             self.encode_hints()
 
+    def oai_embed(self, text: str):
+        response = self._oai_emb.create(input=text, model="text-embedding-3-small")
+        return response.data[0].embedding
+
     def encode_hints(self):
         self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first")
         logger.info(
-            f"Encoding {len(self.uniq_hints)} unique hints using {self.embedder_model} model."
-        )
-        self.hint_embeddings = self.emb_model.encode(
-            self.uniq_hints["hint"].tolist(), prompt="task hint"
+            f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model."
         )
+        hints = self.uniq_hints["hint"].tolist()
+        semantic_keys = self.uniq_hints["semantic_keys"].tolist()
+        lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)]
+        emb_path = f"{self.hint_db_rel_path}.embs.npy"
+        assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}"
+        logger.info(f"Loading hint embeddings from: {emb_path}")
+        emb_dict = np.load(emb_path, allow_pickle=True).item()
+        self.hint_embeddings = np.array([emb_dict[k] for k in lines])
+        logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}")
 
     def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
         if not self.use_task_hint:
@@ -393,14 +403,50 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
 
     def choose_hints_emb(self, goal: str) -> list[str]:
         """Choose hints using embeddings to filter the hints."""
-        goal_embeddings = self.emb_model.encode([goal], prompt="task description")
-        similarities = self.emb_model.similarity(goal_embeddings, self.hint_embeddings)
+        goal_embeddings = self._encode([goal], prompt="task description")
+        similarities = self._similarity(goal_embeddings.tolist(), self.hint_embeddings.tolist())
         top_indices = similarities.argsort()[0][-self.top_n :].tolist()
         logger.info(f"Top hint indices based on embedding similarity: {top_indices}")
         hints = self.uniq_hints.iloc[top_indices]
         logger.info(f"Embedding-based hints chosen: {hints}")
         return hints["hint"].tolist()
 
+    def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_retries: int = 5):
+        """Call the encode API endpoint with timeout and retries"""
+        for attempt in range(max_retries):
+            try:
+                response = requests.post(
+                    f"{self.embedder_server}/encode",
+                    json={"texts": texts, "prompt": prompt},
+                    timeout=timeout,
+                )
+                embs = response.json()["embeddings"]
+                return np.asarray(embs)
+            except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e:
+                if attempt == max_retries - 1:
+                    raise e
+                time.sleep(random.uniform(1, timeout))
+                continue
+
+    def _similarity(
+        self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5
+    ):
+        """Call the similarity API endpoint with timeout and retries"""
+        for attempt in range(max_retries):
+            try:
+                response = requests.post(
+                    f"{self.embedder_server}/similarity",
+                    json={"texts1": texts1, "texts2": texts2},
+                    timeout=timeout,
+                )
+                similarities = response.json()["similarities"]
+                return np.asarray(similarities)
+            except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e:
+                if attempt == max_retries - 1:
+                    raise e
+                time.sleep(random.uniform(1, timeout))
+                continue
+
     def choose_hints_direct(self, task_name: str) -> list[str]:
         hints = self.hint_db[
             self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
@@ -466,7 +512,8 @@ def __init__(
         self.model_args = model_args
         self.config = config
         self.action_set: bgym.AbstractActionSet = action_set or bgym.HighLevelActionSet(
-            self.config.action_subsets, multiaction=self.config.multiaction  # type: ignore
+            self.config.action_subsets,
+            multiaction=self.config.multiaction,  # type: ignore
         )
         self.tools = self.action_set.to_tool_description(api=model_args.api)
 
@@ -656,7 +703,7 @@ def get_action(self, obs: Any) -> float:
     model_name="gpt-5",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
-    max_new_tokens=2_000,
+    max_new_tokens=8_000,
     temperature=None,
     vision_support=True,
 )

From 74fc47f2820ec6dde79035a4d3bb5e5949d2c2bf Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 19 Aug 2025 14:14:49 +0200
Subject: [PATCH 04/23] GPT-5 fixes: use max_completion_tokens and litellm pricing in chat API

---
 src/agentlab/llm/chat_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
index ff341356..dc9667b5 100644
--- a/src/agentlab/llm/chat_api.py
+++ b/src/agentlab/llm/chat_api.py
@@ -292,7 +292,7 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
                     messages=messages,
                     n=n_samples,
                     temperature=temperature,
-                    max_tokens=self.max_tokens,
+                    max_completion_tokens=self.max_tokens,
                     logprobs=self.log_probs,
                 )
 
@@ -359,7 +359,7 @@ def __init__(
             min_retry_wait_time=min_retry_wait_time,
             api_key_env_var="OPENAI_API_KEY",
             client_class=OpenAI,
-            pricing_func=tracking.get_pricing_openai,
+            pricing_func=partial(tracking.get_pricing_litellm, model_name=model_name),
             log_probs=log_probs,
         )
 

From 1de1e519f2adb307d5affb4f51e000db0cc72914 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 19 Aug 2025 20:27:52 -0400
Subject: [PATCH 05/23] Add first cut of human trace recorder agent

---
 .../agents/human_trace_recorder/agent.py      | 215 ++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 src/agentlab/agents/human_trace_recorder/agent.py

diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py
new file mode 100644
index 00000000..52496b7e
--- /dev/null
+++ b/src/agentlab/agents/human_trace_recorder/agent.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import logging
+import textwrap
+from dataclasses import dataclass
+
+import bgym
+from playwright.sync_api import Page
+
+from agentlab.agents.agent_args import AgentArgs
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Simplified variant: capture human step (trace + screenshot + html) only
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SimpleHumanTraceCaptureAgentArgs(AgentArgs):
+    """Args for SimpleHumanTraceCaptureAgent.
+
+    This version ONLY captures what the human does in the paused browser per step.
+    It does NOT attempt to map or translate actions. Always returns noop().
+    Set use_raw_page_output=True in loop/env so that obs contains a Playwright Page.
+    """
+
+    agent_name: str = "SimpleHumanTraceCapture"
+    trace_dir: str = "human_traces"
+    screenshots: bool = True
+    snapshots: bool = True  # playwright tracing snapshots (DOM/Sources)
+    sources: bool = False  # include source files (bigger trace)
+    # Ensure the raw Playwright Page object is present in observations so we can pause.
+    use_raw_page_output: bool = True
+
+    def make_agent(self) -> bgym.Agent:
+        return SimpleHumanTraceCaptureAgent(
+            trace_dir=self.trace_dir,
+            screenshots=self.screenshots,
+            snapshots=self.snapshots,
+            sources=self.sources,
+        )
+
+    def set_reproducibility_mode(self):
+        pass
+
+
+class SimpleHumanTraceCaptureAgent(bgym.Agent):
+    """Minimal human-in-the-loop recorder.
+
+    On each get_action:
+      1. Start a Playwright tracing capture (if not already running for this step).
+      2. Call page.pause() to open Inspector; user performs EXACTLY one logical action.
+      3. Stop tracing, save trace zip, screenshot (after action), and HTML snapshot.
+      4. Return noop() so the environment advances.
+
+    Artifacts are stored under trace_dir/step_<n>/
+    """
+
+    def __init__(self, trace_dir: str, screenshots: bool, snapshots: bool, sources: bool):
+        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
+        self._step_idx = 0
+        from pathlib import Path
+
+        self._root = Path(trace_dir)
+        self._root.mkdir(parents=True, exist_ok=True)
+        # Store trace config booleans; Playwright tracing.start expects them as named params.
+        self._trace_conf = dict(screenshots=screenshots, snapshots=snapshots, sources=sources)
+        self._tracing_started = False  # track if global tracing has been started
+        self._page: Page | None = None  # optional persistent page ref (when not in obs)
+
+    def set_page(self, page: Page):
+        """Manually inject a Playwright Page so the agent can function without it in obs.
+
+        Call this once after you create / reset the environment if you prefer not to
+        expose the page through observations (e.g., for safety or serialization reasons).
+        """
+        self._page = page
+
+    def obs_preprocessor(self, obs):  # keep original obs so page is available
+        return obs
+
+    def get_action(self, obs: dict):  # type: ignore[override]
+        import json
+        import time
+
+        # Resolve page priority: observation > stored page
+        page: Page | None = obs.get("page") or self._page
+        if page is None:
+            raise RuntimeError(
+                "No Playwright Page available. Provide use_raw_page_output=True OR call set_page(page)."
+            )
+        # Cache page if first time we see it via obs so later steps can omit it
+        if self._page is None:
+            self._page = page
+
+        step_dir = self._root / f"step_{self._step_idx:04d}"
+        step_dir.mkdir(parents=True, exist_ok=True)
+        trace_path = step_dir / "trace.zip"
+        screenshot_path = step_dir / "after.png"
+        html_path = step_dir / "after.html"
+
+        # Lazy start of tracing (once per context) then per-step chunk
+        if not self._tracing_started:
+            try:
+                page.context.tracing.start(
+                    screenshots=self._trace_conf["screenshots"],
+                    snapshots=self._trace_conf["snapshots"],
+                    sources=self._trace_conf["sources"],
+                )
+                self._tracing_started = True
+            except Exception as e:  # pragma: no cover
+                print(f"[SimpleHumanTraceCapture][WARN] initial tracing.start failed: {e}")
+
+        try:
+            page.context.tracing.start_chunk()
+        except Exception as e:  # pragma: no cover
+            print(f"[SimpleHumanTraceCapture][WARN] tracing.start_chunk failed: {e}")
+
+        print("\n[SimpleHumanTraceCapture] Perform ONE action then resume Inspector.")
+        print("[SimpleHumanTraceCapture] A trace will be saved to:", trace_path)
+        try:
+            page.pause()
+        except Exception as e:  # pragma: no cover
+            print(f"[SimpleHumanTraceCapture][WARN] page.pause failed: {e}")
+
+        # Stop current chunk & save
+        try:
+            page.context.tracing.stop_chunk(path=str(trace_path))
+        except Exception as e:  # pragma: no cover
+            print(f"[SimpleHumanTraceCapture][WARN] tracing.stop_chunk failed: {e}")
+
+        # Post-action artifacts
+        try:
+            page.screenshot(path=str(screenshot_path))
+        except Exception as e:  # pragma: no cover
+            print(f"[SimpleHumanTraceCapture][WARN] screenshot failed: {e}")
+        try:
+            html = page.content()
+            html_path.write_text(html)
+        except Exception as e:  # pragma: no cover
+            print(f"[SimpleHumanTraceCapture][WARN] html capture failed: {e}")
+
+        meta = {
+            "url": page.url,
+            "timestamp": time.time(),
+            "step": self._step_idx,
+            "trace_path": str(trace_path),
+            "screenshot_path": str(screenshot_path),
+            "html_path": str(html_path),
+        }
+        (step_dir / "meta.json").write_text(json.dumps(meta, indent=2))
+
+        # --- Derive a lightweight human-readable script summary from the trace ---
+        script_summary_lines: list[str] = []
+        try:
+            import json as _json
+            import zipfile
+
+            with zipfile.ZipFile(trace_path, "r") as zf:
+                # Playwright trace usually contains one or more *.trace files (jsonl)
+                trace_files = [n for n in zf.namelist() if n.endswith(".trace")]
+                for tf in trace_files:
+                    with zf.open(tf, "r") as fh:
+                        for raw_line in fh:
+                            try:
+                                evt = _json.loads(raw_line.decode("utf-8"))
+                            except Exception:
+                                continue
+                            if evt.get("type") != "action":
+                                continue
+                            a = evt.get("action", {})
+                            api_name = a.get("apiName") or a.get("name") or "action"
+                            selector = a.get("selector") or a.get("locator") or ""
+                            value = a.get("value") or a.get("text") or ""
+                            line = f"{api_name}"
+                            if selector:
+                                line += f" selector={selector!r}"
+                            if value and isinstance(value, str) and len(value) < 200:
+                                line += f" value={value!r}"
+                            script_summary_lines.append(line)
+            if not script_summary_lines:
+                script_summary_lines.append("(no action events parsed from trace chunk)")
+        except Exception as e:  # pragma: no cover
+            script_summary_lines.append(f"(failed to parse trace for script summary: {e})")
+
+        # Prepare chat messages (simple list of strings for easy viewing)
+        chat_messages = [
+            "PLAYWRIGHT TRACE STEP SUMMARY:",
+            f"Step {self._step_idx} URL: {page.url}",
+            "Actions:",
+            *script_summary_lines,
+            f"Trace file: {trace_path}",
+            "Open with: npx playwright show-trace " + str(trace_path),
+        ]
+
+        self._step_idx += 1
+
+        agent_info = bgym.AgentInfo(
+            think="human-recorded",
+            chat_messages=chat_messages,
+            stats={"step": self._step_idx},
+            markdown_page=textwrap.dedent(
+                f"""### Simple Human Trace Capture\nSaved artifacts for step {meta['step']}:\n- URL: {meta['url']}\n- Trace: {meta['trace_path']}\n- Screenshot: {meta['screenshot_path']}\n- HTML: {meta['html_path']}\n"""
+            ),
+            extra_info=meta,
+        )
+        return "noop()", agent_info
+
+
+SIMPLE_TRACE_CAPTURE_AGENT = SimpleHumanTraceCaptureAgentArgs()
+
+##1. Simple debug agent
+# 2. Instead of using the page object Launch codegen directly in a subprocess using the playwright codegen --url or somethiing

From 2b4633a95c0e18724565d2a5ffa489f4c7ad220c Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 19 Aug 2025 21:30:00 -0400
Subject: [PATCH 06/23] Rewrite human trace recorder as minimal HumanTraceAgent

---
 .../agents/human_trace_recorder/agent.py      | 321 ++++++++----------
 1 file changed, 149 insertions(+), 172 deletions(-)

diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py
index 52496b7e..fd5aa554 100644
--- a/src/agentlab/agents/human_trace_recorder/agent.py
+++ b/src/agentlab/agents/human_trace_recorder/agent.py
@@ -1,215 +1,192 @@
+"""Minimal Human Trace Agent (<200 lines)
+
+Per step we capture ONLY:
+  - axtree_txt, pruned_html, actions.json, after.html
+  - Auto-resume after detecting user action
+  - Visible recording indicator
+"""
+
 from __future__ import annotations
 
-import logging
-import textwrap
+import json
+import time
+import zipfile
 from dataclasses import dataclass
+from pathlib import Path
 
 import bgym
 from playwright.sync_api import Page
 
 from agentlab.agents.agent_args import AgentArgs
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Simplified variant: capture human step (trace + screenshot + html) only
-# ---------------------------------------------------------------------------
+from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
 
 
 @dataclass
-class SimpleHumanTraceCaptureAgentArgs(AgentArgs):
-    """Args for SimpleHumanTraceCaptureAgent.
-
-    This version ONLY captures what the human does in the paused browser per step.
-    It does NOT attempt to map or translate actions. Always returns noop().
-    Set use_raw_page_output=True in loop/env so that obs contains a Playwright Page.
-    """
-
-    agent_name: str = "SimpleHumanTraceCapture"
+class HumanTraceAgentArgs(AgentArgs):
+    agent_name: str = "HumanTraceAgent"
     trace_dir: str = "human_traces"
-    screenshots: bool = True
-    snapshots: bool = True  # playwright tracing snapshots (DOM/Sources)
-    sources: bool = False  # include source files (bigger trace)
-    # Ensure the raw Playwright Page object is present in observations so we can pause.
     use_raw_page_output: bool = True
 
-    def make_agent(self) -> bgym.Agent:
-        return SimpleHumanTraceCaptureAgent(
-            trace_dir=self.trace_dir,
-            screenshots=self.screenshots,
-            snapshots=self.snapshots,
-            sources=self.sources,
-        )
+    def make_agent(self) -> bgym.Agent:  # type: ignore[override]
+        return HumanTraceAgent(self.trace_dir)
 
     def set_reproducibility_mode(self):
         pass
 
 
-class SimpleHumanTraceCaptureAgent(bgym.Agent):
-    """Minimal human-in-the-loop recorder.
-
-    On each get_action:
-      1. Start a Playwright tracing capture (if not already running for this step).
-      2. Call page.pause() to open Inspector; user performs EXACTLY one logical action.
-      3. Stop tracing, save trace zip, screenshot (after action), and HTML snapshot.
-      4. Return noop() so the environment advances.
-
-    Artifacts are stored under trace_dir/step_<n>/
-    """
-
-    def __init__(self, trace_dir: str, screenshots: bool, snapshots: bool, sources: bool):
+class HumanTraceAgent(bgym.Agent):
+    def __init__(self, trace_dir: str):
         self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
-        self._step_idx = 0
-        from pathlib import Path
-
         self._root = Path(trace_dir)
         self._root.mkdir(parents=True, exist_ok=True)
-        # Store trace config booleans; Playwright tracing.start expects them as named params.
-        self._trace_conf = dict(screenshots=screenshots, snapshots=snapshots, sources=sources)
-        self._tracing_started = False  # track if global tracing has been started
-        self._page: Page | None = None  # optional persistent page ref (when not in obs)
-
-    def set_page(self, page: Page):
-        """Manually inject a Playwright Page so the agent can function without it in obs.
-
-        Call this once after you create / reset the environment if you prefer not to
-        expose the page through observations (e.g., for safety or serialization reasons).
-        """
-        self._page = page
-
-    def obs_preprocessor(self, obs):  # keep original obs so page is available
+        self._page: Page | None = None
+        self._step = 0
+        self._task_name = None
+        self._seed = None
+
+    def obs_preprocessor(self, obs: dict):  # type: ignore[override]
+        if isinstance(obs, dict):
+            if self._page is None and "page" in obs:
+                self._page = obs["page"]
+
+            # Extract task name and seed from obs if available
+            if self._task_name is None:
+                self._task_name = obs.get("task_name") or obs.get("task", {}).get(
+                    "task_name", "unknown_task"
+                )
+            if self._seed is None:
+                self._seed = obs.get("seed") or obs.get("task", {}).get("seed", "unknown_seed")
+
+            dom = obs.get("dom_object")
+            axt = obs.get("axtree_object")
+            if axt is not None:
+                try:
+                    obs["axtree_txt"] = flatten_axtree_to_str(axt)
+                except Exception:
+                    pass
+            if dom is not None:
+                try:
+                    obs["pruned_html"] = prune_html(flatten_dom_to_str(dom))
+                except Exception:
+                    pass
+            for k in ("dom_object", "axtree_object", "page"):
+                obs.pop(k, None)
         return obs
 
     def get_action(self, obs: dict):  # type: ignore[override]
-        import json
-        import time
-
-        # Resolve page priority: observation > stored page
-        page: Page | None = obs.get("page") or self._page
-        if page is None:
-            raise RuntimeError(
-                "No Playwright Page available. Provide use_raw_page_output=True OR call set_page(page)."
-            )
-        # Cache page if first time we see it via obs so later steps can omit it
         if self._page is None:
-            self._page = page
+            raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")
 
-        step_dir = self._root / f"step_{self._step_idx:04d}"
+        page = self._page
+
+        # Create directory structure: trace_dir/task_name/seed/step_XXXX
+        task_dir = self._root / str(self._task_name or "unknown_task")
+        seed_dir = task_dir / str(self._seed or "unknown_seed")
+        step_dir = seed_dir / f"step_{self._step:04d}"
         step_dir.mkdir(parents=True, exist_ok=True)
-        trace_path = step_dir / "trace.zip"
-        screenshot_path = step_dir / "after.png"
-        html_path = step_dir / "after.html"
 
-        # Lazy start of tracing (once per context) then per-step chunk
-        if not self._tracing_started:
-            try:
-                page.context.tracing.start(
-                    screenshots=self._trace_conf["screenshots"],
-                    snapshots=self._trace_conf["snapshots"],
-                    sources=self._trace_conf["sources"],
-                )
-                self._tracing_started = True
-            except Exception as e:  # pragma: no cover
-                print(f"[SimpleHumanTraceCapture][WARN] initial tracing.start failed: {e}")
+        trace_path = step_dir / "temp_trace.zip"
+        actions_path = step_dir / "actions.json"
+
+        print(
+            f"[HumanTrace] Task: {self._task_name}, Seed: {self._seed}, Step {self._step}: Perform ONE action"
+        )
 
+        # Small recording indicator
+        page.evaluate(
+            """
+            const div = document.createElement('div');
+            div.id = '__rec';
+            div.innerHTML = '🔴 REC';
+            div.style.cssText = 'position:fixed;top:5px;right:5px;background:#f44;color:#fff;padding:5px 8px;border-radius:4px;font:bold 12px monospace;z-index:99999';
+            document.body.appendChild(div);
+        """
+        )
+
+        # Start tracing
         try:
+            page.context.tracing.start(screenshots=True, snapshots=True)
             page.context.tracing.start_chunk()
-        except Exception as e:  # pragma: no cover
-            print(f"[SimpleHumanTraceCapture][WARN] tracing.start_chunk failed: {e}")
+        except Exception:
+            pass
 
-        print("\n[SimpleHumanTraceCapture] Perform ONE action then resume Inspector.")
-        print("[SimpleHumanTraceCapture] A trace will be saved to:", trace_path)
-        try:
-            page.pause()
-        except Exception as e:  # pragma: no cover
-            print(f"[SimpleHumanTraceCapture][WARN] page.pause failed: {e}")
+        # Wait for action
+        self._wait_for_action(page)
 
-        # Stop current chunk & save
+        # Stop tracing and save
         try:
             page.context.tracing.stop_chunk(path=str(trace_path))
-        except Exception as e:  # pragma: no cover
-            print(f"[SimpleHumanTraceCapture][WARN] tracing.stop_chunk failed: {e}")
+            actions = self._extract_trace(str(trace_path))
+            actions_path.write_text(json.dumps(actions, indent=2))
+            trace_path.unlink(missing_ok=True)
+        except Exception:
+            pass
 
-        # Post-action artifacts
+        # Remove indicator
+        page.evaluate("document.getElementById('__rec')?.remove()")
+
+        # Save screenshot
         try:
-            page.screenshot(path=str(screenshot_path))
-        except Exception as e:  # pragma: no cover
-            print(f"[SimpleHumanTraceCapture][WARN] screenshot failed: {e}")
+            page.screenshot(path=str(step_dir / "screenshot.png"))
+        except Exception:
+            pass
+
+        # Save HTML
         try:
-            html = page.content()
-            html_path.write_text(html)
-        except Exception as e:  # pragma: no cover
-            print(f"[SimpleHumanTraceCapture][WARN] html capture failed: {e}")
-
-        meta = {
-            "url": page.url,
-            "timestamp": time.time(),
-            "step": self._step_idx,
-            "trace_path": str(trace_path),
-            "screenshot_path": str(screenshot_path),
-            "html_path": str(html_path),
+            (step_dir / "after.html").write_text(page.content())
+        except Exception:
+            pass
+
+        self._step += 1
+        return "noop()", {
+            "extra_info": {
+                "step": self._step - 1,
+                "task_name": self._task_name,
+                "seed": self._seed,
+                "trace_dir": str(step_dir),
+            }
         }
-        (step_dir / "meta.json").write_text(json.dumps(meta, indent=2))
 
-        # --- Derive a lightweight human-readable script summary from the trace ---
-        script_summary_lines: list[str] = []
-        try:
-            import json as _json
-            import zipfile
-
-            with zipfile.ZipFile(trace_path, "r") as zf:
-                # Playwright trace usually contains one or more *.trace files (jsonl)
-                trace_files = [n for n in zf.namelist() if n.endswith(".trace")]
-                for tf in trace_files:
-                    with zf.open(tf, "r") as fh:
-                        for raw_line in fh:
-                            try:
-                                evt = _json.loads(raw_line.decode("utf-8"))
-                            except Exception:
-                                continue
-                            if evt.get("type") != "action":
-                                continue
-                            a = evt.get("action", {})
-                            api_name = a.get("apiName") or a.get("name") or "action"
-                            selector = a.get("selector") or a.get("locator") or ""
-                            value = a.get("value") or a.get("text") or ""
-                            line = f"{api_name}"
-                            if selector:
-                                line += f" selector={selector!r}"
-                            if value and isinstance(value, str) and len(value) < 200:
-                                line += f" value={value!r}"
-                            script_summary_lines.append(line)
-            if not script_summary_lines:
-                script_summary_lines.append("(no action events parsed from trace chunk)")
-        except Exception as e:  # pragma: no cover
-            script_summary_lines.append(f"(failed to parse trace for script summary: {e})")
-
-        # Prepare chat messages (simple list of strings for easy viewing)
-        chat_messages = [
-            "PLAYWRIGHT TRACE STEP SUMMARY:",
-            f"Step {self._step_idx} URL: {page.url}",
-            "Actions:",
-            *script_summary_lines,
-            f"Trace file: {trace_path}",
-            "Open with: npx playwright show-trace " + str(trace_path),
-        ]
-
-        self._step_idx += 1
-
-        agent_info = bgym.AgentInfo(
-            think="human-recorded",
-            chat_messages=chat_messages,
-            stats={"step": self._step_idx},
-            markdown_page=textwrap.dedent(
-                f"""### Simple Human Trace Capture\nSaved artifacts for step {meta['step']}:\n- URL: {meta['url']}\n- Trace: {meta['trace_path']}\n- Screenshot: {meta['screenshot_path']}\n- HTML: {meta['html_path']}\n"""
-            ),
-            extra_info=meta,
+    def _wait_for_action(self, page):
+        """Wait for user action with auto-resume."""
+        page.evaluate(
+            """
+            window.__acted = false;
+            ['click','keydown','input','change'].forEach(e => 
+                document.addEventListener(e, () => window.__acted = true, true)
+            );
+        """
         )
-        return "noop()", agent_info
 
-
-SIMPLE_TRACE_CAPTURE_AGENT = SimpleHumanTraceCaptureAgentArgs()
-
-##1. Simple debug agent
-# 2. Instead of using the page object Launch codegen directly in a subprocess using the playwright codegen --url or somethiing
+        start = time.time()
+        while time.time() - start < 300:  # 5 min max
+            try:
+                if page.evaluate("window.__acted"):
+                    page.evaluate("document.getElementById('__rec').innerHTML = '💾 SAVING'")
+                    time.sleep(0.3)
+                    return
+            except Exception:
+                pass
+            time.sleep(0.1)
+
+    def _extract_trace(self, trace_file: str):
+        """Extract ALL events from trace zip."""
+        all_events = []
+        try:
+            with zipfile.ZipFile(trace_file, "r") as zf:
+                for name in zf.namelist():
+                    if name.endswith(".trace"):
+                        with zf.open(name) as f:
+                            for line in f:
+                                try:
+                                    event = json.loads(line.decode())
+                                    # Save everything - don't filter
+                                    all_events.append(event)
+                                except Exception:
+                                    continue
+        except Exception:
+            pass
+        return all_events
+
+
+HUMAN_TRACE_AGENT = HumanTraceAgentArgs()

From 380c69f4708f6c172b9408bd1b55cbaa0edf5556 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 20 Aug 2025 16:27:39 -0400
Subject: [PATCH 07/23] add event listeners and launcher

---
 .../agents/human_trace_recorder/agent.py      | 368 ++++++++----
 .../human_trace_recorder/event_listeners.py   | 563 ++++++++++++++++++
 2 files changed, 802 insertions(+), 129 deletions(-)
 create mode 100644 src/agentlab/agents/human_trace_recorder/event_listeners.py

diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py
index fd5aa554..556922af 100644
--- a/src/agentlab/agents/human_trace_recorder/agent.py
+++ b/src/agentlab/agents/human_trace_recorder/agent.py
@@ -1,16 +1,14 @@
-"""Minimal Human Trace Agent (<200 lines)
+"""Human Trace Agent for Browser Automation Training Data
 
-Per step we capture ONLY:
-  - axtree_txt, pruned_html, actions.json, after.html
-  - Auto-resume after detecting user action
-  - Visible recording indicator
+Captures human interactions at each step including:
+  - Comprehensive action tracking (clicks, input, navigation, etc.)
+  - Saves only human_action.json files in simple numbered folders
 """
 
 from __future__ import annotations
 
 import json
 import time
-import zipfile
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -18,6 +16,17 @@
 from playwright.sync_api import Page
 
 from agentlab.agents.agent_args import AgentArgs
+from agentlab.agents.human_trace_recorder.event_listeners import (
+    get_interaction_tracking_script,
+    get_recording_indicators_script,
+)
+from browsergym.core.observation import (
+    extract_dom_extra_properties,
+    extract_dom_snapshot,
+    extract_focused_element_bid,
+    extract_merged_axtree,
+    extract_screenshot,
+)
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
 
 
@@ -41,36 +50,33 @@ def __init__(self, trace_dir: str):
         self._root.mkdir(parents=True, exist_ok=True)
         self._page: Page | None = None
         self._step = 0
-        self._task_name = None
-        self._seed = None
 
     def obs_preprocessor(self, obs: dict):  # type: ignore[override]
         if isinstance(obs, dict):
-            if self._page is None and "page" in obs:
-                self._page = obs["page"]
-
-            # Extract task name and seed from obs if available
-            if self._task_name is None:
-                self._task_name = obs.get("task_name") or obs.get("task", {}).get(
-                    "task_name", "unknown_task"
-                )
-            if self._seed is None:
-                self._seed = obs.get("seed") or obs.get("task", {}).get("seed", "unknown_seed")
-
-            dom = obs.get("dom_object")
-            axt = obs.get("axtree_object")
-            if axt is not None:
-                try:
+            self._page = obs.get("page")
+            # Remove the page object from obs to avoid pickle issues
+            if "page" in obs:
+                del obs["page"]
+
+            obs["screenshot"] = extract_screenshot(self._page)
+            obs["dom_object"] = extract_dom_snapshot(self._page)
+            obs["axtree_object"] = extract_merged_axtree(self._page)
+            scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0)
+            extra_properties = extract_dom_extra_properties(
+                obs["dom_object"], scale_factor=scale_factor
+            )
+            obs["extra_element_properties"] = extra_properties
+            obs["focused_element_bid"] = extract_focused_element_bid(self._page)
+
+            # Add text representations for easier analysis
+            if obs["axtree_object"]:
+                axt = obs["axtree_object"]
+                if extra_properties:
                     obs["axtree_txt"] = flatten_axtree_to_str(axt)
-                except Exception:
-                    pass
-            if dom is not None:
-                try:
-                    obs["pruned_html"] = prune_html(flatten_dom_to_str(dom))
-                except Exception:
-                    pass
-            for k in ("dom_object", "axtree_object", "page"):
-                obs.pop(k, None)
+
+            if obs["dom_object"]:
+                obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"])
+                obs["pruned_html"] = prune_html(obs["dom_txt"])
         return obs
 
     def get_action(self, obs: dict):  # type: ignore[override]
@@ -78,115 +84,219 @@ def get_action(self, obs: dict):  # type: ignore[override]
             raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")
 
         page = self._page
-
-        # Create directory structure: trace_dir/task_name/seed/step_XXXX
-        task_dir = self._root / str(self._task_name or "unknown_task")
-        seed_dir = task_dir / str(self._seed or "unknown_seed")
-        step_dir = seed_dir / f"step_{self._step:04d}"
-        step_dir.mkdir(parents=True, exist_ok=True)
-
-        trace_path = step_dir / "temp_trace.zip"
-        actions_path = step_dir / "actions.json"
-
-        print(
-            f"[HumanTrace] Task: {self._task_name}, Seed: {self._seed}, Step {self._step}: Perform ONE action"
-        )
-
-        # Small recording indicator
-        page.evaluate(
-            """
-            const div = document.createElement('div');
-            div.id = '__rec';
-            div.innerHTML = '🔴 REC';
-            div.style.cssText = 'position:fixed;top:5px;right:5px;background:#f44;color:#fff;padding:5px 8px;border-radius:4px;font:bold 12px monospace;z-index:99999';
-            document.body.appendChild(div);
-        """
-        )
-
-        # Start tracing
-        try:
-            page.context.tracing.start(screenshots=True, snapshots=True)
-            page.context.tracing.start_chunk()
-        except Exception:
-            pass
-
-        # Wait for action
-        self._wait_for_action(page)
-
-        # Stop tracing and save
-        try:
-            page.context.tracing.stop_chunk(path=str(trace_path))
-            actions = self._extract_trace(str(trace_path))
-            actions_path.write_text(json.dumps(actions, indent=2))
-            trace_path.unlink(missing_ok=True)
-        except Exception:
-            pass
-
-        # Remove indicator
-        page.evaluate("document.getElementById('__rec')?.remove()")
-
-        # Save screenshot
-        try:
-            page.screenshot(path=str(step_dir / "screenshot.png"))
-        except Exception:
-            pass
-
-        # Save HTML
-        try:
-            (step_dir / "after.html").write_text(page.content())
-        except Exception:
-            pass
-
+        step_dir = self._create_step_directory()
+        
+        self._display_recording_prompt()
+        self._show_recording_indicators(page)
+        
+        # Capture human interactions
+        captured_action, human_interactions = self._capture_interactions_with_js(page, step_dir)
+        
+        # Save and cleanup
+        self._save_human_action(captured_action, step_dir)
+        self._cleanup_indicators(page)
+        
         self._step += 1
         return "noop()", {
             "extra_info": {
                 "step": self._step - 1,
-                "task_name": self._task_name,
-                "seed": self._seed,
-                "trace_dir": str(step_dir),
+                "human_interactions": human_interactions,
             }
         }
 
-    def _wait_for_action(self, page):
-        """Wait for user action with auto-resume."""
-        page.evaluate(
-            """
-            window.__acted = false;
-            ['click','keydown','input','change'].forEach(e => 
-                document.addEventListener(e, () => window.__acted = true, true)
-            );
-        """
-        )
-
-        start = time.time()
-        while time.time() - start < 300:  # 5 min max
+    def _create_step_directory(self) -> Path:
+        """Create directory for current step."""
+        step_dir = self._root / str(self._step)
+        step_dir.mkdir(parents=True, exist_ok=True)
+        return step_dir
+
+    def _display_recording_prompt(self):
+        """Display prompt messages to user."""
+        print(f"[HumanTrace] Step {self._step}: Perform ONE action")
+        print("[HumanTrace] ⚠️  WAIT FOR THE RED BORDER TO APPEAR BEFORE PERFORMING ANY ACTION ⚠️")
+        print("[HumanTrace] The system will automatically save after detecting your action")
+
+    def _show_recording_indicators(self, page: Page):
+        """Show visual recording indicators on the page."""
+        page.evaluate(get_recording_indicators_script())
+
+    def _save_human_action(self, captured_action: dict, step_dir: Path):
+        """Save the captured human action to JSON file."""
+        try:
+            human_action_path = step_dir / "human_action.json"
+            if captured_action and isinstance(captured_action, dict):
+                human_action_path.write_text(json.dumps(captured_action, indent=2))
+                action_type = captured_action.get("type", "unknown")
+            else:
+                # Create empty action record for consistency
+                empty_action = {
+                    "type": "no_action",
+                    "timestamp": time.time() * 1000,
+                    "reason": "No meaningful human action captured in this step",
+                }
+                human_action_path.write_text(json.dumps(empty_action, indent=2))
+                action_type = "no_action"
+
+            print(f"[HumanTrace] Step {self._step} complete - Action: {action_type}")
+
+        except Exception as e:
+            print(f"[HumanTrace] Warning: Failed to save human action: {e}")
+
+    def _cleanup_indicators(self, page: Page):
+        """Remove recording indicators from the page."""
+        page.evaluate("document.getElementById('__rec')?.remove(); document.getElementById('__rec_border')?.remove()")
+
+    def _capture_interactions_with_js(self, page: Page, step_dir: Path) -> tuple[dict, str]:
+        """Capture human interactions using JavaScript injection."""
+        try:
+            print("[HumanTrace] JavaScript interaction tracking enabled")
+            initial_url, initial_title = page.url, page.title()
+
+            # Inject interaction tracking
+            self._inject_interaction_tracking(page)
+            
+            # Wait for user action
+            self._wait_for_user_action(page)
+            
+            # Collect and process interaction data
+            return self._collect_interaction_data(page, initial_url, initial_title)
+
+        except Exception as e:
+            print(f"[HumanTrace] Error: {e}")
+            return {
+                "type": "error",
+                "timestamp": time.time() * 1000,
+                "error": str(e),
+            }, f"Error: {e}"
+
+    def _inject_interaction_tracking(self, page: Page):
+        """Inject JavaScript code for comprehensive interaction tracking."""
+        tracking_script = get_interaction_tracking_script()
+        page.evaluate(tracking_script)
+
+    def _wait_for_user_action(self, page: Page):
+        """Wait for user to perform an action."""
+        start_time = time.time()
+        while time.time() - start_time < 300:
             try:
-                if page.evaluate("window.__acted"):
-                    page.evaluate("document.getElementById('__rec').innerHTML = '💾 SAVING'")
-                    time.sleep(0.3)
-                    return
-            except Exception:
+                action_detected = page.evaluate("window.__acted || false")
+                if action_detected:
+                    print(f"[HumanTrace] Action detected! Exiting immediately...")
+                    break
+            except Exception as e:
+                print(f"[HumanTrace] Debug: Error checking actions: {e}")
                 pass
             time.sleep(0.1)
 
-    def _extract_trace(self, trace_file: str):
-        """Extract ALL events from trace zip."""
-        all_events = []
+    def _collect_interaction_data(self, page: Page, initial_url: str, initial_title: str) -> tuple[dict, str]:
+        """Collect and format interaction data."""
         try:
-            with zipfile.ZipFile(trace_file, "r") as zf:
-                for name in zf.namelist():
-                    if name.endswith(".trace"):
-                        with zf.open(name) as f:
-                            for line in f:
-                                try:
-                                    event = json.loads(line.decode())
-                                    # Save everything - don't filter
-                                    all_events.append(event)
-                                except Exception:
-                                    continue
-        except Exception:
-            pass
-        return all_events
+            action_detected = page.evaluate("window.__acted || false")
+            interactions = page.evaluate("window.__interactions || []")
+            
+            action_data = {
+                "type": "human_interactions" if action_detected else "no_action",
+                "timestamp": time.time() * 1000,
+                "detected": action_detected,
+                "interactions": interactions,
+                "interaction_count": len(interactions)
+            }
+            
+            summary = self._create_interaction_summary(interactions)
+            self._add_page_change_info(action_data, initial_url, initial_title, page)
+            
+            print(f"[HumanTrace] {summary}")
+            return action_data, summary
+            
+        except Exception as e:
+            return {
+                "type": "error",
+                "timestamp": time.time() * 1000,
+                "detected": False,
+                "error": str(e),
+                "interactions": [],
+                "interaction_count": 0
+            }, f"Error collecting interactions: {e}"
+
+    def _create_interaction_summary(self, interactions: list) -> str:
+        """Create a summary string of captured interactions."""
+        if interactions:
+            interaction_types = {}
+            for interaction in interactions:
+                itype = interaction.get('type', 'unknown')
+                interaction_types[itype] = interaction_types.get(itype, 0) + 1
+            
+            summary_parts = []
+            for itype, count in interaction_types.items():
+                summary_parts.append(f"{itype}:{count}")
+            return f"Captured {len(interactions)} interactions: {', '.join(summary_parts)}"
+        else:
+            return "No interactions detected"
+
+    def _add_page_change_info(self, action_data: dict, initial_url: str, initial_title: str, page: Page):
+        """Add page change information to action data."""
+        final_url, final_title = page.url, page.title()
+        if initial_url != final_url or initial_title != final_title:
+            action_data["page_changed"] = True
+            action_data["url_change"] = {"from": initial_url, "to": final_url}
+            action_data["title_change"] = {"from": initial_title, "to": final_title}
+
+    def _format_js_interaction_summary(self, action_data, interaction_log):
+        """Format JavaScript-captured interactions into readable summary."""
+        lines = ["Human Interactions (JavaScript Tracking):"]
+
+        if action_data["interactions"]:
+            lines.append(f"Total Actions: {len(action_data['interactions'])}")
+            lines.append("")
+
+            # Group interactions by type
+            by_type = {}
+            for interaction in action_data["interactions"]:
+                interaction_type = interaction["type"]
+                if interaction_type not in by_type:
+                    by_type[interaction_type] = []
+                by_type[interaction_type].append(interaction)
+
+            # Show summary by type
+            for interaction_type, interactions in by_type.items():
+                lines.append(f"{interaction_type.title()}: {len(interactions)} actions")
+
+            lines.append("")
+            lines.append("Detailed Actions:")
+
+            # Add each interaction from the log
+            for log_entry in interaction_log:
+                lines.append(f"  {log_entry}")
+        else:
+            lines.append("No interactions detected - user may have just observed the page")
+
+        # Add page state changes if URL changed
+        if action_data.get("page_changed"):
+            url_info = action_data.get("url")
+            if url_info:
+                lines.append("")
+                lines.append("� Page Navigation:")
+                lines.append(f"  From: {url_info['from']}")
+                lines.append(f"  To: {url_info['to']}")
+
+        return "\n".join(lines)
 
 
 HUMAN_TRACE_AGENT = HumanTraceAgentArgs()
+
+
+if __name__ == "__main__":
+    from agentlab.agents.human_trace_recorder.agent import HUMAN_TRACE_AGENT
+    from agentlab.experiments.study import Study
+
+    agent_configs = [HUMAN_TRACE_AGENT]
+    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"](n_repeats=1)  # type: bgym.Benchmark
+    benchmark = benchmark.subset_from_glob("task_name", "*filter*")
+    benchmark.env_args_list = benchmark.env_args_list[:1]
+    for env_args in benchmark.env_args_list:
+        print(env_args.task_name)
+        env_args.max_steps = 15
+        env_args.headless = False
+
+    study = Study(agent_configs, benchmark)
+    study.run(n_jobs=1, parallel_backend="sequential")
diff --git a/src/agentlab/agents/human_trace_recorder/event_listeners.py b/src/agentlab/agents/human_trace_recorder/event_listeners.py
new file mode 100644
index 00000000..2fd8453c
--- /dev/null
+++ b/src/agentlab/agents/human_trace_recorder/event_listeners.py
@@ -0,0 +1,563 @@
+"""JavaScript Event Listeners for Human Trace Capture
+
+This module contains all the JavaScript code for capturing comprehensive
+browser interactions including mouse, keyboard, form, scroll, and focus events.
+"""
+
+
+def get_interaction_tracking_script() -> str:
+    """Return the full interaction-tracking JS: captureInteraction() plus all event listeners."""
+    return (
+        """
+        window.__acted = false;
+        window.__interactions = [];
+        
+        // Debug mode - set to true to see all events in console
+        window.__debug_events = false; 
+        
+        function captureInteraction(type, event, extra = {}) {
+            // Skip our own recording indicators (optional calls: scroll can target `document`)
+            if (event.target.id === '__rec' || event.target.id === '__rec_border' || 
+                event.target.closest?.('#__rec') || event.target.closest?.('#__rec_border')) {
+                return;
+            }
+            
+            const interaction = {
+                type: type,
+                timestamp: Date.now(),
+                coords: {
+                    x: event.clientX || 0,
+                    y: event.clientY || 0
+                },
+                target: {
+                    tagName: event.target.tagName,
+                    id: event.target.id || null,
+                    className: event.target.className || null,
+                    text: event.target.textContent?.slice(0, 50) || null,
+                    bid: event.target.getAttribute?.('bid') || null
+                },
+                ...extra
+            };
+            
+            window.__interactions.push(interaction);
+            window.__acted = true;
+            
+            // Debug logging
+            if (window.__debug_events) {
+                console.log(`🎯 Captured: ${type}`, interaction);
+            }
+            
+            // Update indicators immediately
+            const indicator = document.getElementById('__rec');
+            const border = document.getElementById('__rec_border');
+            if (indicator) {
+                indicator.innerHTML = '✅ ACTION DETECTED - SAVING...';
+                indicator.style.background = '#28a745';
+                indicator.style.animation = 'none';
+            }
+            if (border) {
+                border.style.border = '8px solid #28a745';
+                border.style.animation = 'none';
+            }
+        }
+        
+        // Debug function - add this temporarily to see what events fire
+        if (window.__debug_events) {
+            ['input', 'change', 'select', 'focus', 'click', 'keydown', 'paste', 'cut', 'copy'].forEach(eventType => {
+                document.addEventListener(eventType, (e) => {
+                    console.log(`🔍 DEBUG: ${eventType} on`, e.target.tagName, e.target.type, e.target);
+                }, true);
+            });
+        }
+        
+        """
+        + get_mouse_event_listeners()
+        + """
+        """
+        + get_keyboard_event_listeners()
+        + """
+        """
+        + get_form_event_listeners()
+        + """
+        """
+        + get_scroll_event_listeners()
+        + """
+        """
+        + get_focus_event_listeners()
+        + """
+        
+        console.log('Comprehensive interaction tracking initialized');
+    """
+    )
+
+
+def get_mouse_event_listeners() -> str:
+    """Return JS that registers capture-phase mouse, drag and drag-and-drop listeners."""
+    return """
+        // Mouse events with comprehensive button tracking and performance optimizations
+        let lastClickTime = 0;
+        
+        document.addEventListener('click', (e) => {
+            const now = Date.now();
+            // Prevent spam clicking from creating too many events (minimum 50ms between clicks)
+            if (now - lastClickTime < 50) return;
+            lastClickTime = now;
+            
+            captureInteraction('click', e, {
+                button: e.button, // 0=left, 1=middle, 2=right
+                buttons: e.buttons, // bitmask of pressed buttons
+                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown',
+                detail: e.detail, // click count (single, double, etc.)
+                clickType: e.detail === 1 ? 'single' : e.detail === 2 ? 'double' : `${e.detail}x`
+            });
+        }, true);
+        
+        document.addEventListener('dblclick', (e) => {
+            captureInteraction('dblclick', e, {
+                button: e.button,
+                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
+            });
+        }, true);
+        
+        document.addEventListener('mousedown', (e) => {
+            captureInteraction('mousedown', e, {
+                button: e.button,
+                buttons: e.buttons,
+                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
+            });
+        }, true);
+        
+        document.addEventListener('mouseup', (e) => {
+            captureInteraction('mouseup', e, {
+                button: e.button,
+                buttons: e.buttons,
+                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
+            });
+        }, true);
+        
+        // Context menu (right-click menu)
+        document.addEventListener('contextmenu', (e) => {
+            captureInteraction('contextmenu', e, {
+                button: e.button,
+                buttonName: 'right'
+            });
+        }, true);
+        
+        // Middle mouse button events (often used for scrolling/opening in new tab)
+        document.addEventListener('auxclick', (e) => {
+            captureInteraction('auxclick', e, {
+                button: e.button,
+                buttonName: e.button === 1 ? 'middle' : (e.button === 2 ? 'right' : 'other'),
+                detail: e.detail
+            });
+        }, true);
+        
+        // Enhanced drag tracking (without redundant mousedown)
+        let isDragging = false;
+        let dragStart = null;
+        let dragButton = null;
+        let hasDraggedSignificantly = false;
+        
+        document.addEventListener('mousedown', (e) => {
+            isDragging = true;
+            dragButton = e.button;
+            hasDraggedSignificantly = false;
+            dragStart = {
+                x: e.clientX, 
+                y: e.clientY, 
+                time: Date.now(),
+                button: e.button,
+                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
+            };
+        }, true);
+        
+        document.addEventListener('mousemove', (e) => {
+            if (isDragging && dragStart) {
+                const distance = Math.sqrt(
+                    Math.pow(e.clientX - dragStart.x, 2) + 
+                    Math.pow(e.clientY - dragStart.y, 2)
+                );
+                if (distance > 5 && !hasDraggedSignificantly) { 
+                    // Only capture the start of a significant drag, not every movement
+                    hasDraggedSignificantly = true;
+                    captureInteraction('drag_start', e, {
+                        startX: dragStart.x,
+                        startY: dragStart.y,
+                        endX: e.clientX,
+                        endY: e.clientY,
+                        distance: distance,
+                        button: dragButton,
+                        buttonName: dragStart.buttonName,
+                        duration: Date.now() - dragStart.time
+                    });
+                }
+            }
+            // Note: Removed general mousemove tracking to reduce noise
+        }, true);
+        
+        document.addEventListener('mouseup', (e) => {
+            if (isDragging && dragStart && hasDraggedSignificantly) {
+                const distance = Math.sqrt(
+                    Math.pow(e.clientX - dragStart.x, 2) + 
+                    Math.pow(e.clientY - dragStart.y, 2)
+                );
+                captureInteraction('drag_end', e, {
+                    startX: dragStart.x,
+                    startY: dragStart.y,
+                    endX: e.clientX,
+                    endY: e.clientY,
+                    distance: distance,
+                    duration: Date.now() - dragStart.time,
+                    button: dragButton,
+                    buttonName: dragStart.buttonName,
+                    totalDistance: distance
+                });
+            }
+            isDragging = false;
+            dragStart = null;
+            dragButton = null;
+            hasDraggedSignificantly = false;
+        }, true);
+        
+        // Drag and drop events
+        document.addEventListener('dragstart', (e) => {
+            captureInteraction('dragstart', e, {
+                dataTransfer: {
+                    effectAllowed: e.dataTransfer.effectAllowed,
+                    types: Array.from(e.dataTransfer.types)
+                }
+            });
+        }, true);
+        
+        document.addEventListener('dragend', (e) => {
+            captureInteraction('dragend', e, {
+                dataTransfer: {
+                    dropEffect: e.dataTransfer.dropEffect
+                }
+            });
+        }, true);
+        
+        document.addEventListener('drop', (e) => {
+            captureInteraction('drop', e, {
+                dataTransfer: {
+                    dropEffect: e.dataTransfer.dropEffect,
+                    types: Array.from(e.dataTransfer.types)
+                },
+                files: e.dataTransfer.files.length > 0 ? Array.from(e.dataTransfer.files).map(f => ({
+                    name: f.name,
+                    type: f.type,
+                    size: f.size
+                })) : null
+            });
+        }, true);
+    """
+
+
+def get_keyboard_event_listeners() -> str:
+    """Return JS that records keydown (with Ctrl/Cmd/Alt/Shift shortcut labels) and keyup."""
+    return """
+        // Keyboard events with shortcut detection
+        document.addEventListener('keydown', (e) => {
+            let shortcut = null;
+            if (e.ctrlKey || e.metaKey) {
+                const modifier = e.ctrlKey ? 'Ctrl' : 'Cmd';
+                const key = e.key.length === 1 ? e.key.toUpperCase() : e.key;
+                shortcut = `${modifier}+${key}`;
+            } else if (e.altKey && e.key.length === 1) {
+                shortcut = `Alt+${e.key.toUpperCase()}`;
+            } else if (e.shiftKey && e.key.length === 1) {
+                shortcut = `Shift+${e.key.toUpperCase()}`;
+            }
+            
+            captureInteraction('keydown', e, {
+                key: e.key,
+                code: e.code,
+                ctrlKey: e.ctrlKey,
+                shiftKey: e.shiftKey,
+                altKey: e.altKey,
+                metaKey: e.metaKey,
+                shortcut: shortcut
+            });
+        }, true);
+        
+        document.addEventListener('keyup', (e) => {
+            captureInteraction('keyup', e, {
+                key: e.key,
+                code: e.code
+            });
+        }, true);
+    """
+
+
+def get_form_event_listeners() -> str:
+    """Return JS for input, select, clipboard, change, submit, datalist and file-input events."""
+    return """
+        // Input events with throttling to prevent spam during fast typing
+        let inputTimeout;
+        let lastInputValue = '';
+        
+        document.addEventListener('input', (e) => {
+            if (['INPUT', 'TEXTAREA'].includes(e.target.tagName) || e.target.contentEditable === 'true') {
+                clearTimeout(inputTimeout);
+                inputTimeout = setTimeout(() => {
+                    const currentValue = e.target.value || e.target.textContent;
+                    // Only capture if value actually changed significantly
+                    if (currentValue !== lastInputValue) {
+                        lastInputValue = currentValue;
+                        captureInteraction('input', e, {
+                            value: currentValue,
+                            inputType: e.inputType || null,
+                            valueLength: currentValue.length
+                        });
+                    }
+                }, 50); // Reduced from 300ms to 50ms for better responsiveness
+            }
+        }, true);
+        
+        // Immediate input capture (without throttling) for certain cases
+        document.addEventListener('input', (e) => {
+            // Immediate capture for dropdown/select-like inputs or when selection changes
+            if (e.target.tagName === 'SELECT' || 
+                e.inputType === 'deleteContentBackward' || 
+                e.inputType === 'insertFromPaste' ||
+                e.inputType === 'insertFromDrop') {
+                captureInteraction('input_immediate', e, {
+                    value: e.target.value || e.target.textContent,
+                    inputType: e.inputType || null,
+                    immediate: true
+                });
+            }
+        }, true);
+        
+        // Text selection events
+        document.addEventListener('select', (e) => {
+            if (['INPUT', 'TEXTAREA'].includes(e.target.tagName)) {
+                const selectedText = e.target.value.substring(e.target.selectionStart, e.target.selectionEnd);
+                captureInteraction('select', e, {
+                    selectedText: selectedText,
+                    selectionStart: e.target.selectionStart,
+                    selectionEnd: e.target.selectionEnd,
+                    value: e.target.value,
+                    selectionLength: selectedText.length
+                });
+            }
+        }, true);
+        
+        // Clipboard events
+        document.addEventListener('cut', (e) => {
+            captureInteraction('cut', e, {
+                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
+                targetValue: e.target.value || e.target.textContent
+            });
+        }, true);
+        
+        document.addEventListener('copy', (e) => {
+            captureInteraction('copy', e, {
+                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
+                targetValue: e.target.value || e.target.textContent
+            });
+        }, true);
+        
+        document.addEventListener('paste', (e) => {
+            captureInteraction('paste', e, {
+                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
+                targetValue: e.target.value || e.target.textContent
+            });
+        }, true);
+        
+        // Enhanced form change events with better dropdown handling
+        document.addEventListener('change', (e) => {
+            let extra = {};
+            if (e.target.tagName === 'SELECT') {
+                const option = e.target.options[e.target.selectedIndex];
+                extra = {
+                    selectedValue: e.target.value,
+                    selectedText: option?.text || '',
+                    selectedIndex: e.target.selectedIndex,
+                    allOptions: Array.from(e.target.options).map(opt => ({
+                        value: opt.value,
+                        text: opt.text,
+                        selected: opt.selected
+                    })),
+                    optionsCount: e.target.options.length
+                };
+            } else if (['checkbox', 'radio'].includes(e.target.type)) {
+                extra = {
+                    checked: e.target.checked,
+                    value: e.target.value,
+                    name: e.target.name
+                };
+            } else {
+                extra = {
+                    value: e.target.value,
+                    previousValue: e.target.defaultValue, // Capture what it was before
+                    inputType: e.target.type
+                };
+            }
+            captureInteraction('change', e, extra);
+        }, true);
+        
+        document.addEventListener('submit', (e) => {
+            captureInteraction('submit', e, {
+                formAction: e.target.action || null,
+                formMethod: e.target.method || 'GET',
+                formElements: Array.from(e.target.elements).length
+            });
+        }, true);
+        
+        // Additional events for better field interaction capture
+        
+        // Option selection in datalists
+        document.addEventListener('input', (e) => {
+            if (e.target.list) { // Has datalist
+                captureInteraction('datalist_input', e, {
+                    value: e.target.value,
+                    listId: e.target.list.id,
+                    optionsCount: e.target.list.options.length
+                });
+            }
+        }, true);
+        
+        // File input changes
+        document.addEventListener('change', (e) => {
+            if (e.target.type === 'file') {
+                captureInteraction('file_select', e, {
+                    filesCount: e.target.files.length,
+                    files: Array.from(e.target.files).map(file => ({
+                        name: file.name,
+                        type: file.type,
+                        size: file.size,
+                        lastModified: file.lastModified
+                    }))
+                });
+            }
+        }, true);
+    """
+
+
+def get_scroll_event_listeners() -> str:
+    """Return JS for a debounced (150 ms) scroll listener and a throttled (100 ms) wheel one."""
+    return """
+        // Scroll events with debouncing to reduce noise
+        let scrollTimeout;
+        let lastScrollTime = 0;
+        
+        document.addEventListener('scroll', (e) => {
+            clearTimeout(scrollTimeout);
+            scrollTimeout = setTimeout(() => {
+                const now = Date.now();
+                // Only capture scroll if it's been at least 200ms since last scroll capture
+                if (now - lastScrollTime > 200) {
+                    lastScrollTime = now;
+                    captureInteraction('scroll', e, {
+                        scrollX: window.scrollX,
+                        scrollY: window.scrollY,
+                        scrollLeft: e.target.scrollLeft || 0,
+                        scrollTop: e.target.scrollTop || 0
+                    });
+                }
+            }, 150); // Increased debounce time
+        }, true);
+        
+        // Wheel events (for detailed scroll tracking) with throttling
+        let lastWheelTime = 0;
+        document.addEventListener('wheel', (e) => {
+            const now = Date.now();
+            // Only capture wheel events every 100ms to reduce noise
+            if (now - lastWheelTime > 100) {
+                lastWheelTime = now;
+                captureInteraction('wheel', e, {
+                    deltaX: e.deltaX,
+                    deltaY: e.deltaY,
+                    deltaZ: e.deltaZ,
+                    deltaMode: e.deltaMode
+                });
+            }
+        }, true);
+    """
+
+
+def get_focus_event_listeners() -> str:
+    """Return JS for focus/blur listeners on form controls, links, editable or tabbable nodes."""
+    return """
+        // Focus events - only for interactive elements to reduce noise
+        document.addEventListener('focus', (e) => {
+            // Only capture focus on interactive elements
+            const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A'];
+            if (interactiveElements.includes(e.target.tagName) || 
+                e.target.contentEditable === 'true' || 
+                e.target.tabIndex >= 0) {
+                captureInteraction('focus', e);
+            }
+        }, true);
+        
+        document.addEventListener('blur', (e) => {
+            // Only capture blur on interactive elements
+            const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A'];
+            if (interactiveElements.includes(e.target.tagName) || 
+                e.target.contentEditable === 'true' || 
+                e.target.tabIndex >= 0) {
+                captureInteraction('blur', e);
+            }
+        }, true);
+    """
+
+
+def get_recording_indicators_script() -> str:
+    """Return JS that re-creates the pulsing '__rec_border' overlay and '__rec' status banner."""
+    return """
+        // Remove any existing indicators
+        const existingBorder = document.getElementById('__rec_border');
+        if (existingBorder) existingBorder.remove();
+        const existingIndicator = document.getElementById('__rec');
+        if (existingIndicator) existingIndicator.remove();
+        
+        // Create border overlay
+        const border = document.createElement('div');
+        border.id = '__rec_border';
+        border.style.cssText = `
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100vw;
+            height: 100vh;
+            border: 8px solid #ff0000;
+            box-sizing: border-box;
+            pointer-events: none;
+            z-index: 999999;
+            animation: pulse 1.5s infinite;
+        `;
+        
+        // Create status indicator
+        const indicator = document.createElement('div');
+        indicator.id = '__rec';
+        indicator.innerHTML = '🔴 RECORDING - Perform your action now';
+        indicator.style.cssText = `
+            position: fixed;
+            top: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            background: #ff0000;
+            color: #fff;
+            padding: 12px 20px;
+            border-radius: 8px;
+            font: bold 10px -apple-system, BlinkMacSystemFont, sans-serif;
+            z-index: 9999999;
+            box-shadow: 0 4px 12px rgba(255,0,0,0.4);
+            animation: pulse 1.5s infinite;
+        `;
+        
+        // Add pulsing animation
+        const style = document.createElement('style');
+        style.textContent = `
+            @keyframes pulse {
+                0% { opacity: 1; }
+                50% { opacity: 0.4; }
+                100% { opacity: 0.8; }
+            }
+        `;
+        document.head.appendChild(style);
+        
+        document.body.appendChild(border);
+        document.body.appendChild(indicator);
+    """

From d3054cd15d2f6eb492c29531d0479b4ae61377b5 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Thu, 21 Aug 2025 18:54:55 -0400
Subject: [PATCH 08/23] Add codegen step-wise recorder agent

---
 .../human_trace_recorder/codegen_agent.py     | 192 ++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 src/agentlab/agents/human_trace_recorder/codegen_agent.py

diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py
new file mode 100644
index 00000000..16d0222c
--- /dev/null
+++ b/src/agentlab/agents/human_trace_recorder/codegen_agent.py
@@ -0,0 +1,192 @@
+"""Simple Codegen Agent
+
+Captures human interactions using playwright inspector.
+Playwright trace logs are stored in "think" messages and can be viewed in Agentlab Xray.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import tempfile
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import bgym
+from playwright.sync_api import Page
+
+from agentlab.agents.agent_args import AgentArgs
+from browsergym.core.observation import (
+    extract_dom_extra_properties,
+    extract_dom_snapshot,
+    extract_focused_element_bid,
+    extract_merged_axtree,
+    extract_screenshot,
+)
+from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
+
+
+def extract_log_message_from_pw_trace(pw_trace_file_path):
+    zip_file = zipfile.ZipFile(pw_trace_file_path, "r")
+    trace_lines = zip_file.read("trace.trace").decode("utf-8").splitlines()
+    # Each non-blank line of trace.trace is parsed as one JSON event; only type == "log" is kept.
+    actions = []
+    for line in trace_lines:
+        if line.strip():
+            event = json.loads(line)
+            if event.get("type") == "log":
+                actions.append(event)
+    # Return stripped messages sorted by "time" (0 if absent). NOTE(review): zip_file is not closed.
+    return [log["message"].strip() for log in sorted(actions, key=lambda x: x.get("time", 0))]
+
+
+def clean_pw_logs(logs, exclude_blacklist=True, use_substitutions=True):
+    clean_logs = list(logs)  # work on a copy of the input sequence
+    blacklist = {
+        "attempting click action",
+        "waiting for element to be visible, enabled and stable",
+        "element is visible, enabled and stable",
+        "scrolling into view if needed",
+        "done scrolling",
+        "performing click action",
+        "click action done",
+        "waiting for scheduled navigations to finish",
+        "navigations have finished",
+    }
+    # Whole-message blacklist filtering runs first; substring substitutions are applied after it.
+    substitutions = [("waiting for ", "")]  # (old, new) pairs fed to str.replace
+
+    def apply_substitutions(log):
+        for old, new in substitutions:
+            log = log.replace(old, new)
+        return log
+
+    if exclude_blacklist:
+        clean_logs = [log for log in clean_logs if log not in blacklist]
+    if use_substitutions:
+        clean_logs = [apply_substitutions(log) for log in clean_logs]
+
+    return clean_logs
+
+
+@dataclass
+class PlayWrightCodeGenAgentArgs(AgentArgs):  # config/factory for PlayWrightCodeGenAgent
+    agent_name: str = "PlayWrightCodeGenAgent"
+    trace_dir: str = "playwright_codegen_traces"  # root directory handed to the agent
+    use_raw_page_output: bool = True  # presumably makes the env expose obs["page"] - TODO confirm
+    store_raw_trace: bool = False  # True: keep trace zips under trace_dir; False: use a temp dir
+
+    def make_agent(self) -> bgym.Agent:  # type: ignore[override]
+        return PlayWrightCodeGenAgent(self.trace_dir, self.store_raw_trace)
+
+    def set_reproducibility_mode(self):
+        pass  # no-op: nothing is changed for this agent
+
+
+class PlayWrightCodeGenAgent(bgym.Agent):  # records human actions via the Playwright Inspector
+    def __init__(self, trace_dir: str, store_raw_trace: bool):
+        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
+        self._root = Path(trace_dir)  # parent directory for persisted traces
+        self._page: Page | None = None  # set by obs_preprocessor from obs["page"]
+        self._step = 0  # index used to name step_<n>.zip
+        self.store_raw_trace = store_raw_trace
+        self._episode_trace_dir = None  # Cache for single episode
+
+    def _get_trace_dir(self):
+        """Return (creating once) the trace dir: timestamped if store_raw_trace, else a temp dir."""
+        if self._episode_trace_dir is None:
+            if self.store_raw_trace:
+                import datetime
+
+                dt_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+                self._episode_trace_dir = self._root / f"codegen_traces_{dt_str}"
+                self._episode_trace_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                self._episode_trace_dir = Path(tempfile.mkdtemp())  # NOTE(review): never removed
+        return self._episode_trace_dir
+
+    def obs_preprocessor(self, obs: dict):  # type: ignore[override]
+        if isinstance(obs, dict):
+            self._page = obs.get("page")  # keep the live Page for get_action
+            obs["screenshot"] = extract_screenshot(self._page)
+            obs["dom_object"] = extract_dom_snapshot(self._page)
+            obs["axtree_object"] = extract_merged_axtree(self._page)
+            scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0)
+            extra_properties = extract_dom_extra_properties(
+                obs["dom_object"], scale_factor=scale_factor
+            )
+            obs["extra_element_properties"] = extra_properties
+            obs["focused_element_bid"] = extract_focused_element_bid(self._page)
+
+            if obs["axtree_object"]:
+                obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"])
+
+            if obs["dom_object"]:
+                obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"])
+                obs["pruned_html"] = prune_html(obs["dom_txt"])
+
+        if "page" in obs:  # the Page object is not picklable, so drop it from the observation
+            del obs["page"]
+
+        return obs
+
+    def get_action(self, obs: dict):  # type: ignore[override]
+        """Record one human step via the Playwright Inspector and return ("noop()", AgentInfo)."""
+        if self._page is None:
+            raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")
+
+        page = self._page
+        trace_dir = self._get_trace_dir()
+        trace_path = trace_dir / f"step_{self._step}.zip"
+        page.context.tracing.start(screenshots=True, snapshots=True, sources=True)
+        page.context.tracing.start_chunk(name=f"step_{self._step}")
+
+        print(
+            f"{'─'*60}\n" f"Step {self._step}\n",
+            f"{'─'*60}\n",
+            "1. 🔴 Start Recording (Press 'Record' in the Playwright Inspector.)\n",
+            "2. ✨ Perform actions for a single step.\n",
+            "3. ⚫ Stop Recording (Press 'Record' again to stop recording.)\n",
+            "4. ▶️  Press 'Resume' in the Playwright Inspector.",
+        )
+
+        page.pause()  # Launch Inspector and record actions
+        page.context.tracing.stop_chunk(path=trace_path)  # writes step_<n>.zip
+        page.context.tracing.stop()  # NOTE(review): not reached if pause() raises
+
+        pw_logs = extract_log_message_from_pw_trace(trace_path)
+        pw_logs = clean_pw_logs(pw_logs, exclude_blacklist=True)
+        pw_logs_str = "\n".join([f"{i}. {log}" for i, log in enumerate(pw_logs, 1)])
+
+        print(f"\n Playwright logs for step {self._step}:\n{pw_logs_str}")
+
+        self._step += 1
+
+        agent_info = bgym.AgentInfo(
+            think=pw_logs_str,  # numbered, cleaned Playwright log lines for this step
+            chat_messages=[],
+            stats={},
+        )
+
+        return "noop()", agent_info  # always "noop()"; the recorded logs travel in agent_info
+
+
+PW_CODEGEN_AGENT = PlayWrightCodeGenAgentArgs(store_raw_trace=True)  # keeps trace zips on disk
+
+
+if __name__ == "__main__":
+    from agentlab.agents.human_trace_recorder.codegen_agent import PW_CODEGEN_AGENT
+    from agentlab.experiments.study import Study
+
+    agent_configs = [PW_CODEGEN_AGENT]
+    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]()  # type: bgym.Benchmark
+    benchmark = benchmark.subset_from_glob("task_name", "*create*")
+    benchmark.env_args_list = benchmark.env_args_list[:1]  # run only the first selected task
+    for env_args in benchmark.env_args_list:
+        print(env_args.task_name)
+        env_args.max_steps = 15
+        env_args.headless = False  # visible browser, needed for the human and the Inspector
+
+    study = Study(agent_configs, benchmark, logging_level_stdout=logging.INFO)
+    study.run(n_jobs=1, parallel_backend="sequential", n_relaunch=1)

From bf0b6e71ebc9aeb9508e1bf8375212283cf38166 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Thu, 28 Aug 2025 15:05:18 +0200
Subject: [PATCH 09/23] fix repeated llm configs

---
 src/agentlab/llm/llm_configs.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
index 3d5828b9..afbf094f 100644
--- a/src/agentlab/llm/llm_configs.py
+++ b/src/agentlab/llm/llm_configs.py
@@ -20,22 +20,6 @@
 ]
 
 CHAT_MODEL_ARGS_DICT = {
-    "openai/gpt-5-nano-2025-08-07": OpenAIModelArgs(
-        model_name="gpt-5-nano-2025-08-07",
-        max_total_tokens=128_000,
-        max_input_tokens=128_000,
-        max_new_tokens=16_384,
-        temperature=1,  # gpt-5 supports temperature of 1 only
-        vision_support=True,
-    ),
-    "openai/gpt-5-mini-2025-08-07": OpenAIModelArgs(
-        model_name="gpt-5-mini-2025-08-07",
-        max_total_tokens=128_000,
-        max_input_tokens=128_000,
-        max_new_tokens=16_384,
-        temperature=1,  # gpt-5 supports temperature of 1 only
-        vision_support=True,
-    ),
     "openai/gpt-4.1-mini-2025-04-14": OpenAIModelArgs(
         model_name="gpt-4.1-mini-2025-04-14",
         max_total_tokens=128_000,
@@ -117,6 +101,7 @@
         max_input_tokens=400_000 - 4_000,
         max_new_tokens=4_000,
         temperature=1,  # temperature param not supported by gpt-5
+        vision_support=True,
     ),
     "openai/gpt-5-mini-2025-08-07": OpenAIModelArgs(
         model_name="gpt-5-mini-2025-08-07",
@@ -124,6 +109,7 @@
         max_input_tokens=400_000 - 4_000,
         max_new_tokens=4_000,
         temperature=1,  # temperature param not supported by gpt-5
+        vision_support=True,
     ),
     "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs(
         model_name="gpt-35-turbo",

From f7d154551c03bc427343af4e22426b87c040274e Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Thu, 28 Aug 2025 15:06:00 +0200
Subject: [PATCH 10/23] load env vars in codegen agent

---
 src/agentlab/agents/human_trace_recorder/codegen_agent.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py
index 16d0222c..cd902bd2 100644
--- a/src/agentlab/agents/human_trace_recorder/codegen_agent.py
+++ b/src/agentlab/agents/human_trace_recorder/codegen_agent.py
@@ -14,8 +14,6 @@
 from pathlib import Path
 
 import bgym
-from playwright.sync_api import Page
-
 from agentlab.agents.agent_args import AgentArgs
 from browsergym.core.observation import (
     extract_dom_extra_properties,
@@ -25,7 +23,10 @@
     extract_screenshot,
 )
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
+from dotenv import load_dotenv
+from playwright.sync_api import Page
 
+load_dotenv()
 
 def extract_log_message_from_pw_trace(pw_trace_file_path):
     zip_file = zipfile.ZipFile(pw_trace_file_path, "r")

From 55ce26a2f85e02b965c31a06660aa4f2518937b5 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Fri, 29 Aug 2025 17:45:41 +0200
Subject: [PATCH 11/23] same hints retrieval for both generic and tooluse
 agents

---
 .../generic_agent_hinter/generic_agent.py     |  3 +-
 .../generic_agent_prompt.py                   | 65 +++++++++----
 .../agents/tool_use_agent/tool_use_agent.py   | 91 +++++++++++++------
 3 files changed, 109 insertions(+), 50 deletions(-)

diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent.py b/src/agentlab/agents/generic_agent_hinter/generic_agent.py
index 91b2f70f..cfbd19bd 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent.py
@@ -111,10 +111,11 @@ def get_action(self, obs):
             previous_plan=self.plan,
             step=self.plan_step,
             flags=self.flags,
+            llm=self.chat_llm,
         )
 
         # Set task name for task hints if available
-        if self.flags.use_task_hint and hasattr(self, 'task_name'):
+        if self.flags.use_task_hint and hasattr(self, "task_name"):
             main_prompt.set_task_name(self.task_name)
 
         max_prompt_tokens, max_trunc_itr = self._get_maxes()
diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
index bc12cc2c..f529fd78 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -6,15 +6,16 @@
 
 import logging
 from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
 
-from browsergym.core import action
+import pandas as pd
 from browsergym.core.action.base import AbstractActionSet
 
 from agentlab.agents import dynamic_prompting as dp
+from agentlab.agents.tool_use_agent.tool_use_agent import HintsSource
+from agentlab.llm.chat_api import ChatModel
 from agentlab.llm.llm_utils import HumanMessage, parse_html_tags_raise
-import fnmatch
-import pandas as pd
-from pathlib import Path
 
 
 @dataclass
@@ -49,6 +50,7 @@ class GenericPromptFlags(dp.Flags):
     use_abstract_example: bool = False
     use_hints: bool = False
     use_task_hint: bool = False
+    task_hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct"
     hint_db_path: str = None
     enable_chat: bool = False
     max_prompt_tokens: int = None
@@ -70,10 +72,12 @@ def __init__(
         previous_plan: str,
         step: int,
         flags: GenericPromptFlags,
+        llm: ChatModel,
     ) -> None:
         super().__init__()
         self.flags = flags
         self.history = dp.History(obs_history, actions, memories, thoughts, flags.obs)
+        goal = obs_history[-1]["goal_object"]
         if self.flags.enable_chat:
             self.instructions = dp.ChatInstructions(
                 obs_history[-1]["chat_messages"], extra_instructions=flags.extra_instructions
@@ -84,7 +88,7 @@ def __init__(
                     "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`."
                 )
             self.instructions = dp.GoalInstructions(
-                obs_history[-1]["goal_object"], extra_instructions=flags.extra_instructions
+                goal, extra_instructions=flags.extra_instructions
             )
 
         self.obs = dp.Observation(
@@ -105,7 +109,10 @@ def time_for_caution():
         self.hints = dp.Hints(visible=lambda: flags.use_hints)
         self.task_hint = TaskHint(
             use_task_hint=flags.use_task_hint,
-            hint_db_path=flags.hint_db_path
+            hint_db_path=flags.hint_db_path,
+            goal=goal,
+            hint_retrieval_mode=flags.task_hint_retrieval_mode,
+            llm=llm,
         )
         self.plan = Plan(previous_plan, step, lambda: flags.use_plan)  # TODO add previous plan
         self.criticise = Criticise(visible=lambda: flags.use_criticise)
@@ -114,12 +121,12 @@ def time_for_caution():
     @property
     def _prompt(self) -> HumanMessage:
         prompt = HumanMessage(self.instructions.prompt)
-        
+
         # Add task hints if enabled
         task_hints_text = ""
-        if self.flags.use_task_hint and hasattr(self, 'task_name'):
+        if self.flags.use_task_hint and hasattr(self, "task_name"):
             task_hints_text = self.task_hint.get_hints_for_task(self.task_name)
-        
+
         prompt.add_text(
             f"""\
 {self.obs.prompt}\
@@ -286,11 +293,21 @@ def _parse_answer(self, text_answer):
 
 
 class TaskHint(dp.PromptElement):
-    def __init__(self, use_task_hint: bool = True, hint_db_path: str = None) -> None:
+    def __init__(
+        self,
+        use_task_hint: bool,
+        hint_db_path: str,
+        goal: str,
+        hint_retrieval_mode: Literal["direct", "llm", "emb"],
+        llm: ChatModel,
+    ) -> None:
         super().__init__(visible=use_task_hint)
         self.use_task_hint = use_task_hint
         self.hint_db_rel_path = "hint_db.csv"
         self.hint_db_path = hint_db_path  # Allow external path override
+        self.hint_retrieval_mode: Literal["direct", "llm", "emb"] = hint_retrieval_mode
+        self.goal = goal
+        self.llm = llm
         self._init()
 
     _prompt = ""  # Task hints are added dynamically in MainPrompt
@@ -316,39 +333,49 @@ def _init(self):
                 hint_db_path = Path(self.hint_db_path)
             else:
                 hint_db_path = Path(__file__).parent / self.hint_db_rel_path
-            
+
             if hint_db_path.exists():
                 self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str)
                 # Verify the expected columns exist
                 if "task_name" not in self.hint_db.columns or "hint" not in self.hint_db.columns:
-                    print(f"Warning: Hint database missing expected columns. Found: {list(self.hint_db.columns)}")
+                    print(
+                        f"Warning: Hint database missing expected columns. Found: {list(self.hint_db.columns)}"
+                    )
                     self.hint_db = pd.DataFrame(columns=["task_name", "hint"])
             else:
                 print(f"Warning: Hint database not found at {hint_db_path}")
                 self.hint_db = pd.DataFrame(columns=["task_name", "hint"])
+            self.hints_source = HintsSource(
+                hint_db_path=self.hint_db_rel_path,
+                hint_retrieval_mode=self.hint_retrieval_mode,
+            )
         except Exception as e:
             # Fallback to empty database on any error
             print(f"Warning: Could not load hint database: {e}")
             self.hint_db = pd.DataFrame(columns=["task_name", "hint"])
 
-
     def get_hints_for_task(self, task_name: str) -> str:
         """Get hints for a specific task."""
         if not self.use_task_hint:
             return ""
 
         # Ensure hint_db is initialized
-        if not hasattr(self, 'hint_db'):
+        if not hasattr(self, "hint_db"):
             self._init()
 
         # Check if hint_db has the expected structure
-        if self.hint_db.empty or "task_name" not in self.hint_db.columns or "hint" not in self.hint_db.columns:
+        if (
+            self.hint_db.empty
+            or "task_name" not in self.hint_db.columns
+            or "hint" not in self.hint_db.columns
+        ):
             return ""
 
         try:
-            task_hints = self.hint_db[
-                self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
-            ]
+            # task_hints = self.hint_db[
+            #     self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
+            # ]
+            task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal)
 
             hints = []
             for hint in task_hints["hint"]:
@@ -364,5 +391,5 @@ def get_hints_for_task(self, task_name: str) -> str:
                 return hints_str
         except Exception as e:
             print(f"Warning: Error getting hints for task {task_name}: {e}")
-        
+
         return ""
diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 375c829e..9025107e 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -28,6 +28,7 @@
 from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
 from agentlab.benchmarks.osworld import OSWorldActionSet
 from agentlab.llm.base_api import BaseModelArgs
+from agentlab.llm.chat_api import ChatModel
 from agentlab.llm.llm_utils import image_to_png_base64_url
 from agentlab.llm.response_api import (
     APIPayload,
@@ -316,39 +317,21 @@ class TaskHint(Block):
 
     def _init(self):
         """Initialize the block."""
-        if Path(self.hint_db_rel_path).is_absolute():
-            hint_db_path = Path(self.hint_db_rel_path)
-        else:
-            hint_db_path = Path(__file__).parent / self.hint_db_rel_path
-        self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str)
-        if self.hint_retrieval_mode == "emb":
-            self.encode_hints()
-
-    def oai_embed(self, text: str):
-        response = self._oai_emb.create(input=text, model="text-embedding-3-small")
-        return response.data[0].embedding
-
-    def encode_hints(self):
-        self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first")
-        logger.info(
-            f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model."
+        self.hints_source = HintsSource(
+            hint_db_path=self.hint_db_rel_path,
+            hint_retrieval_mode=self.hint_retrieval_mode,
+            top_n=self.top_n,
+            embedder_model=self.embedder_model,
+            embedder_server=self.embedder_server,
+            llm_prompt=self.llm_prompt,
         )
-        hints = self.uniq_hints["hint"].tolist()
-        semantic_keys = self.uniq_hints["semantic_keys"].tolist()
-        lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)]
-        emb_path = f"{self.hint_db_rel_path}.embs.npy"
-        assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}"
-        logger.info(f"Loading hint embeddings from: {emb_path}")
-        emb_dict = np.load(emb_path, allow_pickle=True).item()
-        self.hint_embeddings = np.array([emb_dict[k] for k in lines])
-        logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}")
 
     def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
         if not self.use_task_hint:
             return {}
 
         goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content])
-        task_hints = self.choose_hints(llm, task_name, goal)
+        task_hints = self.hints_source.choose_hints(llm, task_name, goal)
 
         hints = []
         for hint in task_hints:
@@ -365,6 +348,49 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
 
             discussion.append(msg)
 
+
+class HintsSource:
+    def __init__(
+        self,
+        hint_db_path: str,
+        hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct",
+        top_n: int = 4,
+        embedder_model: str = "Qwen/Qwen3-Embedding-0.6B",
+        embedder_server: str = "http://localhost:5000",
+        llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n
+You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n
+Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1.""",
+    ) -> None:
+        self.hint_db_path = hint_db_path
+        self.hint_retrieval_mode = hint_retrieval_mode
+        self.top_n = top_n
+        self.embedder_model = embedder_model
+        self.embedder_server = embedder_server
+        self.llm_prompt = llm_prompt
+
+        if Path(hint_db_path).is_absolute():
+            self.hint_db_path = Path(hint_db_path).as_posix()
+        else:
+            self.hint_db_path = (Path(__file__).parent / self.hint_db_path).as_posix()
+        self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str)
+        if self.hint_retrieval_mode == "emb":
+            self.load_hint_vectors()
+
+    def load_hint_vectors(self):
+        self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first")
+        logger.info(
+            f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model."
+        )
+        hints = self.uniq_hints["hint"].tolist()
+        semantic_keys = self.uniq_hints["semantic_keys"].tolist()
+        lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)]
+        emb_path = f"{self.hint_db_path}.embs.npy"
+        assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}"
+        logger.info(f"Loading hint embeddings from: {emb_path}")
+        emb_dict = np.load(emb_path, allow_pickle=True).item()
+        self.hint_embeddings = np.array([emb_dict[k] for k in lines])
+        logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}")
+
     def choose_hints(self, llm, task_name: str, goal: str) -> list[str]:
         """Choose hints based on the task name."""
         if self.hint_retrieval_mode == "llm":
@@ -384,11 +410,14 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
         hint_topics = list(topic_to_hints.keys())
         topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)])
         prompt = self.llm_prompt.format(goal=goal, topics=topics)
-        response = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)]))
+        if isinstance(llm, ChatModel):
+            response: str = llm(messages=[dict(role="user", content=prompt)])["content"]
+        else:
+            response: str = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])).think
         try:
-            hint_topic_idx = json.loads(response.think)
+            hint_topic_idx = json.loads(response)
             if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics):
-                logger.error(f"Wrong LLM hint id response: {response.think}, no hints")
+                logger.error(f"Wrong LLM hint id response: {response}, no hints")
                 return []
             hint_topic = hint_topics[hint_topic_idx]
             hint_indices = topic_to_hints[hint_topic]
@@ -397,7 +426,7 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
             hints = df["hint"].tolist()
             logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}")
         except json.JSONDecodeError:
-            logger.error(f"Failed to parse LLM hint id response: {response.think}, no hints")
+            logger.error(f"Failed to parse LLM hint id response: {response}, no hints")
             hints = []
         return hints
 
@@ -427,6 +456,7 @@ def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_ret
                     raise e
                 time.sleep(random.uniform(1, timeout))
                 continue
+        raise ValueError("Failed to encode hints")
 
     def _similarity(
         self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5
@@ -446,6 +476,7 @@ def _similarity(
                     raise e
                 time.sleep(random.uniform(1, timeout))
                 continue
+        raise ValueError("Failed to compute similarity")
 
     def choose_hints_direct(self, task_name: str) -> list[str]:
         hints = self.hint_db[

From cad12096f312cfd74de24f0b50ba4010f12953f3 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 13:51:53 +0200
Subject: [PATCH 12/23] filter out current task hints if needed

---
 .../agents/tool_use_agent/tool_use_agent.py   | 41 +++++++++++++++----
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 9025107e..4e6de3b3 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -341,7 +341,7 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
 
         if len(hints) > 0:
             hints_str = (
-                "# Hints:\nHere are some hints for the task you are working on:\n"
+                "\n# Hints:\nHere are some hints for the task you are working on:\n"
                 + "\n".join(hints)
             )
             msg = llm.msg.user().add_text(hints_str)
@@ -354,6 +354,7 @@ def __init__(
         self,
         hint_db_path: str,
         hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct",
+        skip_hints_for_current_task: bool = False,
         top_n: int = 4,
         embedder_model: str = "Qwen/Qwen3-Embedding-0.6B",
         embedder_server: str = "http://localhost:5000",
@@ -363,6 +364,7 @@ def __init__(
     ) -> None:
         self.hint_db_path = hint_db_path
         self.hint_retrieval_mode = hint_retrieval_mode
+        self.skip_hints_for_current_task = skip_hints_for_current_task
         self.top_n = top_n
         self.embedder_model = embedder_model
         self.embedder_server = embedder_server
@@ -405,7 +407,14 @@ def choose_hints(self, llm, task_name: str, goal: str) -> list[str]:
     def choose_hints_llm(self, llm, goal: str) -> list[str]:
         """Choose hints using LLM to filter the hints."""
         topic_to_hints = defaultdict(list)
-        for i, row in self.hint_db.iterrows():
+        hints_df = self.hint_db
+        if self.skip_hints_for_current_task:
+            current_task_hints = self.get_current_task_hints(task_name)
+            hints_df = hints_df[~hints_df["hint"].isin(current_task_hints)]
+            logger.info(
+                f"Filtered out current task hints, remaining hints: {hints_df.shape[0]} out of {self.hint_db.shape[0]}"
+            )
+        for i, row in hints_df.iterrows():
             topic_to_hints[row["semantic_keys"]].append(i)
         hint_topics = list(topic_to_hints.keys())
         topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)])
@@ -421,10 +430,10 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
                 return []
             hint_topic = hint_topics[hint_topic_idx]
             hint_indices = topic_to_hints[hint_topic]
-            df = self.hint_db.iloc[hint_indices].copy()
+            df = hints_df.iloc[hint_indices].copy()
             df = df.drop_duplicates(subset=["hint"], keep="first")  # leave only unique hints
             hints = df["hint"].tolist()
-            logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}")
+            logger.info(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}")
         except json.JSONDecodeError:
             logger.error(f"Failed to parse LLM hint id response: {response}, no hints")
             hints = []
@@ -433,10 +442,21 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
     def choose_hints_emb(self, goal: str) -> list[str]:
         """Choose hints using embeddings to filter the hints."""
         goal_embeddings = self._encode([goal], prompt="task description")
-        similarities = self._similarity(goal_embeddings.tolist(), self.hint_embeddings.tolist())
+        hint_embeddings = self.hint_embeddings
+        hints_df = self.uniq_hints
+        if self.skip_hints_for_current_task:
+            current_task_hints = self.get_current_task_hints(task_name)
+            mask = ~hints_df["hint"].isin(current_task_hints)
+            hints_df = hints_df[mask]
+            filtered_indices = hints_df.index.tolist()
+            hint_embeddings = hint_embeddings[filtered_indices]
+            logger.info(
+                f"Filtered same task hint, remained: {len(hint_embeddings)} out of {len(self.hint_embeddings)} embeddings"
+            )
+        similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings.tolist())
         top_indices = similarities.argsort()[0][-self.top_n :].tolist()
         logger.info(f"Top hint indices based on embedding similarity: {top_indices}")
-        hints = self.uniq_hints.iloc[top_indices]
+        hints = hints_df.iloc[top_indices]
         logger.info(f"Embedding-based hints chosen: {hints}")
         return hints["hint"].tolist()
 
@@ -479,10 +499,15 @@ def _similarity(
         raise ValueError("Failed to compute similarity")
 
     def choose_hints_direct(self, task_name: str) -> list[str]:
-        hints = self.hint_db[
+        hints = self.get_current_task_hints(task_name)
+        logger.info(f"Direct hints chosen: {hints}")
+        return hints
+
+    def get_current_task_hints(self, task_name):
+        hints_df = self.hint_db[
             self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
         ]
-        return hints["hint"].tolist()
+        return hints_df["hint"].tolist()
 
 
 @dataclass

From d920b8eb6cae5e39ba5f1a49bd1b73b633294e6c Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 13:52:08 +0200
Subject: [PATCH 13/23] fix llm config, add gpt-5

---
 src/agentlab/llm/llm_configs.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
index afbf094f..7ac2450a 100644
--- a/src/agentlab/llm/llm_configs.py
+++ b/src/agentlab/llm/llm_configs.py
@@ -111,6 +111,14 @@
         temperature=1,  # temperature param not supported by gpt-5
         vision_support=True,
     ),
+    "openai/gpt-5-2025-08-07": OpenAIModelArgs(
+        model_name="gpt-5-2025-08-07",
+        max_total_tokens=400_000,
+        max_input_tokens=400_000 - 4_000,
+        max_new_tokens=4_000,
+        temperature=1,  # temperature param not supported by gpt-5
+        vision_support=True,
+    ),
     "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs(
         model_name="gpt-35-turbo",
         deployment_name="gpt-35-turbo",

From 5315f14b2b5b57f43e23a0da0eec6b31f273ce99 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 13:52:21 +0200
Subject: [PATCH 14/23] fix iteration over chosen hints list in generic agent prompt

---
 .../agents/generic_agent_hinter/generic_agent_prompt.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
index f529fd78..19f0efda 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -378,7 +378,7 @@ def get_hints_for_task(self, task_name: str) -> str:
             task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal)
 
             hints = []
-            for hint in task_hints["hint"]:
+            for hint in task_hints:
                 hint = hint.strip()
                 if hint:
                     hints.append(f"- {hint}")

From 26f0abb36fc80999576cc7beec065f3da07dbb1e Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 14:35:05 +0200
Subject: [PATCH 15/23] pass new flag and fix db path passing issue

---
 .../generic_agent_hinter/generic_agent_prompt.py     |  7 ++++++-
 src/agentlab/agents/tool_use_agent/tool_use_agent.py | 12 ++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
index 19f0efda..84b5d332 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -51,6 +51,7 @@ class GenericPromptFlags(dp.Flags):
     use_hints: bool = False
     use_task_hint: bool = False
     task_hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct"
+    skip_hints_for_current_task: bool = False
     hint_db_path: str = None
     enable_chat: bool = False
     max_prompt_tokens: int = None
@@ -113,6 +114,7 @@ def time_for_caution():
             goal=goal,
             hint_retrieval_mode=flags.task_hint_retrieval_mode,
             llm=llm,
+            skip_hints_for_current_task=flags.skip_hints_for_current_task,
         )
         self.plan = Plan(previous_plan, step, lambda: flags.use_plan)  # TODO add previous plan
         self.criticise = Criticise(visible=lambda: flags.use_criticise)
@@ -299,6 +301,7 @@ def __init__(
         hint_db_path: str,
         goal: str,
         hint_retrieval_mode: Literal["direct", "llm", "emb"],
+        skip_hints_for_current_task: bool,
         llm: ChatModel,
     ) -> None:
         super().__init__(visible=use_task_hint)
@@ -306,6 +309,7 @@ def __init__(
         self.hint_db_rel_path = "hint_db.csv"
         self.hint_db_path = hint_db_path  # Allow external path override
         self.hint_retrieval_mode: Literal["direct", "llm", "emb"] = hint_retrieval_mode
+        self.skip_hints_for_current_task = skip_hints_for_current_task
         self.goal = goal
         self.llm = llm
         self._init()
@@ -346,8 +350,9 @@ def _init(self):
                 print(f"Warning: Hint database not found at {hint_db_path}")
                 self.hint_db = pd.DataFrame(columns=["task_name", "hint"])
             self.hints_source = HintsSource(
-                hint_db_path=self.hint_db_rel_path,
+                hint_db_path=hint_db_path.as_posix(),
                 hint_retrieval_mode=self.hint_retrieval_mode,
+                skip_hints_for_current_task=self.skip_hints_for_current_task,
             )
         except Exception as e:
             # Fallback to empty database on any error
diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 4e6de3b3..b8f21431 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -375,6 +375,7 @@ def __init__(
         else:
             self.hint_db_path = (Path(__file__).parent / self.hint_db_path).as_posix()
         self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str)
+        logger.info(f"Loaded {len(self.hint_db)} hints from database {self.hint_db_path}")
         if self.hint_retrieval_mode == "emb":
             self.load_hint_vectors()
 
@@ -395,16 +396,19 @@ def load_hint_vectors(self):
 
     def choose_hints(self, llm, task_name: str, goal: str) -> list[str]:
         """Choose hints based on the task name."""
+        logger.info(
+            f"Choosing hints for task: {task_name}, goal: {goal} from db: {self.hint_db_path} using mode: {self.hint_retrieval_mode}"
+        )
         if self.hint_retrieval_mode == "llm":
-            return self.choose_hints_llm(llm, goal)
+            return self.choose_hints_llm(llm, goal, task_name)
         elif self.hint_retrieval_mode == "direct":
             return self.choose_hints_direct(task_name)
         elif self.hint_retrieval_mode == "emb":
-            return self.choose_hints_emb(goal)
+            return self.choose_hints_emb(goal, task_name)
         else:
             raise ValueError(f"Unknown hint retrieval mode: {self.hint_retrieval_mode}")
 
-    def choose_hints_llm(self, llm, goal: str) -> list[str]:
+    def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]:
         """Choose hints using LLM to filter the hints."""
         topic_to_hints = defaultdict(list)
         hints_df = self.hint_db
@@ -439,7 +443,7 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]:
             hints = []
         return hints
 
-    def choose_hints_emb(self, goal: str) -> list[str]:
+    def choose_hints_emb(self, goal: str, task_name: str) -> list[str]:
         """Choose hints using embeddings to filter the hints."""
         goal_embeddings = self._encode([goal], prompt="task description")
         hint_embeddings = self.hint_embeddings

From 5393a34112beab3e92e339d429947c871bfeb67e Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 14:59:54 +0200
Subject: [PATCH 16/23] fix goal text

---
 .../agents/generic_agent_hinter/generic_agent_prompt.py        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
index 84b5d332..b684b6c9 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -108,10 +108,12 @@ def time_for_caution():
         self.be_cautious = dp.BeCautious(visible=time_for_caution)
         self.think = dp.Think(visible=lambda: flags.use_thinking)
         self.hints = dp.Hints(visible=lambda: flags.use_hints)
+        # Join all text parts; avoids IndexError/KeyError on an empty or non-text-first goal.
+        goal_str: str = "\n".join(part["text"] for part in goal if "text" in part)
         self.task_hint = TaskHint(
             use_task_hint=flags.use_task_hint,
             hint_db_path=flags.hint_db_path,
-            goal=goal,
+            goal=goal_str,
             hint_retrieval_mode=flags.task_hint_retrieval_mode,
             llm=llm,
             skip_hints_for_current_task=flags.skip_hints_for_current_task,

From deddc50697b3871077d8000b2fa3fe0b48649b5d Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Mon, 1 Sep 2025 17:35:22 +0200
Subject: [PATCH 17/23] fix current task hints exclusion

---
 .../agents/tool_use_agent/tool_use_agent.py   | 82 ++++++++++---------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index b8f21431..bd200da3 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -411,58 +411,62 @@ def choose_hints(self, llm, task_name: str, goal: str) -> list[str]:
     def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]:
         """Choose hints using LLM to filter the hints."""
         topic_to_hints = defaultdict(list)
-        hints_df = self.hint_db
+        skip_hints = []
         if self.skip_hints_for_current_task:
-            current_task_hints = self.get_current_task_hints(task_name)
-            hints_df = hints_df[~hints_df["hint"].isin(current_task_hints)]
-            logger.info(
-                f"Filtered out current task hints, remaining hints: {hints_df.shape[0]} out of {self.hint_db.shape[0]}"
-            )
-        for i, row in hints_df.iterrows():
-            topic_to_hints[row["semantic_keys"]].append(i)
+            skip_hints = self.get_current_task_hints(task_name)
+        for _, row in self.hint_db.iterrows():
+            hint = row["hint"]
+            if hint in skip_hints:
+                continue
+            topic_to_hints[row["semantic_keys"]].append(hint)
+        logger.info(f"Collected {len(topic_to_hints)} hint topics")
         hint_topics = list(topic_to_hints.keys())
         topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)])
         prompt = self.llm_prompt.format(goal=goal, topics=topics)
+
         if isinstance(llm, ChatModel):
             response: str = llm(messages=[dict(role="user", content=prompt)])["content"]
         else:
             response: str = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])).think
         try:
-            hint_topic_idx = json.loads(response)
-            if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics):
+            topic_number = json.loads(response)
+            if topic_number < 0 or topic_number >= len(hint_topics):
                 logger.error(f"Wrong LLM hint id response: {response}, no hints")
                 return []
-            hint_topic = hint_topics[hint_topic_idx]
-            hint_indices = topic_to_hints[hint_topic]
-            df = hints_df.iloc[hint_indices].copy()
-            df = df.drop_duplicates(subset=["hint"], keep="first")  # leave only unique hints
-            hints = df["hint"].tolist()
-            logger.info(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}")
-        except json.JSONDecodeError:
-            logger.error(f"Failed to parse LLM hint id response: {response}, no hints")
+            hint_topic = hint_topics[topic_number]
+            hints = list(set(topic_to_hints[hint_topic]))
+            logger.info(f"LLM hint topic {topic_number}:'{hint_topic}', chosen hints: {hints}")
+        except Exception as e:
+            logger.exception(f"Failed to parse LLM hint id response: {response}:\n{e}")
             hints = []
         return hints
 
     def choose_hints_emb(self, goal: str, task_name: str) -> list[str]:
         """Choose hints using embeddings to filter the hints."""
-        goal_embeddings = self._encode([goal], prompt="task description")
-        hint_embeddings = self.hint_embeddings
-        hints_df = self.uniq_hints
-        if self.skip_hints_for_current_task:
-            current_task_hints = self.get_current_task_hints(task_name)
-            mask = ~hints_df["hint"].isin(current_task_hints)
-            hints_df = hints_df[mask]
-            filtered_indices = hints_df.index.tolist()
-            hint_embeddings = hint_embeddings[filtered_indices]
-            logger.info(
-                f"Filtered same task hint, remained: {len(hint_embeddings)} out of {len(self.hint_embeddings)} embeddings"
-            )
-        similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings.tolist())
-        top_indices = similarities.argsort()[0][-self.top_n :].tolist()
-        logger.info(f"Top hint indices based on embedding similarity: {top_indices}")
-        hints = hints_df.iloc[top_indices]
-        logger.info(f"Embedding-based hints chosen: {hints}")
-        return hints["hint"].tolist()
+        try:
+            goal_embeddings = self._encode([goal], prompt="task description")
+            hint_embeddings = self.hint_embeddings.copy()
+            all_hints = self.uniq_hints["hint"].tolist()
+            skip_hints = []
+            if self.skip_hints_for_current_task:
+                skip_hints = self.get_current_task_hints(task_name)
+            hint_embeddings = []
+            id_to_hint = {}
+            for hint, emb in zip(all_hints, self.hint_embeddings):
+                if hint in skip_hints:
+                    continue
+                hint_embeddings.append(emb.tolist())
+                id_to_hint[len(hint_embeddings) - 1] = hint
+            logger.info(f"Prepared hint embeddings for {len(hint_embeddings)} hints")
+            similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings)
+            top_indices = similarities.argsort()[0][-self.top_n :].tolist()
+            logger.info(f"Top hint indices based on embedding similarity: {top_indices}")
+            hints = [id_to_hint[idx] for idx in top_indices]
+            logger.info(f"Embedding-based hints chosen: {hints}")
+        except Exception as e:
+            logger.exception(f"Failed to choose hints using embeddings: {e}")
+            hints = []
+        return hints
 
     def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_retries: int = 5):
         """Call the encode API endpoint with timeout and retries"""
@@ -483,7 +487,11 @@ def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_ret
         raise ValueError("Failed to encode hints")
 
     def _similarity(
-        self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5
+        self,
+        texts1: list,
+        texts2: list,
+        timeout: int = 2,
+        max_retries: int = 5,
     ):
         """Call the similarity API endpoint with timeout and retries"""
         for attempt in range(max_retries):

From b9d09d4d8d2ee557a04b76c358496c21cc1657cd Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 2 Sep 2025 11:44:38 +0200
Subject: [PATCH 18/23] remove old reqs

---
 requirements.txt | 31 -------------------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index a2798f2e..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-black[jupyter]>=24.2.0
-blacken-docs
-pre-commit
-pytest==7.3.2
-flaky
-pytest-xdist
-pytest-playwright
-pydantic~=2.9
-dask
-distributed
-browsergym>=0.7.1
-joblib>=1.2.0
-openai>=1.7,<2
-langchain_community
-tiktoken
-tapeagents[converters]
-huggingface_hub
-contexttimer
-ipython
-pyyaml>=6
-pandas
-gradio>=5.5 # issue with DataFrame scrolling before 5.5
-gitpython # for the reproducibility script
-requests
-matplotlib
-ray[default]
-python-slugify
-pillow
-gymnasium>=0.27
-sentence-transformers>=5.0.0
-python-dotenv>=1.1.1
\ No newline at end of file

From 725e7a03750780263cb6ce0190ef252fc2e3d688 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 2 Sep 2025 11:45:54 +0200
Subject: [PATCH 19/23] remove recorder from that branch

---
 .../agents/human_trace_recorder/agent.py      | 302 ----------
 .../human_trace_recorder/codegen_agent.py     | 193 ------
 .../human_trace_recorder/event_listeners.py   | 563 ------------------
 3 files changed, 1058 deletions(-)
 delete mode 100644 src/agentlab/agents/human_trace_recorder/agent.py
 delete mode 100644 src/agentlab/agents/human_trace_recorder/codegen_agent.py
 delete mode 100644 src/agentlab/agents/human_trace_recorder/event_listeners.py

diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py
deleted file mode 100644
index 556922af..00000000
--- a/src/agentlab/agents/human_trace_recorder/agent.py
+++ /dev/null
@@ -1,302 +0,0 @@
-"""Human Trace Agent for Browser Automation Training Data
-
-Captures human interactions at each step including:
-  - Comprehensive action tracking (clicks, input, navigation, etc.)
-  - Saves only human_action.json files in simple numbered folders
-"""
-
-from __future__ import annotations
-
-import json
-import time
-from dataclasses import dataclass
-from pathlib import Path
-
-import bgym
-from playwright.sync_api import Page
-
-from agentlab.agents.agent_args import AgentArgs
-from agentlab.agents.human_trace_recorder.event_listeners import (
-    get_interaction_tracking_script,
-    get_recording_indicators_script,
-)
-from browsergym.core.observation import (
-    extract_dom_extra_properties,
-    extract_dom_snapshot,
-    extract_focused_element_bid,
-    extract_merged_axtree,
-    extract_screenshot,
-)
-from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
-
-
-@dataclass
-class HumanTraceAgentArgs(AgentArgs):
-    agent_name: str = "HumanTraceAgent"
-    trace_dir: str = "human_traces"
-    use_raw_page_output: bool = True
-
-    def make_agent(self) -> bgym.Agent:  # type: ignore[override]
-        return HumanTraceAgent(self.trace_dir)
-
-    def set_reproducibility_mode(self):
-        pass
-
-
-class HumanTraceAgent(bgym.Agent):
-    def __init__(self, trace_dir: str):
-        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
-        self._root = Path(trace_dir)
-        self._root.mkdir(parents=True, exist_ok=True)
-        self._page: Page | None = None
-        self._step = 0
-
-    def obs_preprocessor(self, obs: dict):  # type: ignore[override]
-        if isinstance(obs, dict):
-            self._page = obs.get("page")
-            # Remove the page object from obs to avoid pickle issues
-            if "page" in obs:
-                del obs["page"]
-
-            obs["screenshot"] = extract_screenshot(self._page)
-            obs["dom_object"] = extract_dom_snapshot(self._page)
-            obs["axtree_object"] = extract_merged_axtree(self._page)
-            scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0)
-            extra_properties = extract_dom_extra_properties(
-                obs["dom_object"], scale_factor=scale_factor
-            )
-            obs["extra_element_properties"] = extra_properties
-            obs["focused_element_bid"] = extract_focused_element_bid(self._page)
-
-            # Add text representations for easier analysis
-            if obs["axtree_object"]:
-                axt = obs["axtree_object"]
-                if extra_properties:
-                    obs["axtree_txt"] = flatten_axtree_to_str(axt)
-
-            if obs["dom_object"]:
-                obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"])
-                obs["pruned_html"] = prune_html(obs["dom_txt"])
-        return obs
-
-    def get_action(self, obs: dict):  # type: ignore[override]
-        if self._page is None:
-            raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")
-
-        page = self._page
-        step_dir = self._create_step_directory()
-        
-        self._display_recording_prompt()
-        self._show_recording_indicators(page)
-        
-        # Capture human interactions
-        captured_action, human_interactions = self._capture_interactions_with_js(page, step_dir)
-        
-        # Save and cleanup
-        self._save_human_action(captured_action, step_dir)
-        self._cleanup_indicators(page)
-        
-        self._step += 1
-        return "noop()", {
-            "extra_info": {
-                "step": self._step - 1,
-                "human_interactions": human_interactions,
-            }
-        }
-
-    def _create_step_directory(self) -> Path:
-        """Create directory for current step."""
-        step_dir = self._root / str(self._step)
-        step_dir.mkdir(parents=True, exist_ok=True)
-        return step_dir
-
-    def _display_recording_prompt(self):
-        """Display prompt messages to user."""
-        print(f"[HumanTrace] Step {self._step}: Perform ONE action")
-        print("[HumanTrace] ⚠️  WAIT FOR THE RED BORDER TO APPEAR BEFORE PERFORMING ANY ACTION ⚠️")
-        print("[HumanTrace] The system will automatically save after detecting your action")
-
-    def _show_recording_indicators(self, page: Page):
-        """Show visual recording indicators on the page."""
-        page.evaluate(get_recording_indicators_script())
-
-    def _save_human_action(self, captured_action: dict, step_dir: Path):
-        """Save the captured human action to JSON file."""
-        try:
-            human_action_path = step_dir / "human_action.json"
-            if captured_action and isinstance(captured_action, dict):
-                human_action_path.write_text(json.dumps(captured_action, indent=2))
-                action_type = captured_action.get("type", "unknown")
-            else:
-                # Create empty action record for consistency
-                empty_action = {
-                    "type": "no_action",
-                    "timestamp": time.time() * 1000,
-                    "reason": "No meaningful human action captured in this step",
-                }
-                human_action_path.write_text(json.dumps(empty_action, indent=2))
-                action_type = "no_action"
-
-            print(f"[HumanTrace] Step {self._step} complete - Action: {action_type}")
-
-        except Exception as e:
-            print(f"[HumanTrace] Warning: Failed to save human action: {e}")
-
-    def _cleanup_indicators(self, page: Page):
-        """Remove recording indicators from the page."""
-        page.evaluate("document.getElementById('__rec')?.remove(); document.getElementById('__rec_border')?.remove()")
-
-    def _capture_interactions_with_js(self, page: Page, step_dir: Path) -> tuple[dict, str]:
-        """Capture human interactions using JavaScript injection."""
-        try:
-            print("[HumanTrace] JavaScript interaction tracking enabled")
-            initial_url, initial_title = page.url, page.title()
-
-            # Inject interaction tracking
-            self._inject_interaction_tracking(page)
-            
-            # Wait for user action
-            self._wait_for_user_action(page)
-            
-            # Collect and process interaction data
-            return self._collect_interaction_data(page, initial_url, initial_title)
-
-        except Exception as e:
-            print(f"[HumanTrace] Error: {e}")
-            return {
-                "type": "error",
-                "timestamp": time.time() * 1000,
-                "error": str(e),
-            }, f"Error: {e}"
-
-    def _inject_interaction_tracking(self, page: Page):
-        """Inject JavaScript code for comprehensive interaction tracking."""
-        tracking_script = get_interaction_tracking_script()
-        page.evaluate(tracking_script)
-
-    def _wait_for_user_action(self, page: Page):
-        """Wait for user to perform an action."""
-        start_time = time.time()
-        while time.time() - start_time < 300:
-            try:
-                action_detected = page.evaluate("window.__acted || false")
-                if action_detected:
-                    print(f"[HumanTrace] Action detected! Exiting immediately...")
-                    break
-            except Exception as e:
-                print(f"[HumanTrace] Debug: Error checking actions: {e}")
-                pass
-            time.sleep(0.1)
-
-    def _collect_interaction_data(self, page: Page, initial_url: str, initial_title: str) -> tuple[dict, str]:
-        """Collect and format interaction data."""
-        try:
-            action_detected = page.evaluate("window.__acted || false")
-            interactions = page.evaluate("window.__interactions || []")
-            
-            action_data = {
-                "type": "human_interactions" if action_detected else "no_action",
-                "timestamp": time.time() * 1000,
-                "detected": action_detected,
-                "interactions": interactions,
-                "interaction_count": len(interactions)
-            }
-            
-            summary = self._create_interaction_summary(interactions)
-            self._add_page_change_info(action_data, initial_url, initial_title, page)
-            
-            print(f"[HumanTrace] {summary}")
-            return action_data, summary
-            
-        except Exception as e:
-            return {
-                "type": "error",
-                "timestamp": time.time() * 1000,
-                "detected": False,
-                "error": str(e),
-                "interactions": [],
-                "interaction_count": 0
-            }, f"Error collecting interactions: {e}"
-
-    def _create_interaction_summary(self, interactions: list) -> str:
-        """Create a summary string of captured interactions."""
-        if interactions:
-            interaction_types = {}
-            for interaction in interactions:
-                itype = interaction.get('type', 'unknown')
-                interaction_types[itype] = interaction_types.get(itype, 0) + 1
-            
-            summary_parts = []
-            for itype, count in interaction_types.items():
-                summary_parts.append(f"{itype}:{count}")
-            return f"Captured {len(interactions)} interactions: {', '.join(summary_parts)}"
-        else:
-            return "No interactions detected"
-
-    def _add_page_change_info(self, action_data: dict, initial_url: str, initial_title: str, page: Page):
-        """Add page change information to action data."""
-        final_url, final_title = page.url, page.title()
-        if initial_url != final_url or initial_title != final_title:
-            action_data["page_changed"] = True
-            action_data["url_change"] = {"from": initial_url, "to": final_url}
-            action_data["title_change"] = {"from": initial_title, "to": final_title}
-
-    def _format_js_interaction_summary(self, action_data, interaction_log):
-        """Format JavaScript-captured interactions into readable summary."""
-        lines = ["Human Interactions (JavaScript Tracking):"]
-
-        if action_data["interactions"]:
-            lines.append(f"Total Actions: {len(action_data['interactions'])}")
-            lines.append("")
-
-            # Group interactions by type
-            by_type = {}
-            for interaction in action_data["interactions"]:
-                interaction_type = interaction["type"]
-                if interaction_type not in by_type:
-                    by_type[interaction_type] = []
-                by_type[interaction_type].append(interaction)
-
-            # Show summary by type
-            for interaction_type, interactions in by_type.items():
-                lines.append(f"{interaction_type.title()}: {len(interactions)} actions")
-
-            lines.append("")
-            lines.append("Detailed Actions:")
-
-            # Add each interaction from the log
-            for log_entry in interaction_log:
-                lines.append(f"  {log_entry}")
-        else:
-            lines.append("No interactions detected - user may have just observed the page")
-
-        # Add page state changes if URL changed
-        if action_data.get("page_changed"):
-            url_info = action_data.get("url")
-            if url_info:
-                lines.append("")
-                lines.append("� Page Navigation:")
-                lines.append(f"  From: {url_info['from']}")
-                lines.append(f"  To: {url_info['to']}")
-
-        return "\n".join(lines)
-
-
-HUMAN_TRACE_AGENT = HumanTraceAgentArgs()
-
-
-if __name__ == "__main__":
-    from agentlab.agents.human_trace_recorder.agent import HUMAN_TRACE_AGENT
-    from agentlab.experiments.study import Study
-
-    agent_configs = [HUMAN_TRACE_AGENT]
-    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"](n_repeats=1)  # type: bgym.Benchmark
-    benchmark = benchmark.subset_from_glob("task_name", "*filter*")
-    benchmark.env_args_list = benchmark.env_args_list[:1]
-    for env_args in benchmark.env_args_list:
-        print(env_args.task_name)
-        env_args.max_steps = 15
-        env_args.headless = False
-
-    study = Study(agent_configs, benchmark)
-    study.run(n_jobs=1, parallel_backend="sequential")
diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py
deleted file mode 100644
index cd902bd2..00000000
--- a/src/agentlab/agents/human_trace_recorder/codegen_agent.py
+++ /dev/null
@@ -1,193 +0,0 @@
-"""Simple Codegen Agent
-
-Captures human interactions using playwright inspector.
-Playwright trace logs are stored in "think" messages and can be viewed in Agentlab Xray.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import tempfile
-import zipfile
-from dataclasses import dataclass
-from pathlib import Path
-
-import bgym
-from agentlab.agents.agent_args import AgentArgs
-from browsergym.core.observation import (
-    extract_dom_extra_properties,
-    extract_dom_snapshot,
-    extract_focused_element_bid,
-    extract_merged_axtree,
-    extract_screenshot,
-)
-from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
-from dotenv import load_dotenv
-from playwright.sync_api import Page
-
-load_dotenv()
-
-def extract_log_message_from_pw_trace(pw_trace_file_path):
-    zip_file = zipfile.ZipFile(pw_trace_file_path, "r")
-    trace_lines = zip_file.read("trace.trace").decode("utf-8").splitlines()
-
-    actions = []
-    for line in trace_lines:
-        if line.strip():
-            event = json.loads(line)
-            if event.get("type") == "log":
-                actions.append(event)
-    # Extract log messages from the trace
-    return [log["message"].strip() for log in sorted(actions, key=lambda x: x.get("time", 0))]
-
-
-def clean_pw_logs(logs, exclude_blacklist=True, use_substitutions=True):
-    clean_logs = list(logs)
-    blacklist = {
-        "attempting click action",
-        "waiting for element to be visible, enabled and stable",
-        "element is visible, enabled and stable",
-        "scrolling into view if needed",
-        "done scrolling",
-        "performing click action",
-        "click action done",
-        "waiting for scheduled navigations to finish",
-        "navigations have finished",
-    }
-
-    substitutions = [("waiting for ", "")]
-
-    def apply_substitutions(log):
-        for old, new in substitutions:
-            log = log.replace(old, new)
-        return log
-
-    if exclude_blacklist:
-        clean_logs = [log for log in clean_logs if log not in blacklist]
-    if use_substitutions:
-        clean_logs = [apply_substitutions(log) for log in clean_logs]
-
-    return clean_logs
-
-
-@dataclass
-class PlayWrightCodeGenAgentArgs(AgentArgs):
-    agent_name: str = "PlayWrightCodeGenAgent"
-    trace_dir: str = "playwright_codegen_traces"
-    use_raw_page_output: bool = True
-    store_raw_trace: bool = False
-
-    def make_agent(self) -> bgym.Agent:  # type: ignore[override]
-        return PlayWrightCodeGenAgent(self.trace_dir, self.store_raw_trace)
-
-    def set_reproducibility_mode(self):
-        pass
-
-
-class PlayWrightCodeGenAgent(bgym.Agent):
-    def __init__(self, trace_dir: str, store_raw_trace: bool):
-        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
-        self._root = Path(trace_dir)
-        self._page: Page | None = None
-        self._step = 0
-        self.store_raw_trace = store_raw_trace
-        self._episode_trace_dir = None  # Cache for single episode
-
-    def _get_trace_dir(self):
-        """Return the trace directory based on store_raw_trace setting."""
-        if self._episode_trace_dir is None:
-            if self.store_raw_trace:
-                import datetime
-
-                dt_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-                self._episode_trace_dir = self._root / f"codegen_traces_{dt_str}"
-                self._episode_trace_dir.mkdir(parents=True, exist_ok=True)
-            else:
-                self._episode_trace_dir = Path(tempfile.mkdtemp())
-        return self._episode_trace_dir
-
-    def obs_preprocessor(self, obs: dict):  # type: ignore[override]
-        if isinstance(obs, dict):
-            self._page = obs.get("page")
-            obs["screenshot"] = extract_screenshot(self._page)
-            obs["dom_object"] = extract_dom_snapshot(self._page)
-            obs["axtree_object"] = extract_merged_axtree(self._page)
-            scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0)
-            extra_properties = extract_dom_extra_properties(
-                obs["dom_object"], scale_factor=scale_factor
-            )
-            obs["extra_element_properties"] = extra_properties
-            obs["focused_element_bid"] = extract_focused_element_bid(self._page)
-
-            if obs["axtree_object"]:
-                obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"])
-
-            if obs["dom_object"]:
-                obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"])
-                obs["pruned_html"] = prune_html(obs["dom_txt"])
-
-        if "page" in obs:  # unpickable
-            del obs["page"]
-
-        return obs
-
-    def get_action(self, obs: dict):  # type: ignore[override]
-
-        if self._page is None:
-            raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")
-
-        page = self._page
-        trace_dir = self._get_trace_dir()
-        trace_path = trace_dir / f"step_{self._step}.zip"
-        page.context.tracing.start(screenshots=True, snapshots=True, sources=True)
-        page.context.tracing.start_chunk(name=f"step_{self._step}")
-
-        print(
-            f"{'─'*60}\n" f"Step {self._step}\n",
-            f"{'─'*60}\n",
-            "1. 🔴 Start Recording (Press 'Record' in the Playwright Inspector.)\n",
-            "2. ✨ Perform actions for a single step.\n",
-            "3. ⚫ Stop Recording (Press 'Record' again to stop recording.)\n",
-            "4. ▶️  Press 'Resume' in the Playwright Inspector.",
-        )
-
-        page.pause()  # Launch Inspector and record actions
-        page.context.tracing.stop_chunk(path=trace_path)
-        page.context.tracing.stop()
-
-        pw_logs = extract_log_message_from_pw_trace(trace_path)
-        pw_logs = clean_pw_logs(pw_logs, exclude_blacklist=True)
-        pw_logs_str = "\n".join([f"{i}. {log}" for i, log in enumerate(pw_logs, 1)])
-
-        print(f"\n Playwright logs for step {self._step}:\n{pw_logs_str}")
-
-        self._step += 1
-
-        agent_info = bgym.AgentInfo(
-            think=pw_logs_str,
-            chat_messages=[],
-            stats={},
-        )
-
-        return "noop()", agent_info
-
-
-PW_CODEGEN_AGENT = PlayWrightCodeGenAgentArgs(store_raw_trace=True)
-
-
-if __name__ == "__main__":
-    from agentlab.agents.human_trace_recorder.codegen_agent import PW_CODEGEN_AGENT
-    from agentlab.experiments.study import Study
-
-    agent_configs = [PW_CODEGEN_AGENT]
-    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]()  # type: bgym.Benchmark
-    benchmark = benchmark.subset_from_glob("task_name", "*create*")
-    benchmark.env_args_list = benchmark.env_args_list[:1]
-    for env_args in benchmark.env_args_list:
-        print(env_args.task_name)
-        env_args.max_steps = 15
-        env_args.headless = False
-
-    study = Study(agent_configs, benchmark, logging_level_stdout=logging.INFO)
-    study.run(n_jobs=1, parallel_backend="sequential", n_relaunch=1)
diff --git a/src/agentlab/agents/human_trace_recorder/event_listeners.py b/src/agentlab/agents/human_trace_recorder/event_listeners.py
deleted file mode 100644
index 2fd8453c..00000000
--- a/src/agentlab/agents/human_trace_recorder/event_listeners.py
+++ /dev/null
@@ -1,563 +0,0 @@
-"""JavaScript Event Listeners for Human Trace Capture
-
-This module contains all the JavaScript code for capturing comprehensive
-browser interactions including mouse, keyboard, form, scroll, and focus events.
-"""
-
-
-def get_interaction_tracking_script() -> str:
-    """Get the complete JavaScript code for interaction tracking."""
-    return (
-        """
-        window.__acted = false;
-        window.__interactions = [];
-        
-        // Debug mode - set to true to see all events in console
-        window.__debug_events = false; 
-        
-        function captureInteraction(type, event, extra = {}) {
-            // Skip our own recording indicators
-            if (event.target.id === '__rec' || event.target.id === '__rec_border' || 
-                event.target.closest('#__rec') || event.target.closest('#__rec_border')) {
-                return;
-            }
-            
-            const interaction = {
-                type: type,
-                timestamp: Date.now(),
-                coords: {
-                    x: event.clientX || 0,
-                    y: event.clientY || 0
-                },
-                target: {
-                    tagName: event.target.tagName,
-                    id: event.target.id || null,
-                    className: event.target.className || null,
-                    text: event.target.textContent?.slice(0, 50) || null,
-                    bid: event.target.getAttribute('bid') || null
-                },
-                ...extra
-            };
-            
-            window.__interactions.push(interaction);
-            window.__acted = true;
-            
-            // Debug logging
-            if (window.__debug_events) {
-                console.log(`🎯 Captured: ${type}`, interaction);
-            }
-            
-            // Update indicators immediately
-            const indicator = document.getElementById('__rec');
-            const border = document.getElementById('__rec_border');
-            if (indicator) {
-                indicator.innerHTML = '✅ ACTION DETECTED - SAVING...';
-                indicator.style.background = '#28a745';
-                indicator.style.animation = 'none';
-            }
-            if (border) {
-                border.style.border = '8px solid #28a745';
-                border.style.animation = 'none';
-            }
-        }
-        
-        // Debug function - add this temporarily to see what events fire
-        if (window.__debug_events) {
-            ['input', 'change', 'select', 'focus', 'click', 'keydown', 'paste', 'cut', 'copy'].forEach(eventType => {
-                document.addEventListener(eventType, (e) => {
-                    console.log(`🔍 DEBUG: ${eventType} on`, e.target.tagName, e.target.type, e.target);
-                }, true);
-            });
-        }
-        
-        """
-        + get_mouse_event_listeners()
-        + """
-        """
-        + get_keyboard_event_listeners()
-        + """
-        """
-        + get_form_event_listeners()
-        + """
-        """
-        + get_scroll_event_listeners()
-        + """
-        """
-        + get_focus_event_listeners()
-        + """
-        
-        console.log('Comprehensive interaction tracking initialized');
-    """
-    )
-
-
-def get_mouse_event_listeners() -> str:
-    """Get JavaScript code for mouse event listeners."""
-    return """
-        // Mouse events with comprehensive button tracking and performance optimizations
-        let lastClickTime = 0;
-        
-        document.addEventListener('click', (e) => {
-            const now = Date.now();
-            // Prevent spam clicking from creating too many events (minimum 50ms between clicks)
-            if (now - lastClickTime < 50) return;
-            lastClickTime = now;
-            
-            captureInteraction('click', e, {
-                button: e.button, // 0=left, 1=middle, 2=right
-                buttons: e.buttons, // bitmask of pressed buttons
-                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown',
-                detail: e.detail, // click count (single, double, etc.)
-                clickType: e.detail === 1 ? 'single' : e.detail === 2 ? 'double' : `${e.detail}x`
-            });
-        }, true);
-        
-        document.addEventListener('dblclick', (e) => {
-            captureInteraction('dblclick', e, {
-                button: e.button,
-                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
-            });
-        }, true);
-        
-        document.addEventListener('mousedown', (e) => {
-            captureInteraction('mousedown', e, {
-                button: e.button,
-                buttons: e.buttons,
-                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
-            });
-        }, true);
-        
-        document.addEventListener('mouseup', (e) => {
-            captureInteraction('mouseup', e, {
-                button: e.button,
-                buttons: e.buttons,
-                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
-            });
-        }, true);
-        
-        // Context menu (right-click menu)
-        document.addEventListener('contextmenu', (e) => {
-            captureInteraction('contextmenu', e, {
-                button: e.button,
-                buttonName: 'right'
-            });
-        }, true);
-        
-        // Middle mouse button events (often used for scrolling/opening in new tab)
-        document.addEventListener('auxclick', (e) => {
-            captureInteraction('auxclick', e, {
-                button: e.button,
-                buttonName: e.button === 1 ? 'middle' : (e.button === 2 ? 'right' : 'other'),
-                detail: e.detail
-            });
-        }, true);
-        
-        // Enhanced drag tracking (without redundant mousedown)
-        let isDragging = false;
-        let dragStart = null;
-        let dragButton = null;
-        let hasDraggedSignificantly = false;
-        
-        document.addEventListener('mousedown', (e) => {
-            isDragging = true;
-            dragButton = e.button;
-            hasDraggedSignificantly = false;
-            dragStart = {
-                x: e.clientX, 
-                y: e.clientY, 
-                time: Date.now(),
-                button: e.button,
-                buttonName: ['left', 'middle', 'right'][e.button] || 'unknown'
-            };
-        }, true);
-        
-        document.addEventListener('mousemove', (e) => {
-            if (isDragging && dragStart) {
-                const distance = Math.sqrt(
-                    Math.pow(e.clientX - dragStart.x, 2) + 
-                    Math.pow(e.clientY - dragStart.y, 2)
-                );
-                if (distance > 5 && !hasDraggedSignificantly) { 
-                    // Only capture the start of a significant drag, not every movement
-                    hasDraggedSignificantly = true;
-                    captureInteraction('drag_start', e, {
-                        startX: dragStart.x,
-                        startY: dragStart.y,
-                        endX: e.clientX,
-                        endY: e.clientY,
-                        distance: distance,
-                        button: dragButton,
-                        buttonName: dragStart.buttonName,
-                        duration: Date.now() - dragStart.time
-                    });
-                }
-            }
-            // Note: Removed general mousemove tracking to reduce noise
-        }, true);
-        
-        document.addEventListener('mouseup', (e) => {
-            if (isDragging && dragStart && hasDraggedSignificantly) {
-                const distance = Math.sqrt(
-                    Math.pow(e.clientX - dragStart.x, 2) + 
-                    Math.pow(e.clientY - dragStart.y, 2)
-                );
-                captureInteraction('drag_end', e, {
-                    startX: dragStart.x,
-                    startY: dragStart.y,
-                    endX: e.clientX,
-                    endY: e.clientY,
-                    distance: distance,
-                    duration: Date.now() - dragStart.time,
-                    button: dragButton,
-                    buttonName: dragStart.buttonName,
-                    totalDistance: distance
-                });
-            }
-            isDragging = false;
-            dragStart = null;
-            dragButton = null;
-            hasDraggedSignificantly = false;
-        }, true);
-        
-        // Drag and drop events
-        document.addEventListener('dragstart', (e) => {
-            captureInteraction('dragstart', e, {
-                dataTransfer: {
-                    effectAllowed: e.dataTransfer.effectAllowed,
-                    types: Array.from(e.dataTransfer.types)
-                }
-            });
-        }, true);
-        
-        document.addEventListener('dragend', (e) => {
-            captureInteraction('dragend', e, {
-                dataTransfer: {
-                    dropEffect: e.dataTransfer.dropEffect
-                }
-            });
-        }, true);
-        
-        document.addEventListener('drop', (e) => {
-            captureInteraction('drop', e, {
-                dataTransfer: {
-                    dropEffect: e.dataTransfer.dropEffect,
-                    types: Array.from(e.dataTransfer.types)
-                },
-                files: e.dataTransfer.files.length > 0 ? Array.from(e.dataTransfer.files).map(f => ({
-                    name: f.name,
-                    type: f.type,
-                    size: f.size
-                })) : null
-            });
-        }, true);
-    """
-
-
-def get_keyboard_event_listeners() -> str:
-    """Get JavaScript code for keyboard event listeners."""
-    return """
-        // Keyboard events with shortcut detection
-        document.addEventListener('keydown', (e) => {
-            let shortcut = null;
-            if (e.ctrlKey || e.metaKey) {
-                const modifier = e.ctrlKey ? 'Ctrl' : 'Cmd';
-                const key = e.key.length === 1 ? e.key.toUpperCase() : e.key;
-                shortcut = `${modifier}+${key}`;
-            } else if (e.altKey && e.key.length === 1) {
-                shortcut = `Alt+${e.key.toUpperCase()}`;
-            } else if (e.shiftKey && e.key.length === 1) {
-                shortcut = `Shift+${e.key.toUpperCase()}`;
-            }
-            
-            captureInteraction('keydown', e, {
-                key: e.key,
-                code: e.code,
-                ctrlKey: e.ctrlKey,
-                shiftKey: e.shiftKey,
-                altKey: e.altKey,
-                metaKey: e.metaKey,
-                shortcut: shortcut
-            });
-        }, true);
-        
-        document.addEventListener('keyup', (e) => {
-            captureInteraction('keyup', e, {
-                key: e.key,
-                code: e.code
-            });
-        }, true);
-    """
-
-
-def get_form_event_listeners() -> str:
-    """Get JavaScript code for form event listeners."""
-    return """
-        // Input events with throttling to prevent spam during fast typing
-        let inputTimeout;
-        let lastInputValue = '';
-        
-        document.addEventListener('input', (e) => {
-            if (['INPUT', 'TEXTAREA'].includes(e.target.tagName) || e.target.contentEditable === 'true') {
-                clearTimeout(inputTimeout);
-                inputTimeout = setTimeout(() => {
-                    const currentValue = e.target.value || e.target.textContent;
-                    // Only capture if value actually changed significantly
-                    if (currentValue !== lastInputValue) {
-                        lastInputValue = currentValue;
-                        captureInteraction('input', e, {
-                            value: currentValue,
-                            inputType: e.inputType || null,
-                            valueLength: currentValue.length
-                        });
-                    }
-                }, 50); // Reduced from 300ms to 50ms for better responsiveness
-            }
-        }, true);
-        
-        // Immediate input capture (without throttling) for certain cases
-        document.addEventListener('input', (e) => {
-            // Immediate capture for dropdown/select-like inputs or when selection changes
-            if (e.target.tagName === 'SELECT' || 
-                e.inputType === 'deleteContentBackward' || 
-                e.inputType === 'insertFromPaste' ||
-                e.inputType === 'insertFromDrop') {
-                captureInteraction('input_immediate', e, {
-                    value: e.target.value || e.target.textContent,
-                    inputType: e.inputType || null,
-                    immediate: true
-                });
-            }
-        }, true);
-        
-        // Text selection events
-        document.addEventListener('select', (e) => {
-            if (['INPUT', 'TEXTAREA'].includes(e.target.tagName)) {
-                const selectedText = e.target.value.substring(e.target.selectionStart, e.target.selectionEnd);
-                captureInteraction('select', e, {
-                    selectedText: selectedText,
-                    selectionStart: e.target.selectionStart,
-                    selectionEnd: e.target.selectionEnd,
-                    value: e.target.value,
-                    selectionLength: selectedText.length
-                });
-            }
-        }, true);
-        
-        // Clipboard events
-        document.addEventListener('cut', (e) => {
-            captureInteraction('cut', e, {
-                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
-                targetValue: e.target.value || e.target.textContent
-            });
-        }, true);
-        
-        document.addEventListener('copy', (e) => {
-            captureInteraction('copy', e, {
-                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
-                targetValue: e.target.value || e.target.textContent
-            });
-        }, true);
-        
-        document.addEventListener('paste', (e) => {
-            captureInteraction('paste', e, {
-                clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null,
-                targetValue: e.target.value || e.target.textContent
-            });
-        }, true);
-        
-        // Enhanced form change events with better dropdown handling
-        document.addEventListener('change', (e) => {
-            let extra = {};
-            if (e.target.tagName === 'SELECT') {
-                const option = e.target.options[e.target.selectedIndex];
-                extra = {
-                    selectedValue: e.target.value,
-                    selectedText: option?.text || '',
-                    selectedIndex: e.target.selectedIndex,
-                    allOptions: Array.from(e.target.options).map(opt => ({
-                        value: opt.value,
-                        text: opt.text,
-                        selected: opt.selected
-                    })),
-                    optionsCount: e.target.options.length
-                };
-            } else if (['checkbox', 'radio'].includes(e.target.type)) {
-                extra = {
-                    checked: e.target.checked,
-                    value: e.target.value,
-                    name: e.target.name
-                };
-            } else {
-                extra = {
-                    value: e.target.value,
-                    previousValue: e.target.defaultValue, // Capture what it was before
-                    inputType: e.target.type
-                };
-            }
-            captureInteraction('change', e, extra);
-        }, true);
-        
-        document.addEventListener('submit', (e) => {
-            captureInteraction('submit', e, {
-                formAction: e.target.action || null,
-                formMethod: e.target.method || 'GET',
-                formElements: Array.from(e.target.elements).length
-            });
-        }, true);
-        
-        // Additional events for better field interaction capture
-        
-        // Option selection in datalists
-        document.addEventListener('input', (e) => {
-            if (e.target.list) { // Has datalist
-                captureInteraction('datalist_input', e, {
-                    value: e.target.value,
-                    listId: e.target.list.id,
-                    optionsCount: e.target.list.options.length
-                });
-            }
-        }, true);
-        
-        // File input changes
-        document.addEventListener('change', (e) => {
-            if (e.target.type === 'file') {
-                captureInteraction('file_select', e, {
-                    filesCount: e.target.files.length,
-                    files: Array.from(e.target.files).map(file => ({
-                        name: file.name,
-                        type: file.type,
-                        size: file.size,
-                        lastModified: file.lastModified
-                    }))
-                });
-            }
-        }, true);
-    """
-
-
-def get_scroll_event_listeners() -> str:
-    """Get JavaScript code for scroll event listeners."""
-    return """
-        // Scroll events with debouncing to reduce noise
-        let scrollTimeout;
-        let lastScrollTime = 0;
-        
-        document.addEventListener('scroll', (e) => {
-            clearTimeout(scrollTimeout);
-            scrollTimeout = setTimeout(() => {
-                const now = Date.now();
-                // Only capture scroll if it's been at least 200ms since last scroll capture
-                if (now - lastScrollTime > 200) {
-                    lastScrollTime = now;
-                    captureInteraction('scroll', e, {
-                        scrollX: window.scrollX,
-                        scrollY: window.scrollY,
-                        scrollLeft: e.target.scrollLeft || 0,
-                        scrollTop: e.target.scrollTop || 0
-                    });
-                }
-            }, 150); // Increased debounce time
-        }, true);
-        
-        // Wheel events (for detailed scroll tracking) with throttling
-        let lastWheelTime = 0;
-        document.addEventListener('wheel', (e) => {
-            const now = Date.now();
-            // Only capture wheel events every 100ms to reduce noise
-            if (now - lastWheelTime > 100) {
-                lastWheelTime = now;
-                captureInteraction('wheel', e, {
-                    deltaX: e.deltaX,
-                    deltaY: e.deltaY,
-                    deltaZ: e.deltaZ,
-                    deltaMode: e.deltaMode
-                });
-            }
-        }, true);
-    """
-
-
-def get_focus_event_listeners() -> str:
-    """Get JavaScript code for focus event listeners."""
-    return """
-        // Focus events - only for interactive elements to reduce noise
-        document.addEventListener('focus', (e) => {
-            // Only capture focus on interactive elements
-            const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A'];
-            if (interactiveElements.includes(e.target.tagName) || 
-                e.target.contentEditable === 'true' || 
-                e.target.tabIndex >= 0) {
-                captureInteraction('focus', e);
-            }
-        }, true);
-        
-        document.addEventListener('blur', (e) => {
-            // Only capture blur on interactive elements
-            const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A'];
-            if (interactiveElements.includes(e.target.tagName) || 
-                e.target.contentEditable === 'true' || 
-                e.target.tabIndex >= 0) {
-                captureInteraction('blur', e);
-            }
-        }, true);
-    """
-
-
-def get_recording_indicators_script() -> str:
-    """Get JavaScript code for recording indicators."""
-    return """
-        // Remove any existing indicators
-        const existingBorder = document.getElementById('__rec_border');
-        if (existingBorder) existingBorder.remove();
-        const existingIndicator = document.getElementById('__rec');
-        if (existingIndicator) existingIndicator.remove();
-        
-        // Create border overlay
-        const border = document.createElement('div');
-        border.id = '__rec_border';
-        border.style.cssText = `
-            position: fixed;
-            top: 0;
-            left: 0;
-            width: 100vw;
-            height: 100vh;
-            border: 8px solid #ff0000;
-            box-sizing: border-box;
-            pointer-events: none;
-            z-index: 999999;
-            animation: pulse 1.5s infinite;
-        `;
-        
-        // Create status indicator
-        const indicator = document.createElement('div');
-        indicator.id = '__rec';
-        indicator.innerHTML = '🔴 RECORDING - Perform your action now';
-        indicator.style.cssText = `
-            position: fixed;
-            top: 10px;
-            left: 50%;
-            transform: translateX(-50%);
-            background: #ff0000;
-            color: #fff;
-            padding: 12px 20px;
-            border-radius: 8px;
-            font: bold 10px -apple-system, BlinkMacSystemFont, sans-serif;
-            z-index: 9999999;
-            box-shadow: 0 4px 12px rgba(255,0,0,0.4);
-            animation: pulse 1.5s infinite;
-        `;
-        
-        // Add pulsing animation
-        const style = document.createElement('style');
-        style.textContent = `
-            @keyframes pulse {
-                0% { opacity: 1; }
-                50% { opacity: 0.4; }
-                100% { opacity: 0.8; }
-            }
-        `;
-        document.head.appendChild(style);
-        
-        document.body.appendChild(border);
-        document.body.appendChild(indicator);
-    """

From e93fde52dbd3f97e4072a2bd624b115365cb3b17 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 2 Sep 2025 15:58:38 +0000
Subject: [PATCH 20/23] log task errors

---
 src/agentlab/experiments/graph_execution_ray.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index f047f866..f7aad780 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -3,9 +3,8 @@
 
 import bgym
 import ray
-from ray.util import state
-
 from agentlab.experiments.exp_utils import _episode_timeout, run_exp
+from ray.util import state
 
 logger = logging.getLogger(__name__)
 
@@ -79,6 +78,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
                 try:
                     result = ray.get(task)
                 except Exception as e:
+                    logger.exception(f"Task failed: {e}")
                     result = e
                 results.append(result)
 

From 5604ac36c861128b296a9b894497297d1e749146 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Tue, 2 Sep 2025 15:59:11 +0000
Subject: [PATCH 21/23] expose agentlabxray

---
 src/agentlab/analyze/agent_xray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 8accbfd6..b60c0dcb 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -539,7 +539,7 @@ def run_gradio(results_dir: Path):
     port = os.getenv("AGENTXRAY_APP_PORT", None)
     if isinstance(port, str):
         port = int(port)
-    demo.launch(server_port=port, share=do_share)
+    demo.launch(server_name="0.0.0.0", server_port=port, share=do_share)
 
 
 def handle_key_event(key_event, step_id: StepId):

From 0e68bcab654ffe052a4445aafb338f74ae7400a2 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Wed, 3 Sep 2025 09:39:32 +0000
Subject: [PATCH 22/23] remove commented old chunk

---
 .../agents/generic_agent_hinter/generic_agent_prompt.py        | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
index b684b6c9..983c9d48 100644
--- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
+++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -378,9 +378,6 @@ def get_hints_for_task(self, task_name: str) -> str:
             return ""
 
         try:
-            # task_hints = self.hint_db[
-            #     self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
-            # ]
             task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal)
 
             hints = []

From e4cad16a9dd83401945624623ed4871dc30cc5dd Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko <oleh.shliazhko@servicenow.com>
Date: Wed, 3 Sep 2025 09:43:19 +0000
Subject: [PATCH 23/23] share xray only when env flag present

---
 src/agentlab/analyze/agent_xray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index b60c0dcb..fed78b3e 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -537,9 +537,10 @@ def run_gradio(results_dir: Path):
 
     do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true"
     port = os.getenv("AGENTXRAY_APP_PORT", None)
+    server_name = "0.0.0.0" if os.getenv("AGENTXRAY_PUBLIC", "false") == "true" else "127.0.0.1"
     if isinstance(port, str):
         port = int(port)
-    demo.launch(server_name="0.0.0.0", server_port=port, share=do_share)
+    demo.launch(server_name=server_name, server_port=port, share=do_share)
 
 
 def handle_key_event(key_event, step_id: StepId):