From 7af2d152adeed3d48c7a074e175125823e76f203 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 13 Aug 2025 12:16:50 +0200 Subject: [PATCH 01/23] fixes --- .../agents/tool_use_agent/tool_use_agent.py | 139 ++++++++++++++++-- src/agentlab/analyze/agent_xray.py | 2 +- src/agentlab/llm/tracking.py | 12 +- 3 files changed, 137 insertions(+), 16 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 6ac61180..b1407a87 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -1,10 +1,12 @@ import fnmatch import json +import logging from abc import ABC, abstractmethod +from collections import defaultdict from copy import copy from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Any +from typing import Any, Literal import bgym import pandas as pd @@ -16,6 +18,7 @@ overlay_som, prune_html, ) +from sentence_transformers import SentenceTransformer from agentlab.agents.agent_args import AgentArgs from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark @@ -34,6 +37,8 @@ ) from agentlab.llm.tracking import cost_tracker_decorator +logger = logging.getLogger(__name__) + @dataclass class Block(ABC): @@ -298,22 +303,45 @@ def apply_init(self, llm, discussion: StructuredDiscussion) -> dict: class TaskHint(Block): use_task_hint: bool = True hint_db_rel_path: str = "hint_db.csv" + hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct" + top_n: int = 4 # Number of top hints to return when using embedding retrieval + embedder_model: str = "Qwen/Qwen3-Embedding-0.6B" # Model for embedding hints + llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n +You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n +Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1.""" def _init(self): """Initialize the block.""" - hint_db_path = Path(__file__).parent / self.hint_db_rel_path + if Path(self.hint_db_rel_path).is_absolute(): + hint_db_path = Path(self.hint_db_rel_path) + else: + hint_db_path = Path(__file__).parent / self.hint_db_rel_path self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str) + if self.hint_retrieval_mode == "emb": + logger.info("Load sentence transformer model for hint embeddings.") + self.emb_model = SentenceTransformer( + "Qwen/Qwen3-Embedding-0.6B", model_kwargs={"torch_dtype": "bfloat16"} + ) + self.encode_hints() + + def encode_hints(self): + self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first") + logger.info( + f"Encoding {len(self.uniq_hints)} unique hints using {self.embedder_model} model." + ) + self.hint_embeddings = self.emb_model.encode( + self.uniq_hints["hint"].tolist(), prompt="task hint" + ) def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: if not self.use_task_hint: - return + return {} - task_hints = self.hint_db[ - self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) - ] + goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content]) + task_hints = self.choose_hints(llm, task_name, goal) hints = [] - for hint in task_hints["hint"]: + for hint in task_hints: hint = hint.strip() if hint: hints.append(f"- {hint}") @@ -327,6 +355,58 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: discussion.append(msg) + def choose_hints(self, llm, task_name: str, goal: str) -> list[str]: + """Choose hints based on the task name.""" + if self.hint_retrieval_mode == "llm": + return self.choose_hints_llm(llm, goal) + elif self.hint_retrieval_mode == "direct": + return self.choose_hints_direct(task_name) + elif self.hint_retrieval_mode == "emb": + return self.choose_hints_emb(goal) + else: + raise ValueError(f"Unknown hint retrieval mode: {self.hint_retrieval_mode}") + + def choose_hints_llm(self, llm, goal: str) -> list[str]: + """Choose hints using LLM to filter the hints.""" + topic_to_hints = defaultdict(list) + for i, row in self.hint_db.iterrows(): + topic_to_hints[row["semantic_keys"]].append(i) + hint_topics = list(topic_to_hints.keys()) + topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)]) + prompt = self.llm_prompt.format(goal=goal, topics=topics) + response = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])) + try: + hint_topic_idx = json.loads(response.think) + if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics): + logger.error(f"Wrong LLM hint id response: {response.think}, no hints") + return [] + hint_topic = hint_topics[hint_topic_idx] + hint_indices = topic_to_hints[hint_topic] + df = self.hint_db.iloc[hint_indices].copy() + df = df.drop_duplicates(subset=["hint"], keep="first") # leave only unique hints + hints = df["hint"].tolist() + logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}") + except json.JSONDecodeError: + logger.error(f"Failed to parse LLM hint id response: {response.think}, no hints") + hints = [] + return hints + + def choose_hints_emb(self, goal: str) -> list[str]: + """Choose hints using embeddings to filter the hints.""" + goal_embeddings = self.emb_model.encode([goal], prompt="task description") + similarities = self.emb_model.similarity(goal_embeddings, self.hint_embeddings) + top_indices = similarities.argsort()[0][-self.top_n :].tolist() + logger.info(f"Top hint indices based on embedding similarity: {top_indices}") + hints = self.uniq_hints.iloc[top_indices] + logger.info(f"Embedding-based hints chosen: {hints}") + return hints["hint"].tolist() + + def choose_hints_direct(self, task_name: str) -> list[str]: + hints = self.hint_db[ + self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) + ] + return hints["hint"].tolist() + @dataclass class PromptConfig: @@ -510,6 +590,15 @@ def get_action(self, obs: Any) -> float: vision_support=True, ) +GPT_4_1_CC_API = OpenAIChatModelArgs( + model_name="gpt-4.1", + max_total_tokens=200_000, + max_input_tokens=200_000, + max_new_tokens=2_000, + temperature=0.1, + vision_support=True, +) + GPT_4_1_MINI = OpenAIResponseModelArgs( model_name="gpt-4.1-mini", max_total_tokens=200_000, @@ -528,7 +617,7 @@ def get_action(self, obs: Any) -> float: vision_support=True, ) -CLAUDE_MODEL_CONFIG = ClaudeResponseModelArgs( +CLAUDE_SONNET_37 = ClaudeResponseModelArgs( model_name="claude-3-7-sonnet-20250219", max_total_tokens=200_000, max_input_tokens=200_000, @@ -537,6 +626,15 @@ def get_action(self, obs: Any) -> float: vision_support=True, ) +CLAUDE_SONNET_4 = ClaudeResponseModelArgs( + model_name="claude-sonnet-4-20250514", + max_total_tokens=200_000, + max_input_tokens=200_000, + max_new_tokens=2_000, + temperature=0.1, + vision_support=True, +) + O3_RESPONSE_MODEL = OpenAIResponseModelArgs( model_name="o3-2025-04-16", max_total_tokens=200_000, @@ -554,6 +652,25 @@ def get_action(self, obs: Any) -> float: vision_support=True, ) +GPT_5 = OpenAIChatModelArgs( + model_name="gpt-5", + max_total_tokens=200_000, + max_input_tokens=200_000, + max_new_tokens=2_000, + temperature=None, + vision_support=True, +) + + +GPT_5_MINI = OpenAIChatModelArgs( + model_name="gpt-5-mini-2025-08-07", + max_total_tokens=200_000, + max_input_tokens=200_000, + max_new_tokens=2_000, + temperature=1.0, + vision_support=True, +) + GPT4_1_OPENROUTER_MODEL = OpenRouterModelArgs( model_name="openai/gpt-4.1", max_total_tokens=200_000, @@ -580,12 +697,12 @@ def get_action(self, obs: Any) -> float: keep_last_n_obs=None, multiaction=True, # whether to use multi-action or not # action_subsets=("bid",), - action_subsets=("coord"), + action_subsets=("coord",), # action_subsets=("coord", "bid"), ) AGENT_CONFIG = ToolUseAgentArgs( - model_args=CLAUDE_MODEL_CONFIG, + model_args=CLAUDE_SONNET_37, config=DEFAULT_PROMPT_CONFIG, ) @@ -605,7 +722,7 @@ def get_action(self, obs: Any) -> float: ) OSWORLD_CLAUDE = ToolUseAgentArgs( - model_args=CLAUDE_MODEL_CONFIG, + model_args=CLAUDE_SONNET_37, config=PromptConfig( tag_screenshot=True, goal=Goal(goal_as_system_msg=True), diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 84dc423d..37ead1c3 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -735,7 +735,7 @@ def dict_msg_to_markdown(d: dict): case _: parts.append(f"\n```\n{str(item)}\n```\n") - markdown = f"### {d["role"].capitalize()}\n" + markdown = f"### {d['role'].capitalize()}\n" markdown += "\n".join(parts) return markdown diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index e761a7f6..afcf5e07 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -178,9 +178,9 @@ def __call__(self, *args, **kwargs): # 'self' here calls ._call_api() method of the subclass response = self._call_api(*args, **kwargs) usage = dict(getattr(response, "usage", {})) - if "prompt_tokens_details" in usage: + if "prompt_tokens_details" in usage and usage["prompt_tokens_details"]: usage["cached_tokens"] = usage["prompt_tokens_details"].cached_tokens - if "input_tokens_details" in usage: + if "input_tokens_details" in usage and usage["input_tokens_details"]: usage["cached_tokens"] = usage["input_tokens_details"].cached_tokens usage = {f"usage_{k}": v for k, v in usage.items() if isinstance(v, (int, float))} usage |= {"n_api_calls": 1} @@ -332,12 +332,16 @@ def get_effective_cost_from_openai_api(self, response) -> float: if api_type == "chatcompletion": total_input_tokens = usage.prompt_tokens # (cache read tokens + new input tokens) output_tokens = usage.completion_tokens - cached_input_tokens = usage.prompt_tokens_details.cached_tokens + cached_input_tokens = ( + usage.prompt_tokens_details.cached_tokens if usage.prompt_tokens_details else 0 + ) new_input_tokens = total_input_tokens - cached_input_tokens elif api_type == "response": total_input_tokens = usage.input_tokens # (cache read tokens + new input tokens) output_tokens = usage.output_tokens - cached_input_tokens = usage.input_tokens_details.cached_tokens + cached_input_tokens = ( + usage.input_tokens_details.cached_tokens if usage.input_tokens_details else 0 + ) new_input_tokens = total_input_tokens - cached_input_tokens else: logging.warning(f"Unsupported API type: {api_type}. Defaulting cost to 0.0.") From 3f9e4a2191f81d1a177e9be3d6eea734924754cd Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 13 Aug 2025 12:16:57 +0200 Subject: [PATCH 02/23] add new deps --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 6322ffd3..a2798f2e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,5 @@ ray[default] python-slugify pillow gymnasium>=0.27 +sentence-transformers>=5.0.0 +python-dotenv>=1.1.1 \ No newline at end of file From c88d7f3fd0f7942e700d5d79ee79555f45cf3f6b Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 19 Aug 2025 14:14:38 +0200 Subject: [PATCH 03/23] use external embedding service in task hints retrieval --- .../agents/tool_use_agent/tool_use_agent.py | 75 +++++++++++++++---- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index b1407a87..f6ace3a8 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -1,6 +1,9 @@ import fnmatch import json import logging +import os +import random +import time from abc import ABC, abstractmethod from collections import defaultdict from copy import copy @@ -9,7 +12,9 @@ from typing import Any, Literal import bgym +import numpy as np import pandas as pd +import requests from bgym import Benchmark as BgymBenchmark from browsergym.core.observation import extract_screenshot from browsergym.utils.obs import ( @@ -18,7 +23,6 @@ overlay_som, prune_html, ) -from sentence_transformers import SentenceTransformer from agentlab.agents.agent_args import AgentArgs from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark @@ -181,7 +185,6 @@ class Obs(Block): def apply( self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput ) -> dict: - obs_msg = llm.msg.user() tool_calls = last_llm_output.tool_calls if self.use_last_error: @@ -306,6 +309,7 @@ class TaskHint(Block): hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct" top_n: int = 4 # Number of top hints to return when using embedding retrieval embedder_model: str = "Qwen/Qwen3-Embedding-0.6B" # Model for embedding hints + embedder_server: str = "http://localhost:5000" llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1.""" @@ -318,20 +322,26 @@ def _init(self): hint_db_path = Path(__file__).parent / self.hint_db_rel_path self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str) if self.hint_retrieval_mode == "emb": - logger.info("Load sentence transformer model for hint embeddings.") - self.emb_model = SentenceTransformer( - "Qwen/Qwen3-Embedding-0.6B", model_kwargs={"torch_dtype": "bfloat16"} - ) self.encode_hints() + def oai_embed(self, text: str): + response = self._oai_emb.create(input=text, model="text-embedding-3-small") + return response.data[0].embedding + def encode_hints(self): self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first") logger.info( - f"Encoding {len(self.uniq_hints)} unique hints using {self.embedder_model} model." - ) - self.hint_embeddings = self.emb_model.encode( - self.uniq_hints["hint"].tolist(), prompt="task hint" + f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model." ) + hints = self.uniq_hints["hint"].tolist() + semantic_keys = self.uniq_hints["semantic_keys"].tolist() + lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)] + emb_path = f"{self.hint_db_rel_path}.embs.npy" + assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}" + logger.info(f"Loading hint embeddings from: {emb_path}") + emb_dict = np.load(emb_path, allow_pickle=True).item() + self.hint_embeddings = np.array([emb_dict[k] for k in lines]) + logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}") def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: if not self.use_task_hint: @@ -393,14 +403,50 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: def choose_hints_emb(self, goal: str) -> list[str]: """Choose hints using embeddings to filter the hints.""" - goal_embeddings = self.emb_model.encode([goal], prompt="task description") - similarities = self.emb_model.similarity(goal_embeddings, self.hint_embeddings) + goal_embeddings = self._encode([goal], prompt="task description") + similarities = self._similarity(goal_embeddings.tolist(), self.hint_embeddings.tolist()) top_indices = similarities.argsort()[0][-self.top_n :].tolist() logger.info(f"Top hint indices based on embedding similarity: {top_indices}") hints = self.uniq_hints.iloc[top_indices] logger.info(f"Embedding-based hints chosen: {hints}") return hints["hint"].tolist() + def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_retries: int = 5): + """Call the encode API endpoint with timeout and retries""" + for attempt in range(max_retries): + try: + response = requests.post( + f"{self.embedder_server}/encode", + json={"texts": texts, "prompt": prompt}, + timeout=timeout, + ) + embs = response.json()["embeddings"] + return np.asarray(embs) + except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e: + if attempt == max_retries - 1: + raise e + time.sleep(random.uniform(1, timeout)) + continue + + def _similarity( + self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5 + ): + """Call the similarity API endpoint with timeout and retries""" + for attempt in range(max_retries): + try: + response = requests.post( + f"{self.embedder_server}/similarity", + json={"texts1": texts1, "texts2": texts2}, + timeout=timeout, + ) + similarities = response.json()["similarities"] + return np.asarray(similarities) + except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e: + if attempt == max_retries - 1: + raise e + time.sleep(random.uniform(1, timeout)) + continue + def choose_hints_direct(self, task_name: str) -> list[str]: hints = self.hint_db[ self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) @@ -466,7 +512,8 @@ def __init__( self.model_args = model_args self.config = config self.action_set: bgym.AbstractActionSet = action_set or bgym.HighLevelActionSet( - self.config.action_subsets, multiaction=self.config.multiaction # type: ignore + self.config.action_subsets, + multiaction=self.config.multiaction, # type: ignore ) self.tools = self.action_set.to_tool_description(api=model_args.api) @@ -656,7 +703,7 @@ def get_action(self, obs: Any) -> float: model_name="gpt-5", max_total_tokens=200_000, max_input_tokens=200_000, - max_new_tokens=2_000, + max_new_tokens=8_000, temperature=None, vision_support=True, ) From 74fc47f2820ec6dde79035a4d3bb5e5949d2c2bf Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 19 Aug 2025 14:14:49 +0200 Subject: [PATCH 04/23] gpt5 fixes --- src/agentlab/llm/chat_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index ff341356..dc9667b5 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -292,7 +292,7 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float messages=messages, n=n_samples, temperature=temperature, - max_tokens=self.max_tokens, + max_completion_tokens=self.max_tokens, logprobs=self.log_probs, ) @@ -359,7 +359,7 @@ def __init__( min_retry_wait_time=min_retry_wait_time, api_key_env_var="OPENAI_API_KEY", client_class=OpenAI, - pricing_func=tracking.get_pricing_openai, + pricing_func=partial(tracking.get_pricing_litellm, model_name=model_name), log_probs=log_probs, ) From 1de1e519f2adb307d5affb4f51e000db0cc72914 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 19 Aug 2025 20:27:52 -0400 Subject: [PATCH 05/23] first cut --- .../agents/human_trace_recorder/agent.py | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 src/agentlab/agents/human_trace_recorder/agent.py diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py new file mode 100644 index 00000000..52496b7e --- /dev/null +++ b/src/agentlab/agents/human_trace_recorder/agent.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +import logging +import textwrap +from dataclasses import dataclass + +import bgym +from playwright.sync_api import Page + +from agentlab.agents.agent_args import AgentArgs + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Simplified variant: capture human step (trace + screenshot + html) only +# --------------------------------------------------------------------------- + + +@dataclass +class SimpleHumanTraceCaptureAgentArgs(AgentArgs): + """Args for SimpleHumanTraceCaptureAgent. + + This version ONLY captures what the human does in the paused browser per step. + It does NOT attempt to map or translate actions. Always returns noop(). + Set use_raw_page_output=True in loop/env so that obs contains a Playwright Page. + """ + + agent_name: str = "SimpleHumanTraceCapture" + trace_dir: str = "human_traces" + screenshots: bool = True + snapshots: bool = True # playwright tracing snapshots (DOM/Sources) + sources: bool = False # include source files (bigger trace) + # Ensure the raw Playwright Page object is present in observations so we can pause. + use_raw_page_output: bool = True + + def make_agent(self) -> bgym.Agent: + return SimpleHumanTraceCaptureAgent( + trace_dir=self.trace_dir, + screenshots=self.screenshots, + snapshots=self.snapshots, + sources=self.sources, + ) + + def set_reproducibility_mode(self): + pass + + +class SimpleHumanTraceCaptureAgent(bgym.Agent): + """Minimal human-in-the-loop recorder. + + On each get_action: + 1. Start a Playwright tracing capture (if not already running for this step). + 2. Call page.pause() to open Inspector; user performs EXACTLY one logical action. + 3. Stop tracing, save trace zip, screenshot (after action), and HTML snapshot. + 4. Return noop() so the environment advances. + + Artifacts are stored under trace_dir/step_/ + """ + + def __init__(self, trace_dir: str, screenshots: bool, snapshots: bool, sources: bool): + self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False) + self._step_idx = 0 + from pathlib import Path + + self._root = Path(trace_dir) + self._root.mkdir(parents=True, exist_ok=True) + # Store trace config booleans; Playwright tracing.start expects them as named params. + self._trace_conf = dict(screenshots=screenshots, snapshots=snapshots, sources=sources) + self._tracing_started = False # track if global tracing has been started + self._page: Page | None = None # optional persistent page ref (when not in obs) + + def set_page(self, page: Page): + """Manually inject a Playwright Page so the agent can function without it in obs. + + Call this once after you create / reset the environment if you prefer not to + expose the page through observations (e.g., for safety or serialization reasons). + """ + self._page = page + + def obs_preprocessor(self, obs): # keep original obs so page is available + return obs + + def get_action(self, obs: dict): # type: ignore[override] + import json + import time + + # Resolve page priority: observation > stored page + page: Page | None = obs.get("page") or self._page + if page is None: + raise RuntimeError( + "No Playwright Page available. Provide use_raw_page_output=True OR call set_page(page)." + ) + # Cache page if first time we see it via obs so later steps can omit it + if self._page is None: + self._page = page + + step_dir = self._root / f"step_{self._step_idx:04d}" + step_dir.mkdir(parents=True, exist_ok=True) + trace_path = step_dir / "trace.zip" + screenshot_path = step_dir / "after.png" + html_path = step_dir / "after.html" + + # Lazy start of tracing (once per context) then per-step chunk + if not self._tracing_started: + try: + page.context.tracing.start( + screenshots=self._trace_conf["screenshots"], + snapshots=self._trace_conf["snapshots"], + sources=self._trace_conf["sources"], + ) + self._tracing_started = True + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] initial tracing.start failed: {e}") + + try: + page.context.tracing.start_chunk() + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] tracing.start_chunk failed: {e}") + + print("\n[SimpleHumanTraceCapture] Perform ONE action then resume Inspector.") + print("[SimpleHumanTraceCapture] A trace will be saved to:", trace_path) + try: + page.pause() + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] page.pause failed: {e}") + + # Stop current chunk & save + try: + page.context.tracing.stop_chunk(path=str(trace_path)) + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] tracing.stop_chunk failed: {e}") + + # Post-action artifacts + try: + page.screenshot(path=str(screenshot_path)) + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] screenshot failed: {e}") + try: + html = page.content() + html_path.write_text(html) + except Exception as e: # pragma: no cover + print(f"[SimpleHumanTraceCapture][WARN] html capture failed: {e}") + + meta = { + "url": page.url, + "timestamp": time.time(), + "step": self._step_idx, + "trace_path": str(trace_path), + "screenshot_path": str(screenshot_path), + "html_path": str(html_path), + } + (step_dir / "meta.json").write_text(json.dumps(meta, indent=2)) + + # --- Derive a lightweight human-readable script summary from the trace --- + script_summary_lines: list[str] = [] + try: + import json as _json + import zipfile + + with zipfile.ZipFile(trace_path, "r") as zf: + # Playwright trace usually contains one or more *.trace files (jsonl) + trace_files = [n for n in zf.namelist() if n.endswith(".trace")] + for tf in trace_files: + with zf.open(tf, "r") as fh: + for raw_line in fh: + try: + evt = _json.loads(raw_line.decode("utf-8")) + except Exception: + continue + if evt.get("type") != "action": + continue + a = evt.get("action", {}) + api_name = a.get("apiName") or a.get("name") or "action" + selector = a.get("selector") or a.get("locator") or "" + value = a.get("value") or a.get("text") or "" + line = f"{api_name}" + if selector: + line += f" selector={selector!r}" + if value and isinstance(value, str) and len(value) < 200: + line += f" value={value!r}" + script_summary_lines.append(line) + if not script_summary_lines: + script_summary_lines.append("(no action events parsed from trace chunk)") + except Exception as e: # pragma: no cover + script_summary_lines.append(f"(failed to parse trace for script summary: {e})") + + # Prepare chat messages (simple list of strings for easy viewing) + chat_messages = [ + "PLAYWRIGHT TRACE STEP SUMMARY:", + f"Step {self._step_idx} URL: {page.url}", + "Actions:", + *script_summary_lines, + f"Trace file: {trace_path}", + "Open with: npx playwright show-trace " + str(trace_path), + ] + + self._step_idx += 1 + + agent_info = bgym.AgentInfo( + think="human-recorded", + chat_messages=chat_messages, + stats={"step": self._step_idx}, + markdown_page=textwrap.dedent( + f"""### Simple Human Trace Capture\nSaved artifacts for step {meta['step']}:\n- URL: {meta['url']}\n- Trace: {meta['trace_path']}\n- Screenshot: {meta['screenshot_path']}\n- HTML: {meta['html_path']}\n""" + ), + extra_info=meta, + ) + return "noop()", agent_info + + +SIMPLE_TRACE_CAPTURE_AGENT = SimpleHumanTraceCaptureAgentArgs() + +##1. Simple debug agent +# 2. Instead of using the page object Launch codegen directly in a subprocess using the playwright codegen --url or somethiing From 2b4633a95c0e18724565d2a5ffa489f4c7ad220c Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 19 Aug 2025 21:30:00 -0400 Subject: [PATCH 06/23] update --- .../agents/human_trace_recorder/agent.py | 321 ++++++++---------- 1 file changed, 149 insertions(+), 172 deletions(-) diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py index 52496b7e..fd5aa554 100644 --- a/src/agentlab/agents/human_trace_recorder/agent.py +++ b/src/agentlab/agents/human_trace_recorder/agent.py @@ -1,215 +1,192 @@ +"""Minimal Human Trace Agent (<200 lines) + +Per step we capture ONLY: + - axtree_txt, pruned_html, actions.json, after.html + - Auto-resume after detecting user action + - Visible recording indicator +""" + from __future__ import annotations -import logging -import textwrap +import json +import time +import zipfile from dataclasses import dataclass +from pathlib import Path import bgym from playwright.sync_api import Page from agentlab.agents.agent_args import AgentArgs - -logger = logging.getLogger(__name__) - - -# --------------------------------------------------------------------------- -# Simplified variant: capture human step (trace + screenshot + html) only -# --------------------------------------------------------------------------- +from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html @dataclass -class SimpleHumanTraceCaptureAgentArgs(AgentArgs): - """Args for SimpleHumanTraceCaptureAgent. - - This version ONLY captures what the human does in the paused browser per step. - It does NOT attempt to map or translate actions. Always returns noop(). - Set use_raw_page_output=True in loop/env so that obs contains a Playwright Page. - """ - - agent_name: str = "SimpleHumanTraceCapture" +class HumanTraceAgentArgs(AgentArgs): + agent_name: str = "HumanTraceAgent" trace_dir: str = "human_traces" - screenshots: bool = True - snapshots: bool = True # playwright tracing snapshots (DOM/Sources) - sources: bool = False # include source files (bigger trace) - # Ensure the raw Playwright Page object is present in observations so we can pause. use_raw_page_output: bool = True - def make_agent(self) -> bgym.Agent: - return SimpleHumanTraceCaptureAgent( - trace_dir=self.trace_dir, - screenshots=self.screenshots, - snapshots=self.snapshots, - sources=self.sources, - ) + def make_agent(self) -> bgym.Agent: # type: ignore[override] + return HumanTraceAgent(self.trace_dir) def set_reproducibility_mode(self): pass -class SimpleHumanTraceCaptureAgent(bgym.Agent): - """Minimal human-in-the-loop recorder. - - On each get_action: - 1. Start a Playwright tracing capture (if not already running for this step). - 2. Call page.pause() to open Inspector; user performs EXACTLY one logical action. - 3. Stop tracing, save trace zip, screenshot (after action), and HTML snapshot. - 4. Return noop() so the environment advances. - - Artifacts are stored under trace_dir/step_/ - """ - - def __init__(self, trace_dir: str, screenshots: bool, snapshots: bool, sources: bool): +class HumanTraceAgent(bgym.Agent): + def __init__(self, trace_dir: str): self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False) - self._step_idx = 0 - from pathlib import Path - self._root = Path(trace_dir) self._root.mkdir(parents=True, exist_ok=True) - # Store trace config booleans; Playwright tracing.start expects them as named params. - self._trace_conf = dict(screenshots=screenshots, snapshots=snapshots, sources=sources) - self._tracing_started = False # track if global tracing has been started - self._page: Page | None = None # optional persistent page ref (when not in obs) - - def set_page(self, page: Page): - """Manually inject a Playwright Page so the agent can function without it in obs. - - Call this once after you create / reset the environment if you prefer not to - expose the page through observations (e.g., for safety or serialization reasons). - """ - self._page = page - - def obs_preprocessor(self, obs): # keep original obs so page is available + self._page: Page | None = None + self._step = 0 + self._task_name = None + self._seed = None + + def obs_preprocessor(self, obs: dict): # type: ignore[override] + if isinstance(obs, dict): + if self._page is None and "page" in obs: + self._page = obs["page"] + + # Extract task name and seed from obs if available + if self._task_name is None: + self._task_name = obs.get("task_name") or obs.get("task", {}).get( + "task_name", "unknown_task" + ) + if self._seed is None: + self._seed = obs.get("seed") or obs.get("task", {}).get("seed", "unknown_seed") + + dom = obs.get("dom_object") + axt = obs.get("axtree_object") + if axt is not None: + try: + obs["axtree_txt"] = flatten_axtree_to_str(axt) + except Exception: + pass + if dom is not None: + try: + obs["pruned_html"] = prune_html(flatten_dom_to_str(dom)) + except Exception: + pass + for k in ("dom_object", "axtree_object", "page"): + obs.pop(k, None) return obs def get_action(self, obs: dict): # type: ignore[override] - import json - import time - - # Resolve page priority: observation > stored page - page: Page | None = obs.get("page") or self._page - if page is None: - raise RuntimeError( - "No Playwright Page available. Provide use_raw_page_output=True OR call set_page(page)." - ) - # Cache page if first time we see it via obs so later steps can omit it if self._page is None: - self._page = page + raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True") - step_dir = self._root / f"step_{self._step_idx:04d}" + page = self._page + + # Create directory structure: trace_dir/task_name/seed/step_XXXX + task_dir = self._root / str(self._task_name or "unknown_task") + seed_dir = task_dir / str(self._seed or "unknown_seed") + step_dir = seed_dir / f"step_{self._step:04d}" step_dir.mkdir(parents=True, exist_ok=True) - trace_path = step_dir / "trace.zip" - screenshot_path = step_dir / "after.png" - html_path = step_dir / "after.html" - # Lazy start of tracing (once per context) then per-step chunk - if not self._tracing_started: - try: - page.context.tracing.start( - screenshots=self._trace_conf["screenshots"], - snapshots=self._trace_conf["snapshots"], - sources=self._trace_conf["sources"], - ) - self._tracing_started = True - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] initial tracing.start failed: {e}") + trace_path = step_dir / "temp_trace.zip" + actions_path = step_dir / "actions.json" + + print( + f"[HumanTrace] Task: {self._task_name}, Seed: {self._seed}, Step {self._step}: Perform ONE action" + ) + # Small recording indicator + page.evaluate( + """ + const div = document.createElement('div'); + div.id = '__rec'; + div.innerHTML = '🔴 REC'; + div.style.cssText = 'position:fixed;top:5px;right:5px;background:#f44;color:#fff;padding:5px 8px;border-radius:4px;font:bold 12px monospace;z-index:99999'; + document.body.appendChild(div); + """ + ) + + # Start tracing try: + page.context.tracing.start(screenshots=True, snapshots=True) page.context.tracing.start_chunk() - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] tracing.start_chunk failed: {e}") + except Exception: + pass - print("\n[SimpleHumanTraceCapture] Perform ONE action then resume Inspector.") - print("[SimpleHumanTraceCapture] A trace will be saved to:", trace_path) - try: - page.pause() - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] page.pause failed: {e}") + # Wait for action + self._wait_for_action(page) - # Stop current chunk & save + # Stop tracing and save try: page.context.tracing.stop_chunk(path=str(trace_path)) - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] tracing.stop_chunk failed: {e}") + actions = self._extract_trace(str(trace_path)) + actions_path.write_text(json.dumps(actions, indent=2)) + trace_path.unlink(missing_ok=True) + except Exception: + pass - # Post-action artifacts + # Remove indicator + page.evaluate("document.getElementById('__rec')?.remove()") + + # Save screenshot try: - page.screenshot(path=str(screenshot_path)) - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] screenshot failed: {e}") + page.screenshot(path=str(step_dir / "screenshot.png")) + except Exception: + pass + + # Save HTML try: - html = page.content() - html_path.write_text(html) - except Exception as e: # pragma: no cover - print(f"[SimpleHumanTraceCapture][WARN] html capture failed: {e}") - - meta = { - "url": page.url, - "timestamp": time.time(), - "step": self._step_idx, - "trace_path": str(trace_path), - "screenshot_path": str(screenshot_path), - "html_path": str(html_path), + (step_dir / "after.html").write_text(page.content()) + except Exception: + pass + + self._step += 1 + return "noop()", { + "extra_info": { + "step": self._step - 1, + "task_name": self._task_name, + "seed": self._seed, + "trace_dir": str(step_dir), + } } - (step_dir / "meta.json").write_text(json.dumps(meta, indent=2)) - # --- Derive a lightweight human-readable script summary from the trace --- - script_summary_lines: list[str] = [] - try: - import json as _json - import zipfile - - with zipfile.ZipFile(trace_path, "r") as zf: - # Playwright trace usually contains one or more *.trace files (jsonl) - trace_files = [n for n in zf.namelist() if n.endswith(".trace")] - for tf in trace_files: - with zf.open(tf, "r") as fh: - for raw_line in fh: - try: - evt = _json.loads(raw_line.decode("utf-8")) - except Exception: - continue - if evt.get("type") != "action": - continue - a = evt.get("action", {}) - api_name = a.get("apiName") or a.get("name") or "action" - selector = a.get("selector") or a.get("locator") or "" - value = a.get("value") or a.get("text") or "" - line = f"{api_name}" - if selector: - line += f" selector={selector!r}" - if value and isinstance(value, str) and len(value) < 200: - line += f" value={value!r}" - script_summary_lines.append(line) - if not script_summary_lines: - script_summary_lines.append("(no action events parsed from trace chunk)") - except Exception as e: # pragma: no cover - script_summary_lines.append(f"(failed to parse trace for script summary: {e})") - - # Prepare chat messages (simple list of strings for easy viewing) - chat_messages = [ - "PLAYWRIGHT TRACE STEP SUMMARY:", - f"Step {self._step_idx} URL: {page.url}", - "Actions:", - *script_summary_lines, - f"Trace file: {trace_path}", - "Open with: npx playwright show-trace " + str(trace_path), - ] - - self._step_idx += 1 - - agent_info = bgym.AgentInfo( - think="human-recorded", - chat_messages=chat_messages, - stats={"step": self._step_idx}, - markdown_page=textwrap.dedent( - f"""### Simple Human Trace Capture\nSaved artifacts for step {meta['step']}:\n- URL: {meta['url']}\n- Trace: {meta['trace_path']}\n- Screenshot: {meta['screenshot_path']}\n- HTML: {meta['html_path']}\n""" - ), - extra_info=meta, + def _wait_for_action(self, page): + """Wait for user action with auto-resume.""" + page.evaluate( + """ + window.__acted = false; + ['click','keydown','input','change'].forEach(e => + document.addEventListener(e, () => window.__acted = true, true) + ); + """ ) - return "noop()", agent_info - -SIMPLE_TRACE_CAPTURE_AGENT = SimpleHumanTraceCaptureAgentArgs() - -##1. Simple debug agent -# 2. Instead of using the page object Launch codegen directly in a subprocess using the playwright codegen --url or somethiing + start = time.time() + while time.time() - start < 300: # 5 min max + try: + if page.evaluate("window.__acted"): + page.evaluate("document.getElementById('__rec').innerHTML = '💾 SAVING'") + time.sleep(0.3) + return + except Exception: + pass + time.sleep(0.1) + + def _extract_trace(self, trace_file: str): + """Extract ALL events from trace zip.""" + all_events = [] + try: + with zipfile.ZipFile(trace_file, "r") as zf: + for name in zf.namelist(): + if name.endswith(".trace"): + with zf.open(name) as f: + for line in f: + try: + event = json.loads(line.decode()) + # Save everything - don't filter + all_events.append(event) + except Exception: + continue + except Exception: + pass + return all_events + + +HUMAN_TRACE_AGENT = HumanTraceAgentArgs() From 380c69f4708f6c172b9408bd1b55cbaa0edf5556 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:27:39 -0400 Subject: [PATCH 07/23] add event listeners and launcher --- .../agents/human_trace_recorder/agent.py | 368 ++++++++---- .../human_trace_recorder/event_listeners.py | 563 ++++++++++++++++++ 2 files changed, 802 insertions(+), 129 deletions(-) create mode 100644 src/agentlab/agents/human_trace_recorder/event_listeners.py diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py index fd5aa554..556922af 100644 --- a/src/agentlab/agents/human_trace_recorder/agent.py +++ b/src/agentlab/agents/human_trace_recorder/agent.py @@ -1,16 +1,14 @@ -"""Minimal Human Trace Agent (<200 lines) +"""Human Trace Agent for Browser Automation Training Data -Per step we capture ONLY: - - axtree_txt, pruned_html, actions.json, after.html - - Auto-resume after detecting user action - - Visible recording indicator +Captures human interactions at each step including: + - Comprehensive action tracking (clicks, input, navigation, etc.) + - Saves only human_action.json files in simple numbered folders """ from __future__ import annotations import json import time -import zipfile from dataclasses import dataclass from pathlib import Path @@ -18,6 +16,17 @@ from playwright.sync_api import Page from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.human_trace_recorder.event_listeners import ( + get_interaction_tracking_script, + get_recording_indicators_script, +) +from browsergym.core.observation import ( + extract_dom_extra_properties, + extract_dom_snapshot, + extract_focused_element_bid, + extract_merged_axtree, + extract_screenshot, +) from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html @@ -41,36 +50,33 @@ def __init__(self, trace_dir: str): self._root.mkdir(parents=True, exist_ok=True) self._page: Page | None = None self._step = 0 - self._task_name = None - self._seed = None def obs_preprocessor(self, obs: dict): # type: ignore[override] if isinstance(obs, dict): - if self._page is None and "page" in obs: - self._page = obs["page"] - - # Extract task name and seed from obs if available - if self._task_name is None: - self._task_name = obs.get("task_name") or obs.get("task", {}).get( - "task_name", "unknown_task" - ) - if self._seed is None: - self._seed = obs.get("seed") or obs.get("task", {}).get("seed", "unknown_seed") - - dom = obs.get("dom_object") - axt = obs.get("axtree_object") - if axt is not None: - try: + self._page = obs.get("page") + # Remove the page object from obs to avoid pickle issues + if "page" in obs: + del obs["page"] + + obs["screenshot"] = extract_screenshot(self._page) + obs["dom_object"] = extract_dom_snapshot(self._page) + obs["axtree_object"] = extract_merged_axtree(self._page) + scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0) + extra_properties = extract_dom_extra_properties( + obs["dom_object"], scale_factor=scale_factor + ) + obs["extra_element_properties"] = extra_properties + obs["focused_element_bid"] = extract_focused_element_bid(self._page) + + # Add text representations for easier analysis + if obs["axtree_object"]: + axt = obs["axtree_object"] + if extra_properties: obs["axtree_txt"] = flatten_axtree_to_str(axt) - except Exception: - pass - if dom is not None: - try: - obs["pruned_html"] = prune_html(flatten_dom_to_str(dom)) - except Exception: - pass - for k in ("dom_object", "axtree_object", "page"): - obs.pop(k, None) + + if obs["dom_object"]: + obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"]) + obs["pruned_html"] = prune_html(obs["dom_txt"]) return obs def get_action(self, obs: dict): # type: ignore[override] @@ -78,115 +84,219 @@ def get_action(self, obs: dict): # type: ignore[override] raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True") page = self._page - - # Create directory structure: trace_dir/task_name/seed/step_XXXX - task_dir = self._root / str(self._task_name or "unknown_task") - seed_dir = task_dir / str(self._seed or "unknown_seed") - step_dir = seed_dir / f"step_{self._step:04d}" - step_dir.mkdir(parents=True, exist_ok=True) - - trace_path = step_dir / "temp_trace.zip" - actions_path = step_dir / "actions.json" - - print( - f"[HumanTrace] Task: {self._task_name}, Seed: {self._seed}, Step {self._step}: Perform ONE action" - ) - - # Small recording indicator - page.evaluate( - """ - const div = document.createElement('div'); - div.id = '__rec'; - div.innerHTML = '🔴 REC'; - div.style.cssText = 'position:fixed;top:5px;right:5px;background:#f44;color:#fff;padding:5px 8px;border-radius:4px;font:bold 12px monospace;z-index:99999'; - document.body.appendChild(div); - """ - ) - - # Start tracing - try: - page.context.tracing.start(screenshots=True, snapshots=True) - page.context.tracing.start_chunk() - except Exception: - pass - - # Wait for action - self._wait_for_action(page) - - # Stop tracing and save - try: - page.context.tracing.stop_chunk(path=str(trace_path)) - actions = self._extract_trace(str(trace_path)) - actions_path.write_text(json.dumps(actions, indent=2)) - trace_path.unlink(missing_ok=True) - except Exception: - pass - - # Remove indicator - page.evaluate("document.getElementById('__rec')?.remove()") - - # Save screenshot - try: - page.screenshot(path=str(step_dir / "screenshot.png")) - except Exception: - pass - - # Save HTML - try: - (step_dir / "after.html").write_text(page.content()) - except Exception: - pass - + step_dir = self._create_step_directory() + + self._display_recording_prompt() + self._show_recording_indicators(page) + + # Capture human interactions + captured_action, human_interactions = self._capture_interactions_with_js(page, step_dir) + + # Save and cleanup + self._save_human_action(captured_action, step_dir) + self._cleanup_indicators(page) + self._step += 1 return "noop()", { "extra_info": { "step": self._step - 1, - "task_name": self._task_name, - "seed": self._seed, - "trace_dir": str(step_dir), + "human_interactions": human_interactions, } } - def _wait_for_action(self, page): - """Wait for user action with auto-resume.""" - page.evaluate( - """ - window.__acted = false; - ['click','keydown','input','change'].forEach(e => - document.addEventListener(e, () => window.__acted = true, true) - ); - """ - ) - - start = time.time() - while time.time() - start < 300: # 5 min max + def _create_step_directory(self) -> Path: + """Create directory for current step.""" + step_dir = self._root / str(self._step) + step_dir.mkdir(parents=True, exist_ok=True) + return step_dir + + def _display_recording_prompt(self): + """Display prompt messages to user.""" + print(f"[HumanTrace] Step {self._step}: Perform ONE action") + print("[HumanTrace] ⚠️ WAIT FOR THE RED BORDER TO APPEAR BEFORE PERFORMING ANY ACTION ⚠️") + print("[HumanTrace] The system will automatically save after detecting your action") + + def _show_recording_indicators(self, page: Page): + """Show visual recording indicators on the page.""" + page.evaluate(get_recording_indicators_script()) + + def _save_human_action(self, captured_action: dict, step_dir: Path): + """Save the captured human action to JSON file.""" + try: + human_action_path = step_dir / "human_action.json" + if captured_action and isinstance(captured_action, dict): + human_action_path.write_text(json.dumps(captured_action, indent=2)) + action_type = captured_action.get("type", "unknown") + else: + # Create empty action record for consistency + empty_action = { + "type": "no_action", + "timestamp": time.time() * 1000, + "reason": "No meaningful human action captured in this step", + } + human_action_path.write_text(json.dumps(empty_action, indent=2)) + action_type = "no_action" + + print(f"[HumanTrace] Step {self._step} complete - Action: {action_type}") + + except Exception as e: + print(f"[HumanTrace] Warning: Failed to save human action: {e}") + + def _cleanup_indicators(self, page: Page): + """Remove recording indicators from the page.""" + page.evaluate("document.getElementById('__rec')?.remove(); document.getElementById('__rec_border')?.remove()") + + def _capture_interactions_with_js(self, page: Page, step_dir: Path) -> tuple[dict, str]: + """Capture human interactions using JavaScript injection.""" + try: + print("[HumanTrace] JavaScript interaction tracking enabled") + initial_url, initial_title = page.url, page.title() + + # Inject interaction tracking + self._inject_interaction_tracking(page) + + # Wait for user action + self._wait_for_user_action(page) + + # Collect and process interaction data + return self._collect_interaction_data(page, initial_url, initial_title) + + except Exception as e: + print(f"[HumanTrace] Error: {e}") + return { + "type": "error", + "timestamp": time.time() * 1000, + "error": str(e), + }, f"Error: {e}" + + def _inject_interaction_tracking(self, page: Page): + """Inject JavaScript code for comprehensive interaction tracking.""" + tracking_script = get_interaction_tracking_script() + page.evaluate(tracking_script) + + def _wait_for_user_action(self, page: Page): + """Wait for user to perform an action.""" + start_time = time.time() + while time.time() - start_time < 300: try: - if page.evaluate("window.__acted"): - page.evaluate("document.getElementById('__rec').innerHTML = '💾 SAVING'") - time.sleep(0.3) - return - except Exception: + action_detected = page.evaluate("window.__acted || false") + if action_detected: + print(f"[HumanTrace] Action detected! Exiting immediately...") + break + except Exception as e: + print(f"[HumanTrace] Debug: Error checking actions: {e}") pass time.sleep(0.1) - def _extract_trace(self, trace_file: str): - """Extract ALL events from trace zip.""" - all_events = [] + def _collect_interaction_data(self, page: Page, initial_url: str, initial_title: str) -> tuple[dict, str]: + """Collect and format interaction data.""" try: - with zipfile.ZipFile(trace_file, "r") as zf: - for name in zf.namelist(): - if name.endswith(".trace"): - with zf.open(name) as f: - for line in f: - try: - event = json.loads(line.decode()) - # Save everything - don't filter - all_events.append(event) - except Exception: - continue - except Exception: - pass - return all_events + action_detected = page.evaluate("window.__acted || false") + interactions = page.evaluate("window.__interactions || []") + + action_data = { + "type": "human_interactions" if action_detected else "no_action", + "timestamp": time.time() * 1000, + "detected": action_detected, + "interactions": interactions, + "interaction_count": len(interactions) + } + + summary = self._create_interaction_summary(interactions) + self._add_page_change_info(action_data, initial_url, initial_title, page) + + print(f"[HumanTrace] {summary}") + return action_data, summary + + except Exception as e: + return { + "type": "error", + "timestamp": time.time() * 1000, + "detected": False, + "error": str(e), + "interactions": [], + "interaction_count": 0 + }, f"Error collecting interactions: {e}" + + def _create_interaction_summary(self, interactions: list) -> str: + """Create a summary string of captured interactions.""" + if interactions: + interaction_types = {} + for interaction in interactions: + itype = interaction.get('type', 'unknown') + interaction_types[itype] = interaction_types.get(itype, 0) + 1 + + summary_parts = [] + for itype, count in interaction_types.items(): + summary_parts.append(f"{itype}:{count}") + return f"Captured {len(interactions)} interactions: {', '.join(summary_parts)}" + else: + return "No interactions detected" + + def _add_page_change_info(self, action_data: dict, initial_url: str, initial_title: str, page: Page): + """Add page change information to action data.""" + final_url, final_title = page.url, page.title() + if initial_url != final_url or initial_title != final_title: + action_data["page_changed"] = True + action_data["url_change"] = {"from": initial_url, "to": final_url} + action_data["title_change"] = {"from": initial_title, "to": final_title} + + def _format_js_interaction_summary(self, action_data, interaction_log): + """Format JavaScript-captured interactions into readable summary.""" + lines = ["Human Interactions (JavaScript Tracking):"] + + if action_data["interactions"]: + lines.append(f"Total Actions: {len(action_data['interactions'])}") + lines.append("") + + # Group interactions by type + by_type = {} + for interaction in action_data["interactions"]: + interaction_type = interaction["type"] + if interaction_type not in by_type: + by_type[interaction_type] = [] + by_type[interaction_type].append(interaction) + + # Show summary by type + for interaction_type, interactions in by_type.items(): + lines.append(f"{interaction_type.title()}: {len(interactions)} actions") + + lines.append("") + lines.append("Detailed Actions:") + + # Add each interaction from the log + for log_entry in interaction_log: + lines.append(f" {log_entry}") + else: + lines.append("No interactions detected - user may have just observed the page") + + # Add page state changes if URL changed + if action_data.get("page_changed"): + url_info = action_data.get("url") + if url_info: + lines.append("") + lines.append("� Page Navigation:") + lines.append(f" From: {url_info['from']}") + lines.append(f" To: {url_info['to']}") + + return "\n".join(lines) HUMAN_TRACE_AGENT = HumanTraceAgentArgs() + + +if __name__ == "__main__": + from agentlab.agents.human_trace_recorder.agent import HUMAN_TRACE_AGENT + from agentlab.experiments.study import Study + + agent_configs = [HUMAN_TRACE_AGENT] + benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"](n_repeats=1) # type: bgym.Benchmark + benchmark = benchmark.subset_from_glob("task_name", "*filter*") + benchmark.env_args_list = benchmark.env_args_list[:1] + for env_args in benchmark.env_args_list: + print(env_args.task_name) + env_args.max_steps = 15 + env_args.headless = False + + study = Study(agent_configs, benchmark) + study.run(n_jobs=1, parallel_backend="sequential") diff --git a/src/agentlab/agents/human_trace_recorder/event_listeners.py b/src/agentlab/agents/human_trace_recorder/event_listeners.py new file mode 100644 index 00000000..2fd8453c --- /dev/null +++ b/src/agentlab/agents/human_trace_recorder/event_listeners.py @@ -0,0 +1,563 @@ +"""JavaScript Event Listeners for Human Trace Capture + +This module contains all the JavaScript code for capturing comprehensive +browser interactions including mouse, keyboard, form, scroll, and focus events. +""" + + +def get_interaction_tracking_script() -> str: + """Get the complete JavaScript code for interaction tracking.""" + return ( + """ + window.__acted = false; + window.__interactions = []; + + // Debug mode - set to true to see all events in console + window.__debug_events = false; + + function captureInteraction(type, event, extra = {}) { + // Skip our own recording indicators + if (event.target.id === '__rec' || event.target.id === '__rec_border' || + event.target.closest('#__rec') || event.target.closest('#__rec_border')) { + return; + } + + const interaction = { + type: type, + timestamp: Date.now(), + coords: { + x: event.clientX || 0, + y: event.clientY || 0 + }, + target: { + tagName: event.target.tagName, + id: event.target.id || null, + className: event.target.className || null, + text: event.target.textContent?.slice(0, 50) || null, + bid: event.target.getAttribute('bid') || null + }, + ...extra + }; + + window.__interactions.push(interaction); + window.__acted = true; + + // Debug logging + if (window.__debug_events) { + console.log(`🎯 Captured: ${type}`, interaction); + } + + // Update indicators immediately + const indicator = document.getElementById('__rec'); + const border = document.getElementById('__rec_border'); + if (indicator) { + indicator.innerHTML = '✅ ACTION DETECTED - SAVING...'; + indicator.style.background = '#28a745'; + indicator.style.animation = 'none'; + } + if (border) { + border.style.border = '8px solid #28a745'; + border.style.animation = 'none'; + } + } + + // Debug function - add this temporarily to see what events fire + if (window.__debug_events) { + ['input', 'change', 'select', 'focus', 'click', 'keydown', 'paste', 'cut', 'copy'].forEach(eventType => { + document.addEventListener(eventType, (e) => { + console.log(`🔍 DEBUG: ${eventType} on`, e.target.tagName, e.target.type, e.target); + }, true); + }); + } + + """ + + get_mouse_event_listeners() + + """ + """ + + get_keyboard_event_listeners() + + """ + """ + + get_form_event_listeners() + + """ + """ + + get_scroll_event_listeners() + + """ + """ + + get_focus_event_listeners() + + """ + + console.log('Comprehensive interaction tracking initialized'); + """ + ) + + +def get_mouse_event_listeners() -> str: + """Get JavaScript code for mouse event listeners.""" + return """ + // Mouse events with comprehensive button tracking and performance optimizations + let lastClickTime = 0; + + document.addEventListener('click', (e) => { + const now = Date.now(); + // Prevent spam clicking from creating too many events (minimum 50ms between clicks) + if (now - lastClickTime < 50) return; + lastClickTime = now; + + captureInteraction('click', e, { + button: e.button, // 0=left, 1=middle, 2=right + buttons: e.buttons, // bitmask of pressed buttons + buttonName: ['left', 'middle', 'right'][e.button] || 'unknown', + detail: e.detail, // click count (single, double, etc.) + clickType: e.detail === 1 ? 'single' : e.detail === 2 ? 'double' : `${e.detail}x` + }); + }, true); + + document.addEventListener('dblclick', (e) => { + captureInteraction('dblclick', e, { + button: e.button, + buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' + }); + }, true); + + document.addEventListener('mousedown', (e) => { + captureInteraction('mousedown', e, { + button: e.button, + buttons: e.buttons, + buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' + }); + }, true); + + document.addEventListener('mouseup', (e) => { + captureInteraction('mouseup', e, { + button: e.button, + buttons: e.buttons, + buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' + }); + }, true); + + // Context menu (right-click menu) + document.addEventListener('contextmenu', (e) => { + captureInteraction('contextmenu', e, { + button: e.button, + buttonName: 'right' + }); + }, true); + + // Middle mouse button events (often used for scrolling/opening in new tab) + document.addEventListener('auxclick', (e) => { + captureInteraction('auxclick', e, { + button: e.button, + buttonName: e.button === 1 ? 'middle' : (e.button === 2 ? 'right' : 'other'), + detail: e.detail + }); + }, true); + + // Enhanced drag tracking (without redundant mousedown) + let isDragging = false; + let dragStart = null; + let dragButton = null; + let hasDraggedSignificantly = false; + + document.addEventListener('mousedown', (e) => { + isDragging = true; + dragButton = e.button; + hasDraggedSignificantly = false; + dragStart = { + x: e.clientX, + y: e.clientY, + time: Date.now(), + button: e.button, + buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' + }; + }, true); + + document.addEventListener('mousemove', (e) => { + if (isDragging && dragStart) { + const distance = Math.sqrt( + Math.pow(e.clientX - dragStart.x, 2) + + Math.pow(e.clientY - dragStart.y, 2) + ); + if (distance > 5 && !hasDraggedSignificantly) { + // Only capture the start of a significant drag, not every movement + hasDraggedSignificantly = true; + captureInteraction('drag_start', e, { + startX: dragStart.x, + startY: dragStart.y, + endX: e.clientX, + endY: e.clientY, + distance: distance, + button: dragButton, + buttonName: dragStart.buttonName, + duration: Date.now() - dragStart.time + }); + } + } + // Note: Removed general mousemove tracking to reduce noise + }, true); + + document.addEventListener('mouseup', (e) => { + if (isDragging && dragStart && hasDraggedSignificantly) { + const distance = Math.sqrt( + Math.pow(e.clientX - dragStart.x, 2) + + Math.pow(e.clientY - dragStart.y, 2) + ); + captureInteraction('drag_end', e, { + startX: dragStart.x, + startY: dragStart.y, + endX: e.clientX, + endY: e.clientY, + distance: distance, + duration: Date.now() - dragStart.time, + button: dragButton, + buttonName: dragStart.buttonName, + totalDistance: distance + }); + } + isDragging = false; + dragStart = null; + dragButton = null; + hasDraggedSignificantly = false; + }, true); + + // Drag and drop events + document.addEventListener('dragstart', (e) => { + captureInteraction('dragstart', e, { + dataTransfer: { + effectAllowed: e.dataTransfer.effectAllowed, + types: Array.from(e.dataTransfer.types) + } + }); + }, true); + + document.addEventListener('dragend', (e) => { + captureInteraction('dragend', e, { + dataTransfer: { + dropEffect: e.dataTransfer.dropEffect + } + }); + }, true); + + document.addEventListener('drop', (e) => { + captureInteraction('drop', e, { + dataTransfer: { + dropEffect: e.dataTransfer.dropEffect, + types: Array.from(e.dataTransfer.types) + }, + files: e.dataTransfer.files.length > 0 ? Array.from(e.dataTransfer.files).map(f => ({ + name: f.name, + type: f.type, + size: f.size + })) : null + }); + }, true); + """ + + +def get_keyboard_event_listeners() -> str: + """Get JavaScript code for keyboard event listeners.""" + return """ + // Keyboard events with shortcut detection + document.addEventListener('keydown', (e) => { + let shortcut = null; + if (e.ctrlKey || e.metaKey) { + const modifier = e.ctrlKey ? 'Ctrl' : 'Cmd'; + const key = e.key.length === 1 ? e.key.toUpperCase() : e.key; + shortcut = `${modifier}+${key}`; + } else if (e.altKey && e.key.length === 1) { + shortcut = `Alt+${e.key.toUpperCase()}`; + } else if (e.shiftKey && e.key.length === 1) { + shortcut = `Shift+${e.key.toUpperCase()}`; + } + + captureInteraction('keydown', e, { + key: e.key, + code: e.code, + ctrlKey: e.ctrlKey, + shiftKey: e.shiftKey, + altKey: e.altKey, + metaKey: e.metaKey, + shortcut: shortcut + }); + }, true); + + document.addEventListener('keyup', (e) => { + captureInteraction('keyup', e, { + key: e.key, + code: e.code + }); + }, true); + """ + + +def get_form_event_listeners() -> str: + """Get JavaScript code for form event listeners.""" + return """ + // Input events with throttling to prevent spam during fast typing + let inputTimeout; + let lastInputValue = ''; + + document.addEventListener('input', (e) => { + if (['INPUT', 'TEXTAREA'].includes(e.target.tagName) || e.target.contentEditable === 'true') { + clearTimeout(inputTimeout); + inputTimeout = setTimeout(() => { + const currentValue = e.target.value || e.target.textContent; + // Only capture if value actually changed significantly + if (currentValue !== lastInputValue) { + lastInputValue = currentValue; + captureInteraction('input', e, { + value: currentValue, + inputType: e.inputType || null, + valueLength: currentValue.length + }); + } + }, 50); // Reduced from 300ms to 50ms for better responsiveness + } + }, true); + + // Immediate input capture (without throttling) for certain cases + document.addEventListener('input', (e) => { + // Immediate capture for dropdown/select-like inputs or when selection changes + if (e.target.tagName === 'SELECT' || + e.inputType === 'deleteContentBackward' || + e.inputType === 'insertFromPaste' || + e.inputType === 'insertFromDrop') { + captureInteraction('input_immediate', e, { + value: e.target.value || e.target.textContent, + inputType: e.inputType || null, + immediate: true + }); + } + }, true); + + // Text selection events + document.addEventListener('select', (e) => { + if (['INPUT', 'TEXTAREA'].includes(e.target.tagName)) { + const selectedText = e.target.value.substring(e.target.selectionStart, e.target.selectionEnd); + captureInteraction('select', e, { + selectedText: selectedText, + selectionStart: e.target.selectionStart, + selectionEnd: e.target.selectionEnd, + value: e.target.value, + selectionLength: selectedText.length + }); + } + }, true); + + // Clipboard events + document.addEventListener('cut', (e) => { + captureInteraction('cut', e, { + clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, + targetValue: e.target.value || e.target.textContent + }); + }, true); + + document.addEventListener('copy', (e) => { + captureInteraction('copy', e, { + clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, + targetValue: e.target.value || e.target.textContent + }); + }, true); + + document.addEventListener('paste', (e) => { + captureInteraction('paste', e, { + clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, + targetValue: e.target.value || e.target.textContent + }); + }, true); + + // Enhanced form change events with better dropdown handling + document.addEventListener('change', (e) => { + let extra = {}; + if (e.target.tagName === 'SELECT') { + const option = e.target.options[e.target.selectedIndex]; + extra = { + selectedValue: e.target.value, + selectedText: option?.text || '', + selectedIndex: e.target.selectedIndex, + allOptions: Array.from(e.target.options).map(opt => ({ + value: opt.value, + text: opt.text, + selected: opt.selected + })), + optionsCount: e.target.options.length + }; + } else if (['checkbox', 'radio'].includes(e.target.type)) { + extra = { + checked: e.target.checked, + value: e.target.value, + name: e.target.name + }; + } else { + extra = { + value: e.target.value, + previousValue: e.target.defaultValue, // Capture what it was before + inputType: e.target.type + }; + } + captureInteraction('change', e, extra); + }, true); + + document.addEventListener('submit', (e) => { + captureInteraction('submit', e, { + formAction: e.target.action || null, + formMethod: e.target.method || 'GET', + formElements: Array.from(e.target.elements).length + }); + }, true); + + // Additional events for better field interaction capture + + // Option selection in datalists + document.addEventListener('input', (e) => { + if (e.target.list) { // Has datalist + captureInteraction('datalist_input', e, { + value: e.target.value, + listId: e.target.list.id, + optionsCount: e.target.list.options.length + }); + } + }, true); + + // File input changes + document.addEventListener('change', (e) => { + if (e.target.type === 'file') { + captureInteraction('file_select', e, { + filesCount: e.target.files.length, + files: Array.from(e.target.files).map(file => ({ + name: file.name, + type: file.type, + size: file.size, + lastModified: file.lastModified + })) + }); + } + }, true); + """ + + +def get_scroll_event_listeners() -> str: + """Get JavaScript code for scroll event listeners.""" + return """ + // Scroll events with debouncing to reduce noise + let scrollTimeout; + let lastScrollTime = 0; + + document.addEventListener('scroll', (e) => { + clearTimeout(scrollTimeout); + scrollTimeout = setTimeout(() => { + const now = Date.now(); + // Only capture scroll if it's been at least 200ms since last scroll capture + if (now - lastScrollTime > 200) { + lastScrollTime = now; + captureInteraction('scroll', e, { + scrollX: window.scrollX, + scrollY: window.scrollY, + scrollLeft: e.target.scrollLeft || 0, + scrollTop: e.target.scrollTop || 0 + }); + } + }, 150); // Increased debounce time + }, true); + + // Wheel events (for detailed scroll tracking) with throttling + let lastWheelTime = 0; + document.addEventListener('wheel', (e) => { + const now = Date.now(); + // Only capture wheel events every 100ms to reduce noise + if (now - lastWheelTime > 100) { + lastWheelTime = now; + captureInteraction('wheel', e, { + deltaX: e.deltaX, + deltaY: e.deltaY, + deltaZ: e.deltaZ, + deltaMode: e.deltaMode + }); + } + }, true); + """ + + +def get_focus_event_listeners() -> str: + """Get JavaScript code for focus event listeners.""" + return """ + // Focus events - only for interactive elements to reduce noise + document.addEventListener('focus', (e) => { + // Only capture focus on interactive elements + const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A']; + if (interactiveElements.includes(e.target.tagName) || + e.target.contentEditable === 'true' || + e.target.tabIndex >= 0) { + captureInteraction('focus', e); + } + }, true); + + document.addEventListener('blur', (e) => { + // Only capture blur on interactive elements + const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A']; + if (interactiveElements.includes(e.target.tagName) || + e.target.contentEditable === 'true' || + e.target.tabIndex >= 0) { + captureInteraction('blur', e); + } + }, true); + """ + + +def get_recording_indicators_script() -> str: + """Get JavaScript code for recording indicators.""" + return """ + // Remove any existing indicators + const existingBorder = document.getElementById('__rec_border'); + if (existingBorder) existingBorder.remove(); + const existingIndicator = document.getElementById('__rec'); + if (existingIndicator) existingIndicator.remove(); + + // Create border overlay + const border = document.createElement('div'); + border.id = '__rec_border'; + border.style.cssText = ` + position: fixed; + top: 0; + left: 0; + width: 100vw; + height: 100vh; + border: 8px solid #ff0000; + box-sizing: border-box; + pointer-events: none; + z-index: 999999; + animation: pulse 1.5s infinite; + `; + + // Create status indicator + const indicator = document.createElement('div'); + indicator.id = '__rec'; + indicator.innerHTML = '🔴 RECORDING - Perform your action now'; + indicator.style.cssText = ` + position: fixed; + top: 10px; + left: 50%; + transform: translateX(-50%); + background: #ff0000; + color: #fff; + padding: 12px 20px; + border-radius: 8px; + font: bold 10px -apple-system, BlinkMacSystemFont, sans-serif; + z-index: 9999999; + box-shadow: 0 4px 12px rgba(255,0,0,0.4); + animation: pulse 1.5s infinite; + `; + + // Add pulsing animation + const style = document.createElement('style'); + style.textContent = ` + @keyframes pulse { + 0% { opacity: 1; } + 50% { opacity: 0.4; } + 100% { opacity: 0.8; } + } + `; + document.head.appendChild(style); + + document.body.appendChild(border); + document.body.appendChild(indicator); + """ From d3054cd15d2f6eb492c29531d0479b4ae61377b5 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Thu, 21 Aug 2025 18:54:55 -0400 Subject: [PATCH 08/23] Add codegen step-wise recoder agent --- .../human_trace_recorder/codegen_agent.py | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 src/agentlab/agents/human_trace_recorder/codegen_agent.py diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py new file mode 100644 index 00000000..16d0222c --- /dev/null +++ b/src/agentlab/agents/human_trace_recorder/codegen_agent.py @@ -0,0 +1,192 @@ +"""Simple Codegen Agent + +Captures human interactions using playwright inspector. +Playwright trace logs are stored in "think" messages and can be viewed in Agentlab Xray. +""" + +from __future__ import annotations + +import json +import logging +import tempfile +import zipfile +from dataclasses import dataclass +from pathlib import Path + +import bgym +from playwright.sync_api import Page + +from agentlab.agents.agent_args import AgentArgs +from browsergym.core.observation import ( + extract_dom_extra_properties, + extract_dom_snapshot, + extract_focused_element_bid, + extract_merged_axtree, + extract_screenshot, +) +from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html + + +def extract_log_message_from_pw_trace(pw_trace_file_path): + zip_file = zipfile.ZipFile(pw_trace_file_path, "r") + trace_lines = zip_file.read("trace.trace").decode("utf-8").splitlines() + + actions = [] + for line in trace_lines: + if line.strip(): + event = json.loads(line) + if event.get("type") == "log": + actions.append(event) + # Extract log messages from the trace + return [log["message"].strip() for log in sorted(actions, key=lambda x: x.get("time", 0))] + + +def clean_pw_logs(logs, exclude_blacklist=True, use_substitutions=True): + clean_logs = list(logs) + blacklist = { + "attempting click action", + "waiting for element to be visible, enabled and stable", + "element is visible, enabled and stable", + "scrolling into view if needed", + "done scrolling", + "performing click action", + "click action done", + "waiting for scheduled navigations to finish", + "navigations have finished", + } + + substitutions = [("waiting for ", "")] + + def apply_substitutions(log): + for old, new in substitutions: + log = log.replace(old, new) + return log + + if exclude_blacklist: + clean_logs = [log for log in clean_logs if log not in blacklist] + if use_substitutions: + clean_logs = [apply_substitutions(log) for log in clean_logs] + + return clean_logs + + +@dataclass +class PlayWrightCodeGenAgentArgs(AgentArgs): + agent_name: str = "PlayWrightCodeGenAgent" + trace_dir: str = "playwright_codegen_traces" + use_raw_page_output: bool = True + store_raw_trace: bool = False + + def make_agent(self) -> bgym.Agent: # type: ignore[override] + return PlayWrightCodeGenAgent(self.trace_dir, self.store_raw_trace) + + def set_reproducibility_mode(self): + pass + + +class PlayWrightCodeGenAgent(bgym.Agent): + def __init__(self, trace_dir: str, store_raw_trace: bool): + self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False) + self._root = Path(trace_dir) + self._page: Page | None = None + self._step = 0 + self.store_raw_trace = store_raw_trace + self._episode_trace_dir = None # Cache for single episode + + def _get_trace_dir(self): + """Return the trace directory based on store_raw_trace setting.""" + if self._episode_trace_dir is None: + if self.store_raw_trace: + import datetime + + dt_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + self._episode_trace_dir = self._root / f"codegen_traces_{dt_str}" + self._episode_trace_dir.mkdir(parents=True, exist_ok=True) + else: + self._episode_trace_dir = Path(tempfile.mkdtemp()) + return self._episode_trace_dir + + def obs_preprocessor(self, obs: dict): # type: ignore[override] + if isinstance(obs, dict): + self._page = obs.get("page") + obs["screenshot"] = extract_screenshot(self._page) + obs["dom_object"] = extract_dom_snapshot(self._page) + obs["axtree_object"] = extract_merged_axtree(self._page) + scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0) + extra_properties = extract_dom_extra_properties( + obs["dom_object"], scale_factor=scale_factor + ) + obs["extra_element_properties"] = extra_properties + obs["focused_element_bid"] = extract_focused_element_bid(self._page) + + if obs["axtree_object"]: + obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"]) + + if obs["dom_object"]: + obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"]) + obs["pruned_html"] = prune_html(obs["dom_txt"]) + + if "page" in obs: # unpickable + del obs["page"] + + return obs + + def get_action(self, obs: dict): # type: ignore[override] + + if self._page is None: + raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True") + + page = self._page + trace_dir = self._get_trace_dir() + trace_path = trace_dir / f"step_{self._step}.zip" + page.context.tracing.start(screenshots=True, snapshots=True, sources=True) + page.context.tracing.start_chunk(name=f"step_{self._step}") + + print( + f"{'─'*60}\n" f"Step {self._step}\n", + f"{'─'*60}\n", + "1. 🔴 Start Recording (Press 'Record' in the Playwright Inspector.)\n", + "2. ✨ Perform actions for a single step.\n", + "3. ⚫ Stop Recording (Press 'Record' again to stop recording.)\n", + "4. ▶️ Press 'Resume' in the Playwright Inspector.", + ) + + page.pause() # Launch Inspector and record actions + page.context.tracing.stop_chunk(path=trace_path) + page.context.tracing.stop() + + pw_logs = extract_log_message_from_pw_trace(trace_path) + pw_logs = clean_pw_logs(pw_logs, exclude_blacklist=True) + pw_logs_str = "\n".join([f"{i}. {log}" for i, log in enumerate(pw_logs, 1)]) + + print(f"\n Playwright logs for step {self._step}:\n{pw_logs_str}") + + self._step += 1 + + agent_info = bgym.AgentInfo( + think=pw_logs_str, + chat_messages=[], + stats={}, + ) + + return "noop()", agent_info + + +PW_CODEGEN_AGENT = PlayWrightCodeGenAgentArgs(store_raw_trace=True) + + +if __name__ == "__main__": + from agentlab.agents.human_trace_recorder.codegen_agent import PW_CODEGEN_AGENT + from agentlab.experiments.study import Study + + agent_configs = [PW_CODEGEN_AGENT] + benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]() # type: bgym.Benchmark + benchmark = benchmark.subset_from_glob("task_name", "*create*") + benchmark.env_args_list = benchmark.env_args_list[:1] + for env_args in benchmark.env_args_list: + print(env_args.task_name) + env_args.max_steps = 15 + env_args.headless = False + + study = Study(agent_configs, benchmark, logging_level_stdout=logging.INFO) + study.run(n_jobs=1, parallel_backend="sequential", n_relaunch=1) From bf0b6e71ebc9aeb9508e1bf8375212283cf38166 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 28 Aug 2025 15:05:18 +0200 Subject: [PATCH 09/23] fix repeated llm configs --- src/agentlab/llm/llm_configs.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 3d5828b9..afbf094f 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -20,22 +20,6 @@ ] CHAT_MODEL_ARGS_DICT = { - "openai/gpt-5-nano-2025-08-07": OpenAIModelArgs( - model_name="gpt-5-nano-2025-08-07", - max_total_tokens=128_000, - max_input_tokens=128_000, - max_new_tokens=16_384, - temperature=1, # gpt-5 supports temperature of 1 only - vision_support=True, - ), - "openai/gpt-5-mini-2025-08-07": OpenAIModelArgs( - model_name="gpt-5-mini-2025-08-07", - max_total_tokens=128_000, - max_input_tokens=128_000, - max_new_tokens=16_384, - temperature=1, # gpt-5 supports temperature of 1 only - vision_support=True, - ), "openai/gpt-4.1-mini-2025-04-14": OpenAIModelArgs( model_name="gpt-4.1-mini-2025-04-14", max_total_tokens=128_000, @@ -117,6 +101,7 @@ max_input_tokens=400_000 - 4_000, max_new_tokens=4_000, temperature=1, # temperature param not supported by gpt-5 + vision_support=True, ), "openai/gpt-5-mini-2025-08-07": OpenAIModelArgs( model_name="gpt-5-mini-2025-08-07", @@ -124,6 +109,7 @@ max_input_tokens=400_000 - 4_000, max_new_tokens=4_000, temperature=1, # temperature param not supported by gpt-5 + vision_support=True, ), "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs( model_name="gpt-35-turbo", From f7d154551c03bc427343af4e22426b87c040274e Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 28 Aug 2025 15:06:00 +0200 Subject: [PATCH 10/23] load env vars in codegen agent --- src/agentlab/agents/human_trace_recorder/codegen_agent.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py index 16d0222c..cd902bd2 100644 --- a/src/agentlab/agents/human_trace_recorder/codegen_agent.py +++ b/src/agentlab/agents/human_trace_recorder/codegen_agent.py @@ -14,8 +14,6 @@ from pathlib import Path import bgym -from playwright.sync_api import Page - from agentlab.agents.agent_args import AgentArgs from browsergym.core.observation import ( extract_dom_extra_properties, @@ -25,7 +23,10 @@ extract_screenshot, ) from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html +from dotenv import load_dotenv +from playwright.sync_api import Page +load_dotenv() def extract_log_message_from_pw_trace(pw_trace_file_path): zip_file = zipfile.ZipFile(pw_trace_file_path, "r") From 55ce26a2f85e02b965c31a06660aa4f2518937b5 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 29 Aug 2025 17:45:41 +0200 Subject: [PATCH 11/23] same hints retrieval for both generic and tooluse agents --- .../generic_agent_hinter/generic_agent.py | 3 +- .../generic_agent_prompt.py | 65 +++++++++---- .../agents/tool_use_agent/tool_use_agent.py | 91 +++++++++++++------ 3 files changed, 109 insertions(+), 50 deletions(-) diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent.py b/src/agentlab/agents/generic_agent_hinter/generic_agent.py index 91b2f70f..cfbd19bd 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent.py @@ -111,10 +111,11 @@ def get_action(self, obs): previous_plan=self.plan, step=self.plan_step, flags=self.flags, + llm=self.chat_llm, ) # Set task name for task hints if available - if self.flags.use_task_hint and hasattr(self, 'task_name'): + if self.flags.use_task_hint and hasattr(self, "task_name"): main_prompt.set_task_name(self.task_name) max_prompt_tokens, max_trunc_itr = self._get_maxes() diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py index bc12cc2c..f529fd78 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py @@ -6,15 +6,16 @@ import logging from dataclasses import dataclass +from pathlib import Path +from typing import Literal -from browsergym.core import action +import pandas as pd from browsergym.core.action.base import AbstractActionSet from agentlab.agents import dynamic_prompting as dp +from agentlab.agents.tool_use_agent.tool_use_agent import HintsSource +from agentlab.llm.chat_api import ChatModel from agentlab.llm.llm_utils import HumanMessage, parse_html_tags_raise -import fnmatch -import pandas as pd -from pathlib import Path @dataclass @@ -49,6 +50,7 @@ class GenericPromptFlags(dp.Flags): use_abstract_example: bool = False use_hints: bool = False use_task_hint: bool = False + task_hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct" hint_db_path: str = None enable_chat: bool = False max_prompt_tokens: int = None @@ -70,10 +72,12 @@ def __init__( previous_plan: str, step: int, flags: GenericPromptFlags, + llm: ChatModel, ) -> None: super().__init__() self.flags = flags self.history = dp.History(obs_history, actions, memories, thoughts, flags.obs) + goal = obs_history[-1]["goal_object"] if self.flags.enable_chat: self.instructions = dp.ChatInstructions( obs_history[-1]["chat_messages"], extra_instructions=flags.extra_instructions @@ -84,7 +88,7 @@ def __init__( "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`." ) self.instructions = dp.GoalInstructions( - obs_history[-1]["goal_object"], extra_instructions=flags.extra_instructions + goal, extra_instructions=flags.extra_instructions ) self.obs = dp.Observation( @@ -105,7 +109,10 @@ def time_for_caution(): self.hints = dp.Hints(visible=lambda: flags.use_hints) self.task_hint = TaskHint( use_task_hint=flags.use_task_hint, - hint_db_path=flags.hint_db_path + hint_db_path=flags.hint_db_path, + goal=goal, + hint_retrieval_mode=flags.task_hint_retrieval_mode, + llm=llm, ) self.plan = Plan(previous_plan, step, lambda: flags.use_plan) # TODO add previous plan self.criticise = Criticise(visible=lambda: flags.use_criticise) @@ -114,12 +121,12 @@ def time_for_caution(): @property def _prompt(self) -> HumanMessage: prompt = HumanMessage(self.instructions.prompt) - + # Add task hints if enabled task_hints_text = "" - if self.flags.use_task_hint and hasattr(self, 'task_name'): + if self.flags.use_task_hint and hasattr(self, "task_name"): task_hints_text = self.task_hint.get_hints_for_task(self.task_name) - + prompt.add_text( f"""\ {self.obs.prompt}\ @@ -286,11 +293,21 @@ def _parse_answer(self, text_answer): class TaskHint(dp.PromptElement): - def __init__(self, use_task_hint: bool = True, hint_db_path: str = None) -> None: + def __init__( + self, + use_task_hint: bool, + hint_db_path: str, + goal: str, + hint_retrieval_mode: Literal["direct", "llm", "emb"], + llm: ChatModel, + ) -> None: super().__init__(visible=use_task_hint) self.use_task_hint = use_task_hint self.hint_db_rel_path = "hint_db.csv" self.hint_db_path = hint_db_path # Allow external path override + self.hint_retrieval_mode: Literal["direct", "llm", "emb"] = hint_retrieval_mode + self.goal = goal + self.llm = llm self._init() _prompt = "" # Task hints are added dynamically in MainPrompt @@ -316,39 +333,49 @@ def _init(self): hint_db_path = Path(self.hint_db_path) else: hint_db_path = Path(__file__).parent / self.hint_db_rel_path - + if hint_db_path.exists(): self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str) # Verify the expected columns exist if "task_name" not in self.hint_db.columns or "hint" not in self.hint_db.columns: - print(f"Warning: Hint database missing expected columns. Found: {list(self.hint_db.columns)}") + print( + f"Warning: Hint database missing expected columns. Found: {list(self.hint_db.columns)}" + ) self.hint_db = pd.DataFrame(columns=["task_name", "hint"]) else: print(f"Warning: Hint database not found at {hint_db_path}") self.hint_db = pd.DataFrame(columns=["task_name", "hint"]) + self.hints_source = HintsSource( + hint_db_path=self.hint_db_rel_path, + hint_retrieval_mode=self.hint_retrieval_mode, + ) except Exception as e: # Fallback to empty database on any error print(f"Warning: Could not load hint database: {e}") self.hint_db = pd.DataFrame(columns=["task_name", "hint"]) - def get_hints_for_task(self, task_name: str) -> str: """Get hints for a specific task.""" if not self.use_task_hint: return "" # Ensure hint_db is initialized - if not hasattr(self, 'hint_db'): + if not hasattr(self, "hint_db"): self._init() # Check if hint_db has the expected structure - if self.hint_db.empty or "task_name" not in self.hint_db.columns or "hint" not in self.hint_db.columns: + if ( + self.hint_db.empty + or "task_name" not in self.hint_db.columns + or "hint" not in self.hint_db.columns + ): return "" try: - task_hints = self.hint_db[ - self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) - ] + # task_hints = self.hint_db[ + # self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) + # ] + task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal) hints = [] for hint in task_hints["hint"]: @@ -364,5 +391,5 @@ def get_hints_for_task(self, task_name: str) -> str: return hints_str except Exception as e: print(f"Warning: Error getting hints for task {task_name}: {e}") - + return "" diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 375c829e..9025107e 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -28,6 +28,7 @@ from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark from agentlab.benchmarks.osworld import OSWorldActionSet from agentlab.llm.base_api import BaseModelArgs +from agentlab.llm.chat_api import ChatModel from agentlab.llm.llm_utils import image_to_png_base64_url from agentlab.llm.response_api import ( APIPayload, @@ -316,39 +317,21 @@ class TaskHint(Block): def _init(self): """Initialize the block.""" - if Path(self.hint_db_rel_path).is_absolute(): - hint_db_path = Path(self.hint_db_rel_path) - else: - hint_db_path = Path(__file__).parent / self.hint_db_rel_path - self.hint_db = pd.read_csv(hint_db_path, header=0, index_col=None, dtype=str) - if self.hint_retrieval_mode == "emb": - self.encode_hints() - - def oai_embed(self, text: str): - response = self._oai_emb.create(input=text, model="text-embedding-3-small") - return response.data[0].embedding - - def encode_hints(self): - self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first") - logger.info( - f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model." + self.hints_source = HintsSource( + hint_db_path=self.hint_db_rel_path, + hint_retrieval_mode=self.hint_retrieval_mode, + top_n=self.top_n, + embedder_model=self.embedder_model, + embedder_server=self.embedder_server, + llm_prompt=self.llm_prompt, ) - hints = self.uniq_hints["hint"].tolist() - semantic_keys = self.uniq_hints["semantic_keys"].tolist() - lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)] - emb_path = f"{self.hint_db_rel_path}.embs.npy" - assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}" - logger.info(f"Loading hint embeddings from: {emb_path}") - emb_dict = np.load(emb_path, allow_pickle=True).item() - self.hint_embeddings = np.array([emb_dict[k] for k in lines]) - logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}") def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: if not self.use_task_hint: return {} goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content]) - task_hints = self.choose_hints(llm, task_name, goal) + task_hints = self.hints_source.choose_hints(llm, task_name, goal) hints = [] for hint in task_hints: @@ -365,6 +348,49 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: discussion.append(msg) + +class HintsSource: + def __init__( + self, + hint_db_path: str, + hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct", + top_n: int = 4, + embedder_model: str = "Qwen/Qwen3-Embedding-0.6B", + embedder_server: str = "http://localhost:5000", + llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n +You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n +Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1.""", + ) -> None: + self.hint_db_path = hint_db_path + self.hint_retrieval_mode = hint_retrieval_mode + self.top_n = top_n + self.embedder_model = embedder_model + self.embedder_server = embedder_server + self.llm_prompt = llm_prompt + + if Path(hint_db_path).is_absolute(): + self.hint_db_path = Path(hint_db_path).as_posix() + else: + self.hint_db_path = (Path(__file__).parent / self.hint_db_path).as_posix() + self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str) + if self.hint_retrieval_mode == "emb": + self.load_hint_vectors() + + def load_hint_vectors(self): + self.uniq_hints = self.hint_db.drop_duplicates(subset=["hint"], keep="first") + logger.info( + f"Encoding {len(self.uniq_hints)} unique hints with semantic keys using {self.embedder_model} model." + ) + hints = self.uniq_hints["hint"].tolist() + semantic_keys = self.uniq_hints["semantic_keys"].tolist() + lines = [f"{k}: {h}" for h, k in zip(hints, semantic_keys)] + emb_path = f"{self.hint_db_path}.embs.npy" + assert os.path.exists(emb_path), f"Embedding file not found: {emb_path}" + logger.info(f"Loading hint embeddings from: {emb_path}") + emb_dict = np.load(emb_path, allow_pickle=True).item() + self.hint_embeddings = np.array([emb_dict[k] for k in lines]) + logger.info(f"Loaded hint embeddings shape: {self.hint_embeddings.shape}") + def choose_hints(self, llm, task_name: str, goal: str) -> list[str]: """Choose hints based on the task name.""" if self.hint_retrieval_mode == "llm": @@ -384,11 +410,14 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: hint_topics = list(topic_to_hints.keys()) topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)]) prompt = self.llm_prompt.format(goal=goal, topics=topics) - response = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])) + if isinstance(llm, ChatModel): + response: str = llm(messages=[dict(role="user", content=prompt)])["content"] + else: + response: str = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])).think try: - hint_topic_idx = json.loads(response.think) + hint_topic_idx = json.loads(response) if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics): - logger.error(f"Wrong LLM hint id response: {response.think}, no hints") + logger.error(f"Wrong LLM hint id response: {response}, no hints") return [] hint_topic = hint_topics[hint_topic_idx] hint_indices = topic_to_hints[hint_topic] @@ -397,7 +426,7 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: hints = df["hint"].tolist() logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}") except json.JSONDecodeError: - logger.error(f"Failed to parse LLM hint id response: {response.think}, no hints") + logger.error(f"Failed to parse LLM hint id response: {response}, no hints") hints = [] return hints @@ -427,6 +456,7 @@ def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_ret raise e time.sleep(random.uniform(1, timeout)) continue + raise ValueError("Failed to encode hints") def _similarity( self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5 @@ -446,6 +476,7 @@ def _similarity( raise e time.sleep(random.uniform(1, timeout)) continue + raise ValueError("Failed to compute similarity") def choose_hints_direct(self, task_name: str) -> list[str]: hints = self.hint_db[ From cad12096f312cfd74de24f0b50ba4010f12953f3 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 13:51:53 +0200 Subject: [PATCH 12/23] filter out current task hints if needed --- .../agents/tool_use_agent/tool_use_agent.py | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 9025107e..4e6de3b3 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -341,7 +341,7 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: if len(hints) > 0: hints_str = ( - "# Hints:\nHere are some hints for the task you are working on:\n" + "\n# Hints:\nHere are some hints for the task you are working on:\n" + "\n".join(hints) ) msg = llm.msg.user().add_text(hints_str) @@ -354,6 +354,7 @@ def __init__( self, hint_db_path: str, hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct", + skip_hints_for_current_task: bool = False, top_n: int = 4, embedder_model: str = "Qwen/Qwen3-Embedding-0.6B", embedder_server: str = "http://localhost:5000", @@ -363,6 +364,7 @@ def __init__( ) -> None: self.hint_db_path = hint_db_path self.hint_retrieval_mode = hint_retrieval_mode + self.skip_hints_for_current_task = skip_hints_for_current_task self.top_n = top_n self.embedder_model = embedder_model self.embedder_server = embedder_server @@ -405,7 +407,14 @@ def choose_hints(self, llm, task_name: str, goal: str) -> list[str]: def choose_hints_llm(self, llm, goal: str) -> list[str]: """Choose hints using LLM to filter the hints.""" topic_to_hints = defaultdict(list) - for i, row in self.hint_db.iterrows(): + hints_df = self.hint_db + if self.skip_hints_for_current_task: + current_task_hints = self.get_current_task_hints(task_name) + hints_df = hints_df[~hints_df["hint"].isin(current_task_hints)] + logger.info( + f"Filtered out current task hints, remaining hints: {hints_df.shape[0]} out of {self.hint_db.shape[0]}" + ) + for i, row in hints_df.iterrows(): topic_to_hints[row["semantic_keys"]].append(i) hint_topics = list(topic_to_hints.keys()) topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)]) @@ -421,10 +430,10 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: return [] hint_topic = hint_topics[hint_topic_idx] hint_indices = topic_to_hints[hint_topic] - df = self.hint_db.iloc[hint_indices].copy() + df = hints_df.iloc[hint_indices].copy() df = df.drop_duplicates(subset=["hint"], keep="first") # leave only unique hints hints = df["hint"].tolist() - logger.debug(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}") + logger.info(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}") except json.JSONDecodeError: logger.error(f"Failed to parse LLM hint id response: {response}, no hints") hints = [] @@ -433,10 +442,21 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: def choose_hints_emb(self, goal: str) -> list[str]: """Choose hints using embeddings to filter the hints.""" goal_embeddings = self._encode([goal], prompt="task description") - similarities = self._similarity(goal_embeddings.tolist(), self.hint_embeddings.tolist()) + hint_embeddings = self.hint_embeddings + hints_df = self.uniq_hints + if self.skip_hints_for_current_task: + current_task_hints = self.get_current_task_hints(task_name) + mask = ~hints_df["hint"].isin(current_task_hints) + hints_df = hints_df[mask] + filtered_indices = hints_df.index.tolist() + hint_embeddings = hint_embeddings[filtered_indices] + logger.info( + f"Filtered same task hint, remained: {len(hint_embeddings)} out of {len(self.hint_embeddings)} embeddings" + ) + similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings.tolist()) top_indices = similarities.argsort()[0][-self.top_n :].tolist() logger.info(f"Top hint indices based on embedding similarity: {top_indices}") - hints = self.uniq_hints.iloc[top_indices] + hints = hints_df.iloc[top_indices] logger.info(f"Embedding-based hints chosen: {hints}") return hints["hint"].tolist() @@ -479,10 +499,15 @@ def _similarity( raise ValueError("Failed to compute similarity") def choose_hints_direct(self, task_name: str) -> list[str]: - hints = self.hint_db[ + hints = self.get_current_task_hints(task_name) + logger.info(f"Direct hints chosen: {hints}") + return hints + + def get_current_task_hints(self, task_name): + hints_df = self.hint_db[ self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) ] - return hints["hint"].tolist() + return hints_df["hint"].tolist() @dataclass From d920b8eb6cae5e39ba5f1a49bd1b73b633294e6c Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 13:52:08 +0200 Subject: [PATCH 13/23] fix llm config, add gpt-5 --- src/agentlab/llm/llm_configs.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index afbf094f..7ac2450a 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -111,6 +111,14 @@ temperature=1, # temperature param not supported by gpt-5 vision_support=True, ), + "openai/gpt-5-2025-08-07": OpenAIModelArgs( + model_name="gpt-5-2025-08-07", + max_total_tokens=400_000, + max_input_tokens=400_000 - 4_000, + max_new_tokens=4_000, + temperature=1, # temperature param not supported by gpt-5 + vision_support=True, + ), "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs( model_name="gpt-35-turbo", deployment_name="gpt-35-turbo", From 5315f14b2b5b57f43e23a0da0eec6b31f273ce99 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 13:52:21 +0200 Subject: [PATCH 14/23] fix --- .../agents/generic_agent_hinter/generic_agent_prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py index f529fd78..19f0efda 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py @@ -378,7 +378,7 @@ def get_hints_for_task(self, task_name: str) -> str: task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal) hints = [] - for hint in task_hints["hint"]: + for hint in task_hints: hint = hint.strip() if hint: hints.append(f"- {hint}") From 26f0abb36fc80999576cc7beec065f3da07dbb1e Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 14:35:05 +0200 Subject: [PATCH 15/23] pass new flag and fix db path passing issue --- .../generic_agent_hinter/generic_agent_prompt.py | 7 ++++++- src/agentlab/agents/tool_use_agent/tool_use_agent.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py index 19f0efda..84b5d332 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py @@ -51,6 +51,7 @@ class GenericPromptFlags(dp.Flags): use_hints: bool = False use_task_hint: bool = False task_hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct" + skip_hints_for_current_task: bool = False hint_db_path: str = None enable_chat: bool = False max_prompt_tokens: int = None @@ -113,6 +114,7 @@ def time_for_caution(): goal=goal, hint_retrieval_mode=flags.task_hint_retrieval_mode, llm=llm, + skip_hints_for_current_task=flags.skip_hints_for_current_task, ) self.plan = Plan(previous_plan, step, lambda: flags.use_plan) # TODO add previous plan self.criticise = Criticise(visible=lambda: flags.use_criticise) @@ -299,6 +301,7 @@ def __init__( hint_db_path: str, goal: str, hint_retrieval_mode: Literal["direct", "llm", "emb"], + skip_hints_for_current_task: bool, llm: ChatModel, ) -> None: super().__init__(visible=use_task_hint) @@ -306,6 +309,7 @@ def __init__( self.hint_db_rel_path = "hint_db.csv" self.hint_db_path = hint_db_path # Allow external path override self.hint_retrieval_mode: Literal["direct", "llm", "emb"] = hint_retrieval_mode + self.skip_hints_for_current_task = skip_hints_for_current_task self.goal = goal self.llm = llm self._init() @@ -346,8 +350,9 @@ def _init(self): print(f"Warning: Hint database not found at {hint_db_path}") self.hint_db = pd.DataFrame(columns=["task_name", "hint"]) self.hints_source = HintsSource( - hint_db_path=self.hint_db_rel_path, + hint_db_path=hint_db_path.as_posix(), hint_retrieval_mode=self.hint_retrieval_mode, + skip_hints_for_current_task=self.skip_hints_for_current_task, ) except Exception as e: # Fallback to empty database on any error diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 4e6de3b3..b8f21431 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -375,6 +375,7 @@ def __init__( else: self.hint_db_path = (Path(__file__).parent / self.hint_db_path).as_posix() self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str) + logger.info(f"Loaded {len(self.hint_db)} hints from database {self.hint_db_path}") if self.hint_retrieval_mode == "emb": self.load_hint_vectors() @@ -395,16 +396,19 @@ def load_hint_vectors(self): def choose_hints(self, llm, task_name: str, goal: str) -> list[str]: """Choose hints based on the task name.""" + logger.info( + f"Choosing hints for task: {task_name}, goal: {goal} from db: {self.hint_db_path} using mode: {self.hint_retrieval_mode}" + ) if self.hint_retrieval_mode == "llm": - return self.choose_hints_llm(llm, goal) + return self.choose_hints_llm(llm, goal, task_name) elif self.hint_retrieval_mode == "direct": return self.choose_hints_direct(task_name) elif self.hint_retrieval_mode == "emb": - return self.choose_hints_emb(goal) + return self.choose_hints_emb(goal, task_name) else: raise ValueError(f"Unknown hint retrieval mode: {self.hint_retrieval_mode}") - def choose_hints_llm(self, llm, goal: str) -> list[str]: + def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]: """Choose hints using LLM to filter the hints.""" topic_to_hints = defaultdict(list) hints_df = self.hint_db @@ -439,7 +443,7 @@ def choose_hints_llm(self, llm, goal: str) -> list[str]: hints = [] return hints - def choose_hints_emb(self, goal: str) -> list[str]: + def choose_hints_emb(self, goal: str, task_name: str) -> list[str]: """Choose hints using embeddings to filter the hints.""" goal_embeddings = self._encode([goal], prompt="task description") hint_embeddings = self.hint_embeddings From 5393a34112beab3e92e339d429947c871bfeb67e Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 14:59:54 +0200 Subject: [PATCH 16/23] fix goal text --- .../agents/generic_agent_hinter/generic_agent_prompt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py index 84b5d332..b684b6c9 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py @@ -108,10 +108,11 @@ def time_for_caution(): self.be_cautious = dp.BeCautious(visible=time_for_caution) self.think = dp.Think(visible=lambda: flags.use_thinking) self.hints = dp.Hints(visible=lambda: flags.use_hints) + goal_str: str = goal[0]["text"] self.task_hint = TaskHint( use_task_hint=flags.use_task_hint, hint_db_path=flags.hint_db_path, - goal=goal, + goal=goal_str, hint_retrieval_mode=flags.task_hint_retrieval_mode, llm=llm, skip_hints_for_current_task=flags.skip_hints_for_current_task, From deddc50697b3871077d8000b2fa3fe0b48649b5d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 1 Sep 2025 17:35:22 +0200 Subject: [PATCH 17/23] fix current task hints exclusion --- .../agents/tool_use_agent/tool_use_agent.py | 82 ++++++++++--------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index b8f21431..bd200da3 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -411,58 +411,62 @@ def choose_hints(self, llm, task_name: str, goal: str) -> list[str]: def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]: """Choose hints using LLM to filter the hints.""" topic_to_hints = defaultdict(list) - hints_df = self.hint_db + skip_hints = [] if self.skip_hints_for_current_task: - current_task_hints = self.get_current_task_hints(task_name) - hints_df = hints_df[~hints_df["hint"].isin(current_task_hints)] - logger.info( - f"Filtered out current task hints, remaining hints: {hints_df.shape[0]} out of {self.hint_db.shape[0]}" - ) - for i, row in hints_df.iterrows(): - topic_to_hints[row["semantic_keys"]].append(i) + skip_hints = self.get_current_task_hints(task_name) + for _, row in self.hint_db.iterrows(): + hint = row["hint"] + if hint in skip_hints: + continue + topic_to_hints[row["semantic_keys"]].append(hint) + logger.info(f"Collected {len(topic_to_hints)} hint topics") hint_topics = list(topic_to_hints.keys()) topics = "\n".join([f"{i}. {h}" for i, h in enumerate(hint_topics)]) prompt = self.llm_prompt.format(goal=goal, topics=topics) + if isinstance(llm, ChatModel): response: str = llm(messages=[dict(role="user", content=prompt)])["content"] else: response: str = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])).think try: - hint_topic_idx = json.loads(response) - if hint_topic_idx < 0 or hint_topic_idx >= len(hint_topics): + topic_number = json.loads(response) + if topic_number < 0 or topic_number >= len(hint_topics): logger.error(f"Wrong LLM hint id response: {response}, no hints") return [] - hint_topic = hint_topics[hint_topic_idx] - hint_indices = topic_to_hints[hint_topic] - df = hints_df.iloc[hint_indices].copy() - df = df.drop_duplicates(subset=["hint"], keep="first") # leave only unique hints - hints = df["hint"].tolist() - logger.info(f"LLM hint topic {hint_topic_idx}, chosen hints: {df['hint'].tolist()}") - except json.JSONDecodeError: - logger.error(f"Failed to parse LLM hint id response: {response}, no hints") + hint_topic = hint_topics[topic_number] + hints = list(set(topic_to_hints[hint_topic])) + logger.info(f"LLM hint topic {topic_number}:'{hint_topic}', chosen hints: {hints}") + except Exception as e: + logger.exception(f"Failed to parse LLM hint id response: {response}:\n{e}") hints = [] return hints def choose_hints_emb(self, goal: str, task_name: str) -> list[str]: """Choose hints using embeddings to filter the hints.""" - goal_embeddings = self._encode([goal], prompt="task description") - hint_embeddings = self.hint_embeddings - hints_df = self.uniq_hints - if self.skip_hints_for_current_task: - current_task_hints = self.get_current_task_hints(task_name) - mask = ~hints_df["hint"].isin(current_task_hints) - hints_df = hints_df[mask] - filtered_indices = hints_df.index.tolist() - hint_embeddings = hint_embeddings[filtered_indices] - logger.info( - f"Filtered same task hint, remained: {len(hint_embeddings)} out of {len(self.hint_embeddings)} embeddings" - ) - similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings.tolist()) - top_indices = similarities.argsort()[0][-self.top_n :].tolist() - logger.info(f"Top hint indices based on embedding similarity: {top_indices}") - hints = hints_df.iloc[top_indices] - logger.info(f"Embedding-based hints chosen: {hints}") - return hints["hint"].tolist() + try: + goal_embeddings = self._encode([goal], prompt="task description") + hint_embeddings = self.hint_embeddings.copy() + all_hints = self.uniq_hints["hint"].tolist() + skip_hints = [] + if self.skip_hints_for_current_task: + skip_hints = self.get_current_task_hints(task_name) + hint_embeddings = [] + id_to_hint = {} + for hint, emb in zip(all_hints, self.hint_embeddings): + if hint in skip_hints: + continue + hint_embeddings.append(emb.tolist()) + id_to_hint[len(hint_embeddings) - 1] = hint + logger.info(f"Prepared hint embeddings for {len(hint_embeddings)} hints") + similarities = self._similarity(goal_embeddings.tolist(), hint_embeddings) + top_indices = similarities.argsort()[0][-self.top_n :].tolist() + logger.info(f"Top hint indices based on embedding similarity: {top_indices}") + hints = [id_to_hint[idx] for idx in top_indices] + logger.info(f"Embedding-based hints chosen: {hints}") + except Exception as e: + logger.exception(f"Failed to choose hints using embeddings: {e}") + hints = [] + return hints def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_retries: int = 5): """Call the encode API endpoint with timeout and retries""" @@ -483,7 +487,11 @@ def _encode(self, texts: list[str], prompt: str = "", timeout: int = 10, max_ret raise ValueError("Failed to encode hints") def _similarity( - self, texts1: list[str], texts2: list[str], timeout: int = 2, max_retries: int = 5 + self, + texts1: list, + texts2: list, + timeout: int = 2, + max_retries: int = 5, ): """Call the similarity API endpoint with timeout and retries""" for attempt in range(max_retries): From b9d09d4d8d2ee557a04b76c358496c21cc1657cd Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 2 Sep 2025 11:44:38 +0200 Subject: [PATCH 18/23] remove old reqs --- requirements.txt | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a2798f2e..00000000 --- a/requirements.txt +++ /dev/null @@ -1,31 +0,0 @@ -black[jupyter]>=24.2.0 -blacken-docs -pre-commit -pytest==7.3.2 -flaky -pytest-xdist -pytest-playwright -pydantic~=2.9 -dask -distributed -browsergym>=0.7.1 -joblib>=1.2.0 -openai>=1.7,<2 -langchain_community -tiktoken -tapeagents[converters] -huggingface_hub -contexttimer -ipython -pyyaml>=6 -pandas -gradio>=5.5 # issue with DataFrame scrolling before 5.5 -gitpython # for the reproducibility script -requests -matplotlib -ray[default] -python-slugify -pillow -gymnasium>=0.27 -sentence-transformers>=5.0.0 -python-dotenv>=1.1.1 \ No newline at end of file From 725e7a03750780263cb6ce0190ef252fc2e3d688 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 2 Sep 2025 11:45:54 +0200 Subject: [PATCH 19/23] remove recorder from that brach --- .../agents/human_trace_recorder/agent.py | 302 ---------- .../human_trace_recorder/codegen_agent.py | 193 ------ .../human_trace_recorder/event_listeners.py | 563 ------------------ 3 files changed, 1058 deletions(-) delete mode 100644 src/agentlab/agents/human_trace_recorder/agent.py delete mode 100644 src/agentlab/agents/human_trace_recorder/codegen_agent.py delete mode 100644 src/agentlab/agents/human_trace_recorder/event_listeners.py diff --git a/src/agentlab/agents/human_trace_recorder/agent.py b/src/agentlab/agents/human_trace_recorder/agent.py deleted file mode 100644 index 556922af..00000000 --- a/src/agentlab/agents/human_trace_recorder/agent.py +++ /dev/null @@ -1,302 +0,0 @@ -"""Human Trace Agent for Browser Automation Training Data - -Captures human interactions at each step including: - - Comprehensive action tracking (clicks, input, navigation, etc.) - - Saves only human_action.json files in simple numbered folders -""" - -from __future__ import annotations - -import json -import time -from dataclasses import dataclass -from pathlib import Path - -import bgym -from playwright.sync_api import Page - -from agentlab.agents.agent_args import AgentArgs -from agentlab.agents.human_trace_recorder.event_listeners import ( - get_interaction_tracking_script, - get_recording_indicators_script, -) -from browsergym.core.observation import ( - extract_dom_extra_properties, - extract_dom_snapshot, - extract_focused_element_bid, - extract_merged_axtree, - extract_screenshot, -) -from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html - - -@dataclass -class HumanTraceAgentArgs(AgentArgs): - agent_name: str = "HumanTraceAgent" - trace_dir: str = "human_traces" - use_raw_page_output: bool = True - - def make_agent(self) -> bgym.Agent: # type: ignore[override] - return HumanTraceAgent(self.trace_dir) - - def set_reproducibility_mode(self): - pass - - -class HumanTraceAgent(bgym.Agent): - def __init__(self, trace_dir: str): - self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False) - self._root = Path(trace_dir) - self._root.mkdir(parents=True, exist_ok=True) - self._page: Page | None = None - self._step = 0 - - def obs_preprocessor(self, obs: dict): # type: ignore[override] - if isinstance(obs, dict): - self._page = obs.get("page") - # Remove the page object from obs to avoid pickle issues - if "page" in obs: - del obs["page"] - - obs["screenshot"] = extract_screenshot(self._page) - obs["dom_object"] = extract_dom_snapshot(self._page) - obs["axtree_object"] = extract_merged_axtree(self._page) - scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0) - extra_properties = extract_dom_extra_properties( - obs["dom_object"], scale_factor=scale_factor - ) - obs["extra_element_properties"] = extra_properties - obs["focused_element_bid"] = extract_focused_element_bid(self._page) - - # Add text representations for easier analysis - if obs["axtree_object"]: - axt = obs["axtree_object"] - if extra_properties: - obs["axtree_txt"] = flatten_axtree_to_str(axt) - - if obs["dom_object"]: - obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"]) - obs["pruned_html"] = prune_html(obs["dom_txt"]) - return obs - - def get_action(self, obs: dict): # type: ignore[override] - if self._page is None: - raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True") - - page = self._page - step_dir = self._create_step_directory() - - self._display_recording_prompt() - self._show_recording_indicators(page) - - # Capture human interactions - captured_action, human_interactions = self._capture_interactions_with_js(page, step_dir) - - # Save and cleanup - self._save_human_action(captured_action, step_dir) - self._cleanup_indicators(page) - - self._step += 1 - return "noop()", { - "extra_info": { - "step": self._step - 1, - "human_interactions": human_interactions, - } - } - - def _create_step_directory(self) -> Path: - """Create directory for current step.""" - step_dir = self._root / str(self._step) - step_dir.mkdir(parents=True, exist_ok=True) - return step_dir - - def _display_recording_prompt(self): - """Display prompt messages to user.""" - print(f"[HumanTrace] Step {self._step}: Perform ONE action") - print("[HumanTrace] ⚠️ WAIT FOR THE RED BORDER TO APPEAR BEFORE PERFORMING ANY ACTION ⚠️") - print("[HumanTrace] The system will automatically save after detecting your action") - - def _show_recording_indicators(self, page: Page): - """Show visual recording indicators on the page.""" - page.evaluate(get_recording_indicators_script()) - - def _save_human_action(self, captured_action: dict, step_dir: Path): - """Save the captured human action to JSON file.""" - try: - human_action_path = step_dir / "human_action.json" - if captured_action and isinstance(captured_action, dict): - human_action_path.write_text(json.dumps(captured_action, indent=2)) - action_type = captured_action.get("type", "unknown") - else: - # Create empty action record for consistency - empty_action = { - "type": "no_action", - "timestamp": time.time() * 1000, - "reason": "No meaningful human action captured in this step", - } - human_action_path.write_text(json.dumps(empty_action, indent=2)) - action_type = "no_action" - - print(f"[HumanTrace] Step {self._step} complete - Action: {action_type}") - - except Exception as e: - print(f"[HumanTrace] Warning: Failed to save human action: {e}") - - def _cleanup_indicators(self, page: Page): - """Remove recording indicators from the page.""" - page.evaluate("document.getElementById('__rec')?.remove(); document.getElementById('__rec_border')?.remove()") - - def _capture_interactions_with_js(self, page: Page, step_dir: Path) -> tuple[dict, str]: - """Capture human interactions using JavaScript injection.""" - try: - print("[HumanTrace] JavaScript interaction tracking enabled") - initial_url, initial_title = page.url, page.title() - - # Inject interaction tracking - self._inject_interaction_tracking(page) - - # Wait for user action - self._wait_for_user_action(page) - - # Collect and process interaction data - return self._collect_interaction_data(page, initial_url, initial_title) - - except Exception as e: - print(f"[HumanTrace] Error: {e}") - return { - "type": "error", - "timestamp": time.time() * 1000, - "error": str(e), - }, f"Error: {e}" - - def _inject_interaction_tracking(self, page: Page): - """Inject JavaScript code for comprehensive interaction tracking.""" - tracking_script = get_interaction_tracking_script() - page.evaluate(tracking_script) - - def _wait_for_user_action(self, page: Page): - """Wait for user to perform an action.""" - start_time = time.time() - while time.time() - start_time < 300: - try: - action_detected = page.evaluate("window.__acted || false") - if action_detected: - print(f"[HumanTrace] Action detected! Exiting immediately...") - break - except Exception as e: - print(f"[HumanTrace] Debug: Error checking actions: {e}") - pass - time.sleep(0.1) - - def _collect_interaction_data(self, page: Page, initial_url: str, initial_title: str) -> tuple[dict, str]: - """Collect and format interaction data.""" - try: - action_detected = page.evaluate("window.__acted || false") - interactions = page.evaluate("window.__interactions || []") - - action_data = { - "type": "human_interactions" if action_detected else "no_action", - "timestamp": time.time() * 1000, - "detected": action_detected, - "interactions": interactions, - "interaction_count": len(interactions) - } - - summary = self._create_interaction_summary(interactions) - self._add_page_change_info(action_data, initial_url, initial_title, page) - - print(f"[HumanTrace] {summary}") - return action_data, summary - - except Exception as e: - return { - "type": "error", - "timestamp": time.time() * 1000, - "detected": False, - "error": str(e), - "interactions": [], - "interaction_count": 0 - }, f"Error collecting interactions: {e}" - - def _create_interaction_summary(self, interactions: list) -> str: - """Create a summary string of captured interactions.""" - if interactions: - interaction_types = {} - for interaction in interactions: - itype = interaction.get('type', 'unknown') - interaction_types[itype] = interaction_types.get(itype, 0) + 1 - - summary_parts = [] - for itype, count in interaction_types.items(): - summary_parts.append(f"{itype}:{count}") - return f"Captured {len(interactions)} interactions: {', '.join(summary_parts)}" - else: - return "No interactions detected" - - def _add_page_change_info(self, action_data: dict, initial_url: str, initial_title: str, page: Page): - """Add page change information to action data.""" - final_url, final_title = page.url, page.title() - if initial_url != final_url or initial_title != final_title: - action_data["page_changed"] = True - action_data["url_change"] = {"from": initial_url, "to": final_url} - action_data["title_change"] = {"from": initial_title, "to": final_title} - - def _format_js_interaction_summary(self, action_data, interaction_log): - """Format JavaScript-captured interactions into readable summary.""" - lines = ["Human Interactions (JavaScript Tracking):"] - - if action_data["interactions"]: - lines.append(f"Total Actions: {len(action_data['interactions'])}") - lines.append("") - - # Group interactions by type - by_type = {} - for interaction in action_data["interactions"]: - interaction_type = interaction["type"] - if interaction_type not in by_type: - by_type[interaction_type] = [] - by_type[interaction_type].append(interaction) - - # Show summary by type - for interaction_type, interactions in by_type.items(): - lines.append(f"{interaction_type.title()}: {len(interactions)} actions") - - lines.append("") - lines.append("Detailed Actions:") - - # Add each interaction from the log - for log_entry in interaction_log: - lines.append(f" {log_entry}") - else: - lines.append("No interactions detected - user may have just observed the page") - - # Add page state changes if URL changed - if action_data.get("page_changed"): - url_info = action_data.get("url") - if url_info: - lines.append("") - lines.append("� Page Navigation:") - lines.append(f" From: {url_info['from']}") - lines.append(f" To: {url_info['to']}") - - return "\n".join(lines) - - -HUMAN_TRACE_AGENT = HumanTraceAgentArgs() - - -if __name__ == "__main__": - from agentlab.agents.human_trace_recorder.agent import HUMAN_TRACE_AGENT - from agentlab.experiments.study import Study - - agent_configs = [HUMAN_TRACE_AGENT] - benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"](n_repeats=1) # type: bgym.Benchmark - benchmark = benchmark.subset_from_glob("task_name", "*filter*") - benchmark.env_args_list = benchmark.env_args_list[:1] - for env_args in benchmark.env_args_list: - print(env_args.task_name) - env_args.max_steps = 15 - env_args.headless = False - - study = Study(agent_configs, benchmark) - study.run(n_jobs=1, parallel_backend="sequential") diff --git a/src/agentlab/agents/human_trace_recorder/codegen_agent.py b/src/agentlab/agents/human_trace_recorder/codegen_agent.py deleted file mode 100644 index cd902bd2..00000000 --- a/src/agentlab/agents/human_trace_recorder/codegen_agent.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Simple Codegen Agent - -Captures human interactions using playwright inspector. -Playwright trace logs are stored in "think" messages and can be viewed in Agentlab Xray. -""" - -from __future__ import annotations - -import json -import logging -import tempfile -import zipfile -from dataclasses import dataclass -from pathlib import Path - -import bgym -from agentlab.agents.agent_args import AgentArgs -from browsergym.core.observation import ( - extract_dom_extra_properties, - extract_dom_snapshot, - extract_focused_element_bid, - extract_merged_axtree, - extract_screenshot, -) -from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html -from dotenv import load_dotenv -from playwright.sync_api import Page - -load_dotenv() - -def extract_log_message_from_pw_trace(pw_trace_file_path): - zip_file = zipfile.ZipFile(pw_trace_file_path, "r") - trace_lines = zip_file.read("trace.trace").decode("utf-8").splitlines() - - actions = [] - for line in trace_lines: - if line.strip(): - event = json.loads(line) - if event.get("type") == "log": - actions.append(event) - # Extract log messages from the trace - return [log["message"].strip() for log in sorted(actions, key=lambda x: x.get("time", 0))] - - -def clean_pw_logs(logs, exclude_blacklist=True, use_substitutions=True): - clean_logs = list(logs) - blacklist = { - "attempting click action", - "waiting for element to be visible, enabled and stable", - "element is visible, enabled and stable", - "scrolling into view if needed", - "done scrolling", - "performing click action", - "click action done", - "waiting for scheduled navigations to finish", - "navigations have finished", - } - - substitutions = [("waiting for ", "")] - - def apply_substitutions(log): - for old, new in substitutions: - log = log.replace(old, new) - return log - - if exclude_blacklist: - clean_logs = [log for log in clean_logs if log not in blacklist] - if use_substitutions: - clean_logs = [apply_substitutions(log) for log in clean_logs] - - return clean_logs - - -@dataclass -class PlayWrightCodeGenAgentArgs(AgentArgs): - agent_name: str = "PlayWrightCodeGenAgent" - trace_dir: str = "playwright_codegen_traces" - use_raw_page_output: bool = True - store_raw_trace: bool = False - - def make_agent(self) -> bgym.Agent: # type: ignore[override] - return PlayWrightCodeGenAgent(self.trace_dir, self.store_raw_trace) - - def set_reproducibility_mode(self): - pass - - -class PlayWrightCodeGenAgent(bgym.Agent): - def __init__(self, trace_dir: str, store_raw_trace: bool): - self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False) - self._root = Path(trace_dir) - self._page: Page | None = None - self._step = 0 - self.store_raw_trace = store_raw_trace - self._episode_trace_dir = None # Cache for single episode - - def _get_trace_dir(self): - """Return the trace directory based on store_raw_trace setting.""" - if self._episode_trace_dir is None: - if self.store_raw_trace: - import datetime - - dt_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - self._episode_trace_dir = self._root / f"codegen_traces_{dt_str}" - self._episode_trace_dir.mkdir(parents=True, exist_ok=True) - else: - self._episode_trace_dir = Path(tempfile.mkdtemp()) - return self._episode_trace_dir - - def obs_preprocessor(self, obs: dict): # type: ignore[override] - if isinstance(obs, dict): - self._page = obs.get("page") - obs["screenshot"] = extract_screenshot(self._page) - obs["dom_object"] = extract_dom_snapshot(self._page) - obs["axtree_object"] = extract_merged_axtree(self._page) - scale_factor = getattr(self._page, "_bgym_scale_factor", 1.0) - extra_properties = extract_dom_extra_properties( - obs["dom_object"], scale_factor=scale_factor - ) - obs["extra_element_properties"] = extra_properties - obs["focused_element_bid"] = extract_focused_element_bid(self._page) - - if obs["axtree_object"]: - obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"]) - - if obs["dom_object"]: - obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"]) - obs["pruned_html"] = prune_html(obs["dom_txt"]) - - if "page" in obs: # unpickable - del obs["page"] - - return obs - - def get_action(self, obs: dict): # type: ignore[override] - - if self._page is None: - raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True") - - page = self._page - trace_dir = self._get_trace_dir() - trace_path = trace_dir / f"step_{self._step}.zip" - page.context.tracing.start(screenshots=True, snapshots=True, sources=True) - page.context.tracing.start_chunk(name=f"step_{self._step}") - - print( - f"{'─'*60}\n" f"Step {self._step}\n", - f"{'─'*60}\n", - "1. 🔴 Start Recording (Press 'Record' in the Playwright Inspector.)\n", - "2. ✨ Perform actions for a single step.\n", - "3. ⚫ Stop Recording (Press 'Record' again to stop recording.)\n", - "4. ▶️ Press 'Resume' in the Playwright Inspector.", - ) - - page.pause() # Launch Inspector and record actions - page.context.tracing.stop_chunk(path=trace_path) - page.context.tracing.stop() - - pw_logs = extract_log_message_from_pw_trace(trace_path) - pw_logs = clean_pw_logs(pw_logs, exclude_blacklist=True) - pw_logs_str = "\n".join([f"{i}. {log}" for i, log in enumerate(pw_logs, 1)]) - - print(f"\n Playwright logs for step {self._step}:\n{pw_logs_str}") - - self._step += 1 - - agent_info = bgym.AgentInfo( - think=pw_logs_str, - chat_messages=[], - stats={}, - ) - - return "noop()", agent_info - - -PW_CODEGEN_AGENT = PlayWrightCodeGenAgentArgs(store_raw_trace=True) - - -if __name__ == "__main__": - from agentlab.agents.human_trace_recorder.codegen_agent import PW_CODEGEN_AGENT - from agentlab.experiments.study import Study - - agent_configs = [PW_CODEGEN_AGENT] - benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]() # type: bgym.Benchmark - benchmark = benchmark.subset_from_glob("task_name", "*create*") - benchmark.env_args_list = benchmark.env_args_list[:1] - for env_args in benchmark.env_args_list: - print(env_args.task_name) - env_args.max_steps = 15 - env_args.headless = False - - study = Study(agent_configs, benchmark, logging_level_stdout=logging.INFO) - study.run(n_jobs=1, parallel_backend="sequential", n_relaunch=1) diff --git a/src/agentlab/agents/human_trace_recorder/event_listeners.py b/src/agentlab/agents/human_trace_recorder/event_listeners.py deleted file mode 100644 index 2fd8453c..00000000 --- a/src/agentlab/agents/human_trace_recorder/event_listeners.py +++ /dev/null @@ -1,563 +0,0 @@ -"""JavaScript Event Listeners for Human Trace Capture - -This module contains all the JavaScript code for capturing comprehensive -browser interactions including mouse, keyboard, form, scroll, and focus events. -""" - - -def get_interaction_tracking_script() -> str: - """Get the complete JavaScript code for interaction tracking.""" - return ( - """ - window.__acted = false; - window.__interactions = []; - - // Debug mode - set to true to see all events in console - window.__debug_events = false; - - function captureInteraction(type, event, extra = {}) { - // Skip our own recording indicators - if (event.target.id === '__rec' || event.target.id === '__rec_border' || - event.target.closest('#__rec') || event.target.closest('#__rec_border')) { - return; - } - - const interaction = { - type: type, - timestamp: Date.now(), - coords: { - x: event.clientX || 0, - y: event.clientY || 0 - }, - target: { - tagName: event.target.tagName, - id: event.target.id || null, - className: event.target.className || null, - text: event.target.textContent?.slice(0, 50) || null, - bid: event.target.getAttribute('bid') || null - }, - ...extra - }; - - window.__interactions.push(interaction); - window.__acted = true; - - // Debug logging - if (window.__debug_events) { - console.log(`🎯 Captured: ${type}`, interaction); - } - - // Update indicators immediately - const indicator = document.getElementById('__rec'); - const border = document.getElementById('__rec_border'); - if (indicator) { - indicator.innerHTML = '✅ ACTION DETECTED - SAVING...'; - indicator.style.background = '#28a745'; - indicator.style.animation = 'none'; - } - if (border) { - border.style.border = '8px solid #28a745'; - border.style.animation = 'none'; - } - } - - // Debug function - add this temporarily to see what events fire - if (window.__debug_events) { - ['input', 'change', 'select', 'focus', 'click', 'keydown', 'paste', 'cut', 'copy'].forEach(eventType => { - document.addEventListener(eventType, (e) => { - console.log(`🔍 DEBUG: ${eventType} on`, e.target.tagName, e.target.type, e.target); - }, true); - }); - } - - """ - + get_mouse_event_listeners() - + """ - """ - + get_keyboard_event_listeners() - + """ - """ - + get_form_event_listeners() - + """ - """ - + get_scroll_event_listeners() - + """ - """ - + get_focus_event_listeners() - + """ - - console.log('Comprehensive interaction tracking initialized'); - """ - ) - - -def get_mouse_event_listeners() -> str: - """Get JavaScript code for mouse event listeners.""" - return """ - // Mouse events with comprehensive button tracking and performance optimizations - let lastClickTime = 0; - - document.addEventListener('click', (e) => { - const now = Date.now(); - // Prevent spam clicking from creating too many events (minimum 50ms between clicks) - if (now - lastClickTime < 50) return; - lastClickTime = now; - - captureInteraction('click', e, { - button: e.button, // 0=left, 1=middle, 2=right - buttons: e.buttons, // bitmask of pressed buttons - buttonName: ['left', 'middle', 'right'][e.button] || 'unknown', - detail: e.detail, // click count (single, double, etc.) - clickType: e.detail === 1 ? 'single' : e.detail === 2 ? 'double' : `${e.detail}x` - }); - }, true); - - document.addEventListener('dblclick', (e) => { - captureInteraction('dblclick', e, { - button: e.button, - buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' - }); - }, true); - - document.addEventListener('mousedown', (e) => { - captureInteraction('mousedown', e, { - button: e.button, - buttons: e.buttons, - buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' - }); - }, true); - - document.addEventListener('mouseup', (e) => { - captureInteraction('mouseup', e, { - button: e.button, - buttons: e.buttons, - buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' - }); - }, true); - - // Context menu (right-click menu) - document.addEventListener('contextmenu', (e) => { - captureInteraction('contextmenu', e, { - button: e.button, - buttonName: 'right' - }); - }, true); - - // Middle mouse button events (often used for scrolling/opening in new tab) - document.addEventListener('auxclick', (e) => { - captureInteraction('auxclick', e, { - button: e.button, - buttonName: e.button === 1 ? 'middle' : (e.button === 2 ? 'right' : 'other'), - detail: e.detail - }); - }, true); - - // Enhanced drag tracking (without redundant mousedown) - let isDragging = false; - let dragStart = null; - let dragButton = null; - let hasDraggedSignificantly = false; - - document.addEventListener('mousedown', (e) => { - isDragging = true; - dragButton = e.button; - hasDraggedSignificantly = false; - dragStart = { - x: e.clientX, - y: e.clientY, - time: Date.now(), - button: e.button, - buttonName: ['left', 'middle', 'right'][e.button] || 'unknown' - }; - }, true); - - document.addEventListener('mousemove', (e) => { - if (isDragging && dragStart) { - const distance = Math.sqrt( - Math.pow(e.clientX - dragStart.x, 2) + - Math.pow(e.clientY - dragStart.y, 2) - ); - if (distance > 5 && !hasDraggedSignificantly) { - // Only capture the start of a significant drag, not every movement - hasDraggedSignificantly = true; - captureInteraction('drag_start', e, { - startX: dragStart.x, - startY: dragStart.y, - endX: e.clientX, - endY: e.clientY, - distance: distance, - button: dragButton, - buttonName: dragStart.buttonName, - duration: Date.now() - dragStart.time - }); - } - } - // Note: Removed general mousemove tracking to reduce noise - }, true); - - document.addEventListener('mouseup', (e) => { - if (isDragging && dragStart && hasDraggedSignificantly) { - const distance = Math.sqrt( - Math.pow(e.clientX - dragStart.x, 2) + - Math.pow(e.clientY - dragStart.y, 2) - ); - captureInteraction('drag_end', e, { - startX: dragStart.x, - startY: dragStart.y, - endX: e.clientX, - endY: e.clientY, - distance: distance, - duration: Date.now() - dragStart.time, - button: dragButton, - buttonName: dragStart.buttonName, - totalDistance: distance - }); - } - isDragging = false; - dragStart = null; - dragButton = null; - hasDraggedSignificantly = false; - }, true); - - // Drag and drop events - document.addEventListener('dragstart', (e) => { - captureInteraction('dragstart', e, { - dataTransfer: { - effectAllowed: e.dataTransfer.effectAllowed, - types: Array.from(e.dataTransfer.types) - } - }); - }, true); - - document.addEventListener('dragend', (e) => { - captureInteraction('dragend', e, { - dataTransfer: { - dropEffect: e.dataTransfer.dropEffect - } - }); - }, true); - - document.addEventListener('drop', (e) => { - captureInteraction('drop', e, { - dataTransfer: { - dropEffect: e.dataTransfer.dropEffect, - types: Array.from(e.dataTransfer.types) - }, - files: e.dataTransfer.files.length > 0 ? Array.from(e.dataTransfer.files).map(f => ({ - name: f.name, - type: f.type, - size: f.size - })) : null - }); - }, true); - """ - - -def get_keyboard_event_listeners() -> str: - """Get JavaScript code for keyboard event listeners.""" - return """ - // Keyboard events with shortcut detection - document.addEventListener('keydown', (e) => { - let shortcut = null; - if (e.ctrlKey || e.metaKey) { - const modifier = e.ctrlKey ? 'Ctrl' : 'Cmd'; - const key = e.key.length === 1 ? e.key.toUpperCase() : e.key; - shortcut = `${modifier}+${key}`; - } else if (e.altKey && e.key.length === 1) { - shortcut = `Alt+${e.key.toUpperCase()}`; - } else if (e.shiftKey && e.key.length === 1) { - shortcut = `Shift+${e.key.toUpperCase()}`; - } - - captureInteraction('keydown', e, { - key: e.key, - code: e.code, - ctrlKey: e.ctrlKey, - shiftKey: e.shiftKey, - altKey: e.altKey, - metaKey: e.metaKey, - shortcut: shortcut - }); - }, true); - - document.addEventListener('keyup', (e) => { - captureInteraction('keyup', e, { - key: e.key, - code: e.code - }); - }, true); - """ - - -def get_form_event_listeners() -> str: - """Get JavaScript code for form event listeners.""" - return """ - // Input events with throttling to prevent spam during fast typing - let inputTimeout; - let lastInputValue = ''; - - document.addEventListener('input', (e) => { - if (['INPUT', 'TEXTAREA'].includes(e.target.tagName) || e.target.contentEditable === 'true') { - clearTimeout(inputTimeout); - inputTimeout = setTimeout(() => { - const currentValue = e.target.value || e.target.textContent; - // Only capture if value actually changed significantly - if (currentValue !== lastInputValue) { - lastInputValue = currentValue; - captureInteraction('input', e, { - value: currentValue, - inputType: e.inputType || null, - valueLength: currentValue.length - }); - } - }, 50); // Reduced from 300ms to 50ms for better responsiveness - } - }, true); - - // Immediate input capture (without throttling) for certain cases - document.addEventListener('input', (e) => { - // Immediate capture for dropdown/select-like inputs or when selection changes - if (e.target.tagName === 'SELECT' || - e.inputType === 'deleteContentBackward' || - e.inputType === 'insertFromPaste' || - e.inputType === 'insertFromDrop') { - captureInteraction('input_immediate', e, { - value: e.target.value || e.target.textContent, - inputType: e.inputType || null, - immediate: true - }); - } - }, true); - - // Text selection events - document.addEventListener('select', (e) => { - if (['INPUT', 'TEXTAREA'].includes(e.target.tagName)) { - const selectedText = e.target.value.substring(e.target.selectionStart, e.target.selectionEnd); - captureInteraction('select', e, { - selectedText: selectedText, - selectionStart: e.target.selectionStart, - selectionEnd: e.target.selectionEnd, - value: e.target.value, - selectionLength: selectedText.length - }); - } - }, true); - - // Clipboard events - document.addEventListener('cut', (e) => { - captureInteraction('cut', e, { - clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, - targetValue: e.target.value || e.target.textContent - }); - }, true); - - document.addEventListener('copy', (e) => { - captureInteraction('copy', e, { - clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, - targetValue: e.target.value || e.target.textContent - }); - }, true); - - document.addEventListener('paste', (e) => { - captureInteraction('paste', e, { - clipboardData: e.clipboardData ? Array.from(e.clipboardData.types) : null, - targetValue: e.target.value || e.target.textContent - }); - }, true); - - // Enhanced form change events with better dropdown handling - document.addEventListener('change', (e) => { - let extra = {}; - if (e.target.tagName === 'SELECT') { - const option = e.target.options[e.target.selectedIndex]; - extra = { - selectedValue: e.target.value, - selectedText: option?.text || '', - selectedIndex: e.target.selectedIndex, - allOptions: Array.from(e.target.options).map(opt => ({ - value: opt.value, - text: opt.text, - selected: opt.selected - })), - optionsCount: e.target.options.length - }; - } else if (['checkbox', 'radio'].includes(e.target.type)) { - extra = { - checked: e.target.checked, - value: e.target.value, - name: e.target.name - }; - } else { - extra = { - value: e.target.value, - previousValue: e.target.defaultValue, // Capture what it was before - inputType: e.target.type - }; - } - captureInteraction('change', e, extra); - }, true); - - document.addEventListener('submit', (e) => { - captureInteraction('submit', e, { - formAction: e.target.action || null, - formMethod: e.target.method || 'GET', - formElements: Array.from(e.target.elements).length - }); - }, true); - - // Additional events for better field interaction capture - - // Option selection in datalists - document.addEventListener('input', (e) => { - if (e.target.list) { // Has datalist - captureInteraction('datalist_input', e, { - value: e.target.value, - listId: e.target.list.id, - optionsCount: e.target.list.options.length - }); - } - }, true); - - // File input changes - document.addEventListener('change', (e) => { - if (e.target.type === 'file') { - captureInteraction('file_select', e, { - filesCount: e.target.files.length, - files: Array.from(e.target.files).map(file => ({ - name: file.name, - type: file.type, - size: file.size, - lastModified: file.lastModified - })) - }); - } - }, true); - """ - - -def get_scroll_event_listeners() -> str: - """Get JavaScript code for scroll event listeners.""" - return """ - // Scroll events with debouncing to reduce noise - let scrollTimeout; - let lastScrollTime = 0; - - document.addEventListener('scroll', (e) => { - clearTimeout(scrollTimeout); - scrollTimeout = setTimeout(() => { - const now = Date.now(); - // Only capture scroll if it's been at least 200ms since last scroll capture - if (now - lastScrollTime > 200) { - lastScrollTime = now; - captureInteraction('scroll', e, { - scrollX: window.scrollX, - scrollY: window.scrollY, - scrollLeft: e.target.scrollLeft || 0, - scrollTop: e.target.scrollTop || 0 - }); - } - }, 150); // Increased debounce time - }, true); - - // Wheel events (for detailed scroll tracking) with throttling - let lastWheelTime = 0; - document.addEventListener('wheel', (e) => { - const now = Date.now(); - // Only capture wheel events every 100ms to reduce noise - if (now - lastWheelTime > 100) { - lastWheelTime = now; - captureInteraction('wheel', e, { - deltaX: e.deltaX, - deltaY: e.deltaY, - deltaZ: e.deltaZ, - deltaMode: e.deltaMode - }); - } - }, true); - """ - - -def get_focus_event_listeners() -> str: - """Get JavaScript code for focus event listeners.""" - return """ - // Focus events - only for interactive elements to reduce noise - document.addEventListener('focus', (e) => { - // Only capture focus on interactive elements - const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A']; - if (interactiveElements.includes(e.target.tagName) || - e.target.contentEditable === 'true' || - e.target.tabIndex >= 0) { - captureInteraction('focus', e); - } - }, true); - - document.addEventListener('blur', (e) => { - // Only capture blur on interactive elements - const interactiveElements = ['INPUT', 'TEXTAREA', 'SELECT', 'BUTTON', 'A']; - if (interactiveElements.includes(e.target.tagName) || - e.target.contentEditable === 'true' || - e.target.tabIndex >= 0) { - captureInteraction('blur', e); - } - }, true); - """ - - -def get_recording_indicators_script() -> str: - """Get JavaScript code for recording indicators.""" - return """ - // Remove any existing indicators - const existingBorder = document.getElementById('__rec_border'); - if (existingBorder) existingBorder.remove(); - const existingIndicator = document.getElementById('__rec'); - if (existingIndicator) existingIndicator.remove(); - - // Create border overlay - const border = document.createElement('div'); - border.id = '__rec_border'; - border.style.cssText = ` - position: fixed; - top: 0; - left: 0; - width: 100vw; - height: 100vh; - border: 8px solid #ff0000; - box-sizing: border-box; - pointer-events: none; - z-index: 999999; - animation: pulse 1.5s infinite; - `; - - // Create status indicator - const indicator = document.createElement('div'); - indicator.id = '__rec'; - indicator.innerHTML = '🔴 RECORDING - Perform your action now'; - indicator.style.cssText = ` - position: fixed; - top: 10px; - left: 50%; - transform: translateX(-50%); - background: #ff0000; - color: #fff; - padding: 12px 20px; - border-radius: 8px; - font: bold 10px -apple-system, BlinkMacSystemFont, sans-serif; - z-index: 9999999; - box-shadow: 0 4px 12px rgba(255,0,0,0.4); - animation: pulse 1.5s infinite; - `; - - // Add pulsing animation - const style = document.createElement('style'); - style.textContent = ` - @keyframes pulse { - 0% { opacity: 1; } - 50% { opacity: 0.4; } - 100% { opacity: 0.8; } - } - `; - document.head.appendChild(style); - - document.body.appendChild(border); - document.body.appendChild(indicator); - """ From e93fde52dbd3f97e4072a2bd624b115365cb3b17 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 2 Sep 2025 15:58:38 +0000 Subject: [PATCH 20/23] log task errors --- src/agentlab/experiments/graph_execution_ray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index f047f866..f7aad780 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -3,9 +3,8 @@ import bgym import ray -from ray.util import state - from agentlab.experiments.exp_utils import _episode_timeout, run_exp +from ray.util import state logger = logging.getLogger(__name__) @@ -79,6 +78,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter try: result = ray.get(task) except Exception as e: + logger.exception(f"Task failed: {e}") result = e results.append(result) From 5604ac36c861128b296a9b894497297d1e749146 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 2 Sep 2025 15:59:11 +0000 Subject: [PATCH 21/23] expore agentlabxray --- src/agentlab/analyze/agent_xray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 8accbfd6..b60c0dcb 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -539,7 +539,7 @@ def run_gradio(results_dir: Path): port = os.getenv("AGENTXRAY_APP_PORT", None) if isinstance(port, str): port = int(port) - demo.launch(server_port=port, share=do_share) + demo.launch(server_name="0.0.0.0", server_port=port, share=do_share) def handle_key_event(key_event, step_id: StepId): From 0e68bcab654ffe052a4445aafb338f74ae7400a2 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 3 Sep 2025 09:39:32 +0000 Subject: [PATCH 22/23] remove commented old chunk --- .../agents/generic_agent_hinter/generic_agent_prompt.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py index b684b6c9..983c9d48 100644 --- a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py @@ -378,9 +378,6 @@ def get_hints_for_task(self, task_name: str) -> str: return "" try: - # task_hints = self.hint_db[ - # self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name)) - # ] task_hints = self.hints_source.choose_hints(self.llm, task_name, self.goal) hints = [] From e4cad16a9dd83401945624623ed4871dc30cc5dd Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 3 Sep 2025 09:43:19 +0000 Subject: [PATCH 23/23] share xray only when env flag present --- src/agentlab/analyze/agent_xray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index b60c0dcb..fed78b3e 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -537,9 +537,10 @@ def run_gradio(results_dir: Path): do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true" port = os.getenv("AGENTXRAY_APP_PORT", None) + server_name = "0.0.0.0" if os.getenv("AGENTXRAY_PUBLIC", "false") == "true" else "127.0.0.1" if isinstance(port, str): port = int(port) - demo.launch(server_name="0.0.0.0", server_port=port, share=do_share) + demo.launch(server_name=server_name, server_port=port, share=do_share) def handle_key_event(key_event, step_id: StepId):