From d57b0967c30ee7ed91e22fc0ce80e7cf10058c92 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 09:37:42 -0400 Subject: [PATCH 1/7] new hints for create hardware asset --- src/agentlab/agents/tool_use_agent/hint_db.csv | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/agentlab/agents/tool_use_agent/hint_db.csv b/src/agentlab/agents/tool_use_agent/hint_db.csv index 86020033..f402c24a 100644 --- a/src/agentlab/agents/tool_use_agent/hint_db.csv +++ b/src/agentlab/agents/tool_use_agent/hint_db.csv @@ -16,4 +16,8 @@ June 11,miniwob.drag-items,30,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Shape and letters size comparison in miniwob,"Shapes or items have different colors and different size. Size is relative to the other objects in the white area and is either ""large"" or ""small"". Shapes that are larger than the average shape or letter are considered ""large"". Others are ""small""." June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,communicate answer in miniwob,Answer by clicking one of the buttons describing multiple choices. June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Simbols of colors in miniwob,"Colors a distinct in this task, e.g., cyan is not a type of blue. " -June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task. \ No newline at end of file +June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task. 
+July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"If you enter the value in the wrong field, the task may be terminated immediately. The field you are looking for may be in another tab. You have to look around." +July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"Before clicking submit, make sure that all fields are filled properly. Then click submit." +July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,Avoid back and forth from tabs to tabs to reduce the number of actions +July 14,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,When you see auto-complete make sure to select an element from that list From 6a0c2384589f8ab484808604d5a37e48c5916fad Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 09:38:33 -0400 Subject: [PATCH 2/7] improve profiling --- src/agentlab/analyze/agent_xray.py | 53 +++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index bd1c6ad4..7c46463e 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -922,6 +922,9 @@ def get_episode_info(info: Info): {code(step_info.task_info)} +**Terminated or Truncated:** +{code(f"Terminated: {step_info.terminated}, Truncated: {step_info.truncated}")} + **exp_dir:** {code(exp_dir_str)}""" @@ -1243,8 +1246,17 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr warning("No step info to plot") return None - # this allows to pop labels to make sure we don't use more than 1 for the legend - labels = ["reset", "env", "agent", "exec action", "action error"] + # Updated labels to include 
new profiling stages + labels = [ + "reset", + "env", + "agent", + "exec action", + "action error", + "wait for page", + "validation", + "get observation", + ] labels = {e: e for e in labels} colors = plt.get_cmap("tab20c").colors @@ -1253,6 +1265,7 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr all_times = [] step_times = [] for i, step_info in progress_fn(list(enumerate(step_info_list)), desc="Building plot."): + assert isinstance(step_info, StepInfo), f"Expected StepInfo, got {type(step_info)}" step = step_info.step prof = deepcopy(step_info.profiling) @@ -1274,6 +1287,39 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr label = labels.pop("exec action", None) add_patch(ax, prof.action_exec_start, prof.action_exec_stop, colors[3], label) + # NEW: Add wait for page loading visualization + if ( + hasattr(prof, "wait_for_page_loading_start") + and prof.wait_for_page_loading_start > 0 + ): + add_patch( + ax, + prof.wait_for_page_loading_start, + prof.wait_for_page_loading_stop, + colors[19], + labels.pop("wait for page", None), + ) + + # NEW: Add validation visualization + if hasattr(prof, "validation_start") and prof.validation_start > 0: + add_patch( + ax, + prof.validation_start, + prof.validation_stop, + colors[8], + labels.pop("validation", None), + ) + + # NEW: Add get observation visualization + if hasattr(prof, "get_observation_start") and prof.get_observation_start > 0: + add_patch( + ax, + prof.get_observation_start, + prof.get_observation_stop, + colors[12], + labels.pop("get observation", None), + ) + try: next_step_error = step_info_list[i + 1].obs["last_action_error"] except (IndexError, KeyError, TypeError): @@ -1340,7 +1386,6 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr ax.set_ylim(0, 1) ax.set_xlim(0, max(all_times) + 1) - # plt.gca().autoscale() ax.set_xlabel("Time") ax.set_yticks([]) @@ -1349,7 +1394,7 @@ def plot_profiling(ax, 
step_info_list: list[StepInfo], summary_info: dict, progr ax.legend( loc="upper center", bbox_to_anchor=(0.5, 1.2), - ncol=5, + ncol=8, # Updated to accommodate new labels frameon=True, ) From 90a05e91a90e015ad1b365ba4dbbd7d62b9e2e10 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 09:38:51 -0400 Subject: [PATCH 3/7] remove print --- src/agentlab/analyze/overlay_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py index 51ff61c3..a262d1d3 100644 --- a/src/agentlab/analyze/overlay_utils.py +++ b/src/agentlab/analyze/overlay_utils.py @@ -299,7 +299,6 @@ def overlay_rectangle( if dashed: # Draw dashed rectangle - print("Drawing dashed rectangle") linedashed(draw, x, y, x + w, y, color, width) linedashed(draw, x + w, y, x + w, y + h, color, width) linedashed(draw, x + w, y + h, x, y + h, color, width) From 835269e21edc6b1c40f8d664b383978f65debff7 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 09:39:17 -0400 Subject: [PATCH 4/7] improve profiling and delays --- src/agentlab/experiments/loop.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 7b5b280f..cf0a687f 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -48,6 +48,7 @@ class EnvArgs(DataClassJsonMixin): slow_mo: Optional[int] = None # use default value from BrowserGym storage_state: Optional[str | Path | dict] = None task_kwargs: Optional[dict] = None # use default value from BrowserGym + pre_observation_delay: float = 0.5 # seconds, wait for JS events to be fired def make_env( self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=True @@ -71,6 +72,8 @@ def make_env( extra_kwargs["viewport"] = self.viewport if self.slow_mo is not None: extra_kwargs["slow_mo"] = self.slow_mo + if self.pre_observation_delay is not None: + 
extra_kwargs["pre_observation_delay"] = self.pre_observation_delay if self.storage_state: extra_kwargs["pw_context_kwargs"] = {"storage_state": self.storage_state} if self.task_kwargs is not None: @@ -142,6 +145,12 @@ class StepTimestamps: env_stop: float = 0 agent_start: float = 0 agent_stop: float = 0 + wait_for_page_loading_start: float = 0 + wait_for_page_loading_stop: float = 0 + validation_start: float = 0 + validation_stop: float = 0 + get_observation_start: float = 0 + get_observation_stop: float = 0 @dataclass @@ -199,6 +208,12 @@ def from_step(self, env: gym.Env, action: str, obs_preprocessor: callable): t.action_exec_start = env_info["action_exec_start"] # start t.action_exect_after_timeout = env_info["action_exec_stop"] t.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"] + t.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None) + t.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None) + t.validation_start = env_info.get("validation_start", None) + t.validation_stop = env_info.get("validation_stop", None) + t.get_observation_start = env_info.get("get_observation_start", None) + t.get_observation_stop = env_info.get("get_observation_stop", None) if obs_preprocessor: self.obs = obs_preprocessor(self.obs) @@ -447,6 +462,10 @@ def run(self): logger.debug("Sending action to environment.") step_info.from_step(env, action, obs_preprocessor=agent.obs_preprocessor) logger.debug("Environment stepped.") + if step_info.is_done: + logger.debug( + f"Episode done: terminated: {step_info.terminated}, truncated: {step_info.truncated}." 
+ ) except Exception as e: err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}" From c6e195a963a16a83edf9e2efb5452cdfccd29c0d Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 09:39:30 -0400 Subject: [PATCH 5/7] add type --- src/agentlab/experiments/study.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 810b8bc2..ae223588 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -725,7 +725,7 @@ def set_demo_mode(env_args_list: list[EnvArgs]): env_args.slow_mo = 1000 -def _convert_env_args(env_args_list): +def _convert_env_args(env_args_list) -> list[EnvArgs]: """Return a list where every element is the *new* EnvArgs. For backward compatibility, we need to convert the old EnvArgs to the new one. From 2bed049eec98a088195bfabb47a2c0424b11b284 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 10:44:28 -0400 Subject: [PATCH 6/7] avoid auto-modify code that could cause problems --- .vscode/settings.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index eb18e557..6e1f48c8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,12 +3,12 @@ "editor.formatOnSave": true, "editor.defaultFormatter": "ms-python.black-formatter", "editor.codeActionsOnSave": { - "source.organizeImports": "always", - "source.fixAll": "always", + "source.organizeImports": "explicit", + "source.fixAll": "never", }, }, - "python.analysis.languageServerMode": "full", - "python.analysis.typeCheckingMode": "standard", + // "python.analysis.languageServerMode": "full", + // "python.analysis.typeCheckingMode": "standard", "python.testing.pytestArgs": [ "tests" ], From 9b300b1f62c62edc8d6ab0d69a9ef2f97d00684b Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 18 Jul 2025 10:44:49 -0400 Subject: [PATCH 
7/7] fix test --- src/agentlab/experiments/loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index cf0a687f..5b9e7a81 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -20,7 +20,7 @@ import numpy as np from browsergym.core.chat import Chat from browsergym.experiments.agent import Agent -from browsergym.experiments.utils import count_messages_token, count_tokens +from browsergym.experiments.utils import count_tokens from dataclasses_json import DataClassJsonMixin from PIL import Image from tqdm import tqdm @@ -48,7 +48,7 @@ class EnvArgs(DataClassJsonMixin): slow_mo: Optional[int] = None # use default value from BrowserGym storage_state: Optional[str | Path | dict] = None task_kwargs: Optional[dict] = None # use default value from BrowserGym - pre_observation_delay: float = 0.5 # seconds, wait for JS events to be fired + pre_observation_delay: float = None # seconds, wait for JS events to be fired def make_env( self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=True