diff --git a/.vscode/settings.json b/.vscode/settings.json
index 1fc53da1..6e1f48c8 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,8 +3,8 @@
         "editor.formatOnSave": true,
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.codeActionsOnSave": {
-            "source.organizeImports": "always",
-            "source.fixAll": "always",
+            "source.organizeImports": "explicit",
+            "source.fixAll": "never",
         },
     },
     // "python.analysis.languageServerMode": "full",
diff --git a/src/agentlab/agents/tool_use_agent/hint_db.csv b/src/agentlab/agents/tool_use_agent/hint_db.csv
index 86020033..f402c24a 100644
--- a/src/agentlab/agents/tool_use_agent/hint_db.csv
+++ b/src/agentlab/agents/tool_use_agent/hint_db.csv
@@ -16,4 +16,8 @@ June 11,miniwob.drag-items,30,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Shape and letters size comparison in miniwob,"Shapes or items have different colors and different size. Size is relative to the other objects in the white area and is either ""large"" or ""small"". Shapes that are larger than the average shape or letter are considered ""large"". Others are ""small""."
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,communicate answer in miniwob,Answer by clicking one of the buttons describing multiple choices.
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Simbols of colors in miniwob,"Colors a distinct in this task, e.g., cyan is not a type of blue. "
-June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
\ No newline at end of file
+June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
+July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"If you enter the value in the wrong field, the task may be terminated immediately. The field you are looking for may be in another tab. You have to look around."
+July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"Before clicking submit, make sure that all fields are filled properly. Then click submit."
+July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,Avoid back and forth from tabs to tabs to reduce the number of actions
+July 14,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,When you see auto-complete make sure to select an element from that list
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 61b1ab68..f882ce14 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -926,6 +926,9 @@ def get_episode_info(info: Info):
 
 {code(step_info.task_info)}
 
+**Terminated or Truncated:**
+{code(f"Terminated: {step_info.terminated}, Truncated: {step_info.truncated}")}
+
 **exp_dir:**
 {code(exp_dir_str)}"""
 
@@ -1247,8 +1250,17 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
         warning("No step info to plot")
         return None
 
-    # this allows to pop labels to make sure we don't use more than 1 for the legend
-    labels = ["reset", "env", "agent", "exec action", "action error"]
+    # Updated labels to include new profiling stages
+    labels = [
+        "reset",
+        "env",
+        "agent",
+        "exec action",
+        "action error",
+        "wait for page",
+        "validation",
+        "get observation",
+    ]
     labels = {e: e for e in labels}
 
     colors = plt.get_cmap("tab20c").colors
@@ -1257,6 +1269,7 @@
     all_times = []
     step_times = []
     for i, step_info in progress_fn(list(enumerate(step_info_list)), desc="Building plot."):
+        assert isinstance(step_info, StepInfo), f"Expected StepInfo, got {type(step_info)}"
         step = step_info.step
         prof = deepcopy(step_info.profiling)
 
@@ -1278,6 +1291,39 @@
             label = labels.pop("exec action", None)
             add_patch(ax, prof.action_exec_start, prof.action_exec_stop, colors[3], label)
 
+            # NEW: Add wait for page loading visualization
+            if (
+                hasattr(prof, "wait_for_page_loading_start")
+                and prof.wait_for_page_loading_start > 0
+            ):
+                add_patch(
+                    ax,
+                    prof.wait_for_page_loading_start,
+                    prof.wait_for_page_loading_stop,
+                    colors[19],
+                    labels.pop("wait for page", None),
+                )
+
+            # NEW: Add validation visualization
+            if hasattr(prof, "validation_start") and prof.validation_start > 0:
+                add_patch(
+                    ax,
+                    prof.validation_start,
+                    prof.validation_stop,
+                    colors[8],
+                    labels.pop("validation", None),
+                )
+
+            # NEW: Add get observation visualization
+            if hasattr(prof, "get_observation_start") and prof.get_observation_start > 0:
+                add_patch(
+                    ax,
+                    prof.get_observation_start,
+                    prof.get_observation_stop,
+                    colors[12],
+                    labels.pop("get observation", None),
+                )
+
             try:
                 next_step_error = step_info_list[i + 1].obs["last_action_error"]
             except (IndexError, KeyError, TypeError):
@@ -1344,7 +1390,6 @@
     ax.set_ylim(0, 1)
     ax.set_xlim(0, max(all_times) + 1)
 
-    # plt.gca().autoscale()
     ax.set_xlabel("Time")
     ax.set_yticks([])
 
@@ -1353,7 +1398,7 @@
     ax.legend(
         loc="upper center",
         bbox_to_anchor=(0.5, 1.2),
-        ncol=5,
+        ncol=8,  # Updated to accommodate new labels
         frameon=True,
     )
 
diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py
index 51ff61c3..a262d1d3 100644
--- a/src/agentlab/analyze/overlay_utils.py
+++ b/src/agentlab/analyze/overlay_utils.py
@@ -299,7 +299,6 @@ def overlay_rectangle(
 
     if dashed:
         # Draw dashed rectangle
-        print("Drawing dashed rectangle")
         linedashed(draw, x, y, x + w, y, color, width)
         linedashed(draw, x + w, y, x + w, y + h, color, width)
         linedashed(draw, x + w, y + h, x, y + h, color, width)
diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py
index 7b5b280f..5b9e7a81 100644
--- a/src/agentlab/experiments/loop.py
+++ b/src/agentlab/experiments/loop.py
@@ -20,7 +20,7 @@
 import numpy as np
 from browsergym.core.chat import Chat
 from browsergym.experiments.agent import Agent
-from browsergym.experiments.utils import count_messages_token, count_tokens
+from browsergym.experiments.utils import count_tokens
 from dataclasses_json import DataClassJsonMixin
 from PIL import Image
 from tqdm import tqdm
@@ -48,6 +48,7 @@ class EnvArgs(DataClassJsonMixin):
     slow_mo: Optional[int] = None  # use default value from BrowserGym
     storage_state: Optional[str | Path | dict] = None
     task_kwargs: Optional[dict] = None  # use default value from BrowserGym
+    pre_observation_delay: float = None  # seconds, wait for JS events to be fired
 
     def make_env(
         self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=True
@@ -71,6 +72,8 @@ def make_env(
             extra_kwargs["viewport"] = self.viewport
         if self.slow_mo is not None:
             extra_kwargs["slow_mo"] = self.slow_mo
+        if self.pre_observation_delay is not None:
+            extra_kwargs["pre_observation_delay"] = self.pre_observation_delay
         if self.storage_state:
             extra_kwargs["pw_context_kwargs"] = {"storage_state": self.storage_state}
         if self.task_kwargs is not None:
@@ -142,6 +145,12 @@ class StepTimestamps:
     env_stop: float = 0
     agent_start: float = 0
     agent_stop: float = 0
+    wait_for_page_loading_start: float = 0
+    wait_for_page_loading_stop: float = 0
+    validation_start: float = 0
+    validation_stop: float = 0
+    get_observation_start: float = 0
+    get_observation_stop: float = 0
 
 
 @dataclass
@@ -199,6 +208,12 @@ def from_step(self, env: gym.Env, action: str, obs_preprocessor: callable):
         t.action_exec_start = env_info["action_exec_start"]  # start
         t.action_exect_after_timeout = env_info["action_exec_stop"]
         t.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"]
+        t.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None)
+        t.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None)
+        t.validation_start = env_info.get("validation_start", None)
+        t.validation_stop = env_info.get("validation_stop", None)
+        t.get_observation_start = env_info.get("get_observation_start", None)
+        t.get_observation_stop = env_info.get("get_observation_stop", None)
 
         if obs_preprocessor:
             self.obs = obs_preprocessor(self.obs)
@@ -447,6 +462,10 @@ def run(self):
                 logger.debug("Sending action to environment.")
                 step_info.from_step(env, action, obs_preprocessor=agent.obs_preprocessor)
                 logger.debug("Environment stepped.")
+                if step_info.is_done:
+                    logger.debug(
+                        f"Episode done: terminated: {step_info.terminated}, truncated: {step_info.truncated}."
+                    )
 
             except Exception as e:
                 err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}"
diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
index 66efde19..391f419c 100644
--- a/src/agentlab/experiments/study.py
+++ b/src/agentlab/experiments/study.py
@@ -726,7 +726,7 @@ def set_demo_mode(env_args_list: list[EnvArgs]):
         env_args.slow_mo = 1000
 
 
-def _convert_env_args(env_args_list):
+def _convert_env_args(env_args_list) -> list[EnvArgs]:
     """Return a list where every element is the *new* EnvArgs.
 
     For backward compatibility, we need to convert the old EnvArgs to the new one.
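Usage note (illustrative, not part of the patch): a minimal sketch of how the new pre_observation_delay option on EnvArgs might be set when configuring an experiment. Only pre_observation_delay comes from this diff; the task_name and max_steps fields and the 0.5 s value are assumptions for the example.

from agentlab.experiments.loop import EnvArgs

# Hypothetical configuration; only pre_observation_delay is introduced by this patch.
env_args = EnvArgs(
    task_name="miniwob.click-test",  # assumed pre-existing field
    max_steps=10,  # assumed pre-existing field
    pre_observation_delay=0.5,  # wait 0.5 s for JS events to fire before the observation is captured
)

make_env() then forwards pre_observation_delay to the BrowserGym environment through extra_kwargs, as shown in the loop.py hunk above.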