Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"editor.formatOnSave": true,
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.codeActionsOnSave": {
"source.organizeImports": "always",
"source.fixAll": "always",
"source.organizeImports": "explicit",
"source.fixAll": "never",
},
},
// "python.analysis.languageServerMode": "full",
Expand Down
6 changes: 5 additions & 1 deletion src/agentlab/agents/tool_use_agent/hint_db.csv
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ June 11,miniwob.drag-items,30,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7
June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Shape and letters size comparison in miniwob,"Shapes or items have different colors and different size. Size is relative to the other objects in the white area and is either ""large"" or ""small"". Shapes that are larger than the average shape or letter are considered ""large"". Others are ""small""."
June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,communicate answer in miniwob,Answer by clicking one of the buttons describing multiple choices.
June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Symbols of colors in miniwob,"Colors are distinct in this task, e.g., cyan is not a type of blue. "
June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"If you enter the value in the wrong field, the task may be terminated immediately. The field you are looking for may be in another tab. You have to look around."
July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"Before clicking submit, make sure that all fields are filled properly. Then click submit."
July 13,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,Avoid going back and forth between tabs to reduce the number of actions
July 14,workarena.servicenow.create-hardware-asset,385,gpt-4.1,ToolUse-gpt-4.1,WorkArena-L1,WorkArena-L1,allac,Filling form in WorkArena,"When you see an auto-complete list, make sure to select an element from that list"
53 changes: 49 additions & 4 deletions src/agentlab/analyze/agent_xray.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,9 @@ def get_episode_info(info: Info):

{code(step_info.task_info)}

**Terminated or Truncated:**
{code(f"Terminated: {step_info.terminated}, Truncated: {step_info.truncated}")}

**exp_dir:**

<small style="line-height: 1; margin: 0; padding: 0;">{code(exp_dir_str)}</small>"""
Expand Down Expand Up @@ -1247,8 +1250,17 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
warning("No step info to plot")
return None

# this allows to pop labels to make sure we don't use more than 1 for the legend
labels = ["reset", "env", "agent", "exec action", "action error"]
# Updated labels to include new profiling stages
labels = [
"reset",
"env",
"agent",
"exec action",
"action error",
"wait for page",
"validation",
"get observation",
]
labels = {e: e for e in labels}

colors = plt.get_cmap("tab20c").colors
Expand All @@ -1257,6 +1269,7 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
all_times = []
step_times = []
for i, step_info in progress_fn(list(enumerate(step_info_list)), desc="Building plot."):
assert isinstance(step_info, StepInfo), f"Expected StepInfo, got {type(step_info)}"
step = step_info.step

prof = deepcopy(step_info.profiling)
Expand All @@ -1278,6 +1291,39 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
label = labels.pop("exec action", None)
add_patch(ax, prof.action_exec_start, prof.action_exec_stop, colors[3], label)

# NEW: Add wait for page loading visualization
if (
hasattr(prof, "wait_for_page_loading_start")
and prof.wait_for_page_loading_start > 0
):
add_patch(
ax,
prof.wait_for_page_loading_start,
prof.wait_for_page_loading_stop,
colors[19],
labels.pop("wait for page", None),
)

# NEW: Add validation visualization
if hasattr(prof, "validation_start") and prof.validation_start > 0:
add_patch(
ax,
prof.validation_start,
prof.validation_stop,
colors[8],
labels.pop("validation", None),
)

# NEW: Add get observation visualization
if hasattr(prof, "get_observation_start") and prof.get_observation_start > 0:
add_patch(
ax,
prof.get_observation_start,
prof.get_observation_stop,
colors[12],
labels.pop("get observation", None),
)

try:
next_step_error = step_info_list[i + 1].obs["last_action_error"]
except (IndexError, KeyError, TypeError):
Expand Down Expand Up @@ -1344,7 +1390,6 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr

ax.set_ylim(0, 1)
ax.set_xlim(0, max(all_times) + 1)
# plt.gca().autoscale()

ax.set_xlabel("Time")
ax.set_yticks([])
Expand All @@ -1353,7 +1398,7 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
ax.legend(
loc="upper center",
bbox_to_anchor=(0.5, 1.2),
ncol=5,
ncol=8, # Updated to accommodate new labels
frameon=True,
)

Expand Down
1 change: 0 additions & 1 deletion src/agentlab/analyze/overlay_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,6 @@ def overlay_rectangle(

if dashed:
# Draw dashed rectangle
print("Drawing dashed rectangle")
linedashed(draw, x, y, x + w, y, color, width)
linedashed(draw, x + w, y, x + w, y + h, color, width)
linedashed(draw, x + w, y + h, x, y + h, color, width)
Expand Down
21 changes: 20 additions & 1 deletion src/agentlab/experiments/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import numpy as np
from browsergym.core.chat import Chat
from browsergym.experiments.agent import Agent
from browsergym.experiments.utils import count_messages_token, count_tokens
from browsergym.experiments.utils import count_tokens
from dataclasses_json import DataClassJsonMixin
from PIL import Image
from tqdm import tqdm
Expand Down Expand Up @@ -48,6 +48,7 @@ class EnvArgs(DataClassJsonMixin):
slow_mo: Optional[int] = None # use default value from BrowserGym
storage_state: Optional[str | Path | dict] = None
task_kwargs: Optional[dict] = None # use default value from BrowserGym
pre_observation_delay: float = None # seconds, wait for JS events to be fired

def make_env(
self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=True
Expand All @@ -71,6 +72,8 @@ def make_env(
extra_kwargs["viewport"] = self.viewport
if self.slow_mo is not None:
extra_kwargs["slow_mo"] = self.slow_mo
if self.pre_observation_delay is not None:
extra_kwargs["pre_observation_delay"] = self.pre_observation_delay
if self.storage_state:
extra_kwargs["pw_context_kwargs"] = {"storage_state": self.storage_state}
if self.task_kwargs is not None:
Expand Down Expand Up @@ -142,6 +145,12 @@ class StepTimestamps:
env_stop: float = 0
agent_start: float = 0
agent_stop: float = 0
wait_for_page_loading_start: float = 0
wait_for_page_loading_stop: float = 0
validation_start: float = 0
validation_stop: float = 0
get_observation_start: float = 0
get_observation_stop: float = 0


@dataclass
Expand Down Expand Up @@ -199,6 +208,12 @@ def from_step(self, env: gym.Env, action: str, obs_preprocessor: callable):
t.action_exec_start = env_info["action_exec_start"] # start
t.action_exect_after_timeout = env_info["action_exec_stop"]
t.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"]
t.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None)
t.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None)
t.validation_start = env_info.get("validation_start", None)
t.validation_stop = env_info.get("validation_stop", None)
t.get_observation_start = env_info.get("get_observation_start", None)
t.get_observation_stop = env_info.get("get_observation_stop", None)

if obs_preprocessor:
self.obs = obs_preprocessor(self.obs)
Expand Down Expand Up @@ -447,6 +462,10 @@ def run(self):
logger.debug("Sending action to environment.")
step_info.from_step(env, action, obs_preprocessor=agent.obs_preprocessor)
logger.debug("Environment stepped.")
if step_info.is_done:
logger.debug(
f"Episode done: terminated: {step_info.terminated}, truncated: {step_info.truncated}."
)

except Exception as e:
err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}"
Expand Down
2 changes: 1 addition & 1 deletion src/agentlab/experiments/study.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,7 @@ def set_demo_mode(env_args_list: list[EnvArgs]):
env_args.slow_mo = 1000


def _convert_env_args(env_args_list):
def _convert_env_args(env_args_list) -> list[EnvArgs]:
"""Return a list where every element is the *new* EnvArgs.

For backward compatibility, we need to convert the old EnvArgs to the new one.
Expand Down