From a0328549544c032061536622e0a130b5a4af1c14 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 6 Nov 2024 19:16:47 +0000 Subject: [PATCH 01/42] yet another way to kill timedout jobs --- src/agentlab/experiments/exp_utils.py | 15 +++-- .../experiments/graph_execution_ray.py | 66 ++++++++++++++++--- tests/experiments/test_launch_exp.py | 12 ++-- 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index c2864ce0..97ce527d 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -27,9 +27,11 @@ def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): """Run exp_args.run() with a timeout and handle dependencies.""" - episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) - with timeout_manager(seconds=episode_timeout): - return exp_arg.run() + # episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) + # logger.warning(f"Running {exp_arg.exp_id} with timeout of {episode_timeout} seconds.") + # with timeout_manager(seconds=episode_timeout): + # this timeout method is not robust enough. using ray.cancel instead + return exp_arg.run() def _episode_timeout(exp_arg: ExpArgs, avg_step_timeout=60): @@ -62,13 +64,12 @@ def timeout_manager(seconds: int = None): def alarm_handler(signum, frame): - logger.warning( - f"Operation timed out after {seconds}s, sending SIGINT and raising TimeoutError." - ) + logger.warning(f"Operation timed out after {seconds}s, raising TimeoutError.") # send sigint - os.kill(os.getpid(), signal.SIGINT) + # os.kill(os.getpid(), signal.SIGINT) # this doesn't seem to do much I don't know why # Still raise TimeoutError for immediate handling + # This works, but it doesn't seem enough to kill the job raise TimeoutError(f"Operation timed out after {seconds} seconds") previous_handler = signal.signal(signal.SIGALRM, alarm_handler) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 3e01be31..703f59e6 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,11 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" - +import time import ray import bgym -from agentlab.experiments.exp_utils import run_exp +from agentlab.experiments.exp_utils import run_exp, _episode_timeout +from ray.util import state +import logging +logger = logging.getLogger(__name__) run_exp = ray.remote(run_exp) @@ -15,25 +18,70 @@ def execute_task_graph(exp_args_list: list[bgym.ExpArgs], avg_step_timeout=60): """Execute a task graph in parallel while respecting dependencies using Ray.""" exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - tasks = {} + task_map = {} def get_task(exp_arg: bgym.ExpArgs): - if exp_arg.exp_id not in tasks: + if exp_arg.exp_id not in task_map: # Get all dependency tasks first dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] # Create new task that depends on the dependency results - tasks[exp_arg.exp_id] = run_exp.remote( + task_map[exp_arg.exp_id] = run_exp.remote( exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout ) - return tasks[exp_arg.exp_id] + return task_map[exp_arg.exp_id] # Build task graph for exp_arg in exp_args_list: get_task(exp_arg) - # Execute all tasks and gather results + max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in 
exp_args_list])
+    return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
+
+
+def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_interval: float = 1.0):
+    """Cancel tasks that exceed the timeout.
+
+    I tried various methods for killing a job that hangs; so far this is the
+    only one that seems to work reliably (hopefully).
+    """
+    task_list = list(tasks.values())
     task_ids = list(tasks.keys())
+    logger.warning(f"Any task exceeding {timeout} seconds will be cancelled.")
+
+    while True:
+        ready, not_ready = ray.wait(task_list, num_returns=len(task_list), timeout=poll_interval)
+        for task in not_ready:
+            elapsed_time = get_elapsed_time(task)
+            # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
+            if elapsed_time is not None and elapsed_time > timeout:
+                msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
+                if elapsed_time < timeout + 60:
+                    logger.warning(msg + " Cancelling task.")
+                    ray.cancel(task, force=False, recursive=False)
+                else:
+                    logger.warning(msg + " Force killing.")
+                    ray.cancel(task, force=True, recursive=False)
+        if len(ready) == len(task_list):
+            results = []
+            for task in ready:
+                try:
+                    result = ray.get(task)
+                except Exception as e:
+                    result = e
+                results.append(result)
+
+            return {task_id: result for task_id, result in zip(task_ids, results)}
+
+
+def get_elapsed_time(task_ref: ray.ObjectRef):
+    task_id = task_ref.task_id().hex()
+    task_info = state.get_task(task_id, address="auto")
+    if task_info and task_info.start_time_ms is not None:
+        start_time_s = task_info.start_time_ms / 1000.0  # Convert ms to s
+        current_time_s = time.time()
+        elapsed_time = current_time_s - start_time_s
+        return elapsed_time
+    else:
+        return None  # Task has not started yet
diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py
index 91a7c2db..782a9edc 100644
--- a/tests/experiments/test_launch_exp.py
+++ b/tests/experiments/test_launch_exp.py
@@ -1,3 +1,4 @@
+import math
 import tempfile
 from pathlib import Path
 
@@ -63,9 +64,8 @@ def _test_launch_system(backend="ray", cause_timeout=False):
             if row.stack_trace is not None:
                 print(row.stack_trace)
             if cause_timeout:
-                assert row.err_msg is not None
-                assert "Timeout" in row.err_msg
-                assert row.cum_reward == 0
+                # assert row.err_msg is not None
+                assert math.isnan(row.cum_reward) or row.cum_reward == 0
             else:
                 assert row.err_msg is None
                 assert row.cum_reward == 1.0
@@ -73,9 +73,9 @@ def _test_launch_system(backend="ray", cause_timeout=False):
     study_summary = inspect_results.summarize_study(results_df)
     assert len(study_summary) == 1
     assert study_summary.std_err.iloc[0] == 0
-    assert study_summary.n_completed.iloc[0] == "3/3"
 
     if not cause_timeout:
+        assert study_summary.n_completed.iloc[0] == "3/3"
         assert study_summary.avg_reward.iloc[0] == 1.0
 
 
@@ -91,7 +91,7 @@ def test_launch_system_ray():
     _test_launch_system(backend="ray")
 
 
-def _test_timeout_ray():
+def test_timeout_ray():
     _test_launch_system(backend="ray", cause_timeout=True)
 
 
@@ -120,7 +120,7 @@ def test_4o_mini_on_miniwob_tiny_test():
 
 
 if __name__ == "__main__":
-    _test_timeout_ray()
+    test_timeout_ray()
    # test_4o_mini_on_miniwob_tiny_test()
    # test_launch_system_ray()
    # test_launch_system_sequntial()
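For context, the two-stage cancellation introduced by patch 01 leans on Ray's documented semantics: ray.cancel(ref, force=False) raises a KeyboardInterrupt inside the worker, which a genuinely hung call can ignore, while force=True kills the worker process outright. Below is a minimal, self-contained sketch of that escalation pattern; the hung_task function and the timings are illustrative assumptions, not part of the patch:

import time

import ray

ray.init(num_cpus=1)


@ray.remote
def hung_task():
    time.sleep(3600)  # stands in for an episode that never returns
    return "done"


ref = hung_task.remote()
time.sleep(1)

# Stage 1: cooperative cancellation; Ray raises KeyboardInterrupt in the worker.
ray.cancel(ref, force=False, recursive=False)
try:
    ray.get(ref, timeout=10)
except ray.exceptions.TaskCancelledError:
    print("task cancelled cleanly")
except ray.exceptions.GetTimeoutError:
    # Stage 2: the task ignored the interrupt; kill the worker process.
    ray.cancel(ref, force=True, recursive=False)

With a plain time.sleep the first stage normally suffices; a task stuck in a C extension may only die at stage 2, which is why poll_for_timeout grants a 60-second grace period before force-killing.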
From ac1a461eae9573d880d5415f39f1cb64fb2fe839 Mon Sep 17 00:00:00 2001
From: recursix
Date: Wed, 6 Nov 2024 21:29:28 +0000
Subject: [PATCH 02/42] Improve timeout handling in task polling logic

---
 src/agentlab/experiments/graph_execution_ray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index 703f59e6..46b96bd8 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -36,6 +36,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
@@ -57,7 +58,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
             # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
             if elapsed_time is not None and elapsed_time > timeout:
                 msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
-                if elapsed_time < timeout + 60:
+                if elapsed_time < timeout + 60 + poll_interval:
                     logger.warning(msg + " Cancelling task.")
                     ray.cancel(task, force=False, recursive=False)
                 else:
From 290b88de62b77ee509700afbf078fc2dd21f12c0 Mon Sep 17 00:00:00 2001
From: recursix
Date: Thu, 7 Nov 2024 22:05:40 +0000
Subject: [PATCH 03/42] Add method to override max_steps in Study class

---
 src/agentlab/experiments/study.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
index 4e3bccea..6ff6ea5e 100644
--- a/src/agentlab/experiments/study.py
+++ b/src/agentlab/experiments/study.py
@@ -245,6 +245,10 @@ def get_report(self, ignore_cache=False, ignore_stale=False):
         return inspect_results.get_study_summary(
             self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale
         )
+
+    def override_max_steps(self, max_steps):
+        for exp_args in self.exp_args_list:
+            exp_args.env_args.max_steps = max_steps
 
     @staticmethod
     def load(dir: Path) -> "Study":
From 3f05803722121c9f3ff2e12946365cd03c396426 Mon Sep 17 00:00:00 2001
From: recursix
Date: Fri, 8 Nov 2024 18:32:15 +0000
Subject: [PATCH 04/42] add support for tab visibility in observation flags and
 update related components

---
 src/agentlab/agents/dynamic_prompting.py      | 13 ++---
 .../agents/generic_agent/generic_agent.py     |  3 +-
 .../generic_agent/generic_agent_prompt.py     |  1 -
 tests/agents/test_generic_prompt.py           | 47 +++++++++----------
 4 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py
index 1ddbf3be..3b4fcd73 100644
--- a/src/agentlab/agents/dynamic_prompting.py
+++ b/src/agentlab/agents/dynamic_prompting.py
@@ -10,8 +10,6 @@
 
 import bgym
 from browsergym.core.action.base import AbstractActionSet
-from browsergym.core.action.highlevel import HighLevelActionSet
-from browsergym.core.action.python import PythonActionSet
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
 
 from agentlab.llm.llm_utils import (
@@ -71,6 +69,7 @@ class ObsFlags(Flags):
 
     use_html: bool = True
     use_ax_tree: bool = False
+    use_tabs: bool = False
     use_focused_element: bool = False
     use_error_logs: bool = False
     use_history: bool = False
@@ -386,11 +385,7 @@ def _prompt(self) -> str:
 URL: {page_url}
 """
             prompt_pieces.append(prompt_piece)
-        self._prompt = "\n".join(prompt_pieces)
-
-
-def has_tab_action(action_set: bgym.HighLevelActionSetArgs):
-    return "tab" in action_set.subsets
+        return 
"\n".join(prompt_pieces) class Observation(Shrinkable): @@ -399,14 +394,14 @@ class Observation(Shrinkable): Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: + def __init__(self, obs, flags: ObsFlags) -> None: super().__init__() self.flags = flags self.obs = obs self.tabs = Tabs( obs, - visible=use_tabs, + visible=lambda: flags.use_tabs, prefix="## ", ) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 5ef8a4cc..98026dc1 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -32,6 +32,7 @@ def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.obs.use_tabs = benchmark.is_multi_tab self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) # for backward compatibility with old traces @@ -268,5 +269,3 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index eb45ba59..67899f18 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -77,7 +77,6 @@ def __init__( self.obs = dp.Observation( obs_history[-1], self.flags.obs, - use_tabs=dp.has_tab_action(self.flags.action.action_set), ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a579c261..66b173a2 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -20,33 +20,28 @@ """ +base_obs = { + "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], + "chat_messages": [{"role": "user", "message": "do this and that"}], + "axtree_txt": "[1] Click me", + "focused_element_bid": "45-256", + "open_pages_urls": ["https://example.com"], + "open_pages_titles": ["Example"], + "active_page_index": 0, +} OBS_HISTORY = [ - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, this is an error in the past", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, there is an error now", }, ] @@ -58,6 +53,7 @@ obs=dp.ObsFlags( use_html=True, use_ax_tree=True, + use_tabs=True, use_focused_element=True, 
use_error_logs=True, use_history=True, @@ -104,6 +100,10 @@ "obs.use_ax_tree", ("AXTree:", "Click me"), ), + ( + "obs.use_tabs", + ("Currently open tabs:","(active tab)"), + ), ( "obs.use_focused_element", ("Focused element:", "bid='45-256'"), @@ -251,11 +251,6 @@ def test_main_prompt_elements_present(): # for debugging test_shrinking_observation() test_main_prompt_elements_present() - for flag, expected_prompts in FLAG_EXPECTED_PROMPT: - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: + # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + \ No newline at end of file From 2fe585fe048dfa1da3cc5cfe843482bd34de5361 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 19:59:45 +0000 Subject: [PATCH 05/42] fix tests --- tests/agents/test_generic_prompt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 66b173a2..a26c5747 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -165,7 +165,7 @@ def test_shrinking_observation(): flags.obs.use_html = True prompt_maker = MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -231,7 +231,7 @@ def test_main_prompt_elements_present(): # Initialize MainPrompt prompt = str( MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, From 4a8cbb25dd581c0eb946f21b57ddfa6829237a79 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 20:29:07 +0000 Subject: [PATCH 06/42] black --- src/agentlab/agents/dynamic_prompting.py | 2 +- src/agentlab/experiments/study.py | 2 +- tests/agents/test_generic_prompt.py | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 3b4fcd73..73688f0f 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -385,7 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - return "\n".join(prompt_pieces) + return "\n".join(prompt_pieces) class Observation(Shrinkable): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 6ff6ea5e..36a1f54c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,7 +245,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - + def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: exp_args.env_args.max_steps = max_steps diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a26c5747..ae2e6d8a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -32,15 +32,18 @@ } 
OBS_HISTORY = [
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(1),
         "last_action_error": "",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(2),
         "last_action_error": "Hey, this is an error in the past",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(3),
         "last_action_error": "Hey, there is an error now",
     },
@@ -102,7 +105,7 @@
     ),
     (
         "obs.use_tabs",
-        ("Currently open tabs:","(active tab)"),
+        ("Currently open tabs:", "(active tab)"),
     ),
     (
         "obs.use_focused_element",
@@ -253,4 +256,3 @@ def test_main_prompt_elements_present():
     test_main_prompt_elements_present()
     # for flag, expected_prompts in FLAG_EXPECTED_PROMPT:
     #     test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts)
-
\ No newline at end of file
From 17fc3d1a27bf7218d44a8a67f437f8a877b64c75 Mon Sep 17 00:00:00 2001
From: recursix
Date: Wed, 6 Nov 2024 21:29:28 +0000
Subject: [PATCH 07/42] Improve timeout handling in task polling logic

---
 src/agentlab/experiments/graph_execution_ray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index 703f59e6..46b96bd8 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -36,6 +36,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
@@ -57,7 +58,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
             # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
             if elapsed_time is not None and elapsed_time > timeout:
                 msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
- if elapsed_time < timeout + 60: + if elapsed_time < timeout + 60 + poll_interval: logger.warning(msg + " Cancelling task.") ray.cancel(task, force=False, recursive=False) else: From 1e07d3e00848f2978d49313bf255d3b6ef6d39a3 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 6 Nov 2024 16:13:37 -0500 Subject: [PATCH 08/42] yet another way to kill timedout jobs (#108) --- src/agentlab/experiments/graph_execution_ray.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 46b96bd8..ccde71f8 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,12 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" +import logging import time -import ray + import bgym -from agentlab.experiments.exp_utils import run_exp, _episode_timeout +import ray from ray.util import state -import logging + +from agentlab.experiments.exp_utils import _episode_timeout, run_exp logger = logging.getLogger(__name__) From 63d8debd2d10fae5496136d6ca25dbfbda579887 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 7 Nov 2024 22:05:40 +0000 Subject: [PATCH 09/42] Add method to override max_steps in Study class --- src/agentlab/experiments/study.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 4e3bccea..6ff6ea5e 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,6 +245,10 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) + + def override_max_steps(self, max_steps): + for exp_args in self.exp_args_list: + exp_args.env_args.max_steps = max_steps @staticmethod def load(dir: Path) -> "Study": From b88a058943b86dc74918d735fa16cac76cdb55f8 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 18:32:15 +0000 Subject: [PATCH 10/42] add support for tab visibility in observation flags and update related components --- src/agentlab/agents/dynamic_prompting.py | 13 ++--- .../agents/generic_agent/generic_agent.py | 3 +- .../generic_agent/generic_agent_prompt.py | 1 - tests/agents/test_generic_prompt.py | 47 +++++++++---------- 4 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 1ddbf3be..3b4fcd73 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -10,8 +10,6 @@ import bgym from browsergym.core.action.base import AbstractActionSet -from browsergym.core.action.highlevel import HighLevelActionSet -from browsergym.core.action.python import PythonActionSet from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html from agentlab.llm.llm_utils import ( @@ -71,6 +69,7 @@ class ObsFlags(Flags): use_html: bool = True use_ax_tree: bool = False + use_tabs: bool = False use_focused_element: bool = False use_error_logs: bool = False use_history: bool = False @@ -386,11 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - self._prompt = "\n".join(prompt_pieces) - - -def has_tab_action(action_set: bgym.HighLevelActionSetArgs): - return "tab" in action_set.subsets + return "\n".join(prompt_pieces) class Observation(Shrinkable): @@ -399,14 +394,14 @@ 
class Observation(Shrinkable): Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: + def __init__(self, obs, flags: ObsFlags) -> None: super().__init__() self.flags = flags self.obs = obs self.tabs = Tabs( obs, - visible=use_tabs, + visible=lambda: flags.use_tabs, prefix="## ", ) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 5ef8a4cc..98026dc1 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -32,6 +32,7 @@ def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.obs.use_tabs = benchmark.is_multi_tab self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) # for backward compatibility with old traces @@ -268,5 +269,3 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index eb45ba59..67899f18 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -77,7 +77,6 @@ def __init__( self.obs = dp.Observation( obs_history[-1], self.flags.obs, - use_tabs=dp.has_tab_action(self.flags.action.action_set), ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a579c261..66b173a2 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -20,33 +20,28 @@ """ +base_obs = { + "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], + "chat_messages": [{"role": "user", "message": "do this and that"}], + "axtree_txt": "[1] Click me", + "focused_element_bid": "45-256", + "open_pages_urls": ["https://example.com"], + "open_pages_titles": ["Example"], + "active_page_index": 0, +} OBS_HISTORY = [ - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, this is an error in the past", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, there is an error now", }, ] @@ -58,6 +53,7 @@ obs=dp.ObsFlags( use_html=True, use_ax_tree=True, + use_tabs=True, use_focused_element=True, use_error_logs=True, use_history=True, @@ -104,6 +100,10 @@ "obs.use_ax_tree", 
("AXTree:", "Click me"), ), + ( + "obs.use_tabs", + ("Currently open tabs:","(active tab)"), + ), ( "obs.use_focused_element", ("Focused element:", "bid='45-256'"), @@ -251,11 +251,6 @@ def test_main_prompt_elements_present(): # for debugging test_shrinking_observation() test_main_prompt_elements_present() - for flag, expected_prompts in FLAG_EXPECTED_PROMPT: - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: + # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + \ No newline at end of file From e97d023b3a57742a5de2af8a53a64abddfc47735 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 19:59:45 +0000 Subject: [PATCH 11/42] fix tests --- tests/agents/test_generic_prompt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 66b173a2..a26c5747 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -165,7 +165,7 @@ def test_shrinking_observation(): flags.obs.use_html = True prompt_maker = MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -231,7 +231,7 @@ def test_main_prompt_elements_present(): # Initialize MainPrompt prompt = str( MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, From ccd7b8b930e24775cfe558844a49527cede91735 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 20:29:07 +0000 Subject: [PATCH 12/42] black --- src/agentlab/agents/dynamic_prompting.py | 2 +- src/agentlab/experiments/study.py | 2 +- tests/agents/test_generic_prompt.py | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 3b4fcd73..73688f0f 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -385,7 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - return "\n".join(prompt_pieces) + return "\n".join(prompt_pieces) class Observation(Shrinkable): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 6ff6ea5e..36a1f54c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,7 +245,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - + def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: exp_args.env_args.max_steps = max_steps diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a26c5747..ae2e6d8a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -32,15 +32,18 @@ } OBS_HISTORY = [ - base_obs | { + base_obs + | { "pruned_html": 
html_template.format(1),
         "last_action_error": "",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(2),
         "last_action_error": "Hey, this is an error in the past",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(3),
         "last_action_error": "Hey, there is an error now",
     },
@@ -102,7 +105,7 @@
     ),
     (
         "obs.use_tabs",
-        ("Currently open tabs:","(active tab)"),
+        ("Currently open tabs:", "(active tab)"),
     ),
     (
         "obs.use_focused_element",
@@ -253,4 +256,3 @@ def test_main_prompt_elements_present():
     test_main_prompt_elements_present()
     # for flag, expected_prompts in FLAG_EXPECTED_PROMPT:
     #     test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts)
-
\ No newline at end of file
From 1aa491659a336918014a8cf1ab8e6c0eaf66a51c Mon Sep 17 00:00:00 2001
From: Maxime Gasse
Date: Fri, 8 Nov 2024 16:12:56 -0500
Subject: [PATCH 13/42] black

---
 src/agentlab/experiments/graph_execution_ray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index ccde71f8..5dd18d4a 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -38,7 +38,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
-    
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
From c4e8acbdc5a2aa7a7e6f160c8bfe93fd56e774b0 Mon Sep 17 00:00:00 2001
From: recursix
Date: Fri, 8 Nov 2024 22:12:15 +0000
Subject: [PATCH 14/42]

---
 .github/workflows/unit_tests.yml              |  3 +
 .../agents/generic_agent/agent_configs.py     | 13 ++--
 .../agents/generic_agent/tmlr_config.py       |  2 +-
 src/agentlab/experiments/args.py              | 10 +++-
 src/agentlab/llm/llm_configs.py               | 59 ++++++++++---------
 5 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 7a0e312a..3342ba54 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -58,6 +58,9 @@ jobs:
       - name: Check MiniWob availability
         run: curl -I "http://localhost:8080/miniwob/" || echo "MiniWob not reachable"
 
+      - name: Pre-download nltk resources
+        run: python -c "import nltk; nltk.download('punkt_tab')"
+
       - name: Run AgentLab Unit Tests
         env:
           MINIWOB_URL: "http://localhost:8080/miniwob/"
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index 2d4f9bc3..a5db8c90 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -1,4 +1,5 @@
 import bgym
+
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.experiments import args
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
@@ -41,7 +42,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
    be_cautious=True,
    extra_instructions=None,
 )
@@ -89,7 +90,7 @@
     use_abstract_example=True,  # useful
     use_hints=True,  # useful
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
@@ -136,7 +137,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
     add_missparsed_messages=True,
@@ -186,7 +187,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
     add_missparsed_messages=True,
@@ -234,7 +235,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
@@ -298,7 +299,7 @@
     use_hints=args.Choice([True, False], p=[0.7, 0.3]),
     be_cautious=args.Choice([True, False]),
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     extra_instructions=None,
 )
diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py
index 11860e69..48a28c68 100644
--- a/src/agentlab/agents/generic_agent/tmlr_config.py
+++ b/src/agentlab/agents/generic_agent/tmlr_config.py
@@ -40,7 +40,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
diff --git a/src/agentlab/experiments/args.py b/src/agentlab/experiments/args.py
index bbbb3b7b..6a4fa804 100644
--- a/src/agentlab/experiments/args.py
+++ b/src/agentlab/experiments/args.py
@@ -105,13 +105,19 @@ def expand_cross_product(obj: Any | list[Any]):
     for obj in obj_list:
         cprod_paths = _find_cprod_with_paths(obj)
         if not cprod_paths:
-            return [copy.deepcopy(obj)]
+            result.append(copy.deepcopy(obj))
+            continue
 
         paths, cprod_objects = zip(*cprod_paths)
         combinations = product(*[cprod_obj.elements for cprod_obj in cprod_objects])
 
+        # create a base object with empty fields to make fast deep copies from
+        base_obj = copy.deepcopy(obj)
+        for path in paths:
+            _set_value(base_obj, path, None)
+
         for combo in combinations:
-            new_obj = copy.deepcopy(obj)
+            new_obj = copy.deepcopy(base_obj)
             for path, value in zip(paths, combo):
                 _set_value(new_obj, path, value)
             result.append(new_obj)
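The args.py hunk above fixes a bug (the early return inside the loop silently dropped the remaining objects of the input list) and speeds expansion up by deep-copying a hollowed-out base object instead of the full template for every combination. For readers unfamiliar with the helper, a rough sketch of its intended behaviour, assuming args.CrossProd simply wraps the candidate values in an .elements attribute as the diff suggests (the Flags dataclass below is illustrative, not from the repository):

from dataclasses import dataclass

from agentlab.experiments import args


@dataclass
class Flags:
    use_html: bool = True
    temperature: float = 0.0


template = Flags(
    use_html=args.CrossProd([True, False]),
    temperature=args.CrossProd([0.0, 0.5]),
)

# One concrete Flags instance per combination of candidate values: 2 x 2 = 4.
configs = args.expand_cross_product(template)
assert len(configs) == 4
assert all(isinstance(c.use_html, bool) for c in configs)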
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
index 8376b5c2..feb4d1c8 100644
--- a/src/agentlab/llm/llm_configs.py
+++ b/src/agentlab/llm/llm_configs.py
@@ -20,28 +20,28 @@
     "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs(
         model_name="gpt-4o-mini-2024-07-18",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,
-        max_new_tokens=4000,
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,
         vision_support=True,
     ),
     "openai/gpt-4-1106-preview": OpenAIModelArgs(
         model_name="gpt-4-1106-preview",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,
     ),
     "openai/gpt-4-vision-preview": OpenAIModelArgs(
         model_name="gpt-4-vision-preview",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,  # I think this model has very small default value if we don't set max_new_tokens
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,  # I think this model has very small default value if we don't set max_new_tokens
         vision_support=True,
     ),
     "openai/gpt-4o-2024-05-13": OpenAIModelArgs(
         model_name="gpt-4o-2024-05-13",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,  # I think this model has very small default value if we don't set max_new_tokens
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,  # I think this model has very small default value if we don't set max_new_tokens
         vision_support=True,
     ),
     "openai/gpt-3.5-turbo-0125": OpenAIModelArgs(
@@ -67,22 +67,25 @@
         model_name="gpt-4o",
         deployment_name="gpt-4o-2024-05-13",
         max_total_tokens=128_000,
-
max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( model_name="gpt-4o-mini", deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), # ---------------- OSS LLMs ----------------# "meta-llama/Meta-Llama-3-70B-Instruct": SelfHostedModelArgs( @@ -113,43 +116,43 @@ "openrouter/meta-llama/llama-3.1-405b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-405b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct:free": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct:free", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/anthropic/claude-3.5-sonnet:beta": OpenRouterModelArgs( model_name="anthropic/claude-3.5-sonnet:beta", max_total_tokens=200_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=160_000, + max_new_tokens=40_000, temperature=1e-1, vision_support=True, ), @@ -163,8 +166,8 @@ "openrouter/openai/o1-mini-2024-09-12": OpenRouterModelArgs( model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), } From 8de36e2b6951fcac21dceae136e75fcc913e6458 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:23:46 +0000 Subject: [PATCH 15/42] Fix sorting bug. improve directory content retrieval with summary statistics --- src/agentlab/analyze/agent_xray.py | 67 ++++++++++++++++--------- src/agentlab/analyze/inspect_results.py | 1 + 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 38968fd6..310ebd22 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -184,8 +184,6 @@ def run_gradio(results_dir: Path): 2. **Select Task**: Select the task you want to analyze, this will trigger an update of the available seeds. 
-    **IMPORTANT NOTE**: Due to a gradio bug, if you sort the columns of the table, the task
-    selection will not correspond to the right one.
 
     3. **Select the Seed**: You might have multiple repetitions for a given task; you will be able
     to select the seed you want to analyze.
@@ -216,10 +214,9 @@
                 """\
 Click on a row to select an agent. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
             )
             agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False)
         with gr.Tab("Select Task and Seed", id="Select Task"):
@@ -231,9 +228,8 @@
                         """\
 Click on a row to select a task. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
                     )
 
                     refresh_results_button = gr.Button("↺", scale=0, size="sm")
@@ -250,9 +246,8 @@
                         """\
 Click on a row to select a seed. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
                     )
 
                     seed_table = gr.DataFrame(
@@ -824,22 +819,22 @@ def extract_columns(row: pd.Series):
     )
 
     seed_df = result_df.apply(extract_columns, axis=1)
-    seed_df["Idx"] = seed_df.index
     return seed_df
 
 
 def on_select_agent(evt: gr.SelectData, df: pd.DataFrame):
-    global info
+    # TODO try to find a clever way to solve the sort bug here
     return info.get_agent_id(df.iloc[evt.index[0]])
 
 
 def on_select_task(evt: gr.SelectData, df: pd.DataFrame, agent_id: list[tuple]):
-    return (agent_id, df.iloc[evt.index[0]][TASK_NAME_KEY])
+    # get col index
+    col_idx = df.columns.get_loc(TASK_NAME_KEY)
+    return (agent_id, evt.row_value[col_idx])
 
 
 def update_seeds(agent_task_id: tuple):
     agent_id, task_name = agent_task_id
-    global info
     seed_df = get_seeds_df(info.agent_df, task_name)
     first_seed = seed_df.iloc[0]["seed"]
     return seed_df, EpisodeId(agent_id=agent_id, task_name=task_name, seed=first_seed)
 
 
 def on_select_seed(evt: gr.SelectData, df: pd.DataFrame, agent_task_id: tuple):
     agent_id, task_name = agent_task_id
-    seed = df.iloc[evt.index[0]]["seed"]
+    col_idx = df.columns.get_loc("seed")
+    seed = evt.row_value[col_idx]  # seed should be the first column
     return EpisodeId(agent_id=agent_id, task_name=task_name, seed=seed)
 
 
@@ -933,6 +929,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
     if exp_dir == select_dir_instructions:
         return None, None
 
+    exp_dir = exp_dir.split(" - ")[0]
     global info
 
     if len(exp_dir) == 0:
@@ -943,10 +940,13 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
     info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm)
     info.result_df = remove_args_from_col(info.result_df)
 
-    agent_report = display_table(get_agent_report(info.result_df))
+    study_summary = inspect_results.summarize_study(info.result_df)
+    # save 
study_summary + study_summary.to_csv(info.exp_list_dir / "summary_df.csv", index=False) + agent_report = display_table(study_summary) + info.agent_id_keys = agent_report.index.names agent_report.reset_index(inplace=True) - agent_report["Idx"] = agent_report.index agent_id = info.get_agent_id(agent_report.iloc[0]) @@ -960,7 +960,6 @@ def new_agent_id(agent_id: list[tuple]): info.tasks_df = inspect_results.reduce_episodes(info.agent_df).reset_index() info.tasks_df = info.tasks_df.drop(columns=["std_err"]) - info.tasks_df["Idx"] = info.tasks_df.index # task name of first element task_name = info.tasks_df.iloc[0][TASK_NAME_KEY] @@ -968,10 +967,30 @@ def new_agent_id(agent_id: list[tuple]): def get_directory_contents(results_dir: Path): - directories = sorted( - [str(file.name) for file in results_dir.iterdir() if file.is_dir()], reverse=True - ) - return [select_dir_instructions] + directories + exp_descriptions = [] + for dir in results_dir.iterdir(): + if not dir.is_dir(): + continue + + exp_description = dir.name + # get summary*.csv files and find the most recent + summary_files = list(dir.glob("summary*.csv")) + if len(summary_files) != 0: + most_recent_summary = max(summary_files, key=os.path.getctime) + summary_df = pd.read_csv(most_recent_summary) + + # get row with max avg_reward + max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax()] + reward = max_reward_row["avg_reward"] * 100 + completed = max_reward_row["n_completed"] + n_err = max_reward_row["n_err"] + exp_description += ( + f" - avg-reward: {reward:.1f}% - completed: {completed} - errors: {n_err}" + ) + + exp_descriptions.append(exp_description) + + return [select_dir_instructions] + sorted(exp_descriptions, reverse=True) def most_recent_folder(results_dir: Path): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 9ea2aac3..cf038773 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,6 +295,7 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), + cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) return pd.Series(record) From c4e8acbdc5a2aa7a7e6f160c8bfe93fd56e774b0 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:38 +0000 Subject: [PATCH 16/42] fix test --- src/agentlab/analyze/inspect_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index cf038773..4cb4ccf7 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,8 +295,9 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), - cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) + if "stats.cum_cost" in sub_df: + record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), return pd.Series(record) From c9f184c4546e680d6e529e2291cfa405ac620a3c Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:50 +0000 Subject: [PATCH 17/42] black --- src/agentlab/analyze/inspect_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 4cb4ccf7..8df2c1ef 100644 --- a/src/agentlab/analyze/inspect_results.py 
+++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), + record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),) return pd.Series(record) From 3a96d5608b2353477e94e7699d6c3880d002f824 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 9 Nov 2024 03:35:11 +0000 Subject: [PATCH 18/42] tmp --- src/agentlab/llm/llm_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index feb4d1c8..4a7a054e 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -21,7 +21,7 @@ model_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, vision_support=True, ), "openai/gpt-4-1106-preview": OpenAIModelArgs( @@ -84,7 +84,7 @@ deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, vision_support=True, ), # ---------------- OSS LLMs ----------------# @@ -167,7 +167,7 @@ model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, temperature=1e-1, ), } From a16aea037ff3594a6a672ffbb738a035b64411a3 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 13 Nov 2024 15:22:49 +0000 Subject: [PATCH 19/42] add error report, add cum cost to summary and ray backend by default --- src/agentlab/analyze/agent_xray.py | 19 +++++++++++++++---- src/agentlab/analyze/inspect_results.py | 2 +- src/agentlab/experiments/launch_exp.py | 6 +++--- src/agentlab/experiments/study.py | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 310ebd22..0d64a027 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -142,6 +142,10 @@ def filter_agent_id(self, agent_id: list[tuple]): max-height: 400px; overflow-y: auto; } +.error-report { + max-height: 700px; + overflow-y: auto; +} .my-code-view { max-height: 300px; overflow-y: auto; @@ -284,6 +288,8 @@ def run_gradio(results_dir: Path): with gr.Tab("Global Stats"): global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) + with gr.Tab("Error Report"): + error_report = gr.Markdown(elem_classes="error-report", show_copy_button=True) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") action_info = gr.Markdown(label="Action Info", elem_classes="my-markdown") @@ -411,7 +417,7 @@ def run_gradio(results_dir: Path): exp_dir_choice.change( fn=new_exp_dir, inputs=exp_dir_choice, - outputs=[agent_table, agent_id, constants, variables, global_stats], + outputs=[agent_table, agent_id, constants, variables, global_stats, error_report], ) agent_table.select(fn=on_select_agent, inputs=agent_table, outputs=[agent_id]) @@ -918,19 +924,24 @@ def get_agent_report(result_df: pd.DataFrame): def update_global_stats(): - global info stats = inspect_results.global_report(info.result_df, reduce_fn=inspect_results.summarize_stats) stats.reset_index(inplace=True) return stats +def update_error_report(): + report_files = list(info.exp_list_dir.glob("error_report*.md")) + if len(report_files) == 0: + return "No error report found" + report_files = sorted(report_files, 
key=os.path.getctime, reverse=True) + return report_files[0].read_text() + def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): if exp_dir == select_dir_instructions: return None, None exp_dir = exp_dir.split(" - ")[0] - global info if len(exp_dir) == 0: info.exp_list_dir = None @@ -951,7 +962,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): agent_id = info.get_agent_id(agent_report.iloc[0]) constants, variables = format_constant_and_variables() - return agent_report, agent_id, constants, variables, update_global_stats() + return agent_report, agent_id, constants, variables, update_global_stats(), update_error_report() def new_agent_id(agent_id: list[tuple]): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 8df2c1ef..09ba23a0 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),) + record["cum_cost"] = sub_df["stats.cum_cost"].sum(skipna=True).round(4) return pd.Series(record) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 49a778e3..cb331a99 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -40,9 +40,9 @@ def run_experiments( study_dir = Path(study_dir) study_dir.mkdir(parents=True, exist_ok=True) - if n_jobs == 1 and parallel_backend != "sequential": - logging.warning("Only 1 job, switching to sequential backend.") - parallel_backend = "sequential" + # if n_jobs == 1 and parallel_backend != "sequential": + # logging.warning("Only 1 job, switching to sequential backend.") + # parallel_backend = "sequential" logging.info(f"Saving experiments to {study_dir}") for exp_args in exp_args_list: diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 2139ce7b..b42f0bb5 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -123,7 +123,7 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None): def run( self, n_jobs=1, - parallel_backend="joblib", + parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, relaunch_errors=True, From a18e8e53ef0982153649a1605f9d98b95ee30d48 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:03:12 -0500 Subject: [PATCH 20/42] displaying exp names in ray dashboard (#123) * displaying exp names in ray dashboard * fixing tests --- src/agentlab/experiments/exp_utils.py | 15 ++++++++------- src/agentlab/experiments/graph_execution_ray.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index 97ce527d..95c7a71c 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -1,13 +1,13 @@ -import os -from pathlib import Path -from browsergym.experiments.loop import _move_old_exp, yield_all_exp_results -from tqdm import tqdm import logging -from browsergym.experiments.loop import ExpArgs -from contextlib import contextmanager +import os import signal import sys -from time import time, sleep +from contextlib import contextmanager +from pathlib import Path +from time import sleep, time + +from browsergym.experiments.loop import ExpArgs, 
_move_old_exp, yield_all_exp_results +from tqdm import tqdm logger = logging.getLogger(__name__) # Get logger based on module name @@ -130,6 +130,7 @@ def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[str, class MockedExpArgs: def __init__(self, exp_id, depends_on=None): self.exp_id = exp_id + self.exp_name = f"exp_{exp_id}" self.depends_on = depends_on if depends_on else [] self.start_time = None self.end_time = None diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 5dd18d4a..231a130c 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -28,7 +28,7 @@ def get_task(exp_arg: bgym.ExpArgs): dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] # Create new task that depends on the dependency results - task_map[exp_arg.exp_id] = run_exp.remote( + task_map[exp_arg.exp_id] = run_exp.options(name=f"{exp_arg.exp_name}").remote( exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout ) return task_map[exp_arg.exp_id] From a7d6467ed2c49110e384b8f965f14e5a08f5c1c2 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Fri, 15 Nov 2024 11:43:30 -0500 Subject: [PATCH 21/42] enabling chat o_0 (#124) --- src/agentlab/ui_assistant.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agentlab/ui_assistant.py b/src/agentlab/ui_assistant.py index 2bebaa41..96bbb0f9 100644 --- a/src/agentlab/ui_assistant.py +++ b/src/agentlab/ui_assistant.py @@ -3,6 +3,7 @@ from browsergym.experiments.loop import EnvArgs, ExpArgs from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import import_object @@ -14,6 +15,9 @@ def make_exp_args(agent_args: AgentArgs, start_url="https://www.google.com"): except AttributeError: pass + if isinstance(agent_args, GenericAgentArgs): + agent_args.flags.enable_chat = True + exp_args = ExpArgs( agent_args=agent_args, env_args=EnvArgs( From 50d4571284457d9c7d88a69bcda3b241785ad4f4 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 15 Nov 2024 16:58:24 +0000 Subject: [PATCH 22/42] sequential studies --- src/agentlab/experiments/study.py | 199 ++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 50 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index b42f0bb5..23713c5c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -1,7 +1,7 @@ +from abc import ABC, abstractmethod import gzip import logging import pickle -import re import uuid from dataclasses import dataclass from datetime import datetime @@ -13,7 +13,6 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.analyze import inspect_results -from agentlab.experiments import args from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies from agentlab.experiments.launch_exp import ( @@ -22,11 +21,96 @@ run_experiments, ) + logger = logging.getLogger(__name__) +def make_study( + agent_args: list[AgentArgs], + benchmark: bgym.Benchmark, + logging_level_stdout=logging.WARNING, + suffix="", + comment=None, + ignore_dependencies=False, +): + + if isinstance(benchmark, str): + benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() + + """Make a study from a list of 
agents and a benchmark.""" + if "webarena" in benchmark.name and len(agent_args) > 1: + logger.warning( + "*WebArena* requires manual reset after each evaluation. Running through SequentialStudies." + ) + studies = [] + for agent in agent_args: + studies.append( + Study( + [agent], + benchmark, + logging_level=logging_level_stdout, + suffix=suffix, + comment=comment, + ignore_dependencies=ignore_dependencies, + ) + ) + + return SequentialStudies(studies) + else: + return Study( + agent_args, + benchmark, + logging_level=logging_level_stdout, + suffix=suffix, + comment=comment, + ignore_dependencies=ignore_dependencies, + ) + + +class AbstractStudy(ABC): + dir: Path = None + suffix: str = "" + + @abstractmethod + def find_incomplete(self, include_errors=True): + """Search for missing""" + + @abstractmethod + def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + """Run the study""" + + def make_dir(self, exp_root=RESULTS_DIR): + if self.dir is None: + dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" + + self.dir = Path(exp_root) / dir_name + self.dir.mkdir(parents=True, exist_ok=True) + + def save(self, exp_root=RESULTS_DIR): + """Pickle the study to the directory""" + # TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories + + self.make_dir(exp_root=exp_root) + with gzip.open(self.dir / "study.pkl.gz", "wb") as f: + pickle.dump(self, f) + + def get_results(self, suffix="", also_save=True): + """Recursively load all results from the study directory and summarize them.""" + result_df = inspect_results.load_result_df(self.dir) + error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True) + summary_df = inspect_results.summarize_study(result_df) + + if also_save: + suffix = f"_{suffix}" if suffix else "" + result_df.to_csv(self.dir / f"result_df{suffix}.csv") + summary_df.to_csv(self.dir / f"summary_df{suffix}.csv") + (self.dir / f"error_report{suffix}.md").write_text(error_report) + + return result_df, summary_df, error_report + + @dataclass -class Study: +class Study(AbstractStudy): """A study coresponds to one or multiple agents evaluated on a benchmark. This is part of the high level API to help keep experiments organized and reproducible. @@ -142,7 +226,7 @@ def run( self._run(n_jobs, parallel_backend, strict_reproducibility) suffix = f"trial_{i + 1}_of_{n_relaunch}" - _, summary_df, error_report = self.get_results(suffix=suffix) + _, summary_df, _ = self.get_results(suffix=suffix) logger.info("\n" + str(summary_df)) n_incomplete, n_error = self.find_incomplete(include_errors=relaunch_errors) @@ -200,60 +284,17 @@ def append_to_journal(self, strict_reproducibility=True): ValueError: If the reproducibility information is not compatible with the report. 
""" + _, summary_df, _ = self.get_results() repro.append_to_journal( self.reproducibility_info, - self.get_report(), + summary_df, strict_reproducibility=strict_reproducibility, ) - def get_results(self, suffix="", also_save=True): - result_df = inspect_results.load_result_df(self.dir) - error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True) - summary_df = inspect_results.summarize_study(result_df) - - if also_save: - suffix = f"_{suffix}" if suffix else "" - result_df.to_csv(self.dir / f"result_df{suffix}.csv") - summary_df.to_csv(self.dir / f"summary_df{suffix}.csv") - (self.dir / f"error_report{suffix}.md").write_text(error_report) - - return result_df, summary_df, error_report - @property def name(self): agent_names = [a.agent_name for a in self.agent_args] - if len(agent_names) == 1: - study_name = f"{agent_names[0]}_on_{self.benchmark.name}" - else: - study_name = f"{len(agent_names)}_agents_on_{self.benchmark.name}" - - study_name = slugify(study_name, max_length=100, allow_unicode=True) - - if self.suffix: - study_name += f"_{self.suffix}" - return study_name - - def make_dir(self, exp_root=RESULTS_DIR): - if self.dir is None: - dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" - - self.dir = Path(exp_root) / dir_name - self.dir.mkdir(parents=True, exist_ok=True) - - def save(self): - """Pickle the study to the directory""" - - # TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories - - self.make_dir() - - with gzip.open(self.dir / "study.pkl.gz", "wb") as f: - pickle.dump(self, f) - - def get_report(self, ignore_cache=False, ignore_stale=False): - return inspect_results.get_study_summary( - self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale - ) + return _make_study_name(agent_names, [self.benchmark.name], self.suffix) def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: @@ -288,6 +329,64 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study": return Study.load(get_most_recent_study(root_dir, contains=contains)) +def _make_study_name(agent_names, benchmark_names, suffix=None): + """Make a study name from the agent and benchmark names.""" + if len(agent_names) == 1: + agent_name = agent_names[0] + else: + agent_name = f"{len(agent_names)}_agents" + + if len(benchmark_names) == 1: + benchmark_name = benchmark_names[0] + else: + benchmark_name = f"{len(benchmark_names)}_benchmarks" + + study_name = f"{agent_name}_on_{benchmark_name}_{suffix if suffix else ''}" + + return slugify(study_name, max_length=200, allow_unicode=True) + + +@dataclass +class SequentialStudies(AbstractStudy): + """ + Sequential execution of multiple studies. + + This is required for e.g. WebArena, where a server reset is required between evaluations of each agent. 
+ """ + + studies: list[Study] + + @property + def name(self): + """The name of the study.""" + agent_names = [a.agent_name for study in self.studies for a in study.agent_args] + benchmark_names = [study.benchmark.name for study in self.studies] + return _make_study_name(agent_names, benchmark_names, self.suffix) + + def find_incomplete(self, include_errors=True): + for study in self.studies: + study.find_incomplete(include_errors=include_errors) + + def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + + self.save() + + for study in self.studies: + study.make_dir(exp_root=self.dir) + study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch) + _, summary_df, _ = self.get_results() + logger.info("\n" + str(summary_df)) + logger.info(f"SequentialStudies {self.name} finished.") + + def override_max_steps(self, max_steps): + for study in self.studies: + study.override_max_steps(max_steps) + + def append_to_journal(self, strict_reproducibility=True): + for study in self.studies: + study.append_to_journal(strict_reproducibility=strict_reproducibility) + + def get_most_recent_study( root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None ): From d0919dc5dcf6b49ff078c29903b0159c0c64dc16 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:13:06 +0000 Subject: [PATCH 23/42] little bug --- src/agentlab/experiments/study.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 23713c5c..698714de 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -369,10 +369,12 @@ def find_incomplete(self, include_errors=True): def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + for study in self.studies: + study.make_dir(exp_root=self.dir) + self.save() for study in self.studies: - study.make_dir(exp_root=self.dir) study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch) _, summary_df, _ = self.get_results() logger.info("\n" + str(summary_df)) From 0e2b752642dc63a4a56de2c0eb50a75dfb3de24d Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:13:16 +0000 Subject: [PATCH 24/42] more flexible requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 453f312d..ba5c8732 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -black[jupyter]==24.2.0 +black>=24.2.0 blacken-docs pre-commit pytest==7.3.2 From 041fd68501142b1d56248ac2997b4192fa695d46 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:14:24 +0000 Subject: [PATCH 25/42] imrove readme --- README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0bce34d4..bdad77c9 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,28 @@ + + + AgentLab is a framework for developing and evaluating agents on a variety of benchmarks supported by [BrowserGym](https://github.com/ServiceNow/BrowserGym). This includes: -* WebArena -* WorkArena.L1, L2, L3 -* VisualWebArena (coming soon...) -* MiniWoB +* [WebArena](https://webarena.dev/) +* [WorkArena](https://github.com/ServiceNow/WorkArena) L1, L2, L3 +* [WebLinx](https://mcgill-nlp.github.io/weblinx/) +* [VisualWebArena](https://github.com/web-arena-x/visualwebarena) +* Assistant Bench +* GAIA +* Mind2Web-live (coming soon ...) 
+* [MiniWoB](https://miniwob.farama.org/index.html) + +AgentLab Features: +* Easy large scale parallel agent experiments using [ray](https://www.ray.io/) +* Building blocks for making agents +* Unified LLM api for OpenRouter, OpenAI, Azure, Self hosted using TGI. +* Prefered way for running benchmarks like WebArena +* Various Reproducibility features +* Unified LeaderBoard The framework enables the desing of rich hyperparameter spaces and the launch of parallel experiments using ablation studies or random searches. It also provides From 79ac4184d92c8a8f0ce4bc6bf6dbbc6a1abb14a7 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 12:21:45 +0000 Subject: [PATCH 26/42] Enhance agent configuration and logging in study setup - Updated `get_vision_agent` to append "_vision" to agent names. - Modified `BaseMessage.__str__` to include a no-warning option for logging. - Improved `make_study` function to accept single agent args and benchmark types. - Added detailed docstrings for better clarity on parameters and functionality. - Introduced `avg_step_timeout` and `demo_mode` attributes in the Study class. --- README.md | 270 ++++++++++-------- .../agents/generic_agent/tmlr_config.py | 4 +- src/agentlab/experiments/study.py | 134 +++++++-- src/agentlab/llm/llm_utils.py | 12 +- 4 files changed, 273 insertions(+), 147 deletions(-) diff --git a/README.md b/README.md index b4c518fd..22a4e0fe 100644 --- a/README.md +++ b/README.md @@ -1,173 +1,177 @@ - - - - + + +  |   +[🎯 Benchmarks](#🎯-supported-benchmarks)   |   +[🛠️ Setup](#🛠️-setup-agentlab)   |   +[🤖 Assistant](#ui-assistant)   |   +[🚀 Launch Experiments](#🚀-launch-experiments)   |   +[🔍 AgentXray](#🔍-agentxray)   |   +[🤖 Make Your Own Agent](#implement-a-new-agent)   |   +[↻ Reproducibility](#↻-reproducibility)   |   + + AgentLab is a framework for developing and evaluating agents on a variety of -benchmarks supported by [BrowserGym](https://github.com/ServiceNow/BrowserGym). -This includes: -* [WebArena](https://webarena.dev/) -* [WorkArena](https://github.com/ServiceNow/WorkArena) L1, L2, L3 -* [WebLinx](https://mcgill-nlp.github.io/weblinx/) -* [VisualWebArena](https://github.com/web-arena-x/visualwebarena) -* Assistant Bench -* GAIA -* Mind2Web-live (coming soon ...) -* [MiniWoB](https://miniwob.farama.org/index.html) +[benchmarks](#🎯-supported-benchmarks) supported by +[BrowserGym](https://github.com/ServiceNow/BrowserGym). AgentLab Features: * Easy large scale parallel agent experiments using [ray](https://www.ray.io/) * Building blocks for making agents -* Unified LLM api for OpenRouter, OpenAI, Azure, Self hosted using TGI. +* Unified LLM api for OpenRouter, OpenAI, Azure, or self hosted using TGI. * Prefered way for running benchmarks like WebArena * Various Reproducibility features -* Unified LeaderBoard - -The framework enables the desing of rich hyperparameter spaces and the launch of -parallel experiments using ablation studies or random searches. It also provides -agent_xray, a visualization tool to inspect the results of the experiments using -a custom gradio interface - - - - - -## Install agentlab - -This repo is intended for testing and developing new agents, hence we clone and install using the `-e` flag. +* Unified LeaderBoard (soon) + +## 🎯 Supported Benchmarks +| Benchmark | Setup
Link | # Task<br>Template | Seed<br>Diversity | Max<br>Step | Multi-tab | Hosted Method | BrowserGym<br>
Leaderboard | +|-----------|------------|---------|----------------|-----------|-----------|---------------|----------------------| +| [WebArena](https://webarena.dev/) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/webarena/README.md) | 812 | None | 30 | yes | self hosted (docker) | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L1 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 33 | High | 30 | no | demo instance | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L2 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L3 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon | +| [WebLinx](https://mcgill-nlp.github.io/weblinx/) | - | 31586 | None | 1 | no | self hosted (dataset) | soon | +| [VisualWebArena](https://github.com/web-arena-x/visualwebarena) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/visualwebarena/README.md) | 910 | None | 30 | yes | self hosted (docker) | soon | +| [Assistant Bench](https://assistantbench.github.io/) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/assistantbench/README.md) | 214 | None | 30 | yes | live web | soon | +| [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon | +| [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon | +| [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon | +## 🛠️ Setup agentlab ```bash -git clone git@github.com:ServiceNow/AgentLab.git -pip install -e . +pip install agentlab ``` -## Set Environment Variables +Make sure to prepare the required benchmark according to instructions provided in the [setup +column](#🎯-supported-benchmarks). ```bash export AGENTLAB_EXP_ROOT= # defaults to $HOME/agentlab_results export OPENAI_API_KEY= # if openai models are used -export HUGGINGFACEHUB_API_TOKEN= # if huggingface models are used -``` - -## Use an assistant to work for you (at your own cost and risk) -```bash -agentlab-assistant --start_url https://www.google.com ``` -## Prepare Benchmarks -Depending on which benchmark you use, there are some prerequisites -
-MiniWoB +Setup OpenRouter API ```bash -export MINIWOB_URL="file://$HOME/dev/miniwob-plusplus/miniwob/html/miniwob/" +export OPENROUTER_API_KEY= # if openrouter models are used ```
+Setup Azure API -WorkArena - -See [detailed instructions on workarena github](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) - -At a glance: -1) [Sign in](https://developer.servicenow.com/) and reqeuest a `washington` instance. -2) Once the instance is ready, you should see `` and `` -3) Add these to your `.bashrc` (or `.zshrc`) and `source` it (note: make sure that - all variables are in single quotes unless you happen to have a password with a - single quote in it) - ```bash - export SNOW_INSTANCE_URL='https://.service-now.com/' - export SNOW_INSTANCE_UNAME='admin' - export SNOW_INSTANCE_PWD='' - ``` -4) finally run these commands: - - ```bash - pip install browsergym-workarena - playwright install - workarena-install - ``` - - +```bash +export AZURE_OPENAI_API_KEY= # if using azure models +export AZURE_OPENAI_ENDPOINT= # if using azure models +```
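+
+A minimal sanity check, independent of AgentLab, to confirm the keys are visible to Python
+before launching a study (variable names exactly as exported above):
+
+```python
+import os
+
+# Print which provider keys are set in the current shell environment.
+for key in ("OPENAI_API_KEY", "OPENROUTER_API_KEY", "AZURE_OPENAI_API_KEY"):
+    print(key, "is set" if os.environ.get(key) else "is MISSING")
+```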
-<details>
-<summary>WebArena on AWS</summary>
-TODO
-</details>
+## UI-Assistant +Use an assistant to work for you (at your own cost and risk). -
-<summary>WebArena on Azure</summary>
-TODO
-</details>
+```bash +agentlab-assistant --start_url https://www.google.com +``` +Try your own agent: +```bash +agentlab-assistant --agent_config="module.path.to.your.AgentArgs" +``` + +## 🚀 Launch experiments +```python +# Import your agent configuration extending bgym.AgentArgs class +# Make sure this object is imported from a module accessible in PYTHONPATH to properly unpickle +from agentlab.agents.generic_agent import AGENT_4o_MINI +from agentlab.experiments.study import make_study -## Launch experiments +study = make_study( + benchmark="miniwob", # or "webarena", "workarnea_l1" ... + agent_args=[AGENT_4o_MINI], + comment="My first study", +) -Create your agent or import an existing one: -```python -from agentlab.agents.generic_agent.agent_configs import AGENT_4o +study.run(n_jobs=5) ``` -Run the agent on a benchmark: +Relaunching incomplete or errored tasks + ```python -study_name, exp_args_list = run_agents_on_benchmark(AGENT_4o, benchmark) -study_dir = make_study_dir(RESULTS_DIR, study_name) -run_experiments(n_jobs, exp_args_list, study_dir) +from agentlab.experiments.study import Study +study = Study.load("/path/to/your/study/dir") +study.find_incomplete(include_errors=True) +study.run() ``` -use [main.py](main.py) to launch experiments with a variety -of options. This is like a lazy CLI that is actually more convenient than a CLI. -Just comment and uncomment the lines you need or modify at will (but don't push -to the repo). - -
+See [main.py](main.py) to launch experiments with a variety of options. This is like a lazy CLI that +is actually more convenient. Just comment and uncomment the lines you need or modify at will (but +don't push to the repo). -Debugging -For debugging, run experiments using `n_jobs=1` and use VSCode debug mode. This -will allow you to stop on breakpoints. To prevent the debugger from stopping -on errors when running multiple experiments directly in VSCode, set -`enable_debug = False` in `ExpArgs` -
+### Job Timeouts +The complexity of the wild web, Playwright, and asyncio can sometimes cause jobs to hang. This +disables workers until the study is terminated and relaunched. If you are running jobs sequentially +or with a small number of workers, this could halt your entire study until you manually kill and +relaunch it. In the Ray parallel backend, we've implemented a system to automatically terminate jobs +exceeding a specified timeout. This feature is particularly useful when task hanging limits your +experiments. +### Debugging +For debugging, run experiments with `n_jobs=1` and use VSCode's debug mode. This allows you to pause +execution at breakpoints. To prevent the debugger from stopping on errors while running multiple +experiments in VSCode, set `enable_debug = False` in `ExpArgs`. +### About Parallel Jobs -
+Running one agent on one task corresponds to a single job. Conducting ablation studies or random +searches across hundreds of tasks with multiple seeds can generate more than 10,000 jobs. Efficient +parallel execution is therefore critical. Agents typically wait for responses from the LLM server or +updates from the web server. As a result, you can run 10–50 jobs in parallel on a single computer, +depending on available RAM. -Parallel jobs +⚠️ **Note for (Visual)WebArena**: These benchmarks have task dependencies designed to minimize +"corrupting" the instance between tasks. For example, an agent on task 323 could alter the instance +state, making task 201 impossible. To address this, the Ray backend accounts for task dependencies, +enabling some degree of parallelism. On WebArena, you can disable dependencies to increase +parallelism, but this might reduce performance by 1–2%. -Running one agent on one task correspond to one job. When conducting ablation -studies or random searches on hundreds of tasks with multiple seeds, this can -lead to more than 10000 jobs. It is thus crucial to execute them in parallel. -The agent usually wait on the LLM server to return the results or the web server -to update the page. Hence, you can run 10-50 jobs in parallel on a single -computer depending on how much RAM is available. +⚠️ **Instance Reset for (Visual)WebArena**: Before evaluating an agent, the instance is +automatically reset, a process that takes about 5 minutes. When evaluating multiple agents, the +`make_study` function returns a `SequentialStudies` object to ensure proper sequential evaluation of +each agent. AgentLab currently does not support evaluations across multiple instances, but you could +either create a quick script to handle this or submit a PR to AgentLab. For a smoother parallel +experience, consider using benchmarks like WorkArena instead. -
-## AgentXray +## 🔍 AgentXray While your experiments are running, you can inspect the results using: ```bash agentlab-xray ``` - - - -You will be able to select the recent experiments in the directory -`AGENTLAB_EXP_ROOT` and visualize the results in a gradio interface. + + + +You will be able to select the recent experiments in the directory `AGENTLAB_EXP_ROOT` and visualize +the results in a gradio interface. In the following order, select: * The experiment you want to visualize @@ -175,14 +179,52 @@ In the following order, select: * The task * And the seed -Once this is selected, you can see the trace of your agent on the given task. -Click on the profiling image to select a step and observe the action taken by the agent. +Once this is selected, you can see the trace of your agent on the given task. Click on the profiling +image to select a step and observe the action taken by the agent. ## Implement a new Agent -Get inspiration from the `MostBasicAgent` in [agentlab/agents/most_basic_agent/most_basic_agent.py](src/agentlab/agents/most_basic_agent/most_basic_agent.py) +Get inspiration from the `MostBasicAgent` in +[agentlab/agents/most_basic_agent/most_basic_agent.py](src/agentlab/agents/most_basic_agent/most_basic_agent.py). +For a better integration with the tools, make sure to implement most functions in the +[AgentArgs](src/agentlab/agents/agent_args.py#L5) API and the extended `bgym.AbstractAgentArgs`. + +If you think your agent should be included directly in AgenLab, let use know and it can be added in +agentlab/agents/ with the name of your agent. + +## ↻ Reproducibility +Several factors can influence reproducibility of results in the context of evaluating agents on +dynamic benchmarks. + +### Factors affecting roproducibility +* **Software version**: Different version of Playwright or any package in the software stack could + influence the behavior of the benchmark or the agent. +* **API based LLMs silently changing**: Even for a fixed version, a LLM may be updated e.g. to + incorporate latest web knowledge. +* **Live websites**: + * WorkArena: The demo instance is mostly fixed in time to a specific version but ServiceNow + sometime push minor modifications. + * AssistantBench and GAIA: These rely on the agent navigating the open web. The experience may + change depending on which country or region, some websites might be in different languages by + default. +* **Stochastic Agents**: Setting temperature of the LLM to 0 can reduce most stochasticity. +* **Non deterministic tasks**: For a fixed seed, the changes should be minimal + +### Reproducibility Features +* `Study` contains a dict of information about reproducibility, including benchmark version, package + version and commit hash +* The `Study` class allows automatic upload of your results to + [`reproducibility_journal.csv`](reproducibility_journal.csv). This makes it easier to populate a + large amount of reference points. +* **Reproduced results in the leaderboard**. For agents that are repdocudibile, we encourage users + to try to reproduce the results and upload them to the leaderboard. There is a special column + containing information about all reproduced results of an agent on a benchmark. +* **ReproducibilityAgent**: You can run this agent on an existing study and it will try to re-run + the same actions on the same task seeds. A vsiual diff of the two prompts will be displayed in the + AgentInfo HTML tab of AgentXray. You will be able to inspect on some tasks what kind of changes + between to two executions. 
**Note**: this is a beta feature and will need some adaptation for your + own agent. -Create a new directory in agentlab/agents/ with the name of your agent. ## Misc diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py index 48a28c68..96abc46c 100644 --- a/src/agentlab/agents/generic_agent/tmlr_config.py +++ b/src/agentlab/agents/generic_agent/tmlr_config.py @@ -56,10 +56,12 @@ def get_base_agent(llm_config: str): def get_vision_agent(llm_config: str): flags = deepcopy(BASE_FLAGS) flags.obs.use_screenshot = True - return GenericAgentArgs( + agent_args = GenericAgentArgs( chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], flags=flags, ) + agent_args.agent_name = f"{agent_args.agent_name}_vision" + return agent_args def get_som_agent(llm_config: str): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 698714de..851f3178 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -26,18 +26,59 @@ def make_study( - agent_args: list[AgentArgs], - benchmark: bgym.Benchmark, + agent_args: list[AgentArgs] | AgentArgs, + benchmark: bgym.Benchmark | str, logging_level_stdout=logging.WARNING, suffix="", comment=None, ignore_dependencies=False, ): + """Run a list of agents on a benchmark. + + Args: + agent_args: list[AgentArgs] | AgentArgs + The agent configuration(s) to run. *IMPORTANT*: these objects will be pickled and + unpickled. Make sure they are imported from a package that is accessible from + PYTHONPATH. Otherwise, it won't load in agentlab-xray. + + benchmark: bgym.Benchmark | str + The benchmark to run the agents on. See bgym.DEFAULT_BENCHMARKS for the main ones. You + can also make your own by modifying an existing one. + + logging_level_stdout: int + The logging level for the stdout of the main script. Each job will have its own logging + level that will save into file and can be seen in agentlab-xray. + + suffix: str + A suffix to add to the study name. This can be useful to keep track of your experiments. + By default the study name contains agent name, benchmark name and date. + + comment: str + Extra comments from the authors of this study to be stored in the reproducibility + information. Leave any extra information that can explain why results could be different + than expected. + + ignore_dependencies: bool + If True, ignore the dependencies of the tasks in the benchmark. *Use with caution.* So + far, only WebArena and VisualWebArena have dependencies between tasks to minimize the + influence of solving one task before another one. This dependency graph allows + experiments to run in parallel while respecting task dependencies. However, it still + can't run more than 4 and, in practice it's speeding up evaluation by a factor of only + 3x compare to sequential executionz. To accelerate execution, you can ignore + dependencies and run in full parallel. This leads to a decrease in performance of about + 1%-2%, and could be more. Note: ignore_dependencies on VisualWebArena doesn't work. + + Returns: + Study object or SequentialStudies object if the benchmark requires manual reset after each + evaluation such as WebArena and VisualWebArena. 
+ """ + + if not isinstance(agent_args, (list, tuple)): + agent_args = [agent_args] if isinstance(benchmark, str): benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() - """Make a study from a list of agents and a benchmark.""" if "webarena" in benchmark.name and len(agent_args) > 1: logger.warning( "*WebArena* requires manual reset after each evaluation. Running through SequentialStudies." @@ -68,18 +109,21 @@ def make_study( class AbstractStudy(ABC): + """Abstract class for a study.""" + dir: Path = None suffix: str = "" @abstractmethod def find_incomplete(self, include_errors=True): - """Search for missing""" + """Prepare the study for relaunching by finding incomplete experiments""" @abstractmethod def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): """Run the study""" def make_dir(self, exp_root=RESULTS_DIR): + """Create a directory for the study""" if self.dir is None: dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" @@ -116,24 +160,48 @@ class Study(AbstractStudy): This is part of the high level API to help keep experiments organized and reproducible. Attributes: - benchmark: Benchmark | str - The benchmark to evaluate the agents on. If a string is provided, it will be - converted to the corresponding benchmark using bgym.DEFAULT_BENCHMARKS. - agent_args: list[AgentArgs] - The list of agents to evaluate. - + The agent configuration(s) to run. *IMPORTANT*: these objects will be pickled and + unpickled. Make sure they are imported from a package that is accessible from + PYTHONPATH. Otherwise, it won't load in agentlab-xray. + benchmark: bgym.Benchmark | str + The benchmark to run the agents on. See bgym.DEFAULT_BENCHMARKS for the main ones. You + can also make your own by modifying an existing one. dir: Path - The directory where the results will be saved. - + The directory where the study will be saved. If None, a directory will be created in + RESULTS_DIR. suffix: str - A suffix to add to the study name - + A suffix to add to the study name. This can be useful to keep track of your experiments. + By default the study name contains agent name, benchmark name and date. uuid: str - A unique identifier for the study - + A unique identifier for the study. reproducibility_info: dict - The reproducibility information for the study. + Information about the study that may affect the reproducibility of the experiment. e.g.: + versions of BrowserGym, benchmark, AgentLab... + logging_level: int + The logging level for individual jobs. + logging_level_stdout: int + The logging level for the stdout of the main script. Each job will have its own logging + level that will save into file and can be seen in agentlab-xray. + comment: str + Extra comments from the authors of this study to be stored in the reproducibility + information. Leave any extra information that can explain why results could be different + than expected. + ignore_dependencies: bool + If True, ignore the dependencies of the tasks in the benchmark. *Use with caution.* So + far, only WebArena and VisualWebArena have dependencies between tasks to minimize the + influence of solving one task before another one. This dependency graph allows + experiments to run in parallel while respecting task dependencies. However, it still + can't run more than 4 and, in practice it's speeding up evaluation by a factor of only + 3x compare to sequential executionz. To accelerate execution, you can ignore + dependencies and run in full parallel. 
This leads to a decrease in performance of about + 1%-2%, and could be more. Note: ignore_dependencies on VisualWebArena doesn't work. + avg_step_timeout: int + The average step timeout in seconds. This is used to stop the experiments if they are + taking too long. The default is 60 seconds. + demo_mode: bool + If True, the experiments will be run in demo mode, which will record videos, and enable + visual effects for actions. """ agent_args: list[AgentArgs] = None @@ -146,8 +214,11 @@ class Study(AbstractStudy): logging_level_stdout: int = logging.WARNING comment: str = None # Extra comments from the authors of this study ignore_dependencies: bool = False + avg_step_timeout: int = 60 + demo_mode: bool = False def __post_init__(self): + """Initialize the study. Set the uuid, and generate the exp_args_list.""" self.uuid = uuid.uuid4() if isinstance(self.benchmark, str): self.benchmark = bgym.DEFAULT_BENCHMARKS[self.benchmark]() @@ -156,12 +227,14 @@ def __post_init__(self): self.make_exp_args_list() def make_exp_args_list(self): + """Generate the exp_args_list from the agent_args and the benchmark.""" self.exp_args_list = _agents_on_benchmark( self.agent_args, self.benchmark, logging_level=self.logging_level, logging_level_stdout=self.logging_level_stdout, ignore_dependencies=self.ignore_dependencies, + demo_mode=self.demo_mode, ) def find_incomplete(self, include_errors=True): @@ -271,7 +344,13 @@ def _run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False self.benchmark.prepare_backends() logger.info("Backends ready.") - run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) + run_experiments( + n_jobs, + self.exp_args_list, + self.dir, + parallel_backend=parallel_backend, + avg_step_timeout=self.avg_step_timeout, + ) def append_to_journal(self, strict_reproducibility=True): """Append the study to the journal. @@ -331,6 +410,11 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study": def _make_study_name(agent_names, benchmark_names, suffix=None): """Make a study name from the agent and benchmark names.""" + + # extract unique agent and benchmark names + agent_names = list(set(agent_names)) + benchmark_names = list(set(benchmark_names)) + if len(agent_names) == 1: agent_name = agent_names[0] else: @@ -369,6 +453,9 @@ def find_incomplete(self, include_errors=True): def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + # This sequence of of making directories is important to make sure objects are materialized + # properly before saving. Otherwise relaunch may not work properly. + self.make_dir() for study in self.studies: study.make_dir(exp_root=self.dir) @@ -425,7 +512,7 @@ def get_most_recent_study( def set_demo_mode(env_args_list: list[EnvArgs]): - + """Set the demo mode for the experiments. 
This can be useful for generating videos for demos.""" for env_args in env_args_list: env_args.viewport = {"width": 1280, "height": 720} env_args.record_video = True @@ -473,15 +560,6 @@ def _agents_on_benchmark( if demo_mode: set_demo_mode(env_args_list) - # exp_args_list = args.expand_cross_product( - # ExpArgs( - # agent_args=args.CrossProd(agents), - # env_args=args.CrossProd(env_args_list), - # logging_level=logging_level, - # logging_level_stdout=logging_level_stdout, - # ) - # ) # type: list[ExpArgs] - exp_args_list = [] for agent in agents: diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index eaa2a5e0..c283bd3c 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -329,15 +329,19 @@ def __init__(self, role: str, content: Union[str, list[dict]]): self["role"] = role self["content"] = deepcopy(content) - def __str__(self) -> str: + def __str__(self, no_warning=False) -> str: if isinstance(self["content"], str): return self["content"] if not all(elem["type"] == "text" for elem in self["content"]): - logging.warning( - "The content of the message has images, which are not displayed in the string representation." - ) + if not no_warning: + logging.warning( + "The content of the message has images, which are not displayed in the string representation." + ) return "\n".join([elem["text"] for elem in self["content"] if elem["type"] == "text"]) + def get_text(self): + return str(self, no_warning=True) + def add_content(self, type: str, content: Any): if isinstance(self["content"], str): text = self["content"] From f4f9e25416eb457c7c2ce4e344508b2f862c0fbf Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 15:03:53 +0000 Subject: [PATCH 27/42] get_text was added by mistake --- src/agentlab/llm/llm_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index 2b7efd00..856a99b0 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -340,9 +340,6 @@ def __str__(self, warn_if_image=False) -> str: return "\n".join([elem["text"] for elem in self["content"] if elem["type"] == "text"]) - def get_text(self): - return str(self, no_warning=True) - def add_content(self, type: str, content: Any): if isinstance(self["content"], str): text = self["content"] From 8677f4847c1bfe2b34ba489b5ebc8731d8a10ecd Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 15:04:09 +0000 Subject: [PATCH 28/42] Update README and Jupyter notebook with improved documentation and result analysis instructions --- README.md | 38 +++++++++++++++------ src/agentlab/analyze/inspect_results.ipynb | 39 ++++++++++++++-------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 22a4e0fe..096254d4 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,16 @@ [🛠️ Setup](#🛠️-setup-agentlab)   |   [🤖 Assistant](#ui-assistant)   |   [🚀 Launch Experiments](#🚀-launch-experiments)   |   -[🔍 AgentXray](#🔍-agentxray)   |   +[🔍 Analyse Results](#🔍-analyse-results)   |   [🤖 Make Your Own Agent](#implement-a-new-agent)   |   [↻ Reproducibility](#↻-reproducibility)   |   -