From a0328549544c032061536622e0a130b5a4af1c14 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 6 Nov 2024 19:16:47 +0000 Subject: [PATCH 01/42] yet another way to kill timedout jobs --- src/agentlab/experiments/exp_utils.py | 15 +++-- .../experiments/graph_execution_ray.py | 66 ++++++++++++++++--- tests/experiments/test_launch_exp.py | 12 ++-- 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index c2864ce0..97ce527d 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -27,9 +27,11 @@ def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): """Run exp_args.run() with a timeout and handle dependencies.""" - episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) - with timeout_manager(seconds=episode_timeout): - return exp_arg.run() + # episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) + # logger.warning(f"Running {exp_arg.exp_id} with timeout of {episode_timeout} seconds.") + # with timeout_manager(seconds=episode_timeout): + # this timeout method is not robust enough. using ray.cancel instead + return exp_arg.run() def _episode_timeout(exp_arg: ExpArgs, avg_step_timeout=60): @@ -62,13 +64,12 @@ def timeout_manager(seconds: int = None): def alarm_handler(signum, frame): - logger.warning( - f"Operation timed out after {seconds}s, sending SIGINT and raising TimeoutError." - ) + logger.warning(f"Operation timed out after {seconds}s, raising TimeoutError.") # send sigint - os.kill(os.getpid(), signal.SIGINT) + # os.kill(os.getpid(), signal.SIGINT) # this doesn't seem to do much I don't know why # Still raise TimeoutError for immediate handling + # This works, but it doesn't seem enough to kill the job raise TimeoutError(f"Operation timed out after {seconds} seconds") previous_handler = signal.signal(signal.SIGALRM, alarm_handler) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 3e01be31..703f59e6 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,11 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" - +import time import ray import bgym -from agentlab.experiments.exp_utils import run_exp +from agentlab.experiments.exp_utils import run_exp, _episode_timeout +from ray.util import state +import logging +logger = logging.getLogger(__name__) run_exp = ray.remote(run_exp) @@ -15,25 +18,70 @@ def execute_task_graph(exp_args_list: list[bgym.ExpArgs], avg_step_timeout=60): """Execute a task graph in parallel while respecting dependencies using Ray.""" exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - tasks = {} + task_map = {} def get_task(exp_arg: bgym.ExpArgs): - if exp_arg.exp_id not in tasks: + if exp_arg.exp_id not in task_map: # Get all dependency tasks first dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] # Create new task that depends on the dependency results - tasks[exp_arg.exp_id] = run_exp.remote( + task_map[exp_arg.exp_id] = run_exp.remote( exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout ) - return tasks[exp_arg.exp_id] + return task_map[exp_arg.exp_id] # Build task graph for exp_arg in exp_args_list: get_task(exp_arg) - # Execute all tasks and gather results + max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in 
exp_args_list])
+    return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
+
+
+def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_interval: float = 1.0):
+    """Cancel tasks that exceed the timeout.
+
+    I tried various methods for killing a job that hangs; so far this is the
+    only one that seems to work reliably (hopefully).
+    """
+    task_list = list(tasks.values())
     task_ids = list(tasks.keys())
+    logger.warning(f"Any task exceeding {timeout} seconds will be cancelled.")
+
+    while True:
+        ready, not_ready = ray.wait(task_list, num_returns=len(task_list), timeout=poll_interval)
+        for task in not_ready:
+            elapsed_time = get_elapsed_time(task)
+            # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
+            if elapsed_time is not None and elapsed_time > timeout:
+                msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
+                if elapsed_time < timeout + 60:
+                    logger.warning(msg + " Cancelling task.")
+                    ray.cancel(task, force=False, recursive=False)
+                else:
+                    logger.warning(msg + " Force killing.")
+                    ray.cancel(task, force=True, recursive=False)
+        if len(ready) == len(task_list):
+            results = []
+            for task in ready:
+                try:
+                    result = ray.get(task)
+                except Exception as e:
+                    result = e
+                results.append(result)
+
+            return {task_id: result for task_id, result in zip(task_ids, results)}
+
+
+def get_elapsed_time(task_ref: ray.ObjectRef):
+    task_id = task_ref.task_id().hex()
+    task_info = state.get_task(task_id, address="auto")
+    if task_info and task_info.start_time_ms is not None:
+        start_time_s = task_info.start_time_ms / 1000.0  # Convert ms to s
+        current_time_s = time.time()
+        elapsed_time = current_time_s - start_time_s
+        return elapsed_time
+    else:
+        return None  # Task has not started yet
diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py
index 91a7c2db..782a9edc 100644
--- a/tests/experiments/test_launch_exp.py
+++ b/tests/experiments/test_launch_exp.py
@@ -1,3 +1,4 @@
+import math
 import tempfile
 from pathlib import Path
 
@@ -63,9 +64,8 @@ def _test_launch_system(backend="ray", cause_timeout=False):
             if row.stack_trace is not None:
                 print(row.stack_trace)
             if cause_timeout:
-                assert row.err_msg is not None
-                assert "Timeout" in row.err_msg
-                assert row.cum_reward == 0
+                # assert row.err_msg is not None
+                assert math.isnan(row.cum_reward) or row.cum_reward == 0
             else:
                 assert row.err_msg is None
                 assert row.cum_reward == 1.0
@@ -73,9 +73,9 @@ def _test_launch_system(backend="ray", cause_timeout=False):
     study_summary = inspect_results.summarize_study(results_df)
     assert len(study_summary) == 1
     assert study_summary.std_err.iloc[0] == 0
-    assert study_summary.n_completed.iloc[0] == "3/3"
 
     if not cause_timeout:
+        assert study_summary.n_completed.iloc[0] == "3/3"
         assert study_summary.avg_reward.iloc[0] == 1.0
 
 
@@ -91,7 +91,7 @@ def test_launch_system_ray():
     _test_launch_system(backend="ray")
 
 
-def _test_timeout_ray():
+def test_timeout_ray():
     _test_launch_system(backend="ray", cause_timeout=True)
 
 
@@ -120,7 +120,7 @@ def test_4o_mini_on_miniwob_tiny_test():
 
 
 if __name__ == "__main__":
-    _test_timeout_ray()
+    test_timeout_ray()
    # test_4o_mini_on_miniwob_tiny_test()
    # test_launch_system_ray()
    # test_launch_system_sequntial()
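For context, the two-stage cancellation introduced by patch 01 leans on Ray's documented semantics: ray.cancel(ref, force=False) raises a KeyboardInterrupt inside the worker, which a genuinely hung call can ignore, while force=True kills the worker process outright. Below is a minimal, self-contained sketch of that escalation pattern; the hung_task function and the timings are illustrative assumptions, not part of the patch:

import time

import ray

ray.init(num_cpus=1)


@ray.remote
def hung_task():
    time.sleep(3600)  # stands in for an episode that never returns
    return "done"


ref = hung_task.remote()
time.sleep(1)

# Stage 1: cooperative cancellation; Ray raises KeyboardInterrupt in the worker.
ray.cancel(ref, force=False, recursive=False)
try:
    ray.get(ref, timeout=10)
except ray.exceptions.TaskCancelledError:
    print("task cancelled cleanly")
except ray.exceptions.GetTimeoutError:
    # Stage 2: the task ignored the interrupt; kill the worker process.
    ray.cancel(ref, force=True, recursive=False)

With a plain time.sleep the first stage normally suffices; a task stuck in a C extension may only die at stage 2, which is why poll_for_timeout grants a 60-second grace period before force-killing.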
From ac1a461eae9573d880d5415f39f1cb64fb2fe839 Mon Sep 17 00:00:00 2001
From: recursix
Date: Wed, 6 Nov 2024 21:29:28 +0000
Subject: [PATCH 02/42] Improve timeout handling in task polling logic

---
 src/agentlab/experiments/graph_execution_ray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index 703f59e6..46b96bd8 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -36,6 +36,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
@@ -57,7 +58,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
             # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
             if elapsed_time is not None and elapsed_time > timeout:
                 msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
-                if elapsed_time < timeout + 60:
+                if elapsed_time < timeout + 60 + poll_interval:
                     logger.warning(msg + " Cancelling task.")
                     ray.cancel(task, force=False, recursive=False)
                 else:
From 290b88de62b77ee509700afbf078fc2dd21f12c0 Mon Sep 17 00:00:00 2001
From: recursix
Date: Thu, 7 Nov 2024 22:05:40 +0000
Subject: [PATCH 03/42] Add method to override max_steps in Study class

---
 src/agentlab/experiments/study.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
index 4e3bccea..6ff6ea5e 100644
--- a/src/agentlab/experiments/study.py
+++ b/src/agentlab/experiments/study.py
@@ -245,6 +245,10 @@ def get_report(self, ignore_cache=False, ignore_stale=False):
         return inspect_results.get_study_summary(
             self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale
         )
+
+    def override_max_steps(self, max_steps):
+        for exp_args in self.exp_args_list:
+            exp_args.env_args.max_steps = max_steps
 
     @staticmethod
     def load(dir: Path) -> "Study":
From 3f05803722121c9f3ff2e12946365cd03c396426 Mon Sep 17 00:00:00 2001
From: recursix
Date: Fri, 8 Nov 2024 18:32:15 +0000
Subject: [PATCH 04/42] add support for tab visibility in observation flags and
 update related components

---
 src/agentlab/agents/dynamic_prompting.py      | 13 ++---
 .../agents/generic_agent/generic_agent.py     |  3 +-
 .../generic_agent/generic_agent_prompt.py     |  1 -
 tests/agents/test_generic_prompt.py           | 47 +++++++++----------
 4 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py
index 1ddbf3be..3b4fcd73 100644
--- a/src/agentlab/agents/dynamic_prompting.py
+++ b/src/agentlab/agents/dynamic_prompting.py
@@ -10,8 +10,6 @@
 
 import bgym
 from browsergym.core.action.base import AbstractActionSet
-from browsergym.core.action.highlevel import HighLevelActionSet
-from browsergym.core.action.python import PythonActionSet
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
 
 from agentlab.llm.llm_utils import (
@@ -71,6 +69,7 @@ class ObsFlags(Flags):
 
     use_html: bool = True
     use_ax_tree: bool = False
+    use_tabs: bool = False
     use_focused_element: bool = False
     use_error_logs: bool = False
     use_history: bool = False
@@ -386,11 +385,7 @@ def _prompt(self) -> str:
 URL: {page_url}
 """
             prompt_pieces.append(prompt_piece)
-        self._prompt = "\n".join(prompt_pieces)
-
-
-def has_tab_action(action_set: bgym.HighLevelActionSetArgs):
-    return "tab" in action_set.subsets
+        return 
"\n".join(prompt_pieces) class Observation(Shrinkable): @@ -399,14 +394,14 @@ class Observation(Shrinkable): Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: + def __init__(self, obs, flags: ObsFlags) -> None: super().__init__() self.flags = flags self.obs = obs self.tabs = Tabs( obs, - visible=use_tabs, + visible=lambda: flags.use_tabs, prefix="## ", ) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 5ef8a4cc..98026dc1 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -32,6 +32,7 @@ def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.obs.use_tabs = benchmark.is_multi_tab self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) # for backward compatibility with old traces @@ -268,5 +269,3 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index eb45ba59..67899f18 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -77,7 +77,6 @@ def __init__( self.obs = dp.Observation( obs_history[-1], self.flags.obs, - use_tabs=dp.has_tab_action(self.flags.action.action_set), ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a579c261..66b173a2 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -20,33 +20,28 @@ """ +base_obs = { + "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], + "chat_messages": [{"role": "user", "message": "do this and that"}], + "axtree_txt": "[1] Click me", + "focused_element_bid": "45-256", + "open_pages_urls": ["https://example.com"], + "open_pages_titles": ["Example"], + "active_page_index": 0, +} OBS_HISTORY = [ - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, this is an error in the past", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, there is an error now", }, ] @@ -58,6 +53,7 @@ obs=dp.ObsFlags( use_html=True, use_ax_tree=True, + use_tabs=True, use_focused_element=True, 
use_error_logs=True, use_history=True, @@ -104,6 +100,10 @@ "obs.use_ax_tree", ("AXTree:", "Click me"), ), + ( + "obs.use_tabs", + ("Currently open tabs:","(active tab)"), + ), ( "obs.use_focused_element", ("Focused element:", "bid='45-256'"), @@ -251,11 +251,6 @@ def test_main_prompt_elements_present(): # for debugging test_shrinking_observation() test_main_prompt_elements_present() - for flag, expected_prompts in FLAG_EXPECTED_PROMPT: - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: + # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + \ No newline at end of file From 2fe585fe048dfa1da3cc5cfe843482bd34de5361 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 19:59:45 +0000 Subject: [PATCH 05/42] fix tests --- tests/agents/test_generic_prompt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 66b173a2..a26c5747 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -165,7 +165,7 @@ def test_shrinking_observation(): flags.obs.use_html = True prompt_maker = MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -231,7 +231,7 @@ def test_main_prompt_elements_present(): # Initialize MainPrompt prompt = str( MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, From 4a8cbb25dd581c0eb946f21b57ddfa6829237a79 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 20:29:07 +0000 Subject: [PATCH 06/42] black --- src/agentlab/agents/dynamic_prompting.py | 2 +- src/agentlab/experiments/study.py | 2 +- tests/agents/test_generic_prompt.py | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 3b4fcd73..73688f0f 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -385,7 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - return "\n".join(prompt_pieces) + return "\n".join(prompt_pieces) class Observation(Shrinkable): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 6ff6ea5e..36a1f54c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,7 +245,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - + def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: exp_args.env_args.max_steps = max_steps diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a26c5747..ae2e6d8a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -32,15 +32,18 @@ } 
OBS_HISTORY = [
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(1),
         "last_action_error": "",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(2),
         "last_action_error": "Hey, this is an error in the past",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(3),
         "last_action_error": "Hey, there is an error now",
     },
@@ -102,7 +105,7 @@
     ),
     (
         "obs.use_tabs",
-        ("Currently open tabs:","(active tab)"),
+        ("Currently open tabs:", "(active tab)"),
     ),
     (
         "obs.use_focused_element",
@@ -253,4 +256,3 @@ def test_main_prompt_elements_present():
     test_main_prompt_elements_present()
     # for flag, expected_prompts in FLAG_EXPECTED_PROMPT:
     #     test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts)
-
\ No newline at end of file
From 17fc3d1a27bf7218d44a8a67f437f8a877b64c75 Mon Sep 17 00:00:00 2001
From: recursix
Date: Wed, 6 Nov 2024 21:29:28 +0000
Subject: [PATCH 07/42] Improve timeout handling in task polling logic

---
 src/agentlab/experiments/graph_execution_ray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index 703f59e6..46b96bd8 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -36,6 +36,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
@@ -57,7 +58,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
             # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}")
             if elapsed_time is not None and elapsed_time > timeout:
                 msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s."
- if elapsed_time < timeout + 60: + if elapsed_time < timeout + 60 + poll_interval: logger.warning(msg + " Cancelling task.") ray.cancel(task, force=False, recursive=False) else: From 1e07d3e00848f2978d49313bf255d3b6ef6d39a3 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 6 Nov 2024 16:13:37 -0500 Subject: [PATCH 08/42] yet another way to kill timedout jobs (#108) --- src/agentlab/experiments/graph_execution_ray.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 46b96bd8..ccde71f8 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,12 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" +import logging import time -import ray + import bgym -from agentlab.experiments.exp_utils import run_exp, _episode_timeout +import ray from ray.util import state -import logging + +from agentlab.experiments.exp_utils import _episode_timeout, run_exp logger = logging.getLogger(__name__) From 63d8debd2d10fae5496136d6ca25dbfbda579887 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 7 Nov 2024 22:05:40 +0000 Subject: [PATCH 09/42] Add method to override max_steps in Study class --- src/agentlab/experiments/study.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 4e3bccea..6ff6ea5e 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,6 +245,10 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) + + def override_max_steps(self, max_steps): + for exp_args in self.exp_args_list: + exp_args.env_args.max_steps = max_steps @staticmethod def load(dir: Path) -> "Study": From b88a058943b86dc74918d735fa16cac76cdb55f8 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 18:32:15 +0000 Subject: [PATCH 10/42] add support for tab visibility in observation flags and update related components --- src/agentlab/agents/dynamic_prompting.py | 13 ++--- .../agents/generic_agent/generic_agent.py | 3 +- .../generic_agent/generic_agent_prompt.py | 1 - tests/agents/test_generic_prompt.py | 47 +++++++++---------- 4 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 1ddbf3be..3b4fcd73 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -10,8 +10,6 @@ import bgym from browsergym.core.action.base import AbstractActionSet -from browsergym.core.action.highlevel import HighLevelActionSet -from browsergym.core.action.python import PythonActionSet from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html from agentlab.llm.llm_utils import ( @@ -71,6 +69,7 @@ class ObsFlags(Flags): use_html: bool = True use_ax_tree: bool = False + use_tabs: bool = False use_focused_element: bool = False use_error_logs: bool = False use_history: bool = False @@ -386,11 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - self._prompt = "\n".join(prompt_pieces) - - -def has_tab_action(action_set: bgym.HighLevelActionSetArgs): - return "tab" in action_set.subsets + return "\n".join(prompt_pieces) class Observation(Shrinkable): @@ -399,14 +394,14 @@ 
class Observation(Shrinkable): Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: + def __init__(self, obs, flags: ObsFlags) -> None: super().__init__() self.flags = flags self.obs = obs self.tabs = Tabs( obs, - visible=use_tabs, + visible=lambda: flags.use_tabs, prefix="## ", ) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 5ef8a4cc..98026dc1 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -32,6 +32,7 @@ def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.obs.use_tabs = benchmark.is_multi_tab self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) # for backward compatibility with old traces @@ -268,5 +269,3 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output - return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index eb45ba59..67899f18 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -77,7 +77,6 @@ def __init__( self.obs = dp.Observation( obs_history[-1], self.flags.obs, - use_tabs=dp.has_tab_action(self.flags.action.action_set), ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a579c261..66b173a2 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -20,33 +20,28 @@ """ +base_obs = { + "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], + "chat_messages": [{"role": "user", "message": "do this and that"}], + "axtree_txt": "[1] Click me", + "focused_element_bid": "45-256", + "open_pages_urls": ["https://example.com"], + "open_pages_titles": ["Example"], + "active_page_index": 0, +} OBS_HISTORY = [ - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, this is an error in the past", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, there is an error now", }, ] @@ -58,6 +53,7 @@ obs=dp.ObsFlags( use_html=True, use_ax_tree=True, + use_tabs=True, use_focused_element=True, use_error_logs=True, use_history=True, @@ -104,6 +100,10 @@ "obs.use_ax_tree", 
("AXTree:", "Click me"), ), + ( + "obs.use_tabs", + ("Currently open tabs:","(active tab)"), + ), ( "obs.use_focused_element", ("Focused element:", "bid='45-256'"), @@ -251,11 +251,6 @@ def test_main_prompt_elements_present(): # for debugging test_shrinking_observation() test_main_prompt_elements_present() - for flag, expected_prompts in FLAG_EXPECTED_PROMPT: - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: + # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + \ No newline at end of file From e97d023b3a57742a5de2af8a53a64abddfc47735 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 19:59:45 +0000 Subject: [PATCH 11/42] fix tests --- tests/agents/test_generic_prompt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 66b173a2..a26c5747 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -165,7 +165,7 @@ def test_shrinking_observation(): flags.obs.use_html = True prompt_maker = MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -231,7 +231,7 @@ def test_main_prompt_elements_present(): # Initialize MainPrompt prompt = str( MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, From ccd7b8b930e24775cfe558844a49527cede91735 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 20:29:07 +0000 Subject: [PATCH 12/42] black --- src/agentlab/agents/dynamic_prompting.py | 2 +- src/agentlab/experiments/study.py | 2 +- tests/agents/test_generic_prompt.py | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 3b4fcd73..73688f0f 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -385,7 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - return "\n".join(prompt_pieces) + return "\n".join(prompt_pieces) class Observation(Shrinkable): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 6ff6ea5e..36a1f54c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -245,7 +245,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - + def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: exp_args.env_args.max_steps = max_steps diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a26c5747..ae2e6d8a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -32,15 +32,18 @@ } OBS_HISTORY = [ - base_obs | { + base_obs + | { "pruned_html": 
html_template.format(1),
         "last_action_error": "",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(2),
         "last_action_error": "Hey, this is an error in the past",
     },
-    base_obs | {
+    base_obs
+    | {
         "pruned_html": html_template.format(3),
         "last_action_error": "Hey, there is an error now",
     },
@@ -102,7 +105,7 @@
     ),
     (
         "obs.use_tabs",
-        ("Currently open tabs:","(active tab)"),
+        ("Currently open tabs:", "(active tab)"),
     ),
     (
         "obs.use_focused_element",
@@ -253,4 +256,3 @@ def test_main_prompt_elements_present():
     test_main_prompt_elements_present()
     # for flag, expected_prompts in FLAG_EXPECTED_PROMPT:
     #     test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts)
-
\ No newline at end of file
From 1aa491659a336918014a8cf1ab8e6c0eaf66a51c Mon Sep 17 00:00:00 2001
From: Maxime Gasse
Date: Fri, 8 Nov 2024 16:12:56 -0500
Subject: [PATCH 13/42] black

---
 src/agentlab/experiments/graph_execution_ray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
index ccde71f8..5dd18d4a 100644
--- a/src/agentlab/experiments/graph_execution_ray.py
+++ b/src/agentlab/experiments/graph_execution_ray.py
@@ -38,7 +38,7 @@ def get_task(exp_arg: bgym.ExpArgs):
         get_task(exp_arg)
 
     max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list])
-    
+
     return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1)
 
 
From c4e8acbdc5a2aa7a7e6f160c8bfe93fd56e774b0 Mon Sep 17 00:00:00 2001
From: recursix
Date: Fri, 8 Nov 2024 22:12:15 +0000
Subject: [PATCH 14/42]

---
 .github/workflows/unit_tests.yml              |  3 +
 .../agents/generic_agent/agent_configs.py     | 13 ++--
 .../agents/generic_agent/tmlr_config.py       |  2 +-
 src/agentlab/experiments/args.py              | 10 +++-
 src/agentlab/llm/llm_configs.py               | 59 ++++++++++---------
 5 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 7a0e312a..3342ba54 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -58,6 +58,9 @@ jobs:
       - name: Check MiniWob availability
         run: curl -I "http://localhost:8080/miniwob/" || echo "MiniWob not reachable"
 
+      - name: Pre-download nltk resources
+        run: python -c "import nltk; nltk.download('punkt_tab')"
+
       - name: Run AgentLab Unit Tests
         env:
           MINIWOB_URL: "http://localhost:8080/miniwob/"
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index 2d4f9bc3..a5db8c90 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -1,4 +1,5 @@
 import bgym
+
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.experiments import args
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
@@ -41,7 +42,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
    be_cautious=True,
    extra_instructions=None,
 )
@@ -89,7 +90,7 @@
     use_abstract_example=True,  # useful
     use_hints=True,  # useful
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
@@ -136,7 +137,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
     add_missparsed_messages=True,
@@ -186,7 +187,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
     add_missparsed_messages=True,
@@ -234,7 +235,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
@@ -298,7 +299,7 @@
     use_hints=args.Choice([True, False], p=[0.7, 0.3]),
     be_cautious=args.Choice([True, False]),
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     extra_instructions=None,
 )
diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py
index 11860e69..48a28c68 100644
--- a/src/agentlab/agents/generic_agent/tmlr_config.py
+++ b/src/agentlab/agents/generic_agent/tmlr_config.py
@@ -40,7 +40,7 @@
     use_abstract_example=True,
     use_hints=True,
     enable_chat=False,
-    max_prompt_tokens=None,
+    max_prompt_tokens=40_000,
     be_cautious=True,
     extra_instructions=None,
 )
diff --git a/src/agentlab/experiments/args.py b/src/agentlab/experiments/args.py
index bbbb3b7b..6a4fa804 100644
--- a/src/agentlab/experiments/args.py
+++ b/src/agentlab/experiments/args.py
@@ -105,13 +105,19 @@ def expand_cross_product(obj: Any | list[Any]):
     for obj in obj_list:
         cprod_paths = _find_cprod_with_paths(obj)
         if not cprod_paths:
-            return [copy.deepcopy(obj)]
+            result.append(copy.deepcopy(obj))
+            continue
 
         paths, cprod_objects = zip(*cprod_paths)
         combinations = product(*[cprod_obj.elements for cprod_obj in cprod_objects])
 
+        # create a base object with empty fields to make fast deep copies from
+        base_obj = copy.deepcopy(obj)
+        for path in paths:
+            _set_value(base_obj, path, None)
+
         for combo in combinations:
-            new_obj = copy.deepcopy(obj)
+            new_obj = copy.deepcopy(base_obj)
             for path, value in zip(paths, combo):
                 _set_value(new_obj, path, value)
             result.append(new_obj)
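The args.py hunk above fixes a bug (the early return inside the loop silently dropped the remaining objects of the input list) and speeds expansion up by deep-copying a hollowed-out base object instead of the full template for every combination. For readers unfamiliar with the helper, a rough sketch of its intended behaviour, assuming args.CrossProd simply wraps the candidate values in an .elements attribute as the diff suggests (the Flags dataclass below is illustrative, not from the repository):

from dataclasses import dataclass

from agentlab.experiments import args


@dataclass
class Flags:
    use_html: bool = True
    temperature: float = 0.0


template = Flags(
    use_html=args.CrossProd([True, False]),
    temperature=args.CrossProd([0.0, 0.5]),
)

# One concrete Flags instance per combination of candidate values: 2 x 2 = 4.
configs = args.expand_cross_product(template)
assert len(configs) == 4
assert all(isinstance(c.use_html, bool) for c in configs)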
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
index 8376b5c2..feb4d1c8 100644
--- a/src/agentlab/llm/llm_configs.py
+++ b/src/agentlab/llm/llm_configs.py
@@ -20,28 +20,28 @@
     "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs(
         model_name="gpt-4o-mini-2024-07-18",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,
-        max_new_tokens=4000,
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,
         vision_support=True,
     ),
     "openai/gpt-4-1106-preview": OpenAIModelArgs(
         model_name="gpt-4-1106-preview",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,
     ),
     "openai/gpt-4-vision-preview": OpenAIModelArgs(
         model_name="gpt-4-vision-preview",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,  # I think this model has very small default value if we don't set max_new_tokens
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,  # I think this model has very small default value if we don't set max_new_tokens
         vision_support=True,
     ),
     "openai/gpt-4o-2024-05-13": OpenAIModelArgs(
         model_name="gpt-4o-2024-05-13",
         max_total_tokens=128_000,
-        max_input_tokens=40_000,  # make sure we don't bust budget
-        max_new_tokens=4000,  # I think this model has very small default value if we don't set max_new_tokens
+        max_input_tokens=100_000,
+        max_new_tokens=28_000,  # I think this model has very small default value if we don't set max_new_tokens
         vision_support=True,
     ),
     "openai/gpt-3.5-turbo-0125": OpenAIModelArgs(
@@ -67,22 +67,25 @@
         model_name="gpt-4o",
         deployment_name="gpt-4o-2024-05-13",
         max_total_tokens=128_000,
-
max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( model_name="gpt-4o-mini", deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), # ---------------- OSS LLMs ----------------# "meta-llama/Meta-Llama-3-70B-Instruct": SelfHostedModelArgs( @@ -113,43 +116,43 @@ "openrouter/meta-llama/llama-3.1-405b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-405b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct:free": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct:free", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/anthropic/claude-3.5-sonnet:beta": OpenRouterModelArgs( model_name="anthropic/claude-3.5-sonnet:beta", max_total_tokens=200_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=160_000, + max_new_tokens=40_000, temperature=1e-1, vision_support=True, ), @@ -163,8 +166,8 @@ "openrouter/openai/o1-mini-2024-09-12": OpenRouterModelArgs( model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), } From 8de36e2b6951fcac21dceae136e75fcc913e6458 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:23:46 +0000 Subject: [PATCH 15/42] Fix sorting bug. improve directory content retrieval with summary statistics --- src/agentlab/analyze/agent_xray.py | 67 ++++++++++++++++--------- src/agentlab/analyze/inspect_results.py | 1 + 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 38968fd6..310ebd22 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -184,8 +184,6 @@ def run_gradio(results_dir: Path): 2. **Select Task**: Select the task you want to analyze, this will trigger an update of the available seeds. 
-    **IMPORTANT NOTE**: Due to a gradio bug, if you sort the columns of the table, the task
-    selection will not correspond to the right one.
 
     3. **Select the Seed**: You might have multiple repetitions for a given task; you will be able
     to select the seed you want to analyze.
@@ -216,10 +214,9 @@
                 """\
 Click on a row to select an agent. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
             )
             agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False)
         with gr.Tab("Select Task and Seed", id="Select Task"):
@@ -231,9 +228,8 @@
                         """\
 Click on a row to select a task. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
                     )
 
                     refresh_results_button = gr.Button("↺", scale=0, size="sm")
@@ -250,9 +246,8 @@
                         """\
 Click on a row to select a seed. It will trigger the update of other fields.
 
-**GRADIO BUG**: If you sort the columns the click will not match the
-content. You have to sort back with the Idx column to align the click with
-the order."""
+The update mechanism is somewhat flaky; please help figure out why (or is it just gradio?).
+"""
                     )
 
                     seed_table = gr.DataFrame(
@@ -824,22 +819,22 @@ def extract_columns(row: pd.Series):
     )
 
     seed_df = result_df.apply(extract_columns, axis=1)
-    seed_df["Idx"] = seed_df.index
     return seed_df
 
 
 def on_select_agent(evt: gr.SelectData, df: pd.DataFrame):
-    global info
+    # TODO try to find a clever way to solve the sort bug here
     return info.get_agent_id(df.iloc[evt.index[0]])
 
 
 def on_select_task(evt: gr.SelectData, df: pd.DataFrame, agent_id: list[tuple]):
-    return (agent_id, df.iloc[evt.index[0]][TASK_NAME_KEY])
+    # get col index
+    col_idx = df.columns.get_loc(TASK_NAME_KEY)
+    return (agent_id, evt.row_value[col_idx])
 
 
 def update_seeds(agent_task_id: tuple):
     agent_id, task_name = agent_task_id
-    global info
     seed_df = get_seeds_df(info.agent_df, task_name)
     first_seed = seed_df.iloc[0]["seed"]
     return seed_df, EpisodeId(agent_id=agent_id, task_name=task_name, seed=first_seed)
 
 
 def on_select_seed(evt: gr.SelectData, df: pd.DataFrame, agent_task_id: tuple):
     agent_id, task_name = agent_task_id
-    seed = df.iloc[evt.index[0]]["seed"]
+    col_idx = df.columns.get_loc("seed")
+    seed = evt.row_value[col_idx]  # seed should be the first column
     return EpisodeId(agent_id=agent_id, task_name=task_name, seed=seed)
 
 
@@ -933,6 +929,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
     if exp_dir == select_dir_instructions:
         return None, None
 
+    exp_dir = exp_dir.split(" - ")[0]
     global info
 
     if len(exp_dir) == 0:
@@ -943,10 +940,13 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
     info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm)
     info.result_df = remove_args_from_col(info.result_df)
 
-    agent_report = display_table(get_agent_report(info.result_df))
+    study_summary = inspect_results.summarize_study(info.result_df)
+    # save 
study_summary + study_summary.to_csv(info.exp_list_dir / "summary_df.csv", index=False) + agent_report = display_table(study_summary) + info.agent_id_keys = agent_report.index.names agent_report.reset_index(inplace=True) - agent_report["Idx"] = agent_report.index agent_id = info.get_agent_id(agent_report.iloc[0]) @@ -960,7 +960,6 @@ def new_agent_id(agent_id: list[tuple]): info.tasks_df = inspect_results.reduce_episodes(info.agent_df).reset_index() info.tasks_df = info.tasks_df.drop(columns=["std_err"]) - info.tasks_df["Idx"] = info.tasks_df.index # task name of first element task_name = info.tasks_df.iloc[0][TASK_NAME_KEY] @@ -968,10 +967,30 @@ def new_agent_id(agent_id: list[tuple]): def get_directory_contents(results_dir: Path): - directories = sorted( - [str(file.name) for file in results_dir.iterdir() if file.is_dir()], reverse=True - ) - return [select_dir_instructions] + directories + exp_descriptions = [] + for dir in results_dir.iterdir(): + if not dir.is_dir(): + continue + + exp_description = dir.name + # get summary*.csv files and find the most recent + summary_files = list(dir.glob("summary*.csv")) + if len(summary_files) != 0: + most_recent_summary = max(summary_files, key=os.path.getctime) + summary_df = pd.read_csv(most_recent_summary) + + # get row with max avg_reward + max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax()] + reward = max_reward_row["avg_reward"] * 100 + completed = max_reward_row["n_completed"] + n_err = max_reward_row["n_err"] + exp_description += ( + f" - avg-reward: {reward:.1f}% - completed: {completed} - errors: {n_err}" + ) + + exp_descriptions.append(exp_description) + + return [select_dir_instructions] + sorted(exp_descriptions, reverse=True) def most_recent_folder(results_dir: Path): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 9ea2aac3..cf038773 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,6 +295,7 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), + cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) return pd.Series(record) From c4e8acbdc5a2aa7a7e6f160c8bfe93fd56e774b0 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:38 +0000 Subject: [PATCH 16/42] fix test --- src/agentlab/analyze/inspect_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index cf038773..4cb4ccf7 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,8 +295,9 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), - cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) + if "stats.cum_cost" in sub_df: + record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), return pd.Series(record) From c9f184c4546e680d6e529e2291cfa405ac620a3c Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:50 +0000 Subject: [PATCH 17/42] black --- src/agentlab/analyze/inspect_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 4cb4ccf7..8df2c1ef 100644 --- a/src/agentlab/analyze/inspect_results.py 
+++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), + record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),) return pd.Series(record) From 3a96d5608b2353477e94e7699d6c3880d002f824 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 9 Nov 2024 03:35:11 +0000 Subject: [PATCH 18/42] tmp --- src/agentlab/llm/llm_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index feb4d1c8..4a7a054e 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -21,7 +21,7 @@ model_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, vision_support=True, ), "openai/gpt-4-1106-preview": OpenAIModelArgs( @@ -84,7 +84,7 @@ deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, vision_support=True, ), # ---------------- OSS LLMs ----------------# @@ -167,7 +167,7 @@ model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16384, temperature=1e-1, ), } From a16aea037ff3594a6a672ffbb738a035b64411a3 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 13 Nov 2024 15:22:49 +0000 Subject: [PATCH 19/42] add error report, add cum cost to summary and ray backend by default --- src/agentlab/analyze/agent_xray.py | 19 +++++++++++++++---- src/agentlab/analyze/inspect_results.py | 2 +- src/agentlab/experiments/launch_exp.py | 6 +++--- src/agentlab/experiments/study.py | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 310ebd22..0d64a027 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -142,6 +142,10 @@ def filter_agent_id(self, agent_id: list[tuple]): max-height: 400px; overflow-y: auto; } +.error-report { + max-height: 700px; + overflow-y: auto; +} .my-code-view { max-height: 300px; overflow-y: auto; @@ -284,6 +288,8 @@ def run_gradio(results_dir: Path): with gr.Tab("Global Stats"): global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) + with gr.Tab("Error Report"): + error_report = gr.Markdown(elem_classes="error-report", show_copy_button=True) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") action_info = gr.Markdown(label="Action Info", elem_classes="my-markdown") @@ -411,7 +417,7 @@ def run_gradio(results_dir: Path): exp_dir_choice.change( fn=new_exp_dir, inputs=exp_dir_choice, - outputs=[agent_table, agent_id, constants, variables, global_stats], + outputs=[agent_table, agent_id, constants, variables, global_stats, error_report], ) agent_table.select(fn=on_select_agent, inputs=agent_table, outputs=[agent_id]) @@ -918,19 +924,24 @@ def get_agent_report(result_df: pd.DataFrame): def update_global_stats(): - global info stats = inspect_results.global_report(info.result_df, reduce_fn=inspect_results.summarize_stats) stats.reset_index(inplace=True) return stats +def update_error_report(): + report_files = list(info.exp_list_dir.glob("error_report*.md")) + if len(report_files) == 0: + return "No error report found" + report_files = sorted(report_files, 
key=os.path.getctime, reverse=True) + return report_files[0].read_text() + def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): if exp_dir == select_dir_instructions: return None, None exp_dir = exp_dir.split(" - ")[0] - global info if len(exp_dir) == 0: info.exp_list_dir = None @@ -951,7 +962,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): agent_id = info.get_agent_id(agent_report.iloc[0]) constants, variables = format_constant_and_variables() - return agent_report, agent_id, constants, variables, update_global_stats() + return agent_report, agent_id, constants, variables, update_global_stats(), update_error_report() def new_agent_id(agent_id: list[tuple]): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 8df2c1ef..09ba23a0 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),) + record["cum_cost"] = sub_df["stats.cum_cost"].sum(skipna=True).round(4) return pd.Series(record) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 49a778e3..cb331a99 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -40,9 +40,9 @@ def run_experiments( study_dir = Path(study_dir) study_dir.mkdir(parents=True, exist_ok=True) - if n_jobs == 1 and parallel_backend != "sequential": - logging.warning("Only 1 job, switching to sequential backend.") - parallel_backend = "sequential" + # if n_jobs == 1 and parallel_backend != "sequential": + # logging.warning("Only 1 job, switching to sequential backend.") + # parallel_backend = "sequential" logging.info(f"Saving experiments to {study_dir}") for exp_args in exp_args_list: diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 2139ce7b..b42f0bb5 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -123,7 +123,7 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None): def run( self, n_jobs=1, - parallel_backend="joblib", + parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, relaunch_errors=True, From a18e8e53ef0982153649a1605f9d98b95ee30d48 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:03:12 -0500 Subject: [PATCH 20/42] displaying exp names in ray dashboard (#123) * displaying exp names in ray dashboard * fixing tests --- src/agentlab/experiments/exp_utils.py | 15 ++++++++------- src/agentlab/experiments/graph_execution_ray.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index 97ce527d..95c7a71c 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -1,13 +1,13 @@ -import os -from pathlib import Path -from browsergym.experiments.loop import _move_old_exp, yield_all_exp_results -from tqdm import tqdm import logging -from browsergym.experiments.loop import ExpArgs -from contextlib import contextmanager +import os import signal import sys -from time import time, sleep +from contextlib import contextmanager +from pathlib import Path +from time import sleep, time + +from browsergym.experiments.loop import ExpArgs, 
_move_old_exp, yield_all_exp_results +from tqdm import tqdm logger = logging.getLogger(__name__) # Get logger based on module name @@ -130,6 +130,7 @@ def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[str, class MockedExpArgs: def __init__(self, exp_id, depends_on=None): self.exp_id = exp_id + self.exp_name = f"exp_{exp_id}" self.depends_on = depends_on if depends_on else [] self.start_time = None self.end_time = None diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 5dd18d4a..231a130c 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -28,7 +28,7 @@ def get_task(exp_arg: bgym.ExpArgs): dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] # Create new task that depends on the dependency results - task_map[exp_arg.exp_id] = run_exp.remote( + task_map[exp_arg.exp_id] = run_exp.options(name=f"{exp_arg.exp_name}").remote( exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout ) return task_map[exp_arg.exp_id] From a7d6467ed2c49110e384b8f965f14e5a08f5c1c2 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Fri, 15 Nov 2024 11:43:30 -0500 Subject: [PATCH 21/42] enabling chat o_0 (#124) --- src/agentlab/ui_assistant.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agentlab/ui_assistant.py b/src/agentlab/ui_assistant.py index 2bebaa41..96bbb0f9 100644 --- a/src/agentlab/ui_assistant.py +++ b/src/agentlab/ui_assistant.py @@ -3,6 +3,7 @@ from browsergym.experiments.loop import EnvArgs, ExpArgs from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import import_object @@ -14,6 +15,9 @@ def make_exp_args(agent_args: AgentArgs, start_url="https://www.google.com"): except AttributeError: pass + if isinstance(agent_args, GenericAgentArgs): + agent_args.flags.enable_chat = True + exp_args = ExpArgs( agent_args=agent_args, env_args=EnvArgs( From 50d4571284457d9c7d88a69bcda3b241785ad4f4 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 15 Nov 2024 16:58:24 +0000 Subject: [PATCH 22/42] sequential studies --- src/agentlab/experiments/study.py | 199 ++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 50 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index b42f0bb5..23713c5c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -1,7 +1,7 @@ +from abc import ABC, abstractmethod import gzip import logging import pickle -import re import uuid from dataclasses import dataclass from datetime import datetime @@ -13,7 +13,6 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.analyze import inspect_results -from agentlab.experiments import args from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies from agentlab.experiments.launch_exp import ( @@ -22,11 +21,96 @@ run_experiments, ) + logger = logging.getLogger(__name__) +def make_study( + agent_args: list[AgentArgs], + benchmark: bgym.Benchmark, + logging_level_stdout=logging.WARNING, + suffix="", + comment=None, + ignore_dependencies=False, +): + + if isinstance(benchmark, str): + benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() + + """Make a study from a list of 
agents and a benchmark.""" + if "webarena" in benchmark.name and len(agent_args) > 1: + logger.warning( + "*WebArena* requires manual reset after each evaluation. Running through SequentialStudies." + ) + studies = [] + for agent in agent_args: + studies.append( + Study( + [agent], + benchmark, + logging_level=logging_level_stdout, + suffix=suffix, + comment=comment, + ignore_dependencies=ignore_dependencies, + ) + ) + + return SequentialStudies(studies) + else: + return Study( + agent_args, + benchmark, + logging_level=logging_level_stdout, + suffix=suffix, + comment=comment, + ignore_dependencies=ignore_dependencies, + ) + + +class AbstractStudy(ABC): + dir: Path = None + suffix: str = "" + + @abstractmethod + def find_incomplete(self, include_errors=True): + """Search for missing""" + + @abstractmethod + def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + """Run the study""" + + def make_dir(self, exp_root=RESULTS_DIR): + if self.dir is None: + dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" + + self.dir = Path(exp_root) / dir_name + self.dir.mkdir(parents=True, exist_ok=True) + + def save(self, exp_root=RESULTS_DIR): + """Pickle the study to the directory""" + # TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories + + self.make_dir(exp_root=exp_root) + with gzip.open(self.dir / "study.pkl.gz", "wb") as f: + pickle.dump(self, f) + + def get_results(self, suffix="", also_save=True): + """Recursively load all results from the study directory and summarize them.""" + result_df = inspect_results.load_result_df(self.dir) + error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True) + summary_df = inspect_results.summarize_study(result_df) + + if also_save: + suffix = f"_{suffix}" if suffix else "" + result_df.to_csv(self.dir / f"result_df{suffix}.csv") + summary_df.to_csv(self.dir / f"summary_df{suffix}.csv") + (self.dir / f"error_report{suffix}.md").write_text(error_report) + + return result_df, summary_df, error_report + + @dataclass -class Study: +class Study(AbstractStudy): """A study coresponds to one or multiple agents evaluated on a benchmark. This is part of the high level API to help keep experiments organized and reproducible. @@ -142,7 +226,7 @@ def run( self._run(n_jobs, parallel_backend, strict_reproducibility) suffix = f"trial_{i + 1}_of_{n_relaunch}" - _, summary_df, error_report = self.get_results(suffix=suffix) + _, summary_df, _ = self.get_results(suffix=suffix) logger.info("\n" + str(summary_df)) n_incomplete, n_error = self.find_incomplete(include_errors=relaunch_errors) @@ -200,60 +284,17 @@ def append_to_journal(self, strict_reproducibility=True): ValueError: If the reproducibility information is not compatible with the report. 
""" + _, summary_df, _ = self.get_results() repro.append_to_journal( self.reproducibility_info, - self.get_report(), + summary_df, strict_reproducibility=strict_reproducibility, ) - def get_results(self, suffix="", also_save=True): - result_df = inspect_results.load_result_df(self.dir) - error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True) - summary_df = inspect_results.summarize_study(result_df) - - if also_save: - suffix = f"_{suffix}" if suffix else "" - result_df.to_csv(self.dir / f"result_df{suffix}.csv") - summary_df.to_csv(self.dir / f"summary_df{suffix}.csv") - (self.dir / f"error_report{suffix}.md").write_text(error_report) - - return result_df, summary_df, error_report - @property def name(self): agent_names = [a.agent_name for a in self.agent_args] - if len(agent_names) == 1: - study_name = f"{agent_names[0]}_on_{self.benchmark.name}" - else: - study_name = f"{len(agent_names)}_agents_on_{self.benchmark.name}" - - study_name = slugify(study_name, max_length=100, allow_unicode=True) - - if self.suffix: - study_name += f"_{self.suffix}" - return study_name - - def make_dir(self, exp_root=RESULTS_DIR): - if self.dir is None: - dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" - - self.dir = Path(exp_root) / dir_name - self.dir.mkdir(parents=True, exist_ok=True) - - def save(self): - """Pickle the study to the directory""" - - # TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories - - self.make_dir() - - with gzip.open(self.dir / "study.pkl.gz", "wb") as f: - pickle.dump(self, f) - - def get_report(self, ignore_cache=False, ignore_stale=False): - return inspect_results.get_study_summary( - self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale - ) + return _make_study_name(agent_names, [self.benchmark.name], self.suffix) def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: @@ -288,6 +329,64 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study": return Study.load(get_most_recent_study(root_dir, contains=contains)) +def _make_study_name(agent_names, benchmark_names, suffix=None): + """Make a study name from the agent and benchmark names.""" + if len(agent_names) == 1: + agent_name = agent_names[0] + else: + agent_name = f"{len(agent_names)}_agents" + + if len(benchmark_names) == 1: + benchmark_name = benchmark_names[0] + else: + benchmark_name = f"{len(benchmark_names)}_benchmarks" + + study_name = f"{agent_name}_on_{benchmark_name}_{suffix if suffix else ''}" + + return slugify(study_name, max_length=200, allow_unicode=True) + + +@dataclass +class SequentialStudies(AbstractStudy): + """ + Sequential execution of multiple studies. + + This is required for e.g. WebArena, where a server reset is required between evaluations of each agent. 
+ """ + + studies: list[Study] + + @property + def name(self): + """The name of the study.""" + agent_names = [a.agent_name for study in self.studies for a in study.agent_args] + benchmark_names = [study.benchmark.name for study in self.studies] + return _make_study_name(agent_names, benchmark_names, self.suffix) + + def find_incomplete(self, include_errors=True): + for study in self.studies: + study.find_incomplete(include_errors=include_errors) + + def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + + self.save() + + for study in self.studies: + study.make_dir(exp_root=self.dir) + study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch) + _, summary_df, _ = self.get_results() + logger.info("\n" + str(summary_df)) + logger.info(f"SequentialStudies {self.name} finished.") + + def override_max_steps(self, max_steps): + for study in self.studies: + study.override_max_steps(max_steps) + + def append_to_journal(self, strict_reproducibility=True): + for study in self.studies: + study.append_to_journal(strict_reproducibility=strict_reproducibility) + + def get_most_recent_study( root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None ): From d0919dc5dcf6b49ff078c29903b0159c0c64dc16 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:13:06 +0000 Subject: [PATCH 23/42] little bug --- src/agentlab/experiments/study.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 23713c5c..698714de 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -369,10 +369,12 @@ def find_incomplete(self, include_errors=True): def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + for study in self.studies: + study.make_dir(exp_root=self.dir) + self.save() for study in self.studies: - study.make_dir(exp_root=self.dir) study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch) _, summary_df, _ = self.get_results() logger.info("\n" + str(summary_df)) From 0e2b752642dc63a4a56de2c0eb50a75dfb3de24d Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:13:16 +0000 Subject: [PATCH 24/42] more flexible requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 453f312d..ba5c8732 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -black[jupyter]==24.2.0 +black>=24.2.0 blacken-docs pre-commit pytest==7.3.2 From 041fd68501142b1d56248ac2997b4192fa695d46 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 18 Nov 2024 19:14:24 +0000 Subject: [PATCH 25/42] imrove readme --- README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0bce34d4..bdad77c9 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,28 @@ + + + AgentLab is a framework for developing and evaluating agents on a variety of benchmarks supported by [BrowserGym](https://github.com/ServiceNow/BrowserGym). This includes: -* WebArena -* WorkArena.L1, L2, L3 -* VisualWebArena (coming soon...) -* MiniWoB +* [WebArena](https://webarena.dev/) +* [WorkArena](https://github.com/ServiceNow/WorkArena) L1, L2, L3 +* [WebLinx](https://mcgill-nlp.github.io/weblinx/) +* [VisualWebArena](https://github.com/web-arena-x/visualwebarena) +* Assistant Bench +* GAIA +* Mind2Web-live (coming soon ...) 
+* [MiniWoB](https://miniwob.farama.org/index.html) + +AgentLab Features: +* Easy large scale parallel agent experiments using [ray](https://www.ray.io/) +* Building blocks for making agents +* Unified LLM api for OpenRouter, OpenAI, Azure, Self hosted using TGI. +* Prefered way for running benchmarks like WebArena +* Various Reproducibility features +* Unified LeaderBoard The framework enables the desing of rich hyperparameter spaces and the launch of parallel experiments using ablation studies or random searches. It also provides From 79ac4184d92c8a8f0ce4bc6bf6dbbc6a1abb14a7 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 12:21:45 +0000 Subject: [PATCH 26/42] Enhance agent configuration and logging in study setup - Updated `get_vision_agent` to append "_vision" to agent names. - Modified `BaseMessage.__str__` to include a no-warning option for logging. - Improved `make_study` function to accept single agent args and benchmark types. - Added detailed docstrings for better clarity on parameters and functionality. - Introduced `avg_step_timeout` and `demo_mode` attributes in the Study class. --- README.md | 270 ++++++++++-------- .../agents/generic_agent/tmlr_config.py | 4 +- src/agentlab/experiments/study.py | 134 +++++++-- src/agentlab/llm/llm_utils.py | 12 +- 4 files changed, 273 insertions(+), 147 deletions(-) diff --git a/README.md b/README.md index b4c518fd..22a4e0fe 100644 --- a/README.md +++ b/README.md @@ -1,173 +1,177 @@ - - - - + + +  |   +[🎯 Benchmarks](#🎯-supported-benchmarks)   |   +[🛠️ Setup](#🛠️-setup-agentlab)   |   +[🤖 Assistant](#ui-assistant)   |   +[🚀 Launch Experiments](#🚀-launch-experiments)   |   +[🔍 AgentXray](#🔍-agentxray)   |   +[🤖 Make Your Own Agent](#implement-a-new-agent)   |   +[↻ Reproducibility](#↻-reproducibility)   |   + + AgentLab is a framework for developing and evaluating agents on a variety of -benchmarks supported by [BrowserGym](https://github.com/ServiceNow/BrowserGym). -This includes: -* [WebArena](https://webarena.dev/) -* [WorkArena](https://github.com/ServiceNow/WorkArena) L1, L2, L3 -* [WebLinx](https://mcgill-nlp.github.io/weblinx/) -* [VisualWebArena](https://github.com/web-arena-x/visualwebarena) -* Assistant Bench -* GAIA -* Mind2Web-live (coming soon ...) -* [MiniWoB](https://miniwob.farama.org/index.html) +[benchmarks](#🎯-supported-benchmarks) supported by +[BrowserGym](https://github.com/ServiceNow/BrowserGym). AgentLab Features: * Easy large scale parallel agent experiments using [ray](https://www.ray.io/) * Building blocks for making agents -* Unified LLM api for OpenRouter, OpenAI, Azure, Self hosted using TGI. +* Unified LLM api for OpenRouter, OpenAI, Azure, or self hosted using TGI. * Prefered way for running benchmarks like WebArena * Various Reproducibility features -* Unified LeaderBoard - -The framework enables the desing of rich hyperparameter spaces and the launch of -parallel experiments using ablation studies or random searches. It also provides -agent_xray, a visualization tool to inspect the results of the experiments using -a custom gradio interface - - - - - -## Install agentlab - -This repo is intended for testing and developing new agents, hence we clone and install using the `-e` flag. +* Unified LeaderBoard (soon) + +## 🎯 Supported Benchmarks +| Benchmark | Setup
Link | # Task<br>Template | Seed<br>Diversity | Max<br>Step | Multi-tab | Hosted Method | BrowserGym<br>
Leaderboard | +|-----------|------------|---------|----------------|-----------|-----------|---------------|----------------------| +| [WebArena](https://webarena.dev/) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/webarena/README.md) | 812 | None | 30 | yes | self hosted (docker) | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L1 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 33 | High | 30 | no | demo instance | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L2 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon | +| [WorkArena](https://github.com/ServiceNow/WorkArena) L3 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon | +| [WebLinx](https://mcgill-nlp.github.io/weblinx/) | - | 31586 | None | 1 | no | self hosted (dataset) | soon | +| [VisualWebArena](https://github.com/web-arena-x/visualwebarena) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/visualwebarena/README.md) | 910 | None | 30 | yes | self hosted (docker) | soon | +| [Assistant Bench](https://assistantbench.github.io/) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/assistantbench/README.md) | 214 | None | 30 | yes | live web | soon | +| [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon | +| [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon | +| [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon | +## 🛠️ Setup agentlab ```bash -git clone git@github.com:ServiceNow/AgentLab.git -pip install -e . +pip install agentlab ``` -## Set Environment Variables +Make sure to prepare the required benchmark according to instructions provided in the [setup +column](#🎯-supported-benchmarks). ```bash export AGENTLAB_EXP_ROOT= # defaults to $HOME/agentlab_results export OPENAI_API_KEY= # if openai models are used -export HUGGINGFACEHUB_API_TOKEN= # if huggingface models are used -``` - -## Use an assistant to work for you (at your own cost and risk) -```bash -agentlab-assistant --start_url https://www.google.com ``` -## Prepare Benchmarks -Depending on which benchmark you use, there are some prerequisites -
-MiniWoB +Setup OpenRouter API ```bash -export MINIWOB_URL="file://$HOME/dev/miniwob-plusplus/miniwob/html/miniwob/" +export OPENROUTER_API_KEY= # if openrouter models are used ```
+Setup Azure API -WorkArena - -See [detailed instructions on workarena github](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) - -At a glance: -1) [Sign in](https://developer.servicenow.com/) and reqeuest a `washington` instance. -2) Once the instance is ready, you should see `` and `` -3) Add these to your `.bashrc` (or `.zshrc`) and `source` it (note: make sure that - all variables are in single quotes unless you happen to have a password with a - single quote in it) - ```bash - export SNOW_INSTANCE_URL='https://.service-now.com/' - export SNOW_INSTANCE_UNAME='admin' - export SNOW_INSTANCE_PWD='' - ``` -4) finally run these commands: - - ```bash - pip install browsergym-workarena - playwright install - workarena-install - ``` - - +```bash +export AZURE_OPENAI_API_KEY= # if using azure models +export AZURE_OPENAI_ENDPOINT= # if using azure models +```
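+
+A minimal sanity check, independent of AgentLab, to confirm the keys are visible to Python
+before launching a study (variable names exactly as exported above):
+
+```python
+import os
+
+# Print which provider keys are set in the current shell environment.
+for key in ("OPENAI_API_KEY", "OPENROUTER_API_KEY", "AZURE_OPENAI_API_KEY"):
+    print(key, "is set" if os.environ.get(key) else "is MISSING")
+```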
-<details>
-<summary>WebArena on AWS</summary>
-TODO
-</details>
+## UI-Assistant +Use an assistant to work for you (at your own cost and risk). -
-<summary>WebArena on Azure</summary>
-TODO
-</details>
+```bash +agentlab-assistant --start_url https://www.google.com +``` +Try your own agent: +```bash +agentlab-assistant --agent_config="module.path.to.your.AgentArgs" +``` + +## 🚀 Launch experiments +```python +# Import your agent configuration extending bgym.AgentArgs class +# Make sure this object is imported from a module accessible in PYTHONPATH to properly unpickle +from agentlab.agents.generic_agent import AGENT_4o_MINI +from agentlab.experiments.study import make_study -## Launch experiments +study = make_study( + benchmark="miniwob", # or "webarena", "workarnea_l1" ... + agent_args=[AGENT_4o_MINI], + comment="My first study", +) -Create your agent or import an existing one: -```python -from agentlab.agents.generic_agent.agent_configs import AGENT_4o +study.run(n_jobs=5) ``` -Run the agent on a benchmark: +Relaunching incomplete or errored tasks + ```python -study_name, exp_args_list = run_agents_on_benchmark(AGENT_4o, benchmark) -study_dir = make_study_dir(RESULTS_DIR, study_name) -run_experiments(n_jobs, exp_args_list, study_dir) +from agentlab.experiments.study import Study +study = Study.load("/path/to/your/study/dir") +study.find_incomplete(include_errors=True) +study.run() ``` -use [main.py](main.py) to launch experiments with a variety -of options. This is like a lazy CLI that is actually more convenient than a CLI. -Just comment and uncomment the lines you need or modify at will (but don't push -to the repo). - -
+See [main.py](main.py) to launch experiments with a variety of options. This is like a lazy CLI that +is actually more convenient. Just comment and uncomment the lines you need or modify at will (but +don't push to the repo). -Debugging -For debugging, run experiments using `n_jobs=1` and use VSCode debug mode. This -will allow you to stop on breakpoints. To prevent the debugger from stopping -on errors when running multiple experiments directly in VSCode, set -`enable_debug = False` in `ExpArgs` -
+### Job Timeouts +The complexity of the wild web, Playwright, and asyncio can sometimes cause jobs to hang. This +disables workers until the study is terminated and relaunched. If you are running jobs sequentially +or with a small number of workers, this could halt your entire study until you manually kill and +relaunch it. In the Ray parallel backend, we've implemented a system to automatically terminate jobs +exceeding a specified timeout. This feature is particularly useful when task hanging limits your +experiments. +### Debugging +For debugging, run experiments with `n_jobs=1` and use VSCode's debug mode. This allows you to pause +execution at breakpoints. To prevent the debugger from stopping on errors while running multiple +experiments in VSCode, set `enable_debug = False` in `ExpArgs`. +### About Parallel Jobs -
+Running one agent on one task corresponds to a single job. Conducting ablation studies or random +searches across hundreds of tasks with multiple seeds can generate more than 10,000 jobs. Efficient +parallel execution is therefore critical. Agents typically wait for responses from the LLM server or +updates from the web server. As a result, you can run 10–50 jobs in parallel on a single computer, +depending on available RAM. -Parallel jobs +⚠️ **Note for (Visual)WebArena**: These benchmarks have task dependencies designed to minimize +"corrupting" the instance between tasks. For example, an agent on task 323 could alter the instance +state, making task 201 impossible. To address this, the Ray backend accounts for task dependencies, +enabling some degree of parallelism. On WebArena, you can disable dependencies to increase +parallelism, but this might reduce performance by 1–2%. -Running one agent on one task correspond to one job. When conducting ablation -studies or random searches on hundreds of tasks with multiple seeds, this can -lead to more than 10000 jobs. It is thus crucial to execute them in parallel. -The agent usually wait on the LLM server to return the results or the web server -to update the page. Hence, you can run 10-50 jobs in parallel on a single -computer depending on how much RAM is available. +⚠️ **Instance Reset for (Visual)WebArena**: Before evaluating an agent, the instance is +automatically reset, a process that takes about 5 minutes. When evaluating multiple agents, the +`make_study` function returns a `SequentialStudies` object to ensure proper sequential evaluation of +each agent. AgentLab currently does not support evaluations across multiple instances, but you could +either create a quick script to handle this or submit a PR to AgentLab. For a smoother parallel +experience, consider using benchmarks like WorkArena instead. -
-## AgentXray +## 🔍 AgentXray While your experiments are running, you can inspect the results using: ```bash agentlab-xray ``` - - - -You will be able to select the recent experiments in the directory -`AGENTLAB_EXP_ROOT` and visualize the results in a gradio interface. + + + +You will be able to select the recent experiments in the directory `AGENTLAB_EXP_ROOT` and visualize +the results in a gradio interface. In the following order, select: * The experiment you want to visualize @@ -175,14 +179,52 @@ In the following order, select: * The task * And the seed -Once this is selected, you can see the trace of your agent on the given task. -Click on the profiling image to select a step and observe the action taken by the agent. +Once this is selected, you can see the trace of your agent on the given task. Click on the profiling +image to select a step and observe the action taken by the agent. ## Implement a new Agent -Get inspiration from the `MostBasicAgent` in [agentlab/agents/most_basic_agent/most_basic_agent.py](src/agentlab/agents/most_basic_agent/most_basic_agent.py) +Get inspiration from the `MostBasicAgent` in +[agentlab/agents/most_basic_agent/most_basic_agent.py](src/agentlab/agents/most_basic_agent/most_basic_agent.py). +For a better integration with the tools, make sure to implement most functions in the +[AgentArgs](src/agentlab/agents/agent_args.py#L5) API and the extended `bgym.AbstractAgentArgs`. + +If you think your agent should be included directly in AgenLab, let use know and it can be added in +agentlab/agents/ with the name of your agent. + +## ↻ Reproducibility +Several factors can influence reproducibility of results in the context of evaluating agents on +dynamic benchmarks. + +### Factors affecting roproducibility +* **Software version**: Different version of Playwright or any package in the software stack could + influence the behavior of the benchmark or the agent. +* **API based LLMs silently changing**: Even for a fixed version, a LLM may be updated e.g. to + incorporate latest web knowledge. +* **Live websites**: + * WorkArena: The demo instance is mostly fixed in time to a specific version but ServiceNow + sometime push minor modifications. + * AssistantBench and GAIA: These rely on the agent navigating the open web. The experience may + change depending on which country or region, some websites might be in different languages by + default. +* **Stochastic Agents**: Setting temperature of the LLM to 0 can reduce most stochasticity. +* **Non deterministic tasks**: For a fixed seed, the changes should be minimal + +### Reproducibility Features +* `Study` contains a dict of information about reproducibility, including benchmark version, package + version and commit hash +* The `Study` class allows automatic upload of your results to + [`reproducibility_journal.csv`](reproducibility_journal.csv). This makes it easier to populate a + large amount of reference points. +* **Reproduced results in the leaderboard**. For agents that are repdocudibile, we encourage users + to try to reproduce the results and upload them to the leaderboard. There is a special column + containing information about all reproduced results of an agent on a benchmark. +* **ReproducibilityAgent**: You can run this agent on an existing study and it will try to re-run + the same actions on the same task seeds. A vsiual diff of the two prompts will be displayed in the + AgentInfo HTML tab of AgentXray. You will be able to inspect on some tasks what kind of changes + between to two executions. 
**Note**: this is a beta feature and will need some adaptation for your + own agent. -Create a new directory in agentlab/agents/ with the name of your agent. ## Misc diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py index 48a28c68..96abc46c 100644 --- a/src/agentlab/agents/generic_agent/tmlr_config.py +++ b/src/agentlab/agents/generic_agent/tmlr_config.py @@ -56,10 +56,12 @@ def get_base_agent(llm_config: str): def get_vision_agent(llm_config: str): flags = deepcopy(BASE_FLAGS) flags.obs.use_screenshot = True - return GenericAgentArgs( + agent_args = GenericAgentArgs( chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], flags=flags, ) + agent_args.agent_name = f"{agent_args.agent_name}_vision" + return agent_args def get_som_agent(llm_config: str): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 698714de..851f3178 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -26,18 +26,59 @@ def make_study( - agent_args: list[AgentArgs], - benchmark: bgym.Benchmark, + agent_args: list[AgentArgs] | AgentArgs, + benchmark: bgym.Benchmark | str, logging_level_stdout=logging.WARNING, suffix="", comment=None, ignore_dependencies=False, ): + """Run a list of agents on a benchmark. + + Args: + agent_args: list[AgentArgs] | AgentArgs + The agent configuration(s) to run. *IMPORTANT*: these objects will be pickled and + unpickled. Make sure they are imported from a package that is accessible from + PYTHONPATH. Otherwise, it won't load in agentlab-xray. + + benchmark: bgym.Benchmark | str + The benchmark to run the agents on. See bgym.DEFAULT_BENCHMARKS for the main ones. You + can also make your own by modifying an existing one. + + logging_level_stdout: int + The logging level for the stdout of the main script. Each job will have its own logging + level that will save into file and can be seen in agentlab-xray. + + suffix: str + A suffix to add to the study name. This can be useful to keep track of your experiments. + By default the study name contains agent name, benchmark name and date. + + comment: str + Extra comments from the authors of this study to be stored in the reproducibility + information. Leave any extra information that can explain why results could be different + than expected. + + ignore_dependencies: bool + If True, ignore the dependencies of the tasks in the benchmark. *Use with caution.* So + far, only WebArena and VisualWebArena have dependencies between tasks to minimize the + influence of solving one task before another one. This dependency graph allows + experiments to run in parallel while respecting task dependencies. However, it still + can't run more than 4 and, in practice it's speeding up evaluation by a factor of only + 3x compare to sequential executionz. To accelerate execution, you can ignore + dependencies and run in full parallel. This leads to a decrease in performance of about + 1%-2%, and could be more. Note: ignore_dependencies on VisualWebArena doesn't work. + + Returns: + Study object or SequentialStudies object if the benchmark requires manual reset after each + evaluation such as WebArena and VisualWebArena. 
+ """ + + if not isinstance(agent_args, (list, tuple)): + agent_args = [agent_args] if isinstance(benchmark, str): benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() - """Make a study from a list of agents and a benchmark.""" if "webarena" in benchmark.name and len(agent_args) > 1: logger.warning( "*WebArena* requires manual reset after each evaluation. Running through SequentialStudies." @@ -68,18 +109,21 @@ def make_study( class AbstractStudy(ABC): + """Abstract class for a study.""" + dir: Path = None suffix: str = "" @abstractmethod def find_incomplete(self, include_errors=True): - """Search for missing""" + """Prepare the study for relaunching by finding incomplete experiments""" @abstractmethod def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): """Run the study""" def make_dir(self, exp_root=RESULTS_DIR): + """Create a directory for the study""" if self.dir is None: dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" @@ -116,24 +160,48 @@ class Study(AbstractStudy): This is part of the high level API to help keep experiments organized and reproducible. Attributes: - benchmark: Benchmark | str - The benchmark to evaluate the agents on. If a string is provided, it will be - converted to the corresponding benchmark using bgym.DEFAULT_BENCHMARKS. - agent_args: list[AgentArgs] - The list of agents to evaluate. - + The agent configuration(s) to run. *IMPORTANT*: these objects will be pickled and + unpickled. Make sure they are imported from a package that is accessible from + PYTHONPATH. Otherwise, it won't load in agentlab-xray. + benchmark: bgym.Benchmark | str + The benchmark to run the agents on. See bgym.DEFAULT_BENCHMARKS for the main ones. You + can also make your own by modifying an existing one. dir: Path - The directory where the results will be saved. - + The directory where the study will be saved. If None, a directory will be created in + RESULTS_DIR. suffix: str - A suffix to add to the study name - + A suffix to add to the study name. This can be useful to keep track of your experiments. + By default the study name contains agent name, benchmark name and date. uuid: str - A unique identifier for the study - + A unique identifier for the study. reproducibility_info: dict - The reproducibility information for the study. + Information about the study that may affect the reproducibility of the experiment. e.g.: + versions of BrowserGym, benchmark, AgentLab... + logging_level: int + The logging level for individual jobs. + logging_level_stdout: int + The logging level for the stdout of the main script. Each job will have its own logging + level that will save into file and can be seen in agentlab-xray. + comment: str + Extra comments from the authors of this study to be stored in the reproducibility + information. Leave any extra information that can explain why results could be different + than expected. + ignore_dependencies: bool + If True, ignore the dependencies of the tasks in the benchmark. *Use with caution.* So + far, only WebArena and VisualWebArena have dependencies between tasks to minimize the + influence of solving one task before another one. This dependency graph allows + experiments to run in parallel while respecting task dependencies. However, it still + can't run more than 4 and, in practice it's speeding up evaluation by a factor of only + 3x compare to sequential executionz. To accelerate execution, you can ignore + dependencies and run in full parallel. 
This leads to a decrease in performance of about + 1%-2%, and could be more. Note: ignore_dependencies on VisualWebArena doesn't work. + avg_step_timeout: int + The average step timeout in seconds. This is used to stop the experiments if they are + taking too long. The default is 60 seconds. + demo_mode: bool + If True, the experiments will be run in demo mode, which will record videos, and enable + visual effects for actions. """ agent_args: list[AgentArgs] = None @@ -146,8 +214,11 @@ class Study(AbstractStudy): logging_level_stdout: int = logging.WARNING comment: str = None # Extra comments from the authors of this study ignore_dependencies: bool = False + avg_step_timeout: int = 60 + demo_mode: bool = False def __post_init__(self): + """Initialize the study. Set the uuid, and generate the exp_args_list.""" self.uuid = uuid.uuid4() if isinstance(self.benchmark, str): self.benchmark = bgym.DEFAULT_BENCHMARKS[self.benchmark]() @@ -156,12 +227,14 @@ def __post_init__(self): self.make_exp_args_list() def make_exp_args_list(self): + """Generate the exp_args_list from the agent_args and the benchmark.""" self.exp_args_list = _agents_on_benchmark( self.agent_args, self.benchmark, logging_level=self.logging_level, logging_level_stdout=self.logging_level_stdout, ignore_dependencies=self.ignore_dependencies, + demo_mode=self.demo_mode, ) def find_incomplete(self, include_errors=True): @@ -271,7 +344,13 @@ def _run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False self.benchmark.prepare_backends() logger.info("Backends ready.") - run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) + run_experiments( + n_jobs, + self.exp_args_list, + self.dir, + parallel_backend=parallel_backend, + avg_step_timeout=self.avg_step_timeout, + ) def append_to_journal(self, strict_reproducibility=True): """Append the study to the journal. @@ -331,6 +410,11 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study": def _make_study_name(agent_names, benchmark_names, suffix=None): """Make a study name from the agent and benchmark names.""" + + # extract unique agent and benchmark names + agent_names = list(set(agent_names)) + benchmark_names = list(set(benchmark_names)) + if len(agent_names) == 1: agent_name = agent_names[0] else: @@ -369,6 +453,9 @@ def find_incomplete(self, include_errors=True): def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): + # This sequence of of making directories is important to make sure objects are materialized + # properly before saving. Otherwise relaunch may not work properly. + self.make_dir() for study in self.studies: study.make_dir(exp_root=self.dir) @@ -425,7 +512,7 @@ def get_most_recent_study( def set_demo_mode(env_args_list: list[EnvArgs]): - + """Set the demo mode for the experiments. 
This can be useful for generating videos for demos.""" for env_args in env_args_list: env_args.viewport = {"width": 1280, "height": 720} env_args.record_video = True @@ -473,15 +560,6 @@ def _agents_on_benchmark( if demo_mode: set_demo_mode(env_args_list) - # exp_args_list = args.expand_cross_product( - # ExpArgs( - # agent_args=args.CrossProd(agents), - # env_args=args.CrossProd(env_args_list), - # logging_level=logging_level, - # logging_level_stdout=logging_level_stdout, - # ) - # ) # type: list[ExpArgs] - exp_args_list = [] for agent in agents: diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index eaa2a5e0..c283bd3c 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -329,15 +329,19 @@ def __init__(self, role: str, content: Union[str, list[dict]]): self["role"] = role self["content"] = deepcopy(content) - def __str__(self) -> str: + def __str__(self, no_warning=False) -> str: if isinstance(self["content"], str): return self["content"] if not all(elem["type"] == "text" for elem in self["content"]): - logging.warning( - "The content of the message has images, which are not displayed in the string representation." - ) + if not no_warning: + logging.warning( + "The content of the message has images, which are not displayed in the string representation." + ) return "\n".join([elem["text"] for elem in self["content"] if elem["type"] == "text"]) + def get_text(self): + return str(self, no_warning=True) + def add_content(self, type: str, content: Any): if isinstance(self["content"], str): text = self["content"] From f4f9e25416eb457c7c2ce4e344508b2f862c0fbf Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 15:03:53 +0000 Subject: [PATCH 27/42] get_text was added by mistake --- src/agentlab/llm/llm_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index 2b7efd00..856a99b0 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -340,9 +340,6 @@ def __str__(self, warn_if_image=False) -> str: return "\n".join([elem["text"] for elem in self["content"] if elem["type"] == "text"]) - def get_text(self): - return str(self, no_warning=True) - def add_content(self, type: str, content: Any): if isinstance(self["content"], str): text = self["content"] From 8677f4847c1bfe2b34ba489b5ebc8731d8a10ecd Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 22 Nov 2024 15:04:09 +0000 Subject: [PATCH 28/42] Update README and Jupyter notebook with improved documentation and result analysis instructions --- README.md | 38 +++++++++++++++------ src/agentlab/analyze/inspect_results.ipynb | 39 ++++++++++++++-------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 22a4e0fe..096254d4 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,16 @@ [🛠️ Setup](#🛠️-setup-agentlab)   |   [🤖 Assistant](#ui-assistant)   |   [🚀 Launch Experiments](#🚀-launch-experiments)   |   -[🔍 AgentXray](#🔍-agentxray)   |   +[🔍 Analyse Results](#🔍-analyse-results)   |   [🤖 Make Your Own Agent](#implement-a-new-agent)   |   [↻ Reproducibility](#↻-reproducibility)   |   -