From 1b1c4dc586c09d45c44cc91b221740b3ad8d3e1a Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Apr 2025 13:09:15 -0400 Subject: [PATCH 1/4] Add GenericAgent and prompt builder implementations for AgentLab --- .../agents/visual_agent/visual_agent.py | 207 ++++++++++++++++++ .../visual_agent/visual_agent_prompts.py | 207 ++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 src/agentlab/agents/visual_agent/visual_agent.py create mode 100644 src/agentlab/agents/visual_agent/visual_agent_prompts.py diff --git a/src/agentlab/agents/visual_agent/visual_agent.py b/src/agentlab/agents/visual_agent/visual_agent.py new file mode 100644 index 00000000..2c30f05b --- /dev/null +++ b/src/agentlab/agents/visual_agent/visual_agent.py @@ -0,0 +1,207 @@ +""" +GenericAgent implementation for AgentLab + +This module defines a `GenericAgent` class and its associated arguments for use in the AgentLab framework. \ +The `GenericAgent` class is designed to interact with a chat-based model to determine actions based on \ +observations. It includes methods for preprocessing observations, generating actions, and managing internal \ +state such as plans, memories, and thoughts. The `GenericAgentArgs` class provides configuration options for \ +the agent, including model arguments and flags for various behaviors. 
+""" + +from copy import deepcopy +from dataclasses import asdict, dataclass +from warnings import warn + +import bgym +from browsergym.experiments.agent import Agent, AgentInfo + +from agentlab.agents import dynamic_prompting as dp +from agentlab.agents.agent_args import AgentArgs +from agentlab.llm.chat_api import BaseModelArgs +from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry +from agentlab.llm.tracking import cost_tracker_decorator + +from .visual_agent_prompts import GenericPromptFlags, MainPrompt +from functools import partial + + +@dataclass +class ToolAgentFlags: + pass + + +@dataclass +class ToolAgentArgs(AgentArgs): + chat_model_args: BaseModelArgs = None + flags: GenericPromptFlags = None + max_retry: int = 4 + + def __post_init__(self): + try: # some attributes might be temporarily args.CrossProd for hyperparameter generation + self.agent_name = f"GenericAgent-{self.chat_model_args.model_name}".replace("/", "_") + except AttributeError: + pass + + def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): + """Override Some flags based on the benchmark.""" + if benchmark.name.startswith("miniwob"): + self.flags.obs.use_html = True + + self.flags.obs.use_tabs = benchmark.is_multi_tab + self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) + + # for backward compatibility with old traces + if self.flags.action.multi_actions is not None: + self.flags.action.action_set.multiaction = self.flags.action.multi_actions + if self.flags.action.is_strict is not None: + self.flags.action.action_set.strict = self.flags.action.is_strict + + # verify if we can remove this + if demo_mode: + self.flags.action.action_set.demo_mode = "all_blue" + + def set_reproducibility_mode(self): + self.chat_model_args.temperature = 0 + + def prepare(self): + return self.chat_model_args.prepare_server() + + def close(self): + return self.chat_model_args.close_server() + + def make_agent(self): + return ToolAgent( + 
chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry + ) + + +class ToolAgent(Agent): + + def __init__( + self, + chat_model_args: BaseModelArgs, + flags: GenericPromptFlags, + max_retry: int = 4, + ): + + self.chat_llm = chat_model_args.make_model() + self.chat_model_args = chat_model_args + self.max_retry = max_retry + + self.flags = flags + self.action_set = self.flags.action.action_set.make_action_set() + self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs) + + self._check_flag_constancy() + self.reset(seed=None) + + def obs_preprocessor(self, obs: dict) -> dict: + return self._obs_preprocessor(obs) + + @cost_tracker_decorator + def get_action(self, obs): + + self.obs_history.append(obs) + main_prompt = MainPrompt( + action_set=self.action_set, + obs_history=self.obs_history, + actions=self.actions, + memories=self.memories, + thoughts=self.thoughts, + previous_plan=self.plan, + step=self.plan_step, + flags=self.flags, + ) + + max_prompt_tokens, max_trunc_itr = self._get_maxes() + + system_prompt = SystemMessage(dp.SystemPrompt().prompt) + + human_prompt = dp.fit_tokens( + shrinkable=main_prompt, + max_prompt_tokens=max_prompt_tokens, + model_name=self.chat_model_args.model_name, + max_iterations=max_trunc_itr, + additional_prompts=system_prompt, + ) + try: + # TODO, we would need to further shrink the prompt if the retry + # cause it to be too long + + chat_messages = Discussion([system_prompt, human_prompt]) + ans_dict = retry( + self.chat_llm, + chat_messages, + n_retry=self.max_retry, + parser=main_prompt._parse_answer, + ) + ans_dict["busted_retry"] = 0 + # inferring the number of retries, TODO: make this less hacky + ans_dict["n_retry"] = (len(chat_messages) - 3) / 2 + except ParseError as e: + ans_dict = dict( + action=None, + n_retry=self.max_retry + 1, + busted_retry=1, + ) + + stats = self.chat_llm.get_stats() + stats["n_retry"] = ans_dict["n_retry"] + stats["busted_retry"] = ans_dict["busted_retry"] + + self.plan = 
ans_dict.get("plan", self.plan) + self.plan_step = ans_dict.get("step", self.plan_step) + self.actions.append(ans_dict["action"]) + self.memories.append(ans_dict.get("memory", None)) + self.thoughts.append(ans_dict.get("think", None)) + + agent_info = AgentInfo( + think=ans_dict.get("think", None), + chat_messages=chat_messages, + stats=stats, + extra_info={"chat_model_args": asdict(self.chat_model_args)}, + ) + return ans_dict["action"], agent_info + + def reset(self, seed=None): + self.seed = seed + self.plan = "No plan yet" + self.plan_step = -1 + self.memories = [] + self.thoughts = [] + self.actions = [] + self.obs_history = [] + + def _check_flag_constancy(self): + flags = self.flags + if flags.obs.use_som: + if not flags.obs.use_screenshot: + warn( + """ +Warning: use_som=True requires use_screenshot=True. Disabling use_som.""" + ) + flags.obs.use_som = False + if flags.obs.use_screenshot: + if not self.chat_model_args.vision_support: + warn( + """ +Warning: use_screenshot is set to True, but the chat model \ +does not support vision. Disabling use_screenshot.""" + ) + flags.obs.use_screenshot = False + return flags + + def _get_maxes(self): + maxes = ( + self.flags.max_prompt_tokens, + self.chat_model_args.max_total_tokens, + self.chat_model_args.max_input_tokens, + ) + maxes = [m for m in maxes if m is not None] + max_prompt_tokens = min(maxes) if maxes else None + max_trunc_itr = ( + self.flags.max_trunc_itr + if self.flags.max_trunc_itr + else 20 # dangerous to change the default value here? + ) + return max_prompt_tokens, max_trunc_itr diff --git a/src/agentlab/agents/visual_agent/visual_agent_prompts.py b/src/agentlab/agents/visual_agent/visual_agent_prompts.py new file mode 100644 index 00000000..7010966d --- /dev/null +++ b/src/agentlab/agents/visual_agent/visual_agent_prompts.py @@ -0,0 +1,207 @@ +""" +Prompt builder for GenericAgent + +It is based on the dynamic_prompting module from the agentlab package. 
+""" + +import logging +from dataclasses import dataclass + +from browsergym.core.action.base import AbstractActionSet + +from agentlab.agents import dynamic_prompting as dp +from agentlab.llm.llm_utils import BaseMessage, HumanMessage, image_to_jpg_base64_url + + +@dataclass +class PromptFlags(dp.Flags): + """ + A class to represent various flags used to control features in an application. + + Attributes: + use_criticise (bool): Ask the LLM to first draft and criticise the action before producing it. + use_thinking (bool): Enable a chain of thoughts. + use_concrete_example (bool): Use a concrete example of the answer in the prompt for a generic task. + use_abstract_example (bool): Use an abstract example of the answer in the prompt. + use_hints (bool): Add some human-engineered hints to the prompt. + enable_chat (bool): Enable chat mode, where the agent can interact with the user. + max_prompt_tokens (int): Maximum number of tokens allowed in the prompt. + be_cautious (bool): Instruct the agent to be cautious about its actions. + extra_instructions (Optional[str]): Extra instructions to provide to the agent. + add_missparsed_messages (bool): When retrying, add the missparsed messages to the prompt. + flag_group (Optional[str]): Group of flags used. + """ + + obs: dp.ObsFlags + action: dp.ActionFlags + use_criticise: bool = False # + use_thinking: bool = False + use_concrete_example: bool = True + use_abstract_example: bool = False + use_hints: bool = False + enable_chat: bool = False + max_prompt_tokens: int = None + be_cautious: bool = True + extra_instructions: str | None = None + add_missparsed_messages: bool = True + max_trunc_itr: int = 20 + flag_group: str = None + + +class SystemPrompt(dp.PromptElement): + _prompt = """\ +You are an agent trying to solve a web task based on the content of the page and +user instructions. You can interact with the page and explore, and send messages to the user. 
Each time you +submit an action it will be sent to the browser and you will receive a new page.""" + + +def make_instructions(obs: dict, from_chat: bool, extra_instructions: str | None): + """Convenient wrapper to extract instructions from either goal or chat""" + if from_chat: + instructions = dp.ChatInstructions( + obs["chat_messages"], extra_instructions=extra_instructions + ) + else: + if sum([msg["role"] == "user" for msg in obs.get("chat_messages", [])]) > 1: + logging.warning( + "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`." + ) + instructions = dp.GoalInstructions( + obs["goal_object"], extra_instructions=extra_instructions + ) + return instructions + + +class History(dp.PromptElement): + """ + Format the actions and thoughts of previous steps.""" + + def __init__(self, actions, thoughts) -> None: + + prompt_elements = [] + for i, (action, thought) in enumerate(zip(actions, thoughts)): + prompt_elements.append( + f""" +## Step {i} +### Thoughts: +{thought} +### Action: +{action} +""" + ) + self._prompt = "\n".join(prompt_elements) + "\n" + + +class Observation(dp.PromptElement): + """Observation of the current step. + + Contains the html, the accessibility tree and the error logs. 
+ """ + + def __init__(self, obs, flags: dp.ObsFlags) -> None: + super().__init__() + self.flags = flags + self.obs = obs + + # for a multi-tab browser, we need to show the current tab + self.tabs = dp.Tabs( + obs, + visible=lambda: flags.use_tabs, + prefix="## ", + ) + + # if an error is present, we need to show it + self.error = dp.Error( + obs["last_action_error"], + visible=lambda: flags.use_error_logs and obs["last_action_error"], + prefix="## ", + ) + + @property + def _prompt(self) -> str: + return f""" +# Observation of current step: +{self.tabs.prompt}{self.focused_element.prompt}{self.error.prompt} + +""" + + def add_screenshot(self, prompt: BaseMessage) -> BaseMessage: + if self.flags.use_screenshot: + if self.flags.use_som: + screenshot = self.obs["screenshot_som"] + prompt.add_text( + "\n## Screenshot:\nHere is a screenshot of the page, it is annotated with bounding boxes and corresponding bids:" + ) + else: + screenshot = self.obs["screenshot"] + prompt.add_text("\n## Screenshot:\nHere is a screenshot of the page:") + img_url = image_to_jpg_base64_url(screenshot) + prompt.add_image(img_url, detail=self.flags.openai_vision_detail) + return prompt + + +class MainPrompt(dp.PromptElement): + + def __init__( + self, + action_set: AbstractActionSet, + obs: dict, + actions: list[str], + thoughts: list[str], + flags: PromptFlags, + ) -> None: + super().__init__() + self.flags = flags + self.history = History(obs, actions, thoughts) + self.instructions = make_instructions(obs, flags.enable_chat, flags.extra_instructions) + self.obs = dp.Observation( + obs, + self.flags.obs, + ) + + self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) + self.think = dp.Think(visible=lambda: flags.use_thinking) + + @property + def _prompt(self) -> HumanMessage: + prompt = HumanMessage(self.instructions.prompt) + prompt.add_text( + f"""\ +{self.obs.prompt}\ +{self.history.prompt}\ +{self.action_prompt.prompt}\ +{self.think.prompt}\ +""" + ) + + if 
self.flags.use_abstract_example: + prompt.add_text( + f""" +# Abstract Example + +Here is an abstract version of the answer with description of the content of +each tag. Make sure you follow this structure, but replace the content with your +answer: +{self.think.abstract_ex}\ +{self.action_prompt.abstract_ex}\ +""" + ) + + if self.flags.use_concrete_example: + prompt.add_text( + f""" +# Concrete Example + +Here is a concrete example of how to format your answer. +Make sure to follow the template with proper tags: +{self.think.concrete_ex}\ +{self.action_prompt.concrete_ex}\ +""" + ) + return self.obs.add_screenshot(prompt) + + def _parse_answer(self, text_answer): + ans_dict = {} + ans_dict.update(self.think.parse_answer(text_answer)) + ans_dict.update(self.action_prompt.parse_answer(text_answer)) + return ans_dict From c25dc84cac26dfb59cba1b776a4bc458696043b5 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 12 Apr 2025 10:20:00 -0400 Subject: [PATCH 2/4] Implement VisualAgent and associated prompt flags for enhanced agent functionality --- .../agents/visual_agent/agent_configs.py | 48 +++++++++ .../agents/visual_agent/visual_agent.py | 100 ++---------------- .../visual_agent/visual_agent_prompts.py | 42 ++------ 3 files changed, 69 insertions(+), 121 deletions(-) create mode 100644 src/agentlab/agents/visual_agent/agent_configs.py diff --git a/src/agentlab/agents/visual_agent/agent_configs.py b/src/agentlab/agents/visual_agent/agent_configs.py new file mode 100644 index 00000000..28de02aa --- /dev/null +++ b/src/agentlab/agents/visual_agent/agent_configs.py @@ -0,0 +1,48 @@ +from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + +from .visual_agent import VisualAgentArgs +from .visual_agent_prompts import PromptFlags +import agentlab.agents.dynamic_prompting as dp +import bgym + +# the other flags are ignored for this agent. 
+DEFAULT_OBS_FLAGS = dp.ObsFlags( + use_tabs=True, # will be overridden by the benchmark when set_benchmark is called after initializing the agent + use_error_logs=True, + use_past_error_logs=False, + use_screenshot=True, + use_som=False, + openai_vision_detail="auto", +) + +DEFAULT_ACTION_FLAGS = dp.ActionFlags( + action_set=bgym.HighLevelActionSetArgs(subsets=["coord"]), + long_description=True, + individual_examples=False, +) + + +DEFAULT_PROMPT_FLAGS = PromptFlags( + obs=DEFAULT_OBS_FLAGS, + action=DEFAULT_ACTION_FLAGS, + use_thinking=True, + use_concrete_example=False, + use_abstract_example=True, + enable_chat=False, + extra_instructions=None, +) + +VISUAL_AGENT_4o = VisualAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-2024-05-13"], + flags=DEFAULT_PROMPT_FLAGS, +) + +VISUAL_AGENT_COMPUTER_USE = VisualAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openai/computer-use-preview-2025-03-11"], + flags=DEFAULT_PROMPT_FLAGS, +) + +VISUAL_AGENT_CLAUDE_3_5 = VisualAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"], + flags=DEFAULT_PROMPT_FLAGS, +) diff --git a/src/agentlab/agents/visual_agent/visual_agent.py b/src/agentlab/agents/visual_agent/visual_agent.py index 2c30f05b..8efee11d 100644 --- a/src/agentlab/agents/visual_agent/visual_agent.py +++ b/src/agentlab/agents/visual_agent/visual_agent.py @@ -8,9 +8,7 @@ the agent, including model arguments and flags for various behaviors.
""" -from copy import deepcopy from dataclasses import asdict, dataclass -from warnings import warn import bgym from browsergym.experiments.agent import Agent, AgentInfo @@ -21,44 +19,24 @@ from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry from agentlab.llm.tracking import cost_tracker_decorator -from .visual_agent_prompts import GenericPromptFlags, MainPrompt -from functools import partial +from .visual_agent_prompts import PromptFlags, MainPrompt @dataclass -class ToolAgentFlags: - pass - - -@dataclass -class ToolAgentArgs(AgentArgs): +class VisualAgentArgs(AgentArgs): chat_model_args: BaseModelArgs = None - flags: GenericPromptFlags = None + flags: PromptFlags = None max_retry: int = 4 def __post_init__(self): - try: # some attributes might be temporarily args.CrossProd for hyperparameter generation - self.agent_name = f"GenericAgent-{self.chat_model_args.model_name}".replace("/", "_") + try: # some attributes might be missing temporarily due to args.CrossProd for hyperparameter generation + self.agent_name = f"VisualAgent-{self.chat_model_args.model_name}".replace("/", "_") except AttributeError: pass def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): """Override Some flags based on the benchmark.""" - if benchmark.name.startswith("miniwob"): - self.flags.obs.use_html = True - self.flags.obs.use_tabs = benchmark.is_multi_tab - self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) - - # for backward compatibility with old traces - if self.flags.action.multi_actions is not None: - self.flags.action.action_set.multiaction = self.flags.action.multi_actions - if self.flags.action.is_strict is not None: - self.flags.action.action_set.strict = self.flags.action.is_strict - - # verify if we can remove this - if demo_mode: - self.flags.action.action_set.demo_mode = "all_blue" def set_reproducibility_mode(self): self.chat_model_args.temperature = 0 @@ -70,17 +48,17 @@ def close(self): return 
self.chat_model_args.close_server() def make_agent(self): - return ToolAgent( + return VisualAgent( chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry ) -class ToolAgent(Agent): +class VisualAgent(Agent): def __init__( self, chat_model_args: BaseModelArgs, - flags: GenericPromptFlags, + flags: PromptFlags, max_retry: int = 4, ): @@ -92,7 +70,6 @@ def __init__( self.action_set = self.flags.action.action_set.make_action_set() self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs) - self._check_flag_constancy() self.reset(seed=None) def obs_preprocessor(self, obs: dict) -> dict: @@ -101,34 +78,20 @@ def obs_preprocessor(self, obs: dict) -> dict: @cost_tracker_decorator def get_action(self, obs): - self.obs_history.append(obs) main_prompt = MainPrompt( action_set=self.action_set, - obs_history=self.obs_history, + obs=obs, actions=self.actions, - memories=self.memories, thoughts=self.thoughts, - previous_plan=self.plan, - step=self.plan_step, flags=self.flags, ) - max_prompt_tokens, max_trunc_itr = self._get_maxes() - system_prompt = SystemMessage(dp.SystemPrompt().prompt) - - human_prompt = dp.fit_tokens( - shrinkable=main_prompt, - max_prompt_tokens=max_prompt_tokens, - model_name=self.chat_model_args.model_name, - max_iterations=max_trunc_itr, - additional_prompts=system_prompt, - ) try: # TODO, we would need to further shrink the prompt if the retry # cause it to be too long - chat_messages = Discussion([system_prompt, human_prompt]) + chat_messages = Discussion([system_prompt, main_prompt.prompt]) ans_dict = retry( self.chat_llm, chat_messages, @@ -138,7 +101,7 @@ def get_action(self, obs): ans_dict["busted_retry"] = 0 # inferring the number of retries, TODO: make this less hacky ans_dict["n_retry"] = (len(chat_messages) - 3) / 2 - except ParseError as e: + except ParseError: ans_dict = dict( action=None, n_retry=self.max_retry + 1, @@ -149,10 +112,7 @@ def get_action(self, obs): stats["n_retry"] = ans_dict["n_retry"] 
stats["busted_retry"] = ans_dict["busted_retry"] - self.plan = ans_dict.get("plan", self.plan) - self.plan_step = ans_dict.get("step", self.plan_step) self.actions.append(ans_dict["action"]) - self.memories.append(ans_dict.get("memory", None)) self.thoughts.append(ans_dict.get("think", None)) agent_info = AgentInfo( @@ -165,43 +125,5 @@ def get_action(self, obs): def reset(self, seed=None): self.seed = seed - self.plan = "No plan yet" - self.plan_step = -1 - self.memories = [] self.thoughts = [] self.actions = [] - self.obs_history = [] - - def _check_flag_constancy(self): - flags = self.flags - if flags.obs.use_som: - if not flags.obs.use_screenshot: - warn( - """ -Warning: use_som=True requires use_screenshot=True. Disabling use_som.""" - ) - flags.obs.use_som = False - if flags.obs.use_screenshot: - if not self.chat_model_args.vision_support: - warn( - """ -Warning: use_screenshot is set to True, but the chat model \ -does not support vision. Disabling use_screenshot.""" - ) - flags.obs.use_screenshot = False - return flags - - def _get_maxes(self): - maxes = ( - self.flags.max_prompt_tokens, - self.chat_model_args.max_total_tokens, - self.chat_model_args.max_input_tokens, - ) - maxes = [m for m in maxes if m is not None] - max_prompt_tokens = min(maxes) if maxes else None - max_trunc_itr = ( - self.flags.max_trunc_itr - if self.flags.max_trunc_itr - else 20 # dangerous to change the default value here? 
- ) - return max_prompt_tokens, max_trunc_itr diff --git a/src/agentlab/agents/visual_agent/visual_agent_prompts.py b/src/agentlab/agents/visual_agent/visual_agent_prompts.py index 7010966d..383923f0 100644 --- a/src/agentlab/agents/visual_agent/visual_agent_prompts.py +++ b/src/agentlab/agents/visual_agent/visual_agent_prompts.py @@ -6,6 +6,7 @@ import logging from dataclasses import dataclass +import bgym from browsergym.core.action.base import AbstractActionSet @@ -17,35 +18,15 @@ class PromptFlags(dp.Flags): """ A class to represent various flags used to control features in an application. - - Attributes: - use_criticise (bool): Ask the LLM to first draft and criticise the action before producing it. - use_thinking (bool): Enable a chain of thoughts. - use_concrete_example (bool): Use a concrete example of the answer in the prompt for a generic task. - use_abstract_example (bool): Use an abstract example of the answer in the prompt. - use_hints (bool): Add some human-engineered hints to the prompt. - enable_chat (bool): Enable chat mode, where the agent can interact with the user. - max_prompt_tokens (int): Maximum number of tokens allowed in the prompt. - be_cautious (bool): Instruct the agent to be cautious about its actions. - extra_instructions (Optional[str]): Extra instructions to provide to the agent. - add_missparsed_messages (bool): When retrying, add the missparsed messages to the prompt. - flag_group (Optional[str]): Group of flags used. 
""" - obs: dp.ObsFlags - action: dp.ActionFlags - use_criticise: bool = False # - use_thinking: bool = False - use_concrete_example: bool = True - use_abstract_example: bool = False - use_hints: bool = False + obs: dp.ObsFlags = None + action: dp.ActionFlags = None + use_thinking: bool = True + use_concrete_example: bool = False + use_abstract_example: bool = True enable_chat: bool = False - max_prompt_tokens: int = None - be_cautious: bool = True extra_instructions: str | None = None - add_missparsed_messages: bool = True - max_trunc_itr: int = 20 - flag_group: str = None class SystemPrompt(dp.PromptElement): @@ -77,7 +58,7 @@ class History(dp.PromptElement): Format the actions and thoughts of previous steps.""" def __init__(self, actions, thoughts) -> None: - + super().__init__() prompt_elements = [] for i, (action, thought) in enumerate(zip(actions, thoughts)): prompt_elements.append( @@ -121,7 +102,7 @@ def __init__(self, obs, flags: dp.ObsFlags) -> None: def _prompt(self) -> str: return f""" # Observation of current step: -{self.tabs.prompt}{self.focused_element.prompt}{self.error.prompt} +{self.tabs.prompt}{self.error.prompt} """ @@ -152,12 +133,9 @@ def __init__( ) -> None: super().__init__() self.flags = flags - self.history = History(obs, actions, thoughts) + self.history = History(actions, thoughts) self.instructions = make_instructions(obs, flags.enable_chat, flags.extra_instructions) - self.obs = dp.Observation( - obs, - self.flags.obs, - ) + self.obs = Observation(obs, self.flags.obs) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) self.think = dp.Think(visible=lambda: flags.use_thinking) From ba8c91ed66ee8c2cd3077009100a36fcbc36e9c1 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 12 Apr 2025 10:20:58 -0400 Subject: [PATCH 3/4] less filtering --- src/agentlab/experiments/list_openai_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/list_openai_models.py 
b/src/agentlab/experiments/list_openai_models.py index 0c301926..9314e7ef 100644 --- a/src/agentlab/experiments/list_openai_models.py +++ b/src/agentlab/experiments/list_openai_models.py @@ -6,10 +6,12 @@ df = pd.DataFrame([dict(model) for model in models.data]) # Filter GPT models or o1 models - df = df[df["id"].str.contains("gpt") | df["id"].str.contains("o1")] + # df = df[df["id"].str.contains("gpt") | df["id"].str.contains("o1")] # Convert Unix timestamps to dates (YYYY-MM-DD) and remove time df["created"] = pd.to_datetime(df["created"], unit="s").dt.date df.sort_values(by="created", inplace=True) # Print all entries - print(df) + + # print all entries + print(df.to_string(index=False)) From a5a8ef4e58ef9407b0e0845bc9cdd2bf598b49ff Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 15 Apr 2025 15:55:51 -0400 Subject: [PATCH 4/4] Remove unused VisualAgentArgs for computer use from agent_configs.py --- src/agentlab/agents/visual_agent/agent_configs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/agentlab/agents/visual_agent/agent_configs.py b/src/agentlab/agents/visual_agent/agent_configs.py index 28de02aa..404afaec 100644 --- a/src/agentlab/agents/visual_agent/agent_configs.py +++ b/src/agentlab/agents/visual_agent/agent_configs.py @@ -37,10 +37,6 @@ flags=DEFAULT_PROMPT_FLAGS, ) -VISUAL_AGENT_COMPUTER_USE = VisualAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["openai/computer-use-preview-2025-03-11"], - flags=DEFAULT_PROMPT_FLAGS, -) VISUAL_AGENT_CLAUDE_3_5 = VisualAgentArgs( chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],