diff --git a/.gitignore b/.gitignore
index c878e3d1..fc61d50e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,5 +174,4 @@ miniwob-plusplus/
 debugging_results/
 
 # working files
-main_miniwob_debug.py
-main_workarena_debug.py
+experiments/*
\ No newline at end of file
diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py
index ad9b0348..cb5bbb7f 100644
--- a/src/agentlab/agents/generic_agent/__init__.py
+++ b/src/agentlab/agents/generic_agent/__init__.py
@@ -9,20 +9,23 @@
 from .agent_configs import (
     AGENT_3_5,
     AGENT_8B,
+    AGENT_37_SONNET,
+    AGENT_CLAUDE_SONNET_35,
+    AGENT_CLAUDE_SONNET_35_VISION,
     AGENT_CUSTOM,
-    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA3_70B,
+    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA31_70B,
+    CHAT_MODEL_ARGS_DICT,
     RANDOM_SEARCH_AGENT,
     AGENT_4o,
     AGENT_4o_MINI,
-    AGENT_CLAUDE_SONNET_35,
-    AGENT_37_SONNET,
-    AGENT_CLAUDE_SONNET_35_VISION,
-    AGENT_4o_VISION,
     AGENT_4o_MINI_VISION,
-    AGENT_o3_MINI,
+    AGENT_4o_VISION,
     AGENT_o1_MINI,
+    AGENT_o3_MINI,
+    FLAGS_GPT_4o,
+    GenericAgentArgs,
 )
 
 __all__ = [
diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py
index b03b1169..935fea14 100644
--- a/src/agentlab/agents/tool_use_agent/__init__.py
+++ b/src/agentlab/agents/tool_use_agent/__init__.py
@@ -1,4 +1,6 @@
 import sys
 
+from agentlab.agents.tool_use_agent.tool_use_agent import *
+
 # for backward compatibility of unpickling
 sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__]
diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 86140d02..28b97e82 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -147,7 +147,7 @@ def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
 
 AXTREE_NOTE = """
 AXTree extracts most of the interactive elements of the DOM in a tree structure.
 It may also contain information that is not visible in the screenshot.
-A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools.
+A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools, e.g., click(bid="a253"). Make sure to include both the letters and the numbers of the bid.
""" @@ -347,7 +347,7 @@ class PromptConfig: task_hint: TaskHint = None keep_last_n_obs: int = 1 multiaction: bool = False - action_subsets: tuple[str] = field(default_factory=lambda: ("coord",)) + action_subsets: tuple[str] = None @dataclass @@ -498,6 +498,15 @@ def get_action(self, obs: Any) -> float: vision_support=True, ) +GPT_4_1_MINI = OpenAIResponseModelArgs( + model_name="gpt-4.1-mini", + max_total_tokens=200_000, + max_input_tokens=200_000, + max_new_tokens=2_000, + temperature=0.1, + vision_support=True, +) + OPENAI_CHATAPI_MODEL_CONFIG = OpenAIChatModelArgs( model_name="gpt-4o-2024-08-06", max_total_tokens=200_000, diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index f56d607c..ea5371b9 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -601,7 +601,13 @@ def get_screenshot( if annotate: action_str = step_info.action properties = step_info.obs.get("extra_element_properties", None) - action_colored = annotate_action(img, action_string=action_str, properties=properties) + try: + action_colored = annotate_action( + img, action_string=action_str, properties=properties + ) + except Exception as e: + warning(f"Failed to annotate action: {e}") + action_colored = action_str else: action_colored = None return img, action_colored diff --git a/src/agentlab/analyze/archive_studies.py b/src/agentlab/analyze/archive_studies.py new file mode 100644 index 00000000..e82934b4 --- /dev/null +++ b/src/agentlab/analyze/archive_studies.py @@ -0,0 +1,122 @@ +import os +from dataclasses import dataclass +from pathlib import Path + +import pandas as pd +from tqdm import tqdm + +from agentlab.analyze import inspect_results +from agentlab.experiments.exp_utils import RESULTS_DIR +from agentlab.experiments.study import Study + + +@dataclass +class StudyInfo: + study_dir: Path + study: Study + summary_df: pd.DataFrame + should_delete: bool = False + reason: str = "" + + +def search_for_reasons_to_archive(result_dir: Path, min_study_size: int = 0) -> list[StudyInfo]: + + study_info_list = [] + study_dirs = list(result_dir.iterdir()) + progress = tqdm(study_dirs, desc="Processing studies") + for study_dir in progress: + + progress.set_postfix({"study_dir": study_dir}) + if not study_dir.is_dir(): + progress.set_postfix({"status": "skipped"}) + continue + + try: + study = Study.load(study_dir) + except Exception: + study = None + # get summary*.csv files and find the most recent + summary_files = list(study_dir.glob("summary*.csv")) + + if len(summary_files) != 0: + most_recent_summary = max(summary_files, key=os.path.getctime) + summary_df = pd.read_csv(most_recent_summary) + + else: + try: + result_df = inspect_results.load_result_df(study_dir, progress_fn=None) + summary_df = inspect_results.summarize_study(result_df) + except Exception as e: + print(f" Error processing {study_dir}: {e}") + continue + + study_info = StudyInfo( + study_dir=study_dir, + study=study, + summary_df=summary_df, + ) + + if len(study_info.summary_df) == 0: + study_info.should_delete = True + study_info.reason = "Empty summary DataFrame" + + n_completed, n_total, n_err = 0, 0, 0 + + for _, row in study_info.summary_df.iterrows(): + n_comp, n_tot = row["n_completed"].split("/") + n_completed += int(n_comp) + n_total += int(n_tot) + n_err += int(row.get("n_err")) + + n_finished = n_completed - n_err + + # print(summary_df) + # print(f" {n_completed} / {n_total}, {n_err} errors") + + if "miniwob-tiny-test" in study_dir.name: + study_info.should_delete = True + 
study_info.reason += "Miniwob tiny test\n" + if n_total == 0: + study_info.should_delete = True + study_info.reason += "No tasks\n" + if n_completed == 0: + study_info.should_delete = True + study_info.reason += "No tasks completed\n" + if float(n_finished) / float(n_total) < 0.5: + study_info.should_delete = True + study_info.reason += f"Less than 50% tasks finished, n_err: {n_err}, n_total: {n_total}, n_finished: {n_finished}, n_completed: {n_completed}\n" + + if n_total <= min_study_size: + study_info.should_delete = True + study_info.reason += ( + f"Too few tasks. n_total ({n_total}) <= min_study_size ({min_study_size})\n" + ) + + study_info_list.append(study_info) + return study_info_list + + +if __name__ == "__main__": + study_list_info = search_for_reasons_to_archive(RESULTS_DIR, min_study_size=5) + archive_dir = RESULTS_DIR.parent / "archived_agentlab_results" # type: Path + archive_dir.mkdir(parents=True, exist_ok=True) + + # Uncomment the line below to prevent moving studies to archive + archive_dir = None + + for study_info in study_list_info: + if not study_info.should_delete: + continue + + print(f"Study: {study_info.study_dir.name}") + print(f" Reason: {study_info.reason}") + print(study_info.summary_df) + print() + + if archive_dir is not None: + # move to new dir + new_path = archive_dir / study_info.study_dir.name + study_info.study_dir.rename(new_path) + # save reason in a file + reason_file = new_path / "reason_to_archive.txt" + reason_file.write_text(study_info.reason) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index b8ba5b59..f047f866 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -1,7 +1,3 @@ -# import os - -# # Disable Ray log deduplication -# os.environ["RAY_DEDUP_LOGS"] = "0" import logging import time @@ -90,12 +86,22 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter def get_elapsed_time(task_ref: ray.ObjectRef): - task_id = task_ref.task_id().hex() - task_info = state.get_task(task_id, address="auto") - if task_info and task_info.start_time_ms is not None: - start_time_s = task_info.start_time_ms / 1000.0 # Convert ms to s + try: + task_id = task_ref.task_id().hex() + task_info = state.get_task(task_id, address="auto") + if not task_info: + return None + if not isinstance(task_info, list): + task_info = [task_info] + + start_times_ms = [getattr(t, "start_time_ms", None) for t in task_info] + start_time_s = max([t / 1000.0 if t is not None else -1 for t in start_times_ms]) + if start_time_s < 0: + return None # Task has not started yet + current_time_s = time.time() elapsed_time = current_time_s - start_time_s return elapsed_time - else: - return None # Task has not started yet + except Exception as e: + logger.warning(f"Could not get elapsed time for task {task_id}: {e}") + return None diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index ac77e01f..7b5b280f 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -25,7 +25,11 @@ from PIL import Image from tqdm import tqdm -from agentlab.agents.tapeagent import TapeAgent, save_tape +try: + from agentlab.agents.tapeagent import TapeAgent, save_tape +except ImportError: + TapeAgent = None + logger = logging.getLogger(__name__) @@ -474,7 +478,7 @@ def run(self): err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}" 
logger.info("Saving experiment info.") self.save_summary_info(episode_info, Path(self.exp_dir), err_msg, stack_trace) - if isinstance(agent, TapeAgent): + if TapeAgent is not None and isinstance(agent, TapeAgent): task = getattr(env, "task", {}) save_tape(self.exp_dir, episode_info, task, agent.final_tape) except Exception as e: diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 371229b5..3285b877 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -6,6 +6,7 @@ from functools import partial from typing import Optional +import anthropic import openai from huggingface_hub import InferenceClient from openai import AzureOpenAI, OpenAI @@ -471,3 +472,77 @@ def __init__( client_args={"base_url": "http://0.0.0.0:8000/v1"}, pricing_func=None, ) + + +class AnthropicChatModel(AbstractChatModel): + def __init__( + self, + model_name, + api_key=None, + temperature=0.5, + max_tokens=100, + max_retry=4, + log_probs=False, + ): + self.model_name = model_name + self.temperature = temperature + self.max_tokens = max_tokens + self.max_retry = max_retry + self.log_probs = log_probs + + api_key = api_key or os.getenv("ANTHROPIC_API_KEY") + self.client = anthropic.Anthropic(api_key=api_key) + + def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict: + # Convert OpenAI format to Anthropic format + system_message = None + anthropic_messages = [] + + for msg in messages: + if msg["role"] == "system": + system_message = msg["content"] + else: + anthropic_messages.append({"role": msg["role"], "content": msg["content"]}) + + temperature = temperature if temperature is not None else self.temperature + + for attempt in range(self.max_retry): + try: + kwargs = { + "model": self.model_name, + "messages": anthropic_messages, + "max_tokens": self.max_tokens, + "temperature": temperature, + } + + if system_message: + kwargs["system"] = system_message + + response = self.client.messages.create(**kwargs) + + # Track usage if available + if hasattr(tracking.TRACKER, "instance"): + tracking.TRACKER.instance( + response.usage.input_tokens, + response.usage.output_tokens, + 0, # cost calculation would need pricing info + ) + + return AIMessage(response.content[0].text) + + except Exception as e: + if attempt == self.max_retry - 1: + raise e + logging.warning(f"Anthropic API error (attempt {attempt + 1}): {e}") + time.sleep(60) # Simple retry delay + + +@dataclass +class AnthropicModelArgs(BaseModelArgs): + def make_model(self): + return AnthropicChatModel( + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_new_tokens, + log_probs=self.log_probs, + ) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 5125801d..2570d0cd 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -17,6 +17,20 @@ ] CHAT_MODEL_ARGS_DICT = { + "openai/gpt-4.1-mini-2025-04-14": OpenAIModelArgs( + model_name="gpt-4.1-mini-2025-04-14", + max_total_tokens=128_000, + max_input_tokens=128_000, + max_new_tokens=16_384, + vision_support=True, + ), + "openai/gpt-4.1-2025-04-14": OpenAIModelArgs( + model_name="gpt-4.1-2025-04-14", + max_total_tokens=128_000, + max_input_tokens=128_000, + max_new_tokens=16_384, + vision_support=True, + ), "openai/o3-mini-2025-01-31": OpenAIModelArgs( model_name="o3-mini-2025-01-31", max_total_tokens=200_000, diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 2b9f696c..06f346a2 100644 --- 
+++ b/src/agentlab/llm/response_api.py
@@ -360,14 +360,7 @@ def _parse_response(self, response: dict) -> dict:
         interesting_keys = ["output_text"]
         for output in response.output:
             if output.type == "function_call":
-                arguments = json.loads(output.arguments)
-                func_args_str = ", ".join(
-                    [
-                        f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}"
-                        for k, v in arguments.items()
-                    ]
-                )
-                result.action = f"{output.name}({func_args_str})"
+                result.action = tool_call_to_python_code(output.name, json.loads(output.arguments))
                 result.tool_calls = output
                 break
             elif output.type == "reasoning":
@@ -603,13 +596,7 @@ def _parse_response(self, response: dict) -> dict:
         )
         for output in response.content:
             if output.type == "tool_use":
-                func_args_str = ", ".join(
-                    [
-                        f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}"
-                        for k, v in output.input.items()
-                    ]
-                )
-                result.action = f"{output.name}({func_args_str})"
+                result.action = tool_call_to_python_code(output.name, output.input)
             elif output.type == "text":
                 result.think += output.text
         return result
@@ -736,3 +723,15 @@ def make_model(self, extra_kwargs=None, **kwargs):
 
     def get_message_builder(self) -> MessageBuilder:
         return OpenAIChatCompletionAPIMessageBuilder
+
+
+def tool_call_to_python_code(func_name, kwargs):
+    """Format a function name and kwargs dict into a Python function call string."""
+    if kwargs is None:
+        kwargs = {}
+
+    if not kwargs:
+        return f"{func_name}()"
+
+    args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items())
+    return f"{func_name}({args_str})"
diff --git a/tests/llm/test_chat_api.py b/tests/llm/test_chat_api.py
index f06fa7fa..4f74c56a 100644
--- a/tests/llm/test_chat_api.py
+++ b/tests/llm/test_chat_api.py
@@ -3,6 +3,7 @@
 import pytest
 
 from agentlab.llm.chat_api import (
+    AnthropicModelArgs,
     AzureModelArgs,
     OpenAIModelArgs,
     make_system_message,
@@ -59,3 +60,27 @@ def test_api_model_args_openai():
     answer = model(messages)
 
     assert "5" in answer.get("content")
+
+
+@pytest.mark.pricy
+@pytest.mark.skipif(skip_tests, reason="Skipping on remote as Anthropic is pricy")
+def test_api_model_args_anthropic():
+    model_args = AnthropicModelArgs(
+        model_name="claude-3-haiku-20240307",
+        max_total_tokens=8192,
+        max_input_tokens=8192 - 512,
+        max_new_tokens=512,
+        temperature=1e-1,
+    )
+    model = model_args.make_model()
+
+    messages = [
+        make_system_message("You are a helpful virtual assistant"),
+        make_user_message("Give the third prime number. Just the number, no explanation."),
+    ]
+    answer = model(messages)
+    assert "5" in answer.get("content")
+
+
+if __name__ == "__main__":
+    test_api_model_args_anthropic()
diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py
index 16316a92..567b49da 100644
--- a/tests/llm/test_response_api.py
+++ b/tests/llm/test_response_api.py
@@ -299,7 +299,7 @@ def test_claude_response_model_parse_and_cost():
         content for content in parsed_output.raw_response.content if content.type == "tool_use"
     ]
     assert "Thinking about the request." in parsed_output.think
-    assert parsed_output.action == 'search_web(query="latest news")'
+    assert parsed_output.action == "search_web(query='latest news')"
     assert fn_calls[0].id == "tool_abc"
     assert global_tracker.stats["input_tokens"] == 40
     assert global_tracker.stats["output_tokens"] == 20
@@ -348,7 +348,7 @@ def test_openai_response_model_parse_and_cost():
     fn_calls = [
         content for content in parsed_output.raw_response.output if content.type == "function_call"
     ]
-    assert parsed_output.action == 'get_current_weather(location="Boston, MA", unit="celsius")'
+    assert parsed_output.action == "get_current_weather(location='Boston, MA', unit='celsius')"
     assert fn_calls[0].call_id == "call_abc123"
     assert parsed_output.raw_response == mock_api_resp
    assert global_tracker.stats["input_tokens"] == 70
@@ -716,3 +716,53 @@ def test_claude_model_with_multiple_messages_pricy_call():
 # TODO: Add tests for image token costing (this is complex and model-specific)
 # - For OpenAI, you'd need to know how they bill for images (e.g., fixed cost per image + tokens for text parts)
 # - You'd likely need to mock the response from client.chat.completions.create to include specific usage for images.
+
+
+EDGE_CASES = [
+    # 1. Empty kwargs dict
+    ("valid_function", {}, "valid_function()"),
+    # 2. Kwargs with problematic string values (quotes, escapes, unicode)
+    (
+        "send_message",
+        {
+            "text": 'He said "Hello!" and used a backslash: \\',
+            "unicode": "Café naïve résumé 🚀",
+            "newlines": "Line1\nLine2\tTabbed",
+        },
+        "send_message(text='He said \"Hello!\" and used a backslash: \\\\', unicode='Café naïve résumé 🚀', newlines='Line1\\nLine2\\tTabbed')",
+    ),
+    # 3. Mixed types including problematic float values
+    (
+        "complex_call",
+        {
+            "infinity": float("inf"),
+            "nan": float("nan"),
+            "negative_zero": -0.0,
+            "scientific": 1.23e-45,
+        },
+        "complex_call(infinity=inf, nan=nan, negative_zero=-0.0, scientific=1.23e-45)",
+    ),
+    # 4. Deeply nested structures that could stress repr()
+    (
+        "process_data",
+        {
+            "nested": {"level1": {"level2": {"level3": [1, 2, {"deep": True}]}}},
+            "circular_ref_like": {"a": {"b": {"c": "back_to_start"}}},
+        },
+        "process_data(nested={'level1': {'level2': {'level3': [1, 2, {'deep': True}]}}}, circular_ref_like={'a': {'b': {'c': 'back_to_start'}}})",
+    ),
+]
+
+
+def test_tool_call_to_python_code():
+    from agentlab.llm.response_api import tool_call_to_python_code
+
+    for edge_case in EDGE_CASES:
+        func_name, kwargs, expected = edge_case
+        result = tool_call_to_python_code(func_name, kwargs)
+        print(result)
+        assert result == expected, f"Expected {expected} but got {result}"
+
+
+if __name__ == "__main__":
+    test_tool_call_to_python_code()
diff --git a/tests/verify_rate_limit_anthropic.py b/tests/verify_rate_limit_anthropic.py
new file mode 100644
index 00000000..5ff2d91d
--- /dev/null
+++ b/tests/verify_rate_limit_anthropic.py
@@ -0,0 +1,89 @@
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import anthropic
+
+client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+
+
+def make_request(messages):
+    response = client.messages.create(
+        model="claude-3-5-sonnet-20241022", max_tokens=10, messages=messages
+    )
+    return response.usage
+
+
+def make_message(text):
+    return {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": text,
+            }
+        ],
+    }
+
+
+def add_cache_control(message: dict, cache_type="ephemeral"):
+    message["content"][0]["cache_control"] = {"type": cache_type}
+
+
+def remove_cache_control(message: dict):
+    if "cache_control" in message["content"][0]:
+        del message["content"][0]["cache_control"]
+
+
+def test_rate_limit_single(thread_id):
+    # Create a ~100k token message that will be cached
+    big_text = "This is a large block of text for caching. " * 10000  # ~100k tokens
+    medium_text = "This is a large block of text for caching. " * 2000  # ~20k tokens
+
+    print(f"Thread {thread_id}: Starting rate limit test with cached content...")
+
+    # Rebuild conversation each time (simulating web agent)
+    messages = []
+
+    # Add all previous conversation turns
+    for i in range(5):
+        if i == 0:
+            messages.append(make_message(big_text))
+            t0 = time.time()
+        else:
+            messages.append(make_message(medium_text))
+        add_cache_control(messages[-1])
+        try:
+            usage = make_request(messages)
+            dt = time.time() - t0
+            print(f"{dt:.2f}: Thread {thread_id}: {usage}")
+        except Exception as e:
+            print(f"Thread {thread_id}: Error - {e}")
+            break
+        remove_cache_control(messages[-1])
+
+
+def test_rate_limit_parallel(num_threads=3):
+    print(f"Starting parallel rate limit test with {num_threads} threads...")
+
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        futures = [executor.submit(test_rate_limit_single, i) for i in range(num_threads)]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Thread completed with error: {e}")
+
+
+def test_rate_limit():
+    # Original single-threaded version
+    test_rate_limit_single(0)
+
+
+if __name__ == "__main__":
+    # Use parallel version to quickly exhaust rate limits
+    test_rate_limit_parallel(num_threads=3)
+
+    # Or use original single-threaded version
+    # test_rate_limit()
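
Usage notes (not part of the patch):

The new tool_call_to_python_code helper formats keyword arguments with repr(), so quoting and escaping survive the round trip from a parsed tool call back to an executable action string. A minimal sketch of the expected behavior, mirroring the assertions in tests/llm/test_response_api.py (the tool names here are arbitrary placeholders):

    from agentlab.llm.response_api import tool_call_to_python_code

    # repr() picks the quote style, so string values containing double quotes stay valid Python
    assert tool_call_to_python_code("click", {"bid": "a253"}) == "click(bid='a253')"
    # None or an empty dict both produce a zero-argument call
    assert tool_call_to_python_code("noop", None) == "noop()"
    # non-string values are rendered with their repr() as well
    assert tool_call_to_python_code("scroll", {"dx": 0, "dy": 200}) == "scroll(dx=0, dy=200)"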
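The new AnthropicChatModel is constructed through AnthropicModelArgs, following the same BaseModelArgs pattern as the existing OpenAI model args. A sketch of the intended call flow, based on test_api_model_args_anthropic above (assumes ANTHROPIC_API_KEY is set in the environment):

    from agentlab.llm.chat_api import AnthropicModelArgs, make_system_message, make_user_message

    model_args = AnthropicModelArgs(
        model_name="claude-3-haiku-20240307",
        max_total_tokens=8192,
        max_input_tokens=8192 - 512,
        max_new_tokens=512,
        temperature=0.1,
    )
    model = model_args.make_model()  # builds an AnthropicChatModel

    messages = [
        make_system_message("You are a helpful virtual assistant"),
        make_user_message("Give the third prime number."),
    ]
    answer = model(messages)  # AIMessage; the text is in answer.get("content")

Note that the system message is passed through Anthropic's dedicated system parameter rather than as a chat message, and failed calls are retried up to max_retry times with a fixed 60 s delay.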