From c9852ecf548db35504b4a1b4e3932a98de732a51 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:57:09 -0400 Subject: [PATCH 01/21] rename to trace-recorder to hilt_agent --- .../hilt_agent/base_multi_candidate_agent.py | 51 +++ .../hilt_agent/generic_human_guided_agent.py | 378 ++++++++++++++++ src/agentlab/agents/hilt_agent/hilt_agent.py | 209 +++++++++ .../agents/hilt_agent/hint_labelling.py | 153 +++++++ .../hint_labeling_ui.html | 419 ++++++++++++++++++ .../multi_candidate_generic_agent.py | 216 +++++++++ 6 files changed, 1426 insertions(+) create mode 100644 src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py create mode 100644 src/agentlab/agents/hilt_agent/generic_human_guided_agent.py create mode 100644 src/agentlab/agents/hilt_agent/hilt_agent.py create mode 100644 src/agentlab/agents/hilt_agent/hint_labelling.py create mode 100644 src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html create mode 100644 src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py new file mode 100644 index 00000000..6cd4624a --- /dev/null +++ b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py @@ -0,0 +1,51 @@ +from typing_extensions import Protocol +from agentlab.agents.agent_args import AgentArgs + + +class MultiCandidateAgent(Protocol): + """ + Protocol for agents that generate multiple candidates for get_action. + + This protocol defines the contract for agents that can generate + multiple candidate actions and allow selection of one of them for execution. + """ + + def get_candidate_generations( + self, obs: dict, hint: list[str] | None = None, n_candidates: int = 3 + ) -> list[dict]: + """ + Generate multiple candidate actions for the given observation. + You can pass extra info in agent_info to update internal state of the + agent based on the selected candidate. Your internal state management + should be robust to multiple calls to the get_candidate_generations method + in a single step. + + Args: + obs: The current observation dictionary containing environment state + hint: Optional list of hint strings to guide candidate generation + n_candidates: Number of candidate actions to generate + + Returns: + List of dictionaries, each containing: + - 'action': The candidate action to be executed + - 'agent_info': Additional information about the action generation + """ + ... + + def update_agent_state_from_selected_candidate(self, output: dict): + """ + Update the agent's internal state based on the selected candidate. + This can include any memory or planning updates. + + """ + ... + + +class MultiCandidateAgentArgs(AgentArgs): + def make_agent(self) -> MultiCandidateAgent: ... + + def __post_init__(self): + """Prefix subagent name with 'MC-'.""" + super().__post_init__() + if hasattr(self, 'agent_name') and self.agent_name: + self.agent_name = "MC-" + self.agent_name diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py new file mode 100644 index 00000000..220ca0df --- /dev/null +++ b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py @@ -0,0 +1,378 @@ +import base64 +import copy +import io +import re +from dataclasses import Field, asdict, dataclass +from typing import Dict, List + +import bgym +import numpy as np +from PIL import Image + +from agentlab.agents import dynamic_prompting as dp +from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs +from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt +from agentlab.agents.hilt_agent.hint_labelling import ( + HintLabeling, + HintLabelingInputs, +) +from agentlab.analyze import overlay_utils +from agentlab.llm.llm_utils import ( + Discussion, + HumanMessage, + SystemMessage, +) +from agentlab.llm.tracking import cost_tracker_decorator +from browsergym.experiments.agent import AgentInfo + + +class CandidatesGeneration(dp.PromptElement): + # Ask for multiple alternatives; each candidate must contain and . + def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None: + self.hint = hint + self.n_candidates = n_candidates + self.hint_prompt = "\n".join(f"{i}. {c}" for i, c in enumerate(hint, 1)) if hint else "" + super().__init__(True) + self._prompt = [ + dict( + type="text", + text=f""" + You are a web agent. Propose {self.n_candidates} alternative next steps for the current page. + {('Use the Hints:' + self.hint_prompt) if self.hint else ""}\n + Return EACH candidate wrapped as numbered tags: + ... + ... + + Inside every candidate you MUST include: + ...why this action is appropriate now... + ...ONE atomic, executable action string... + + Do not include any extra text outside the candidate tags. + Use this format: + + Explain why Candidate One is chosen + Candidate One Action + + + + Explain why Candidate Two is chosen + Candidate Two Action + + # Example + + The login button is visible and proceeding will reveal the auth form. + click(role="button", name="Log in") + + + + User might need to enter email first; the email field is focused and visible. + fill(bid="a112", text="user@example.com") + + """, + ) + ] + + # Regex patterns for numbered candidates only + _NUM_BLOCK = re.compile( + r"<\s*candidate[_ ]generation[_ ](?P[0-9]+)\s*>(?P.*?)<\s*/\s*candidate[_ ]generation[_ ](?P=idx)\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + _THINK_PATTERN = re.compile( + r"<\s*think\s*>(?P.*?)<\s*/\s*think\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + _ACTION_PATTERN = re.compile( + r"<\s*action\s*>(?P.*?)<\s*/\s*action\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + + def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]: + """ + Extract up to n_candidates candidates, using numbered tags only. + + Returns: + { + "candidate_generation_1": {"think": "...", "action": "..."}, + "candidate_generation_2": {"think": "...", "action": "..."}, + ... + } + """ + result = { + f"candidate_generation_{i+1}": {"think": "", "action": ""} + for i in range(self.n_candidates) + } + + if not isinstance(text_answer, str): + return result + + matches: List[re.Match] = list(self._NUM_BLOCK.finditer(text_answer)) + # Sort by numeric index + matches_sorted = sorted(matches, key=lambda m: int(m.group("idx"))) + for i, m in enumerate(matches_sorted[: self.n_candidates]): + body = m.group("body").strip() + think_m = self._THINK_PATTERN.search(body) + action_m = self._ACTION_PATTERN.search(body) + result[f"candidate_generation_{i+1}"] = { + "think": (think_m.group("think").strip() if think_m else ""), + "action": (action_m.group("action").strip() if action_m else ""), + } + + return result + + +def overlay_action(obs, action): + """Overlays actions on screenshot in-place""" + act_img = copy.deepcopy(obs["screenshot"]) + act_img = Image.fromarray(act_img) + overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) + return img_to_base_64(act_img) + + +def img_to_base_64(image: Image.Image | np.ndarray) -> str: + """Converts a PIL Image or NumPy array to a base64-encoded string.""" + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") + return b64_str + + +@dataclass +class MultipleProposalGenericAgentArgs(GenericAgentArgs): + + def make_agent(self): + return MultipleProposalGenericAgent( + chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry + ) + + def __post_init__(self): + """Prefix subagent name with 'HILT-'.""" + super().__post_init__() + if hasattr(self, 'agent_name') and self.agent_name: + self.agent_name = "HILT-" + self.agent_name + + +class MultipleProposalGenericAgent(GenericAgent): + + def __init__( + self, + chat_model_args, + flags, + max_retry: int = 4, + ): + super().__init__(chat_model_args, flags, max_retry) + self.ui = None # Single HintLabeling instance + + def get_candidate_generation( + self, + sys_prompt: SystemMessage, + human_prompt: HumanMessage, + hint: list[str] | None = None, + n_candidates=3, + ) -> tuple[Dict[str, Dict[str, str]], Discussion]: + + cg = CandidatesGeneration(hint=hint, n_candidates=n_candidates) + candidates_prompt = HumanMessage(cg.prompt) + chat_messages = Discussion([sys_prompt, human_prompt, candidates_prompt]) + output = self.chat_llm(chat_messages) + candidates = cg._parse_answer(output["content"]) + self.step_n_human_intervention_rounds += 1 + msg_to_add_to_xray = Discussion([sys_prompt, human_prompt]) + + return candidates, msg_to_add_to_xray + + @cost_tracker_decorator + def get_action(self, obs): + # reset vars + step_hint = [] + self.step_n_human_intervention_rounds = 0 + self.obs_history.append(obs) + main_prompt = MainPrompt( + action_set=self.action_set, + obs_history=self.obs_history, + actions=self.actions, + memories=self.memories, + thoughts=self.thoughts, + previous_plan=self.plan, + step=self.plan_step, + flags=self.flags, + ) + + max_prompt_tokens, max_trunc_itr = self._get_maxes() + + system_prompt = SystemMessage(dp.SystemPrompt().prompt) + + human_prompt = dp.fit_tokens( + shrinkable=main_prompt, + max_prompt_tokens=max_prompt_tokens, + model_name=self.chat_model_args.model_name, + max_iterations=max_trunc_itr, + additional_prompts=system_prompt, + ) + # Initialize UI once outside the loop + if self.ui is None: + self.ui = HintLabeling(headless=False) + # Show initial waiting state + initial_inputs = HintLabelingInputs( + goal=( + obs.get("goal_object", [{}])[0].get("text", "") + if obs.get("goal_object") + else "" + ), + error_feedback="", + screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""), + screenshots=[], # no overlay screenshots yet + axtree=obs.get("axtree_txt", ""), + history=[], + hint="", + suggestions=[], # no suggestions yet + ) + self.ui.update_context(initial_inputs) + + # Generate first candidates + candidates, chat_messages = self.get_candidate_generation( + sys_prompt=system_prompt, + human_prompt=human_prompt, + hint=step_hint if step_hint else None, + ) + suggestions = [ + { + "id": key.split("_")[-1], + "action": candidate["action"], + "think": candidate["think"], + } + for key, candidate in candidates.items() + ] + # List of Images as base64 - create overlay screenshots for each suggestion + screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] + + while True: + try: + hint_labeling_inputs = HintLabelingInputs( + goal=( + obs.get("goal_object", [{}])[0].get("text", "") + if obs.get("goal_object") + else "" + ), + error_feedback=obs.get("last_action_error", ""), + screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""), + screenshots=screenshots, # list of overlay screenshots for hover + axtree=obs.get("axtree_txt", ""), + history=[], # TODO: add history + hint=( + "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1)) + if step_hint + else "" + ), + suggestions=suggestions, + ) + + self.ui.update_context(hint_labeling_inputs) + response = self.ui.wait_for_response(timeout=300) + + if response["type"] == "reprompt": + hint = response["payload"]["hint"] + step_hint.append(hint) + candidates, chat_messages = self.get_candidate_generation( + sys_prompt=system_prompt, + human_prompt=human_prompt, + hint=step_hint if step_hint else None, + ) + suggestions = [ + { + "id": key.split("_")[-1], + "action": candidate["action"], + "think": candidate["think"], + } + for key, candidate in candidates.items() + ] + # Regenerate screenshots for new suggestions + screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] + # Continue the loop to show new suggestions + elif response["type"] == "step": + selected_action = response["payload"]["action"] + choice_idx = None + for i, candidate in enumerate(suggestions, 1): + if candidate["action"] == selected_action: + choice_idx = i + break + if choice_idx is None: + choice_idx = 1 + ans_dict = candidates[f"candidate_generation_{choice_idx}"] + break + else: + ans_dict = candidates["candidate_generation_1"] + break + + except KeyboardInterrupt: + print("User cancelled the operation") + if self.ui: + self.ui.close() + raise + except Exception as e: + print(f"Error in human intervention UI: {e}") + if self.ui: + self.ui.close() + self.ui = None + # Raise exception instead of falling back to console input + raise RuntimeError(f"Human intervention UI failed: {e}") from e + + # TODO: Refactor as discussed with ALAC. + stats = self.chat_llm.get_stats() + self.plan = ans_dict.get("plan", self.plan) + self.plan_step = ans_dict.get("step", self.plan_step) + self.actions.append(ans_dict["action"]) + self.memories.append(ans_dict.get("memory", None)) + self.thoughts.append(ans_dict.get("think", None)) + agent_info = AgentInfo( + think=ans_dict.get("think", None), + chat_messages=chat_messages, + stats=stats, + extra_info={ + "chat_model_args": asdict(self.chat_model_args), + "step_hints": step_hint, + "n_human_intervention_rounds": self.step_n_human_intervention_rounds, + "candidates": candidates, + "suggestions": suggestions, + }, + ) + return ans_dict["action"], agent_info + + +def get_base_agent(llm_config): + from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + + return MultipleProposalGenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], + flags=BASE_FLAGS, + ) + + +HUMAN_GUIDED_GENERIC_AGENT = get_base_agent("openai/gpt-5-mini-2025-08-07") + +if __name__ == "__main__": + import logging + + from agentlab.agents.hilt_agent.generic_human_guided_agent import ( + HUMAN_GUIDED_GENERIC_AGENT, + ) + from agentlab.experiments.study import Study + + agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] + benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]() + benchmark = benchmark.subset_from_glob("task_name", "*book*") + benchmark.env_args_list = benchmark.env_args_list[2:3] + + for env_args in benchmark.env_args_list: + env_args.max_steps = 100 # max human steps + env_args.headless = False + # env_args.use_chat_ui = False + # env_args.use_hint_labeling_ui = True + + Study(agent_configs, benchmark, logging_level=logging.WARNING).run( + n_jobs=1, + parallel_backend="sequential", + n_relaunch=1, + ) diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py new file mode 100644 index 00000000..6a44489f --- /dev/null +++ b/src/agentlab/agents/hilt_agent/hilt_agent.py @@ -0,0 +1,209 @@ +import base64 +import copy +import io +from dataclasses import dataclass +from typing import Optional + +import bgym +import numpy as np +from PIL import Image + +from agentlab.agents.hilt_agent.hint_labelling import ( + HintLabeling, + HintLabelingInputs, +) +from agentlab.llm.tracking import cost_tracker_decorator +from agentlab.analyze import overlay_utils +from browsergym.experiments.agent import Agent +from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent + +class HumanInTheLoopAgent(Agent): + + def __init__( + self, + subagent_args, # Type: any object with MultiCandidateAgent interface + ): + self.subagent: MultiCandidateAgent = subagent_args.make_agent() + super().__init__() + self.ui = None + + @cost_tracker_decorator + def get_action(self, obs): + # reset vars + step_n_human_intervention_rounds = 0 + step_hint = [] + + # Initialize UI once outside the loop + if self.ui is None: + self.ui = HintLabeling(headless=False) + # Show initial waiting state + initial_inputs = HintLabelingInputs( + goal=( + obs.get("goal_object", [{}])[0].get("text", "") + if obs.get("goal_object") + else "" + ), + error_feedback="", + screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""), + screenshots=[], # no overlay screenshots yet + axtree=obs.get("axtree_txt", ""), + history=[], + hint="", + suggestions=[], # no suggestions yet + ) + self.ui.update_context(initial_inputs) + + # Generate first candidates + candidates = self.subagent.get_candidate_generations(obs, hint=None, n_candidates=3) + step_n_human_intervention_rounds += 1 + suggestions = [{ 'action': c['action'], 'think': c['agent_info'].think} for c in candidates] + # List of Images as base64 - create overlay screenshots for each suggested action + screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] + + while True: + try: + hint_labeling_inputs = HintLabelingInputs( + goal=( + obs.get("goal_object", [{}])[0].get("text", "") + if obs.get("goal_object") + else "" + ), + error_feedback=obs.get("last_action_error", ""), + screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""), + screenshots=screenshots, # list of overlay screenshots for hover + axtree=obs.get("axtree_txt", ""), + history=[], # TODO: add history + hint=( + "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1)) + if step_hint + else "" + ), + suggestions=suggestions, + ) + + self.ui.update_context(hint_labeling_inputs) + response = self.ui.wait_for_response(timeout=300) + + if response["type"] == "reprompt": + hint = response["payload"]["hint"] + step_hint.append(hint) + candidates = self.subagent.get_candidate_generations( + obs, + hint=step_hint if step_hint else None, + n_candidates=3 + ) + suggestions = [{'action': c['action'], 'think': c['agent_info'].think} for c in candidates] + screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] + + elif response["type"] == "step": + selected_action = response["payload"]["action"] + choice_idx = None + for i, candidate in enumerate(suggestions): + if candidate["action"] == selected_action: + choice_idx = i + break + selected_candidate = candidates[choice_idx] + self.subagent.update_agent_state_from_selected_candidate(selected_candidate) + action = selected_candidate["action"] + agent_info = selected_candidate["agent_info"] + return action, agent_info + + except KeyboardInterrupt: + print("User cancelled the operation") + if self.ui: + self.ui.close() + raise + except Exception as e: + print(f"Error in human intervention UI: {e}") + if self.ui: + self.ui.close() + self.ui = None + # Raise exception instead of falling back to console input + raise RuntimeError(f"Human intervention UI failed: {e}") from e + + +@dataclass +class HumanInTheLoopAgentArgs(AgentArgs): + subagent_args: Optional[AgentArgs] = None # args for the underlying multiple proposal agent + + + def make_agent(self): + assert self.subagent_args is not None + return HumanInTheLoopAgent(subagent_args=self.subagent_args) + + def __post_init__(self): + """Prefix subagent name with 'HILT-'.""" + super().__post_init__() + if self.subagent_args and self.subagent_args.agent_name: + self.agent_name = "HILT-" + self.subagent_args.agent_name + + def set_benchmark(self, benchmark, demo_mode): + """Delegate set_benchmark to the subagent if it has the method.""" + if hasattr(self.subagent_args, 'set_benchmark'): + self.subagent_args.set_benchmark(benchmark, demo_mode) + + def set_reproducibility_mode(self): + """Delegate set_reproducibility_mode to the subagent if it has the method.""" + if hasattr(self.subagent_args, 'set_reproducibility_mode'): + self.subagent_args.set_reproducibility_mode() + + +def overlay_action(obs, action): + """Overlays actions on screenshot in-place""" + act_img = copy.deepcopy(obs["screenshot"]) + act_img = Image.fromarray(act_img) + overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) + return img_to_base_64(act_img) + + +def img_to_base_64(image: Image.Image | np.ndarray) -> str: + """Converts a PIL Image or NumPy array to a base64-encoded string.""" + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") + return b64_str + +def get_base_human_in_the_loop_genericagent(llm_config): + from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs + from agentlab.agents.hilt_agent.multi_candidate_generic_agent import ( + MultiCandidateGenericAgentArgs, + ) + + return HumanInTheLoopAgentArgs( + subagent_args = MultiCandidateGenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], + flags=BASE_FLAGS, + ) + ) + + +HUMAN_GUIDED_GENERIC_AGENT = get_base_human_in_the_loop_genericagent("openai/gpt-5-mini-2025-08-07") + +if __name__ == "__main__": + import logging + + from agentlab.agents.hilt_agent.hilt_agent import ( + HUMAN_GUIDED_GENERIC_AGENT, + ) + from agentlab.experiments.study import Study + + agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] + benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]() + benchmark = benchmark.subset_from_glob("task_name", "*book*") + benchmark.env_args_list = benchmark.env_args_list[2:3] + + for env_args in benchmark.env_args_list: + env_args.max_steps = 100 # max human steps + env_args.headless = False + + + Study(agent_configs, benchmark, logging_level=logging.WARNING).run( + n_jobs=1, + parallel_backend="sequential", + n_relaunch=1, + ) diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hilt_agent/hint_labelling.py new file mode 100644 index 00000000..6e293781 --- /dev/null +++ b/src/agentlab/agents/hilt_agent/hint_labelling.py @@ -0,0 +1,153 @@ +import json +import logging +from importlib import resources +from queue import Queue +from typing import Dict, List, Optional + +import playwright.sync_api +from pydantic import BaseModel, Field + +from agentlab.agents.hilt_agent import hint_labelling_ui_files +from browsergym.core import _get_global_playwright + +logger = logging.getLogger(__name__) + +HINT_LABELING_DIR = resources.files(hint_labelling_ui_files) + + +class HintLabelingInputs(BaseModel): + goal: str + error_feedback: str = "" + screenshot: str # base64 screenshot (original/current) + screenshots: List[str] = Field(default_factory=list) # list of base64 screenshots for hover + axtree: str + history: List[Dict[str, str]] = Field(default_factory=list) + hint: str = "" + suggestions: List[Dict[str, str]] = Field(default_factory=list) + + +class HintLabeling: + def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs): + + pw: playwright.sync_api.Playwright = _get_global_playwright() + self.browser = pw.chromium.launch( + headless=headless, args=[f"--window-size={window_size[0]},{window_size[1]}"] + ) + self.context = self.browser.new_context( + no_viewport=True, + ) + self.page = self.context.new_page() + self._resp_queue: "Queue[dict]" = Queue() + + self.page.route("**/api/reprompt", self._route_reprompt) + self.page.route("**/api/submit", self._route_submit) + self.page.set_content(get_hint_labeling_ui(HINT_LABELING_DIR)) + + # internal state + self._context: HintLabelingInputs = None + self._running = False + + def _route_reprompt( + self, route: playwright.sync_api.Route, request: playwright.sync_api.Request + ): + logger.info("Route hit: %s %s", request.method, request.url) + try: + body = json.loads(request.post_data() or "{}") + except Exception: + body = {} + # enqueue output 1 (reprompt) + msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}} + self._resp_queue.put(msg) + # Respond something minimal so UI doesn’t break; it will be refreshed by a later update_context() + route.fulfill( + status=200, + content_type="application/json", + body=json.dumps({"suggestions": []}), + ) + + def _route_submit(self, route: playwright.sync_api.Route, request: playwright.sync_api.Request): + logger.info("Route hit: %s %s", request.method, request.url) + try: + body = json.loads(request.post_data() or "{}") + except Exception: + body = {} + # Map UI payload -> your step shape + msg = { + "type": "step", + "payload": { + "think": body.get("think", ""), + "action": body.get("action", ""), + }, + } + self._resp_queue.put(msg) + # UI expects 200 JSON; we can optionally send new suggestions here too. + route.fulfill( + status=200, + content_type="application/json", + body=json.dumps({"suggestions": []}), + ) + + def _to_ui_bootstrap(self, ctx: HintLabelingInputs) -> dict: + return { + "goal": ctx.goal, + "error_feedback": ctx.error_feedback, + "screenshot": ctx.screenshot, + "screenshots": ctx.screenshots, # list of screenshots for hover + "axtree": ctx.axtree, + "history": ctx.history, + "hint": ctx.hint, + "suggestions": ctx.suggestions, + } + + def update_context(self, context: HintLabelingInputs): + self._context = context + ui_payload = self._to_ui_bootstrap(context) + # call JS function with arg (no string concat) + self.page.evaluate("(d) => updateContext(d)", ui_payload) + + def wait_for_response(self, timeout: Optional[float] = 600) -> dict: + """ + Wait until the page makes a request to /api/reprompt or /api/submit, + then parse the request body and return it in your schema. + """ + logger.info("Waiting for response from Hint Labeling UI...") + + def is_api(req: playwright.sync_api.Request) -> bool: + u = req.url + return ( + u.endswith("/api/reprompt") or u.endswith("/api/submit") + ) and req.method == "POST" + + # This pumps Playwright internally; no busy waiting. + with self.page.expect_request( + is_api, timeout=(timeout * 1000 if timeout else 0) + ) as req_info: + req = req_info.value + + body_text = req.post_data or "{}" + try: + body = json.loads(body_text) + except Exception as e: + print("JSON parse error:", e) + body = {} + + if req.url.endswith("/api/reprompt"): + msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}} + else: + msg = { + "type": "step", + "payload": {"think": body.get("think", ""), "action": body.get("action", "")}, + } + + logger.info("Response received: %s", msg) + return msg + + def close(self): + self.context.close() + self.browser.close() + + +def get_hint_labeling_ui(hint_labeling_dir) -> str: + with open(hint_labeling_dir / "hint_labeling_ui.html", "r") as file: + hint_labeling_html = file.read() + return hint_labeling_html diff --git a/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html new file mode 100644 index 00000000..3371c3cd --- /dev/null +++ b/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html @@ -0,0 +1,419 @@ + + + + + + + Agent Reprompt UI + + + +
+ +
+
+

Goal

+
+
+
+

Error Feedback

+
+
+
+ + +
+
+ + + +
+
+
+ screenshot +
+ + +
+
+ + +
+

Hints

+ + + +
+ + +
+

Suggestions

+
+ + + +
+ + + +
+ + + + \ No newline at end of file diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py new file mode 100644 index 00000000..64c821ef --- /dev/null +++ b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py @@ -0,0 +1,216 @@ +import re +from dataclasses import asdict, dataclass +from typing import Dict, List + +from agentlab.agents import dynamic_prompting as dp +from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs +from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt +from agentlab.llm.llm_utils import Discussion, HumanMessage, SystemMessage +from browsergym.experiments.agent import AgentInfo + + +class CandidatesGeneration(dp.PromptElement): + # Ask for multiple alternatives; each candidate must contain and . + def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None: + self.hint = hint + self.n_candidates = n_candidates + self.hint_prompt = "\n".join(f"{i}. {c}" for i, c in enumerate(hint, 1)) if hint else "" + super().__init__(True) + self._prompt = [ + dict( + type="text", + text=f""" + You are a web agent. Propose {self.n_candidates} alternative next steps for the current page. + {('Use the Hints:' + self.hint_prompt) if self.hint else ""}\n + Return EACH candidate wrapped as numbered tags: + ... + ... + + Inside every candidate you MUST include: + ...why this action is appropriate now... + ...ONE atomic, executable action string... + + Do not include any extra text outside the candidate tags. + Use this format: + + Explain why Candidate One is chosen + Candidate One Action + + + + Explain why Candidate Two is chosen + Candidate Two Action + + # Example + + The login button is visible and proceeding will reveal the auth form. + click(role="button", name="Log in") + + + + User might need to enter email first; the email field is focused and visible. + fill(bid="a112", text="user@example.com") + + """, + ) + ] + + # Regex patterns for numbered candidates only + _NUM_BLOCK = re.compile( + r"<\s*candidate[_ ]generation[_ ](?P[0-9]+)\s*>(?P.*?)<\s*/\s*candidate[_ ]generation[_ ](?P=idx)\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + _THINK_PATTERN = re.compile( + r"<\s*think\s*>(?P.*?)<\s*/\s*think\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + _ACTION_PATTERN = re.compile( + r"<\s*action\s*>(?P.*?)<\s*/\s*action\s*>", + flags=re.IGNORECASE | re.DOTALL, + ) + + def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]: + """ + Extract up to n_candidates candidates, using numbered tags only. + + Returns: + { + "candidate_generation_1": {"think": "...", "action": "..."}, + "candidate_generation_2": {"think": "...", "action": "..."}, + ... + } + """ + result = { + f"candidate_generation_{i+1}": {"think": "", "action": ""} + for i in range(self.n_candidates) + } + + if not isinstance(text_answer, str): + return result + + matches: List[re.Match] = list(self._NUM_BLOCK.finditer(text_answer)) + # Sort by numeric index + matches_sorted = sorted(matches, key=lambda m: int(m.group("idx"))) + for i, m in enumerate(matches_sorted[: self.n_candidates]): + body = m.group("body").strip() + think_m = self._THINK_PATTERN.search(body) + action_m = self._ACTION_PATTERN.search(body) + result[f"candidate_generation_{i+1}"] = { + "think": (think_m.group("think").strip() if think_m else ""), + "action": (action_m.group("action").strip() if action_m else ""), + } + + return result + + +class MultiCandidateGenericAgent(GenericAgent): + + def __init__( + self, + chat_model_args, + flags, + max_retry: int = 4, + ): + super().__init__(chat_model_args, flags, max_retry) + + def get_candidate_generations( + self, + obs, + hint: list[str] | None = None, + n_candidates=3, + ) -> list[dict]: + # Append obs to history only if it's not already the last entry + # Important to handle cases when get_candidate_generation is called multiple times in a single step. + if not self.obs_history or self.obs_history[-1] is not obs: + self.obs_history.append(obs) + + + main_prompt = MainPrompt( + action_set=self.action_set, + obs_history=self.obs_history, + actions=self.actions, + memories=self.memories, + thoughts=self.thoughts, + previous_plan=self.plan, + step=self.plan_step, + flags=self.flags, + ) + max_prompt_tokens, max_trunc_itr = self._get_maxes() + + system_prompt = SystemMessage(dp.SystemPrompt().prompt) + + human_prompt = dp.fit_tokens( + shrinkable=main_prompt, + max_prompt_tokens=max_prompt_tokens, + model_name=self.chat_model_args.model_name, + max_iterations=max_trunc_itr, + additional_prompts=system_prompt, + ) + + cg = CandidatesGeneration(hint=hint, n_candidates=n_candidates) + candidates_prompt = HumanMessage(cg.prompt) + chat_messages = Discussion([system_prompt, human_prompt, candidates_prompt]) + output = self.chat_llm(chat_messages) + candidates = cg._parse_answer(output["content"]) + # Not adding the generate candidate prompt to xray. + msg_to_add_to_xray = Discussion([system_prompt, human_prompt]) + suggestions = [ + { + "action": candidate["action"], + "think": candidate["think"], + } + for key, candidate in candidates.items() + ] + output = [] + for candidate in suggestions: + agent_info = AgentInfo( + think=candidate.get("think", None), + chat_messages=msg_to_add_to_xray, + stats=self.chat_llm.get_stats(), + extra_info={ + "chat_model_args": asdict(self.chat_model_args), + "think": candidate.get("think", None), + "plan": candidate.get("plan", None), + "step": candidate.get("step", None), + "memory": candidate.get("memory", None), + }, + ) + output.append({"action": candidate["action"], "agent_info": agent_info}) + + return output + + def update_agent_state_from_selected_candidate(self, output): + """Updates the agent's internal state based on the selected candidate from human feedback.""" + action, agent_info = output['action'], output['agent_info'] + self.plan = agent_info.extra_info.get("plan", self.plan) + self.plan_step = agent_info.extra_info.get("step", self.plan_step) + self.memories.append(agent_info.extra_info.get("memory", None)) + self.thoughts.append(agent_info.extra_info.get("think", None)) + self.actions.append(action) + + def get_action(self, obs): + """Generates multiple candidates and always returns the first one. + This allows to use this agent as a drop-in replacement for a single-candidate agent. + """ + candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2) + selection = candidates[0] # always select the first option. + self.update_agent_state_from_selected_candidate(selection) + action, agent_info = selection['action'], selection['agent_info'] + + return action, agent_info + + +@dataclass +class MultiCandidateGenericAgentArgs(GenericAgentArgs): + def make_agent(self): + return MultiCandidateGenericAgent( + chat_model_args=self.chat_model_args, + flags=self.flags, + max_retry=self.max_retry, + ) + + def __post_init__(self): + """Prefix subagent name with 'MC-'.""" + super().__post_init__() + if hasattr(self, 'agent_name') and self.agent_name: + self.agent_name = "MC-" + self.agent_name From ed0f1bd7e1288cd4a73d8d6225cd6a1fe4219a4b Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:14:26 -0400 Subject: [PATCH 02/21] add timeout error for hilt agent. --- src/agentlab/agents/hilt_agent/hilt_agent.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py index 6a44489f..245db484 100644 --- a/src/agentlab/agents/hilt_agent/hilt_agent.py +++ b/src/agentlab/agents/hilt_agent/hilt_agent.py @@ -7,6 +7,7 @@ import bgym import numpy as np from PIL import Image +import playwright from agentlab.agents.hilt_agent.hint_labelling import ( HintLabeling, @@ -114,6 +115,14 @@ def get_action(self, obs): if self.ui: self.ui.close() raise + except playwright.sync_api.TimeoutError: + # Handle timeout specifically: fall back to first candidate + print("UI timeout; falling back to first candidate.") + selected_candidate = candidates[0] + self.subagent.update_agent_state_from_selected_candidate(selected_candidate) + action = selected_candidate["action"] + agent_info = selected_candidate["agent_info"] + return action, agent_info except Exception as e: print(f"Error in human intervention UI: {e}") if self.ui: From b2c1ac83d9d3fec69124c700bf49776bb26bacc9 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:20:00 -0400 Subject: [PATCH 03/21] darglint and black --- .../hilt_agent/base_multi_candidate_agent.py | 15 ++++--- .../hilt_agent/generic_human_guided_agent.py | 25 ++++++------ src/agentlab/agents/hilt_agent/hilt_agent.py | 36 ++++++++--------- .../agents/hilt_agent/hint_labelling.py | 12 +++++- .../multi_candidate_generic_agent.py | 39 ++++++++++++------- 5 files changed, 71 insertions(+), 56 deletions(-) diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py index 6cd4624a..81a0db08 100644 --- a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py +++ b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py @@ -1,4 +1,5 @@ from typing_extensions import Protocol + from agentlab.agents.agent_args import AgentArgs @@ -12,9 +13,10 @@ class MultiCandidateAgent(Protocol): def get_candidate_generations( self, obs: dict, hint: list[str] | None = None, n_candidates: int = 3 - ) -> list[dict]: + ) -> "list[dict]": """ Generate multiple candidate actions for the given observation. + You can pass extra info in agent_info to update internal state of the agent based on the selected candidate. Your internal state management should be robust to multiple calls to the get_candidate_generations method @@ -24,11 +26,6 @@ def get_candidate_generations( obs: The current observation dictionary containing environment state hint: Optional list of hint strings to guide candidate generation n_candidates: Number of candidate actions to generate - - Returns: - List of dictionaries, each containing: - - 'action': The candidate action to be executed - - 'agent_info': Additional information about the action generation """ ... @@ -37,8 +34,10 @@ def update_agent_state_from_selected_candidate(self, output: dict): Update the agent's internal state based on the selected candidate. This can include any memory or planning updates. + Args: + output: The selected candidate action dictionary """ - ... + pass class MultiCandidateAgentArgs(AgentArgs): @@ -47,5 +46,5 @@ def make_agent(self) -> MultiCandidateAgent: ... def __post_init__(self): """Prefix subagent name with 'MC-'.""" super().__post_init__() - if hasattr(self, 'agent_name') and self.agent_name: + if hasattr(self, "agent_name") and self.agent_name: self.agent_name = "MC-" + self.agent_name diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py index 220ca0df..950c3449 100644 --- a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py @@ -7,6 +7,7 @@ import bgym import numpy as np +from browsergym.experiments.agent import AgentInfo from PIL import Image from agentlab.agents import dynamic_prompting as dp @@ -23,7 +24,6 @@ SystemMessage, ) from agentlab.llm.tracking import cost_tracker_decorator -from browsergym.experiments.agent import AgentInfo class CandidatesGeneration(dp.PromptElement): @@ -87,15 +87,14 @@ def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None: ) def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]: - """ - Extract up to n_candidates candidates, using numbered tags only. + """Extract up to n_candidates candidates, using numbered tags only. + + Args: + text_answer: The text response containing candidate generation tags. Returns: - { - "candidate_generation_1": {"think": "...", "action": "..."}, - "candidate_generation_2": {"think": "...", "action": "..."}, - ... - } + Dictionary mapping candidate names to their think and action content. + Format: {"candidate_generation_1": {"think": "...", "action": "..."}, ...} """ result = { f"candidate_generation_{i+1}": {"think": "", "action": ""} @@ -145,11 +144,11 @@ def make_agent(self): return MultipleProposalGenericAgent( chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry ) - + def __post_init__(self): """Prefix subagent name with 'HILT-'.""" super().__post_init__() - if hasattr(self, 'agent_name') and self.agent_name: + if hasattr(self, "agent_name") and self.agent_name: self.agent_name = "HILT-" + self.agent_name @@ -363,13 +362,11 @@ def get_base_agent(llm_config): agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]() benchmark = benchmark.subset_from_glob("task_name", "*book*") - benchmark.env_args_list = benchmark.env_args_list[2:3] + benchmark.env_args_list = benchmark.env_args_list[3:4] for env_args in benchmark.env_args_list: env_args.max_steps = 100 # max human steps - env_args.headless = False - # env_args.use_chat_ui = False - # env_args.use_hint_labeling_ui = True + env_args.headless = True Study(agent_configs, benchmark, logging_level=logging.WARNING).run( n_jobs=1, diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py index 245db484..6d5fb3f5 100644 --- a/src/agentlab/agents/hilt_agent/hilt_agent.py +++ b/src/agentlab/agents/hilt_agent/hilt_agent.py @@ -6,18 +6,19 @@ import bgym import numpy as np -from PIL import Image import playwright +from browsergym.experiments.agent import Agent +from PIL import Image +from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent from agentlab.agents.hilt_agent.hint_labelling import ( HintLabeling, HintLabelingInputs, ) -from agentlab.llm.tracking import cost_tracker_decorator from agentlab.analyze import overlay_utils -from browsergym.experiments.agent import Agent -from agentlab.agents.agent_args import AgentArgs -from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent +from agentlab.llm.tracking import cost_tracker_decorator + class HumanInTheLoopAgent(Agent): @@ -58,7 +59,7 @@ def get_action(self, obs): # Generate first candidates candidates = self.subagent.get_candidate_generations(obs, hint=None, n_candidates=3) step_n_human_intervention_rounds += 1 - suggestions = [{ 'action': c['action'], 'think': c['agent_info'].think} for c in candidates] + suggestions = [{"action": c["action"], "think": c["agent_info"].think} for c in candidates] # List of Images as base64 - create overlay screenshots for each suggested action screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] @@ -90,11 +91,11 @@ def get_action(self, obs): hint = response["payload"]["hint"] step_hint.append(hint) candidates = self.subagent.get_candidate_generations( - obs, - hint=step_hint if step_hint else None, - n_candidates=3 + obs, hint=step_hint if step_hint else None, n_candidates=3 ) - suggestions = [{'action': c['action'], 'think': c['agent_info'].think} for c in candidates] + suggestions = [ + {"action": c["action"], "think": c["agent_info"].think} for c in candidates + ] screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions] elif response["type"] == "step": @@ -135,7 +136,6 @@ def get_action(self, obs): @dataclass class HumanInTheLoopAgentArgs(AgentArgs): subagent_args: Optional[AgentArgs] = None # args for the underlying multiple proposal agent - def make_agent(self): assert self.subagent_args is not None @@ -146,15 +146,15 @@ def __post_init__(self): super().__post_init__() if self.subagent_args and self.subagent_args.agent_name: self.agent_name = "HILT-" + self.subagent_args.agent_name - + def set_benchmark(self, benchmark, demo_mode): """Delegate set_benchmark to the subagent if it has the method.""" - if hasattr(self.subagent_args, 'set_benchmark'): + if hasattr(self.subagent_args, "set_benchmark"): self.subagent_args.set_benchmark(benchmark, demo_mode) - + def set_reproducibility_mode(self): """Delegate set_reproducibility_mode to the subagent if it has the method.""" - if hasattr(self.subagent_args, 'set_reproducibility_mode'): + if hasattr(self.subagent_args, "set_reproducibility_mode"): self.subagent_args.set_reproducibility_mode() @@ -175,16 +175,17 @@ def img_to_base_64(image: Image.Image | np.ndarray) -> str: b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") return b64_str + def get_base_human_in_the_loop_genericagent(llm_config): from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS - from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs from agentlab.agents.hilt_agent.multi_candidate_generic_agent import ( MultiCandidateGenericAgentArgs, ) + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT return HumanInTheLoopAgentArgs( - subagent_args = MultiCandidateGenericAgentArgs( + subagent_args=MultiCandidateGenericAgentArgs( chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], flags=BASE_FLAGS, ) @@ -210,7 +211,6 @@ def get_base_human_in_the_loop_genericagent(llm_config): env_args.max_steps = 100 # max human steps env_args.headless = False - Study(agent_configs, benchmark, logging_level=logging.WARNING).run( n_jobs=1, parallel_backend="sequential", diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hilt_agent/hint_labelling.py index 6e293781..aa5a51ea 100644 --- a/src/agentlab/agents/hilt_agent/hint_labelling.py +++ b/src/agentlab/agents/hilt_agent/hint_labelling.py @@ -5,10 +5,10 @@ from typing import Dict, List, Optional import playwright.sync_api +from browsergym.core import _get_global_playwright from pydantic import BaseModel, Field from agentlab.agents.hilt_agent import hint_labelling_ui_files -from browsergym.core import _get_global_playwright logger = logging.getLogger(__name__) @@ -109,6 +109,16 @@ def wait_for_response(self, timeout: Optional[float] = 600) -> dict: """ Wait until the page makes a request to /api/reprompt or /api/submit, then parse the request body and return it in your schema. + + Args: + timeout (Optional[float]): Maximum time to wait for the request in seconds. If None or 0, + waits indefinitely. Defaults to 600 seconds. + + Returns: + dict: A dictionary containing the parsed response with 'type' and 'payload' keys. + For /api/reprompt: {'type': 'reprompt', 'payload': {'hint': str}} + For /api/submit: {'type': 'step', 'payload': {'think': str, 'action': str}} + """ logger.info("Waiting for response from Hint Labeling UI...") diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py index 64c821ef..e4e53b7a 100644 --- a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py +++ b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py @@ -2,11 +2,12 @@ from dataclasses import asdict, dataclass from typing import Dict, List +from browsergym.experiments.agent import AgentInfo + from agentlab.agents import dynamic_prompting as dp from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt from agentlab.llm.llm_utils import Discussion, HumanMessage, SystemMessage -from browsergym.experiments.agent import AgentInfo class CandidatesGeneration(dp.PromptElement): @@ -70,15 +71,14 @@ def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None: ) def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]: - """ - Extract up to n_candidates candidates, using numbered tags only. + """Extract up to n_candidates candidates, using numbered tags only. + + Args: + text_answer: The text response containing candidate generation tags. Returns: - { - "candidate_generation_1": {"think": "...", "action": "..."}, - "candidate_generation_2": {"think": "...", "action": "..."}, - ... - } + Dictionary mapping candidate names to their think and action content. + Format: {"candidate_generation_1": {"think": "...", "action": "..."}, ...} """ result = { f"candidate_generation_{i+1}": {"think": "", "action": ""} @@ -123,7 +123,6 @@ def get_candidate_generations( # Important to handle cases when get_candidate_generation is called multiple times in a single step. if not self.obs_history or self.obs_history[-1] is not obs: self.obs_history.append(obs) - main_prompt = MainPrompt( action_set=self.action_set, @@ -180,8 +179,12 @@ def get_candidate_generations( return output def update_agent_state_from_selected_candidate(self, output): - """Updates the agent's internal state based on the selected candidate from human feedback.""" - action, agent_info = output['action'], output['agent_info'] + """Updates the agent's internal state based on the selected candidate from human feedback. + + Args: + output: Dictionary containing 'action' and 'agent_info' keys from selected candidate. + """ + action, agent_info = output["action"], output["agent_info"] self.plan = agent_info.extra_info.get("plan", self.plan) self.plan_step = agent_info.extra_info.get("step", self.plan_step) self.memories.append(agent_info.extra_info.get("memory", None)) @@ -191,11 +194,17 @@ def update_agent_state_from_selected_candidate(self, output): def get_action(self, obs): """Generates multiple candidates and always returns the first one. This allows to use this agent as a drop-in replacement for a single-candidate agent. + + Args: + obs: The observation from the environment. + + Returns: + tuple: A tuple containing (action, agent_info). """ - candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2) - selection = candidates[0] # always select the first option. + candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2) + selection = candidates[0] # always select the first option. self.update_agent_state_from_selected_candidate(selection) - action, agent_info = selection['action'], selection['agent_info'] + action, agent_info = selection["action"], selection["agent_info"] return action, agent_info @@ -212,5 +221,5 @@ def make_agent(self): def __post_init__(self): """Prefix subagent name with 'MC-'.""" super().__post_init__() - if hasattr(self, 'agent_name') and self.agent_name: + if hasattr(self, "agent_name") and self.agent_name: self.agent_name = "MC-" + self.agent_name From 3b07fe90d9a632966f44929fc0ca6be211f9f085 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:58:04 -0400 Subject: [PATCH 04/21] correct spelling hilt -> hitl --- .../base_multi_candidate_agent.py | 0 .../generic_human_guided_agent.py | 12 ++++++------ .../{hilt_agent => hitl_agent}/hint_labelling.py | 2 +- .../hint_labelling_ui_files/hint_labeling_ui.html | 0 .../hilt_agent.py => hitl_agent/hitl_agent.py} | 14 +++++++------- .../multi_candidate_generic_agent.py | 0 6 files changed, 14 insertions(+), 14 deletions(-) rename src/agentlab/agents/{hilt_agent => hitl_agent}/base_multi_candidate_agent.py (100%) rename src/agentlab/agents/{hilt_agent => hitl_agent}/generic_human_guided_agent.py (97%) rename src/agentlab/agents/{hilt_agent => hitl_agent}/hint_labelling.py (98%) rename src/agentlab/agents/{hilt_agent => hitl_agent}/hint_labelling_ui_files/hint_labeling_ui.html (100%) rename src/agentlab/agents/{hilt_agent/hilt_agent.py => hitl_agent/hitl_agent.py} (95%) rename src/agentlab/agents/{hilt_agent => hitl_agent}/multi_candidate_generic_agent.py (100%) diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hitl_agent/base_multi_candidate_agent.py similarity index 100% rename from src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py rename to src/agentlab/agents/hitl_agent/base_multi_candidate_agent.py diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py similarity index 97% rename from src/agentlab/agents/hilt_agent/generic_human_guided_agent.py rename to src/agentlab/agents/hitl_agent/generic_human_guided_agent.py index 950c3449..fd83cf95 100644 --- a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py @@ -13,7 +13,7 @@ from agentlab.agents import dynamic_prompting as dp from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt -from agentlab.agents.hilt_agent.hint_labelling import ( +from agentlab.agents.hitl_agent.hint_labelling import ( HintLabeling, HintLabelingInputs, ) @@ -146,10 +146,10 @@ def make_agent(self): ) def __post_init__(self): - """Prefix subagent name with 'HILT-'.""" + """Prefix subagent name with 'HITL-'.""" super().__post_init__() if hasattr(self, "agent_name") and self.agent_name: - self.agent_name = "HILT-" + self.agent_name + self.agent_name = "HITL-" + self.agent_name class MultipleProposalGenericAgent(GenericAgent): @@ -354,14 +354,14 @@ def get_base_agent(llm_config): if __name__ == "__main__": import logging - from agentlab.agents.hilt_agent.generic_human_guided_agent import ( + from agentlab.agents.hitl_agent.generic_human_guided_agent import ( HUMAN_GUIDED_GENERIC_AGENT, ) from agentlab.experiments.study import Study agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] - benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]() - benchmark = benchmark.subset_from_glob("task_name", "*book*") + benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]() + benchmark = benchmark.subset_from_glob("task_name", "*create*") benchmark.env_args_list = benchmark.env_args_list[3:4] for env_args in benchmark.env_args_list: diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py similarity index 98% rename from src/agentlab/agents/hilt_agent/hint_labelling.py rename to src/agentlab/agents/hitl_agent/hint_labelling.py index aa5a51ea..faa16506 100644 --- a/src/agentlab/agents/hilt_agent/hint_labelling.py +++ b/src/agentlab/agents/hitl_agent/hint_labelling.py @@ -8,7 +8,7 @@ from browsergym.core import _get_global_playwright from pydantic import BaseModel, Field -from agentlab.agents.hilt_agent import hint_labelling_ui_files +from agentlab.agents.hitl_agent import hint_labelling_ui_files logger = logging.getLogger(__name__) diff --git a/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html similarity index 100% rename from src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html rename to src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py similarity index 95% rename from src/agentlab/agents/hilt_agent/hilt_agent.py rename to src/agentlab/agents/hitl_agent/hitl_agent.py index 6d5fb3f5..150bcd74 100644 --- a/src/agentlab/agents/hilt_agent/hilt_agent.py +++ b/src/agentlab/agents/hitl_agent/hitl_agent.py @@ -11,8 +11,8 @@ from PIL import Image from agentlab.agents.agent_args import AgentArgs -from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent -from agentlab.agents.hilt_agent.hint_labelling import ( +from agentlab.agents.hitl_agent.base_multi_candidate_agent import MultiCandidateAgent +from agentlab.agents.hitl_agent.hint_labelling import ( HintLabeling, HintLabelingInputs, ) @@ -142,10 +142,10 @@ def make_agent(self): return HumanInTheLoopAgent(subagent_args=self.subagent_args) def __post_init__(self): - """Prefix subagent name with 'HILT-'.""" + """Prefix subagent name with 'HITL-'.""" super().__post_init__() if self.subagent_args and self.subagent_args.agent_name: - self.agent_name = "HILT-" + self.subagent_args.agent_name + self.agent_name = "HITL-" + self.subagent_args.agent_name def set_benchmark(self, benchmark, demo_mode): """Delegate set_benchmark to the subagent if it has the method.""" @@ -178,8 +178,8 @@ def img_to_base_64(image: Image.Image | np.ndarray) -> str: def get_base_human_in_the_loop_genericagent(llm_config): from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS - from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs - from agentlab.agents.hilt_agent.multi_candidate_generic_agent import ( + from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs + from agentlab.agents.hitl_agent.multi_candidate_generic_agent import ( MultiCandidateGenericAgentArgs, ) from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -197,7 +197,7 @@ def get_base_human_in_the_loop_genericagent(llm_config): if __name__ == "__main__": import logging - from agentlab.agents.hilt_agent.hilt_agent import ( + from agentlab.agents.hitl_agent.hitl_agent import ( HUMAN_GUIDED_GENERIC_AGENT, ) from agentlab.experiments.study import Study diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hitl_agent/multi_candidate_generic_agent.py similarity index 100% rename from src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py rename to src/agentlab/agents/hitl_agent/multi_candidate_generic_agent.py From 9633275402a7ae6a58fa690a988df0e755fd1a36 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:19:13 -0400 Subject: [PATCH 05/21] Move the overlay_action to utils --- src/agentlab/agents/agent_utils.py | 14 ++++++++++ .../hitl_agent/generic_human_guided_agent.py | 21 ++------------- src/agentlab/agents/hitl_agent/hitl_agent.py | 27 +++---------------- src/agentlab/llm/llm_utils.py | 10 +++++++ 4 files changed, 29 insertions(+), 43 deletions(-) diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py index 29219d2d..9059b5ad 100644 --- a/src/agentlab/agents/agent_utils.py +++ b/src/agentlab/agents/agent_utils.py @@ -1,5 +1,8 @@ +import copy from PIL import Image, ImageDraw from playwright.sync_api import Page +from agentlab.llm.llm_utils import img_to_base_64 +from agentlab.analyze import overlay_utils def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image: @@ -128,3 +131,14 @@ def zoom_webpage(page: Page, zoom_factor: float = 1.5): page.evaluate(f"document.documentElement.style.zoom='{zoom_factor*100}%'") return page + + +def overlay_action(obs, action): + """Overlays actions on screenshot in-place""" + act_img = copy.deepcopy(obs["screenshot"]) + act_img = Image.fromarray(act_img) + overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) + return img_to_base_64(act_img) + + + diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py index fd83cf95..507efe85 100644 --- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py @@ -24,7 +24,8 @@ SystemMessage, ) from agentlab.llm.tracking import cost_tracker_decorator - +from agentlab.agents.agent_utils import overlay_action +from agentlab.llm.llm_utils import img_to_base_64 class CandidatesGeneration(dp.PromptElement): # Ask for multiple alternatives; each candidate must contain and . @@ -119,24 +120,6 @@ def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]: return result -def overlay_action(obs, action): - """Overlays actions on screenshot in-place""" - act_img = copy.deepcopy(obs["screenshot"]) - act_img = Image.fromarray(act_img) - overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) - return img_to_base_64(act_img) - - -def img_to_base_64(image: Image.Image | np.ndarray) -> str: - """Converts a PIL Image or NumPy array to a base64-encoded string.""" - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - buffer = io.BytesIO() - image.save(buffer, format="PNG") - b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") - return b64_str - - @dataclass class MultipleProposalGenericAgentArgs(GenericAgentArgs): diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py index 150bcd74..3bfb47b5 100644 --- a/src/agentlab/agents/hitl_agent/hitl_agent.py +++ b/src/agentlab/agents/hitl_agent/hitl_agent.py @@ -1,14 +1,10 @@ -import base64 -import copy -import io from dataclasses import dataclass from typing import Optional import bgym -import numpy as np import playwright from browsergym.experiments.agent import Agent -from PIL import Image + from agentlab.agents.agent_args import AgentArgs from agentlab.agents.hitl_agent.base_multi_candidate_agent import MultiCandidateAgent @@ -16,7 +12,8 @@ HintLabeling, HintLabelingInputs, ) -from agentlab.analyze import overlay_utils +from agentlab.agents.agent_utils import overlay_action +from agentlab.llm.llm_utils import img_to_base_64 from agentlab.llm.tracking import cost_tracker_decorator @@ -158,24 +155,6 @@ def set_reproducibility_mode(self): self.subagent_args.set_reproducibility_mode() -def overlay_action(obs, action): - """Overlays actions on screenshot in-place""" - act_img = copy.deepcopy(obs["screenshot"]) - act_img = Image.fromarray(act_img) - overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) - return img_to_base_64(act_img) - - -def img_to_base_64(image: Image.Image | np.ndarray) -> str: - """Converts a PIL Image or NumPy array to a base64-encoded string.""" - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - buffer = io.BytesIO() - image.save(buffer, format="PNG") - b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") - return b64_str - - def get_base_human_in_the_loop_genericagent(llm_config): from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index 10013b72..2bc83d43 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -727,6 +727,16 @@ def image_to_png_base64_url(image: np.ndarray | Image.Image): return f"data:image/png;base64,{image_base64}" +def img_to_base_64(image: Image.Image | np.ndarray) -> str: + """Converts a PIL Image or NumPy array to a base64-encoded string.""" + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8") + return b64_str + + class BaseMessage(dict): def __init__(self, role: str, content: Union[str, list[dict]], **kwargs): allowed_attrs = {"log_probs"} From 51cacdb9826f651b8c0b9aeae80e55b4168aaea0 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:44:12 -0400 Subject: [PATCH 06/21] Increase timeout --- src/agentlab/agents/hitl_agent/generic_human_guided_agent.py | 2 +- src/agentlab/agents/hitl_agent/hitl_agent.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py index 507efe85..af7ed0c4 100644 --- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py @@ -251,7 +251,7 @@ def get_action(self, obs): ) self.ui.update_context(hint_labeling_inputs) - response = self.ui.wait_for_response(timeout=300) + response = self.ui.wait_for_response(timeout=600) if response["type"] == "reprompt": hint = response["payload"]["hint"] diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py index 3bfb47b5..85063f82 100644 --- a/src/agentlab/agents/hitl_agent/hitl_agent.py +++ b/src/agentlab/agents/hitl_agent/hitl_agent.py @@ -82,7 +82,7 @@ def get_action(self, obs): ) self.ui.update_context(hint_labeling_inputs) - response = self.ui.wait_for_response(timeout=300) + response = self.ui.wait_for_response(timeout=600) if response["type"] == "reprompt": hint = response["payload"]["hint"] From 958430ccc6e1a0d2a84376d9bbdbac311e6e11da Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:11:32 -0400 Subject: [PATCH 07/21] add docstring for functions and black --- src/agentlab/agents/agent_utils.py | 7 +- .../hitl_agent/generic_human_guided_agent.py | 15 +- src/agentlab/agents/hitl_agent/hitl_agent.py | 14 + .../2_eval_on_miniwob/inspect_results.ipynb | 258 +++++++++++++++++- 4 files changed, 281 insertions(+), 13 deletions(-) diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py index 9059b5ad..7ce8cdad 100644 --- a/src/agentlab/agents/agent_utils.py +++ b/src/agentlab/agents/agent_utils.py @@ -1,8 +1,10 @@ import copy + from PIL import Image, ImageDraw from playwright.sync_api import Page -from agentlab.llm.llm_utils import img_to_base_64 + from agentlab.analyze import overlay_utils +from agentlab.llm.llm_utils import img_to_base_64 def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image: @@ -139,6 +141,3 @@ def overlay_action(obs, action): act_img = Image.fromarray(act_img) overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"]) return img_to_base_64(act_img) - - - diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py index af7ed0c4..507e16fe 100644 --- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py @@ -11,6 +11,7 @@ from PIL import Image from agentlab.agents import dynamic_prompting as dp +from agentlab.agents.agent_utils import overlay_action from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt from agentlab.agents.hitl_agent.hint_labelling import ( @@ -22,10 +23,10 @@ Discussion, HumanMessage, SystemMessage, + img_to_base_64, ) from agentlab.llm.tracking import cost_tracker_decorator -from agentlab.agents.agent_utils import overlay_action -from agentlab.llm.llm_utils import img_to_base_64 + class CandidatesGeneration(dp.PromptElement): # Ask for multiple alternatives; each candidate must contain and . @@ -323,6 +324,16 @@ def get_action(self, obs): def get_base_agent(llm_config): + """Creates and returns a MultipleProposalGenericAgentArgs instance with + specified LLM configuration from CHAT_MODEL_ARGS_DICT. + + Args: + llm_config: The LLM configuration key to use from CHAT_MODEL_ARGS_DICT. + + Returns: + MultipleProposalGenericAgentArgs: Configured agent arguments instance. + """ + from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py index 85063f82..26c0c696 100644 --- a/src/agentlab/agents/hitl_agent/hitl_agent.py +++ b/src/agentlab/agents/hitl_agent/hitl_agent.py @@ -90,6 +90,7 @@ def get_action(self, obs): candidates = self.subagent.get_candidate_generations( obs, hint=step_hint if step_hint else None, n_candidates=3 ) + step_n_human_intervention_rounds += 1 suggestions = [ {"action": c["action"], "think": c["agent_info"].think} for c in candidates ] @@ -156,6 +157,19 @@ def set_reproducibility_mode(self): def get_base_human_in_the_loop_genericagent(llm_config): + """ + Create a base human-in-the-loop generic agent configuration using the key from CHAT_MODEL_ARGS_DICT. + + This function creates a HumanInTheLoopAgentArgs instance with a MultiCandidateGenericAgent + as the subagent, configured with the specified LLM configuration and base flags. + + Args: + llm_config (str): The LLM configuration key to use from CHAT_MODEL_ARGS_DICT. + + Returns: + HumanInTheLoopAgentArgs: Configured human-in-the-loop agent arguments with + a multi-candidate generic agent as the subagent. + """ from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs from agentlab.agents.hitl_agent.multi_candidate_generic_agent import ( diff --git a/tutorials/2_eval_on_miniwob/inspect_results.ipynb b/tutorials/2_eval_on_miniwob/inspect_results.ipynb index 06127b78..84a73cca 100644 --- a/tutorials/2_eval_on_miniwob/inspect_results.ipynb +++ b/tutorials/2_eval_on_miniwob/inspect_results.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "58086537", "metadata": {}, "outputs": [], @@ -25,10 +25,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "id": "7901cccc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "RESULTS_DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "50be19a9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Searching experiments directories.: 100%|██████████| 1/1 [00:00<00:00, 5433.04it/s]\n", + "Loading results: 100%|██████████| 1/1 [00:00<00:00, 373.26it/s]\n" + ] + } + ], "source": [ "# replace this by your desired directory if needed.\n", "result_dir = get_most_recent_study(RESULTS_DIR, contains=None)\n", @@ -39,15 +76,222 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, + "id": "82cc1557", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "a424c470", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found multiple configuration, averaging across tasks and returning a per-agent report.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 agent.agent\n", + "nameenv.benchmarkavg\n", + "rewardstd\n", + "erravg\n", + "stepsn\n", + "completedn\n", + "err
0HITL-GenericAgent-gpt-5-mini-2025-08-07workarenanannannan0/10
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "report = inspect_results.global_report(result_df)\n", "inspect_results.display_report(report)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86e44fd", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "('agent.agent_name', 'env.benchmark')", + "rawType": "object", + "type": "unknown" + }, + { + "name": "avg_reward", + "rawType": "float64", + "type": "float" + }, + { + "name": "std_err", + "rawType": "float64", + "type": "float" + }, + { + "name": "avg_steps", + "rawType": "float64", + "type": "float" + }, + { + "name": "n_completed", + "rawType": "object", + "type": "string" + }, + { + "name": "n_err", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "ea68795e-a1d8-404e-9e36-1061d8fa9e87", + "rows": [ + [ + "('HITL-GenericAgent-gpt-5-mini-2025-08-07', 'workarena')", + null, + null, + null, + "0/1", + "0" + ] + ], + "shape": { + "columns": 5, + "rows": 1 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
avg_rewardstd_erravg_stepsn_completedn_err
agent.agent_nameenv.benchmark
HITL-GenericAgent-gpt-5-mini-2025-08-07workarenaNaNNaNNaN0/10
\n", + "
" + ], + "text/plain": [ + " avg_reward std_err \\\n", + "agent.agent_name env.benchmark \n", + "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena NaN NaN \n", + "\n", + " avg_steps n_completed \\\n", + "agent.agent_name env.benchmark \n", + "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena NaN 0/1 \n", + "\n", + " n_err \n", + "agent.agent_name env.benchmark \n", + "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena 0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n" + ] + }, { "cell_type": "markdown", "id": "385559d7", @@ -149,7 +393,7 @@ ], "metadata": { "kernelspec": { - "display_name": "AgentLab", + "display_name": "agentlab", "language": "python", "name": "python3" }, @@ -163,7 +407,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.9" } }, "nbformat": 4, From 4453a00a490602d9e15d0d0189873b6cbe9c4c01 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:18:54 -0400 Subject: [PATCH 08/21] Improve UI and step hint handling for multiple hints --- .../hitl_agent/generic_human_guided_agent.py | 17 +- .../agents/hitl_agent/hint_labelling.py | 39 ++-- .../hint_labeling_ui.html | 208 +++++++++++++----- src/agentlab/agents/hitl_agent/hitl_agent.py | 13 +- 4 files changed, 190 insertions(+), 87 deletions(-) diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py index 507e16fe..29547fd4 100644 --- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py +++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py @@ -18,7 +18,6 @@ HintLabeling, HintLabelingInputs, ) -from agentlab.analyze import overlay_utils from agentlab.llm.llm_utils import ( Discussion, HumanMessage, @@ -208,7 +207,7 @@ def get_action(self, obs): screenshots=[], # no overlay screenshots yet axtree=obs.get("axtree_txt", ""), history=[], - hint="", + hints=[], suggestions=[], # no suggestions yet ) self.ui.update_context(initial_inputs) @@ -243,11 +242,7 @@ def get_action(self, obs): screenshots=screenshots, # list of overlay screenshots for hover axtree=obs.get("axtree_txt", ""), history=[], # TODO: add history - hint=( - "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1)) - if step_hint - else "" - ), + hints=step_hint, suggestions=suggestions, ) @@ -255,8 +250,8 @@ def get_action(self, obs): response = self.ui.wait_for_response(timeout=600) if response["type"] == "reprompt": - hint = response["payload"]["hint"] - step_hint.append(hint) + new_hints = response["payload"].get("hints", []) + step_hint = list(new_hints) if isinstance(new_hints, list) else step_hint candidates, chat_messages = self.get_candidate_generation( sys_prompt=system_prompt, human_prompt=human_prompt, @@ -354,8 +349,8 @@ def get_base_agent(llm_config): from agentlab.experiments.study import Study agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] - benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]() - benchmark = benchmark.subset_from_glob("task_name", "*create*") + benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]() + benchmark = benchmark.subset_from_glob("task_name", "*book*") benchmark.env_args_list = benchmark.env_args_list[3:4] for env_args in benchmark.env_args_list: diff --git a/src/agentlab/agents/hitl_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py index faa16506..680f3cbe 100644 --- a/src/agentlab/agents/hitl_agent/hint_labelling.py +++ b/src/agentlab/agents/hitl_agent/hint_labelling.py @@ -8,11 +8,9 @@ from browsergym.core import _get_global_playwright from pydantic import BaseModel, Field -from agentlab.agents.hitl_agent import hint_labelling_ui_files - logger = logging.getLogger(__name__) - -HINT_LABELING_DIR = resources.files(hint_labelling_ui_files) + +HINT_LABELING_DIR = resources.files("agentlab.agents.hitl_agent.hint_labelling_ui_files") class HintLabelingInputs(BaseModel): @@ -22,14 +20,14 @@ class HintLabelingInputs(BaseModel): screenshots: List[str] = Field(default_factory=list) # list of base64 screenshots for hover axtree: str history: List[Dict[str, str]] = Field(default_factory=list) - hint: str = "" + hints: List[str] = Field(default_factory=list) suggestions: List[Dict[str, str]] = Field(default_factory=list) class HintLabeling: def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs): - - pw: playwright.sync_api.Playwright = _get_global_playwright() + pw_opt = _get_global_playwright() + pw: playwright.sync_api.Playwright = pw_opt # type: ignore[assignment] self.browser = pw.chromium.launch( headless=headless, args=[f"--window-size={window_size[0]},{window_size[1]}"] ) @@ -37,14 +35,14 @@ def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs): no_viewport=True, ) self.page = self.context.new_page() - self._resp_queue: "Queue[dict]" = Queue() + self._resp_queue = Queue() self.page.route("**/api/reprompt", self._route_reprompt) self.page.route("**/api/submit", self._route_submit) self.page.set_content(get_hint_labeling_ui(HINT_LABELING_DIR)) # internal state - self._context: HintLabelingInputs = None + self._context = None self._running = False def _route_reprompt( @@ -52,11 +50,16 @@ def _route_reprompt( ): logger.info("Route hit: %s %s", request.method, request.url) try: - body = json.loads(request.post_data() or "{}") + body = json.loads(request.post_data or "{}") except Exception: body = {} # enqueue output 1 (reprompt) - msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}} + hints = body.get("hints") + if not isinstance(hints, list): + # Back-compat: accept single 'hint' string + h = body.get("hint") + hints = [h] if isinstance(h, str) and h.strip() else [] + msg = {"type": "reprompt", "payload": {"hints": hints}} self._resp_queue.put(msg) # Respond something minimal so UI doesn’t break; it will be refreshed by a later update_context() route.fulfill( @@ -68,10 +71,10 @@ def _route_reprompt( def _route_submit(self, route: playwright.sync_api.Route, request: playwright.sync_api.Request): logger.info("Route hit: %s %s", request.method, request.url) try: - body = json.loads(request.post_data() or "{}") + body = json.loads(request.post_data or "{}") except Exception: body = {} - # Map UI payload -> your step shape + # Map UI payload -> your step shape msg = { "type": "step", "payload": { @@ -95,7 +98,7 @@ def _to_ui_bootstrap(self, ctx: HintLabelingInputs) -> dict: "screenshots": ctx.screenshots, # list of screenshots for hover "axtree": ctx.axtree, "history": ctx.history, - "hint": ctx.hint, + "hints": ctx.hints, "suggestions": ctx.suggestions, } @@ -116,7 +119,7 @@ def wait_for_response(self, timeout: Optional[float] = 600) -> dict: Returns: dict: A dictionary containing the parsed response with 'type' and 'payload' keys. - For /api/reprompt: {'type': 'reprompt', 'payload': {'hint': str}} + For /api/reprompt: {'type': 'reprompt', 'payload': {'hints': list[str]}} For /api/submit: {'type': 'step', 'payload': {'think': str, 'action': str}} """ @@ -142,7 +145,11 @@ def is_api(req: playwright.sync_api.Request) -> bool: body = {} if req.url.endswith("/api/reprompt"): - msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}} + hints = body.get("hints") + if not isinstance(hints, list): + h = body.get("hint") + hints = [h] if isinstance(h, str) and h.strip() else [] + msg = {"type": "reprompt", "payload": {"hints": hints}} else: msg = { "type": "step", diff --git a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html index 3371c3cd..6c8c782a 100644 --- a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html +++ b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html @@ -10,8 +10,8 @@ --bg:#f4f6f8; --card:#fff; --muted:#6b7280; --text:#0f172a; --brand:#2563eb; --accent:#10b981; --danger:#ef4444; --border:#e5e7eb; } *{box-sizing:border-box} - body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)} - .container{max-width:1100px;margin:24px auto;padding:0 16px} + body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)} + .container{max-width:1280px;margin:24px auto;padding:0 16px} .grid{ display:grid;gap:16px; @@ -26,7 +26,7 @@ .tab.active{background:var(--card);border:1px solid var(--border);border-bottom:none;color:var(--text)} .tabpanel{border-top:1px solid var(--border)} - .screenshot{width:800px;height:450px;object-fit:contain;background:#0000000d;border-radius:8px} + .screenshot{width:100%;height:auto;max-height:65vh;object-fit:contain;background:#0000000d;border-radius:8px} .axtree{width:100%;height:520px;resize:none;border:none;padding:12px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace;background:#0b10241a} .hints-row{display:grid;grid-template-columns: 1fr 140px;gap:12px;align-items:start} @@ -36,7 +36,13 @@ .btn-primary[disabled]{opacity:.6;cursor:not-allowed} .btn-ghost{background:transparent;border:1px solid var(--border)} - .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px} + /* Hint rows with removable controls */ + .hint-row{display:flex;gap:8px;align-items:stretch} + .hint-row textarea.hint{flex:1;margin:0} + .remove-hint{width:36px;min-width:36px;height:36px;line-height:1;border-radius:10px;padding:0;font-size:18px;color:#64748b} + .remove-hint:hover{background:#f8fafc} + + .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px;max-height:60vh;overflow:auto} .choice{display:grid;grid-template-columns:32px 1fr;gap:12px;align-items:start;background:#ffffff;border:1px solid var(--border);border-radius:14px;padding:12px} .choice.selected{border:2px solid var(--accent);background:#f0fdf4} .choice.disabled{opacity:0.5;pointer-events:none} @@ -55,8 +61,13 @@ .pill{display:inline-block;padding:6px 10px;border-radius:999px;background:#f1f5f9;color:#0f172a;border:1px solid var(--border);font-size:12px} + /* New: split layout for screenshot/tabs and hints+suggestions side-by-side */ + .split{display:grid;gap:16px;grid-template-columns: 1fr 1.3fr;align-items:start;margin-top:16px} + .right-stack{display:flex;flex-direction:column;gap:16px} + @media (max-width: 900px){ .grid{grid-template-columns: 1fr} + .split{grid-template-columns: 1fr} .axtree{height:420px} .screenshot{width:100%;height:auto;max-width:800px} } @@ -92,44 +103,50 @@

Error Feedback

- -
-
- - - -
-
-
- screenshot -
-