From c9852ecf548db35504b4a1b4e3932a98de732a51 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Fri, 29 Aug 2025 15:57:09 -0400
Subject: [PATCH 01/21] rename trace-recorder to hilt_agent

---
 .../hilt_agent/base_multi_candidate_agent.py  |  51 +++
 .../hilt_agent/generic_human_guided_agent.py  | 378 ++++++++++++++++
 src/agentlab/agents/hilt_agent/hilt_agent.py  | 209 +++++++++
 .../agents/hilt_agent/hint_labelling.py       | 153 +++++++
 .../hint_labeling_ui.html                     | 419 ++++++++++++++++++
 .../multi_candidate_generic_agent.py          | 216 +++++++++
 6 files changed, 1426 insertions(+)
 create mode 100644 src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
 create mode 100644 src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
 create mode 100644 src/agentlab/agents/hilt_agent/hilt_agent.py
 create mode 100644 src/agentlab/agents/hilt_agent/hint_labelling.py
 create mode 100644 src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html
 create mode 100644 src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py

diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
new file mode 100644
index 00000000..6cd4624a
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
@@ -0,0 +1,51 @@
+from typing_extensions import Protocol
+from agentlab.agents.agent_args import AgentArgs
+
+
+class MultiCandidateAgent(Protocol):
+    """
+    Protocol for agents that generate multiple candidates for get_action.
+
+    This protocol defines the contract for agents that can generate
+    multiple candidate actions and allow selection of one of them for execution.
+    """
+
+    def get_candidate_generations(
+        self, obs: dict, hint: list[str] | None = None, n_candidates: int = 3
+    ) -> list[dict]:
+        """
+        Generate multiple candidate actions for the given observation.
+        You can pass extra info in agent_info to update internal state of the
+        agent based on the selected candidate. Your internal state management
+        should be robust to multiple calls to the get_candidate_generations method
+        in a single step.
+
+        Args:
+            obs: The current observation dictionary containing environment state
+            hint: Optional list of hint strings to guide candidate generation
+            n_candidates: Number of candidate actions to generate
+
+        Returns:
+            List of dictionaries, each containing:
+                - 'action': The candidate action to be executed
+                - 'agent_info': Additional information about the action generation
+        """
+        ...
+
+    def update_agent_state_from_selected_candidate(self, output: dict) -> None:
+        """
+        Update the agent's internal state based on the selected candidate.
+        This can include any memory or planning updates.
+        `output` is one of the dicts returned by get_candidate_generations.
+        """
+        ...
+
+
+class MultiCandidateAgentArgs(AgentArgs):
+    def make_agent(self) -> MultiCandidateAgent: ...
+
+    def __post_init__(self):
+        """Prefix this args object's agent_name with 'MC-' (only when it is set)."""
+        super().__post_init__()
+        if hasattr(self, 'agent_name') and self.agent_name:
+            self.agent_name = "MC-" + self.agent_name
diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
new file mode 100644
index 00000000..220ca0df
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
@@ -0,0 +1,378 @@
+import base64
+import copy
+import io
+import re
+from dataclasses import Field, asdict, dataclass
+from typing import Dict, List
+
+import bgym
+import numpy as np
+from PIL import Image
+
+from agentlab.agents import dynamic_prompting as dp
+from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs
+from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt
+from agentlab.agents.hilt_agent.hint_labelling import (
+    HintLabeling,
+    HintLabelingInputs,
+)
+from agentlab.analyze import overlay_utils
+from agentlab.llm.llm_utils import (
+    Discussion,
+    HumanMessage,
+    SystemMessage,
+)
+from agentlab.llm.tracking import cost_tracker_decorator
+from browsergym.experiments.agent import AgentInfo
+
+
+class CandidatesGeneration(dp.PromptElement):
+    """Prompt asking the LLM for numbered candidates, each with <think> and <action>."""
+    def __init__(self, hint: list[str] | None = None, n_candidates: int = 3) -> None:
+        self.hint = hint
+        self.n_candidates = n_candidates
+        self.hint_prompt = "\n".join(f"{i}. {c}" for i, c in enumerate(hint, 1)) if hint else ""
+        super().__init__(True)
+        self._prompt = [
+            dict(
+                type="text",
+                text=f"""
+    You are a web agent. Propose {self.n_candidates} alternative next steps for the current page.
+    {('Use the Hints:' + self.hint_prompt) if self.hint else ""}\n
+    Return EACH candidate wrapped as numbered tags:
+    <candidate_generation_1>...</candidate_generation_1>
+    <candidate_generation_2>...</candidate_generation_2>
+
+    Inside every candidate you MUST include:
+    <think>...why this action is appropriate now...</think>
+    <action>...ONE atomic, executable action string...</action>
+
+    Do not include any extra text outside the candidate tags.
+    Use this format:
+    <candidate_generation_1>
+    <think>Explain why Candidate One is chosen</think>
+    <action>Candidate One Action</action>
+    </candidate_generation_1>
+
+    <candidate_generation_2>
+    <think>Explain why Candidate Two is chosen</think>
+    <action>Candidate Two Action</action>
+    </candidate_generation_2>
+    # Example 
+    <candidate_generation_1>
+    <think>The login button is visible and proceeding will reveal the auth form.</think>
+    <action>click(role="button", name="Log in")</action>
+    </candidate_generation_1>
+
+    <candidate_generation_2>
+    <think>User might need to enter email first; the email field is focused and visible.</think>
+    <action>fill(bid="a112", text="user@example.com")</action>
+    </candidate_generation_2>
+    """,
+            )
+        ]
+
+    # Regex patterns for numbered candidates only (closing tag must repeat the opening index)
+    _NUM_BLOCK = re.compile(
+        r"<\s*candidate[_ ]generation[_ ](?P<idx>[0-9]+)\s*>(?P<body>.*?)<\s*/\s*candidate[_ ]generation[_ ](?P=idx)\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+    _THINK_PATTERN = re.compile(
+        r"<\s*think\s*>(?P<think>.*?)<\s*/\s*think\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+    _ACTION_PATTERN = re.compile(
+        r"<\s*action\s*>(?P<action>.*?)<\s*/\s*action\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+
+    def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]:
+        """
+        Extract up to n_candidates candidates, using numbered tags only.
+
+        Returns a dict with exactly n_candidates keys. Matched blocks are sorted by tag
+        number and re-keyed by position; unmatched slots keep empty strings:
+        {
+            "candidate_generation_1": {"think": "...", "action": "..."},
+            ...
+        }
+        """
+        result = {
+            f"candidate_generation_{i+1}": {"think": "", "action": ""}
+            for i in range(self.n_candidates)
+        }
+
+        if not isinstance(text_answer, str):
+            return result
+
+        matches: List[re.Match] = list(self._NUM_BLOCK.finditer(text_answer))
+        # Sort by numeric index
+        matches_sorted = sorted(matches, key=lambda m: int(m.group("idx")))
+        for i, m in enumerate(matches_sorted[: self.n_candidates]):
+            body = m.group("body").strip()
+            think_m = self._THINK_PATTERN.search(body)
+            action_m = self._ACTION_PATTERN.search(body)
+            result[f"candidate_generation_{i+1}"] = {
+                "think": (think_m.group("think").strip() if think_m else ""),
+                "action": (action_m.group("action").strip() if action_m else ""),
+            }
+
+        return result
+
+
+def overlay_action(obs, action):
+    """Annotate a copy of obs["screenshot"] with `action`; return it as a base64 PNG string."""
+    act_img = copy.deepcopy(obs["screenshot"])
+    act_img = Image.fromarray(act_img)
+    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
+    return img_to_base_64(act_img)
+
+
+def img_to_base_64(image: Image.Image | np.ndarray) -> str:
+    """Encode a PIL Image or NumPy array as a base64 PNG string."""
+    # NumPy input is wrapped into a PIL Image first; PIL input is used as is.
+    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image
+    png_bytes = io.BytesIO()
+    pil_image.save(png_bytes, format="PNG")
+    encoded = base64.b64encode(png_bytes.getvalue())
+    return encoded.decode("utf-8")
+
+
+@dataclass
+class MultipleProposalGenericAgentArgs(GenericAgentArgs):
+    """Args building a MultipleProposalGenericAgent from chat_model_args, flags and max_retry."""
+    def make_agent(self) -> "MultipleProposalGenericAgent":
+        return MultipleProposalGenericAgent(
+            chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry
+        )
+    
+    def __post_init__(self):
+        """Prefix this args object's agent_name with 'HILT-' (only when it is set)."""
+        super().__post_init__()
+        if hasattr(self, 'agent_name') and self.agent_name:
+            self.agent_name = "HILT-" + self.agent_name
+
+
+class MultipleProposalGenericAgent(GenericAgent):
+    """GenericAgent variant that shows LLM-proposed candidate actions in a UI for a human to pick."""
+    def __init__(
+        self,
+        chat_model_args,
+        flags,
+        max_retry: int = 4,
+    ):
+        super().__init__(chat_model_args, flags, max_retry)
+        self.ui = None  # HintLabeling UI, created lazily on the first get_action call
+
+    def get_candidate_generation(
+        self,
+        sys_prompt: SystemMessage,
+        human_prompt: HumanMessage,
+        hint: list[str] | None = None,
+        n_candidates=3,
+    ) -> tuple[Dict[str, Dict[str, str]], Discussion]:
+        """Ask the LLM for candidates; return them plus the Discussion without the candidates prompt."""
+        cg = CandidatesGeneration(hint=hint, n_candidates=n_candidates)
+        candidates_prompt = HumanMessage(cg.prompt)
+        chat_messages = Discussion([sys_prompt, human_prompt, candidates_prompt])
+        output = self.chat_llm(chat_messages)
+        candidates = cg._parse_answer(output["content"])
+        self.step_n_human_intervention_rounds += 1
+        msg_to_add_to_xray = Discussion([sys_prompt, human_prompt])
+
+        return candidates, msg_to_add_to_xray
+
+    @cost_tracker_decorator
+    def get_action(self, obs):
+        """Propose candidates, let the human reprompt or pick one in the UI, return (action, AgentInfo)."""
+        step_hint = []
+        self.step_n_human_intervention_rounds = 0
+        self.obs_history.append(obs)
+        main_prompt = MainPrompt(
+            action_set=self.action_set,
+            obs_history=self.obs_history,
+            actions=self.actions,
+            memories=self.memories,
+            thoughts=self.thoughts,
+            previous_plan=self.plan,
+            step=self.plan_step,
+            flags=self.flags,
+        )
+
+        max_prompt_tokens, max_trunc_itr = self._get_maxes()
+
+        system_prompt = SystemMessage(dp.SystemPrompt().prompt)
+
+        human_prompt = dp.fit_tokens(
+            shrinkable=main_prompt,
+            max_prompt_tokens=max_prompt_tokens,
+            model_name=self.chat_model_args.model_name,
+            max_iterations=max_trunc_itr,
+            additional_prompts=system_prompt,
+        )
+        # Initialize UI once outside the loop
+        if self.ui is None:
+            self.ui = HintLabeling(headless=False)
+            # Show initial waiting state
+            initial_inputs = HintLabelingInputs(
+                goal=(
+                    obs.get("goal_object", [{}])[0].get("text", "")
+                    if obs.get("goal_object")
+                    else ""
+                ),
+                error_feedback="",
+                screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
+                screenshots=[],  # no overlay screenshots yet
+                axtree=obs.get("axtree_txt", ""),
+                history=[],
+                hint="",
+                suggestions=[],  # no suggestions yet
+            )
+            self.ui.update_context(initial_inputs)
+
+        # Generate first candidates
+        candidates, chat_messages = self.get_candidate_generation(
+            sys_prompt=system_prompt,
+            human_prompt=human_prompt,
+            hint=step_hint if step_hint else None,
+        )
+        suggestions = [
+            {
+                "id": key.split("_")[-1],
+                "action": candidate["action"],
+                "think": candidate["think"],
+            }
+            for key, candidate in candidates.items()
+        ]
+        # List of Images as base64 - create overlay screenshots for each suggestion
+        screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
+
+        while True:
+            try:
+                hint_labeling_inputs = HintLabelingInputs(
+                    goal=(
+                        obs.get("goal_object", [{}])[0].get("text", "")
+                        if obs.get("goal_object")
+                        else ""
+                    ),
+                    error_feedback=obs.get("last_action_error", ""),
+                    screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
+                    screenshots=screenshots,  # list of overlay screenshots for hover
+                    axtree=obs.get("axtree_txt", ""),
+                    history=[],  # TODO: add history
+                    hint=(
+                        "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1))
+                        if step_hint
+                        else ""
+                    ),
+                    suggestions=suggestions,
+                )
+
+                self.ui.update_context(hint_labeling_inputs)
+                response = self.ui.wait_for_response(timeout=300)
+
+                if response["type"] == "reprompt":
+                    hint = response["payload"]["hint"]
+                    step_hint.append(hint)
+                    candidates, chat_messages = self.get_candidate_generation(
+                        sys_prompt=system_prompt,
+                        human_prompt=human_prompt,
+                        hint=step_hint if step_hint else None,
+                    )
+                    suggestions = [
+                        {
+                            "id": key.split("_")[-1],
+                            "action": candidate["action"],
+                            "think": candidate["think"],
+                        }
+                        for key, candidate in candidates.items()
+                    ]
+                    # Regenerate screenshots for new suggestions
+                    screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
+                    # Continue the loop to show new suggestions
+                elif response["type"] == "step":
+                    selected_action = response["payload"]["action"]
+                    choice_idx = None
+                    for i, candidate in enumerate(suggestions, 1):
+                        if candidate["action"] == selected_action:
+                            choice_idx = i
+                            break
+                    if choice_idx is None:
+                        choice_idx = 1
+                    ans_dict = candidates[f"candidate_generation_{choice_idx}"]
+                    break
+                else:
+                    ans_dict = candidates["candidate_generation_1"]
+                    break
+
+            except KeyboardInterrupt:
+                print("User cancelled the operation")
+                if self.ui:
+                    self.ui.close()
+                raise
+            except Exception as e:
+                print(f"Error in human intervention UI: {e}")
+                if self.ui:
+                    self.ui.close()
+                    self.ui = None
+                # Raise exception instead of falling back to console input
+                raise RuntimeError(f"Human intervention UI failed: {e}") from e
+
+        # TODO: Refactor as discussed with ALAC.
+        stats = self.chat_llm.get_stats()
+        self.plan = ans_dict.get("plan", self.plan)
+        self.plan_step = ans_dict.get("step", self.plan_step)
+        self.actions.append(ans_dict["action"])
+        self.memories.append(ans_dict.get("memory", None))
+        self.thoughts.append(ans_dict.get("think", None))
+        agent_info = AgentInfo(
+            think=ans_dict.get("think", None),
+            chat_messages=chat_messages,
+            stats=stats,
+            extra_info={
+                "chat_model_args": asdict(self.chat_model_args),
+                "step_hints": step_hint,
+                "n_human_intervention_rounds": self.step_n_human_intervention_rounds,
+                "candidates": candidates,
+                "suggestions": suggestions,
+            },
+        )
+        return ans_dict["action"], agent_info
+
+
+def get_base_agent(llm_config: str) -> MultipleProposalGenericAgentArgs:
+    from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+
+    return MultipleProposalGenericAgentArgs(
+        chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config],
+        flags=BASE_FLAGS,
+    )
+
+
+HUMAN_GUIDED_GENERIC_AGENT = get_base_agent("openai/gpt-5-mini-2025-08-07")
+
+if __name__ == "__main__":
+    import logging
+
+    from agentlab.agents.hilt_agent.generic_human_guided_agent import (
+        HUMAN_GUIDED_GENERIC_AGENT,
+    )
+    from agentlab.experiments.study import Study
+
+    agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
+    benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]()
+    benchmark = benchmark.subset_from_glob("task_name", "*book*")
+    benchmark.env_args_list = benchmark.env_args_list[2:3]
+
+    for env_args in benchmark.env_args_list:
+        env_args.max_steps = 100  # max human steps
+        env_args.headless = False
+        # env_args.use_chat_ui = False
+        # env_args.use_hint_labeling_ui = True
+
+    Study(agent_configs, benchmark, logging_level=logging.WARNING).run(
+        n_jobs=1,
+        parallel_backend="sequential",
+        n_relaunch=1,
+    )
diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py
new file mode 100644
index 00000000..6a44489f
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/hilt_agent.py
@@ -0,0 +1,209 @@
+import base64
+import copy
+import io
+from dataclasses import dataclass
+from typing import Optional
+
+import bgym
+import numpy as np
+from PIL import Image
+
+from agentlab.agents.hilt_agent.hint_labelling import (
+    HintLabeling,
+    HintLabelingInputs,
+)
+from agentlab.llm.tracking import cost_tracker_decorator
+from agentlab.analyze import overlay_utils
+from browsergym.experiments.agent import Agent
+from agentlab.agents.agent_args import AgentArgs
+from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent
+
+class HumanInTheLoopAgent(Agent):
+    """Agent that lets a human pick among candidate actions proposed by a subagent."""
+    def __init__(
+        self,
+        subagent_args,  # Type: any object with MultiCandidateAgent interface
+    ):
+        self.subagent: MultiCandidateAgent = subagent_args.make_agent()
+        super().__init__()
+        self.ui = None
+
+    @cost_tracker_decorator
+    def get_action(self, obs):
+        """Show candidate actions in the UI, regenerate on reprompt, and return the chosen one."""
+        step_n_human_intervention_rounds = 0
+        step_hint = []
+
+        # Initialize UI once outside the loop
+        if self.ui is None:
+            self.ui = HintLabeling(headless=False)
+            # Show initial waiting state
+            initial_inputs = HintLabelingInputs(
+                goal=(
+                    obs.get("goal_object", [{}])[0].get("text", "")
+                    if obs.get("goal_object")
+                    else ""
+                ),
+                error_feedback="",
+                screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
+                screenshots=[],  # no overlay screenshots yet
+                axtree=obs.get("axtree_txt", ""),
+                history=[],
+                hint="",
+                suggestions=[],  # no suggestions yet
+            )
+            self.ui.update_context(initial_inputs)
+
+        # Generate first candidates
+        candidates = self.subagent.get_candidate_generations(obs, hint=None, n_candidates=3)
+        step_n_human_intervention_rounds += 1
+        suggestions = [{"action": c["action"], "think": c["agent_info"].think} for c in candidates]
+        # List of Images as base64 - create overlay screenshots for each suggested action
+        screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
+
+        while True:
+            try:
+                hint_labeling_inputs = HintLabelingInputs(
+                    goal=(
+                        obs.get("goal_object", [{}])[0].get("text", "")
+                        if obs.get("goal_object")
+                        else ""
+                    ),
+                    error_feedback=obs.get("last_action_error", ""),
+                    screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
+                    screenshots=screenshots,  # list of overlay screenshots for hover
+                    axtree=obs.get("axtree_txt", ""),
+                    history=[],  # TODO: add history
+                    hint=(
+                        "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1))
+                        if step_hint
+                        else ""
+                    ),
+                    suggestions=suggestions,
+                )
+
+                self.ui.update_context(hint_labeling_inputs)
+                response = self.ui.wait_for_response(timeout=300)
+
+                if response["type"] == "reprompt":
+                    hint = response["payload"]["hint"]
+                    step_hint.append(hint)
+                    candidates = self.subagent.get_candidate_generations(
+                        obs,
+                        hint=step_hint if step_hint else None,
+                        n_candidates=3
+                    )
+                    suggestions = [{"action": c["action"], "think": c["agent_info"].think} for c in candidates]
+                    screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
+
+                elif response["type"] == "step":
+                    selected_action = response["payload"]["action"]
+                    # Map the submitted action back to its candidate. If it matches no
+                    # suggestion, fall back to the first candidate instead of indexing
+                    # the list with None (which raised TypeError).
+                    actions = [s["action"] for s in suggestions]
+                    choice_idx = actions.index(selected_action) if selected_action in actions else 0
+                    selected_candidate = candidates[choice_idx]
+                    self.subagent.update_agent_state_from_selected_candidate(selected_candidate)
+                    action = selected_candidate["action"]
+                    agent_info = selected_candidate["agent_info"]
+                    return action, agent_info
+
+            except KeyboardInterrupt:
+                print("User cancelled the operation")
+                if self.ui:
+                    self.ui.close()
+                raise
+            except Exception as e:
+                print(f"Error in human intervention UI: {e}")
+                if self.ui:
+                    self.ui.close()
+                    self.ui = None
+                # Raise exception instead of falling back to console input
+                raise RuntimeError(f"Human intervention UI failed: {e}") from e
+
+
+@dataclass
+class HumanInTheLoopAgentArgs(AgentArgs):
+    subagent_args: Optional[AgentArgs] = None  # args for the underlying multiple proposal agent
+    
+
+    def make_agent(self) -> HumanInTheLoopAgent:
+        assert self.subagent_args is not None
+        return HumanInTheLoopAgent(subagent_args=self.subagent_args)
+
+    def __post_init__(self):
+        """Set agent_name to 'HILT-' + the subagent's agent_name, when the subagent has one."""
+        super().__post_init__()
+        if self.subagent_args and self.subagent_args.agent_name:
+            self.agent_name = "HILT-" + self.subagent_args.agent_name
+    
+    def set_benchmark(self, benchmark, demo_mode):
+        """Delegate set_benchmark to the subagent if it has the method."""
+        if hasattr(self.subagent_args, 'set_benchmark'):
+            self.subagent_args.set_benchmark(benchmark, demo_mode)
+    
+    def set_reproducibility_mode(self):
+        """Delegate set_reproducibility_mode to the subagent if it has the method."""
+        if hasattr(self.subagent_args, 'set_reproducibility_mode'):
+            self.subagent_args.set_reproducibility_mode()
+
+
+def overlay_action(obs, action):
+    """Annotate a copy of obs["screenshot"] with `action`; return it as a base64 PNG string."""
+    act_img = copy.deepcopy(obs["screenshot"])
+    act_img = Image.fromarray(act_img)
+    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
+    return img_to_base_64(act_img)
+
+
+def img_to_base_64(image: Image.Image | np.ndarray) -> str:
+    """Encode a PIL Image or NumPy array as a base64 PNG string."""
+    # NumPy input is wrapped into a PIL Image first; PIL input is used as is.
+    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image
+    png_bytes = io.BytesIO()
+    pil_image.save(png_bytes, format="PNG")
+    encoded = base64.b64encode(png_bytes.getvalue())
+    return encoded.decode("utf-8")
+
+def get_base_human_in_the_loop_genericagent(llm_config: str) -> HumanInTheLoopAgentArgs:
+    from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+    from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs
+    from agentlab.agents.hilt_agent.multi_candidate_generic_agent import (
+        MultiCandidateGenericAgentArgs,
+    )
+
+    return HumanInTheLoopAgentArgs(
+        subagent_args = MultiCandidateGenericAgentArgs(
+            chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config],
+            flags=BASE_FLAGS,
+        )
+    )
+
+
+HUMAN_GUIDED_GENERIC_AGENT = get_base_human_in_the_loop_genericagent("openai/gpt-5-mini-2025-08-07")
+
+if __name__ == "__main__":
+    import logging
+
+    from agentlab.agents.hilt_agent.hilt_agent import (
+        HUMAN_GUIDED_GENERIC_AGENT,
+    )
+    from agentlab.experiments.study import Study
+
+    agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
+    benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]()
+    benchmark = benchmark.subset_from_glob("task_name", "*book*")
+    benchmark.env_args_list = benchmark.env_args_list[2:3]
+
+    for env_args in benchmark.env_args_list:
+        env_args.max_steps = 100  # max human steps
+        env_args.headless = False
+
+
+    Study(agent_configs, benchmark, logging_level=logging.WARNING).run(
+        n_jobs=1,
+        parallel_backend="sequential",
+        n_relaunch=1,
+    )
diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hilt_agent/hint_labelling.py
new file mode 100644
index 00000000..6e293781
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/hint_labelling.py
@@ -0,0 +1,153 @@
+import json
+import logging
+from importlib import resources
+from queue import Queue
+from typing import Dict, List, Optional
+
+import playwright.sync_api
+from pydantic import BaseModel, Field
+
+from agentlab.agents.hilt_agent import hint_labelling_ui_files
+from browsergym.core import _get_global_playwright
+
+logger = logging.getLogger(__name__)
+
+HINT_LABELING_DIR = resources.files(hint_labelling_ui_files)
+
+
+class HintLabelingInputs(BaseModel):
+    goal: str  # goal text for the current task
+    error_feedback: str = ""  # error reported for the last action, if any
+    screenshot: str  # base64 screenshot (original/current)
+    screenshots: List[str] = Field(default_factory=list)  # list of base64 screenshots for hover
+    axtree: str  # textual accessibility tree of the page
+    history: List[Dict[str, str]] = Field(default_factory=list)  # not populated by callers yet
+    hint: str = ""  # hints given so far in this step, as one numbered text block
+    suggestions: List[Dict[str, str]] = Field(default_factory=list)  # dicts with "action"/"think"
+
+
+class HintLabeling:
+    """Playwright-driven browser window hosting the hint labeling UI."""
+    def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs):
+        """Launch Chromium, load the UI page and register the /api/* route handlers."""
+        pw: playwright.sync_api.Playwright = _get_global_playwright()
+        self.browser = pw.chromium.launch(
+            headless=headless, args=[f"--window-size={window_size[0]},{window_size[1]}"]
+        )
+        self.context = self.browser.new_context(no_viewport=True)
+        self.page = self.context.new_page()
+        self._resp_queue: "Queue[dict]" = Queue()
+
+        self.page.route("**/api/reprompt", self._route_reprompt)
+        self.page.route("**/api/submit", self._route_submit)
+        self.page.set_content(get_hint_labeling_ui(HINT_LABELING_DIR))
+
+        # internal state
+        self._context: Optional[HintLabelingInputs] = None
+        self._running = False
+
+    def _route_reprompt(
+        self, route: playwright.sync_api.Route, request: playwright.sync_api.Request
+    ):
+        """Handle POST /api/reprompt: queue the submitted hint and reply with minimal JSON."""
+        logger.info("Route hit: %s %s", request.method, request.url)
+        try:
+            # `post_data` is a property in Playwright's Python API; calling it always raised.
+            body = json.loads(request.post_data or "{}")
+        except Exception:
+            body = {}
+        msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}}
+        self._resp_queue.put(msg)
+        # Respond something minimal so UI doesn't break; it will be refreshed by a later update_context()
+        route.fulfill(
+            status=200,
+            content_type="application/json",
+            body=json.dumps({"suggestions": []}),
+        )
+
+    def _route_submit(self, route: playwright.sync_api.Route, request: playwright.sync_api.Request):
+        """Handle POST /api/submit: queue the submitted think/action as a "step" message."""
+        logger.info("Route hit: %s %s", request.method, request.url)
+        try:
+            body = json.loads(request.post_data or "{}")
+        except Exception:
+            body = {}
+        msg = {
+            "type": "step",
+            "payload": {
+                "think": body.get("think", ""),
+                "action": body.get("action", ""),
+            },
+        }
+        self._resp_queue.put(msg)
+        # UI expects 200 JSON; we can optionally send new suggestions here too.
+        route.fulfill(
+            status=200,
+            content_type="application/json",
+            body=json.dumps({"suggestions": []}),
+        )
+
+    def _to_ui_bootstrap(self, ctx: HintLabelingInputs) -> dict:
+        """Convert `ctx` into the plain dict passed to the page's updateContext()."""
+        return {
+            "goal": ctx.goal,
+            "error_feedback": ctx.error_feedback,
+            "screenshot": ctx.screenshot,
+            "screenshots": ctx.screenshots,  # list of screenshots for hover
+            "axtree": ctx.axtree,
+            "history": ctx.history,
+            "hint": ctx.hint,
+            "suggestions": ctx.suggestions,
+        }
+
+    def update_context(self, context: HintLabelingInputs):
+        """Store `context` and pass it as an argument to the page's JS updateContext()."""
+        self._context = context
+        ui_payload = self._to_ui_bootstrap(context)
+        self.page.evaluate("(d) => updateContext(d)", ui_payload)
+
+    def wait_for_response(self, timeout: Optional[float] = 600) -> dict:
+        """
+        Wait until the page makes a request to /api/reprompt or /api/submit,
+        then parse the request body and return it in your schema.
+        """
+        logger.info("Waiting for response from Hint Labeling UI...")
+
+        def is_api(req: playwright.sync_api.Request) -> bool:
+            u = req.url
+            return (
+                u.endswith("/api/reprompt") or u.endswith("/api/submit")
+            ) and req.method == "POST"
+
+        # This pumps Playwright internally; no busy waiting. `timeout` is in seconds; 0 disables it.
+        timeout_ms = timeout * 1000 if timeout else 0
+        with self.page.expect_request(is_api, timeout=timeout_ms) as req_info:
+            req = req_info.value
+
+        body_text = req.post_data or "{}"
+        try:
+            body = json.loads(body_text)
+        except Exception as e:
+            logger.warning("JSON parse error: %s", e)
+            body = {}
+
+        if req.url.endswith("/api/reprompt"):
+            msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}}
+        else:
+            msg = {
+                "type": "step",
+                "payload": {"think": body.get("think", ""), "action": body.get("action", "")},
+            }
+        logger.info("Response received: %s", msg)
+        return msg
+
+    def close(self):
+        """Close the browser context, then the browser."""
+        self.context.close()
+        self.browser.close()
+
+
+def get_hint_labeling_ui(hint_labeling_dir) -> str:
+    """Return the UTF-8 text of hint_labeling_ui.html in `hint_labeling_dir` (Path or Traversable)."""
+    ui_file = hint_labeling_dir / "hint_labeling_ui.html"
+    return ui_file.read_text(encoding="utf-8")
diff --git a/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html
new file mode 100644
index 00000000..3371c3cd
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html
@@ -0,0 +1,419 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <base href="http://route.local/"><!-- NEW: gives fetch a resolvable origin -->
+  <title>Agent Reprompt UI</title>
+  <style>
+    :root{
+      --bg:#f4f6f8; --card:#fff; --muted:#6b7280; --text:#0f172a; --brand:#2563eb; --accent:#10b981; --danger:#ef4444; --border:#e5e7eb;
+    }
+    *{box-sizing:border-box}
+    body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)}
+    .container{max-width:1100px;margin:24px auto;padding:0 16px}
+
+    .grid{
+      display:grid;gap:16px;
+      grid-template-columns: 1fr 1fr;
+    }
+    .card{background:var(--card);border:1px solid var(--border);border-radius:16px;box-shadow:0 2px 6px rgba(0,0,0,.05)}
+    .card h2{margin:0 0 8px 0;font-size:14px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted)}
+    .pad{padding:16px}
+
+    .tabs{display:flex;gap:8px;padding:8px 8px 0}
+    .tab{border:none;background:transparent;padding:10px 14px;border-radius:12px 12px 0 0;cursor:pointer;font-weight:600;color:var(--muted)}
+    .tab.active{background:var(--card);border:1px solid var(--border);border-bottom:none;color:var(--text)}
+    .tabpanel{border-top:1px solid var(--border)}
+
+    .screenshot{width:800px;height:450px;object-fit:contain;background:#0000000d;border-radius:8px}
+    .axtree{width:100%;height:520px;resize:none;border:none;padding:12px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace;background:#0b10241a}
+
+    .hints-row{display:grid;grid-template-columns: 1fr 140px;gap:12px;align-items:start}
+    textarea.hint{width:100%;min-height:120px;resize:vertical;padding:12px;border:1px solid var(--border);border-radius:12px;font-size:14px}
+    .btn{display:inline-flex;align-items:center;justify-content:center;gap:8px;border:none;border-radius:12px;padding:12px 16px;font-weight:600;cursor:pointer}
+    .btn-primary{background:var(--brand);color:#fff}
+    .btn-primary[disabled]{opacity:.6;cursor:not-allowed}
+    .btn-ghost{background:transparent;border:1px solid var(--border)}
+
+    .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px}
+    .choice{display:grid;grid-template-columns:32px 1fr;gap:12px;align-items:start;background:#ffffff;border:1px solid var(--border);border-radius:14px;padding:12px}
+    .choice.selected{border:2px solid var(--accent);background:#f0fdf4}
+    .choice.disabled{opacity:0.5;pointer-events:none}
+    .choice input[type="radio"]{margin-top:6px;width:18px;height:18px}
+    .choice .action{font-weight:800}
+    .choice .row{display:flex;gap:6px;flex-wrap:wrap}
+    .choice .label{font-weight:700}
+    .choice .value{color:#0f172a}
+    .choice .reason{font-size:13px;color:#111827}
+
+    .footer{display:flex;justify-content:flex-end;gap:12px;margin-top:10px}
+
+    .banner{margin:12px 0;padding:10px 12px;border-radius:10px;font-size:14px}
+    .banner.info{background:#dbeafe;border:1px solid #bfdbfe}
+    .banner.error{background:#fee2e2;border:1px solid #fecaca;color:#991b1b}
+
+    .pill{display:inline-block;padding:6px 10px;border-radius:999px;background:#f1f5f9;color:#0f172a;border:1px solid var(--border);font-size:12px}
+
+    @media (max-width: 900px){
+      .grid{grid-template-columns: 1fr}
+      .axtree{height:420px}
+      .screenshot{width:100%;height:auto;max-width:800px}
+    }
+
+    #goalBox,
+    #errorBox {
+      white-space: pre-wrap;     /* respects \n; collapses multiple spaces nicely */
+      overflow-wrap: anywhere;   /* wrap very long tokens (URLs/unbroken text) */
+      word-break: break-word;    /* fallback for older engines */
+    }
+
+    .progress-area {
+      animation: pulse 2s infinite;
+    }
+
+    @keyframes pulse {
+      0%, 100% { opacity: 1; }
+      50% { opacity: 0.7; }
+    }
+  </style>
+</head>
+<body>
+  <div class="container">
+    <!-- Top: Goal & Error -->
+    <div class="grid" style="display: flex; gap: 16px;">
+      <div class="card pad" style="flex: 1; height: 150px;">
+        <h2>Goal</h2>
+        <div id="goalBox" style="padding: 12px 14px; font-size: 15px; background: #f8fafc; height: 80%; overflow-y: auto;"></div>
+      </div>
+      <div class="card pad" style="flex: 1; height: 150px;">
+        <h2>Error Feedback</h2>
+        <div id="errorBox" style="padding: 12px 14px; font-size: 15px; background: #fef2f2; height: 80%; overflow-y: auto;"></div>
+      </div>
+    </div>
+
+    <!-- Middle: Tabs -->
+    <div class="card" style="margin-top:16px">
+      <div class="tabs">
+        <button class="tab active" data-tab="screenshot">Screenshot</button>
+        <button class="tab" data-tab="axtree">AxTree</button>
+        <button class="tab" data-tab="history">History</button>
+      </div>
+      <div class="pad tabpanel">
+        <div id="tab-screenshot" class="tabcontent">
+          <img id="screenshotImg" alt="screenshot" class="screenshot" />
+        </div>
+        <div id="tab-axtree" class="tabcontent" hidden>
+          <textarea id="axtreeArea" class="axtree" readonly style="font-size: 12px; white-space: pre; overflow-wrap: normal;"></textarea>
+        </div>
+        <div id="tab-history" class="tabcontent" hidden>
+          <!-- intentionally empty for now -->
+          <div class="banner info">History will appear here.</div>
+        </div>
+      </div>
+    </div>
+
+    <!-- Hints & Reprompt -->
+    <div class="card pad" style="margin-top:16px">
+      <h2>Hints</h2>
+      <textarea id="hintInput" class="hint" placeholder="Type guidance for the next reprompt…" style="width: 100%;"></textarea>
+      <button id="repromptBtn" class="btn btn-primary" title="Send hint to get refreshed suggestions" style="margin-top: 12px;">Reprompt with Hint</button>
+      <div id="repromptStatus" class="banner info" style="display:none"></div>
+    </div>
+
+    <!-- Suggestions / Radio list -->
+    <div class="card pad" style="margin-top:16px">
+      <h2>Suggestions</h2>
+      <div id="choices" class="choices"></div>
+      <div id="choicesNote" class="banner info" style="display:none" title="Hover to see more details"></div>
+      <div class="footer">
+        <button id="submitBtn" class="btn btn-primary" disabled title="Select an action to enable">Send Action</button>
+      </div>
+      <div id="submitStatus" class="banner info" style="display:none" title="Hover to see submission status"></div>
+    </div>
+
+    <!-- Progress/Status Area - Hidden by default, shown only when there's status -->
+    <div id="progressContainer" class="card pad" style="margin-top:16px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3); display: none;">
+      <h2 style="color: white; margin-bottom: 8px;">Status</h2>
+      <div id="progressArea" class="progress-area" style="font-size: 16px; font-weight: 600; text-align: center; padding: 16px; background: rgba(255, 255, 255, 0.1); border-radius: 12px; min-height: 60px; display: flex; align-items: center; justify-content: center;">
+        Waiting for first response...
+      </div>
+    </div>
+  </div>
+
+  <script>
+    /**
+     * Bootstrapping contract
+     * You can overwrite window.__BOOTSTRAP_DATA__ from your server-side template.
+     * Fields:
+     *   goal: string
+     *   error_feedback: string
+     *   screenshot: base64 string (no data: prefix required)
+     *   screenshots: Array<string> - list of base64 screenshots for hover (same length as suggestions)
+     *   axtree: string
+     *   hint: string
+     *   suggestions: Array<{ action: string, think: string, id?: string }>
+     */
+    window.__BOOTSTRAP_DATA__ = window.__BOOTSTRAP_DATA__ || {
+      goal: "go to the hardware catalog store and order a developer laptop",
+      error_feedback: "playwright error when clicking on something that is not visible (from the previous step)",
+      screenshot: "", // fill with base64 (PNG/JPG). When empty, we show a placeholder.
+      screenshots: [], // list of base64 screenshots for hover
+      axtree: "<root>\n  <window name=\"VITASPHERE\">…</window>\n</root>",
+      history: [],
+      hint: "",
+      suggestions: [
+        { id: "1", action: "click(\"42\")", think: "The button with id 42 advances the form." },
+        { id: "2", action: "type(\"Assigned to\", \"John Doe\")", think: "Fills the assignee field before submission." },
+        { id: "3", action: "open(\"/hardware-catalog\")", think: "Navigate directly to the catalog page." }
+      ]
+    };
+
+    var RECEIVED_RESPONSE = false;
+    var originalScreenshot = ""; // store original screenshot
+    var hoverScreenshots = []; // store screenshots for hover
+    var hoverEnabled = true; // track if hover behavior is enabled
+
+    function applyContext(d){
+      goalBox.textContent = d.goal || '';
+      errorBox.textContent = d.error_feedback || '';
+      originalScreenshot = d.screenshot || '';
+      hoverScreenshots = Array.isArray(d.screenshots) ? d.screenshots : [];
+      screenshotImg.src = dataUrlFromBase64(originalScreenshot);
+      axtreeArea.value = d.axtree || '';
+      if (Array.isArray(d.suggestions)) {
+        renderSuggestions(d.suggestions);
+      }
+      // keep the hint textarea in sync only if it's currently empty,
+      // so we don't clobber user typing
+      if (!hintInput.value) hintInput.value = d.hint || '';
+    }
+
+    // Called from Python via page.evaluate("(d) => updateContext(d)", payload) to push new data.
+    function updateContext(data){
+      window.__BOOTSTRAP_DATA__ = data || {};
+      applyContext(window.__BOOTSTRAP_DATA__);
+    }
+
+    // Endpoint paths; the Python side matches POST requests by these URL suffixes.
+    const ENDPOINTS = {
+      REPROMPT: "/api/reprompt",   // expects POST {hint} -> returns {suggestions: [...]} 
+      SUBMIT: "/api/submit"        // expects POST {hint, action, think, id?} -> returns {suggestions?: [...]} (optional)
+    };
+
+    // DOM references
+    const goalBox = document.getElementById('goalBox');
+    const errorBox = document.getElementById('errorBox');
+    const screenshotImg = document.getElementById('screenshotImg');
+    const axtreeArea = document.getElementById('axtreeArea');
+    const hintInput = document.getElementById('hintInput');
+    const repromptBtn = document.getElementById('repromptBtn');
+    const repromptStatus = document.getElementById('repromptStatus');
+    const choicesEl = document.getElementById('choices');
+    const choicesNote = document.getElementById('choicesNote');
+    const submitBtn = document.getElementById('submitBtn');
+    const submitStatus = document.getElementById('submitStatus');
+    const progressArea = document.getElementById('progressArea');
+    const progressContainer = document.getElementById('progressContainer');
+
+    // State
+    let currentSuggestions = [];
+    let selectedId = null;
+
+    // Helpers
+    function setVisible(el, visible){ el.style.display = visible ? '' : 'none'; }
+    function setBanner(el, text, variant='info'){ el.className = `banner ${variant}`; el.textContent = text; setVisible(el,true); }
+    function updateProgress(message, showAnimation = true) {
+      progressArea.textContent = message;
+      if (showAnimation) {
+        progressArea.style.animation = 'pulse 2s infinite';
+      } else {
+        progressArea.style.animation = 'none';
+      }
+      // Show the progress container when there's a message
+      setVisible(progressContainer, true);
+    }
+    function hideProgress() {
+      setVisible(progressContainer, false);
+    }
+
+    function dataUrlFromBase64(b64){
+      if(!b64) return 'data:image/svg+xml;charset=utf-8,' + encodeURIComponent(`<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"1600\" height=\"900\"><rect width=\"100%\" height=\"100%\" fill=\"#eef2ff\"/><text x=\"50%\" y=\"50%\" font-family=\"sans-serif\" font-size=\"24\" text-anchor=\"middle\" fill=\"#64748b\">No screenshot provided</text></svg>`);
+      // Sniff the base64 magic prefix: PNG data starts with "iVBOR"; JPEG ("/9j/") is the default.
+      const pref = b64.trim().startsWith('iVBOR') ? 'image/png' : 'image/jpeg';
+      return `data:${pref};base64,${b64}`;
+    }
+
+    function renderSuggestions(suggestions){
+      currentSuggestions = suggestions.slice(0,5); // cap at 5
+      choicesEl.innerHTML = '';
+      selectedId = null;
+      submitBtn.disabled = true;
+      hoverEnabled = true; // Re-enable hover when new suggestions are rendered
+
+      // Hide progress when new suggestions arrive - user is ready for interaction
+      hideProgress();
+
+      if(currentSuggestions.length === 0){
+        setBanner(choicesNote, 'No suggestions yet. Please Wait..');
+        return;
+      }
+      setVisible(choicesNote,false);
+
+      currentSuggestions.forEach((sugg, idx)=>{
+        const id = sugg.id || String(idx+1);
+        const wrapper = document.createElement('label');
+        wrapper.className = 'choice';
+        wrapper.setAttribute('for', `choice-${id}`);
+
+        // Add hover event listeners for screenshot changes
+        const screenshotForThisChoice = hoverScreenshots[idx] || originalScreenshot;
+        wrapper.addEventListener('mouseenter', () => {
+          if (hoverEnabled && screenshotForThisChoice && screenshotForThisChoice !== originalScreenshot) {
+            screenshotImg.src = dataUrlFromBase64(screenshotForThisChoice);
+          }
+        });
+        wrapper.addEventListener('mouseleave', () => {
+          if (hoverEnabled) {
+            screenshotImg.src = dataUrlFromBase64(originalScreenshot);
+          }
+        });
+
+        const radio = document.createElement('input');
+        radio.type = 'radio';
+        radio.name = 'choice';
+        radio.id = `choice-${id}`;
+        radio.value = id;
+        radio.addEventListener('change', ()=>{ selectedId = id; submitBtn.disabled = false; });
+
+        const box = document.createElement('div');
+        const actionRow = document.createElement('div');
+        actionRow.className = 'row';
+        const actionLabel = document.createElement('span');
+        actionLabel.className = 'label action';
+        actionLabel.textContent = '';
+        const actionVal = document.createElement('span');
+        actionVal.className = 'value action';
+        actionVal.textContent = `${sugg.action}`;
+        actionRow.appendChild(actionLabel); actionRow.appendChild(actionVal);
+
+        const reasonRow = document.createElement('div');
+        reasonRow.className = 'row reason';
+        const reasonLabel = document.createElement('span');
+        reasonLabel.className = 'label';
+        reasonLabel.textContent = 'reasoning:';
+        const reasonVal = document.createElement('span');
+        reasonVal.className = 'value';
+        reasonVal.style.maxHeight = '3em';
+        reasonVal.style.overflowY = 'auto';
+        reasonVal.textContent = ` ${sugg.think}`;
+        reasonRow.appendChild(reasonLabel); reasonRow.appendChild(reasonVal);
+
+        box.appendChild(actionRow);
+        box.appendChild(reasonRow);
+
+        wrapper.appendChild(radio);
+        wrapper.appendChild(box);
+        choicesEl.appendChild(wrapper);
+      });
+    }
+
+    function currentSelection(){
+      // Same id rule as renderSuggestions (explicit id, else 1-based position), one pass, no indexOf.
+      if(!selectedId) return null;
+      return currentSuggestions.find((s, i) => (s.id || String(i + 1)) === selectedId) || null;
+    }
+
+    // Tab logic
+    document.querySelectorAll('.tab').forEach(btn=>{
+      btn.addEventListener('click',()=>{
+        document.querySelectorAll('.tab').forEach(b=>b.classList.remove('active'));
+        btn.classList.add('active');
+        const name = btn.dataset.tab;
+        document.querySelectorAll('.tabcontent').forEach(c=>c.hidden = true);
+        document.getElementById('tab-'+name).hidden = false;
+      });
+    });
+
+    // Actions
+    repromptBtn.addEventListener('click', async ()=>{
+      updateProgress('Requesting new suggestions...', true);
+      try{
+        const res = await fetch(ENDPOINTS.REPROMPT,{
+          method:'POST', headers:{'Content-Type':'application/json'},
+          body: JSON.stringify({ hint: hintInput.value })
+        });
+        // Don't expect a response - the backend will update the UI via updateContext
+        updateProgress('Hint sent. Waiting for new suggestions...', true);
+      }catch(err){
+        updateProgress('Error: ' + String(err), false);
+      } finally{
+        setTimeout(()=>hideProgress(), 2000);
+      }
+    });
+
+    submitBtn.addEventListener('click', async ()=>{
+      const selection = currentSelection();
+      if(!selection){ return; }
+      // Snapshot the payload first: the form reset below clears the hint box, so reading it later would always send "".
+      const payload = { hint: hintInput.value, action: selection.action, think: selection.think, id: selection.id };
+      updateProgress('Submitting selection...', true);
+      submitBtn.disabled = true;
+
+      // Find the index of the selected suggestion to get its screenshot
+      const selectedIndex = currentSuggestions.findIndex(s => (s.id || String(currentSuggestions.indexOf(s) + 1)) === selectedId);
+      const selectedScreenshot = hoverScreenshots[selectedIndex] || originalScreenshot;
+
+      // Show the selected option's screenshot instead of waiting message
+      if (selectedScreenshot) {
+        screenshotImg.src = dataUrlFromBase64(selectedScreenshot);
+      }
+
+      // Apply visual states to options
+      const allChoices = choicesEl.querySelectorAll('.choice');
+      allChoices.forEach((choice, idx) => {
+        const choiceId = currentSuggestions[idx].id || String(idx + 1);
+        if (choiceId === selectedId) {
+          choice.classList.add('selected');
+          choice.classList.remove('disabled');
+        } else {
+          choice.classList.add('disabled');
+          choice.classList.remove('selected');
+        }
+      });
+
+      // Disable hover behavior
+      hoverEnabled = false;
+
+      // Reset UI to ideal state (the submit button was already disabled above)
+      document.querySelectorAll('input[name="choice"]').forEach(r=> r.checked=false);
+      selectedId = null;
+      hintInput.value = '';
+
+      try{
+        await fetch(ENDPOINTS.SUBMIT,{
+          method:'POST', headers:{'Content-Type':'application/json'},
+          body: JSON.stringify(payload)
+        });
+        // Don't expect a response - the backend will handle the selection
+        updateProgress('Selection submitted successfully!', false);
+      }catch(err){
+        updateProgress('Error: ' + String(err), false);
+      } finally{
+        setTimeout(()=>updateProgress('Waiting for LLM response...', false), 5000);
+      }
+    });
+
+    // Initial render from BOOTSTRAP_DATA
+    (function init(){
+      const d = window.__BOOTSTRAP_DATA__;
+      // Route through applyContext so originalScreenshot / hoverScreenshots (read by the hover
+      // handlers) and the hint box are seeded from the bootstrap data as well.
+      applyContext(d);
+      // applyContext renders only when suggestions is an array; otherwise still show the empty-state note.
+      if (!Array.isArray(d.suggestions)) renderSuggestions([]);
+      // Don't show initial progress - only show when there's actual status
+    })();
+  </script>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
new file mode 100644
index 00000000..64c821ef
--- /dev/null
+++ b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
@@ -0,0 +1,216 @@
+import re
+from dataclasses import asdict, dataclass
+from typing import Dict, List
+
+from agentlab.agents import dynamic_prompting as dp
+from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs
+from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt
+from agentlab.llm.llm_utils import Discussion, HumanMessage, SystemMessage
+from browsergym.experiments.agent import AgentInfo
+
+
+class CandidatesGeneration(dp.PromptElement):
+    # Prompt element asking the LLM for alternatives; each candidate must contain <think> and <action>.
+    def __init__(self, hint: list[str] | None = None, n_candidates: int = 3) -> None:
+        self.hint = hint
+        self.n_candidates = n_candidates
+        self.hint_prompt = "\n".join(f"{i}. {c}" for i, c in enumerate(hint, 1)) if hint else ""
+        super().__init__(True)
+        self._prompt = [
+            dict(
+                type="text",
+                text=f"""
+    You are a web agent. Propose {self.n_candidates} alternative next steps for the current page.
+    {('Use the Hints:' + self.hint_prompt) if self.hint else ""}\n
+    Return EACH candidate wrapped as numbered tags:
+    <candidate_generation_1>...</candidate_generation_1>
+    <candidate_generation_2>...</candidate_generation_2>
+
+    Inside every candidate you MUST include:
+    <think>...why this action is appropriate now...</think>
+    <action>...ONE atomic, executable action string...</action>
+
+    Do not include any extra text outside the candidate tags.
+    Use this format:
+    <candidate_generation_1>
+    <think>Explain why Candidate One is chosen</think>
+    <action>Candidate One Action</action>
+    </candidate_generation_1>
+
+    <candidate_generation_2>
+    <think>Explain why Candidate Two is chosen</think>
+    <action>Candidate Two Action</action>
+    </candidate_generation_2>
+    # Example 
+    <candidate_generation_1>
+    <think>The login button is visible and proceeding will reveal the auth form.</think>
+    <action>click(role="button", name="Log in")</action>
+    </candidate_generation_1>
+
+    <candidate_generation_2>
+    <think>User might need to enter email first; the email field is focused and visible.</think>
+    <action>fill(bid="a112", text="user@example.com")</action>
+    </candidate_generation_2>
+    """,
+            )
+        ]
+
+    # Compiled once at class level; tags match case-insensitively and bodies may span lines (DOTALL).
+    _NUM_BLOCK = re.compile(
+        r"<\s*candidate[_ ]generation[_ ](?P<idx>[0-9]+)\s*>(?P<body>.*?)<\s*/\s*candidate[_ ]generation[_ ](?P=idx)\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+    _THINK_PATTERN = re.compile(
+        r"<\s*think\s*>(?P<think>.*?)<\s*/\s*think\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+    _ACTION_PATTERN = re.compile(
+        r"<\s*action\s*>(?P<action>.*?)<\s*/\s*action\s*>",
+        flags=re.IGNORECASE | re.DOTALL,
+    )
+
+    def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]:
+        """Extract up to n_candidates candidates, using numbered tags only.
+
+        Matched blocks are sorted by their numeric tag index and stored positionally, so slot k
+        holds the k-th lowest-numbered block. Non-string input, unmatched slots and missing
+        <think>/<action> tags yield empty strings.
+
+        Returns:
+            A dict with exactly n_candidates entries, keyed "candidate_generation_1", ..., each
+            mapping to {"think": "...", "action": "..."}.
+        """
+        result = {
+            f"candidate_generation_{i+1}": {"think": "", "action": ""}
+            for i in range(self.n_candidates)
+        }
+
+        if not isinstance(text_answer, str):
+            return result
+
+        matches: List[re.Match] = list(self._NUM_BLOCK.finditer(text_answer))
+        # Sort by numeric index
+        matches_sorted = sorted(matches, key=lambda m: int(m.group("idx")))
+        for i, m in enumerate(matches_sorted[: self.n_candidates]):
+            body = m.group("body").strip()
+            think_m = self._THINK_PATTERN.search(body)
+            action_m = self._ACTION_PATTERN.search(body)
+            result[f"candidate_generation_{i+1}"] = {
+                "think": (think_m.group("think").strip() if think_m else ""),
+                "action": (action_m.group("action").strip() if action_m else ""),
+            }
+
+        return result
+
+
+class MultiCandidateGenericAgent(GenericAgent):
+    """GenericAgent that proposes several candidate actions per step instead of one."""
+
+    def __init__(
+        self,
+        chat_model_args,
+        flags,
+        max_retry: int = 4,
+    ):
+        super().__init__(chat_model_args, flags, max_retry)
+
+    def get_candidate_generations(
+        self,
+        obs,
+        hint: list[str] | None = None,
+        n_candidates=3,
+    ) -> list[dict]:
+        """Query the LLM once; return one {"action", "agent_info"} dict per candidate slot."""
+        # Append obs to history only if it's not already the last entry
+        # Important to handle cases when get_candidate_generation is called multiple times in a single step.
+        if not self.obs_history or self.obs_history[-1] is not obs:
+            self.obs_history.append(obs)
+
+        main_prompt = MainPrompt(
+            action_set=self.action_set,
+            obs_history=self.obs_history,
+            actions=self.actions,
+            memories=self.memories,
+            thoughts=self.thoughts,
+            previous_plan=self.plan,
+            step=self.plan_step,
+            flags=self.flags,
+        )
+        max_prompt_tokens, max_trunc_itr = self._get_maxes()
+
+        system_prompt = SystemMessage(dp.SystemPrompt().prompt)
+
+        human_prompt = dp.fit_tokens(
+            shrinkable=main_prompt,
+            max_prompt_tokens=max_prompt_tokens,
+            model_name=self.chat_model_args.model_name,
+            max_iterations=max_trunc_itr,
+            additional_prompts=system_prompt,
+        )
+
+        cg = CandidatesGeneration(hint=hint, n_candidates=n_candidates)
+        candidates_prompt = HumanMessage(cg.prompt)
+        chat_messages = Discussion([system_prompt, human_prompt, candidates_prompt])
+        llm_answer = self.chat_llm(chat_messages)
+        candidates = cg._parse_answer(llm_answer["content"])
+        # Not adding the generate candidate prompt to xray.
+        msg_to_add_to_xray = Discussion([system_prompt, human_prompt])
+        output = []
+        for candidate in candidates.values():
+            agent_info = AgentInfo(
+                think=candidate["think"],
+                chat_messages=msg_to_add_to_xray,
+                stats=self.chat_llm.get_stats(),
+                extra_info={
+                    "chat_model_args": asdict(self.chat_model_args),
+                    "think": candidate["think"],
+                    # Parsed candidates only carry think/action, so these three stay None.
+                    "plan": candidate.get("plan", None),
+                    "step": candidate.get("step", None),
+                    "memory": candidate.get("memory", None),
+                },
+            )
+            output.append({"action": candidate["action"], "agent_info": agent_info})
+
+        return output
+
+    def update_agent_state_from_selected_candidate(self, output):
+        """Updates the agent's internal state based on the selected candidate from human feedback."""
+        action, agent_info = output["action"], output["agent_info"]
+        extra_info = agent_info.extra_info
+        # "plan"/"step" keys are always present but may be None; dict.get's default would not
+        # apply then, so test for None explicitly to avoid wiping the current plan state.
+        if extra_info.get("plan") is not None:
+            self.plan = extra_info["plan"]
+        if extra_info.get("step") is not None:
+            self.plan_step = extra_info["step"]
+        self.memories.append(extra_info.get("memory", None))
+        self.thoughts.append(extra_info.get("think", None))
+        self.actions.append(action)
+
+    def get_action(self, obs):
+        """Generates multiple candidates and always returns the first one.
+        This allows to use this agent as a drop-in replacement for a single-candidate agent.
+        """
+        candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2)
+        selection = candidates[0]  # always select the first option.
+        self.update_agent_state_from_selected_candidate(selection)
+        action, agent_info = selection["action"], selection["agent_info"]
+
+        return action, agent_info
+
+
+@dataclass
+class MultiCandidateGenericAgentArgs(GenericAgentArgs):
+    """Args/factory for MultiCandidateGenericAgent; agent names get an "MC-" prefix."""
+
+    def make_agent(self):
+        return MultiCandidateGenericAgent(
+            chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry
+        )
+
+    def __post_init__(self):
+        """Run the parent initialisation, then mark the agent name with 'MC-'."""
+        super().__post_init__()
+        if name := getattr(self, "agent_name", None):
+            self.agent_name = "MC-" + name

From ed0f1bd7e1288cd4a73d8d6225cd6a1fe4219a4b Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 10:14:26 -0400
Subject: [PATCH 02/21] add timeout error for hilt agent.

---
 src/agentlab/agents/hilt_agent/hilt_agent.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py
index 6a44489f..245db484 100644
--- a/src/agentlab/agents/hilt_agent/hilt_agent.py
+++ b/src/agentlab/agents/hilt_agent/hilt_agent.py
@@ -7,6 +7,7 @@
 import bgym
 import numpy as np
 from PIL import Image
+import playwright
 
 from agentlab.agents.hilt_agent.hint_labelling import (
     HintLabeling,
@@ -114,6 +115,14 @@ def get_action(self, obs):
                 if self.ui:
                     self.ui.close()
                 raise
+            except playwright.sync_api.TimeoutError:
+                # Handle timeout specifically: fall back to first candidate
+                print("UI timeout; falling back to first candidate.")
+                selected_candidate = candidates[0]
+                self.subagent.update_agent_state_from_selected_candidate(selected_candidate)
+                action = selected_candidate["action"]
+                agent_info = selected_candidate["agent_info"]
+                return action, agent_info
             except Exception as e:
                 print(f"Error in human intervention UI: {e}")
                 if self.ui:

From b2c1ac83d9d3fec69124c700bf49776bb26bacc9 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 12:20:00 -0400
Subject: [PATCH 03/21] darglint and black

---
 .../hilt_agent/base_multi_candidate_agent.py  | 15 ++++---
 .../hilt_agent/generic_human_guided_agent.py  | 25 ++++++------
 src/agentlab/agents/hilt_agent/hilt_agent.py  | 36 ++++++++---------
 .../agents/hilt_agent/hint_labelling.py       | 12 +++++-
 .../multi_candidate_generic_agent.py          | 39 ++++++++++++-------
 5 files changed, 71 insertions(+), 56 deletions(-)

diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
index 6cd4624a..81a0db08 100644
--- a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
+++ b/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
@@ -1,4 +1,5 @@
 from typing_extensions import Protocol
+
 from agentlab.agents.agent_args import AgentArgs
 
 
@@ -12,9 +13,10 @@ class MultiCandidateAgent(Protocol):
 
     def get_candidate_generations(
         self, obs: dict, hint: list[str] | None = None, n_candidates: int = 3
-    ) -> list[dict]:
+    ) -> "list[dict]":
         """
         Generate multiple candidate actions for the given observation.
+
         You can pass extra info in agent_info to update internal state of the
         agent based on the selected candidate. Your internal state management
         should be robust to multiple calls to the get_candidate_generations method
@@ -24,11 +26,6 @@ def get_candidate_generations(
             obs: The current observation dictionary containing environment state
             hint: Optional list of hint strings to guide candidate generation
             n_candidates: Number of candidate actions to generate
-
-        Returns:
-            List of dictionaries, each containing:
-                - 'action': The candidate action to be executed
-                - 'agent_info': Additional information about the action generation
         """
         ...
 
@@ -37,8 +34,10 @@ def update_agent_state_from_selected_candidate(self, output: dict):
         Update the agent's internal state based on the selected candidate.
         This can include any memory or planning updates.
 
+        Args:
+            output: The selected candidate action dictionary
         """
-        ...
+        pass
 
 
 class MultiCandidateAgentArgs(AgentArgs):
@@ -47,5 +46,5 @@ def make_agent(self) -> MultiCandidateAgent: ...
     def __post_init__(self):
         """Prefix subagent name with 'MC-'."""
         super().__post_init__()
-        if hasattr(self, 'agent_name') and self.agent_name:
+        if hasattr(self, "agent_name") and self.agent_name:
             self.agent_name = "MC-" + self.agent_name
diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
index 220ca0df..950c3449 100644
--- a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
@@ -7,6 +7,7 @@
 
 import bgym
 import numpy as np
+from browsergym.experiments.agent import AgentInfo
 from PIL import Image
 
 from agentlab.agents import dynamic_prompting as dp
@@ -23,7 +24,6 @@
     SystemMessage,
 )
 from agentlab.llm.tracking import cost_tracker_decorator
-from browsergym.experiments.agent import AgentInfo
 
 
 class CandidatesGeneration(dp.PromptElement):
@@ -87,15 +87,14 @@ def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None:
     )
 
     def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]:
-        """
-        Extract up to n_candidates candidates, using numbered tags only.
+        """Extract up to n_candidates candidates, using numbered tags only.
+
+        Args:
+            text_answer: The text response containing candidate generation tags.
 
         Returns:
-        {
-            "candidate_generation_1": {"think": "...", "action": "..."},
-            "candidate_generation_2": {"think": "...", "action": "..."},
-            ...
-        }
+            Dictionary mapping candidate names to their think and action content.
+            Format: {"candidate_generation_1": {"think": "...", "action": "..."}, ...}
         """
         result = {
             f"candidate_generation_{i+1}": {"think": "", "action": ""}
@@ -145,11 +144,11 @@ def make_agent(self):
         return MultipleProposalGenericAgent(
             chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry
         )
-    
+
     def __post_init__(self):
         """Prefix subagent name with 'HILT-'."""
         super().__post_init__()
-        if hasattr(self, 'agent_name') and self.agent_name:
+        if hasattr(self, "agent_name") and self.agent_name:
             self.agent_name = "HILT-" + self.agent_name
 
 
@@ -363,13 +362,11 @@ def get_base_agent(llm_config):
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
     benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]()
     benchmark = benchmark.subset_from_glob("task_name", "*book*")
-    benchmark.env_args_list = benchmark.env_args_list[2:3]
+    benchmark.env_args_list = benchmark.env_args_list[3:4]
 
     for env_args in benchmark.env_args_list:
         env_args.max_steps = 100  # max human steps
-        env_args.headless = False
-        # env_args.use_chat_ui = False
-        # env_args.use_hint_labeling_ui = True
+        env_args.headless = True
 
     Study(agent_configs, benchmark, logging_level=logging.WARNING).run(
         n_jobs=1,
diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hilt_agent/hilt_agent.py
index 245db484..6d5fb3f5 100644
--- a/src/agentlab/agents/hilt_agent/hilt_agent.py
+++ b/src/agentlab/agents/hilt_agent/hilt_agent.py
@@ -6,18 +6,19 @@
 
 import bgym
 import numpy as np
-from PIL import Image
 import playwright
+from browsergym.experiments.agent import Agent
+from PIL import Image
 
+from agentlab.agents.agent_args import AgentArgs
+from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent
 from agentlab.agents.hilt_agent.hint_labelling import (
     HintLabeling,
     HintLabelingInputs,
 )
-from agentlab.llm.tracking import cost_tracker_decorator
 from agentlab.analyze import overlay_utils
-from browsergym.experiments.agent import Agent
-from agentlab.agents.agent_args import AgentArgs
-from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent
+from agentlab.llm.tracking import cost_tracker_decorator
+
 
 class HumanInTheLoopAgent(Agent):
 
@@ -58,7 +59,7 @@ def get_action(self, obs):
         # Generate first candidates
         candidates = self.subagent.get_candidate_generations(obs, hint=None, n_candidates=3)
         step_n_human_intervention_rounds += 1
-        suggestions = [{ 'action': c['action'], 'think': c['agent_info'].think} for c in candidates]
+        suggestions = [{"action": c["action"], "think": c["agent_info"].think} for c in candidates]
         # List of Images as base64 - create overlay screenshots for each suggested action
         screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
 
@@ -90,11 +91,11 @@ def get_action(self, obs):
                     hint = response["payload"]["hint"]
                     step_hint.append(hint)
                     candidates = self.subagent.get_candidate_generations(
-                        obs, 
-                        hint=step_hint if step_hint else None,
-                        n_candidates=3
+                        obs, hint=step_hint if step_hint else None, n_candidates=3
                     )
-                    suggestions = [{'action': c['action'], 'think': c['agent_info'].think} for c in candidates]
+                    suggestions = [
+                        {"action": c["action"], "think": c["agent_info"].think} for c in candidates
+                    ]
                     screenshots = [overlay_action(obs, choice["action"]) for choice in suggestions]
 
                 elif response["type"] == "step":
@@ -135,7 +136,6 @@ def get_action(self, obs):
 @dataclass
 class HumanInTheLoopAgentArgs(AgentArgs):
     subagent_args: Optional[AgentArgs] = None  # args for the underlying multiple proposal agent
-    
 
     def make_agent(self):
         assert self.subagent_args is not None
@@ -146,15 +146,15 @@ def __post_init__(self):
         super().__post_init__()
         if self.subagent_args and self.subagent_args.agent_name:
             self.agent_name = "HILT-" + self.subagent_args.agent_name
-    
+
     def set_benchmark(self, benchmark, demo_mode):
         """Delegate set_benchmark to the subagent if it has the method."""
-        if hasattr(self.subagent_args, 'set_benchmark'):
+        if hasattr(self.subagent_args, "set_benchmark"):
             self.subagent_args.set_benchmark(benchmark, demo_mode)
-    
+
     def set_reproducibility_mode(self):
         """Delegate set_reproducibility_mode to the subagent if it has the method."""
-        if hasattr(self.subagent_args, 'set_reproducibility_mode'):
+        if hasattr(self.subagent_args, "set_reproducibility_mode"):
             self.subagent_args.set_reproducibility_mode()
 
 
@@ -175,16 +175,17 @@ def img_to_base_64(image: Image.Image | np.ndarray) -> str:
     b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return b64_str
 
+
 def get_base_human_in_the_loop_genericagent(llm_config):
     from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
-    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
     from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs
     from agentlab.agents.hilt_agent.multi_candidate_generic_agent import (
         MultiCandidateGenericAgentArgs,
     )
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
     return HumanInTheLoopAgentArgs(
-        subagent_args = MultiCandidateGenericAgentArgs(
+        subagent_args=MultiCandidateGenericAgentArgs(
             chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config],
             flags=BASE_FLAGS,
         )
@@ -210,7 +211,6 @@ def get_base_human_in_the_loop_genericagent(llm_config):
         env_args.max_steps = 100  # max human steps
         env_args.headless = False
 
-
     Study(agent_configs, benchmark, logging_level=logging.WARNING).run(
         n_jobs=1,
         parallel_backend="sequential",
diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hilt_agent/hint_labelling.py
index 6e293781..aa5a51ea 100644
--- a/src/agentlab/agents/hilt_agent/hint_labelling.py
+++ b/src/agentlab/agents/hilt_agent/hint_labelling.py
@@ -5,10 +5,10 @@
 from typing import Dict, List, Optional
 
 import playwright.sync_api
+from browsergym.core import _get_global_playwright
 from pydantic import BaseModel, Field
 
 from agentlab.agents.hilt_agent import hint_labelling_ui_files
-from browsergym.core import _get_global_playwright
 
 logger = logging.getLogger(__name__)
 
@@ -109,6 +109,16 @@ def wait_for_response(self, timeout: Optional[float] = 600) -> dict:
         """
         Wait until the page makes a request to /api/reprompt or /api/submit,
         then parse the request body and return it in your schema.
+
+        Args:
+            timeout (Optional[float]): Maximum time to wait for the request in seconds. If None or 0,
+                waits indefinitely. Defaults to 600 seconds.
+
+        Returns:
+            dict: A dictionary containing the parsed response with 'type' and 'payload' keys.
+                For /api/reprompt: {'type': 'reprompt', 'payload': {'hint': str}}
+                For /api/submit: {'type': 'step', 'payload': {'think': str, 'action': str}}
+
         """
         logger.info("Waiting for response from Hint Labeling UI...")
 
diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
index 64c821ef..e4e53b7a 100644
--- a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
+++ b/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
@@ -2,11 +2,12 @@
 from dataclasses import asdict, dataclass
 from typing import Dict, List
 
+from browsergym.experiments.agent import AgentInfo
+
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs
 from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt
 from agentlab.llm.llm_utils import Discussion, HumanMessage, SystemMessage
-from browsergym.experiments.agent import AgentInfo
 
 
 class CandidatesGeneration(dp.PromptElement):
@@ -70,15 +71,14 @@ def __init__(self, hint: list[str] | None = None, n_candidates=3) -> None:
     )
 
     def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]:
-        """
-        Extract up to n_candidates candidates, using numbered tags only.
+        """Extract up to n_candidates candidates, using numbered tags only.
+
+        Args:
+            text_answer: The text response containing candidate generation tags.
 
         Returns:
-        {
-            "candidate_generation_1": {"think": "...", "action": "..."},
-            "candidate_generation_2": {"think": "...", "action": "..."},
-            ...
-        }
+            Dictionary mapping candidate names to their think and action content.
+            Format: {"candidate_generation_1": {"think": "...", "action": "..."}, ...}
         """
         result = {
             f"candidate_generation_{i+1}": {"think": "", "action": ""}
@@ -123,7 +123,6 @@ def get_candidate_generations(
         # Important to handle cases when get_candidate_generation is called multiple times in a single step.
         if not self.obs_history or self.obs_history[-1] is not obs:
             self.obs_history.append(obs)
-    
 
         main_prompt = MainPrompt(
             action_set=self.action_set,
@@ -180,8 +179,12 @@ def get_candidate_generations(
         return output
 
     def update_agent_state_from_selected_candidate(self, output):
-        """Updates the agent's internal state based on the selected candidate from human feedback."""
-        action, agent_info = output['action'], output['agent_info']
+        """Updates the agent's internal state based on the selected candidate from human feedback.
+
+        Args:
+            output: Dictionary containing the 'action' and 'agent_info' keys of the selected candidate.
+        """
+        action, agent_info = output["action"], output["agent_info"]
         self.plan = agent_info.extra_info.get("plan", self.plan)
         self.plan_step = agent_info.extra_info.get("step", self.plan_step)
         self.memories.append(agent_info.extra_info.get("memory", None))
@@ -191,11 +194,17 @@ def update_agent_state_from_selected_candidate(self, output):
     def get_action(self, obs):
         """Generates multiple candidates and always returns the first one.
         This allows to use this agent as a drop-in replacement for a single-candidate agent.
+
+        Args:
+            obs: The observation from the environment.
+
+        Returns:
+            tuple: A tuple containing (action, agent_info).
         """
-        candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2) 
-        selection = candidates[0] # always select the first option.
+        candidates = self.get_candidate_generations(obs, hint=None, n_candidates=2)
+        selection = candidates[0]  # always select the first option.
         self.update_agent_state_from_selected_candidate(selection)
-        action, agent_info = selection['action'], selection['agent_info']
+        action, agent_info = selection["action"], selection["agent_info"]
 
         return action, agent_info
 
@@ -212,5 +221,5 @@ def make_agent(self):
     def __post_init__(self):
         """Prefix subagent name with 'MC-'."""
         super().__post_init__()
-        if hasattr(self, 'agent_name') and self.agent_name:
+        if hasattr(self, "agent_name") and self.agent_name:
             self.agent_name = "MC-" + self.agent_name

From 3b07fe90d9a632966f44929fc0ca6be211f9f085 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 14:58:04 -0400
Subject: [PATCH 04/21] correct spelling hilt -> hitl

---
 .../base_multi_candidate_agent.py                  |  0
 .../generic_human_guided_agent.py                  | 12 ++++++------
 .../{hilt_agent => hitl_agent}/hint_labelling.py   |  2 +-
 .../hint_labelling_ui_files/hint_labeling_ui.html  |  0
 .../hilt_agent.py => hitl_agent/hitl_agent.py}     | 14 +++++++-------
 .../multi_candidate_generic_agent.py               |  0
 6 files changed, 14 insertions(+), 14 deletions(-)
 rename src/agentlab/agents/{hilt_agent => hitl_agent}/base_multi_candidate_agent.py (100%)
 rename src/agentlab/agents/{hilt_agent => hitl_agent}/generic_human_guided_agent.py (97%)
 rename src/agentlab/agents/{hilt_agent => hitl_agent}/hint_labelling.py (98%)
 rename src/agentlab/agents/{hilt_agent => hitl_agent}/hint_labelling_ui_files/hint_labeling_ui.html (100%)
 rename src/agentlab/agents/{hilt_agent/hilt_agent.py => hitl_agent/hitl_agent.py} (95%)
 rename src/agentlab/agents/{hilt_agent => hitl_agent}/multi_candidate_generic_agent.py (100%)

diff --git a/src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py b/src/agentlab/agents/hitl_agent/base_multi_candidate_agent.py
similarity index 100%
rename from src/agentlab/agents/hilt_agent/base_multi_candidate_agent.py
rename to src/agentlab/agents/hitl_agent/base_multi_candidate_agent.py
diff --git a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
similarity index 97%
rename from src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
rename to src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index 950c3449..fd83cf95 100644
--- a/src/agentlab/agents/hilt_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -13,7 +13,7 @@
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs
 from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt
-from agentlab.agents.hilt_agent.hint_labelling import (
+from agentlab.agents.hitl_agent.hint_labelling import (
     HintLabeling,
     HintLabelingInputs,
 )
@@ -146,10 +146,10 @@ def make_agent(self):
         )
 
     def __post_init__(self):
-        """Prefix subagent name with 'HILT-'."""
+        """Prefix subagent name with 'HITL-'."""
         super().__post_init__()
         if hasattr(self, "agent_name") and self.agent_name:
-            self.agent_name = "HILT-" + self.agent_name
+            self.agent_name = "HITL-" + self.agent_name
 
 
 class MultipleProposalGenericAgent(GenericAgent):
@@ -354,14 +354,14 @@ def get_base_agent(llm_config):
 if __name__ == "__main__":
     import logging
 
-    from agentlab.agents.hilt_agent.generic_human_guided_agent import (
+    from agentlab.agents.hitl_agent.generic_human_guided_agent import (
         HUMAN_GUIDED_GENERIC_AGENT,
     )
     from agentlab.experiments.study import Study
 
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
-    benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]()
-    benchmark = benchmark.subset_from_glob("task_name", "*book*")
+    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]()
+    benchmark = benchmark.subset_from_glob("task_name", "*create*")
     benchmark.env_args_list = benchmark.env_args_list[3:4]
 
     for env_args in benchmark.env_args_list:
diff --git a/src/agentlab/agents/hilt_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py
similarity index 98%
rename from src/agentlab/agents/hilt_agent/hint_labelling.py
rename to src/agentlab/agents/hitl_agent/hint_labelling.py
index aa5a51ea..faa16506 100644
--- a/src/agentlab/agents/hilt_agent/hint_labelling.py
+++ b/src/agentlab/agents/hitl_agent/hint_labelling.py
@@ -8,7 +8,7 @@
 from browsergym.core import _get_global_playwright
 from pydantic import BaseModel, Field
 
-from agentlab.agents.hilt_agent import hint_labelling_ui_files
+from agentlab.agents.hitl_agent import hint_labelling_ui_files
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
similarity index 100%
rename from src/agentlab/agents/hilt_agent/hint_labelling_ui_files/hint_labeling_ui.html
rename to src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
diff --git a/src/agentlab/agents/hilt_agent/hilt_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
similarity index 95%
rename from src/agentlab/agents/hilt_agent/hilt_agent.py
rename to src/agentlab/agents/hitl_agent/hitl_agent.py
index 6d5fb3f5..150bcd74 100644
--- a/src/agentlab/agents/hilt_agent/hilt_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -11,8 +11,8 @@
 from PIL import Image
 
 from agentlab.agents.agent_args import AgentArgs
-from agentlab.agents.hilt_agent.base_multi_candidate_agent import MultiCandidateAgent
-from agentlab.agents.hilt_agent.hint_labelling import (
+from agentlab.agents.hitl_agent.base_multi_candidate_agent import MultiCandidateAgent
+from agentlab.agents.hitl_agent.hint_labelling import (
     HintLabeling,
     HintLabelingInputs,
 )
@@ -142,10 +142,10 @@ def make_agent(self):
         return HumanInTheLoopAgent(subagent_args=self.subagent_args)
 
     def __post_init__(self):
-        """Prefix subagent name with 'HILT-'."""
+        """Prefix subagent name with 'HITL-'."""
         super().__post_init__()
         if self.subagent_args and self.subagent_args.agent_name:
-            self.agent_name = "HILT-" + self.subagent_args.agent_name
+            self.agent_name = "HITL-" + self.subagent_args.agent_name
 
     def set_benchmark(self, benchmark, demo_mode):
         """Delegate set_benchmark to the subagent if it has the method."""
@@ -178,8 +178,8 @@ def img_to_base_64(image: Image.Image | np.ndarray) -> str:
 
 def get_base_human_in_the_loop_genericagent(llm_config):
     from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
-    from agentlab.agents.hilt_agent.hilt_agent import HumanInTheLoopAgentArgs
-    from agentlab.agents.hilt_agent.multi_candidate_generic_agent import (
+    from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs
+    from agentlab.agents.hitl_agent.multi_candidate_generic_agent import (
         MultiCandidateGenericAgentArgs,
     )
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
@@ -197,7 +197,7 @@ def get_base_human_in_the_loop_genericagent(llm_config):
 if __name__ == "__main__":
     import logging
 
-    from agentlab.agents.hilt_agent.hilt_agent import (
+    from agentlab.agents.hitl_agent.hitl_agent import (
         HUMAN_GUIDED_GENERIC_AGENT,
     )
     from agentlab.experiments.study import Study
diff --git a/src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py b/src/agentlab/agents/hitl_agent/multi_candidate_generic_agent.py
similarity index 100%
rename from src/agentlab/agents/hilt_agent/multi_candidate_generic_agent.py
rename to src/agentlab/agents/hitl_agent/multi_candidate_generic_agent.py

From 9633275402a7ae6a58fa690a988df0e755fd1a36 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 15:19:13 -0400
Subject: [PATCH 05/21] Move the overlay_action to utils

---
 src/agentlab/agents/agent_utils.py            | 14 ++++++++++
 .../hitl_agent/generic_human_guided_agent.py  | 21 ++-------------
 src/agentlab/agents/hitl_agent/hitl_agent.py  | 27 +++----------------
 src/agentlab/llm/llm_utils.py                 | 10 +++++++
 4 files changed, 29 insertions(+), 43 deletions(-)

diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
index 29219d2d..9059b5ad 100644
--- a/src/agentlab/agents/agent_utils.py
+++ b/src/agentlab/agents/agent_utils.py
@@ -1,5 +1,8 @@
+import copy
 from PIL import Image, ImageDraw
 from playwright.sync_api import Page
+from agentlab.llm.llm_utils import img_to_base_64
+from agentlab.analyze import overlay_utils
 
 
 def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image:
@@ -128,3 +131,14 @@ def zoom_webpage(page: Page, zoom_factor: float = 1.5):
 
     page.evaluate(f"document.documentElement.style.zoom='{zoom_factor*100}%'")
     return page
+
+
+def overlay_action(obs, action):
+    """Overlay the action on a copy of the screenshot and return it as a base64-encoded PNG."""
+    act_img = copy.deepcopy(obs["screenshot"])
+    act_img = Image.fromarray(act_img)
+    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
+    return img_to_base_64(act_img)
+
+
+
diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index fd83cf95..507efe85 100644
--- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -24,7 +24,8 @@
     SystemMessage,
 )
 from agentlab.llm.tracking import cost_tracker_decorator
-
+from agentlab.agents.agent_utils import overlay_action
+from agentlab.llm.llm_utils import img_to_base_64
 
 class CandidatesGeneration(dp.PromptElement):
     # Ask for multiple alternatives; each candidate must contain <think> and <action>.
@@ -119,24 +120,6 @@ def _parse_answer(self, text_answer: str) -> Dict[str, Dict[str, str]]:
         return result
 
 
-def overlay_action(obs, action):
-    """Overlays actions on screenshot in-place"""
-    act_img = copy.deepcopy(obs["screenshot"])
-    act_img = Image.fromarray(act_img)
-    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
-    return img_to_base_64(act_img)
-
-
-def img_to_base_64(image: Image.Image | np.ndarray) -> str:
-    """Converts a PIL Image or NumPy array to a base64-encoded string."""
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-    buffer = io.BytesIO()
-    image.save(buffer, format="PNG")
-    b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return b64_str
-
-
 @dataclass
 class MultipleProposalGenericAgentArgs(GenericAgentArgs):
 
diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
index 150bcd74..3bfb47b5 100644
--- a/src/agentlab/agents/hitl_agent/hitl_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -1,14 +1,10 @@
-import base64
-import copy
-import io
 from dataclasses import dataclass
 from typing import Optional
 
 import bgym
-import numpy as np
 import playwright
 from browsergym.experiments.agent import Agent
-from PIL import Image
+
 
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.agents.hitl_agent.base_multi_candidate_agent import MultiCandidateAgent
@@ -16,7 +12,8 @@
     HintLabeling,
     HintLabelingInputs,
 )
-from agentlab.analyze import overlay_utils
+from agentlab.agents.agent_utils import overlay_action
+from agentlab.llm.llm_utils import img_to_base_64
 from agentlab.llm.tracking import cost_tracker_decorator
 
 
@@ -158,24 +155,6 @@ def set_reproducibility_mode(self):
             self.subagent_args.set_reproducibility_mode()
 
 
-def overlay_action(obs, action):
-    """Overlays actions on screenshot in-place"""
-    act_img = copy.deepcopy(obs["screenshot"])
-    act_img = Image.fromarray(act_img)
-    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
-    return img_to_base_64(act_img)
-
-
-def img_to_base_64(image: Image.Image | np.ndarray) -> str:
-    """Converts a PIL Image or NumPy array to a base64-encoded string."""
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-    buffer = io.BytesIO()
-    image.save(buffer, format="PNG")
-    b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return b64_str
-
-
 def get_base_human_in_the_loop_genericagent(llm_config):
     from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
     from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs
diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py
index 10013b72..2bc83d43 100644
--- a/src/agentlab/llm/llm_utils.py
+++ b/src/agentlab/llm/llm_utils.py
@@ -727,6 +727,16 @@ def image_to_png_base64_url(image: np.ndarray | Image.Image):
     return f"data:image/png;base64,{image_base64}"
 
 
+def img_to_base_64(image: Image.Image | np.ndarray) -> str:
+    """Converts a PIL Image or NumPy array to a base64-encoded string."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")
+    b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    return b64_str
+
+
 class BaseMessage(dict):
     def __init__(self, role: str, content: Union[str, list[dict]], **kwargs):
         allowed_attrs = {"log_probs"}

From 51cacdb9826f651b8c0b9aeae80e55b4168aaea0 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 15:44:12 -0400
Subject: [PATCH 06/21] Increase timeout

---
 src/agentlab/agents/hitl_agent/generic_human_guided_agent.py | 2 +-
 src/agentlab/agents/hitl_agent/hitl_agent.py                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index 507efe85..af7ed0c4 100644
--- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -251,7 +251,7 @@ def get_action(self, obs):
                 )
 
                 self.ui.update_context(hint_labeling_inputs)
-                response = self.ui.wait_for_response(timeout=300)
+                response = self.ui.wait_for_response(timeout=600)
 
                 if response["type"] == "reprompt":
                     hint = response["payload"]["hint"]
diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
index 3bfb47b5..85063f82 100644
--- a/src/agentlab/agents/hitl_agent/hitl_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -82,7 +82,7 @@ def get_action(self, obs):
                 )
 
                 self.ui.update_context(hint_labeling_inputs)
-                response = self.ui.wait_for_response(timeout=300)
+                response = self.ui.wait_for_response(timeout=600)
 
                 if response["type"] == "reprompt":
                     hint = response["payload"]["hint"]

From 958430ccc6e1a0d2a84376d9bbdbac311e6e11da Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 16:11:32 -0400
Subject: [PATCH 07/21] add docstring for functions and black

---
 src/agentlab/agents/agent_utils.py            |   7 +-
 .../hitl_agent/generic_human_guided_agent.py  |  15 +-
 src/agentlab/agents/hitl_agent/hitl_agent.py  |  14 +
 .../2_eval_on_miniwob/inspect_results.ipynb   | 258 +++++++++++++++++-
 4 files changed, 281 insertions(+), 13 deletions(-)

diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
index 9059b5ad..7ce8cdad 100644
--- a/src/agentlab/agents/agent_utils.py
+++ b/src/agentlab/agents/agent_utils.py
@@ -1,8 +1,10 @@
 import copy
+
 from PIL import Image, ImageDraw
 from playwright.sync_api import Page
-from agentlab.llm.llm_utils import img_to_base_64
+
 from agentlab.analyze import overlay_utils
+from agentlab.llm.llm_utils import img_to_base_64
 
 
 def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image:
@@ -139,6 +141,3 @@ def overlay_action(obs, action):
     act_img = Image.fromarray(act_img)
     overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
     return img_to_base_64(act_img)
-
-
-
diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index af7ed0c4..507e16fe 100644
--- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -11,6 +11,7 @@
 from PIL import Image
 
 from agentlab.agents import dynamic_prompting as dp
+from agentlab.agents.agent_utils import overlay_action
 from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericAgentArgs
 from agentlab.agents.generic_agent.generic_agent_prompt import MainPrompt
 from agentlab.agents.hitl_agent.hint_labelling import (
@@ -22,10 +23,10 @@
     Discussion,
     HumanMessage,
     SystemMessage,
+    img_to_base_64,
 )
 from agentlab.llm.tracking import cost_tracker_decorator
-from agentlab.agents.agent_utils import overlay_action
-from agentlab.llm.llm_utils import img_to_base_64
+
 
 class CandidatesGeneration(dp.PromptElement):
     # Ask for multiple alternatives; each candidate must contain <think> and <action>.
@@ -323,6 +324,16 @@ def get_action(self, obs):
 
 
 def get_base_agent(llm_config):
+    """Creates and returns a MultipleProposalGenericAgentArgs instance configured with
+    the LLM configuration selected by key from CHAT_MODEL_ARGS_DICT.
+
+    Args:
+        llm_config: The LLM configuration key to use from CHAT_MODEL_ARGS_DICT.
+
+    Returns:
+        MultipleProposalGenericAgentArgs: Configured agent arguments instance.
+    """
+
     from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
index 85063f82..26c0c696 100644
--- a/src/agentlab/agents/hitl_agent/hitl_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -90,6 +90,7 @@ def get_action(self, obs):
                     candidates = self.subagent.get_candidate_generations(
                         obs, hint=step_hint if step_hint else None, n_candidates=3
                     )
+                    step_n_human_intervention_rounds += 1
                     suggestions = [
                         {"action": c["action"], "think": c["agent_info"].think} for c in candidates
                     ]
@@ -156,6 +157,19 @@ def set_reproducibility_mode(self):
 
 
 def get_base_human_in_the_loop_genericagent(llm_config):
+    """
+    Create a base human-in-the-loop generic agent configuration using the key from CHAT_MODEL_ARGS_DICT.
+
+    This function creates a HumanInTheLoopAgentArgs instance with a MultiCandidateGenericAgent
+    as the subagent, configured with the specified LLM configuration and base flags.
+
+    Args:
+        llm_config (str): The LLM configuration key to use from CHAT_MODEL_ARGS_DICT.
+
+    Returns:
+        HumanInTheLoopAgentArgs: Configured human-in-the-loop agent arguments with
+                                a multi-candidate generic agent as the subagent.
+    """
     from agentlab.agents.generic_agent.tmlr_config import BASE_FLAGS
     from agentlab.agents.hitl_agent.hitl_agent import HumanInTheLoopAgentArgs
     from agentlab.agents.hitl_agent.multi_candidate_generic_agent import (
diff --git a/tutorials/2_eval_on_miniwob/inspect_results.ipynb b/tutorials/2_eval_on_miniwob/inspect_results.ipynb
index 06127b78..84a73cca 100644
--- a/tutorials/2_eval_on_miniwob/inspect_results.ipynb
+++ b/tutorials/2_eval_on_miniwob/inspect_results.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "58086537",
    "metadata": {},
    "outputs": [],
@@ -25,10 +25,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "id": "7901cccc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "RESULTS_DIR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "id": "50be19a9",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Searching experiments directories.: 100%|██████████| 1/1 [00:00<00:00, 5433.04it/s]\n",
+      "Loading results: 100%|██████████| 1/1 [00:00<00:00, 373.26it/s]\n"
+     ]
+    }
+   ],
    "source": [
     "# replace this by your desired directory if needed.\n",
     "result_dir = get_most_recent_study(RESULTS_DIR, contains=None)\n",
@@ -39,15 +76,222 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
+   "id": "82cc1557",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result_dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "id": "a424c470",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found multiple configuration, averaging across tasks and returning a per-agent report.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_1d2fe th {\n",
+       "  white-space: pre-wrap;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_1d2fe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th id=\"T_1d2fe_level0_col0\" class=\"col_heading level0 col0\" >agent.agent\n",
+       "name</th>\n",
+       "      <th id=\"T_1d2fe_level0_col1\" class=\"col_heading level0 col1\" >env.benchmark</th>\n",
+       "      <th id=\"T_1d2fe_level0_col2\" class=\"col_heading level0 col2\" >avg\n",
+       "reward</th>\n",
+       "      <th id=\"T_1d2fe_level0_col3\" class=\"col_heading level0 col3\" >std\n",
+       "err</th>\n",
+       "      <th id=\"T_1d2fe_level0_col4\" class=\"col_heading level0 col4\" >avg\n",
+       "steps</th>\n",
+       "      <th id=\"T_1d2fe_level0_col5\" class=\"col_heading level0 col5\" >n\n",
+       "completed</th>\n",
+       "      <th id=\"T_1d2fe_level0_col6\" class=\"col_heading level0 col6\" >n\n",
+       "err</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_1d2fe_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
+       "      <td id=\"T_1d2fe_row0_col0\" class=\"data row0 col0\" >HITL-GenericAgent-gpt-5-mini-2025-08-07</td>\n",
+       "      <td id=\"T_1d2fe_row0_col1\" class=\"data row0 col1\" >workarena</td>\n",
+       "      <td id=\"T_1d2fe_row0_col2\" class=\"data row0 col2\" >nan</td>\n",
+       "      <td id=\"T_1d2fe_row0_col3\" class=\"data row0 col3\" >nan</td>\n",
+       "      <td id=\"T_1d2fe_row0_col4\" class=\"data row0 col4\" >nan</td>\n",
+       "      <td id=\"T_1d2fe_row0_col5\" class=\"data row0 col5\" >0/1</td>\n",
+       "      <td id=\"T_1d2fe_row0_col6\" class=\"data row0 col6\" >0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x125c55850>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "report = inspect_results.global_report(result_df)\n",
     "inspect_results.display_report(report)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f86e44fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+       "columns": [
+        {
+         "name": "('agent.agent_name', 'env.benchmark')",
+         "rawType": "object",
+         "type": "unknown"
+        },
+        {
+         "name": "avg_reward",
+         "rawType": "float64",
+         "type": "float"
+        },
+        {
+         "name": "std_err",
+         "rawType": "float64",
+         "type": "float"
+        },
+        {
+         "name": "avg_steps",
+         "rawType": "float64",
+         "type": "float"
+        },
+        {
+         "name": "n_completed",
+         "rawType": "object",
+         "type": "string"
+        },
+        {
+         "name": "n_err",
+         "rawType": "int64",
+         "type": "integer"
+        }
+       ],
+       "ref": "ea68795e-a1d8-404e-9e36-1061d8fa9e87",
+       "rows": [
+        [
+         "('HITL-GenericAgent-gpt-5-mini-2025-08-07', 'workarena')",
+         null,
+         null,
+         null,
+         "0/1",
+         "0"
+        ]
+       ],
+       "shape": {
+        "columns": 5,
+        "rows": 1
+       }
+      },
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>avg_reward</th>\n",
+       "      <th>std_err</th>\n",
+       "      <th>avg_steps</th>\n",
+       "      <th>n_completed</th>\n",
+       "      <th>n_err</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>agent.agent_name</th>\n",
+       "      <th>env.benchmark</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>HITL-GenericAgent-gpt-5-mini-2025-08-07</th>\n",
+       "      <th>workarena</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0/1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                       avg_reward  std_err  \\\n",
+       "agent.agent_name                        env.benchmark                        \n",
+       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena             NaN      NaN   \n",
+       "\n",
+       "                                                       avg_steps n_completed  \\\n",
+       "agent.agent_name                        env.benchmark                          \n",
+       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena            NaN         0/1   \n",
+       "\n",
+       "                                                       n_err  \n",
+       "agent.agent_name                        env.benchmark         \n",
+       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena          0  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "385559d7",
@@ -149,7 +393,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "AgentLab",
+   "display_name": "agentlab",
    "language": "python",
    "name": "python3"
   },
@@ -163,7 +407,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,

From 4453a00a490602d9e15d0d0189873b6cbe9c4c01 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 17:18:54 -0400
Subject: [PATCH 08/21] Improve UI and step hint handling for multiple hints

---
 .../hitl_agent/generic_human_guided_agent.py  |  17 +-
 .../agents/hitl_agent/hint_labelling.py       |  39 ++--
 .../hint_labeling_ui.html                     | 208 +++++++++++++-----
 src/agentlab/agents/hitl_agent/hitl_agent.py  |  13 +-
 4 files changed, 190 insertions(+), 87 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index 507e16fe..29547fd4 100644
--- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -18,7 +18,6 @@
     HintLabeling,
     HintLabelingInputs,
 )
-from agentlab.analyze import overlay_utils
 from agentlab.llm.llm_utils import (
     Discussion,
     HumanMessage,
@@ -208,7 +207,7 @@ def get_action(self, obs):
                 screenshots=[],  # no overlay screenshots yet
                 axtree=obs.get("axtree_txt", ""),
                 history=[],
-                hint="",
+                hints=[],
                 suggestions=[],  # no suggestions yet
             )
             self.ui.update_context(initial_inputs)
@@ -243,11 +242,7 @@ def get_action(self, obs):
                     screenshots=screenshots,  # list of overlay screenshots for hover
                     axtree=obs.get("axtree_txt", ""),
                     history=[],  # TODO: add history
-                    hint=(
-                        "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1))
-                        if step_hint
-                        else ""
-                    ),
+                    hints=step_hint,
                     suggestions=suggestions,
                 )
 
@@ -255,8 +250,8 @@ def get_action(self, obs):
                 response = self.ui.wait_for_response(timeout=600)
 
                 if response["type"] == "reprompt":
-                    hint = response["payload"]["hint"]
-                    step_hint.append(hint)
+                    new_hints = response["payload"].get("hints", [])
+                    step_hint = list(new_hints) if isinstance(new_hints, list) else step_hint
                     candidates, chat_messages = self.get_candidate_generation(
                         sys_prompt=system_prompt,
                         human_prompt=human_prompt,
@@ -354,8 +349,8 @@ def get_base_agent(llm_config):
     from agentlab.experiments.study import Study
 
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
-    benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"]()
-    benchmark = benchmark.subset_from_glob("task_name", "*create*")
+    benchmark = bgym.DEFAULT_BENCHMARKS["miniwob"]()
+    benchmark = benchmark.subset_from_glob("task_name", "*book*")
     benchmark.env_args_list = benchmark.env_args_list[3:4]
 
     for env_args in benchmark.env_args_list:
diff --git a/src/agentlab/agents/hitl_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py
index faa16506..680f3cbe 100644
--- a/src/agentlab/agents/hitl_agent/hint_labelling.py
+++ b/src/agentlab/agents/hitl_agent/hint_labelling.py
@@ -8,11 +8,9 @@
 from browsergym.core import _get_global_playwright
 from pydantic import BaseModel, Field
 
-from agentlab.agents.hitl_agent import hint_labelling_ui_files
-
 logger = logging.getLogger(__name__)
-
-HINT_LABELING_DIR = resources.files(hint_labelling_ui_files)
+ 
+HINT_LABELING_DIR = resources.files("agentlab.agents.hitl_agent.hint_labelling_ui_files")
 
 
 class HintLabelingInputs(BaseModel):
@@ -22,14 +20,14 @@ class HintLabelingInputs(BaseModel):
     screenshots: List[str] = Field(default_factory=list)  # list of base64 screenshots for hover
     axtree: str
     history: List[Dict[str, str]] = Field(default_factory=list)
-    hint: str = ""
+    hints: List[str] = Field(default_factory=list)
     suggestions: List[Dict[str, str]] = Field(default_factory=list)
 
 
 class HintLabeling:
     def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs):
-
-        pw: playwright.sync_api.Playwright = _get_global_playwright()
+        pw_opt = _get_global_playwright()
+        pw: playwright.sync_api.Playwright = pw_opt  # type: ignore[assignment]
         self.browser = pw.chromium.launch(
             headless=headless, args=[f"--window-size={window_size[0]},{window_size[1]}"]
         )
@@ -37,14 +35,14 @@ def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs):
             no_viewport=True,
         )
         self.page = self.context.new_page()
-        self._resp_queue: "Queue[dict]" = Queue()
+        self._resp_queue = Queue()
 
         self.page.route("**/api/reprompt", self._route_reprompt)
         self.page.route("**/api/submit", self._route_submit)
         self.page.set_content(get_hint_labeling_ui(HINT_LABELING_DIR))
 
         # internal state
-        self._context: HintLabelingInputs = None
+        self._context = None
         self._running = False
 
     def _route_reprompt(
@@ -52,11 +50,16 @@ def _route_reprompt(
     ):
         logger.info("Route hit: %s %s", request.method, request.url)
         try:
-            body = json.loads(request.post_data() or "{}")
+            body = json.loads(request.post_data or "{}")
         except Exception:
             body = {}
         # enqueue output 1 (reprompt)
-        msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}}
+        hints = body.get("hints")
+        if not isinstance(hints, list):
+            # Back-compat: accept single 'hint' string
+            h = body.get("hint")
+            hints = [h] if isinstance(h, str) and h.strip() else []
+        msg = {"type": "reprompt", "payload": {"hints": hints}}
         self._resp_queue.put(msg)
         # Respond something minimal so UI doesn’t break; it will be refreshed by a later update_context()
         route.fulfill(
@@ -68,10 +71,10 @@ def _route_reprompt(
     def _route_submit(self, route: playwright.sync_api.Route, request: playwright.sync_api.Request):
         logger.info("Route hit: %s %s", request.method, request.url)
         try:
-            body = json.loads(request.post_data() or "{}")
+            body = json.loads(request.post_data or "{}")
         except Exception:
             body = {}
-        # Map UI payload -> your step shape
+    # Map UI payload -> your step shape
         msg = {
             "type": "step",
             "payload": {
@@ -95,7 +98,7 @@ def _to_ui_bootstrap(self, ctx: HintLabelingInputs) -> dict:
             "screenshots": ctx.screenshots,  # list of screenshots for hover
             "axtree": ctx.axtree,
             "history": ctx.history,
-            "hint": ctx.hint,
+            "hints": ctx.hints,
             "suggestions": ctx.suggestions,
         }
 
@@ -116,7 +119,7 @@ def wait_for_response(self, timeout: Optional[float] = 600) -> dict:
 
         Returns:
             dict: A dictionary containing the parsed response with 'type' and 'payload' keys.
-                For /api/reprompt: {'type': 'reprompt', 'payload': {'hint': str}}
+                For /api/reprompt: {'type': 'reprompt', 'payload': {'hints': list[str]}}
                 For /api/submit: {'type': 'step', 'payload': {'think': str, 'action': str}}
 
         """
@@ -142,7 +145,11 @@ def is_api(req: playwright.sync_api.Request) -> bool:
             body = {}
 
         if req.url.endswith("/api/reprompt"):
-            msg = {"type": "reprompt", "payload": {"hint": body.get("hint", "")}}
+            hints = body.get("hints")
+            if not isinstance(hints, list):
+                h = body.get("hint")
+                hints = [h] if isinstance(h, str) and h.strip() else []
+            msg = {"type": "reprompt", "payload": {"hints": hints}}
         else:
             msg = {
                 "type": "step",
diff --git a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
index 3371c3cd..6c8c782a 100644
--- a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
+++ b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
@@ -10,8 +10,8 @@
       --bg:#f4f6f8; --card:#fff; --muted:#6b7280; --text:#0f172a; --brand:#2563eb; --accent:#10b981; --danger:#ef4444; --border:#e5e7eb;
     }
     *{box-sizing:border-box}
-    body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)}
-    .container{max-width:1100px;margin:24px auto;padding:0 16px}
+  body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)}
+  .container{max-width:1280px;margin:24px auto;padding:0 16px}
 
     .grid{
       display:grid;gap:16px;
@@ -26,7 +26,7 @@
     .tab.active{background:var(--card);border:1px solid var(--border);border-bottom:none;color:var(--text)}
     .tabpanel{border-top:1px solid var(--border)}
 
-    .screenshot{width:800px;height:450px;object-fit:contain;background:#0000000d;border-radius:8px}
+  .screenshot{width:100%;height:auto;max-height:65vh;object-fit:contain;background:#0000000d;border-radius:8px}
     .axtree{width:100%;height:520px;resize:none;border:none;padding:12px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace;background:#0b10241a}
 
     .hints-row{display:grid;grid-template-columns: 1fr 140px;gap:12px;align-items:start}
@@ -36,7 +36,13 @@
     .btn-primary[disabled]{opacity:.6;cursor:not-allowed}
     .btn-ghost{background:transparent;border:1px solid var(--border)}
 
-    .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px}
+  /* Hint rows with removable controls */
+  .hint-row{display:flex;gap:8px;align-items:stretch}
+  .hint-row textarea.hint{flex:1;margin:0}
+  .remove-hint{width:36px;min-width:36px;height:36px;line-height:1;border-radius:10px;padding:0;font-size:18px;color:#64748b}
+  .remove-hint:hover{background:#f8fafc}
+
+  .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px;max-height:60vh;overflow:auto}
     .choice{display:grid;grid-template-columns:32px 1fr;gap:12px;align-items:start;background:#ffffff;border:1px solid var(--border);border-radius:14px;padding:12px}
     .choice.selected{border:2px solid var(--accent);background:#f0fdf4}
     .choice.disabled{opacity:0.5;pointer-events:none}
@@ -55,8 +61,13 @@
 
     .pill{display:inline-block;padding:6px 10px;border-radius:999px;background:#f1f5f9;color:#0f172a;border:1px solid var(--border);font-size:12px}
 
+  /* New: split layout for screenshot/tabs and hints+suggestions side-by-side */
+  .split{display:grid;gap:16px;grid-template-columns: 1fr 1.3fr;align-items:start;margin-top:16px}
+  .right-stack{display:flex;flex-direction:column;gap:16px}
+
     @media (max-width: 900px){
       .grid{grid-template-columns: 1fr}
+      .split{grid-template-columns: 1fr}
       .axtree{height:420px}
       .screenshot{width:100%;height:auto;max-width:800px}
     }
@@ -92,44 +103,50 @@ <h2>Error Feedback</h2>
       </div>
     </div>
 
-    <!-- Middle: Tabs -->
-    <div class="card" style="margin-top:16px">
-      <div class="tabs">
-        <button class="tab active" data-tab="screenshot">Screenshot</button>
-        <button class="tab" data-tab="axtree">AxTree</button>
-        <button class="tab" data-tab="history">History</button>
-      </div>
-      <div class="pad tabpanel">
-        <div id="tab-screenshot" class="tabcontent">
-          <img id="screenshotImg" alt="screenshot" class="screenshot" />
-        </div>
-        <div id="tab-axtree" class="tabcontent" hidden>
-          <textarea id="axtreeArea" class="axtree" readonly style="font-size: 12px; white-space: pre; overflow-wrap: normal;"></textarea>
+    <!-- Middle: Two-column split -->
+    <div class="split">
+      <!-- Left: Hints and Suggestions stacked -->
+      <div class="right-stack">
+        <!-- Hints & Reprompt -->
+        <div class="card pad" id="hintsSection">
+          <h2>Hints</h2>
+          <!-- Dynamic hints inputs will be injected here above the button -->
+          <button id="repromptBtn" class="btn btn-primary" title="Send hint to get refreshed suggestions" style="margin-top: 12px;">Reprompt with Hint</button>
+          <div id="repromptStatus" class="banner info" style="display:none"></div>
         </div>
-        <div id="tab-history" class="tabcontent" hidden>
-          <!-- intentionally empty for now -->
-          <div class="banner info">History will appear here.</div>
+
+        <!-- Suggestions / Radio list -->
+        <div class="card pad">
+          <h2>Suggestions</h2>
+          <div id="choices" class="choices"></div>
+          <div id="choicesNote" class="banner info" style="display:none" title="Hover to see more details"></div>
+          <div class="footer">
+            <button id="submitBtn" class="btn btn-primary" disabled title="Select an action to enable">Send Action</button>
+          </div>
+          <div id="submitStatus" class="banner info" style="display:none" title="Hover to see submission status"></div>
         </div>
       </div>
-    </div>
 
-    <!-- Hints & Reprompt -->
-    <div class="card pad" style="margin-top:16px">
-      <h2>Hints</h2>
-      <textarea id="hintInput" class="hint" placeholder="Type guidance for the next reprompt…" style="width: 100%;"></textarea>
-      <button id="repromptBtn" class="btn btn-primary" title="Send hint to get refreshed suggestions" style="margin-top: 12px;">Reprompt with Hint</button>
-      <div id="repromptStatus" class="banner info" style="display:none"></div>
-    </div>
-
-    <!-- Suggestions / Radio list -->
-    <div class="card pad" style="margin-top:16px">
-      <h2>Suggestions</h2>
-      <div id="choices" class="choices"></div>
-      <div id="choicesNote" class="banner info" style="display:none" title="Hover to see more details"></div>
-      <div class="footer">
-        <button id="submitBtn" class="btn btn-primary" disabled title="Select an action to enable">Send Action</button>
+      <!-- Right: Tabs with Screenshot/AxTree/History -->
+      <div class="card">
+        <div class="tabs">
+          <button class="tab active" data-tab="screenshot">Screenshot</button>
+          <button class="tab" data-tab="axtree">AxTree</button>
+          <button class="tab" data-tab="history">History</button>
+        </div>
+        <div class="pad tabpanel">
+          <div id="tab-screenshot" class="tabcontent">
+            <img id="screenshotImg" alt="screenshot" class="screenshot" />
+          </div>
+          <div id="tab-axtree" class="tabcontent" hidden>
+            <textarea id="axtreeArea" class="axtree" readonly style="font-size: 12px; white-space: pre; overflow-wrap: normal;"></textarea>
+          </div>
+          <div id="tab-history" class="tabcontent" hidden>
+            <!-- intentionally empty for now -->
+            <div class="banner info">History will appear here.</div>
+          </div>
+        </div>
       </div>
-      <div id="submitStatus" class="banner info" style="display:none" title="Hover to see submission status"></div>
     </div>
 
     <!-- Progress/Status Area - Hidden by default, shown only when there's status -->
@@ -151,7 +168,7 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
      *   screenshot: base64 string (no data: prefix required)
      *   screenshots: Array<string> - list of base64 screenshots for hover (same length as suggestions)
      *   axtree: string
-     *   hint: string
+  *   hints: Array<string>
      *   suggestions: Array<{ action: string, think: string, id?: string }>
      */
     window.__BOOTSTRAP_DATA__ = window.__BOOTSTRAP_DATA__ || {
@@ -161,7 +178,7 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       screenshots: [], // list of base64 screenshots for hover
       axtree: "<root>\n  <window name=\"VITASPHERE\">…</window>\n</root>",
       history: [],
-      hint: "",
+  hints: [],
       suggestions: [
         { id: "1", action: "click(\"42\")", think: "The button with id 42 advances the form." },
         { id: "2", action: "type(\"Assigned to\", \"John Doe\")", think: "Fills the assignee field before submission." },
@@ -169,10 +186,10 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       ]
     };
 
-    var RECEIVED_RESPONSE = false;
-    var originalScreenshot = ""; // store original screenshot
-    var hoverScreenshots = []; // store screenshots for hover
-    var hoverEnabled = true; // track if hover behavior is enabled
+  var RECEIVED_RESPONSE = false;
+  var originalScreenshot = ""; // store original screenshot
+  var hoverScreenshots = []; // store screenshots for hover
+  var hoverEnabled = true; // track if hover behavior is enabled
 
     function applyContext(d){
       goalBox.textContent = d.goal || '';
@@ -184,9 +201,11 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       if (Array.isArray(d.suggestions)) {
         renderSuggestions(d.suggestions);
       }
-      // keep the hint textarea in sync only if it's currently empty,
-      // so we don't clobber user typing
-      if (!hintInput.value) hintInput.value = d.hint || '';
+      // render hints list from array (fallback to single hint string)
+      const incomingHints = Array.isArray(d.hints)
+        ? d.hints
+        : (d.hint ? [d.hint] : []);
+      renderHints(incomingHints);
     }
 
     // REPLACE your old updateContext with this:
@@ -197,8 +216,8 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
 
     // Placeholder endpoints (replace later)
     const ENDPOINTS = {
-      REPROMPT: "/api/reprompt",   // expects POST {hint} -> returns {suggestions: [...]} 
-      SUBMIT: "/api/submit"        // expects POST {hint, action, think, id?} -> returns {suggestions?: [...]} (optional)
+      REPROMPT: "/api/reprompt",   // expects POST {hints: string[]} -> returns {suggestions: [...]} 
+  SUBMIT: "/api/submit"        // expects POST {action, think, id?} -> returns {suggestions?: [...]} (optional)
     };
 
     // DOM references
@@ -206,7 +225,9 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
     const errorBox = document.getElementById('errorBox');
     const screenshotImg = document.getElementById('screenshotImg');
     const axtreeArea = document.getElementById('axtreeArea');
-    const hintInput = document.getElementById('hintInput');
+  // Hints UI elements (dynamic list)
+  let hintsContainer;
+  let addHintBtn;
     const repromptBtn = document.getElementById('repromptBtn');
     const repromptStatus = document.getElementById('repromptStatus');
     const choicesEl = document.getElementById('choices');
@@ -339,9 +360,10 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
     repromptBtn.addEventListener('click', async ()=>{
       updateProgress('Requesting new suggestions...', true);
       try{
+        const hints = collectHints();
         const res = await fetch(ENDPOINTS.REPROMPT,{
           method:'POST', headers:{'Content-Type':'application/json'},
-          body: JSON.stringify({ hint: hintInput.value })
+          body: JSON.stringify({ hints })
         });
         // Don't expect a response - the backend will update the UI via updateContext
         updateProgress('Hint sent. Waiting for new suggestions...', true);
@@ -387,10 +409,10 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       document.querySelectorAll('input[name="choice"]').forEach(r=> r.checked=false);
       selectedId = null;
       submitBtn.disabled = true;
-      hintInput.value = '';
+  clearHintsUI();
 
       try{
-        const payload = { hint: hintInput.value, action: selection.action, think: selection.think, id: selection.id };
+        const payload = { action: selection.action, think: selection.think, id: selection.id };
         const res = await fetch(ENDPOINTS.SUBMIT,{
           method:'POST', headers:{'Content-Type':'application/json'},
           body: JSON.stringify(payload)
@@ -412,8 +434,90 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       screenshotImg.src = dataUrlFromBase64(d.screenshot || '');
       axtreeArea.value = d.axtree || '';
       renderSuggestions(Array.isArray(d.suggestions) ? d.suggestions : []);
+      // setup hints UI
+      setupHintsUI();
+      const incomingHints = Array.isArray(d.hints)
+        ? d.hints
+        : (d.hint ? [d.hint] : []);
+      renderHints(incomingHints);
       // Don't show initial progress - only show when there's actual status
     })();
+
+    // Hints UI logic
+    function setupHintsUI(){
+      const hintsSection = document.getElementById('hintsSection');
+      hintsContainer = document.createElement('div');
+      hintsContainer.id = 'hintsContainer';
+      hintsContainer.style.display = 'flex';
+      hintsContainer.style.flexDirection = 'column';
+      hintsContainer.style.gap = '8px';
+
+      addHintBtn = document.createElement('button');
+      addHintBtn.id = 'addHintBtn';
+      addHintBtn.className = 'btn btn-ghost';
+      addHintBtn.type = 'button';
+      addHintBtn.textContent = '+ add hint';
+      addHintBtn.title = 'Add another hint textbox';
+      addHintBtn.addEventListener('click', ()=> addHintTextbox(''));
+
+      hintsSection.insertBefore(hintsContainer, hintsSection.querySelector('#repromptBtn'));
+      hintsSection.insertBefore(addHintBtn, hintsSection.querySelector('#repromptBtn'));
+    }
+
+    function addHintTextbox(value){
+      const row = document.createElement('div');
+      row.className = 'hint-row';
+
+      const ta = document.createElement('textarea');
+      ta.className = 'hint';
+      ta.placeholder = 'Type guidance for the next reprompt…';
+      ta.style.width = '100%';
+      ta.value = value || '';
+
+      const rm = document.createElement('button');
+      rm.type = 'button';
+      rm.className = 'btn btn-ghost remove-hint';
+      rm.title = 'Remove this hint';
+      rm.setAttribute('aria-label','Remove hint');
+      rm.textContent = '×';
+      rm.addEventListener('click', ()=>{
+        row.remove();
+        // Ensure at least one textbox remains
+        if (hintsContainer.querySelectorAll('textarea.hint').length === 0){
+          addHintTextbox('');
+        }
+      });
+
+      row.appendChild(ta);
+      row.appendChild(rm);
+      hintsContainer.appendChild(row);
+      return ta;
+    }
+
+    function renderHints(hintsArray){
+      if (!hintsContainer) return;
+      hintsContainer.innerHTML = '';
+      const items = (Array.isArray(hintsArray) ? hintsArray : []).filter(h => typeof h === 'string');
+      if (items.length === 0) {
+        // start with one empty textbox by default
+        addHintTextbox('');
+      } else {
+        items.forEach(h => addHintTextbox(h));
+      }
+    }
+
+    function collectHints(){
+      if (!hintsContainer) return [];
+      return Array.from(hintsContainer.querySelectorAll('textarea.hint'))
+        .map(ta => (ta.value || '').trim())
+        .filter(v => v.length > 0);
+    }
+
+    function clearHintsUI(){
+      if (!hintsContainer) return;
+      hintsContainer.innerHTML = '';
+      addHintTextbox('');
+    }
   </script>
 </body>
 </html>
\ No newline at end of file
diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
index 26c0c696..73d96baf 100644
--- a/src/agentlab/agents/hitl_agent/hitl_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -48,7 +48,7 @@ def get_action(self, obs):
                 screenshots=[],  # no overlay screenshots yet
                 axtree=obs.get("axtree_txt", ""),
                 history=[],
-                hint="",
+                hints=[],
                 suggestions=[],  # no suggestions yet
             )
             self.ui.update_context(initial_inputs)
@@ -73,11 +73,7 @@ def get_action(self, obs):
                     screenshots=screenshots,  # list of overlay screenshots for hover
                     axtree=obs.get("axtree_txt", ""),
                     history=[],  # TODO: add history
-                    hint=(
-                        "\n".join(f"{i}. {c}" for i, c in enumerate(step_hint, 1))
-                        if step_hint
-                        else ""
-                    ),
+                    hints=step_hint,
                     suggestions=suggestions,
                 )
 
@@ -85,8 +81,9 @@ def get_action(self, obs):
                 response = self.ui.wait_for_response(timeout=600)
 
                 if response["type"] == "reprompt":
-                    hint = response["payload"]["hint"]
-                    step_hint.append(hint)
+                    new_hints = response["payload"].get("hints", [])
+                    # Replace with the UI's hint list; keep the previous hints if it is not a list
+                    step_hint = list(new_hints) if isinstance(new_hints, list) else step_hint
                     candidates = self.subagent.get_candidate_generations(
                         obs, hint=step_hint if step_hint else None, n_candidates=3
                     )

From 97f390468d8cafe49f4dfa08a4e90e5b3d336208 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 18:48:44 -0400
Subject: [PATCH 09/21] add snapshot navigation to view the history of
 interactions.

---
 .../hitl_agent/generic_human_guided_agent.py  |   2 -
 .../agents/hitl_agent/hint_labelling.py       |   6 +-
 .../hint_labeling_ui.html                     | 230 ++++++++++++++++--
 src/agentlab/agents/hitl_agent/hitl_agent.py  |   5 +-
 4 files changed, 208 insertions(+), 35 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
index 29547fd4..e8d31688 100644
--- a/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
+++ b/src/agentlab/agents/hitl_agent/generic_human_guided_agent.py
@@ -206,7 +206,6 @@ def get_action(self, obs):
                 screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
                 screenshots=[],  # no overlay screenshots yet
                 axtree=obs.get("axtree_txt", ""),
-                history=[],
                 hints=[],
                 suggestions=[],  # no suggestions yet
             )
@@ -241,7 +240,6 @@ def get_action(self, obs):
                     screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
                     screenshots=screenshots,  # list of overlay screenshots for hover
                     axtree=obs.get("axtree_txt", ""),
-                    history=[],  # TODO: add history
                     hints=step_hint,
                     suggestions=suggestions,
                 )
diff --git a/src/agentlab/agents/hitl_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py
index 680f3cbe..2355c15b 100644
--- a/src/agentlab/agents/hitl_agent/hint_labelling.py
+++ b/src/agentlab/agents/hitl_agent/hint_labelling.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
- 
+
 HINT_LABELING_DIR = resources.files("agentlab.agents.hitl_agent.hint_labelling_ui_files")
 
 
@@ -19,7 +19,6 @@ class HintLabelingInputs(BaseModel):
     screenshot: str  # base64 screenshot (original/current)
     screenshots: List[str] = Field(default_factory=list)  # list of base64 screenshots for hover
     axtree: str
-    history: List[Dict[str, str]] = Field(default_factory=list)
     hints: List[str] = Field(default_factory=list)
     suggestions: List[Dict[str, str]] = Field(default_factory=list)
 
@@ -74,7 +73,7 @@ def _route_submit(self, route: playwright.sync_api.Route, request: playwright.sy
             body = json.loads(request.post_data or "{}")
         except Exception:
             body = {}
-    # Map UI payload -> your step shape
+        # Map UI payload -> your step shape
         msg = {
             "type": "step",
             "payload": {
@@ -97,7 +96,6 @@ def _to_ui_bootstrap(self, ctx: HintLabelingInputs) -> dict:
             "screenshot": ctx.screenshot,
             "screenshots": ctx.screenshots,  # list of screenshots for hover
             "axtree": ctx.axtree,
-            "history": ctx.history,
             "hints": ctx.hints,
             "suggestions": ctx.suggestions,
         }
diff --git a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
index 6c8c782a..a2c7b540 100644
--- a/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
+++ b/src/agentlab/agents/hitl_agent/hint_labelling_ui_files/hint_labeling_ui.html
@@ -11,7 +11,7 @@
     }
     *{box-sizing:border-box}
   body{margin:0;font-family:Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text)}
-  .container{max-width:1280px;margin:24px auto;padding:0 16px}
+    .container{max-width:1600px;margin:24px auto;padding:0 16px}
 
     .grid{
       display:grid;gap:16px;
@@ -41,6 +41,8 @@
   .hint-row textarea.hint{flex:1;margin:0}
   .remove-hint{width:36px;min-width:36px;height:36px;line-height:1;border-radius:10px;padding:0;font-size:18px;color:#64748b}
   .remove-hint:hover{background:#f8fafc}
+  .hint:disabled{background:#f1f5f9;color:#94a3b8}
+  .btn-ghost[disabled]{opacity:.6;cursor:not-allowed}
 
   .choices{margin-top:12px;display:flex;flex-direction:column;gap:10px;max-height:60vh;overflow:auto}
     .choice{display:grid;grid-template-columns:32px 1fr;gap:12px;align-items:start;background:#ffffff;border:1px solid var(--border);border-radius:14px;padding:12px}
@@ -52,6 +54,7 @@
     .choice .label{font-weight:700}
     .choice .value{color:#0f172a}
     .choice .reason{font-size:13px;color:#111827}
+    .choice .reason .value{white-space:pre-wrap; overflow-wrap:anywhere; word-break:break-word}
 
     .footer{display:flex;justify-content:flex-end;gap:12px;margin-top:10px}
 
@@ -61,8 +64,20 @@
 
     .pill{display:inline-block;padding:6px 10px;border-radius:999px;background:#f1f5f9;color:#0f172a;border:1px solid var(--border);font-size:12px}
 
+  /* Timeline styles */
+  .timeline-wrap{margin-top:16px}
+  .timeline{display:flex;align-items:center;gap:10px;padding:10px 12px;background:var(--card);border:1px solid var(--border);border-radius:12px}
+  .timeline .dot{width:12px;height:12px;border-radius:999px;background:#cbd5e1;cursor:pointer;transition:transform .1s ease-in-out}
+  .timeline .dot:hover{transform:scale(1.2)}
+  .timeline .dot.active{background:var(--brand);box-shadow:0 0 0 4px rgba(37,99,235,.15)}
+  .timeline .label{margin-left:auto;font-size:12px;color:var(--muted)}
+  .history-notice{margin-top:8px}
+  /* step numbers above dots */
+  .timeline .dot{position:relative}
+  .timeline .dot::after{content: attr(data-step); position:absolute; top:-16px; left:50%; transform:translateX(-50%); font-size:10px; color:var(--muted)}
+
   /* New: split layout for screenshot/tabs and hints+suggestions side-by-side */
-  .split{display:grid;gap:16px;grid-template-columns: 1fr 1.3fr;align-items:start;margin-top:16px}
+  .split{display:grid;gap:16px;grid-template-columns: 1.3fr 1fr;align-items:start;margin-top:16px}
   .right-stack{display:flex;flex-direction:column;gap:16px}
 
     @media (max-width: 900px){
@@ -91,6 +106,14 @@
 </head>
 <body>
   <div class="container">
+    <!-- Timeline + notice -->
+    <div class="timeline-wrap">
+      <div id="timeline" class="timeline" role="tablist" aria-label="Context updates timeline">
+        <!-- dots injected here -->
+        <span id="timelineLabel" class="label"></span>
+      </div>
+      <div id="historyNotice" class="banner info history-notice" style="display:none"></div>
+    </div>
     <!-- Top: Goal & Error -->
     <div class="grid" style="display: flex; gap: 16px;">
       <div class="card pad" style="flex: 1; height: 150px;">
@@ -132,7 +155,6 @@ <h2>Suggestions</h2>
         <div class="tabs">
           <button class="tab active" data-tab="screenshot">Screenshot</button>
           <button class="tab" data-tab="axtree">AxTree</button>
-          <button class="tab" data-tab="history">History</button>
         </div>
         <div class="pad tabpanel">
           <div id="tab-screenshot" class="tabcontent">
@@ -141,10 +163,7 @@ <h2>Suggestions</h2>
           <div id="tab-axtree" class="tabcontent" hidden>
             <textarea id="axtreeArea" class="axtree" readonly style="font-size: 12px; white-space: pre; overflow-wrap: normal;"></textarea>
           </div>
-          <div id="tab-history" class="tabcontent" hidden>
-            <!-- intentionally empty for now -->
-            <div class="banner info">History will appear here.</div>
-          </div>
+          
         </div>
       </div>
     </div>
@@ -171,13 +190,12 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
   *   hints: Array<string>
      *   suggestions: Array<{ action: string, think: string, id?: string }>
      */
-    window.__BOOTSTRAP_DATA__ = window.__BOOTSTRAP_DATA__ || {
+  window.__BOOTSTRAP_DATA__ = window.__BOOTSTRAP_DATA__ || {
       goal: "go to the hardware catalog store and order a developer laptop",
       error_feedback: "playwright error when clicking on something that is not visible (from the previous step)",
       screenshot: "", // fill with base64 (PNG/JPG). When empty, we show a placeholder.
       screenshots: [], // list of base64 screenshots for hover
-      axtree: "<root>\n  <window name=\"VITASPHERE\">…</window>\n</root>",
-      history: [],
+  axtree: "<root>\n  <window name=\"VITASPHERE\">…</window>\n</root>",
   hints: [],
       suggestions: [
         { id: "1", action: "click(\"42\")", think: "The button with id 42 advances the form." },
@@ -191,6 +209,11 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
   var hoverScreenshots = []; // store screenshots for hover
   var hoverEnabled = true; // track if hover behavior is enabled
 
+  // Timeline state
+  let timeline = []; // {data, event, ts, meta}
+    let timelineIndex = -1;
+  let hintsLockedUntilNextSnapshot = false;
+
     function applyContext(d){
       goalBox.textContent = d.goal || '';
       errorBox.textContent = d.error_feedback || '';
@@ -210,8 +233,15 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
 
     // REPLACE your old updateContext with this:
     function updateContext(data){
-      window.__BOOTSTRAP_DATA__ = data || {};
-      applyContext(window.__BOOTSTRAP_DATA__);
+      // push new snapshot to timeline and render
+      const d = data || {};
+      window.__BOOTSTRAP_DATA__ = d;
+      // If backend delivered new suggestions, ensure hints unlock for the latest snapshot
+      if (Array.isArray(d.suggestions) && d.suggestions.length > 0) {
+        hintsLockedUntilNextSnapshot = false;
+      }
+      pushSnapshot('update', d);
+  setTimeout(ensureLatestEditable, 0);
     }
 
     // Placeholder endpoints (replace later)
@@ -236,11 +266,25 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
     const submitStatus = document.getElementById('submitStatus');
     const progressArea = document.getElementById('progressArea');
     const progressContainer = document.getElementById('progressContainer');
+  const timelineEl = document.getElementById('timeline');
+  const timelineLabel = document.getElementById('timelineLabel');
+  const historyNotice = document.getElementById('historyNotice');
+  
 
     // State
     let currentSuggestions = [];
     let selectedId = null;
 
+    // Ensure latest snapshot is editable for hints and reprompt
+    function ensureLatestEditable(){
+      const isLatest = (timelineIndex === -1) || (timelineIndex === timeline.length - 1);
+      if (isLatest) {
+        hintsLockedUntilNextSnapshot = false;
+        setHintsEditable(true);
+        if (repromptBtn) repromptBtn.disabled = false;
+      }
+    }
+
     // Helpers
     function setVisible(el, visible){ el.style.display = visible ? '' : 'none'; }
     function setBanner(el, text, variant='info'){ el.className = `banner ${variant}`; el.textContent = text; setVisible(el,true); }
@@ -258,6 +302,19 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       setVisible(progressContainer, false);
     }
 
+    function updateModeForSnapshot(){
+      const isLatest = timelineIndex === timeline.length - 1;
+      repromptBtn.disabled = !isLatest;
+      // Only force-disable submit when not on latest snapshot; when latest, selection controls it
+  if (!isLatest) submitBtn.disabled = true;
+  setHintsEditable(isLatest && !hintsLockedUntilNextSnapshot);
+      if (!isLatest){
+        setBanner(historyNotice, 'Viewing past context snapshot. Use Left/Right arrows to navigate. Press End to go to latest.', 'info');
+      } else {
+        setVisible(historyNotice, false);
+      }
+    }
+
     function dataUrlFromBase64(b64){
       if(!b64) return 'data:image/svg+xml;charset=utf-8,' + encodeURIComponent(`<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"1600\" height=\"900\"><rect width=\"100%\" height=\"100%\" fill=\"#eef2ff\"/><text x=\"50%\" y=\"50%\" font-family=\"sans-serif\" font-size=\"24\" text-anchor=\"middle\" fill=\"#64748b\">No screenshot provided</text></svg>`);
       // naive sniff for png/jpg
@@ -325,8 +382,9 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
         reasonLabel.textContent = 'reasoning:';
         const reasonVal = document.createElement('span');
         reasonVal.className = 'value';
-        reasonVal.style.maxHeight = '3em';
-        reasonVal.style.overflowY = 'auto';
+  // Let the reasoning grow naturally; wrapping handled in CSS
+  reasonVal.style.maxHeight = '';
+  reasonVal.style.overflowY = '';
         reasonVal.textContent = ` ${sugg.think}`;
         reasonRow.appendChild(reasonLabel); reasonRow.appendChild(reasonVal);
 
@@ -337,6 +395,9 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
         wrapper.appendChild(box);
         choicesEl.appendChild(wrapper);
       });
+
+  // Ensure latest snapshot controls are enabled
+  ensureLatestEditable();
     }
 
     function currentSelection(){
@@ -365,6 +426,10 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
           method:'POST', headers:{'Content-Type':'application/json'},
           body: JSON.stringify({ hints })
         });
+  
+        // Lock current hints until a new snapshot arrives
+        hintsLockedUntilNextSnapshot = true;
+        setHintsEditable(false);
         // Don't expect a response - the backend will update the UI via updateContext
         updateProgress('Hint sent. Waiting for new suggestions...', true);
       }catch(err){
@@ -402,6 +467,16 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
         }
       });
 
+      // Record selection into snapshot meta for step counting and history visuals
+      try {
+        if (timeline[timelineIndex]) {
+          timeline[timelineIndex].meta = timeline[timelineIndex].meta || {};
+          timeline[timelineIndex].meta.selectedAction = selection.action;
+          timeline[timelineIndex].meta.selectedId = selectedId;
+          renderTimeline();
+        }
+      } catch {}
+
       // Disable hover behavior
       hoverEnabled = false;
 
@@ -417,6 +492,7 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
           method:'POST', headers:{'Content-Type':'application/json'},
           body: JSON.stringify(payload)
         });
+        
         // Don't expect a response - the backend will handle the selection
         updateProgress('Selection submitted successfully!', false);
       }catch(err){
@@ -428,21 +504,112 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
 
     // Initial render from BOOTSTRAP_DATA
     (function init(){
-      const d = window.__BOOTSTRAP_DATA__;
-      goalBox.textContent = d.goal || '';
-      errorBox.textContent = d.error_feedback || '';
-      screenshotImg.src = dataUrlFromBase64(d.screenshot || '');
-      axtreeArea.value = d.axtree || '';
-      renderSuggestions(Array.isArray(d.suggestions) ? d.suggestions : []);
       // setup hints UI
       setupHintsUI();
-      const incomingHints = Array.isArray(d.hints)
-        ? d.hints
-        : (d.hint ? [d.hint] : []);
-      renderHints(incomingHints);
-      // Don't show initial progress - only show when there's actual status
+      // prime timeline with initial data
+      const d = window.__BOOTSTRAP_DATA__;
+      // Do not add a placeholder snapshot; just render the initial context
+      applyContext(d);
+      const initHints = Array.isArray(d.hints) ? d.hints : (d.hint ? [d.hint] : []);
+      renderHints(initHints);
+      // Keyboard navigation for timeline
+      document.addEventListener('keydown', (e)=>{
+        const tag = (document.activeElement && document.activeElement.tagName) || '';
+        if (tag === 'TEXTAREA' || tag === 'INPUT') return; // don't hijack text editing
+        if (e.key === 'ArrowLeft') { goRelative(-1); }
+        else if (e.key === 'ArrowRight') { goRelative(1); }
+        else if (e.key === 'Home') { goTo(0); }
+        else if (e.key === 'End') { goTo(timeline.length - 1); }
+      });
+  // enable hints at start
+  setHintsEditable(true);
     })();
 
+    // Timeline helpers
+    function deepClone(obj){ try { return JSON.parse(JSON.stringify(obj)); } catch { return obj; } }
+    function pushSnapshot(event, data){
+      // Avoid adding placeholder snapshots when suggestions are missing or empty
+      const suggs = (data && Array.isArray(data.suggestions)) ? data.suggestions : [];
+      if (!Array.isArray((data||{}).suggestions) || suggs.length === 0){
+        applyContext(data || {});
+        const hints0 = Array.isArray((data||{}).hints) ? data.hints : ((data||{}).hint ? [data.hint] : []);
+        renderHints(hints0);
+        // Even if we didn't add a snapshot, if suggestions were updated later, unlock
+        if (Array.isArray(suggs) && suggs.length > 0) {
+          hintsLockedUntilNextSnapshot = false;
+        }
+        updateModeForSnapshot();
+        return;
+      }
+      const snap = { event, ts: Date.now(), data: deepClone(data), meta: {} };
+      // Apply on push to set context and then record it
+      applyContext(snap.data);
+      // Ensure hints UI reflects incoming data
+      const incomingHints = Array.isArray(snap.data.hints) ? snap.data.hints : (snap.data.hint ? [snap.data.hint] : []);
+      renderHints(incomingHints);
+      timeline.push(snap);
+      timelineIndex = timeline.length - 1;
+  renderTimeline();
+  // New snapshot unlocks hints (unless we immediately navigate away)
+  hintsLockedUntilNextSnapshot = false;
+      updateModeForSnapshot();
+  // Ensure latest snapshot is editable for hints
+  setHintsEditable(true);
+  // Also re-enable reprompt on the latest snapshot
+  if (repromptBtn) repromptBtn.disabled = false;
+  setTimeout(ensureLatestEditable, 0);
+    }
+
+    function goTo(i){
+      if (i < 0 || i >= timeline.length) return;
+      timelineIndex = i;
+      const snap = timeline[timelineIndex];
+      applyContext(snap.data);
+      // set hints from snapshot
+      const incomingHints = Array.isArray(snap.data.hints) ? snap.data.hints : (snap.data.hint ? [snap.data.hint] : []);
+      renderHints(incomingHints);
+      // If a selection was made on this snapshot, restore its visual state
+      if (snap.meta && snap.meta.selectedAction){
+        const selAction = snap.meta.selectedAction;
+        const allChoices = choicesEl.querySelectorAll('.choice');
+        allChoices.forEach((choice, idx) => {
+          const sugg = currentSuggestions[idx];
+          if (!sugg) return;
+          if (sugg.action === selAction){
+            choice.classList.add('selected');
+            choice.classList.remove('disabled');
+          } else {
+            choice.classList.add('disabled');
+            choice.classList.remove('selected');
+          }
+        });
+      }
+      renderTimeline();
+      updateModeForSnapshot();
+    }
+    function goRelative(d){ goTo(timelineIndex + d); }
+
+    function renderTimeline(){
+      // clear existing dots (except label span)
+      Array.from(timelineEl.querySelectorAll('.dot')).forEach(n => n.remove());
+      const count = timeline.length;
+      let stepsSoFar = 0;
+      for (let i = 0; i < count; i++){
+        const dot = document.createElement('div');
+        dot.className = 'dot' + (i === timelineIndex ? ' active' : '');
+        dot.title = `Snapshot ${i+1} of ${count}`;
+        dot.setAttribute('role','tab');
+  dot.setAttribute('aria-selected', String(i === timelineIndex));
+        if (timeline[i] && timeline[i].meta && timeline[i].meta.selectedAction){ stepsSoFar += 1; }
+        dot.setAttribute('data-step', stepsSoFar > 0 ? String(stepsSoFar) : '');
+        dot.addEventListener('click', ()=> goTo(i));
+        timelineEl.insertBefore(dot, timelineLabel);
+      }
+      timelineLabel.textContent = count ? `Snapshot ${timelineIndex+1} / ${count}` : '';
+    }
+
+    
+
     // Hints UI logic
     function setupHintsUI(){
       const hintsSection = document.getElementById('hintsSection');
@@ -494,6 +661,19 @@ <h2 style="color: white; margin-bottom: 8px;">Status</h2>
       return ta;
     }
 
+    function setHintsEditable(enabled){
+      // Toggle textareas
+      (hintsContainer ? hintsContainer.querySelectorAll('textarea.hint') : []).forEach((ta)=>{
+        ta.disabled = !enabled;
+      });
+      // Toggle add button
+      if (addHintBtn) addHintBtn.disabled = !enabled;
+      // Toggle remove buttons
+      (hintsContainer ? hintsContainer.querySelectorAll('.remove-hint') : []).forEach((btn)=>{
+        btn.disabled = !enabled;
+      });
+    }
+
     function renderHints(hintsArray){
       if (!hintsContainer) return;
       hintsContainer.innerHTML = '';
diff --git a/src/agentlab/agents/hitl_agent/hitl_agent.py b/src/agentlab/agents/hitl_agent/hitl_agent.py
index 73d96baf..9b84793b 100644
--- a/src/agentlab/agents/hitl_agent/hitl_agent.py
+++ b/src/agentlab/agents/hitl_agent/hitl_agent.py
@@ -5,14 +5,13 @@
 import playwright
 from browsergym.experiments.agent import Agent
 
-
 from agentlab.agents.agent_args import AgentArgs
+from agentlab.agents.agent_utils import overlay_action
 from agentlab.agents.hitl_agent.base_multi_candidate_agent import MultiCandidateAgent
 from agentlab.agents.hitl_agent.hint_labelling import (
     HintLabeling,
     HintLabelingInputs,
 )
-from agentlab.agents.agent_utils import overlay_action
 from agentlab.llm.llm_utils import img_to_base_64
 from agentlab.llm.tracking import cost_tracker_decorator
 
@@ -47,7 +46,6 @@ def get_action(self, obs):
                 screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
                 screenshots=[],  # no overlay screenshots yet
                 axtree=obs.get("axtree_txt", ""),
-                history=[],
                 hints=[],
                 suggestions=[],  # no suggestions yet
             )
@@ -72,7 +70,6 @@ def get_action(self, obs):
                     screenshot=(img_to_base_64(obs["screenshot"]) if "screenshot" in obs else ""),
                     screenshots=screenshots,  # list of overlay screenshots for hover
                     axtree=obs.get("axtree_txt", ""),
-                    history=[],  # TODO: add history
                     hints=step_hint,
                     suggestions=suggestions,
                 )

From 88d1d8dad706bd84649d5812833f2872f48ccd9b Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 19:16:54 -0400
Subject: [PATCH 10/21] View human-added hints in xray agent_info.

---
 src/agentlab/analyze/agent_xray.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 8accbfd6..6dbec117 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -818,6 +818,18 @@ def update_agent_info_html():
         s1, action_str = get_screenshot(info, info.step, False)
         s2, action_str = get_screenshot(info, info.step + 1, False)
         agent_info = info.exp_result.steps_info[info.step].agent_info
+        # Minimal: show step_hints if present
+        hints = (
+            agent_info.get("step_hints")
+            or agent_info.get("hints")
+            or agent_info.get("extra_info", {}).get("step_hints")
+        )
+        if hints:
+            if not isinstance(hints, (list, tuple)):
+                hints = [hints]
+            items = "".join(f"<li>{html.escape(str(h))}</li>" for h in hints)
+            hints_html = f"<html><body><h3>Step Hints</h3><ul>{items}</ul></body></html>"
+            return _page_to_iframe(hints_html), s1, s2
         page = agent_info.get("html_page", ["No Agent Info"])
         if page is None:
             page = """Fill up html_page attribute in AgentInfo to display here."""

From 6b78e8e4816c899322aaf220562ba429ede34e85 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 19:20:36 -0400
Subject: [PATCH 11/21] revert change to ipynb

---
 .../2_eval_on_miniwob/inspect_results.ipynb   | 258 +-----------------
 1 file changed, 7 insertions(+), 251 deletions(-)

diff --git a/tutorials/2_eval_on_miniwob/inspect_results.ipynb b/tutorials/2_eval_on_miniwob/inspect_results.ipynb
index 84a73cca..06127b78 100644
--- a/tutorials/2_eval_on_miniwob/inspect_results.ipynb
+++ b/tutorials/2_eval_on_miniwob/inspect_results.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "58086537",
    "metadata": {},
    "outputs": [],
@@ -25,47 +25,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "7901cccc",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results')"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "RESULTS_DIR"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "50be19a9",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Searching experiments directories.: 100%|██████████| 1/1 [00:00<00:00, 5433.04it/s]\n",
-      "Loading results: 100%|██████████| 1/1 [00:00<00:00, 373.26it/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# replace this by your desired directory if needed.\n",
     "result_dir = get_most_recent_study(RESULTS_DIR, contains=None)\n",
@@ -76,222 +39,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "82cc1557",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create')"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "result_dir"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "a424c470",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found multiple configuration, averaging across tasks and returning a per-agent report.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<style type=\"text/css\">\n",
-       "#T_1d2fe th {\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<table id=\"T_1d2fe\">\n",
-       "  <thead>\n",
-       "    <tr>\n",
-       "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_1d2fe_level0_col0\" class=\"col_heading level0 col0\" >agent.agent\n",
-       "name</th>\n",
-       "      <th id=\"T_1d2fe_level0_col1\" class=\"col_heading level0 col1\" >env.benchmark</th>\n",
-       "      <th id=\"T_1d2fe_level0_col2\" class=\"col_heading level0 col2\" >avg\n",
-       "reward</th>\n",
-       "      <th id=\"T_1d2fe_level0_col3\" class=\"col_heading level0 col3\" >std\n",
-       "err</th>\n",
-       "      <th id=\"T_1d2fe_level0_col4\" class=\"col_heading level0 col4\" >avg\n",
-       "steps</th>\n",
-       "      <th id=\"T_1d2fe_level0_col5\" class=\"col_heading level0 col5\" >n\n",
-       "completed</th>\n",
-       "      <th id=\"T_1d2fe_level0_col6\" class=\"col_heading level0 col6\" >n\n",
-       "err</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th id=\"T_1d2fe_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
-       "      <td id=\"T_1d2fe_row0_col0\" class=\"data row0 col0\" >HITL-GenericAgent-gpt-5-mini-2025-08-07</td>\n",
-       "      <td id=\"T_1d2fe_row0_col1\" class=\"data row0 col1\" >workarena</td>\n",
-       "      <td id=\"T_1d2fe_row0_col2\" class=\"data row0 col2\" >nan</td>\n",
-       "      <td id=\"T_1d2fe_row0_col3\" class=\"data row0 col3\" >nan</td>\n",
-       "      <td id=\"T_1d2fe_row0_col4\" class=\"data row0 col4\" >nan</td>\n",
-       "      <td id=\"T_1d2fe_row0_col5\" class=\"data row0 col5\" >0/1</td>\n",
-       "      <td id=\"T_1d2fe_row0_col6\" class=\"data row0 col6\" >0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n"
-      ],
-      "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x125c55850>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "report = inspect_results.global_report(result_df)\n",
     "inspect_results.display_report(report)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f86e44fd",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
-       "columns": [
-        {
-         "name": "('agent.agent_name', 'env.benchmark')",
-         "rawType": "object",
-         "type": "unknown"
-        },
-        {
-         "name": "avg_reward",
-         "rawType": "float64",
-         "type": "float"
-        },
-        {
-         "name": "std_err",
-         "rawType": "float64",
-         "type": "float"
-        },
-        {
-         "name": "avg_steps",
-         "rawType": "float64",
-         "type": "float"
-        },
-        {
-         "name": "n_completed",
-         "rawType": "object",
-         "type": "string"
-        },
-        {
-         "name": "n_err",
-         "rawType": "int64",
-         "type": "integer"
-        }
-       ],
-       "ref": "ea68795e-a1d8-404e-9e36-1061d8fa9e87",
-       "rows": [
-        [
-         "('HITL-GenericAgent-gpt-5-mini-2025-08-07', 'workarena')",
-         null,
-         null,
-         null,
-         "0/1",
-         "0"
-        ]
-       ],
-       "shape": {
-        "columns": 5,
-        "rows": 1
-       }
-      },
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th>avg_reward</th>\n",
-       "      <th>std_err</th>\n",
-       "      <th>avg_steps</th>\n",
-       "      <th>n_completed</th>\n",
-       "      <th>n_err</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>agent.agent_name</th>\n",
-       "      <th>env.benchmark</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>HITL-GenericAgent-gpt-5-mini-2025-08-07</th>\n",
-       "      <th>workarena</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0/1</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                       avg_reward  std_err  \\\n",
-       "agent.agent_name                        env.benchmark                        \n",
-       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena             NaN      NaN   \n",
-       "\n",
-       "                                                       avg_steps n_completed  \\\n",
-       "agent.agent_name                        env.benchmark                          \n",
-       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena            NaN         0/1   \n",
-       "\n",
-       "                                                       n_err  \n",
-       "agent.agent_name                        env.benchmark         \n",
-       "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena          0  "
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "385559d7",
@@ -393,7 +149,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "agentlab",
+   "display_name": "AgentLab",
    "language": "python",
    "name": "python3"
   },
@@ -407,7 +163,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.9"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,

From 79cde9048aa4d20dc8e3319774a22099a3fcf57a Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 19:41:00 -0400
Subject: [PATCH 12/21] add agent-mentor launcher

---
 pyproject.toml                                |   1 +
 .../agents/hitl_agent/launch_hint_ui.py       | 122 ++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 src/agentlab/agents/hitl_agent/launch_hint_ui.py

diff --git a/pyproject.toml b/pyproject.toml
index ef9a7342..b2d7eacb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -109,3 +109,4 @@ hint = [
 [project.scripts]
 agentlab-assistant = "agentlab.ui_assistant:main"
 agentlab-xray = "agentlab.analyze.agent_xray:main"
+agentlab-mentor = "agentlab.agents.hitl_agent.launch_hint_ui:main"
diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
new file mode 100644
index 00000000..d5f43fa5
--- /dev/null
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -0,0 +1,122 @@
+"""
+Console launcher for the Human-in-the-Loop Generic Agent UI.
+
+Usage (installed entry point):
+  agentlab-hint-ui --benchmark miniwob --task-name miniwob.book-flight --seed 123
+
+This will run a Study with the MultipleProposalGenericAgent and the selected task.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from typing import Optional
+
+import bgym
+
+from agentlab.agents.hitl_agent.generic_human_guided_agent import (
+    HUMAN_GUIDED_GENERIC_AGENT,
+)
+from agentlab.experiments.study import Study
+
+
+def build_benchmark(benchmark_name: str, task_name: Optional[str], seed: Optional[int]):
+    # Instantiate benchmark by name using BrowserGym registry
+    try:
+        benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name.lower()]()
+    except KeyError as e:
+        choices = ", ".join(sorted(bgym.DEFAULT_BENCHMARKS.keys()))
+        raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e
+
+    if task_name:
+        # If a fully-qualified name is provided, filter by exact match; otherwise, allow glob
+        if any(ch in task_name for ch in "*?[]"):
+            benchmark = benchmark.subset_from_glob("task_name", task_name)
+        else:
+            benchmark = benchmark.subset_from_glob("task_name", task_name)
+
+    # If a specific seed is provided, set it on all env args
+    if seed is not None:
+        for env_args in benchmark.env_args_list:
+            env_args.task_seed = seed
+
+    # Reasonable defaults for interactive UI
+    for env_args in benchmark.env_args_list:
+        env_args.max_steps = env_args.max_steps or 100
+        # Leave headless True by default; UI is external Gradio, not browser GUI
+        env_args.headless = True
+
+    return benchmark
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description="Run HITL Generic Agent UI on a benchmark task")
+    p.add_argument(
+        "--benchmark",
+        required=True,
+        help="Benchmark name as registered in BrowserGym, e.g., miniwob, workarena_l1, webarena, visualwebarena",
+    )
+    p.add_argument(
+        "--task-name",
+        dest="task_name",
+        default=None,
+        help="Task name or glob to filter tasks within the benchmark (e.g., 'miniwob.*book*')",
+    )
+    p.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Task seed to use for all selected tasks. If omitted, tasks keep their configured/random seed.",
+    )
+    p.add_argument(
+        "--jobs",
+        type=int,
+        default=1,
+        help="Number of parallel jobs (UI agent typically runs sequentially)",
+    )
+    p.add_argument(
+        "--parallel-backend",
+        default="sequential",
+        choices=["sequential", "ray", "joblib"],
+        help="Parallel backend to use",
+    )
+    p.add_argument(
+        "--retries",
+        type=int,
+        default=1,
+        help="Number of relaunch attempts for incomplete experiments",
+    )
+    p.add_argument(
+        "--log-level",
+        default="WARNING",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging level",
+    )
+    return p.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    logging_level = getattr(logging, args.log_level)
+
+    benchmark = build_benchmark(args.benchmark, args.task_name, args.seed)
+    agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
+
+    study = Study(
+        agent_configs,
+        benchmark,
+        logging_level=logging_level,
+        logging_level_stdout=logging_level,
+    )
+
+    study.run(
+        n_jobs=args.jobs,
+        parallel_backend=args.parallel_backend,
+        n_relaunch=args.retries,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 517aaf57c845f0e6a0a869bf816e32e48ea4407d Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 2 Sep 2025 19:47:14 -0400
Subject: [PATCH 13/21] adding headless as args in agentlab-mentor

---
 .../agents/hitl_agent/launch_hint_ui.py         | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index d5f43fa5..9fe29ffe 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -2,7 +2,7 @@
 Console launcher for the Human-in-the-Loop Generic Agent UI.
 
 Usage (installed entry point):
-  agentlab-hint-ui --benchmark miniwob --task-name miniwob.book-flight --seed 123
+    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
 
 This will run a Study with the MultipleProposalGenericAgent and the selected task.
 """
@@ -21,7 +21,9 @@
 from agentlab.experiments.study import Study
 
 
-def build_benchmark(benchmark_name: str, task_name: Optional[str], seed: Optional[int]):
+def build_benchmark(
+    benchmark_name: str, task_name: Optional[str], seed: Optional[int], headless: bool
+):
     # Instantiate benchmark by name using BrowserGym registry
     try:
         benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name.lower()]()
@@ -44,8 +46,7 @@ def build_benchmark(benchmark_name: str, task_name: Optional[str], seed: Optiona
     # Reasonable defaults for interactive UI
     for env_args in benchmark.env_args_list:
         env_args.max_steps = env_args.max_steps or 100
-        # Leave headless True by default; UI is external Gradio, not browser GUI
-        env_args.headless = True
+        env_args.headless = headless
 
     return benchmark
 
@@ -93,6 +94,12 @@ def parse_args():
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         help="Logging level",
     )
+    p.add_argument(
+        "--headless",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Run the browser headless (default: True). Use --no-headless to show the browser.",
+    )
     return p.parse_args()
 
 
@@ -101,7 +108,7 @@ def main():
 
     logging_level = getattr(logging, args.log_level)
 
-    benchmark = build_benchmark(args.benchmark, args.task_name, args.seed)
+    benchmark = build_benchmark(args.benchmark, args.task_name, args.seed, args.headless)
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
 
     study = Study(

From 9ed33763478c0b367de6a650841f14eb627cdbb3 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 3 Sep 2025 10:57:05 -0400
Subject: [PATCH 14/21] improve entry point args for agentlab-mentor to allow
 multiple seeds

---
 .../agents/hitl_agent/launch_hint_ui.py       | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index 9fe29ffe..0feeaf4a 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -2,7 +2,7 @@
 Console launcher for the Human-in-the-Loop Generic Agent UI.
 
 Usage (installed entry point):
-    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
+    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --seed 456 --no-headless
 
 This will run a Study with the MultipleProposalGenericAgent and the selected task.
 """
@@ -11,6 +11,7 @@
 
 import argparse
 import logging
+import copy
 from typing import Optional
 
 import bgym
@@ -20,9 +21,10 @@
 )
 from agentlab.experiments.study import Study
 
+logger = logging.getLogger(__name__)
 
 def build_benchmark(
-    benchmark_name: str, task_name: Optional[str], seed: Optional[int], headless: bool
+    benchmark_name: str, task_name: Optional[str], seeds: Optional[list[int]], headless: bool
 ):
     # Instantiate benchmark by name using BrowserGym registry
     try:
@@ -32,16 +34,20 @@ def build_benchmark(
         raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e
 
     if task_name:
-        # If a fully-qualified name is provided, filter by exact match; otherwise, allow glob
-        if any(ch in task_name for ch in "*?[]"):
-            benchmark = benchmark.subset_from_glob("task_name", task_name)
-        else:
-            benchmark = benchmark.subset_from_glob("task_name", task_name)
-
-    # If a specific seed is provided, set it on all env args
-    if seed is not None:
-        for env_args in benchmark.env_args_list:
-            env_args.task_seed = seed
+        benchmark = benchmark.subset_from_glob("task_name", task_name)
+        tasks = list(set(e.task_name for e in benchmark.env_args_list))
+        logger.warning(f'Found {len(tasks)} tasks matching "{task_name}:" \n {tasks}, using only the first one.')
+        task = tasks[0]
+
+    # If specific seeds are provided, duplicate envs for each seed
+    if seeds is not None:
+        new_env_args_list = []
+        task_env = next((x for x in benchmark.env_args_list if x.task_name == task))
+        for seed in seeds:
+            ea = copy.deepcopy(task_env)
+            ea.task_seed = seed
+            new_env_args_list.append(ea)
+        benchmark.env_args_list = new_env_args_list
 
     # Reasonable defaults for interactive UI
     for env_args in benchmark.env_args_list:
@@ -66,9 +72,10 @@ def parse_args():
     )
     p.add_argument(
         "--seed",
+        action="append",
         type=int,
         default=None,
-        help="Task seed to use for all selected tasks. If omitted, tasks keep their configured/random seed.",
+        help="Task seed. Repeat flag for multiple seeds (e.g., --seed 1 --seed 2). If omitted, tasks keep their configured/random seed.",
     )
     p.add_argument(
         "--jobs",

From 4f50293f3fbfb6942ce5546ed6bcd1bae55c5f6a Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 3 Sep 2025 12:07:53 -0400
Subject: [PATCH 15/21] update error-handling for agentlab-mentor

---
 .../agents/hitl_agent/launch_hint_ui.py       | 45 ++++++++++++++++---
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index 0feeaf4a..794ebe7d 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -34,15 +34,50 @@ def build_benchmark(
         raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e
 
     if task_name:
-        benchmark = benchmark.subset_from_glob("task_name", task_name)
-        tasks = list(set(e.task_name for e in benchmark.env_args_list))
-        logger.warning(f'Found {len(tasks)} tasks matching "{task_name}:" \n {tasks}, using only the first one.')
-        task = tasks[0]
+        try:
+            benchmark = benchmark.subset_from_glob("task_name", task_name)
+            tasks = sorted({e.task_name for e in benchmark.env_args_list})
+            if not tasks:
+                msg = f"No tasks found matching pattern '{task_name}'."
+                logger.error(msg)
+                raise SystemExit(msg)
+            if len(tasks) > 1:
+                logger.warning(
+                    "Found %d tasks matching '%s'. Using only the first: %s",
+                    len(tasks),
+                    task_name,
+                    tasks[0],
+                )
+            task = tasks[0]
+        except SystemExit:
+            raise
+        except Exception as e:
+            logger.error(f"Error occurred while filtering tasks: {e}")
+            raise SystemExit(str(e))
 
     # If specific seeds are provided, duplicate envs for each seed
     if seeds is not None:
         new_env_args_list = []
-        task_env = next((x for x in benchmark.env_args_list if x.task_name == task))
+        # If a specific task was selected above, duplicate that; otherwise, ensure there is exactly one task
+        if 'task' in locals():
+            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
+            if task_env is None:
+                msg = f"Internal error: selected task '{task}' not found in env list."
+                logger.error(msg)
+                raise SystemExit(msg)
+        else:
+            unique_tasks = sorted({e.task_name for e in benchmark.env_args_list})
+            if not unique_tasks:
+                raise SystemExit("No tasks available in the selected benchmark.")
+            if len(unique_tasks) > 1:
+                raise SystemExit(
+                    "Multiple tasks present in benchmark. Please specify --task-name to apply seeds to a single task."
+                )
+            task = unique_tasks[0]
+            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
+            if task_env is None:
+                raise SystemExit(f"Task '{task}' not found in env list.")
+
         for seed in seeds:
             ea = copy.deepcopy(task_env)
             ea.task_seed = seed

From dbc332fcd504c43fe3ff63ff663ea68bb3e22696 Mon Sep 17 00:00:00 2001
From: Patrice Bechard <patrice.bechard@servicenow.com>
Date: Wed, 3 Sep 2025 12:20:02 -0400
Subject: [PATCH 16/21] update default window size (revert to playwright
 default)

---
 src/agentlab/agents/hitl_agent/hint_labelling.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/hint_labelling.py b/src/agentlab/agents/hitl_agent/hint_labelling.py
index 2355c15b..f1120f02 100644
--- a/src/agentlab/agents/hitl_agent/hint_labelling.py
+++ b/src/agentlab/agents/hitl_agent/hint_labelling.py
@@ -24,12 +24,10 @@ class HintLabelingInputs(BaseModel):
 
 
 class HintLabeling:
-    def __init__(self, headless: bool, window_size=(600, 1000), *args, **kwargs):
+    def __init__(self, headless: bool, *args, **kwargs):
         pw_opt = _get_global_playwright()
         pw: playwright.sync_api.Playwright = pw_opt  # type: ignore[assignment]
-        self.browser = pw.chromium.launch(
-            headless=headless, args=[f"--window-size={window_size[0]},{window_size[1]}"]
-        )
+        self.browser = pw.chromium.launch(headless=headless)
         self.context = self.browser.new_context(
             no_viewport=True,
         )

From 7d988a83ac4c03ea4a38f5614e64b2fd82cfdd45 Mon Sep 17 00:00:00 2001
From: Patrice Bechard <patrice.bechard@servicenow.com>
Date: Wed, 3 Sep 2025 13:30:36 -0400
Subject: [PATCH 17/21] hack to fix bbox issue

---
 src/agentlab/agents/agent_utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
index 7ce8cdad..dd30b84e 100644
--- a/src/agentlab/agents/agent_utils.py
+++ b/src/agentlab/agents/agent_utils.py
@@ -139,5 +139,17 @@ def overlay_action(obs, action):
     """Overlays actions on screenshot in-place"""
     act_img = copy.deepcopy(obs["screenshot"])
     act_img = Image.fromarray(act_img)
-    overlay_utils.annotate_action(act_img, action, properties=obs["extra_element_properties"])
+
+    new_obs_properties = copy.deepcopy(obs["extra_element_properties"])
+    import os
+    if os.getenv("AGENTLAB_USE_RETINA"):    
+        # HACK: divide everything by 2 in the obs
+        # TODO: make this more robust by changing login in annotate_action directly (or maybe in the obs section?)
+        for key, value in new_obs_properties.items():
+            try:
+                new_obs_properties[key]["bbox"] = [elem / 2 for elem in value["bbox"]]
+            except:
+                pass
+
+    overlay_utils.annotate_action(act_img, action, properties=new_obs_properties)
     return img_to_base_64(act_img)

From 8addffb9a79d560fdb39d24d413fdf0cc50feca8 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 3 Sep 2025 17:07:51 -0400
Subject: [PATCH 18/21] simplify CLI args and add ability to download hints
 using CLI.

---
 .../agents/hitl_agent/launch_hint_ui.py       | 206 +++++++++---------
 1 file changed, 103 insertions(+), 103 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index 794ebe7d..0b0aca84 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -2,7 +2,7 @@
 Console launcher for the Human-in-the-Loop Generic Agent UI.
 
 Usage (installed entry point):
-    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --seed 456 --no-headless
+    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
 
 This will run a Study with the MultipleProposalGenericAgent and the selected task.
 """
@@ -11,21 +11,17 @@
 
 import argparse
 import logging
-import copy
-from typing import Optional
 
 import bgym
 
 from agentlab.agents.hitl_agent.generic_human_guided_agent import (
     HUMAN_GUIDED_GENERIC_AGENT,
 )
+from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.study import Study
+from pathlib import Path
 
-logger = logging.getLogger(__name__)
-
-def build_benchmark(
-    benchmark_name: str, task_name: Optional[str], seeds: Optional[list[int]], headless: bool
-):
+def build_benchmark(benchmark_name: str, task_name: str, seed: int, headless: bool):
     # Instantiate benchmark by name using BrowserGym registry
     try:
         benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name.lower()]()
@@ -33,108 +29,95 @@ def build_benchmark(
         choices = ", ".join(sorted(bgym.DEFAULT_BENCHMARKS.keys()))
         raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e
 
-    if task_name:
-        try:
-            benchmark = benchmark.subset_from_glob("task_name", task_name)
-            tasks = sorted({e.task_name for e in benchmark.env_args_list})
-            if not tasks:
-                msg = f"No tasks found matching pattern '{task_name}'."
-                logger.error(msg)
-                raise SystemExit(msg)
-            if len(tasks) > 1:
-                logger.warning(
-                    "Found %d tasks matching '%s'. Using only the first: %s",
-                    len(tasks),
-                    task_name,
-                    tasks[0],
-                )
-            task = tasks[0]
-        except SystemExit:
-            raise
-        except Exception as e:
-            logger.error(f"Error occurred while filtering tasks: {e}")
-            raise SystemExit(str(e))
-
-    # If specific seeds are provided, duplicate envs for each seed
-    if seeds is not None:
-        new_env_args_list = []
-        # If a specific task was selected above, duplicate that; otherwise, ensure there is exactly one task
-        if 'task' in locals():
-            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
-            if task_env is None:
-                msg = f"Internal error: selected task '{task}' not found in env list."
-                logger.error(msg)
-                raise SystemExit(msg)
-        else:
-            unique_tasks = sorted({e.task_name for e in benchmark.env_args_list})
-            if not unique_tasks:
-                raise SystemExit("No tasks available in the selected benchmark.")
-            if len(unique_tasks) > 1:
-                raise SystemExit(
-                    "Multiple tasks present in benchmark. Please specify --task-name to apply seeds to a single task."
-                )
-            task = unique_tasks[0]
-            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
-            if task_env is None:
-                raise SystemExit(f"Task '{task}' not found in env list.")
-
-        for seed in seeds:
-            ea = copy.deepcopy(task_env)
-            ea.task_seed = seed
-            new_env_args_list.append(ea)
-        benchmark.env_args_list = new_env_args_list
+    filtered_env_args = [
+        env_args for env_args in benchmark.env_args_list if env_args.task_name == task_name
+    ]
+    if not filtered_env_args:
+        raise SystemExit(f'No tasks found matching "{task_name}"')
+    filtered_env_args = filtered_env_args[:1]  # take the first one
+    benchmark.env_args_list = filtered_env_args
 
     # Reasonable defaults for interactive UI
     for env_args in benchmark.env_args_list:
-        env_args.max_steps = env_args.max_steps or 100
+        env_args.task_seed = seed
+        env_args.max_steps = env_args.max_steps or 200
         env_args.headless = headless
 
     return benchmark
 
 
+def extract_hints_from_experiment_trace(exp_dir):
+    """Extracts hints from every step of each episode in a exp_dir and returns a df with each row containing a hint.
+
+    Args:
+        exp_dir: Path-like to a study/experiment directory whose results should be scanned.
+
+    Returns:
+        pandas.DataFrame: One row per hint with metadata columns.
+    """
+    import pandas as pd
+
+    from agentlab.analyze import inspect_results
+    from agentlab.experiments.exp_utils import RESULTS_DIR
+    from agentlab.experiments.loop import ExpResult
+
+    output = []
+    # Use provided exp_dir if set; otherwise default to <$AGENTLAB_EXP_ROOT>/agentlab_mentor
+    result_df = inspect_results.load_result_df(exp_dir or (RESULTS_DIR / "agentlab_mentor"))
+    if result_df is None:
+        # No results to parse; return empty dataframe with expected columns
+        return pd.DataFrame(
+            columns=[
+                "exp_id",
+                "agent_name",
+                "benchmark",
+                "task_name",
+                "episode_reward",
+                "hint",
+            ]
+        )
+    result_df = result_df.reset_index()
+    for _, row in result_df.iterrows():
+        result = ExpResult(row.exp_dir)
+        episode = result.steps_info
+        episode_reward = max([step.reward for step in episode])
+        for step_info in episode:
+            step_hints = step_info.agent_info.get("extra_info", {}).get("step_hints", None)
+            if step_hints:
+                for hint in step_hints:
+                    output.append(
+                        {
+                            "exp_id": row["exp_id"],
+                            "agent_name": row["agent.agent_name"],
+                            "benchmark": row["env.task_name"].split(".")[0],
+                            "task_name": row["env.task_name"],
+                            "episode_reward": episode_reward,
+                            "hint": hint,
+                        }
+                    )
+    output = pd.DataFrame(output)
+    output = output.dropna()
+    return output
+
+
 def parse_args():
     p = argparse.ArgumentParser(description="Run HITL Generic Agent UI on a benchmark task")
     p.add_argument(
         "--benchmark",
-        required=True,
+        required=False,
         help="Benchmark name as registered in BrowserGym, e.g., miniwob, workarena_l1, webarena, visualwebarena",
     )
     p.add_argument(
         "--task-name",
         dest="task_name",
-        default=None,
-        help="Task name or glob to filter tasks within the benchmark (e.g., 'miniwob.*book*')",
+        required=False,
+        help="Exact task name within the benchmark (e.g., 'miniwob.book-flight')",
     )
     p.add_argument(
         "--seed",
-        action="append",
-        type=int,
-        default=None,
-        help="Task seed. Repeat flag for multiple seeds (e.g., --seed 1 --seed 2). If omitted, tasks keep their configured/random seed.",
-    )
-    p.add_argument(
-        "--jobs",
         type=int,
-        default=1,
-        help="Number of parallel jobs (UI agent typically runs sequentially)",
-    )
-    p.add_argument(
-        "--parallel-backend",
-        default="sequential",
-        choices=["sequential", "ray", "joblib"],
-        help="Parallel backend to use",
-    )
-    p.add_argument(
-        "--retries",
-        type=int,
-        default=1,
-        help="Number of relaunch attempts for incomplete experiments",
-    )
-    p.add_argument(
-        "--log-level",
-        default="WARNING",
-        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
-        help="Logging level",
+        required=False,
+        help="Task seed to use for the selected task.",
     )
     p.add_argument(
         "--headless",
@@ -142,28 +125,45 @@ def parse_args():
         default=True,
         help="Run the browser headless (default: True). Use --no-headless to show the browser.",
     )
+    p.add_argument(
+        "--download-hints",
+        nargs="?",
+        const="extracted_hints.csv",
+        required=False,
+        default=None,
+        metavar="[OUTPUT_CSV]",
+        help=(
+            "Extract hints from the default study directory and save to OUTPUT_CSV. "
+            "If OUTPUT_CSV is omitted, saves to 'extracted_hints.csv'. When provided, other args are ignored."
+        ),
+    )
     return p.parse_args()
 
 
 def main():
     args = parse_args()
-
-    logging_level = getattr(logging, args.log_level)
-
+    save_dir = RESULTS_DIR / "agentlab_mentor"
+    if args.download_hints:
+        df = extract_hints_from_experiment_trace(save_dir)
+        out_path = Path(args.download_hints)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(out_path, index=False)
+        print(str(out_path))
+        return
+    # Validate required args only when not downloading hints
+    if not args.benchmark or not args.task_name or args.seed is None:
+        raise SystemExit(
+            "--benchmark, --task-name, and --seed are required unless using --download-hints"
+        )
     benchmark = build_benchmark(args.benchmark, args.task_name, args.seed, args.headless)
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
-
-    study = Study(
-        agent_configs,
-        benchmark,
-        logging_level=logging_level,
-        logging_level_stdout=logging_level,
-    )
-
+    # study is needed to run the 'set_benchmark' method which sets appropriate agent parameters.
+    study = Study(agent_args=agent_configs, benchmark=benchmark, logging_level=logging.WARNING)
     study.run(
-        n_jobs=args.jobs,
-        parallel_backend=args.parallel_backend,
-        n_relaunch=args.retries,
+        n_jobs=1,
+        parallel_backend="sequential",
+        n_relaunch=1,
+        exp_root=save_dir,
     )
 
 

From 59217775b0a830905dd6f42e1dda0df1db90fddf Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 3 Sep 2025 17:11:27 -0400
Subject: [PATCH 19/21] black

---
 src/agentlab/agents/hitl_agent/launch_hint_ui.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index 0b0aca84..18914d14 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -11,6 +11,7 @@
 
 import argparse
 import logging
+from pathlib import Path
 
 import bgym
 
@@ -19,7 +20,7 @@
 )
 from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.study import Study
-from pathlib import Path
+
 
 def build_benchmark(benchmark_name: str, task_name: str, seed: int, headless: bool):
     # Instantiate benchmark by name using BrowserGym registry

From 92f9a74870fea1c848e57d21b2d71941a63f4403 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Wed, 3 Sep 2025 17:27:51 -0400
Subject: [PATCH 20/21] formatting

---
 src/agentlab/agents/agent_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
index dd30b84e..179a94d2 100644
--- a/src/agentlab/agents/agent_utils.py
+++ b/src/agentlab/agents/agent_utils.py
@@ -142,7 +142,8 @@ def overlay_action(obs, action):
 
     new_obs_properties = copy.deepcopy(obs["extra_element_properties"])
     import os
-    if os.getenv("AGENTLAB_USE_RETINA"):    
+
+    if os.getenv("AGENTLAB_USE_RETINA"):
         # HACK: divide everything by 2 in the obs
         # TODO: make this more robust by changing login in annotate_action directly (or maybe in the obs section?)
         for key, value in new_obs_properties.items():

From 02347f831ab3b7f732011848429c7cfe0feb2348 Mon Sep 17 00:00:00 2001
From: Patrice Bechard <patrice.bechard@servicenow.com>
Date: Wed, 3 Sep 2025 18:21:48 -0400
Subject: [PATCH 21/21] add flag to select llm config

---
 src/agentlab/agents/hitl_agent/launch_hint_ui.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/agentlab/agents/hitl_agent/launch_hint_ui.py b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
index 18914d14..df2e9dbc 100644
--- a/src/agentlab/agents/hitl_agent/launch_hint_ui.py
+++ b/src/agentlab/agents/hitl_agent/launch_hint_ui.py
@@ -15,9 +15,7 @@
 
 import bgym
 
-from agentlab.agents.hitl_agent.generic_human_guided_agent import (
-    HUMAN_GUIDED_GENERIC_AGENT,
-)
+from agentlab.agents.hitl_agent.generic_human_guided_agent import get_base_agent
 from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.study import Study
 
@@ -120,6 +118,12 @@ def parse_args():
         required=False,
         help="Task seed to use for the selected task.",
     )
+    p.add_argument(
+        "--llm-config",
+        dest="llm_config",
+        default="openai/gpt-5-mini-2025-08-07",
+        help="LLM configuration to use for the agent (e.g., 'azure/gpt-5-mini-2025-08-07').",
+    )
     p.add_argument(
         "--headless",
         action=argparse.BooleanOptionalAction,
@@ -157,7 +161,7 @@ def main():
             "--benchmark, --task-name, and --seed are required unless using --download-hints"
         )
     benchmark = build_benchmark(args.benchmark, args.task_name, args.seed, args.headless)
-    agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
+    agent_configs = [get_base_agent(args.llm_config)]
     # study is needed to run the 'set_benchmark' method which sets appropriate agent parameters.
     study = Study(agent_args=agent_configs, benchmark=benchmark, logging_level=logging.WARNING)
     study.run(