From f57a919ba2703177598ae03aff0ef301b277d4e5 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Jun 2025 09:03:20 -0400 Subject: [PATCH 01/37] Enhance goal application to support dynamic system messages based on multi-action configuration --- .../agents/tool_use_agent/tool_use_agent.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 86140d02..05e89d4a 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -127,8 +127,10 @@ class Goal(Block): goal_as_system_msg: bool = True - def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict: - system_message = llm.msg.system().add_text(SYS_MSG) + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG + ) -> dict: + system_message = llm.msg.system().add_text(sys_msg) discussion.append(system_message) if self.goal_as_system_msg: @@ -441,7 +443,12 @@ def get_action(self, obs: Any) -> float: self.llm.reset_stats() if not self.discussion.is_goal_set(): self.discussion.new_group("goal") - self.config.goal.apply(self.llm, self.discussion, obs) + + if self.config.multiaction: + sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed." + else: + sys_msg = SYS_MSG + "\nYou can only take one action at a time." + self.config.goal.apply(self.llm, self.discussion, obs, sys_msg) self.config.summarizer.apply_init(self.llm, self.discussion) self.config.general_hints.apply(self.llm, self.discussion) self.task_hint.apply(self.llm, self.discussion, self.task_name) @@ -460,7 +467,6 @@ def get_action(self, obs: Any) -> float: cache_complete_prompt=False, use_cache_breakpoints=True, ) - action = response.action think = response.think last_summary = self.discussion.get_last_summary() @@ -532,7 +538,7 @@ def get_action(self, obs: Any) -> float: general_hints=GeneralHints(use_hints=False), task_hint=TaskHint(use_task_hint=True), keep_last_n_obs=None, # keep only the last observation in the discussion - multiaction=False, # whether to use multi-action or not + multiaction=True, # whether to use multi-action or not # action_subsets=("bid",), action_subsets=("coord"), # action_subsets=("coord", "bid"), From 0b9a9859f321a382739cb93276417a0449bdf6ee Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 20 Jun 2025 16:05:28 -0400 Subject: [PATCH 02/37] Modify API for parallel tool calls add parallel tool support for anthrophic --- .../agents/tool_use_agent/tool_use_agent.py | 47 +++--- src/agentlab/llm/response_api.py | 139 ++++++++++++++---- 2 files changed, 141 insertions(+), 45 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 05e89d4a..474f4851 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -169,11 +169,19 @@ class Obs(Block): def apply( self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput ) -> dict: - - if last_llm_output.tool_calls is None: - obs_msg = llm.msg.user() # type: MessageBuilder - else: - obs_msg = llm.msg.tool(last_llm_output.raw_response) # type: MessageBuilder + # bgym_calls = [call for call in last_llm_output.tool_calls if call.is_bgym_action] + # fn_calls = [call for call in last_llm_output.tool_calls if not 
call.is_bgym_action] + + obs_msg = llm.msg.user() + if tool_calls := last_llm_output.tool_calls: + for action_call in tool_calls.get_bgym_action_calls(): + action_call.add_text("See the observation") + for fn_call in tool_calls.get_non_bgym_action_calls(): + call_results = execute_fn_calls(fn_call.name, fn_call.arguments) + fn_call.add_text(call_results) + + tool_response = llm.msg.add_responded_tool_calls(tool_calls) + discussion.append(tool_response) if self.use_last_error: if obs["last_action_error"] != "": @@ -206,6 +214,9 @@ def apply( return obs_msg +def execute_fn_calls(func_name: str, arguments: dict) -> str: + return "" + def _format_tabs(obs): """Format the open tabs in a llm-readable way.""" prompt_pieces = ["Currently open tabs:"] @@ -320,23 +331,23 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: discussion.append(msg) -class ToolCall(Block): +# class ToolCall(Block): - def __init__(self, tool_server): - self.tool_server = tool_server +# def __init__(self, tool_server): +# self.tool_server = tool_server - def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict: - # build the message by adding components to obs - response: LLMOutput = llm(messages=self.messages) +# def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict: +# # build the message by adding components to obs +# response: LLMOutput = llm(messages=self.messages) - messages.append(response.assistant_message) # this is tool call +# messages.append(response.assistant_message) # this is tool call - tool_answer = self.tool_server.call_tool(response) - tool_msg = llm.msg.tool() # type: MessageBuilder - tool_msg.add_tool_id(response.last_computer_call_id) - tool_msg.update_last_raw_response(response) - tool_msg.add_text(str(tool_answer)) - messages.append(tool_msg) +# tool_answer = self.tool_server.call_tool(response) +# tool_msg = llm.msg.tool() # type: MessageBuilder +# tool_msg.add_tool_id(response.last_computer_call_id) +# tool_msg.update_last_raw_response(response) +# tool_msg.add_text(str(tool_answer)) +# messages.append(tool_msg) @dataclass diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 755886f8..f36e9ba1 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -29,24 +29,97 @@ ContentItem = Dict[str, Any] Message = Dict[str, Union[str, List[ContentItem]]] +BGYM_RESERVED_ACTION_FUNCTION_NAMES = [ + "noop", + "scroll_at", + "mouse_move", + "mouse_up", + "mouse_down", + "mouse_click", + "mouse_dblclick", + "mouse_drag_and_drop", + "mouse_upload_file", + "keyboard_down", + "keyboard_up", + "keyboard_press", + "keyboard_type", + "keyboard_insert_text", + ] + + +@dataclass +class ToolCall: + name: str = field(default=None) + arguments: Dict[str, Any] = field(default_factory=dict) + raw_call: Any = field(default=None) + tool_response: List[ContentItem] = field(default_factory=list) + + @property + def is_bgym_action(self) -> bool: + """Check if the tool call is a reserved BGYM action.""" + return self.name in BGYM_RESERVED_ACTION_FUNCTION_NAMES + + @property + def is_response_set(self) -> bool: + """Check if the tool response is set.""" + return self.tool_response is not None + + def add_text(self, text: str) -> "MessageBuilder": + self.tool_response.append({"text": text}) + return self + + def add_image(self, text: str) -> "MessageBuilder": + self.tool_response.append({"image": text}) + return self + +@dataclass +class ToolCalls: + tool_calls: List[ToolCall] = field(default_factory=list) 
+ raw_calls: List[Any] = field(default_factory=list) + + def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": + self.tool_calls.append(tool_call) + return self + + def get_bgym_action_calls(self) -> List[ToolCall]: + """Get all tool calls that are reserved BGYM actions.""" + return [call for call in self.tool_calls if call.is_bgym_action] + + def get_non_bgym_action_calls(self) -> List[ToolCall]: + """Get all tool calls that are not reserved BGYM actions.""" + return [call for call in self.tool_calls if not call.is_bgym_action] + + @property + def all_responses_set(self) -> bool: + """Check if all tool calls have responses set.""" + return all(call.is_response_set for call in self.tool_calls) + + def __len__(self) -> int: + """Return the number of tool calls.""" + return len(self.tool_calls) + + def __iter__(self): + """Make ToolCalls iterable.""" + return iter(self.tool_calls) + @dataclass class LLMOutput: """Serializable object for the output of a response LLM.""" - raw_response: Any = field(default_factory=dict) + raw_response: Any = field(default=None) think: str = field(default="") action: str = field(default=None) # Default action if no tool call is made - tool_calls: Any = field(default=None) # This will hold the tool call response if any + tool_calls: ToolCalls = field(default=None) # This will hold the tool call response if any class MessageBuilder: def __init__(self, role: str): self.role = role - self.last_raw_response: LLMOutput = None + self.last_raw_response: LLMOutput = None # NOTE: last_raw_response will be deprecated in future version. self.content: List[ContentItem] = [] - self.tool_call_id: Optional[str] = None + self.responsed_tool_calls: ToolCalls = None @classmethod def system(cls) -> "MessageBuilder": @@ -104,6 +177,15 @@ def mark_all_previous_msg_for_caching(self): # This is a placeholder for future implementation. raise NotImplementedError + @classmethod + def add_responded_tool_calls(cls, responsed_tool_calls: ToolCalls) -> "MessageBuilder": + """Add tool calls to the message content.""" + + assert responsed_tool_calls.all_responses_set, "All tool calls must have a response." + msg = cls.tool(last_raw_response=None) + msg.responsed_tool_calls = responsed_tool_calls + return msg + # TODO: Support parallel tool calls. @@ -168,22 +250,15 @@ def prepare_message(self) -> List[Message]: output["role"] = "user" if self.role == "tool": - - api_response = self.last_raw_response - fn_calls = [content for content in api_response.content if content.type == "tool_use"] - assert len(fn_calls) > 0, "No tool calls found in the last response" - if len(fn_calls) > 1: - logging.warning("Using only the first tool call from many.") - tool_call_id = fn_calls[0].id # Using the first tool call ID - + assert self.responsed_tool_calls is not None, "No tool_calls added to tool call response" output["role"] = "user" - output["content"] = [ - { + output["content"] = [{ "type": "tool_result", - "tool_use_id": tool_call_id, - "content": output["content"], - } - ] + "tool_use_id": call.raw_call.id, + "content": [self.transform_content(item) for item in call.tool_response] + } for call in self.responsed_tool_calls + ] + if self.role == "assistant": # Strip whitespace from assistant text responses. See anthropic error code 400. 
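+            # (Anthropic returns a 400 error when the final assistant text block
+            # ends with trailing whitespace, hence the strip below.)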
for c in output["content"]: @@ -347,7 +422,7 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: return response - def _parse_response(self, response: dict) -> dict: + def _parse_response(self, response: dict) -> LLMOutput: result = LLMOutput( raw_response=response, think="", @@ -542,7 +617,13 @@ def _call_api( sys_msg, other_msgs = self.filter_system_messages(messages) sys_msg_text = "\n".join(c["text"] for m in sys_msg for c in m.content) for msg in other_msgs: - temp = msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg] + if isinstance(msg, MessageBuilder): + temp = msg.prepare_message() + elif isinstance(msg, ToolCalls): + temp = [{ + "role": "assistant", + "content": msg.raw_calls.content + }] if kwargs.pop("use_cache_breakpoints", False): temp = self.apply_cache_breakpoints(msg, temp) input.extend(temp) @@ -588,16 +669,16 @@ def filter_system_messages(messages: list[dict | MessageBuilder]) -> tuple[Messa other_msgs.append(msg) return sys_msgs, other_msgs - def _parse_response(self, response: dict) -> dict: + def _parse_response(self, response: dict) -> LLMOutput: result = LLMOutput( raw_response=response, think="", action=None, - tool_calls={ - "role": "assistant", - "content": response.content, - }, - ) + tool_calls=None + ) + tool_calls = ToolCalls(raw_calls=response) # Initialize ToolCalls to hold tool call responses + action_list = [] + # print(f"Response from Claude: {response}") for output in response.content: if output.type == "tool_use": func_args_str = ", ".join( @@ -606,9 +687,13 @@ def _parse_response(self, response: dict) -> dict: for k, v in output.input.items() ] ) - result.action = f"{output.name}({func_args_str})" + action_list.append(f"{output.name}({func_args_str})") + tool_calls.add_tool_call(ToolCall(name=output.name, arguments=output.input, raw_call=output)) elif output.type == "text": result.think += output.text + + result.tool_calls = tool_calls if tool_calls else None + result.action = action_list return result # def ensure_cache_conditions(self, msgs: List[Message]) -> bool: From 89376abdbadabdeae3a568bcbe8cbda29df547f0 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 20 Jun 2025 18:07:45 -0400 Subject: [PATCH 03/37] enable screenshot tagging in xray for mutli-actions --- src/agentlab/agents/agent_utils.py | 96 ++++++++++++++++-------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py index 991e27e6..dda0e2a9 100644 --- a/src/agentlab/agents/agent_utils.py +++ b/src/agentlab/agents/agent_utils.py @@ -10,7 +10,7 @@ """ -def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: +def tag_screenshot_with_action(screenshot: Image, action: str | list[str]) -> Image: """ If action is a coordinate action, try to render it on the screenshot. @@ -26,50 +26,56 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: Raises: ValueError: If the action parsing fails. 
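
    Example (illustrative): tag_screenshot_with_action(img, "mouse_click(120, 340)")
    draws a blue dot at (120, 340); a list of action strings tags each one in turn.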
""" - if action.startswith("mouse_click"): - try: - coords = action[action.index("(") + 1 : action.index(")")].split(",") - coords = [c.strip() for c in coords] - if len(coords) not in [2, 3]: - raise ValueError(f"Invalid coordinate format: {coords}") - if coords[0].startswith("x="): - coords[0] = coords[0][2:] - if coords[1].startswith("y="): - coords[1] = coords[1][2:] - x, y = float(coords[0].strip()), float(coords[1].strip()) - draw = ImageDraw.Draw(screenshot) - radius = 5 - draw.ellipse( - (x - radius, y - radius, x + radius, y + radius), fill="blue", outline="blue" - ) - except (ValueError, IndexError) as e: - warning(f"Failed to parse action '{action}': {e}") - - elif action.startswith("mouse_drag_and_drop"): - try: - func_name, parsed_args = parse_func_call_string(action) - if func_name == "mouse_drag_and_drop" and parsed_args is not None: - args, kwargs = parsed_args - x1, y1, x2, y2 = None, None, None, None - - if args and len(args) >= 4: - # Positional arguments: mouse_drag_and_drop(x1, y1, x2, y2) - x1, y1, x2, y2 = map(float, args[:4]) - elif kwargs: - # Keyword arguments: mouse_drag_and_drop(from_x=x1, from_y=y1, to_x=x2, to_y=y2) - x1 = float(kwargs.get("from_x", 0)) - y1 = float(kwargs.get("from_y", 0)) - x2 = float(kwargs.get("to_x", 0)) - y2 = float(kwargs.get("to_y", 0)) - - if all(coord is not None for coord in [x1, y1, x2, y2]): - draw = ImageDraw.Draw(screenshot) - # Draw the main line - draw.line((x1, y1, x2, y2), fill="red", width=2) - # Draw arrowhead at the end point using the helper function - draw_arrowhead(draw, (x1, y1), (x2, y2)) - except (ValueError, IndexError) as e: - warning(f"Failed to parse action '{action}': {e}") + import copy + actions = copy.deepcopy(action) # Avoid modifying the original action + if action is str: + actions = [actions] + + for action in actions: + if action.startswith("mouse_click"): + try: + coords = action[action.index("(") + 1 : action.index(")")].split(",") + coords = [c.strip() for c in coords] + if len(coords) not in [2, 3]: + raise ValueError(f"Invalid coordinate format: {coords}") + if coords[0].startswith("x="): + coords[0] = coords[0][2:] + if coords[1].startswith("y="): + coords[1] = coords[1][2:] + x, y = float(coords[0].strip()), float(coords[1].strip()) + draw = ImageDraw.Draw(screenshot) + radius = 5 + draw.ellipse( + (x - radius, y - radius, x + radius, y + radius), fill="blue", outline="blue" + ) + except (ValueError, IndexError) as e: + warning(f"Failed to parse action '{action}': {e}") + + elif action.startswith("mouse_drag_and_drop"): + try: + func_name, parsed_args = parse_func_call_string(action) + if func_name == "mouse_drag_and_drop" and parsed_args is not None: + args, kwargs = parsed_args + x1, y1, x2, y2 = None, None, None, None + + if args and len(args) >= 4: + # Positional arguments: mouse_drag_and_drop(x1, y1, x2, y2) + x1, y1, x2, y2 = map(float, args[:4]) + elif kwargs: + # Keyword arguments: mouse_drag_and_drop(from_x=x1, from_y=y1, to_x=x2, to_y=y2) + x1 = float(kwargs.get("from_x", 0)) + y1 = float(kwargs.get("from_y", 0)) + x2 = float(kwargs.get("to_x", 0)) + y2 = float(kwargs.get("to_y", 0)) + + if all(coord is not None for coord in [x1, y1, x2, y2]): + draw = ImageDraw.Draw(screenshot) + # Draw the main line + draw.line((x1, y1, x2, y2), fill="red", width=2) + # Draw arrowhead at the end point using the helper function + draw_arrowhead(draw, (x1, y1), (x2, y2)) + except (ValueError, IndexError) as e: + warning(f"Failed to parse action '{action}': {e}") return screenshot From 
81b1a88aac5560ece1c76f24e5a18912f170ac71 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 20 Jun 2025 18:08:41 -0400 Subject: [PATCH 04/37] add tool call and tool response markdown in xray --- src/agentlab/analyze/agent_xray.py | 4 ++++ src/agentlab/llm/response_api.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index e09b4af8..d02d2d61 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -25,6 +25,7 @@ from agentlab.llm.llm_utils import BaseMessage as AgentLabBaseMessage from agentlab.llm.llm_utils import Discussion from agentlab.llm.response_api import MessageBuilder +from agentlab.llm.response_api import ToolCalls select_dir_instructions = "Select Experiment Directory" AGENT_NAME_KEY = "agent.agent_name" @@ -610,6 +611,9 @@ def dict_to_markdown(d: dict): str: A markdown-formatted string representation of the dictionary. """ if not isinstance(d, dict): + if isinstance(d, ToolCalls): + # ToolCalls rendered by to_markdown method. + return "" warning(f"Expected dict, got {type(d)}") return repr(d) if not d: diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index f36e9ba1..fe48bae3 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -162,6 +162,19 @@ def to_markdown(self) -> str: elif "image" in item: parts.append(f"![Image]({item['image']})") + # Tool call markdown repr + if self.responsed_tool_calls: + for i, tool_call in enumerate(self.responsed_tool_calls.tool_calls, 1): + args = ", ".join(f"{k}={v}" for k, v in tool_call.arguments.items()) + parts.append(f"\n**Tool Call {i}**: {tool_call.name}({args})") + + if tool_call.tool_response: + parts.append(f"\n**Tool Response {i}:**") + for response_item in tool_call.tool_response: + content = (f"```\n{response_item['text']}\n```" if "text" in response_item + else f"![Tool Response Image]({response_item['image']})") + parts.append(content) + markdown = f"### {self.role.capitalize()}\n" markdown += "\n".join(parts) From c244db3451c17b920423388379ecb68d99bc282d Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 23 Jun 2025 12:14:48 -0400 Subject: [PATCH 05/37] update tests for parallel tool calls in claude for responses_api.py --- tests/llm/test_response_api.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py index 16316a92..bfe054a9 100644 --- a/tests/llm/test_response_api.py +++ b/tests/llm/test_response_api.py @@ -290,17 +290,15 @@ def test_claude_response_model_parse_and_cost(): messages = [ AnthropicAPIMessageBuilder.user() .add_text("Search for latest news") - .prepare_message()[0] ] parsed_output = model(messages) mock_create.assert_called_once() - fn_calls = [ - content for content in parsed_output.raw_response.content if content.type == "tool_use" - ] + fn_call = next(iter(parsed_output.tool_calls)) + assert "Thinking about the request." 
in parsed_output.think - assert parsed_output.action == 'search_web(query="latest news")' - assert fn_calls[0].id == "tool_abc" + assert parsed_output.action == ['search_web(query="latest news")'] + assert fn_call.name == "search_web" assert global_tracker.stats["input_tokens"] == 40 assert global_tracker.stats["output_tokens"] == 20 # assert global_tracker.stats["cost"] > 0 # Verify cost is calculated @@ -442,14 +440,13 @@ def test_claude_response_model_pricy_call(): messages = [ AnthropicAPIMessageBuilder.user() .add_text("What is the weather in Paris?") - .prepare_message()[0] ] parsed_output = model(messages) assert parsed_output.raw_response is not None assert ( - parsed_output.action == 'get_weather(location="Paris")' - ), f'Expected get_weather("Paris") but got {parsed_output.action}' + parsed_output.action == ['get_weather(location="Paris")'] + ), f'Expected [get_weather("Paris")] but got {parsed_output.action}' assert global_tracker.stats["input_tokens"] > 0 assert global_tracker.stats["output_tokens"] > 0 assert global_tracker.stats["cost"] > 0 @@ -689,7 +686,10 @@ def test_claude_model_with_multiple_messages_pricy_call(): prev_cost = global_tracker.stats["cost"] messages.append(llm_output1.tool_calls) - messages.append(msg_builder.tool(llm_output1.raw_response).add_text("Its sunny! 25°C")) + for tool_call in llm_output1.tool_calls: + tool_call.add_text("It's sunny! 25°C") + messages.append( + msg_builder.add_responded_tool_calls(llm_output1.tool_calls)) messages.append(msg_builder.user().add_text("What is the weather in Delhi?")) llm_output2 = model(messages) # Token and cost deltas @@ -703,8 +703,8 @@ def test_claude_model_with_multiple_messages_pricy_call(): assert prev_cost > 0, "Expected previous cost value to be greater than 0" assert llm_output2.raw_response is not None assert ( - llm_output2.action == 'get_weather(location="Delhi", unit="celsius")' - ), f'Expected get_weather("Delhi") but got {llm_output2.action}' + llm_output2.action == ['get_weather(location="Delhi", unit="celsius")'] + ), f'Expected [get_weather("Delhi")] but got {llm_output2.action}' assert delta_input > 0, "Expected new input tokens to be greater than 0" assert delta_output > 0, "Expected new output tokens to be greater than 0" assert delta_cost > 0, "Expected new cost value to be greater than 0" From e7613d686286d6cba5a49b26a49852872ead8589 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Wed, 25 Jun 2025 20:51:46 -0400 Subject: [PATCH 06/37] Add support for openai CUA for tool use agent --- .../agents/tool_use_agent/openai_cua.py | 328 ++++++++++++++++++ .../agents/tool_use_agent/tool_use_agent.py | 51 ++- 2 files changed, 365 insertions(+), 14 deletions(-) create mode 100644 src/agentlab/agents/tool_use_agent/openai_cua.py diff --git a/src/agentlab/agents/tool_use_agent/openai_cua.py b/src/agentlab/agents/tool_use_agent/openai_cua.py new file mode 100644 index 00000000..66a7d5b2 --- /dev/null +++ b/src/agentlab/agents/tool_use_agent/openai_cua.py @@ -0,0 +1,328 @@ +import json +from dataclasses import dataclass +from typing import Any, Dict, List + +from agentlab.llm.llm_utils import call_openai_api_with_retries +from agentlab.llm.response_api import ( + ContentItem, + LLMOutput, + Message, + MessageBuilder, + OpenAIResponseAPIMessageBuilder, + OpenAIResponseModel, + OpenAIResponseModelArgs, + ToolCall, + ToolCalls, +) + +from .tool_use_agent import ( + GeneralHints, + Goal, + Obs, + PromptConfig, + Summarizer, + TaskHint, + ToolUseAgentArgs, 
+) + + +class OpenAICUAModel(OpenAIResponseModel): + + def _call_api(self, messages: list[Any | MessageBuilder], tool_choice="auto", **kwargs) -> dict: + input = [] + for msg in messages: + if isinstance(msg, MessageBuilder): + temp = msg.prepare_message() + elif isinstance(msg, ToolCalls): + temp = msg.raw_calls + else: + raise TypeError('Unsupported message type: {}'.format(type(msg))) + input.extend(temp) + + api_params: Dict[str, Any] = { + "model": self.model_name, + "input": input, + "temperature": self.temperature, + "max_output_tokens": self.max_tokens, + "truncation": "auto", # truncation is required for OpenAI CUA + "tool_choice": "auto", # Tool choice can only be auto + **self.extra_kwargs, + } + + if "tools" in api_params: + cua_tool_present = any( + tool.get("type") == "computer_use_preview" for tool in api_params["tools"] + ) + if not cua_tool_present: + api_params["tools"].extend( + [ + { + "type": "computer_use_preview", + "display_width": 1024, + "display_height": 768, + "environment": "browser", # other possible values: "mac", "windows", "ubuntu" + } + ] + ) + + response = call_openai_api_with_retries( + self.client.responses.create, + api_params, + ) + + return response + + def _parse_response(self, response: dict) -> dict: + result = LLMOutput( + raw_response=response, + think="", + action=None, + tool_calls=ToolCalls(), + ) + interesting_keys = ["output_text"] + actions = [] # Collect all actions for multi-action support + + for output in response.output: + if output.type in "computer_call": + # Mapping CUA action space to bgym coord action space. + bgym_fn, bgym_fn_args, action_str = ( + self.cua_action_to_bgym_action(output.action) + ) + tool_call = ToolCall( + name=bgym_fn, + arguments=bgym_fn_args, + raw_call=output, + ) + result.tool_calls.add_tool_call(tool_call) + actions.append(action_str) + + elif output.type == "function_call": + arguments = json.loads(output.arguments) + func_args_str = ", ".join( + [ + f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" + for k, v in arguments.items() + ] + ) + action_str = f"{output.name}({func_args_str})" + tool_call = ToolCall( + name=output.name, + arguments=arguments, + raw_call=output, + ) + result.tool_calls.add_tool_call(tool_call) + if tool_call.is_bgym_action(): + actions.append(action_str) + + elif output.type == "reasoning": + if len(output.summary) > 0: + result.think += output.summary[0].text + "\n" + + elif output.type == "message" and output.content: + result.think += output.content[0].text + "\n" + + result.action = actions + result.tool_calls.raw_calls = response.output + + for key in interesting_keys: + if key_content := getattr(output, "output_text", None) is not None: + result.think += f"<{key}>{key_content}" + return result + + @staticmethod + def cua_action_to_bgym_action(action) -> str: + """ + Given a computer action (e.g., click, double_click, scroll, etc.), + convert it to a text description. + """ + + action_type = action.type + + try: + match action_type: + + case "click": + x, y = action.x, action.y + button = action.button + print(f"Action: click at ({x}, {y}) with button '{button}'") + # Not handling things like middle click, etc. 
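+                    # Anything else (e.g. a middle/wheel click) is coerced to a
+                    # left click before the bgym action string is emitted below.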
+ if button != "left" and button != "right": + button = "left" + action_str = f"mouse_click({x}, {y}, button='{button}')" + ( + bgym_fn, + bgym_fn_args, + ) = "mouse_click", {"x": x, "y": y, "button": button} + + case "scroll": + x, y = action.x, action.y + scroll_x, scroll_y = action.scroll_x, action.scroll_y + action_str = f"scroll_at({x}, {y}, {scroll_x}, {scroll_y})" + bgym_fn, bgym_fn_args = "scroll_at", { + "x": x, + "y": y, + "scroll_x": scroll_x, + "scroll_y": scroll_y, + } + + case "keypress": + keys = action.keys + for k in keys: + print(f"Action: keypress '{k}'") + # A simple mapping for common keys; expand as needed. + if k.lower() == "enter": + action_str = "keyboard_press('Enter')" + elif k.lower() == "space": + action_str = "keyboard_press(' ')" + else: + action_str = f"keyboard_press('{k}')" + + bgym_fn, bgym_fn_args = "keyboard_press", {"key": k} + + case "type": + text = action.text + print(f"Action: type text: {text}") + action_str = f"keyboard_type('{text}')" + bgym_fn, bgym_fn_args = "keyboard_type", {"text": text} + + case "wait": + print("Action: wait") + action_str = "noop()" + bgym_fn, bgym_fn_args = "noop", {} + + case "screenshot": + # Not a valid bgym action + action_str = "noop()" + bgym_fn, bgym_fn_args = "noop", {} + + case "drag": + x1, y1 = action.path[0].x, action.path[0].y + x2, y2 = action.path[1].x, action.path[1].y + print(f"Action: drag from ({x1}, {y1}) to ({x2}, {y2})") + action_str = f"mouse_drag_and_drop({x1}, {y1}, {x2}, {y2})" + bgym_fn, bgym_fn_args = "mouse_drag_and_drop", { + "x1": x1, + "y1": y1, + "x2": x2, + "y2": y2, + } + + case _: + raise ValueError(f"Unrecognized action type: {action_type}") + + # Return the function name and arguments for bgym + + return bgym_fn, bgym_fn_args, action_str + + except Exception as e: + print(f"Error handling action {action}: {e}") + + +class OpenaAICUAMessageBuilder(OpenAIResponseAPIMessageBuilder): + + def prepare_message(self) -> List[Message]: + content = [] + for item in self.content: + content.append(self.convert_content_to_expected_format(item)) + output = [{"role": self.role, "content": content}] + + if self.role != "tool": + return output + else: + return self.handle_tool_call() + + def convert_content_to_expected_format(self, content: ContentItem) -> ContentItem: + """Convert the content item to the expected format for OpenAI Responses.""" + if "text" in content: + content_type = "input_text" if self.role != "assistant" else "output_text" + return {"type": content_type, "text": content["text"]} + elif "image" in content: + return {"type": "input_image", "image_url": content["image"]} + else: + raise ValueError(f"Unsupported content type: {content}") + + def handle_tool_call(self): + """Handle the tool call response from the last raw response.""" + if self.responsed_tool_calls is None: + raise ValueError("No tool calls found in responsed_tool_calls") + + output = [] + for fn_call in self.responsed_tool_calls: + call_type = fn_call.raw_call.type + call_id = fn_call.raw_call.call_id + call_response = fn_call.tool_response # List[ContentItem] + + match call_type: + case "function_call": + # image output is not supported in function calls response. + fn_call_response = { + "type": "function_call_output", + "call_id": call_id, + "output": [ + self.convert_content_to_expected_format(item) for item in call_response + ], + } + output.append(fn_call_response) + + case "computer_call": + # For computer calls, use only images are expected. 
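+                    # (Assumption: the new screenshot is the first ContentItem in
+                    # tool_response; computer_call_output takes a single item.)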
+ computer_call_output = { + "type": "computer_call_output", + "call_id": call_id, + "output": self.convert_content_to_expected_format(call_response[0]), # list needs to be flattened + } + output.append(computer_call_output) # this needs to be a screenshot + + return output + + def mark_all_previous_msg_for_caching(self): + pass + + +@dataclass +class OpenAICUAModelArgs(OpenAIResponseModelArgs): + """Serializable object for instantiating a generic chat model with an OpenAI + model.""" + + api = "openai" + + def make_model(self, extra_kwargs=None, **kwargs): + return OpenAICUAModel( + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_new_tokens, + extra_kwargs=extra_kwargs, + pricing_api="openai", + **kwargs, + ) + + def get_message_builder(self) -> MessageBuilder: + return OpenaAICUAMessageBuilder + + +# Default configuration for Computer Use Agent +DEFAULT_CUA_PROMPT_CONFIG = PromptConfig( + tag_screenshot=True, + goal=Goal(goal_as_system_msg=True), + obs=Obs( + use_last_error=True, + use_screenshot=True, + use_axtree=True, + use_dom=False, + use_som=False, + use_tabs=False, + openai_cua_mode=True, # Enable CUA mode for OpenAI + ), + summarizer=Summarizer(do_summary=True), + general_hints=GeneralHints(use_hints=False), + task_hint=TaskHint(use_task_hint=False), + keep_last_n_obs=1, #NOTE: API error if more than 1 obs is used. There can be only one computer call output in the response. + multiaction=True, # whether to use multi-action or not + # action_subsets=("bid",), + action_subsets=("coord"), +) + +OAI_CUA_TOOL_AGENT = ToolUseAgentArgs( + model_args=OpenAICUAModelArgs(model_name="computer-use-preview"), + config=DEFAULT_CUA_PROMPT_CONFIG, +) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 474f4851..098e86ee 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -22,6 +22,7 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.llm.llm_utils import image_to_png_base64_url from agentlab.llm.response_api import ( + ToolCalls, ClaudeResponseModelArgs, LLMOutput, MessageBuilder, @@ -98,7 +99,8 @@ def flatten(self) -> list[MessageBuilder]: messages.extend(group.messages) # Mark all summarized messages for caching if i == len(self.groups) - keep_last_n_obs: - messages[i].mark_all_previous_msg_for_caching() + if not isinstance(messages[i], ToolCalls): + messages[i].mark_all_previous_msg_for_caching() return messages def set_last_summary(self, summary: MessageBuilder): @@ -114,6 +116,18 @@ def get_last_summary(self) -> MessageBuilder | None: def is_goal_set(self) -> bool: """Check if the goal is set in the first group.""" return len(self.groups) > 0 + + def contains_image(self) -> bool: + """Check if an image is set in any group""" + for grp in self.groups: + for msg in grp.messages: + for item in msg.content: + if 'image' in item: + return True + return False + + + SYS_MSG = """You are a web agent. Based on the observation, you will decide which action to take to accomplish your goal. 
@@ -165,24 +179,14 @@ class Obs(Block): use_tabs: bool = False add_mouse_pointer: bool = False use_zoomed_webpage: bool = False + openai_cua_mode: bool = False # screenshot can only be added as tool response, given an initial screenshot obs def apply( self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput ) -> dict: - # bgym_calls = [call for call in last_llm_output.tool_calls if call.is_bgym_action] - # fn_calls = [call for call in last_llm_output.tool_calls if not call.is_bgym_action] obs_msg = llm.msg.user() - if tool_calls := last_llm_output.tool_calls: - for action_call in tool_calls.get_bgym_action_calls(): - action_call.add_text("See the observation") - for fn_call in tool_calls.get_non_bgym_action_calls(): - call_results = execute_fn_calls(fn_call.name, fn_call.arguments) - fn_call.add_text(call_results) - - tool_response = llm.msg.add_responded_tool_calls(tool_calls) - discussion.append(tool_response) - + tool_calls = last_llm_output.tool_calls if self.use_last_error: if obs["last_action_error"] != "": obs_msg.add_text(f"Last action error:\n{obs['last_action_error']}") @@ -201,8 +205,16 @@ def apply( Image.fromarray(obs["screenshot"]), obs["last_action"] ) ) + + if self.openai_cua_mode and discussion.contains_image(): + if tool_calls and tool_calls.get_bgym_action_calls(): + computer_call = tool_calls.get_bgym_action_calls()[0] + computer_call.add_image( + image_to_png_base64_url(screenshot) + ) + else: + obs_msg.add_image(image_to_png_base64_url(screenshot)) - obs_msg.add_image(image_to_png_base64_url(screenshot)) if self.use_axtree: obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}") if self.use_dom: @@ -211,6 +223,17 @@ def apply( obs_msg.add_text(_format_tabs(obs)) discussion.append(obs_msg) + + if tool_calls: + for action_call in tool_calls.get_bgym_action_calls(): + if not self.openai_cua_mode: + action_call.add_text("See the observation") + for fn_call in tool_calls.get_non_bgym_action_calls(): + call_results = execute_fn_calls(fn_call.name, fn_call.arguments) + fn_call.add_text(call_results) + tool_response = llm.msg.add_responded_tool_calls(tool_calls) + discussion.append(tool_response) + return obs_msg From b4c283f793a6447f414506ba0e4023607b306969 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Wed, 25 Jun 2025 20:53:16 -0400 Subject: [PATCH 07/37] update openai cache tracking usage to support Chat Completion and Responses API --- src/agentlab/llm/tracking.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index ad846a71..ab033b22 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -163,6 +163,10 @@ def __call__(self, *args, **kwargs): response = self._call_api(*args, **kwargs) usage = dict(getattr(response, "usage", {})) + if 'prompt_tokens_details' in usage: + usage['cached_tokens'] = usage['prompt_token_details'].cached_tokens + if 'input_tokens_details' in usage: + usage['cached_tokens'] = usage['input_tokens_details'].cached_tokens usage = {f"usage_{k}": v for k, v in usage.items() if isinstance(v, (int, float))} usage |= {"n_api_calls": 1} usage |= {"effective_cost": self.get_effective_cost(response)} @@ -298,21 +302,29 @@ def get_effective_cost_from_openai_api(self, response) -> float: Returns: float: The effective cost calculated from the response. 
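
        Illustrative example (hypothetical prices): with input_cost=2e-6,
        output_cost=8e-6, 1000 input tokens of which 400 were cache reads,
        and 200 output tokens, with cache_read_factor =
        OPENAI_CACHE_PRICING_FACTOR["cache_read_tokens"]:
            cost = 2e-6 * (1000 - 400) + 400 * (2e-6 * cache_read_factor) + 8e-6 * 200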
""" - usage = getattr(response, "usage", {}) - prompt_token_details = getattr(response, "prompt_tokens_details", {}) - - total_input_tokens = getattr( - prompt_token_details, "prompt_tokens", 0 - ) # Cache read tokens + new input tokens - output_tokens = getattr(usage, "completion_tokens", 0) - cache_read_tokens = getattr(prompt_token_details, "cached_tokens", 0) - - non_cached_input_tokens = total_input_tokens - cache_read_tokens + usage = getattr(response, "usage", None) + if usage is None: + logging.warning("No usage information found in the response. Defaulting cost to 0.0.") + return 0.0 + api_type = 'chatcompletion' if hasattr(usage, "prompt_tokens_details") else 'response' + if api_type == 'chatcompletion': + total_input_tokens = usage.prompt_tokens + output_tokens = usage.completion_tokens + cached_input_tokens = usage.prompt_tokens_details.cached_tokens + non_cached_input_tokens = total_input_tokens - cached_input_tokens + elif api_type == 'response': + total_input_tokens = usage.input_tokens + output_tokens = usage.output_tokens + cached_input_tokens = usage.input_tokens_details.cached_tokens + non_cached_input_tokens = total_input_tokens - cached_input_tokens + else: + logging.warning(f"Unsupported API type: {api_type}. Defaulting cost to 0.0.") + return 0.0 + cache_read_cost = self.input_cost * OPENAI_CACHE_PRICING_FACTOR["cache_read_tokens"] - effective_cost = ( self.input_cost * non_cached_input_tokens - + cache_read_tokens * cache_read_cost + + cached_input_tokens * cache_read_cost + self.output_cost * output_tokens ) return effective_cost From 00da78af1564bb8981e9531ba12b5f632f9759d8 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 27 Jun 2025 17:59:41 -0400 Subject: [PATCH 08/37] fix: join action_list into a single string with new-lines --- src/agentlab/llm/response_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 158b2548..c3264f8f 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -709,7 +709,7 @@ def _parse_response(self, response: dict) -> LLMOutput: result.think += output.text result.tool_calls = tool_calls if tool_calls else None - result.action = action_list + result.action = "\n".join(action_list) return result # def ensure_cache_conditions(self, msgs: List[Message]) -> bool: From 1d7060c1635b4e4efac4ad81d7194ffb9171c33b Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 27 Jun 2025 21:25:32 -0400 Subject: [PATCH 09/37] Refactor OpenAiResponsesModel to isolate env specific functionality and added computer_call handling. 
--- src/agentlab/llm/response_api.py | 203 ++++++++++++++++++++++--------- 1 file changed, 143 insertions(+), 60 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index c3264f8f..c7d1858c 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -55,7 +55,7 @@ class ToolCall: tool_response: List[ContentItem] = field(default_factory=list) @property - def is_bgym_action(self) -> bool: + def is_env_action(self) -> bool: """Check if the tool call is a reserved BGYM action.""" return self.name in BGYM_RESERVED_ACTION_FUNCTION_NAMES @@ -71,6 +71,10 @@ def add_text(self, text: str) -> "MessageBuilder": def add_image(self, text: str) -> "MessageBuilder": self.tool_response.append({"image": text}) return self + + def __repr__(self): + return f"ToolCall(name={self.name}, arguments={self.arguments})" + @dataclass class ToolCalls: @@ -83,11 +87,11 @@ def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": def get_bgym_action_calls(self) -> List[ToolCall]: """Get all tool calls that are reserved BGYM actions.""" - return [call for call in self.tool_calls if call.is_bgym_action] + return [call for call in self.tool_calls if call.is_env_action] def get_non_bgym_action_calls(self) -> List[ToolCall]: """Get all tool calls that are not reserved BGYM actions.""" - return [call for call in self.tool_calls if not call.is_bgym_action] + return [call for call in self.tool_calls if not call.is_env_action] @property def all_responses_set(self) -> bool: @@ -101,6 +105,10 @@ def __len__(self) -> int: def __iter__(self): """Make ToolCalls iterable.""" return iter(self.tool_calls) + + def __bool__(self): + """Check if there are any tool calls.""" + return len(self.tool_calls) > 0 @dataclass @@ -212,44 +220,56 @@ def system(cls) -> "OpenAIResponseAPIMessageBuilder": def prepare_message(self) -> List[Message]: content = [] for item in self.content: - if "text" in item: - content_type = "input_text" if self.role != "assistant" else "output_text" - content.append({"type": content_type, "text": item["text"]}) + content.append(self.convert_content_to_expected_format(item)) + output = [{"role": self.role, "content": content}] - elif "image" in item: - content.append({"type": "input_image", "image_url": item["image"]}) + return output if self.role != "tool" else self.handle_tool_call() - output = [{"role": self.role, "content": content}] - if self.role != "tool": - return output + def convert_content_to_expected_format(self, content: ContentItem) -> ContentItem: + """Convert the content item to the expected format for OpenAI Responses.""" + if "text" in content: + content_type = "input_text" if self.role != "assistant" else "output_text" + return {"type": content_type, "text": content["text"]} + elif "image" in content: + return {"type": "input_image", "image_url": content["image"]} else: - tool_call_response = self.handle_tool_call(content) - return tool_call_response + raise ValueError(f"Unsupported content type: {content}") - def handle_tool_call(self, content): + def handle_tool_call(self): """Handle the tool call response from the last raw response.""" + if self.responsed_tool_calls is None: + raise ValueError("No tool calls found in responsed_tool_calls") + output = [] - head_content, *tail_content = content - api_response = self.last_raw_response - fn_calls = [content for content in api_response.output if content.type == "function_call"] - assert len(fn_calls) > 0, "No function calls found in the last response" - if len(fn_calls) > 1: - 
logging.warning("Using only the first tool call from many.") + for fn_call in self.responsed_tool_calls: + call_type = fn_call.raw_call.type + call_id = fn_call.raw_call.call_id + call_response = fn_call.tool_response # List[ContentItem] + + match call_type: + case "function_call": + # image output is not supported in function calls response. + fn_call_response = { + "type": "function_call_output", + "call_id": call_id, + "output": [ + self.convert_content_to_expected_format(item) for item in call_response + ], + } + output.append(fn_call_response) + + case "computer_call": + # For computer calls, use only images are expected. + computer_call_output = { + "type": "computer_call_output", + "call_id": call_id, + "output": self.convert_content_to_expected_format(call_response[0]), # list needs to be flattened + } + output.append(computer_call_output) # this needs to be a screenshot - first_fn_call_id = fn_calls[0].call_id - fn_output = head_content.get("text", "Function call answer in next message") - fn_call_response = { - "type": "function_call_output", - "call_id": first_fn_call_id, - "output": fn_output, - } - output.append(fn_call_response) - if tail_content: - # if there are more content items, add them as a new user message - output.append({"role": "user", "content": tail_content}) return output - def mark_all_previous_msg_for_caching(self) -> List[Message]: + def mark_all_previous_msg_for_caching(self): pass @@ -402,6 +422,8 @@ def __init__( ): self.tools = kwargs.pop("tools", None) self.tool_choice = kwargs.pop("tool_choice", None) + self.action_space_as_tools = True # this should be a config + self.multiaction_in_a_step = True # this should be a config super().__init__( model_name=model_name, api_key=api_key, @@ -413,9 +435,7 @@ def __init__( self.client = OpenAI(api_key=api_key) def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: - input = [] - for msg in messages: - input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) + input = self.convert_messages_to_api_format(messages) api_params: Dict[str, Any] = { "model": self.model_name, @@ -438,36 +458,100 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: return response - def _parse_response(self, response: dict) -> LLMOutput: - result = LLMOutput( + def convert_messages_to_api_format(self, messages: List[MessageBuilder| ToolCalls]) -> List[Message]: + """Convert messages to the format expected by the OpenAI Responses API.""" + input = [] + for msg in messages: + if isinstance(msg, MessageBuilder): + temp = msg.prepare_message() + elif isinstance(msg, ToolCalls): + temp = msg.raw_calls + else: + raise TypeError('Unsupported message type: {}'.format(type(msg))) + input.extend(temp) + return input + + def _parse_response(self, response: "OpenAIResponseObject") -> LLMOutput: + """Parse the raw response from the OpenAI Responses API.""" + think_output = self._extract_thinking_content_from_response(response) + toolcalls = self._extract_tool_calls_from_response(response) + if self.action_space_as_tools: + env_action = self._extract_env_actions_from_toolcalls(toolcalls) + else: + env_action = self._extract_env_actions_from_text_response(response) + return LLMOutput( raw_response=response, - think="", - action=None, - tool_calls=None, + think=think_output, + action=env_action if env_action is not None else "", + tool_calls=toolcalls if toolcalls is not None else None, ) - interesting_keys = ["output_text"] + + def _extract_tool_calls_from_response(self, response: 
"OpenAIResponseObject") -> ToolCalls: + """Extracts tool calls from the response.""" + tool_calls = ToolCalls(raw_calls=response.output) for output in response.output: if output.type == "function_call": - arguments = json.loads(output.arguments) - func_args_str = ", ".join( - [ - f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" - for k, v in arguments.items() - ] - ) - result.action = f"{output.name}({func_args_str})" - result.tool_calls = output - break - elif output.type == "reasoning": - if len(output.summary) > 0: - result.think += output.summary[0].text + "\n" + tool_name = output.name + tool_args = json.loads(output.arguments) + elif output.type == "computer_call": + tool_name, tool_args = self.cua_action_to_env_tool_name_and_args(output.action) + else: + continue + tool_call = ToolCall( + name=tool_name, + arguments=tool_args, + raw_call=output, + ) + tool_calls.add_tool_call(tool_call) + return tool_calls + + def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | None: + """Extracts actions from the response.""" + actions = [] + for call in toolcalls: + if call.is_env_action: + action_str = self.convert_toolcall_to_env_action_format(call) + actions.append(action_str) + if self.multiaction_in_a_step: # This should be a config + return self.convert_multiactions_to_env_action_format(actions) + else: + return actions[0] if actions else None + def _extract_thinking_content_from_response(self, response: "OpenAIResponseObject") -> str: + """Extracts the thinking content from the response.""" + thinking_content = "" + for output in response.output: + if output.type == "reasoning": + if len(output.summary) > 0: + thinking_content += output.summary[0].text + "\n" elif output.type == "message" and output.content: - result.think += output.content[0].text + "\n" - for key in interesting_keys: - if key_content := getattr(output, "output_text", None) is not None: - result.think += f"<{key}>{key_content}" - return result + thinking_content += output.content[0].text + "\n" + elif hasattr(output, "output_text") and output.output_text: + thinking_content += f"{output.output_text}\n" + return thinking_content + + # Environment Specific functions, in this case BGYM + + def convert_toolcall_to_env_action_format(self, toolcall: ToolCall) -> str: + """Convert a tool call to an BGYM environment action string.""" + action_name, tool_args = toolcall.name, toolcall.arguments + action_args = ", ".join( + f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" for k, v in tool_args.items() + ) + action_str = f"{action_name}({action_args})" + return action_str + + def convert_multiactions_to_env_action_format(self, actions:list[Any] ) -> Any: + """Convert multiple actions list to a format that env supports""" + return "\n".join(actions) if actions else None + + def cua_action_to_env_tool_name_and_args(self, action: str) -> tuple[str, Dict[str, Any]]: + pass + + def _extract_env_actions_from_text_response(self, response: "OpenAIResponseObject") -> str | None: + """Extracts environment actions from the text response.""" + # Use when action space is not given as tools. 
+ pass class OpenAIChatCompletionModel(BaseModelWithPricing): @@ -600,7 +684,6 @@ def extract_content_with_reasoning(message, wrap_tag="think"): reasoning_content = "" return f"{reasoning_content}{msg_content}{message.get('content', '')}" - class ClaudeResponseModel(BaseModelWithPricing): def __init__( self, From 997cc7b10a920a9d8326f23a33bca387965b45f3 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:39:07 -0400 Subject: [PATCH 10/37] Change func names to use 'env' instead of 'bgym' --- src/agentlab/llm/response_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index c7d1858c..4a6ded78 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -85,11 +85,11 @@ def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": self.tool_calls.append(tool_call) return self - def get_bgym_action_calls(self) -> List[ToolCall]: + def get_env_action_calls(self) -> List[ToolCall]: """Get all tool calls that are reserved BGYM actions.""" return [call for call in self.tool_calls if call.is_env_action] - def get_non_bgym_action_calls(self) -> List[ToolCall]: + def get_non_env_action_calls(self) -> List[ToolCall]: """Get all tool calls that are not reserved BGYM actions.""" return [call for call in self.tool_calls if not call.is_env_action] From b3d409fa26ddf2463bd6b715e86786f7fbf16a6d Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 30 Jun 2025 13:33:50 -0400 Subject: [PATCH 11/37] Add TODO's and WIP config classes --- src/agentlab/llm/response_api.py | 100 +++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 4 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 4a6ded78..af256152 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -49,6 +49,7 @@ @dataclass class ToolCall: + #TODO: Check if this is a suitable tool representation for being MCP compliant. name: str = field(default=None) arguments: Dict[str, Any] = field(default_factory=dict) raw_call: Any = field(default=None) @@ -57,6 +58,8 @@ class ToolCall: @property def is_env_action(self) -> bool: """Check if the tool call is a reserved BGYM action.""" + # TODO: env should return some func to check if agent action is env action. + # Keep in mind env may or may not have a fixed set of reserved actions. return self.name in BGYM_RESERVED_ACTION_FUNCTION_NAMES @property @@ -86,11 +89,11 @@ def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": return self def get_env_action_calls(self) -> List[ToolCall]: - """Get all tool calls that are reserved BGYM actions.""" + """Get all tool calls that are reserved Environment actions.""" return [call for call in self.tool_calls if call.is_env_action] def get_non_env_action_calls(self) -> List[ToolCall]: - """Get all tool calls that are not reserved BGYM actions.""" + """Get all tool calls that are not reserved Environment actions.""" return [call for call in self.tool_calls if not call.is_env_action] @property @@ -125,7 +128,7 @@ class MessageBuilder: def __init__(self, role: str): self.role = role - self.last_raw_response: LLMOutput = None # NOTE: last_raw_response will be deprecated in future version. + self.last_raw_response: LLMOutput = None # NOTE: last_raw_response will be deprecated in future version. 
We can use the ToolCalls object to get all the relevant information.
         self.content: List[ContentItem] = []
         self.responsed_tool_calls: ToolCalls = None
 
@@ -410,6 +413,85 @@ class BaseModelWithPricing(TrackAPIPricingMixin, BaseResponseModel):
     pass
 
 
+
+# TODO: Define and use a flexible set of configuration classes.
+# The configs below are not used yet and are WIP.
+# _______________________________________________________________
+
+# Some high-level requirements:
+
+# An env can have multiple action sets. Each action set should be available both as tools and as a prompt description.
+# The env should provide conversion functions to parse tool calls or text back into env actions.
+
+# Backend LLMs or large action models can have their own action sets (Ui-Tars, CUA), which can be fixed or flexible.
+# EnvConfig, LLMConfig, or ActionConfig should provide conversion from a backend LLM action to an env_action.
+
+# AgentLab agents may emit multiple actions. EnvConfig should state whether the env supports multiple actions in a single step.
+# If the env controller does not natively support multi-actions, we can integrate env-side logic that adds this support.
+
+# The env should broadcast which observations are supported (e.g., axtree), and the agent loop should be able to handle them.
+
+@dataclass
+class ActionConfig:
+    action_set: "AbstractActionSet"  # TODO: AgentLab AbstractActionSet; has constructor methods to expose actions as tools or as descriptions with examples.
+    multiaction: bool = True
+    env_action_as_tools: bool = True  # If True, the action set is exposed as tools
+    tools: Optional[List[Dict[str, Any]]] = None  # List of tool definitions or list of functions
+    tool_text_descriptions: str = ""  # Description of the tools, emitted by the environment.
+    tool_calls_to_env_action_parser: Optional[callable] = None  # Callable given by the environment to convert tool calls to env actions.
+    text_to_env_action_parser: Optional[callable] = None
+
+
+@dataclass
+class ObsConfig:
+    # Check the generic agent for the observation flags to include here.
+    pass
+
+
+@dataclass
+class Config:
+    model_args: BaseModelArgs
+    obs: ObsConfig
+    action: ActionConfig
+    generation_config: "GenerationConfig"
+
+
+@dataclass
+class PromptConfig:
+    # use_hints
+    # use_summarizer
+    pass
+
+
+@dataclass
+class ProviderConfig:
+    """Configuration for the LLM provider."""
+
+    api_key_env_var: Optional[str] = None
+    base_url: Optional[str] = None  # Base URL for the API, if different
+    # Anything else? vLLM-specific configurations, etc.
+
+
+@dataclass
+class LLMConfig:
+    # Backend-LLM-supported action set
+    # Any other LLM-specific configurations
+    # Tool-calling format?
+    # Maybe include provider-specific configurations here?
+
+    pass
+
+
+@dataclass
+class GenerationConfig:
+    temperature: float = 0.5
+    max_new_tokens: int = 100
+    # Might be useful for exploration to have the ability to modify these inside the agent loop.
+
+
+@dataclass
+class APIPayload:
+    messages: List[MessageBuilder | ToolCalls]
+    api_endpoint: str
+    api_key_env_var: Optional[str] = None
+    base_url: Optional[str] = None
+    tools: Optional[List[Dict[str, Any]]] = None  # Taken from ActionConfig
+    tool_choice: Optional[str] = None  # Fix one literal value for tool choice (e.g., "auto") and convert per API; OpenAI and Anthropic tool-choice parameters behave differently.
+    generation_config: GenerationConfig = field(default_factory=GenerationConfig)  # default_factory avoids a mutable default
+    caching: bool = False  # If True, cache the response
+    # The agent loop will form the payload based on the config and pass it to the API call. 
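+    #
+    # Sketch of that assembly (assuming the WIP config objects above):
+    #     payload = APIPayload(
+    #         messages=discussion.flatten(),
+    #         api_endpoint="responses",
+    #         tools=action_config.tools,
+    #         tool_choice="auto",
+    #         generation_config=GenerationConfig(temperature=0.2, max_new_tokens=512),
+    #     )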
+ +# _______________________________________________________________ + class OpenAIResponseModel(BaseModelWithPricing): def __init__( self, @@ -437,6 +519,7 @@ def __init__( def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: input = self.convert_messages_to_api_format(messages) + #TODO: API/Payload Params should be a config dataclass. Update once settled on a config structure. api_params: Dict[str, Any] = { "model": self.model_name, "input": input, @@ -486,8 +569,10 @@ def _parse_response(self, response: "OpenAIResponseObject") -> LLMOutput: tool_calls=toolcalls if toolcalls is not None else None, ) + def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> ToolCalls: """Extracts tool calls from the response.""" + #TODO: Should this be in the BaseResponseModelclass? tool_calls = ToolCalls(raw_calls=response.output) for output in response.output: if output.type == "function_call": @@ -507,6 +592,7 @@ def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | None: """Extracts actions from the response.""" + #TODO: Should this be in the BaseResponseModelclass? or Emitted by Environment? actions = [] for call in toolcalls: if call.is_env_action: @@ -530,8 +616,9 @@ def _extract_thinking_content_from_response(self, response: "OpenAIResponseObjec thinking_content += f"{output.output_text}\n" return thinking_content - # Environment Specific functions, in this case BGYM + ### Environment Specific functions, in this case BGYM ### + #TODO: Should the below functions be in the BaseResponseModelclass? or Emitted by the Environment and intialized using a config? def convert_toolcall_to_env_action_format(self, toolcall: ToolCall) -> str: """Convert a tool call to an BGYM environment action string.""" action_name, tool_args = toolcall.name, toolcall.arguments @@ -554,6 +641,7 @@ def _extract_env_actions_from_text_response(self, response: "OpenAIResponseObjec pass +# TODO: Refactor similar to OpenAIResponseModel class OpenAIChatCompletionModel(BaseModelWithPricing): def __init__( self, @@ -684,6 +772,8 @@ def extract_content_with_reasoning(message, wrap_tag="think"): reasoning_content = "" return f"{reasoning_content}{msg_content}{message.get('content', '')}" + +# TODO: Refactor similar to OpenAIResponseModel class ClaudeResponseModel(BaseModelWithPricing): def __init__( self, @@ -807,6 +897,8 @@ def apply_cache_breakpoints(self, msg: Message, prepared_msg: dict) -> List[Mess # Factory classes to create the appropriate model based on the API endpoint. + +# TODO: Do we really need these factory classes? how about implementing a _from_args() method in the BaseModelArgs class? 
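One way to resolve this TODO, assuming every model class keeps the (model_name, temperature, max_tokens) constructor shape, is to replace the per-endpoint factory dataclasses below with a registry plus a single _from_args() on BaseModelArgs. The registry and method names here are hypothetical, not existing API:

    _MODEL_REGISTRY = {
        "openai_response": OpenAIResponseModel,
        "openai_chat": OpenAIChatCompletionModel,
        "anthropic": ClaudeResponseModel,
    }

    class BaseModelArgs:
        def _from_args(self, endpoint: str, **extra):
            model_cls = _MODEL_REGISTRY[endpoint]  # KeyError means unsupported endpoint
            return model_cls(
                model_name=self.model_name,
                temperature=self.temperature,
                max_tokens=self.max_new_tokens,
                **extra,
            )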
@dataclass class OpenAIResponseModelArgs(BaseModelArgs): """Serializable object for instantiating a generic chat model with an OpenAI From f5443b6665683dba4faa5e3d8bdc4614c22860bd Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 30 Jun 2025 13:57:08 -0400 Subject: [PATCH 12/37] Refactor OpenAICUAModel to streamline action handling and improve code organization --- .../agents/tool_use_agent/openai_cua.py | 271 ++++-------------- 1 file changed, 60 insertions(+), 211 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/openai_cua.py b/src/agentlab/agents/tool_use_agent/openai_cua.py index 66a7d5b2..d7f1f97f 100644 --- a/src/agentlab/agents/tool_use_agent/openai_cua.py +++ b/src/agentlab/agents/tool_use_agent/openai_cua.py @@ -4,14 +4,10 @@ from agentlab.llm.llm_utils import call_openai_api_with_retries from agentlab.llm.response_api import ( - ContentItem, - LLMOutput, - Message, MessageBuilder, OpenAIResponseAPIMessageBuilder, OpenAIResponseModel, OpenAIResponseModelArgs, - ToolCall, ToolCalls, ) @@ -28,16 +24,8 @@ class OpenAICUAModel(OpenAIResponseModel): - def _call_api(self, messages: list[Any | MessageBuilder], tool_choice="auto", **kwargs) -> dict: - input = [] - for msg in messages: - if isinstance(msg, MessageBuilder): - temp = msg.prepare_message() - elif isinstance(msg, ToolCalls): - temp = msg.raw_calls - else: - raise TypeError('Unsupported message type: {}'.format(type(msg))) - input.extend(temp) + def _call_api(self, messages: list[ToolCalls | MessageBuilder], tool_choice="auto", **kwargs) -> dict: + input = self.convert_messages_to_api_format(messages) api_params: Dict[str, Any] = { "model": self.model_name, @@ -53,14 +41,15 @@ def _call_api(self, messages: list[Any | MessageBuilder], tool_choice="auto", ** cua_tool_present = any( tool.get("type") == "computer_use_preview" for tool in api_params["tools"] ) + # CUA requires this tool if not cua_tool_present: api_params["tools"].extend( [ { "type": "computer_use_preview", - "display_width": 1024, + "display_width": 1024, "display_height": 768, - "environment": "browser", # other possible values: "mac", "windows", "ubuntu" + "environment": "browser", # TODO: Parametrize this } ] ) @@ -72,212 +61,72 @@ def _call_api(self, messages: list[Any | MessageBuilder], tool_choice="auto", ** return response - def _parse_response(self, response: dict) -> dict: - result = LLMOutput( - raw_response=response, - think="", - action=None, - tool_calls=ToolCalls(), - ) - interesting_keys = ["output_text"] - actions = [] # Collect all actions for multi-action support - - for output in response.output: - if output.type in "computer_call": - # Mapping CUA action space to bgym coord action space. 
- bgym_fn, bgym_fn_args, action_str = ( - self.cua_action_to_bgym_action(output.action) - ) - tool_call = ToolCall( - name=bgym_fn, - arguments=bgym_fn_args, - raw_call=output, - ) - result.tool_calls.add_tool_call(tool_call) - actions.append(action_str) - - elif output.type == "function_call": - arguments = json.loads(output.arguments) - func_args_str = ", ".join( - [ - f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" - for k, v in arguments.items() - ] - ) - action_str = f"{output.name}({func_args_str})" - tool_call = ToolCall( - name=output.name, - arguments=arguments, - raw_call=output, - ) - result.tool_calls.add_tool_call(tool_call) - if tool_call.is_bgym_action(): - actions.append(action_str) - - elif output.type == "reasoning": - if len(output.summary) > 0: - result.think += output.summary[0].text + "\n" - - elif output.type == "message" and output.content: - result.think += output.content[0].text + "\n" - - result.action = actions - result.tool_calls.raw_calls = response.output - - for key in interesting_keys: - if key_content := getattr(output, "output_text", None) is not None: - result.think += f"<{key}>{key_content}" - return result - - @staticmethod - def cua_action_to_bgym_action(action) -> str: + def cua_action_to_env_tool_name_and_args(self, action) -> str: """ Given a computer action (e.g., click, double_click, scroll, etc.), convert it to a text description. """ + #TODO: #Provide an alternate implementation for OS-World. action_type = action.type try: - match action_type: - - case "click": - x, y = action.x, action.y - button = action.button - print(f"Action: click at ({x}, {y}) with button '{button}'") - # Not handling things like middle click, etc. - if button != "left" and button != "right": - button = "left" - action_str = f"mouse_click({x}, {y}, button='{button}')" - ( - bgym_fn, - bgym_fn_args, - ) = "mouse_click", {"x": x, "y": y, "button": button} - - case "scroll": - x, y = action.x, action.y - scroll_x, scroll_y = action.scroll_x, action.scroll_y - action_str = f"scroll_at({x}, {y}, {scroll_x}, {scroll_y})" - bgym_fn, bgym_fn_args = "scroll_at", { - "x": x, - "y": y, - "scroll_x": scroll_x, - "scroll_y": scroll_y, - } - - case "keypress": - keys = action.keys - for k in keys: - print(f"Action: keypress '{k}'") - # A simple mapping for common keys; expand as needed. 
- if k.lower() == "enter": - action_str = "keyboard_press('Enter')" - elif k.lower() == "space": - action_str = "keyboard_press(' ')" - else: - action_str = f"keyboard_press('{k}')" - - bgym_fn, bgym_fn_args = "keyboard_press", {"key": k} - - case "type": - text = action.text - print(f"Action: type text: {text}") - action_str = f"keyboard_type('{text}')" - bgym_fn, bgym_fn_args = "keyboard_type", {"text": text} - - case "wait": - print("Action: wait") - action_str = "noop()" - bgym_fn, bgym_fn_args = "noop", {} - - case "screenshot": - # Not a valid bgym action - action_str = "noop()" - bgym_fn, bgym_fn_args = "noop", {} - - case "drag": - x1, y1 = action.path[0].x, action.path[0].y - x2, y2 = action.path[1].x, action.path[1].y - print(f"Action: drag from ({x1}, {y1}) to ({x2}, {y2})") - action_str = f"mouse_drag_and_drop({x1}, {y1}, {x2}, {y2})" - bgym_fn, bgym_fn_args = "mouse_drag_and_drop", { - "x1": x1, - "y1": y1, - "x2": x2, - "y2": y2, - } - - case _: - raise ValueError(f"Unrecognized action type: {action_type}") - - # Return the function name and arguments for bgym - - return bgym_fn, bgym_fn_args, action_str + action_mapping = { + "click": lambda: self._handle_click_action(action), + "scroll": lambda: self._handle_scroll_action(action), + "keypress": lambda: self._handle_keypress_action(action), + "type": lambda: self._handle_type_action(action), + "wait": lambda: self._handle_wait_action(action), + "screenshot": lambda: self._handle_screenshot_action(action), + "drag": lambda: self._handle_drag_action(action), + } + + if action_type in action_mapping: + return action_mapping[action_type]() + else: + raise ValueError(f"Unrecognized openAI CUA action type: {action_type}") except Exception as e: print(f"Error handling action {action}: {e}") - -class OpenaAICUAMessageBuilder(OpenAIResponseAPIMessageBuilder): - - def prepare_message(self) -> List[Message]: - content = [] - for item in self.content: - content.append(self.convert_content_to_expected_format(item)) - output = [{"role": self.role, "content": content}] - - if self.role != "tool": - return output - else: - return self.handle_tool_call() - - def convert_content_to_expected_format(self, content: ContentItem) -> ContentItem: - """Convert the content item to the expected format for OpenAI Responses.""" - if "text" in content: - content_type = "input_text" if self.role != "assistant" else "output_text" - return {"type": content_type, "text": content["text"]} - elif "image" in content: - return {"type": "input_image", "image_url": content["image"]} - else: - raise ValueError(f"Unsupported content type: {content}") - - def handle_tool_call(self): - """Handle the tool call response from the last raw response.""" - if self.responsed_tool_calls is None: - raise ValueError("No tool calls found in responsed_tool_calls") - - output = [] - for fn_call in self.responsed_tool_calls: - call_type = fn_call.raw_call.type - call_id = fn_call.raw_call.call_id - call_response = fn_call.tool_response # List[ContentItem] - - match call_type: - case "function_call": - # image output is not supported in function calls response. - fn_call_response = { - "type": "function_call_output", - "call_id": call_id, - "output": [ - self.convert_content_to_expected_format(item) for item in call_response - ], - } - output.append(fn_call_response) - - case "computer_call": - # For computer calls, use only images are expected. 
- computer_call_output = { - "type": "computer_call_output", - "call_id": call_id, - "output": self.convert_content_to_expected_format(call_response[0]), # list needs to be flattened - } - output.append(computer_call_output) # this needs to be a screenshot - - return output - - def mark_all_previous_msg_for_caching(self): - pass - + def _handle_click_action(self, action): + x, y = action.x, action.y + button = action.button + if button != "left" and button != "right": + button = "left" + return "mouse_click", {"x": x, "y": y, "button": button} + + def _handle_scroll_action(self, action): + x, y = action.x, action.y + scroll_x, scroll_y = action.scroll_x, action.scroll_y + return "scroll_at", {"x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} + + def _handle_keypress_action(self, action): + keys = action.keys + #TODO: Check this if is suitable for BGYM env. + for k in keys: + print(f"Action: keypress '{k}'") + if k.lower() == "enter": + key = "Enter" + elif k.lower() == "space": + key = " " + return "keyboard_press", {"key": key} + + def _handle_type_action(self, action): + text = action.text + return "keyboard_type", {"text": text} + + def _handle_wait_action(self, action): + return "noop", {} + + def _handle_screenshot_action(self, action): + return "noop", {} + + def _handle_drag_action(self, action): + x1, y1 = action.path[0].x, action.path[0].y + x2, y2 = action.path[1].x, action.path[1].y + print(f"Action: drag from ({x1}, {y1}) to ({x2}, {y2})") + return "mouse_drag_and_drop", {"x1": x1, "y1": y1, "x2": x2, "y2": y2} @dataclass class OpenAICUAModelArgs(OpenAIResponseModelArgs): @@ -297,7 +146,7 @@ def make_model(self, extra_kwargs=None, **kwargs): ) def get_message_builder(self) -> MessageBuilder: - return OpenaAICUAMessageBuilder + return OpenAIResponseAPIMessageBuilder # Default configuration for Computer Use Agent From ddc8f5604962f8e9f565039c824b8ae3dc3cdb37 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 11 Jul 2025 22:53:03 -0400 Subject: [PATCH 13/37] fix typo in tracking --- src/agentlab/llm/tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index ab033b22..74557d4d 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -164,7 +164,7 @@ def __call__(self, *args, **kwargs): usage = dict(getattr(response, "usage", {})) if 'prompt_tokens_details' in usage: - usage['cached_tokens'] = usage['prompt_token_details'].cached_tokens + usage['cached_tokens'] = usage['prompt_tokens_details'].cached_tokens if 'input_tokens_details' in usage: usage['cached_tokens'] = usage['input_tokens_details'].cached_tokens usage = {f"usage_{k}": v for k, v in usage.items() if isinstance(v, (int, float))} From a375a35807f6a4c30bd466b37c2bc7972e9a328f Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Sun, 13 Jul 2025 15:01:25 -0400 Subject: [PATCH 14/37] Refactor TrackAPIPricingMixin to remove init inside mixin class --- src/agentlab/llm/tracking.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index 74557d4d..53acbfe2 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -151,22 +151,22 @@ class TrackAPIPricingMixin: def reset_stats(self): self.stats = Stats() - def __init__(self, *args, **kwargs): - pricing_api = kwargs.pop("pricing_api", 
None) + def init_pricing_tracker( + self, pricing_api=None + ): # TODO: Use this function in the base class init instead of having a init in the Mixin class. self._pricing_api = pricing_api - super().__init__(*args, **kwargs) self.set_pricing_attributes() self.reset_stats() def __call__(self, *args, **kwargs): """Call the API and update the pricing tracker.""" + # 'self' here calls ._call_api() method of the subclass response = self._call_api(*args, **kwargs) - usage = dict(getattr(response, "usage", {})) - if 'prompt_tokens_details' in usage: - usage['cached_tokens'] = usage['prompt_tokens_details'].cached_tokens - if 'input_tokens_details' in usage: - usage['cached_tokens'] = usage['input_tokens_details'].cached_tokens + if "prompt_tokens_details" in usage: + usage["cached_tokens"] = usage["prompt_tokens_details"].cached_tokens + if "input_tokens_details" in usage: + usage["cached_tokens"] = usage["input_tokens_details"].cached_tokens usage = {f"usage_{k}": v for k, v in usage.items() if isinstance(v, (int, float))} usage |= {"n_api_calls": 1} usage |= {"effective_cost": self.get_effective_cost(response)} @@ -306,13 +306,13 @@ def get_effective_cost_from_openai_api(self, response) -> float: if usage is None: logging.warning("No usage information found in the response. Defaulting cost to 0.0.") return 0.0 - api_type = 'chatcompletion' if hasattr(usage, "prompt_tokens_details") else 'response' - if api_type == 'chatcompletion': + api_type = "chatcompletion" if hasattr(usage, "prompt_tokens_details") else "response" + if api_type == "chatcompletion": total_input_tokens = usage.prompt_tokens output_tokens = usage.completion_tokens cached_input_tokens = usage.prompt_tokens_details.cached_tokens non_cached_input_tokens = total_input_tokens - cached_input_tokens - elif api_type == 'response': + elif api_type == "response": total_input_tokens = usage.input_tokens output_tokens = usage.output_tokens cached_input_tokens = usage.input_tokens_details.cached_tokens @@ -320,7 +320,7 @@ def get_effective_cost_from_openai_api(self, response) -> float: else: logging.warning(f"Unsupported API type: {api_type}. Defaulting cost to 0.0.") return 0.0 - + cache_read_cost = self.input_cost * OPENAI_CACHE_PRICING_FACTOR["cache_read_tokens"] effective_cost = ( self.input_cost * non_cached_input_tokens From 8ce988557f0d3018d309667ca669fb34178ec385 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Sun, 13 Jul 2025 15:04:56 -0400 Subject: [PATCH 15/37] Multiaction support and cleaner API usage with ToolCalls. 
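
The per-model kwargs plumbing (tools, tool_choice, extra_kwargs) is replaced by a
single APIPayload object that the agent loop fills in and every backend consumes.
Tool results now travel through ToolCall.response_text()/response_image() and
MessageBuilder.add_responded_tool_calls(), and env actions are derived uniformly
from ToolCalls. A typical call, sketched with assumed message and tool values:

    payload = APIPayload(messages=messages, tools=tools, tool_choice="auto")
    llm_output = model(payload)   # BaseResponseModel.__call__ -> _call_api + _parse_response
    action = llm_output.action    # joined into one string when multiple tool calls are returned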
--- src/agentlab/llm/response_api.py | 918 ++++++++++++++++--------------- 1 file changed, 471 insertions(+), 447 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index af256152..b6e5dc1e 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -3,10 +3,12 @@ import os from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Literal, Optional, Union import openai from anthropic import Anthropic +from anthropic.types import Completion +from anthropic.types import Message as AnthrophicMessage from openai import OpenAI from agentlab.llm.llm_utils import image_to_png_base64_url @@ -29,31 +31,39 @@ ContentItem = Dict[str, Any] Message = Dict[str, Union[str, List[ContentItem]]] +# TODO: It would be better idea to let the agent logic decide what is not an env action, instead of env emitting env based actions. BGYM_RESERVED_ACTION_FUNCTION_NAMES = [ - "noop", - "scroll_at", - "mouse_move", - "mouse_up", - "mouse_down", - "mouse_click", - "mouse_dblclick", - "mouse_drag_and_drop", - "mouse_upload_file", - "keyboard_down", - "keyboard_up", - "keyboard_press", - "keyboard_type", - "keyboard_insert_text", - ] + "noop", + "scroll_at", + "mouse_move", + "mouse_up", + "mouse_down", + "mouse_click", + "mouse_dblclick", + "mouse_drag_and_drop", + "mouse_upload_file", + "keyboard_down", + "keyboard_up", + "keyboard_press", + "keyboard_type", + "keyboard_insert_text", +] @dataclass class ToolCall: - #TODO: Check if this is a suitable tool representation for being MCP compliant. + """Represents a tool call made by the LLM. + Attributes: + name: Name of the tool called. + arguments: Arguments passed to the tool. + raw_call: The raw call object from the LLM API. + tool_response: Output of the tool call goes here. It can be only one content item. + """ + name: str = field(default=None) arguments: Dict[str, Any] = field(default_factory=dict) - raw_call: Any = field(default=None) - tool_response: List[ContentItem] = field(default_factory=list) + raw_call: Any = field(default=None) + tool_response: ContentItem = None @property def is_env_action(self) -> bool: @@ -67,20 +77,27 @@ def is_response_set(self) -> bool: """Check if the tool response is set.""" return self.tool_response is not None - def add_text(self, text: str) -> "MessageBuilder": - self.tool_response.append({"text": text}) + def response_text(self, text: str) -> "MessageBuilder": + self.tool_response = {"text": text} return self - def add_image(self, text: str) -> "MessageBuilder": - self.tool_response.append({"image": text}) + def response_image(self, image: str) -> "MessageBuilder": + self.tool_response = {"image": image} return self - + def __repr__(self): return f"ToolCall(name={self.name}, arguments={self.arguments})" @dataclass class ToolCalls: + """A collection of tool calls made by the LLM. + + Attributes: + tool_calls: List of ToolCall objects. + raw_calls: Represents raw tool calls object returned by a LLM API, may contain one or more tool calls. 
+ """ + tool_calls: List[ToolCall] = field(default_factory=list) raw_calls: List[Any] = field(default_factory=list) @@ -91,11 +108,11 @@ def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": def get_env_action_calls(self) -> List[ToolCall]: """Get all tool calls that are reserved Environment actions.""" return [call for call in self.tool_calls if call.is_env_action] - + def get_non_env_action_calls(self) -> List[ToolCall]: """Get all tool calls that are not reserved Environment actions.""" return [call for call in self.tool_calls if not call.is_env_action] - + @property def all_responses_set(self) -> bool: """Check if all tool calls have responses set.""" @@ -108,7 +125,7 @@ def __len__(self) -> int: def __iter__(self): """Make ToolCalls iterable.""" return iter(self.tool_calls) - + def __bool__(self): """Check if there are any tool calls.""" return len(self.tool_calls) > 0 @@ -121,16 +138,15 @@ class LLMOutput: raw_response: Any = field(default=None) think: str = field(default="") action: str = field(default=None) # Default action if no tool call is made - tool_calls: ToolCalls = field(default=None) # This will hold the tool call response if any + tool_calls: ToolCalls = field(default=None) # This will hold the tool call response if any class MessageBuilder: def __init__(self, role: str): self.role = role - self.last_raw_response: LLMOutput = None # NOTE: last_raw_response will be deprecated in future version. We can use ToolCalls object to get all the relevant information. self.content: List[ContentItem] = [] - self.responsed_tool_calls: ToolCalls = None + self.responded_tool_calls: ToolCalls = None @classmethod def system(cls) -> "MessageBuilder": @@ -144,19 +160,16 @@ def user(cls) -> "MessageBuilder": def assistant(cls) -> "MessageBuilder": return cls("assistant") - @classmethod - def tool(cls, last_raw_response) -> "MessageBuilder": - return cls("tool").update_last_raw_response(last_raw_response) + # Use responded_tool_calls to add tool calls to the message content. 
+ # @classmethod + # def tool(cls) -> "MessageBuilder": + # return cls("tool") @abstractmethod def prepare_message(self) -> List[Message]: """Prepare the message for the API call.""" raise NotImplementedError("Subclasses must implement this method.") - def update_last_raw_response(self, last_raw_response: Any) -> "MessageBuilder": - self.last_raw_response = last_raw_response - return self - def add_text(self, text: str) -> "MessageBuilder": self.content.append({"text": text}) return self @@ -174,17 +187,19 @@ def to_markdown(self) -> str: parts.append(f"![Image]({item['image']})") # Tool call markdown repr - if self.responsed_tool_calls: - for i, tool_call in enumerate(self.responsed_tool_calls.tool_calls, 1): + if self.responded_tool_calls is not None: + for i, tool_call in enumerate(self.responded_tool_calls.tool_calls, 1): args = ", ".join(f"{k}={v}" for k, v in tool_call.arguments.items()) parts.append(f"\n**Tool Call {i}**: {tool_call.name}({args})") - - if tool_call.tool_response: + response = tool_call.tool_response + if response is not None: parts.append(f"\n**Tool Response {i}:**") - for response_item in tool_call.tool_response: - content = (f"```\n{response_item['text']}\n```" if "text" in response_item - else f"![Tool Response Image]({response_item['image']})") - parts.append(content) + content = ( + f"```\n{response['text']}\n```" + if "text" in response + else f"![Tool Response Image]({response['image']})" + ) + parts.append(content) markdown = f"### {self.role.capitalize()}\n" markdown += "\n".join(parts) @@ -202,18 +217,14 @@ def mark_all_previous_msg_for_caching(self): raise NotImplementedError @classmethod - def add_responded_tool_calls(cls, responsed_tool_calls: ToolCalls) -> "MessageBuilder": + def add_responded_tool_calls(cls, responded_tool_calls: ToolCalls) -> "MessageBuilder": """Add tool calls to the message content.""" - - assert responsed_tool_calls.all_responses_set, "All tool calls must have a response." - msg = cls.tool(last_raw_response=None) - msg.responsed_tool_calls = responsed_tool_calls + assert responded_tool_calls.all_responses_set, "All tool calls must have a response." + msg = cls('tool') + msg.responded_tool_calls = responded_tool_calls return msg -# TODO: Support parallel tool calls. - - class OpenAIResponseAPIMessageBuilder(MessageBuilder): @classmethod def system(cls) -> "OpenAIResponseAPIMessageBuilder": @@ -238,41 +249,47 @@ def convert_content_to_expected_format(self, content: ContentItem) -> ContentIte else: raise ValueError(f"Unsupported content type: {content}") - def handle_tool_call(self): + def handle_tool_call(self) -> List[Message]: """Handle the tool call response from the last raw response.""" - if self.responsed_tool_calls is None: - raise ValueError("No tool calls found in responsed_tool_calls") + if self.responded_tool_calls is None: + raise ValueError("No tool calls found in responded_tool_calls") output = [] - for fn_call in self.responsed_tool_calls: + output.extend(self.responded_tool_calls.raw_calls.output) # this contains response + for fn_call in self.responded_tool_calls: call_type = fn_call.raw_call.type call_id = fn_call.raw_call.call_id - call_response = fn_call.tool_response # List[ContentItem] + call_response = fn_call.tool_response match call_type: case "function_call": # image output is not supported in function calls response. + assert ( + "image" not in call_response + ), "Image output is not supported in function calls response." 
fn_call_response = { "type": "function_call_output", "call_id": call_id, - "output": [ - self.convert_content_to_expected_format(item) for item in call_response - ], + "output": self.convert_content_to_expected_format(call_response)["text"], } output.append(fn_call_response) case "computer_call": # For computer calls, use only images are expected. + assert ( + "text" not in call_response + ), "Text output is not supported in computer calls response." computer_call_output = { "type": "computer_call_output", "call_id": call_id, - "output": self.convert_content_to_expected_format(call_response[0]), # list needs to be flattened + "output": self.convert_content_to_expected_format(call_response), } output.append(computer_call_output) # this needs to be a screenshot return output def mark_all_previous_msg_for_caching(self): + """Nothing special to do here for openAI. They do not have a notion of cache breakpoints.""" pass @@ -282,21 +299,8 @@ def prepare_message(self) -> List[Message]: content = [self.transform_content(item) for item in self.content] output = {"role": self.role, "content": content} - if self.role == "system": - logging.info( - "Treating system message as 'user'. In the Anthropic API, system messages should be passed as a direct input to the client." - ) - output["role"] = "user" - if self.role == "tool": - assert self.responsed_tool_calls is not None, "No tool_calls added to tool call response" - output["role"] = "user" - output["content"] = [{ - "type": "tool_result", - "tool_use_id": call.raw_call.id, - "content": [self.transform_content(item) for item in call.tool_response] - } for call in self.responsed_tool_calls - ] + return self.handle_tool_call() if self.role == "assistant": # Strip whitespace from assistant text responses. See anthropic error code 400. @@ -305,6 +309,25 @@ def prepare_message(self) -> List[Message]: c["text"] = c["text"].strip() return [output] + def handle_tool_call(self) -> List[Message]: + """Handle the tool call response from the last raw response.""" + if self.responded_tool_calls is None: + raise ValueError("No tool calls found in responded_tool_calls") + + llm_tool_call = {"role": "assistant", "content": self.responded_tool_calls.raw_calls.content} # Add the toolcall block + tool_response = {'role': 'user', 'content': []} # Anthropic expects a list of messages + for call in self.responded_tool_calls: + assert ( + "image" not in call.tool_response + ), "Image output is not supported in tool calls response." 
+ tool_response['content'].append({ + "type": "tool_result", + "tool_use_id": call.raw_call.id, + "content": self.transform_content(call.tool_response)["text"], # needs to be str + }) + + return [llm_tool_call, tool_response] + def transform_content(self, content: ContentItem) -> ContentItem: """Transform content item to the format expected by Anthropic API.""" if "text" in content: @@ -335,13 +358,13 @@ class OpenAIChatCompletionAPIMessageBuilder(MessageBuilder): def prepare_message(self) -> List[Message]: """Prepare the message for the OpenAI API.""" - content = [self.transform_content(item) for item in self.content] - if self.role == "tool": - return self.handle_tool_call(content) - else: - return [{"role": self.role, "content": content}] + content = [] + for item in self.content: + content.append(self.convert_content_to_expected_format(item)) + output = [{"role": self.role, "content": content}] + return output if self.role != "tool" else self.handle_tool_call() - def transform_content(self, content: ContentItem) -> ContentItem: + def convert_content_to_expected_format(self, content: ContentItem) -> ContentItem: """Transform content item to the format expected by OpenAI ChatCompletion.""" if "text" in content: return {"type": "text", "text": content["text"]} @@ -350,30 +373,56 @@ def transform_content(self, content: ContentItem) -> ContentItem: else: raise ValueError(f"Unsupported content type: {content}") - def handle_tool_call(self, content) -> List[Message]: + def handle_tool_call(self) -> List[Message]: """Handle the tool call response from the last raw response.""" + if self.responded_tool_calls is None: + raise ValueError("No tool calls found in responded_tool_calls") output = [] - content_head, *content_tail = content - api_response = self.last_raw_response.choices[0].message - fn_calls = getattr(api_response, "tool_calls", None) - assert fn_calls is not None, "Tool calls not found in the last response" - if len(fn_calls) > 1: - logging.warning("Using only the first tool call from many.") - - # a function_call_output dict has keys "role", "tool_call_id" and "content" - tool_call_reponse = { - "role": "tool", - "tool_call_id": fn_calls[0].id, # using the first tool call ID - "content": content_head.get("text", "Tool call answer in next message"), - "name": fn_calls[0].function.name, # required with OpenRouter - } + output.append(self.responded_tool_calls.raw_calls.choices[0].message) # add raw calls to output + for fn_call in self.responded_tool_calls: + raw_call = fn_call.raw_call + assert ("image" not in fn_call.tool_response + ), "Image output is not supported in function calls response." + # a function_call_output dict has keys "role", "tool_call_id" and "content" + tool_call_reponse = { + "name": raw_call["function"]["name"], # required with OpenRouter + "role": "tool", + "tool_call_id": raw_call["id"], + "content": self.convert_content_to_expected_format(fn_call.tool_response)["text"], + } + output.append(tool_call_reponse) - output.append(tool_call_reponse) - if content_tail: - # if there are more content items, add them as a new user message - output.append({"role": "user", "content": content_tail}) return output + def mark_all_previous_msg_for_caching(self): + """Nothing special to do here for openAI. 
They do not have a notion of cache breakpoints."""
+        pass
+
+
+@dataclass
+class APIPayload:
+    messages: List[MessageBuilder | ToolCalls] | None = None
+    tools: List[Dict[str, Any]] | None = None
+    tool_choice: Literal["none", "auto", "any", "required"] | None = None
+    force_call_tool: str | None = (
+        None  # Name of the tool to call. If set, the LLM is forced to call this tool.
+    )
+    use_cache_breakpoints: bool = (
+        False  # If True, apply cache breakpoints to the messages (applicable to Anthropic).
+    )
+    cache_tool_definition: bool = (
+        False  # If True, cache the tool definition in the last message.
+    )
+    cache_complete_prompt: bool = (
+        False  # If True, cache the complete prompt in the last message.
+    )
+
+    def __post_init__(self):
+        # tool_choice and force_call_tool cannot both be set.
+        assert (
+            self.tool_choice is None or self.force_call_tool is None
+        ), "tool_choice and force_call_tool cannot be set at the same time."
+
+
 # Base class for all API endpoints
 class BaseResponseModel(ABC):
     def __init__(
         self,
         model_name: str,
         api_key: Optional[str] = None,
-        temperature: float = 0.5,
-        max_tokens: int = 100,
-        extra_kwargs: Optional[Dict[str, Any]] = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
     ):
-        self.model_name = model_name
+        self.api_key = api_key
+        self.model_name = model_name
         self.temperature = temperature
         self.max_tokens = max_tokens
-        self.extra_kwargs = extra_kwargs or {}
-        super().__init__()
 
-    def __call__(self, messages: list[dict | MessageBuilder], **kwargs) -> dict:
+    def __call__(self, payload: APIPayload) -> dict:
         """Make a call to the model and return the parsed response."""
-        response = self._call_api(messages, **kwargs)
+        response = self._call_api(payload)
         return self._parse_response(response)
 
     @abstractmethod
-    def _call_api(self, messages: list[dict | MessageBuilder], **kwargs) -> Any:
+    def _call_api(self, payload: APIPayload) -> Any:
         """Make a call to the model API and return the raw response."""
         pass
 
     @abstractmethod
     def _parse_response(self, response: Any) -> LLMOutput:
         """Parse the raw response from the model API and return a structured response."""
         pass
 
+class AgentlabAction:
+    """
+    Collection of utility functions to convert tool calls to the AgentLab action format.
+    """
+
+    def convert_toolcall_to_agentlab_action_format(toolcall: ToolCall) -> str:
+        """Convert a tool call to an AgentLab environment action string.
+ This converts tools calls to python function call strings.""" + action_name, tool_args = toolcall.name, toolcall.arguments + action_args = ", ".join( + f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" for k, v in tool_args.items() + ) + action_str = f"{action_name}({action_args})" + return action_str -@dataclass -class ActionConfig: - action_set: "AbstractActionSet" # TODO: Agentlab AbstractActionSet, have constructor methods to create actions as tools or descriptions with examples. - multiaction: bool = True - env_action_as_tools: bool = True # If True, action set is treated as tools - tools: Optional[List[Dict[str, Any]]] = None # List of tool definitions or list of functions - tool_text_descriptions: str = "" # Some description of the tools, emitted by the environment. - tools_calls_to_env_action_parser: callable = # Some callable given by the environment to convert tool calls to env actions. - text_to_env_action_parser: Optional[Type[MessageBuilder]] = None + def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str: + """Convert multiple actions list to a format that env supports + Joins multiple python function calls with a newline character. + """ + return "\n".join(actions) if actions else None -@dataclass -class ObsConfig -# Check generic agent - pass -@dataclass -class Config: - model_args: BaseModelArgs - obs: ObsConfig - action: ActionConfig - generationConfig: GenerationConfig -@dataclass -class PromptConfig: - # use_hints - # use_summarizer - pass -@dataclass -class ProviderConfig: - """Configuration for the LLM provider.""" - api_key_env_var: Optional[str] = None - base_url: Optional[str] = None # Base URL for the API, if different - # Anything else? # VLLM specific configurations ?, etc. -@dataclass -class LLMConfig: - # backend LLM supported action set - # Any other LLM specific configurations - # Tool calling format? - # Maybe include provider specific configurations here? - +class BaseModelWithPricing(TrackAPIPricingMixin, BaseResponseModel): pass -@dataclass -class GenerationConfig: - temperature: float = 0.5 - max_new_tokens: int = 100 - # Might be useful for exploration to have the ability to modify inside agent loop. - -@dataclass -class APIPayload: - messages: List[MessageBuilder | ToolCalls] - api_endpoint: str - api_key_env_var: Optional[str] = None - base_url: Optional[str] = None - tools: Optional[List[Dict[str, Any]]] = None # Taken from ActionConfig - tool_choice: Optional[str] = None # Fix some literal value for tool choice, e.g., "auto" and convert according to the API. OpenAI and Anthrophic can have different tool choice parameters that behave differently. - generation_config: GenerationConfig = GenerationConfig() - caching: bool = False # If True, cache the response - # The agent loop will form the payload based on the config and pass it to the API call. 
- -# _______________________________________________________________ - class OpenAIResponseModel(BaseModelWithPricing): def __init__( self, model_name: str, + base_url: Optional[str] = None, api_key: Optional[str] = None, - temperature: float = 0.5, - max_tokens: int = 100, - extra_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, + temperature: float | None = None, + max_tokens: int | None = 100, ): - self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) - self.action_space_as_tools = True # this should be a config - self.multiaction_in_a_step = True # this should be a config - super().__init__( - model_name=model_name, - api_key=api_key, - temperature=temperature, - max_tokens=max_tokens, - extra_kwargs=extra_kwargs, - **kwargs, + self.action_space_as_tools = True # this should be a config + super().__init__( # This is passed to BaseModel + model_name=model_name, api_key=api_key, temperature=temperature, max_tokens=max_tokens ) - self.client = OpenAI(api_key=api_key) + client_args = {} + if base_url is not None: + client_args["base_url"] = base_url + if api_key is not None: + client_args["api_key"] = api_key + self.client = OpenAI(**client_args) + # Init pricing tracker after super() so that all attributes have been set. + self.init_pricing_tracker(pricing_api="openai") # Use the PricingMixin - def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: - input = self.convert_messages_to_api_format(messages) + def _call_api(self, payload: APIPayload) -> "ResponseObject": - #TODO: API/Payload Params should be a config dataclass. Update once settled on a config structure. + input = [] + for msg in payload.messages: + input.extend(msg.prepare_message()) api_params: Dict[str, Any] = { "model": self.model_name, "input": input, - "temperature": self.temperature, - "max_output_tokens": self.max_tokens, - **self.extra_kwargs, } + # Not all Open AI models support these parameters (example: o3), so we check if they are set. 
+ if self.temperature is not None: + api_params["temperature"] = self.temperature + if self.max_tokens is not None: + api_params["max_output_tokens"] = self.max_tokens + if payload.tools is not None: + api_params["tools"] = payload.tools + if payload.tool_choice is not None and payload.force_call_tool is None: + api_params["tool_choice"] = ( + "required" if payload.tool_choice in ("required", "any") else payload.tool_choice + ) + if payload.force_call_tool is not None: + api_params["tool_choice"] = {"type": "function", "name": payload.force_call_tool} - if self.tools is not None: - api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice - - # api_params |= kwargs # Merge any additional parameters passed response = call_openai_api_with_retries( self.client.responses.create, api_params, @@ -541,39 +532,42 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: return response - def convert_messages_to_api_format(self, messages: List[MessageBuilder| ToolCalls]) -> List[Message]: - """Convert messages to the format expected by the OpenAI Responses API.""" - input = [] - for msg in messages: - if isinstance(msg, MessageBuilder): - temp = msg.prepare_message() - elif isinstance(msg, ToolCalls): - temp = msg.raw_calls - else: - raise TypeError('Unsupported message type: {}'.format(type(msg))) - input.extend(temp) - return input - def _parse_response(self, response: "OpenAIResponseObject") -> LLMOutput: """Parse the raw response from the OpenAI Responses API.""" + think_output = self._extract_thinking_content_from_response(response) toolcalls = self._extract_tool_calls_from_response(response) + if self.action_space_as_tools: env_action = self._extract_env_actions_from_toolcalls(toolcalls) else: env_action = self._extract_env_actions_from_text_response(response) + return LLMOutput( raw_response=response, think=think_output, - action=env_action if env_action is not None else "", + action=env_action if env_action is not None else None, tool_calls=toolcalls if toolcalls is not None else None, ) + def convert_messages_to_api_format( + self, messages: List[MessageBuilder | ToolCalls] + ) -> List[Message]: + """Convert messages to the format expected by the OpenAI Responses API.""" + input = [] + for msg in messages: + if isinstance(msg, MessageBuilder): + temp = msg.prepare_message() + elif isinstance(msg, ToolCalls): + temp = msg.raw_calls + else: + raise TypeError("Unsupported message type: {}".format(type(msg))) + input.extend(temp) + return input def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> ToolCalls: """Extracts tool calls from the response.""" - #TODO: Should this be in the BaseResponseModelclass? 
- tool_calls = ToolCalls(raw_calls=response.output) + tool_calls = [] for output in response.output: if output.type == "function_call": tool_name = output.name @@ -581,27 +575,26 @@ def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> elif output.type == "computer_call": tool_name, tool_args = self.cua_action_to_env_tool_name_and_args(output.action) else: + # skip if the output is not a tool call continue - tool_call = ToolCall( - name=tool_name, - arguments=tool_args, - raw_call=output, - ) - tool_calls.add_tool_call(tool_call) - return tool_calls + tool_calls.append(ToolCall(name=tool_name, arguments=tool_args, raw_call=output)) + + return ToolCalls(tool_calls=tool_calls, raw_calls=response) def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | None: """Extracts actions from the response.""" - #TODO: Should this be in the BaseResponseModelclass? or Emitted by Environment? - actions = [] - for call in toolcalls: - if call.is_env_action: - action_str = self.convert_toolcall_to_env_action_format(call) - actions.append(action_str) - if self.multiaction_in_a_step: # This should be a config - return self.convert_multiactions_to_env_action_format(actions) - else: - return actions[0] if actions else None + if not toolcalls: + return None + + actions = [ + AgentlabAction.convert_toolcall_to_agentlab_action_format(call) for call in toolcalls + ] + actions = ( + AgentlabAction.convert_multiactions_to_agentlab_action_format(actions) + if len(actions) > 1 + else actions[0] + ) + return actions def _extract_thinking_content_from_response(self, response: "OpenAIResponseObject") -> str: """Extracts the thinking content from the response.""" @@ -616,108 +609,159 @@ def _extract_thinking_content_from_response(self, response: "OpenAIResponseObjec thinking_content += f"{output.output_text}\n" return thinking_content - ### Environment Specific functions, in this case BGYM ### - - #TODO: Should the below functions be in the BaseResponseModelclass? or Emitted by the Environment and intialized using a config? - def convert_toolcall_to_env_action_format(self, toolcall: ToolCall) -> str: - """Convert a tool call to an BGYM environment action string.""" - action_name, tool_args = toolcall.name, toolcall.arguments - action_args = ", ".join( - f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" for k, v in tool_args.items() - ) - action_str = f"{action_name}({action_args})" - return action_str - - def convert_multiactions_to_env_action_format(self, actions:list[Any] ) -> Any: - """Convert multiple actions list to a format that env supports""" - return "\n".join(actions) if actions else None - def cua_action_to_env_tool_name_and_args(self, action: str) -> tuple[str, Dict[str, Any]]: + """ "Overwrite this method to convert a computer action to agentlab action string""" pass - def _extract_env_actions_from_text_response(self, response: "OpenAIResponseObject") -> str | None: + def _extract_env_actions_from_text_response( + self, response: "OpenAIResponseObject" + ) -> str | None: """Extracts environment actions from the text response.""" # Use when action space is not given as tools. 
pass -# TODO: Refactor similar to OpenAIResponseModel class OpenAIChatCompletionModel(BaseModelWithPricing): def __init__( self, model_name: str, - client_args: Optional[Dict[str, Any]] = {}, - temperature: float = 0.5, - max_tokens: int = 100, - extra_kwargs: Optional[Dict[str, Any]] = None, - *args, - **kwargs, + base_url: Optional[str] = None, + api_key: Optional[str] = None, + temperature: float | None = None, + max_tokens: int | None = 100, ): - - self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None)) - self.tool_choice = kwargs.pop("tool_choice", None) - super().__init__( model_name=model_name, temperature=temperature, max_tokens=max_tokens, - extra_kwargs=extra_kwargs, - *args, - **kwargs, ) - - self.client = OpenAI( - **client_args - ) # Ensures client_args is a dict or defaults to an empty dict - - def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion: + self.action_space_as_tools = True # this should be a config + client_args = {} + if base_url is not None: + client_args["base_url"] = base_url + if api_key is not None: + client_args["api_key"] = api_key + self.client = OpenAI(**client_args) + self.init_pricing_tracker(pricing_api="openai") # Use the PricingMixin + + def _call_api(self, payload: APIPayload) -> openai.types.chat.ChatCompletion: input = [] - for msg in messages: - input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) + for msg in payload.messages: + input.extend(msg.prepare_message()) api_params: Dict[str, Any] = { "model": self.model_name, "messages": input, - "temperature": self.temperature, - "max_tokens": self.max_tokens, - **self.extra_kwargs, # Pass tools, tool_choice, etc. here } - if self.tools is not None: - api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + if self.temperature is not None: + api_params["temperature"] = self.temperature + + if self.max_tokens is not None: + api_params["max_completion_tokens"] = self.max_tokens + + if payload.tools is not None: + # tools format is OpenAI Response API format. + api_params["tools"] = self.format_tools_for_chat_completion(payload.tools) + + if payload.tool_choice is not None and payload.force_call_tool is None: + api_params["tool_choice"] = ( + "required" if payload.tool_choice in ("required", "any") else payload.tool_choice + ) + + if payload.force_call_tool is not None: + api_params["tool_choice"] = { + "type": "function", + "function": {"name": payload.force_call_tool}, + } response = call_openai_api_with_retries(self.client.chat.completions.create, api_params) return response def _parse_response(self, response: openai.types.chat.ChatCompletion) -> LLMOutput: + think_output = self._extract_thinking_content_from_response(response) + tool_calls = self._extract_tool_calls_from_response(response) - output = LLMOutput( + if self.action_space_as_tools: + env_action = self._extract_env_actions_from_toolcalls(tool_calls) + else: + env_action = self._extract_env_actions_from_text_response(response) + return LLMOutput( raw_response=response, - think="", - action=None, # Default if no tool call - tool_calls=None, + think=think_output, + action=env_action if env_action is not None else None, + tool_calls=tool_calls if tool_calls is not None else None, ) + + + def _extract_thinking_content_from_response( + self, response: openai.types.chat.ChatCompletion, wrap_tag="think" + ): + """Extracts the content from the message, including reasoning if available. 
+ It wraps the reasoning around ... for easy identification of reasoning content, + When LLM produces 'text' and 'reasoning' in the same message. + Note: The wrapping of 'thinking' content may not be nedeed and may be reconsidered. + + Args: + message: The message object or dict containing content and reasoning. + wrap_tag: The tag name to wrap reasoning content (default: "think"). + + Returns: + str: The extracted content with reasoning wrapped in specified tags. + """ + message = response.choices[0].message + if not isinstance(message, dict): + message = message.to_dict() + + reasoning_content = message.get("reasoning", None) + msg_content = message.get("text", "") # works for Open-router + if reasoning_content: + # Wrap reasoning in tags with newlines for clarity + reasoning_content = f"<{wrap_tag}>{reasoning_content}\n" + logging.debug("Extracting content from response.choices[i].message.reasoning") + else: + reasoning_content = "" + return f"{reasoning_content}{msg_content}{message.get('content', '')}" + + def _extract_tool_calls_from_response( + self, response: openai.types.chat.ChatCompletion + ) -> ToolCalls | None: + """Extracts tool calls from the response.""" message = response.choices[0].message.to_dict() - output.think = self.extract_content_with_reasoning(message) - - if tool_calls := message.get("tool_calls", None): - for tool_call in tool_calls: - function = tool_call["function"] - arguments = json.loads(function["arguments"]) - func_args_str = ", ".join( - [ - f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" - for k, v in arguments.items() - ] + tool_calls = message.get("tool_calls", None) + if tool_calls is None: + return None + tool_call_list = [] + for tc in tool_calls: + tool_call_list.append( + ToolCall( + name=tc["function"]["name"], + arguments=json.loads(tc["function"]["arguments"]), + raw_call=tc, ) - output.action = f"{function['name']}({func_args_str})" - output.tool_calls = { - "role": "assistant", - "tool_calls": [message["tool_calls"][0]], # Use only the first tool call - } - break - return output + ) + return ToolCalls(tool_calls=tool_call_list, raw_calls=response) + + def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | None: + """Extracts actions from the response.""" + if not toolcalls: + return None + + actions = [ + AgentlabAction.convert_toolcall_to_agentlab_action_format(call) for call in toolcalls + ] + actions = ( + AgentlabAction.convert_multiactions_to_agentlab_action_format(actions) + if len(actions) > 1 + else actions[0] + ) + return actions + + def _extract_env_actions_from_text_response( + self, response: "openai.types.chat.ChatCompletion" + ) -> str | None: + """Extracts environment actions from the text response.""" + # Use when action space is not given as tools. + pass @staticmethod def format_tools_for_chat_completion(tools): @@ -744,98 +788,67 @@ def format_tools_for_chat_completion(tools): ] return formatted_tools - @staticmethod - def extract_content_with_reasoning(message, wrap_tag="think"): - """Extracts the content from the message, including reasoning if available. - It wraps the reasoning around ... for easy identification of reasoning content, - When LLM produces 'text' and 'reasoning' in the same message. - Note: The wrapping of 'thinking' content may not be nedeed and may be reconsidered. - Args: - message: The message object or dict containing content and reasoning. - wrap_tag: The tag name to wrap reasoning content (default: "think"). 
- - Returns: - str: The extracted content with reasoning wrapped in specified tags. - """ - if not isinstance(message, dict): - message = message.to_dict() - - reasoning_content = message.get("reasoning", None) - msg_content = message.get("text", "") # works for OR - - if reasoning_content: - # Wrap reasoning in tags with newlines for clarity - reasoning_content = f"<{wrap_tag}>{reasoning_content}\n" - logging.debug("Extracting content from response.choices[i].message.reasoning") - else: - reasoning_content = "" - return f"{reasoning_content}{msg_content}{message.get('content', '')}" - - -# TODO: Refactor similar to OpenAIResponseModel class ClaudeResponseModel(BaseModelWithPricing): def __init__( self, model_name: str, + base_url: Optional[str] = None, api_key: Optional[str] = None, - temperature: float = 0.5, - max_tokens: int = 100, - extra_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, + temperature: float | None = None, + max_tokens: int | None = 100, ): - self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) + self.action_space_as_tools = True # this should be a config super().__init__( model_name=model_name, api_key=api_key, temperature=temperature, max_tokens=max_tokens, - extra_kwargs=extra_kwargs, - **kwargs, ) - - self.client = Anthropic(api_key=api_key) + client_args = {} + if base_url is not None: + client_args["base_url"] = base_url + if api_key is not None: + client_args["api_key"] = api_key + self.client = Anthropic(**client_args) + self.init_pricing_tracker(pricing_api="anthropic") # Use the PricingMixin def _call_api( - self, messages: list[dict | MessageBuilder], tool_choice="auto", **kwargs - ) -> dict: - input = [] - - sys_msg, other_msgs = self.filter_system_messages(messages) + self, payload: APIPayload) -> Completion: + sys_msg, other_msgs = self.filter_system_messages(payload.messages) sys_msg_text = "\n".join(c["text"] for m in sys_msg for c in m.content) + input = [] for msg in other_msgs: - if isinstance(msg, MessageBuilder): - temp = msg.prepare_message() - elif isinstance(msg, ToolCalls): - temp = [{ - "role": "assistant", - "content": msg.raw_calls.content - }] - if kwargs.pop("use_cache_breakpoints", False): + temp = msg.prepare_message() + if payload.use_cache_breakpoints: temp = self.apply_cache_breakpoints(msg, temp) input.extend(temp) api_params: Dict[str, Any] = { "model": self.model_name, "messages": input, - "temperature": self.temperature, - "max_tokens": self.max_tokens, - "system": sys_msg_text, # Anthropic API expects system message as a string - "tool_choice": {"type": tool_choice}, # Tool choice for Claude API - **self.extra_kwargs, # Pass tools, tool_choice, etc. here - } - if self.tools is not None: - api_params["tools"] = self.tools - if kwargs.pop("cache_tool_definition", False): - # Indicating cache control for the last tool enables caching of all previous tool definitions. 
+ "system": sys_msg_text} # Anthropic API expects system message as a string + + if self.temperature is not None: + api_params['temperature'] = self.temperature + if self.max_tokens is not None: + api_params["max_tokens"] = self.max_tokens + + if payload.tools is not None: + api_params["tools"] = payload.tools + if payload.tool_choice is not None and payload.force_call_tool is None: + api_params["tool_choice"] = ( + {"type": "any"} if payload.tool_choice in ("required", "any") else {"type": payload.tool_choice} + ) + if payload.force_call_tool is not None: + api_params["tool_choice"] = {"type": "tool", "name": payload.force_call_tool} + if payload.cache_tool_definition: + # Indicating cache control for the last message enables caching of the last message. api_params["tools"][-1]["cache_control"] = {"type": "ephemeral"} - if kwargs.pop("cache_complete_prompt", False): + if payload.cache_complete_prompt: # Indicating cache control for the last message enables caching of the complete prompt. api_params["messages"][-1]["content"][-1]["cache_control"] = {"type": "ephemeral"} - if self.extra_kwargs.get("reasoning", None) is not None: - api_params["reasoning"] = self.extra_kwargs["reasoning"] response = call_anthropic_api_with_retries(self.client.messages.create, api_params) @@ -858,36 +871,60 @@ def filter_system_messages(messages: list[dict | MessageBuilder]) -> tuple[Messa other_msgs.append(msg) return sys_msgs, other_msgs - def _parse_response(self, response: dict) -> LLMOutput: - result = LLMOutput( + def _parse_response(self, response: "AnthrophicMessage") -> LLMOutput: + + toolcalls = self._extract_tool_calls_from_response(response) + think_output = self._extract_thinking_content_from_response(response) + if self.action_space_as_tools: + env_action = self._extract_env_actions_from_toolcalls(toolcalls) + else: + env_action = self._extract_env_actions_from_text_response(response) + return LLMOutput( raw_response=response, - think="", - action=None, - tool_calls=None - ) - tool_calls = ToolCalls(raw_calls=response) # Initialize ToolCalls to hold tool call responses - action_list = [] - # print(f"Response from Claude: {response}") + think=think_output, + action=env_action if env_action is not None else None, + tool_calls=toolcalls if toolcalls is not None else None, + ) + + def _extract_tool_calls_from_response(self, response: "AnthrophicMessage") -> ToolCalls: + """Extracts tool calls from the response.""" + tool_calls = [] for output in response.content: if output.type == "tool_use": - func_args_str = ", ".join( - [ - f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" - for k, v in output.input.items() - ] + tool_calls.append( + ToolCall( + name=output.name, + arguments=output.input, + raw_call=output, + ) ) - action_list.append(f"{output.name}({func_args_str})") - tool_calls.add_tool_call(ToolCall(name=output.name, arguments=output.input, raw_call=output)) - elif output.type == "text": - result.think += output.text - - result.tool_calls = tool_calls if tool_calls else None - result.action = "\n".join(action_list) - return result + return ToolCalls(tool_calls=tool_calls, raw_calls=response) - # def ensure_cache_conditions(self, msgs: List[Message]) -> bool: - # """Ensure API specific cache conditions are met.""" - # assert sum(getattr(msg, "_cache_breakpoint", 0) for msg in msgs) <= 4, "Too many cache breakpoints in the message." 
+    def _extract_thinking_content_from_response(self, response: "AnthropicMessage"):
+        """Extracts the thinking content from the response."""
+        return "".join(output.text for output in response.content if output.type == "text")
+
+    def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | None:
+        """Extracts environment actions from the tool calls."""
+        if not toolcalls:
+            return None
+
+        actions = [
+            AgentlabAction.convert_toolcall_to_agentlab_action_format(call) for call in toolcalls
+        ]
+        actions = (
+            AgentlabAction.convert_multiactions_to_agentlab_action_format(actions)
+            if len(actions) > 1
+            else actions[0]
+        )
+        return actions
+
+    def _extract_env_actions_from_text_response(
+        self, response: "AnthropicMessage"
+    ) -> str | None:
+        """Extracts environment actions from the text response."""
+        # Use when action space is not given as tools.
+        pass
 
     def apply_cache_breakpoints(self, msg: Message, prepared_msg: dict) -> List[Message]:
         """Apply cache breakpoints to the messages."""
@@ -898,7 +935,6 @@ def apply_cache_breakpoints(self, msg: Message, prepared_msg: dict) -> List[Mess
 
 
 # Factory classes to create the appropriate model based on the API endpoint.
-# TODO: Do we really need these factory classes? how about implementing a _from_args() method in the BaseModelArgs class?
 @dataclass
 class OpenAIResponseModelArgs(BaseModelArgs):
     """Serializable object for instantiating a generic chat model with an OpenAI
@@ -906,14 +942,11 @@ class OpenAIResponseModelArgs(BaseModelArgs):
 
     api = "openai"
 
-    def make_model(self, extra_kwargs=None, **kwargs):
+    def make_model(self):
         return OpenAIResponseModel(
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
-            extra_kwargs=extra_kwargs,
-            pricing_api="openai",
-            **kwargs,
         )
 
     def get_message_builder(self) -> MessageBuilder:
@@ -927,14 +960,11 @@ class ClaudeResponseModelArgs(BaseModelArgs):
 
     api = "anthropic"
 
-    def make_model(self, extra_kwargs=None, **kwargs):
+    def make_model(self):
         return ClaudeResponseModel(
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
-            extra_kwargs=extra_kwargs,
-            pricing_api="anthropic",
-            **kwargs,
         )
 
     def get_message_builder(self) -> MessageBuilder:
@@ -948,14 +978,11 @@ class OpenAIChatModelArgs(BaseModelArgs):
 
     api = "openai"
 
-    def make_model(self, extra_kwargs=None, **kwargs):
+    def make_model(self):
         return OpenAIChatCompletionModel(
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
-            extra_kwargs=extra_kwargs,
-            pricing_api="openai",
-            **kwargs,
         )
 
     def get_message_builder(self) -> MessageBuilder:
@@ -969,43 +996,40 @@ class OpenRouterModelArgs(BaseModelArgs):
 
     api: str = "openai"  # tool description format used by actionset.to_tool_description() in bgym
 
-    def make_model(self, extra_kwargs=None, **kwargs):
+    def make_model(self):
         return OpenAIChatCompletionModel(
-            client_args={
-                "base_url": "https://openrouter.ai/api/v1",
-                "api_key": os.getenv("OPENROUTER_API_KEY"),
-            },
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.getenv("OPENROUTER_API_KEY"),
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
-            extra_kwargs=extra_kwargs,
-            pricing_api="openrouter",
-            **kwargs,
         )
 
     def get_message_builder(self) -> MessageBuilder:
         return OpenAIChatCompletionAPIMessageBuilder
 
 
-class VLLMModelArgs(BaseModelArgs):
-    """Serializable object for instantiating a generic chat model with a VLLM
-    model."""
+# ___Not__Tested__#
 
-    api = "openai"  # tool description format used by 
actionset.to_tool_description() in bgym +# class VLLMModelArgs(BaseModelArgs): +# """Serializable object for instantiating a generic chat model with a VLLM +# model.""" - def make_model(self, extra_kwargs=None, **kwargs): - return OpenAIChatCompletionModel( - client_args={ - "base_url": "http://localhost:8000/v1", - "api_key": os.getenv("VLLM_API_KEY", "EMPTY"), - }, - model_name=self.model_name, # this needs to be set - temperature=self.temperature, - max_tokens=self.max_new_tokens, - extra_kwargs=extra_kwargs, - pricing_api="vllm", - **kwargs, - ) +# api = "openai" # tool description format used by actionset.to_tool_description() in bgym - def get_message_builder(self) -> MessageBuilder: - return OpenAIChatCompletionAPIMessageBuilder +# def make_model(self, extra_kwargs=None, **kwargs): +# return OpenAIChatCompletionModel( +# client_args={ +# "base_url": "http://localhost:8000/v1", +# "api_key": os.getenv("VLLM_API_KEY", "EMPTY"), +# }, +# model_name=self.model_name, # this needs to be set +# temperature=self.temperature, +# max_tokens=self.max_new_tokens, +# extra_kwargs=extra_kwargs, +# pricing_api="vllm", +# **kwargs, +# ) + +# def get_message_builder(self) -> MessageBuilder: +# return OpenAIChatCompletionAPIMessageBuilder From df3bc6d9eff8d32113099b78162a6fbde4641184 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Sun, 13 Jul 2025 15:11:15 -0400 Subject: [PATCH 16/37] Use APIPayload obj to call llms in tool use agent. --- .../agents/tool_use_agent/tool_use_agent.py | 77 ++++++++++++++----- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 098e86ee..cc7426dc 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -28,6 +28,7 @@ MessageBuilder, OpenAIChatModelArgs, OpenAIResponseModelArgs, + APIPayload, ) from agentlab.llm.tracking import cost_tracker_decorator @@ -125,9 +126,6 @@ def contains_image(self) -> bool: if 'image' in item: return True return False - - - SYS_MSG = """You are a web agent. Based on the observation, you will decide which action to take to accomplish your goal. 
@@ -225,12 +223,9 @@ def apply( discussion.append(obs_msg) if tool_calls: - for action_call in tool_calls.get_bgym_action_calls(): - if not self.openai_cua_mode: - action_call.add_text("See the observation") - for fn_call in tool_calls.get_non_bgym_action_calls(): - call_results = execute_fn_calls(fn_call.name, fn_call.arguments) - fn_call.add_text(call_results) + for call in tool_calls: + # call_results = execute_fn_calls(call.name, call.arguments) + call.response_text("See Observation") tool_response = llm.msg.add_responded_tool_calls(tool_calls) discussion.append(tool_response) @@ -288,8 +283,8 @@ def apply(self, llm, discussion: StructuredDiscussion) -> dict: msg = llm.msg.user().add_text("""Summarize\n""") discussion.append(msg) - # TODO need to make sure we don't force tool use here - summary_response = llm(messages=discussion.flatten(), tool_choice="none") + + summary_response = llm(APIPayload(messages=discussion.flatten())) summary_msg = llm.msg.assistant().add_text(summary_response.think) discussion.append(summary_msg) @@ -428,7 +423,7 @@ def __init__( self.call_ids = [] - self.llm = model_args.make_model(extra_kwargs={"tools": self.tools}) + self.llm = model_args.make_model() self.msg_builder = model_args.get_message_builder() self.llm.msg = self.msg_builder @@ -495,11 +490,14 @@ def get_action(self, obs: Any) -> float: messages = self.discussion.flatten() response: LLMOutput = self.llm( - messages=messages, - tool_choice="any", - cache_tool_definition=True, - cache_complete_prompt=False, - use_cache_breakpoints=True, + APIPayload( + messages=messages, + tools=self.tools, # You can update tools available tools now. + tool_choice="any", + cache_tool_definition=True, + cache_complete_prompt=False, + use_cache_breakpoints=True, + ) ) action = response.action think = response.think @@ -508,7 +506,7 @@ def get_action(self, obs: Any) -> float: think = last_summary.content[0]["text"] + "\n" + think self.discussion.new_group() - self.discussion.append(response.tool_calls) + # self.discussion.append(response.tool_calls) # No need to append tool calls anymore. 
self.last_response = response
         self._responses.append(response)  # may be useful for debugging
 
@@ -537,6 +535,32 @@ def get_action(self, obs: Any) -> float:
     temperature=0.1,
     vision_support=True,
 )
+O3_RESPONSE_MODEL = OpenAIResponseModelArgs(
+    model_name="o3-2025-04-16",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,  # O3 does not support temperature
+    vision_support=True,
+)
+O3_CHATAPI_MODEL = OpenAIChatModelArgs(
+    model_name="o3-2025-04-16",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,
+    vision_support=True,
+)
+from agentlab.llm.response_api import OpenRouterModelArgs
+
+GPT4_1_OPENROUTER_MODEL = OpenRouterModelArgs(
+    model_name="openai/gpt-4.1",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,  # left unset; the provider default is used
+    vision_support=True,
+)
 
 OPENAI_CHATAPI_MODEL_CONFIG = OpenAIChatModelArgs(
     model_name="gpt-4o-2024-08-06",
@@ -582,3 +606,20 @@ def get_action(self, obs: Any) -> float:
     model_args=CLAUDE_MODEL_CONFIG,
     config=DEFAULT_PROMPT_CONFIG,
 )
+
+OAI_AGENT = ToolUseAgentArgs(
+    model_args=O3_RESPONSE_MODEL,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
+OAI_CHATAPI_AGENT = ToolUseAgentArgs(
+    model_args=O3_CHATAPI_MODEL,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
+OAI_OPENROUTER_AGENT = ToolUseAgentArgs(
+    model_args=GPT4_1_OPENROUTER_MODEL,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
+## My test can have a different config and a simple task for the tool use agent.

From 0ec59f8f199903dd29b15b24a66140c7dcc44769 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Sun, 13 Jul 2025 15:34:52 -0400
Subject: [PATCH 17/37] remove is_env_action method from ToolCalls.

---
 src/agentlab/llm/response_api.py | 33 --------------------------------
 1 file changed, 33 deletions(-)

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index b6e5dc1e..f74d433c 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -31,24 +31,6 @@
 ContentItem = Dict[str, Any]
 Message = Dict[str, Union[str, List[ContentItem]]]
 
-# TODO: It would be better idea to let the agent logic decide what is not an env action, instead of env emitting env based actions.
-BGYM_RESERVED_ACTION_FUNCTION_NAMES = [
-    "noop",
-    "scroll_at",
-    "mouse_move",
-    "mouse_up",
-    "mouse_down",
-    "mouse_click",
-    "mouse_dblclick",
-    "mouse_drag_and_drop",
-    "mouse_upload_file",
-    "keyboard_down",
-    "keyboard_up",
-    "keyboard_press",
-    "keyboard_type",
-    "keyboard_insert_text",
-]
-
 
 @dataclass
 class ToolCall:
@@ -65,13 +47,6 @@ class ToolCall:
     raw_call: Any = field(default=None)
     tool_response: ContentItem = None
 
-    @property
-    def is_env_action(self) -> bool:
-        """Check if the tool call is a reserved BGYM action."""
-        # TODO: env should return some func to check if agent action is env action.
-        # Keep in mind env may or may not have a fixed set of reserved actions. 
- return self.name in BGYM_RESERVED_ACTION_FUNCTION_NAMES - @property def is_response_set(self) -> bool: """Check if the tool response is set.""" @@ -105,14 +80,6 @@ def add_tool_call(self, tool_call: ToolCall) -> "ToolCalls": self.tool_calls.append(tool_call) return self - def get_env_action_calls(self) -> List[ToolCall]: - """Get all tool calls that are reserved Environment actions.""" - return [call for call in self.tool_calls if call.is_env_action] - - def get_non_env_action_calls(self) -> List[ToolCall]: - """Get all tool calls that are not reserved Environment actions.""" - return [call for call in self.tool_calls if not call.is_env_action] - @property def all_responses_set(self) -> bool: """Check if all tool calls have responses set.""" From e716b8bad0de5558b58073aee58ee186fcca2f43 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Sun, 13 Jul 2025 15:38:47 -0400 Subject: [PATCH 18/37] remove stale method and black formatting --- src/agentlab/llm/response_api.py | 88 ++++++++++++++------------------ 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index f74d433c..1b69d9d8 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -127,11 +127,6 @@ def user(cls) -> "MessageBuilder": def assistant(cls) -> "MessageBuilder": return cls("assistant") - # Use responded_tool_calls to add tool calls to the message content. - # @classmethod - # def tool(cls) -> "MessageBuilder": - # return cls("tool") - @abstractmethod def prepare_message(self) -> List[Message]: """Prepare the message for the API call.""" @@ -187,7 +182,7 @@ def mark_all_previous_msg_for_caching(self): def add_responded_tool_calls(cls, responded_tool_calls: ToolCalls) -> "MessageBuilder": """Add tool calls to the message content.""" assert responded_tool_calls.all_responses_set, "All tool calls must have a response." - msg = cls('tool') + msg = cls("tool") msg.responded_tool_calls = responded_tool_calls return msg @@ -222,7 +217,7 @@ def handle_tool_call(self) -> List[Message]: raise ValueError("No tool calls found in responded_tool_calls") output = [] - output.extend(self.responded_tool_calls.raw_calls.output) # this contains response + output.extend(self.responded_tool_calls.raw_calls.output) # this contains response for fn_call in self.responded_tool_calls: call_type = fn_call.raw_call.type call_id = fn_call.raw_call.call_id @@ -280,18 +275,25 @@ def handle_tool_call(self) -> List[Message]: """Handle the tool call response from the last raw response.""" if self.responded_tool_calls is None: raise ValueError("No tool calls found in responded_tool_calls") - - llm_tool_call = {"role": "assistant", "content": self.responded_tool_calls.raw_calls.content} # Add the toolcall block - tool_response = {'role': 'user', 'content': []} # Anthropic expects a list of messages + + llm_tool_call = { + "role": "assistant", + "content": self.responded_tool_calls.raw_calls.content, + } # Add the toolcall block + tool_response = {"role": "user", "content": []} # Anthropic expects a list of messages for call in self.responded_tool_calls: assert ( "image" not in call.tool_response ), "Image output is not supported in tool calls response." 
- tool_response['content'].append({ - "type": "tool_result", - "tool_use_id": call.raw_call.id, - "content": self.transform_content(call.tool_response)["text"], # needs to be str - }) + tool_response["content"].append( + { + "type": "tool_result", + "tool_use_id": call.raw_call.id, + "content": self.transform_content(call.tool_response)[ + "text" + ], # needs to be str + } + ) return [llm_tool_call, tool_response] @@ -345,11 +347,14 @@ def handle_tool_call(self) -> List[Message]: if self.responded_tool_calls is None: raise ValueError("No tool calls found in responded_tool_calls") output = [] - output.append(self.responded_tool_calls.raw_calls.choices[0].message) # add raw calls to output + output.append( + self.responded_tool_calls.raw_calls.choices[0].message + ) # add raw calls to output for fn_call in self.responded_tool_calls: raw_call = fn_call.raw_call - assert ("image" not in fn_call.tool_response - ), "Image output is not supported in function calls response." + assert ( + "image" not in fn_call.tool_response + ), "Image output is not supported in function calls response." # a function_call_output dict has keys "role", "tool_call_id" and "content" tool_call_reponse = { "name": raw_call["function"]["name"], # required with OpenRouter @@ -422,6 +427,7 @@ def _parse_response(self, response: Any) -> LLMOutput: """Parse the raw response from the model API and return a structured response.""" pass + class AgentlabAction: """ Collection of utility function to convert tool calls to Agentlab action format. @@ -447,6 +453,7 @@ def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str: class BaseModelWithPricing(TrackAPIPricingMixin, BaseResponseModel): pass + class OpenAIResponseModel(BaseModelWithPricing): def __init__( self, @@ -517,21 +524,6 @@ def _parse_response(self, response: "OpenAIResponseObject") -> LLMOutput: tool_calls=toolcalls if toolcalls is not None else None, ) - def convert_messages_to_api_format( - self, messages: List[MessageBuilder | ToolCalls] - ) -> List[Message]: - """Convert messages to the format expected by the OpenAI Responses API.""" - input = [] - for msg in messages: - if isinstance(msg, MessageBuilder): - temp = msg.prepare_message() - elif isinstance(msg, ToolCalls): - temp = msg.raw_calls - else: - raise TypeError("Unsupported message type: {}".format(type(msg))) - input.extend(temp) - return input - def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> ToolCalls: """Extracts tool calls from the response.""" tool_calls = [] @@ -542,7 +534,6 @@ def _extract_tool_calls_from_response(self, response: "OpenAIResponseObject") -> elif output.type == "computer_call": tool_name, tool_args = self.cua_action_to_env_tool_name_and_args(output.action) else: - # skip if the output is not a tool call continue tool_calls.append(ToolCall(name=tool_name, arguments=tool_args, raw_call=output)) @@ -611,7 +602,7 @@ def __init__( self.client = OpenAI(**client_args) self.init_pricing_tracker(pricing_api="openai") # Use the PricingMixin - def _call_api(self, payload: APIPayload) -> openai.types.chat.ChatCompletion: + def _call_api(self, payload: APIPayload) -> "openai.types.chat.ChatCompletion": input = [] for msg in payload.messages: input.extend(msg.prepare_message()) @@ -644,7 +635,7 @@ def _call_api(self, payload: APIPayload) -> openai.types.chat.ChatCompletion: return response - def _parse_response(self, response: openai.types.chat.ChatCompletion) -> LLMOutput: + def _parse_response(self, response: 
"openai.types.chat.ChatCompletion") -> LLMOutput: think_output = self._extract_thinking_content_from_response(response) tool_calls = self._extract_tool_calls_from_response(response) @@ -659,7 +650,6 @@ def _parse_response(self, response: openai.types.chat.ChatCompletion) -> LLMOutp tool_calls=tool_calls if tool_calls is not None else None, ) - def _extract_thinking_content_from_response( self, response: openai.types.chat.ChatCompletion, wrap_tag="think" ): @@ -722,7 +712,7 @@ def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | Non else actions[0] ) return actions - + def _extract_env_actions_from_text_response( self, response: "openai.types.chat.ChatCompletion" ) -> str | None: @@ -737,7 +727,6 @@ def format_tools_for_chat_completion(tools): Why we need this? Ans: actionset.to_tool_description() in bgym only returns description format valid for OpenAI Response API. - Args: tools: List of tool descriptions to format for Chat Completion API. @@ -781,8 +770,7 @@ def __init__( self.client = Anthropic(**client_args) self.init_pricing_tracker(pricing_api="anthropic") # Use the PricingMixin - def _call_api( - self, payload: APIPayload) -> Completion: + def _call_api(self, payload: APIPayload) -> Completion: sys_msg, other_msgs = self.filter_system_messages(payload.messages) sys_msg_text = "\n".join(c["text"] for m in sys_msg for c in m.content) input = [] @@ -795,10 +783,11 @@ def _call_api( api_params: Dict[str, Any] = { "model": self.model_name, "messages": input, - "system": sys_msg_text} # Anthropic API expects system message as a string + "system": sys_msg_text, + } # Anthropic API expects system message as a string if self.temperature is not None: - api_params['temperature'] = self.temperature + api_params["temperature"] = self.temperature if self.max_tokens is not None: api_params["max_tokens"] = self.max_tokens @@ -806,14 +795,16 @@ def _call_api( api_params["tools"] = payload.tools if payload.tool_choice is not None and payload.force_call_tool is None: api_params["tool_choice"] = ( - {"type": "any"} if payload.tool_choice in ("required", "any") else {"type": payload.tool_choice} + {"type": "any"} + if payload.tool_choice in ("required", "any") + else {"type": payload.tool_choice} ) if payload.force_call_tool is not None: api_params["tool_choice"] = {"type": "tool", "name": payload.force_call_tool} if payload.cache_tool_definition: # Indicating cache control for the last message enables caching of the last message. api_params["tools"][-1]["cache_control"] = {"type": "ephemeral"} - if payload.cache_complete_prompt: + if payload.cache_complete_prompt: # Indicating cache control for the last message enables caching of the complete prompt. api_params["messages"][-1]["content"][-1]["cache_control"] = {"type": "ephemeral"} @@ -885,10 +876,8 @@ def _extract_env_actions_from_toolcalls(self, toolcalls: ToolCalls) -> Any | Non else actions[0] ) return actions - - def _extract_env_actions_from_text_response( - self, response: "AnthrophicMessage" - ) -> str | None: + + def _extract_env_actions_from_text_response(self, response: "AnthrophicMessage") -> str | None: """Extracts environment actions from the text response.""" # Use when action space is not given as tools. pass @@ -902,6 +891,7 @@ def apply_cache_breakpoints(self, msg: Message, prepared_msg: dict) -> List[Mess # Factory classes to create the appropriate model based on the API endpoint. 
+ @dataclass class OpenAIResponseModelArgs(BaseModelArgs): """Serializable object for instantiating a generic chat model with an OpenAI From 2fc5b5f6eb7341ac894c7acf1fda2d443bc3f491 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 11:16:09 -0400 Subject: [PATCH 19/37] remove OAI cua reference from tool use agent --- .../agents/tool_use_agent/tool_use_agent.py | 54 ++++--------------- 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index cc7426dc..33312cd4 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -22,13 +22,13 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.llm.llm_utils import image_to_png_base64_url from agentlab.llm.response_api import ( - ToolCalls, + APIPayload, ClaudeResponseModelArgs, LLMOutput, MessageBuilder, OpenAIChatModelArgs, OpenAIResponseModelArgs, - APIPayload, + ToolCalls, ) from agentlab.llm.tracking import cost_tracker_decorator @@ -117,13 +117,13 @@ def get_last_summary(self) -> MessageBuilder | None: def is_goal_set(self) -> bool: """Check if the goal is set in the first group.""" return len(self.groups) > 0 - + def contains_image(self) -> bool: """Check if an image is set in any group""" for grp in self.groups: for msg in grp.messages: for item in msg.content: - if 'image' in item: + if "image" in item: return True return False @@ -177,7 +177,6 @@ class Obs(Block): use_tabs: bool = False add_mouse_pointer: bool = False use_zoomed_webpage: bool = False - openai_cua_mode: bool = False # screenshot can only be added as tool response, given an initial screenshot obs def apply( self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput @@ -197,21 +196,13 @@ def apply( screenshot = obs["screenshot"] if self.add_mouse_pointer: - # TODO this mouse pointer should be added at the browsergym level screenshot = np.array( agent_utils.add_mouse_pointer_from_action( Image.fromarray(obs["screenshot"]), obs["last_action"] ) ) - - if self.openai_cua_mode and discussion.contains_image(): - if tool_calls and tool_calls.get_bgym_action_calls(): - computer_call = tool_calls.get_bgym_action_calls()[0] - computer_call.add_image( - image_to_png_base64_url(screenshot) - ) - else: - obs_msg.add_image(image_to_png_base64_url(screenshot)) + + obs_msg.add_image(image_to_png_base64_url(screenshot)) if self.use_axtree: obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}") @@ -224,7 +215,6 @@ def apply( if tool_calls: for call in tool_calls: - # call_results = execute_fn_calls(call.name, call.arguments) call.response_text("See Observation") tool_response = llm.msg.add_responded_tool_calls(tool_calls) discussion.append(tool_response) @@ -232,9 +222,6 @@ def apply( return obs_msg -def execute_fn_calls(func_name: str, arguments: dict) -> str: - return "" - def _format_tabs(obs): """Format the open tabs in a llm-readable way.""" prompt_pieces = ["Currently open tabs:"] @@ -349,25 +336,6 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict: discussion.append(msg) -# class ToolCall(Block): - -# def __init__(self, tool_server): -# self.tool_server = tool_server - -# def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict: -# # build the message by adding components to obs -# response: LLMOutput = llm(messages=self.messages) - -# 
messages.append(response.assistant_message) # this is tool call - -# tool_answer = self.tool_server.call_tool(response) -# tool_msg = llm.msg.tool() # type: MessageBuilder -# tool_msg.add_tool_id(response.last_computer_call_id) -# tool_msg.update_last_raw_response(response) -# tool_msg.add_text(str(tool_answer)) -# messages.append(tool_msg) - - @dataclass class PromptConfig: tag_screenshot: bool = True # Whether to tag the screenshot with the last action. @@ -492,7 +460,7 @@ def get_action(self, obs: Any) -> float: response: LLMOutput = self.llm( APIPayload( messages=messages, - tools=self.tools, # You can update tools available tools now. + tools=self.tools, # You can update tools available tools now. tool_choice="any", cache_tool_definition=True, cache_complete_prompt=False, @@ -540,7 +508,7 @@ def get_action(self, obs: Any) -> float: max_total_tokens=200_000, max_input_tokens=200_000, max_new_tokens=2_000, - temperature=None, # O3 does not support temperature + temperature=None, # O3 does not support temperature vision_support=True, ) O3_CHATAPI_MODEL = OpenAIChatModelArgs( @@ -595,7 +563,7 @@ def get_action(self, obs: Any) -> float: summarizer=Summarizer(do_summary=True), general_hints=GeneralHints(use_hints=False), task_hint=TaskHint(use_task_hint=True), - keep_last_n_obs=None, # keep only the last observation in the discussion + keep_last_n_obs=None, multiaction=True, # whether to use multi-action or not # action_subsets=("bid",), action_subsets=("coord"), @@ -608,7 +576,7 @@ def get_action(self, obs: Any) -> float: ) OAI_AGENT = ToolUseAgentArgs( - model_args=O3_RESPONSE_MODEL, + model_args=OPENAI_MODEL_CONFIG, config=DEFAULT_PROMPT_CONFIG, ) @@ -621,5 +589,3 @@ def get_action(self, obs: Any) -> float: model_args=GPT4_1_OPENROUTER_MODEL, config=DEFAULT_PROMPT_CONFIG, ) - -## My test can have a different config and a simple task for the tool use agent. 
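
A note on the calling convention that the refactors above settle on: models no longer receive
tools at construction time, and everything request-specific travels in a single APIPayload. A
minimal sketch of one tool-use round trip, assuming an Anthropic API key is configured and
reusing the illustrative get_weather schema (anthropic_tools) from tests/llm/test_response_api.py:

    from agentlab.llm.response_api import APIPayload, ClaudeResponseModelArgs

    model_args = ClaudeResponseModelArgs(model_name="claude-sonnet-4-20250514", max_new_tokens=200)
    llm = model_args.make_model()  # no tools or tool_choice at construction time anymore
    builder = model_args.get_message_builder()

    messages = [builder.user().add_text("What is the weather in Paris?")]
    response = llm(APIPayload(messages=messages, tools=anthropic_tools, tool_choice="auto"))

    # response.action carries the python-call string handed to the env;
    # response.tool_calls holds the parsed calls.
    for call in response.tool_calls:
        call.response_text("It's sunny! 25°C")  # every call needs a response before the next turn
    messages.append(builder.add_responded_tool_calls(response.tool_calls))

Since tool_choice and force_call_tool are mutually exclusive (enforced in PATCH 21), forcing one
specific tool is done by passing force_call_tool="get_weather" and leaving tool_choice unset.
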
From 160fb93ecde78153fdb109a501bea3cec39466e4 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 11:30:39 -0400 Subject: [PATCH 20/37] remove openai cua from this PR --- .../agents/tool_use_agent/openai_cua.py | 177 ------------------ 1 file changed, 177 deletions(-) delete mode 100644 src/agentlab/agents/tool_use_agent/openai_cua.py diff --git a/src/agentlab/agents/tool_use_agent/openai_cua.py b/src/agentlab/agents/tool_use_agent/openai_cua.py deleted file mode 100644 index d7f1f97f..00000000 --- a/src/agentlab/agents/tool_use_agent/openai_cua.py +++ /dev/null @@ -1,177 +0,0 @@ -import json -from dataclasses import dataclass -from typing import Any, Dict, List - -from agentlab.llm.llm_utils import call_openai_api_with_retries -from agentlab.llm.response_api import ( - MessageBuilder, - OpenAIResponseAPIMessageBuilder, - OpenAIResponseModel, - OpenAIResponseModelArgs, - ToolCalls, -) - -from .tool_use_agent import ( - GeneralHints, - Goal, - Obs, - PromptConfig, - Summarizer, - TaskHint, - ToolUseAgentArgs, -) - - -class OpenAICUAModel(OpenAIResponseModel): - - def _call_api(self, messages: list[ToolCalls | MessageBuilder], tool_choice="auto", **kwargs) -> dict: - input = self.convert_messages_to_api_format(messages) - - api_params: Dict[str, Any] = { - "model": self.model_name, - "input": input, - "temperature": self.temperature, - "max_output_tokens": self.max_tokens, - "truncation": "auto", # truncation is required for OpenAI CUA - "tool_choice": "auto", # Tool choice can only be auto - **self.extra_kwargs, - } - - if "tools" in api_params: - cua_tool_present = any( - tool.get("type") == "computer_use_preview" for tool in api_params["tools"] - ) - # CUA requires this tool - if not cua_tool_present: - api_params["tools"].extend( - [ - { - "type": "computer_use_preview", - "display_width": 1024, - "display_height": 768, - "environment": "browser", # TODO: Parametrize this - } - ] - ) - - response = call_openai_api_with_retries( - self.client.responses.create, - api_params, - ) - - return response - - def cua_action_to_env_tool_name_and_args(self, action) -> str: - """ - Given a computer action (e.g., click, double_click, scroll, etc.), - convert it to a text description. - """ - #TODO: #Provide an alternate implementation for OS-World. 
- - action_type = action.type - - try: - action_mapping = { - "click": lambda: self._handle_click_action(action), - "scroll": lambda: self._handle_scroll_action(action), - "keypress": lambda: self._handle_keypress_action(action), - "type": lambda: self._handle_type_action(action), - "wait": lambda: self._handle_wait_action(action), - "screenshot": lambda: self._handle_screenshot_action(action), - "drag": lambda: self._handle_drag_action(action), - } - - if action_type in action_mapping: - return action_mapping[action_type]() - else: - raise ValueError(f"Unrecognized openAI CUA action type: {action_type}") - - except Exception as e: - print(f"Error handling action {action}: {e}") - - def _handle_click_action(self, action): - x, y = action.x, action.y - button = action.button - if button != "left" and button != "right": - button = "left" - return "mouse_click", {"x": x, "y": y, "button": button} - - def _handle_scroll_action(self, action): - x, y = action.x, action.y - scroll_x, scroll_y = action.scroll_x, action.scroll_y - return "scroll_at", {"x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} - - def _handle_keypress_action(self, action): - keys = action.keys - #TODO: Check this if is suitable for BGYM env. - for k in keys: - print(f"Action: keypress '{k}'") - if k.lower() == "enter": - key = "Enter" - elif k.lower() == "space": - key = " " - return "keyboard_press", {"key": key} - - def _handle_type_action(self, action): - text = action.text - return "keyboard_type", {"text": text} - - def _handle_wait_action(self, action): - return "noop", {} - - def _handle_screenshot_action(self, action): - return "noop", {} - - def _handle_drag_action(self, action): - x1, y1 = action.path[0].x, action.path[0].y - x2, y2 = action.path[1].x, action.path[1].y - print(f"Action: drag from ({x1}, {y1}) to ({x2}, {y2})") - return "mouse_drag_and_drop", {"x1": x1, "y1": y1, "x2": x2, "y2": y2} - -@dataclass -class OpenAICUAModelArgs(OpenAIResponseModelArgs): - """Serializable object for instantiating a generic chat model with an OpenAI - model.""" - - api = "openai" - - def make_model(self, extra_kwargs=None, **kwargs): - return OpenAICUAModel( - model_name=self.model_name, - temperature=self.temperature, - max_tokens=self.max_new_tokens, - extra_kwargs=extra_kwargs, - pricing_api="openai", - **kwargs, - ) - - def get_message_builder(self) -> MessageBuilder: - return OpenAIResponseAPIMessageBuilder - - -# Default configuration for Computer Use Agent -DEFAULT_CUA_PROMPT_CONFIG = PromptConfig( - tag_screenshot=True, - goal=Goal(goal_as_system_msg=True), - obs=Obs( - use_last_error=True, - use_screenshot=True, - use_axtree=True, - use_dom=False, - use_som=False, - use_tabs=False, - openai_cua_mode=True, # Enable CUA mode for OpenAI - ), - summarizer=Summarizer(do_summary=True), - general_hints=GeneralHints(use_hints=False), - task_hint=TaskHint(use_task_hint=False), - keep_last_n_obs=1, #NOTE: API error if more than 1 obs is used. There can be only one computer call output in the response. 
- multiaction=True, # whether to use multi-action or not - # action_subsets=("bid",), - action_subsets=("coord"), -) - -OAI_CUA_TOOL_AGENT = ToolUseAgentArgs( - model_args=OpenAICUAModelArgs(model_name="computer-use-preview"), - config=DEFAULT_CUA_PROMPT_CONFIG, -) From d31f49f6051232bb2cca56c1ea5094964f75db84 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:08:00 -0400 Subject: [PATCH 21/37] Refactor APIPayload validation --- src/agentlab/llm/response_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 1b69d9d8..e865801e 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -390,10 +390,8 @@ class APIPayload: ) def __post_init__(self): - # assert tool_choice is None when force_call_tool is set - assert ( - self.tool_choice is None or self.force_call_tool is None - ), "tool_choice and force_call_tool cannot be set at the same time." + if self.tool_choice and self.force_call_tool: + raise ValueError("tool_choice and force_call_tool are mutually exclusive") # # Base class for all API Endpoints @@ -569,7 +567,9 @@ def _extract_thinking_content_from_response(self, response: "OpenAIResponseObjec def cua_action_to_env_tool_name_and_args(self, action: str) -> tuple[str, Dict[str, Any]]: """ "Overwrite this method to convert a computer action to agentlab action string""" - pass + raise NotImplementedError( + "This method should be implemented in the subclass to convert a computer action to agentlab action string." + ) def _extract_env_actions_from_text_response( self, response: "OpenAIResponseObject" From d4368fed8d25e7c6f6bfff358d5ee4af194baf0b Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:08:40 -0400 Subject: [PATCH 22/37] Update test for the new API --- tests/llm/test_response_api.py | 302 ++++++++++++++------------------- 1 file changed, 131 insertions(+), 171 deletions(-) diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py index bfe054a9..6e667dd8 100644 --- a/tests/llm/test_response_api.py +++ b/tests/llm/test_response_api.py @@ -9,6 +9,7 @@ from agentlab.llm import tracking from agentlab.llm.response_api import ( AnthropicAPIMessageBuilder, + APIPayload, ClaudeResponseModelArgs, LLMOutput, OpenAIChatCompletionAPIMessageBuilder, @@ -78,6 +79,69 @@ def create_mock_openai_chat_completion( return completion +responses_api_tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Get the current weather in a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for.", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The unit of temperature.", + }, + }, + "required": ["location"], + }, + } +] + +chat_api_tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Get the current weather in a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for.", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The unit of temperature.", + }, + }, + "required": ["location"], + }, + } +] +anthropic_tools = [ + { + "name": "get_weather", + "description": "Get the 
current weather in a given location.", + "input_schema": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for.", + }, + }, + "required": ["location"], + }, + } +] + + # Helper to create a mock Anthropic response def create_mock_anthropic_response( text_content=None, tool_use=None, input_tokens=15, output_tokens=25 @@ -196,13 +260,6 @@ def test_anthropic_api_message_builder_image(): def test_openai_chat_completion_api_message_builder_text(): builder = OpenAIChatCompletionAPIMessageBuilder.user() builder.add_text("Hello, ChatCompletion!") - # Mock last_response as it's used by tool role - builder.last_raw_response = MagicMock(spec=LLMOutput) - builder.last_raw_response.raw_response = MagicMock() - builder.last_raw_response.raw_response.choices = [MagicMock()] - builder.last_raw_response.raw_response.choices[0].message.to_dict.return_value = { - "tool_calls": [{"function": {"name": "some_function"}}] - } messages = builder.prepare_message() assert len(messages) == 1 @@ -213,13 +270,6 @@ def test_openai_chat_completion_api_message_builder_text(): def test_openai_chat_completion_api_message_builder_image(): builder = OpenAIChatCompletionAPIMessageBuilder.user() builder.add_image("data:image/jpeg;base64,CHATCOMPLETIONBASE64") - # Mock last_response - builder.last_raw_response = MagicMock(spec=LLMOutput) - builder.last_raw_response.raw_response = MagicMock() - builder.last_raw_response.raw_response.choices = [MagicMock()] - builder.last_raw_response.raw_response.choices[0].message.to_dict.return_value = { - "tool_calls": [{"function": {"name": "some_function"}}] - } messages = builder.prepare_message() assert len(messages) == 1 @@ -230,14 +280,12 @@ def test_openai_chat_completion_api_message_builder_image(): def test_openai_chat_completion_model_parse_and_cost(): - args = OpenAIChatModelArgs(model_name="gpt-3.5-turbo") # A cheap model for testing - # Mock the OpenAI client to avoid needing OPENAI_API_KEY + args = OpenAIChatModelArgs(model_name="gpt-3.5-turbo") with patch("agentlab.llm.response_api.OpenAI") as mock_openai_class: mock_client = MagicMock() mock_openai_class.return_value = mock_client model = args.make_model() - # Mock the API call mock_response = create_mock_openai_chat_completion( content="This is a test thought.", tool_calls=[ @@ -254,13 +302,14 @@ def test_openai_chat_completion_model_parse_and_cost(): with patch.object( model.client.chat.completions, "create", return_value=mock_response ) as mock_create: - with tracking.set_tracker() as global_tracker: # Use your global tracker + with tracking.set_tracker() as global_tracker: messages = [ - OpenAIChatCompletionAPIMessageBuilder.user() - .add_text("What's the weather in Paris?") - .prepare_message()[0] + OpenAIChatCompletionAPIMessageBuilder.user().add_text( + "What's the weather in Paris?" + ) ] - parsed_output = model(messages) + payload = APIPayload(messages=messages) + parsed_output = model(payload) mock_create.assert_called_once() assert parsed_output.raw_response.choices[0].message.content == "This is a test thought." 
@@ -273,7 +322,7 @@ def test_openai_chat_completion_model_parse_and_cost(): def test_claude_response_model_parse_and_cost(): - args = ClaudeResponseModelArgs(model_name="claude-3-haiku-20240307") # A cheap model + args = ClaudeResponseModelArgs(model_name="claude-3-haiku-20240307") model = args.make_model() mock_anthropic_api_response = create_mock_anthropic_response( @@ -287,31 +336,23 @@ def test_claude_response_model_parse_and_cost(): model.client.messages, "create", return_value=mock_anthropic_api_response ) as mock_create: with tracking.set_tracker() as global_tracker: - messages = [ - AnthropicAPIMessageBuilder.user() - .add_text("Search for latest news") - ] - parsed_output = model(messages) + messages = [AnthropicAPIMessageBuilder.user().add_text("Search for latest news")] + payload = APIPayload(messages=messages) + parsed_output = model(payload) mock_create.assert_called_once() fn_call = next(iter(parsed_output.tool_calls)) assert "Thinking about the request." in parsed_output.think - assert parsed_output.action == ['search_web(query="latest news")'] + assert parsed_output.action == 'search_web(query="latest news")' assert fn_call.name == "search_web" assert global_tracker.stats["input_tokens"] == 40 assert global_tracker.stats["output_tokens"] == 20 - # assert global_tracker.stats["cost"] > 0 # Verify cost is calculated def test_openai_response_model_parse_and_cost(): - """ - Tests OpenAIResponseModel output parsing and cost tracking with both - function_call and reasoning outputs. - """ args = OpenAIResponseModelArgs(model_name="gpt-4.1") - # Mock outputs mock_function_call_output = { "type": "function_call", "name": "get_current_weather", @@ -325,7 +366,6 @@ def test_openai_response_model_parse_and_cost(): output_tokens=40, ) - # Mock the OpenAI client to avoid needing OPENAI_API_KEY with patch("agentlab.llm.response_api.OpenAI") as mock_openai_class: mock_client = MagicMock() mock_openai_class.return_value = mock_client @@ -336,15 +376,16 @@ def test_openai_response_model_parse_and_cost(): ) as mock_create_method: with tracking.set_tracker() as global_tracker: messages = [ - OpenAIResponseAPIMessageBuilder.user() - .add_text("What's the weather in Boston?") - .prepare_message()[0] + OpenAIResponseAPIMessageBuilder.user().add_text("What's the weather in Boston?") ] - parsed_output = model(messages) + payload = APIPayload(messages=messages) + parsed_output = model(payload) mock_create_method.assert_called_once() fn_calls = [ - content for content in parsed_output.raw_response.output if content.type == "function_call" + content + for content in parsed_output.tool_calls.raw_calls.output + if content.type == "function_call" ] assert parsed_output.action == 'get_current_weather(location="Boston, MA", unit="celsius")' assert fn_calls[0].call_id == "call_abc123" @@ -366,38 +407,15 @@ def test_openai_chat_completion_model_pricy_call(): max_new_tokens=100, ) - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get the current weather in a given location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to get the weather for.", - }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The unit of temperature.", - }, - }, - "required": ["location"], - }, - } - ] - - model = args.make_model(tools=tools, tool_choice="required") + tools = chat_api_tools + model = args.make_model() with tracking.set_tracker() as global_tracker: messages = [ - 
OpenAIChatCompletionAPIMessageBuilder.user() - .add_text("What is the weather in Paris?") - .prepare_message()[0] + OpenAIChatCompletionAPIMessageBuilder.user().add_text("What is the weather in Paris?") ] - parsed_output = model(messages) + payload = APIPayload(messages=messages, tools=tools, tool_choice="required") + parsed_output = model(payload) assert parsed_output.raw_response is not None assert ( @@ -418,35 +436,18 @@ def test_claude_response_model_pricy_call(): temperature=1e-5, max_new_tokens=100, ) - tools = [ - { - "name": "get_weather", - "description": "Get the current weather in a given location.", - "input_schema": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to get the weather for.", - }, - }, - "required": ["location"], - }, - } - ] - model = args.make_model(tools=tools) + tools = anthropic_tools + model = args.make_model() with tracking.set_tracker() as global_tracker: - messages = [ - AnthropicAPIMessageBuilder.user() - .add_text("What is the weather in Paris?") - ] - parsed_output = model(messages) + messages = [AnthropicAPIMessageBuilder.user().add_text("What is the weather in Paris?")] + payload = APIPayload(messages=messages, tools=tools) + parsed_output = model(payload) assert parsed_output.raw_response is not None assert ( - parsed_output.action == ['get_weather(location="Paris")'] - ), f'Expected [get_weather("Paris")] but got {parsed_output.action}' + parsed_output.action == 'get_weather(location="Paris")' + ), f'Expected get_weather("Paris") but got {parsed_output.action}' assert global_tracker.stats["input_tokens"] > 0 assert global_tracker.stats["output_tokens"] > 0 assert global_tracker.stats["cost"] > 0 @@ -461,37 +462,15 @@ def test_openai_response_model_pricy_call(): """ args = OpenAIResponseModelArgs(model_name="gpt-4.1", temperature=1e-5, max_new_tokens=100) - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get the current weather in a given location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to get the weather for.", - }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The unit of temperature.", - }, - }, - "required": ["location"], - }, - } - ] - model = args.make_model(tools=tools) + tools = responses_api_tools + model = args.make_model() with tracking.set_tracker() as global_tracker: messages = [ - OpenAIResponseAPIMessageBuilder.user() - .add_text("What is the weather in Paris?") - .prepare_message()[0] + OpenAIResponseAPIMessageBuilder.user().add_text("What is the weather in Paris?") ] - parsed_output = model(messages) + payload = APIPayload(messages=messages, tools=tools) + parsed_output = model(payload) assert parsed_output.raw_response is not None assert ( @@ -511,56 +490,36 @@ def test_openai_response_model_with_multiple_messages_and_cost_tracking(): """ args = OpenAIResponseModelArgs(model_name="gpt-4.1", temperature=1e-5, max_new_tokens=100) - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get the current weather in a given location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to get the weather for.", - }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The unit of temperature.", - }, - }, - "required": ["location"], - }, - } - ] - - model = args.make_model(tools=tools, 
tool_choice="required") + tools = responses_api_tools + model = args.make_model() builder = args.get_message_builder() messages = [builder.user().add_text("What is the weather in Paris?")] with tracking.set_tracker() as tracker: - # First turn: get initial tool call - parsed = model(messages) + payload = APIPayload(messages=messages, tools=tools, tool_choice="required") + parsed = model(payload) prev_input = tracker.stats["input_tokens"] prev_output = tracker.stats["output_tokens"] prev_cost = tracker.stats["cost"] + assert parsed.tool_calls, "Expected tool calls in the response" + # Set tool responses + for tool_call in parsed.tool_calls: + tool_call.response_text("Its sunny! 25°C") # Simulate tool execution and user follow-up messages += [ - parsed.tool_calls, # Add tool call from the model - builder.tool(parsed.raw_response).add_text("Its sunny! 25°C"), + builder.add_responded_tool_calls(parsed.tool_calls), builder.user().add_text("What is the weather in Delhi?"), ] - parsed = model(messages) + payload = APIPayload(messages=messages, tools=tools, tool_choice="required") + parsed = model(payload) - # Token and cost deltas delta_input = tracker.stats["input_tokens"] - prev_input delta_output = tracker.stats["output_tokens"] - prev_output delta_cost = tracker.stats["cost"] - prev_cost - # Assertions assert prev_input > 0 assert prev_output > 0 assert prev_cost > 0 @@ -606,33 +565,34 @@ def test_openai_chat_completion_model_with_multiple_messages_and_cost_tracking() } ] - model = args.make_model(tools=tools, tool_choice="required") + model = args.make_model() builder = args.get_message_builder() messages = [builder.user().add_text("What is the weather in Paris?")] with tracking.set_tracker() as tracker: - # First turn: get initial tool call - parsed = model(messages) + payload = APIPayload(messages=messages, tools=tools, tool_choice="required") + parsed = model(payload) prev_input = tracker.stats["input_tokens"] prev_output = tracker.stats["output_tokens"] prev_cost = tracker.stats["cost"] + for tool_call in parsed.tool_calls: + tool_call.response_text("Its sunny! 25°C") # Simulate tool execution and user follow-up messages += [ - parsed.tool_calls, # Add tool call from the model - builder.tool(parsed.raw_response).add_text("Its sunny! 
25°C"), + builder.add_responded_tool_calls(parsed.tool_calls), builder.user().add_text("What is the weather in Delhi?"), ] + # Set tool responses - parsed = model(messages) + payload = APIPayload(messages=messages, tools=tools, tool_choice="required") + parsed = model(payload) - # Token and cost deltas delta_input = tracker.stats["input_tokens"] - prev_input delta_output = tracker.stats["output_tokens"] - prev_output delta_cost = tracker.stats["cost"] - prev_cost - # Assertions assert prev_input > 0 assert prev_output > 0 assert prev_cost > 0 @@ -673,38 +633,38 @@ def test_claude_model_with_multiple_messages_pricy_call(): }, } ] - model = model_factory.make_model(tools=tools) + model = model_factory.make_model() msg_builder = model_factory.get_message_builder() messages = [] messages.append(msg_builder.user().add_text("What is the weather in Paris?")) with tracking.set_tracker() as global_tracker: - llm_output1 = model(messages) + payload = APIPayload(messages=messages, tools=tools) + llm_output1 = model(payload) prev_input = global_tracker.stats["input_tokens"] prev_output = global_tracker.stats["output_tokens"] prev_cost = global_tracker.stats["cost"] - messages.append(llm_output1.tool_calls) for tool_call in llm_output1.tool_calls: - tool_call.add_text("It's sunny! 25°C") - messages.append( - msg_builder.add_responded_tool_calls(llm_output1.tool_calls)) - messages.append(msg_builder.user().add_text("What is the weather in Delhi?")) - llm_output2 = model(messages) - # Token and cost deltas + tool_call.response_text("It's sunny! 25°C") + messages += [ + msg_builder.add_responded_tool_calls(llm_output1.tool_calls), + msg_builder.user().add_text("What is the weather in Delhi?"), + ] + payload = APIPayload(messages=messages, tools=tools) + llm_output2 = model(payload) delta_input = global_tracker.stats["input_tokens"] - prev_input delta_output = global_tracker.stats["output_tokens"] - prev_output delta_cost = global_tracker.stats["cost"] - prev_cost - # Assertions assert prev_input > 0, "Expected previous input tokens to be greater than 0" assert prev_output > 0, "Expected previous output tokens to be greater than 0" assert prev_cost > 0, "Expected previous cost value to be greater than 0" assert llm_output2.raw_response is not None assert ( - llm_output2.action == ['get_weather(location="Delhi", unit="celsius")'] - ), f'Expected [get_weather("Delhi")] but got {llm_output2.action}' + llm_output2.action == 'get_weather(location="Delhi", unit="celsius")' + ), f'Expected get_weather("Delhi") but got {llm_output2.action}' assert delta_input > 0, "Expected new input tokens to be greater than 0" assert delta_output > 0, "Expected new output tokens to be greater than 0" assert delta_cost > 0, "Expected new cost value to be greater than 0" From 1305892d70e0ada6c1f41111fc4fb3a721674aae Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:09:55 -0400 Subject: [PATCH 23/37] Add tests to check parallel tool calling ability of APIs and models. 
--- tests/llm/test_response_api.py | 68 ++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py index 6e667dd8..6634bc11 100644 --- a/tests/llm/test_response_api.py +++ b/tests/llm/test_response_api.py @@ -673,6 +673,68 @@ def test_claude_model_with_multiple_messages_pricy_call(): assert global_tracker.stats["cost"] == pytest.approx(prev_cost + delta_cost) -# TODO: Add tests for image token costing (this is complex and model-specific) -# - For OpenAI, you'd need to know how they bill for images (e.g., fixed cost per image + tokens for text parts) -# - You'd likely need to mock the response from client.chat.completions.create to include specific usage for images. +## Test multiaction +@pytest.mark.pricy +def test_multi_action_tool_calls(): + """ + Test that the model can produce multiple tool calls in parallel. + Uncomment commented lines to see the full behaviour of models and tool choices. + """ + # test_config (setting name, BaseModelArgs, model_name, tools) + tool_test_configs = [ + ( + "gpt-4.1-responses API", + OpenAIResponseModelArgs, + "gpt-4.1-2025-04-14", + responses_api_tools, + ), + ("gpt-4.1-chat Completions API", OpenAIChatModelArgs, "gpt-4.1-2025-04-14", chat_api_tools), + # ("claude-3", ClaudeResponseModelArgs, "claude-3-haiku-20240307", anthropic_tools), # fails + # ("claude-3.7", ClaudeResponseModelArgs, "claude-3-7-sonnet-20250219", anthropic_tools), # fails + ("claude-4-sonnet", ClaudeResponseModelArgs, "claude-sonnet-4-20250514", anthropic_tools), + # add more models as needed + ] + + def add_user_messages(msg_builder): + return [ + msg_builder.user().add_text("What is the weather in Paris and Delhi?"), + msg_builder.user().add_text("You must call multiple tools to achieve the task."), + ] + + res_df = [] + + for tool_choice in [ + # 'none', + # 'required', # fails for Responses API + # 'any', # fails for Responses API + "auto", + # 'get_weather' + ]: + for name, llm_class, checkpoint_name, tools in tool_test_configs: + print(name, "tool choice:", tool_choice, "\n", "**" * 10) + model_args = llm_class(model_name=checkpoint_name, max_new_tokens=200, temperature=None) + llm, msg_builder = model_args.make_model(), model_args.get_message_builder() + messages = add_user_messages(msg_builder) + if tool_choice == "get_weather": # force a specific tool call + response: LLMOutput = llm( + APIPayload(messages=messages, tools=tools, force_call_tool=tool_choice) + ) + else: + response: LLMOutput = llm( + APIPayload(messages=messages, tools=tools, tool_choice=tool_choice) + ) + num_tool_calls = len(response.tool_calls) if response.tool_calls else 0 + res_df.append( + { + "model": name, + "checkpoint": checkpoint_name, + "tool_choice": tool_choice, + "num_tool_calls": num_tool_calls, + "action": response.action, + } + ) + assert ( + num_tool_calls == 2 + ), f"Expected 2 tool calls, but got {num_tool_calls} for {name} with tool choice {tool_choice}" + # import pandas as pd + # print(pd.DataFrame(res_df)) From 8f2ee28030062f278f91ee9856c760fbb16199fb Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:46:21 -0400 Subject: [PATCH 24/37] add tool_call_to_python_code in response_api.py --- src/agentlab/llm/response_api.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index e865801e..9a6acb61 100644 --- 
a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -434,12 +434,8 @@ class AgentlabAction:
     def convert_toolcall_to_agentlab_action_format(toolcall: ToolCall) -> str:
         """Convert a tool call to an Agentlab environment action string.
         This converts tools calls to python function call strings."""
-        action_name, tool_args = toolcall.name, toolcall.arguments
-        action_args = ", ".join(
-            f'{k}="{v}"' if isinstance(v, str) else f"{k}={v}" for k, v in tool_args.items()
-        )
-        action_str = f"{action_name}({action_args})"
-        return action_str
+        tool_name, tool_args = toolcall.name, toolcall.arguments
+        return tool_call_to_python_code(tool_name, tool_args)
 
     def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str:
         """Convert multiple actions list to a format that env supports
@@ -474,7 +470,7 @@ def __init__(
         # Init pricing tracker after super() so that all attributes have been set.
         self.init_pricing_tracker(pricing_api="openai")  # Use the PricingMixin
 
-    def _call_api(self, payload: APIPayload) -> "ResponseObject":
+    def _call_api(self, payload: APIPayload) -> "OpenAIResponseObject":
 
         input = []
         for msg in payload.messages:
@@ -966,6 +962,18 @@ def get_message_builder(self) -> MessageBuilder:
         return OpenAIChatCompletionAPIMessageBuilder
 
 
+def tool_call_to_python_code(func_name, kwargs):
+    """Format a function name and kwargs dict into a Python function call string."""
+    if kwargs is None:
+        kwargs = {}
+
+    if not kwargs:
+        return f"{func_name}()"
+
+    args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items())
+    return f"{func_name}({args_str})"
+
+
 # ___Not__Tested__#
 
 # class VLLMModelArgs(BaseModelArgs):

From dff3e33616a18a040ec5d0e741438247ed65b464 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 14:47:11 -0400
Subject: [PATCH 25/37] fix quotes in response API test

---
 tests/llm/test_response_api.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py
index 6634bc11..29a44cca 100644
--- a/tests/llm/test_response_api.py
+++ b/tests/llm/test_response_api.py
@@ -313,7 +313,7 @@ def test_openai_chat_completion_model_parse_and_cost():
 
     mock_create.assert_called_once()
     assert parsed_output.raw_response.choices[0].message.content == "This is a test thought." 
-    assert parsed_output.action == 'get_weather(location="Paris")'
+    assert parsed_output.action == """get_weather(location='Paris')"""
     assert parsed_output.raw_response.choices[0].message.tool_calls[0].id == "call_123"
     # Check cost tracking (token counts)
     assert global_tracker.stats["input_tokens"] == 50
@@ -419,8 +419,8 @@ def test_openai_chat_completion_model_pricy_call():

     assert parsed_output.raw_response is not None
     assert (
-        parsed_output.action == 'get_weather(location="Paris")'
-    ), f""" Expected get_weather(location="Paris") but got {parsed_output.action}"""
+        parsed_output.action == "get_weather(location='Paris')"
+    ), f""" Expected get_weather(location='Paris') but got {parsed_output.action}"""
     assert global_tracker.stats["input_tokens"] > 0
     assert global_tracker.stats["output_tokens"] > 0
     assert global_tracker.stats["cost"] > 0
@@ -446,8 +446,8 @@ def test_claude_response_model_pricy_call():

     assert parsed_output.raw_response is not None
     assert (
-        parsed_output.action == 'get_weather(location="Paris")'
-    ), f'Expected get_weather("Paris") but got {parsed_output.action}'
+        parsed_output.action == "get_weather(location='Paris')"
+    ), f"""Expected get_weather('Paris') but got {parsed_output.action}"""
     assert global_tracker.stats["input_tokens"] > 0
     assert global_tracker.stats["output_tokens"] > 0
     assert global_tracker.stats["cost"] > 0
@@ -474,8 +474,8 @@ def test_openai_response_model_pricy_call():

     assert parsed_output.raw_response is not None
     assert (
-        parsed_output.action == """get_weather(location="Paris")"""
-    ), f""" Expected get_weather(location="Paris") but got {parsed_output.action}"""
+        parsed_output.action == """get_weather(location='Paris')"""
+    ), f""" Expected get_weather(location='Paris') but got {parsed_output.action}"""
     assert global_tracker.stats["input_tokens"] > 0
     assert global_tracker.stats["output_tokens"] > 0
     assert global_tracker.stats["cost"] > 0
@@ -524,7 +524,9 @@ def test_openai_response_model_with_multiple_messages_and_cost_tracking():
     assert prev_output > 0
     assert prev_cost > 0
     assert parsed.raw_response is not None
-    assert parsed.action == 'get_weather(location="Delhi")', f"Unexpected action: {parsed.action}"
+    assert (
+        parsed.action == """get_weather(location='Delhi')"""
+    ), f"Unexpected action: {parsed.action}"
     assert delta_input > 0
     assert delta_output > 0
     assert delta_cost > 0
@@ -597,7 +599,9 @@ def test_openai_chat_completion_model_with_multiple_messages_and_cost_tracking()
     assert prev_output > 0
     assert prev_cost > 0
     assert parsed.raw_response is not None
-    assert parsed.action == 'get_weather(location="Delhi")', f"Unexpected action: {parsed.action}"
+    assert (
+        parsed.action == """get_weather(location='Delhi')"""
+    ), f"Unexpected action: {parsed.action}"
     assert delta_input > 0
     assert delta_output > 0
     assert delta_cost > 0
@@ -663,8 +667,8 @@ def test_claude_model_with_multiple_messages_pricy_call():
     assert prev_cost > 0, "Expected previous cost value to be greater than 0"
     assert llm_output2.raw_response is not None
     assert (
-        llm_output2.action == 'get_weather(location="Delhi", unit="celsius")'
-    ), f'Expected get_weather("Delhi") but got {llm_output2.action}'
+        llm_output2.action == """get_weather(location='Delhi', unit='celsius')"""
+    ), f"""Expected get_weather('Delhi') but got {llm_output2.action}"""
     assert delta_input > 0, "Expected new input tokens to be greater than 0"
     assert delta_output > 0, "Expected new output tokens to be greater than 0"
     assert delta_cost > 0, "Expected new cost value to be greater than 0"
@@ -682,12 +686,7 @@ def test_multi_action_tool_calls():
     """
     # test_config (setting name, BaseModelArgs, model_name, tools)
     tool_test_configs = [
-        (
-            "gpt-4.1-responses API",
-            OpenAIResponseModelArgs,
-            "gpt-4.1-2025-04-14",
-            responses_api_tools,
-        ),
+        ("gpt-4.1-responses API", OpenAIResponseModelArgs, "gpt-4.1-2025-04-14", responses_api_tools),
         ("gpt-4.1-chat Completions API", OpenAIChatModelArgs, "gpt-4.1-2025-04-14", chat_api_tools),
         # ("claude-3", ClaudeResponseModelArgs, "claude-3-haiku-20240307", anthropic_tools),  # fails
         # ("claude-3.7", ClaudeResponseModelArgs, "claude-3-7-sonnet-20250219", anthropic_tools),  # fails

From fe37bb87269254618fedda15edaf7a1501518b32 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 15:44:15 -0400
Subject: [PATCH 26/37] Make gaia test conditional on successful import

---
 tests/agents/test_gaia_agent.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/agents/test_gaia_agent.py b/tests/agents/test_gaia_agent.py
index 0d39f9ef..604ac00c 100644
--- a/tests/agents/test_gaia_agent.py
+++ b/tests/agents/test_gaia_agent.py
@@ -2,10 +2,15 @@
 import uuid
 from pathlib import Path

-from tapeagents.steps import ImageObservation
+try:
+    from tapeagents.steps import ImageObservation

-from agentlab.agents.tapeagent.agent import TapeAgent, TapeAgentArgs, load_config
-from agentlab.benchmarks.gaia import GaiaBenchmark, GaiaQuestion
+    from agentlab.agents.tapeagent.agent import TapeAgent, TapeAgentArgs, load_config
+    from agentlab.benchmarks.gaia import GaiaBenchmark, GaiaQuestion
+except ModuleNotFoundError:
+    import pytest
+
+    pytest.skip("Skipping test due to missing dependencies", allow_module_level=True)


 def mock_dataset() -> dict:

From 72feab7f8b85a66735aa397246d87413e7fcdf2d Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 16:36:39 -0400
Subject: [PATCH 27/37] add tool calls to message for pretty xray output

---
 src/agentlab/agents/tool_use_agent/tool_use_agent.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 0e5b2294..39033759 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -486,8 +486,11 @@ def get_action(self, obs: Any) -> float:
         tools_msg = MessageBuilder("tool_description").add_text(tools_str)

         # Adding these extra messages to visualize in gradio
-        messages.insert(0, tools_msg)  # insert at the beginning of the messages
-        messages.append(response.tool_calls)
+        messages.insert(0, tools_msg)  # insert at the beginning of the message
+        # This avoids the assertion error with self.llm.user().add_responded_tool_calls(tool_calls)
+        msg = self.llm.msg("tool")
+        msg.responded_tool_calls = response.tool_calls
+        messages.append(msg)

         agent_info = bgym.AgentInfo(
             think=think,

From f8a5a3c68c630655672a163a674f5f0f89b0fbb3 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 16:37:51 -0400
Subject: [PATCH 28/37] use tool_call_to_python_code in to_markdown for responded tool calls

---
 src/agentlab/llm/response_api.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index 417aab05..a70d03b0 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -151,8 +151,7 @@ def to_markdown(self) -> str:
         # Tool call markdown repr
         if self.responded_tool_calls is not None:
             for i, tool_call in enumerate(self.responded_tool_calls.tool_calls, 1):
-                args = ", ".join(f"{k}={v}" for k, v in tool_call.arguments.items())
-                parts.append(f"\n**Tool Call {i}**: {tool_call.name}({args})")
+                parts.append(f"\n**Tool Call {i}**: {tool_call_to_python_code(tool_call.name, tool_call.arguments)}")
                 response = tool_call.tool_response
                 if response is not None:
                     parts.append(f"\n**Tool Response {i}:**")

From 12b6d509bff354f219b5be494d8876858c2142c6 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 16:40:30 -0400
Subject: [PATCH 29/37] black formatting.

---
 src/agentlab/agents/tool_use_agent/tool_use_agent.py | 2 +-
 src/agentlab/llm/response_api.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index 39033759..c6b665fd 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -490,7 +490,7 @@ def get_action(self, obs: Any) -> float:
         # This avoids the assertion error with self.llm.user().add_responded_tool_calls(tool_calls)
         msg = self.llm.msg("tool")
         msg.responded_tool_calls = response.tool_calls
-        messages.append(msg) 
+        messages.append(msg)

         agent_info = bgym.AgentInfo(
             think=think,

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index a70d03b0..dfc9858a 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -151,7 +151,9 @@ def to_markdown(self) -> str:
         # Tool call markdown repr
         if self.responded_tool_calls is not None:
             for i, tool_call in enumerate(self.responded_tool_calls.tool_calls, 1):
-                parts.append(f"\n**Tool Call {i}**: {tool_call_to_python_code(tool_call.name, tool_call.arguments)}")
+                parts.append(
+                    f"\n**Tool Call {i}**: {tool_call_to_python_code(tool_call.name, tool_call.arguments)}"
+                )
                 response = tool_call.tool_response
                 if response is not None:
                     parts.append(f"\n**Tool Response {i}:**")

From 42aa6807356cc01e39ec958de84a5c8ede4cc6df Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 16:49:06 -0400
Subject: [PATCH 30/37] make darglint compliant

---
 src/agentlab/llm/response_api.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index dfc9858a..a13c0e22 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -433,13 +433,23 @@ class AgentlabAction:
     def convert_toolcall_to_agentlab_action_format(toolcall: ToolCall) -> str:
         """Convert a tool call to an Agentlab environment action string.
-        This converts tools calls to python function call strings."""
+        Args:
+            toolcall: ToolCall object containing the name and arguments of the tool call.
+
+        Returns:
+            str: A string representing the action in Agentlab format i.e. python function call string.
+        """
+
         tool_name, tool_args = toolcall.name, toolcall.arguments
         return tool_call_to_python_code(tool_name, tool_args)

     def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str:
-        """Convert multiple actions list to a format that env supports
-        Joins multiple python function calls with a newline character.
+ """Convert multiple actions list to a format that env supports. + Args: + actions: List of action strings to be joined. + + Returns: + str: Joined actions separated by newlines, or None if empty. """ return "\n".join(actions) if actions else None @@ -655,7 +665,7 @@ def _extract_thinking_content_from_response( Note: The wrapping of 'thinking' content may not be nedeed and may be reconsidered. Args: - message: The message object or dict containing content and reasoning. + response: The message object or dict containing content and reasoning. wrap_tag: The tag name to wrap reasoning content (default: "think"). Returns: @@ -723,6 +733,7 @@ def format_tools_for_chat_completion(tools): Why we need this? Ans: actionset.to_tool_description() in bgym only returns description format valid for OpenAI Response API. + Args: tools: List of tool descriptions to format for Chat Completion API. From ddf672b28d55c12416b009d282ca07dfaffedb08 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:03:05 -0400 Subject: [PATCH 31/37] black formatting --- tests/llm/test_response_api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py index e0435cac..dda7281e 100644 --- a/tests/llm/test_response_api.py +++ b/tests/llm/test_response_api.py @@ -686,7 +686,12 @@ def test_multi_action_tool_calls(): """ # test_config (setting name, BaseModelArgs, model_name, tools) tool_test_configs = [ - ("gpt-4.1-responses API", OpenAIResponseModelArgs, "gpt-4.1-2025-04-14", responses_api_tools), + ( + "gpt-4.1-responses API", + OpenAIResponseModelArgs, + "gpt-4.1-2025-04-14", + responses_api_tools, + ), ("gpt-4.1-chat Completions API", OpenAIChatModelArgs, "gpt-4.1-2025-04-14", chat_api_tools), # ("claude-3", ClaudeResponseModelArgs, "claude-3-haiku-20240307", anthropic_tools), # fails # ("claude-3.7", ClaudeResponseModelArgs, "claude-3-7-sonnet-20250219", anthropic_tools), # fails From 119a43d7a8812cbc13d8c7d0178dc2235293c8b9 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 20:50:07 -0400 Subject: [PATCH 32/37] Log warning if effective cost is negative --- src/agentlab/llm/tracking.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index 53acbfe2..b8bcce7c 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -13,7 +13,7 @@ TRACKER = threading.local() -ANTHROPHIC_CACHE_PRICING_FACTOR = { +ANTHROPIC_CACHE_PRICING_FACTOR = { "cache_read_tokens": 0.1, # Cost for 5 min ephemeral cache. 
See Pricing Here: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing "cache_write_tokens": 1.25, } @@ -274,8 +274,8 @@ def get_effective_cost_from_antrophic_api(self, response) -> float: cache_read_tokens = getattr(usage, "cache_input_tokens", 0) cache_write_tokens = getattr(usage, "cache_creation_input_tokens", 0) - cache_read_cost = self.input_cost * ANTHROPHIC_CACHE_PRICING_FACTOR["cache_read_tokens"] - cache_write_cost = self.input_cost * ANTHROPHIC_CACHE_PRICING_FACTOR["cache_write_tokens"] + cache_read_cost = self.input_cost * ANTHROPIC_CACHE_PRICING_FACTOR["cache_read_tokens"] + cache_write_cost = self.input_cost * ANTHROPIC_CACHE_PRICING_FACTOR["cache_write_tokens"] # Calculate the effective cost effective_cost = ( @@ -284,6 +284,10 @@ def get_effective_cost_from_antrophic_api(self, response) -> float: + cache_read_tokens * cache_read_cost + cache_write_tokens * cache_write_cost ) + if effective_cost < 0: + logging.warning( + "Anthropic: Negative effective cost detected.(Impossible! Likely a bug)" + ) return effective_cost def get_effective_cost_from_openai_api(self, response) -> float: @@ -308,25 +312,29 @@ def get_effective_cost_from_openai_api(self, response) -> float: return 0.0 api_type = "chatcompletion" if hasattr(usage, "prompt_tokens_details") else "response" if api_type == "chatcompletion": - total_input_tokens = usage.prompt_tokens + total_input_tokens = usage.prompt_tokens # (cache read tokens + new input tokens) output_tokens = usage.completion_tokens cached_input_tokens = usage.prompt_tokens_details.cached_tokens - non_cached_input_tokens = total_input_tokens - cached_input_tokens + new_input_tokens = total_input_tokens - cached_input_tokens elif api_type == "response": - total_input_tokens = usage.input_tokens + total_input_tokens = usage.input_tokens # (cache read tokens + new input tokens) output_tokens = usage.output_tokens cached_input_tokens = usage.input_tokens_details.cached_tokens - non_cached_input_tokens = total_input_tokens - cached_input_tokens + new_input_tokens = total_input_tokens - cached_input_tokens else: logging.warning(f"Unsupported API type: {api_type}. Defaulting cost to 0.0.") return 0.0 - cache_read_cost = self.input_cost * OPENAI_CACHE_PRICING_FACTOR["cache_read_tokens"] effective_cost = ( - self.input_cost * non_cached_input_tokens + self.input_cost * new_input_tokens + cached_input_tokens * cache_read_cost + self.output_cost * output_tokens ) + if effective_cost < 0: + logging.warning( + f"OpenAI: Negative effective cost detected.(Impossible! Likely a bug). " + f"New input tokens: {total_input_tokens}" + ) return effective_cost From 8f3105fa010bf908d400c9ba3a53155bf883fe57 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 14 Jul 2025 21:21:31 -0400 Subject: [PATCH 33/37] update test mock object to incude cache info. --- tests/llm/test_response_api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py index dda7281e..653f87bf 100644 --- a/tests/llm/test_response_api.py +++ b/tests/llm/test_response_api.py @@ -56,6 +56,7 @@ def create_mock_openai_chat_completion( # or if get_tokens_counts_from_response had different fallback logic. 
     completion.usage.prompt_tokens = prompt_tokens
     completion.usage.completion_tokens = completion_tokens
+    completion.usage.prompt_tokens_details.cached_tokens = 0

     completion.model_dump.return_value = {
         "id": "chatcmpl-xxxx",
@@ -69,6 +70,7 @@ def create_mock_openai_chat_completion(
             "output_tokens": completion_tokens,  # Generic name
             "prompt_tokens": prompt_tokens,  # OpenAI specific
             "completion_tokens": completion_tokens,  # OpenAI specific
+            "prompt_tokens_details": {"cached_tokens": 0},
         },
     }
     message.to_dict.return_value = {
@@ -166,6 +168,8 @@ def create_mock_anthropic_response(
     response.usage = MagicMock()
     response.usage.input_tokens = input_tokens
     response.usage.output_tokens = output_tokens
+    response.usage.cache_input_tokens = 0
+    response.usage.cache_creation_input_tokens = 0

     return response

@@ -207,6 +211,7 @@ def create_mock_openai_responses_api_response(
     response_mock.usage.output_tokens = output_tokens
     response_mock.usage.prompt_tokens = input_tokens
     response_mock.usage.completion_tokens = output_tokens
+    response_mock.usage.input_tokens_details.cached_tokens = 0

     return response_mock

From 16d334cc976de4bac006161c7dd88142557ac71d Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 21:39:27 -0400
Subject: [PATCH 34/37] fix nested mock object in test.

---
 tests/llm/test_response_api.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py
index 653f87bf..ad1d9dd2 100644
--- a/tests/llm/test_response_api.py
+++ b/tests/llm/test_response_api.py
@@ -56,7 +56,9 @@ def create_mock_openai_chat_completion(
     # or if get_tokens_counts_from_response had different fallback logic.
     completion.usage.prompt_tokens = prompt_tokens
     completion.usage.completion_tokens = completion_tokens
-    completion.usage.prompt_tokens_details.cached_tokens = 0
+    prompt_tokens_details_mock = MagicMock()
+    prompt_tokens_details_mock.cached_tokens = 0
+    completion.usage.prompt_tokens_details = prompt_tokens_details_mock

     completion.model_dump.return_value = {
         "id": "chatcmpl-xxxx",
@@ -211,7 +213,9 @@ def create_mock_openai_responses_api_response(
     response_mock.usage.output_tokens = output_tokens
     response_mock.usage.prompt_tokens = input_tokens
     response_mock.usage.completion_tokens = output_tokens
-    response_mock.usage.input_tokens_details.cached_tokens = 0
+    input_tokens_details_mock = MagicMock()
+    input_tokens_details_mock.cached_tokens = 0
+    response_mock.usage.input_tokens_details = input_tokens_details_mock

     return response_mock

From f60b31415cdd6769332df02aa9e554c1d6561e81 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Mon, 14 Jul 2025 23:59:04 -0400
Subject: [PATCH 35/37] Fix Responses API mock object specification in test_responses_api.py

---
 tests/llm/test_response_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/llm/test_response_api.py b/tests/llm/test_response_api.py
index ad1d9dd2..6bb639f6 100644
--- a/tests/llm/test_response_api.py
+++ b/tests/llm/test_response_api.py
@@ -184,7 +184,7 @@ def create_mock_openai_responses_api_response(
     Compatible with OpenAIResponseModel and TrackAPIPricingMixin.
     """
-    response_mock = MagicMock(openai.types.responses.response)
+    response_mock = MagicMock(spec=openai.types.responses.response.Response)
     response_mock.type = "response"
     response_mock.output = []

@@ -208,7 +208,7 @@ def create_mock_openai_responses_api_response(
         response_mock.output.append(output_item_mock)

     # Token usage for pricing tracking
-    response_mock.usage = MagicMock()
+    response_mock.usage = MagicMock(spec=openai.types.responses.response.ResponseUsage)
     response_mock.usage.input_tokens = input_tokens
     response_mock.usage.output_tokens = output_tokens
     response_mock.usage.prompt_tokens = input_tokens

From 22f938558f22804f1b51a79133300fd6bb755281 Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 15 Jul 2025 14:55:00 -0400
Subject: [PATCH 36/37] fix obs indentation and remove unnecessary code.

---
 .../agents/tool_use_agent/tool_use_agent.py | 26 ++++++------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
index c6b665fd..bec693ae 100644
--- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py
+++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -119,15 +119,6 @@ def is_goal_set(self) -> bool:
         """Check if the goal is set in the first group."""
         return len(self.groups) > 0

-    def contains_image(self) -> bool:
-        """Check if an image is set in any group"""
-        for grp in self.groups:
-            for msg in grp.messages:
-                for item in msg.content:
-                    if "image" in item:
-                        return True
-        return False
-

 SYS_MSG = """You are a web agent. Based on the observation, you will decide which action to take to accomplish your goal.
 You strive for excellence and need to be as meticulous as possible. Make sure to explore when not sure.
@@ -176,7 +167,7 @@ class Obs(Block):
     use_dom: bool = False
     use_som: bool = False
     use_tabs: bool = False
-    add_mouse_pointer: bool = False
+    # add_mouse_pointer: bool = False
     use_zoomed_webpage: bool = False

     def apply(
@@ -196,15 +187,14 @@ def apply(
         else:
             screenshot = obs["screenshot"]

-            if self.add_mouse_pointer:
-                screenshot = np.array(
-                    agent_utils.add_mouse_pointer_from_action(
-                        Image.fromarray(obs["screenshot"]), obs["last_action"]
-                    )
-                )
-
-        obs_msg.add_image(image_to_png_base64_url(screenshot))
+        # if self.add_mouse_pointer:
+        #     screenshot = np.array(
+        #         agent_utils.add_mouse_pointer_from_action(
+        #             Image.fromarray(obs["screenshot"]), obs["last_action"]
+        #         )
+        #     )
+        obs_msg.add_image(image_to_png_base64_url(screenshot))

         if self.use_axtree:
             obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}")
         if self.use_dom:

From a6f53497e3f398705ae3708f77d093743348099f Mon Sep 17 00:00:00 2001
From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com>
Date: Tue, 15 Jul 2025 14:55:28 -0400
Subject: [PATCH 37/37] improve type hints

---
 src/agentlab/llm/response_api.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
index a13c0e22..e8c74849 100644
--- a/src/agentlab/llm/response_api.py
+++ b/src/agentlab/llm/response_api.py
@@ -104,8 +104,10 @@ class LLMOutput:

     raw_response: Any = field(default=None)
     think: str = field(default="")
-    action: str = field(default=None)  # Default action if no tool call is made
-    tool_calls: ToolCalls = field(default=None)  # This will hold the tool call response if any
+    action: str | None = field(default=None)  # Default action if no tool call is made
+    tool_calls: ToolCalls | None = field(
+        default=None
+    )  # This will hold the tool call response if any


 class MessageBuilder:
@@ -374,10 +376,10 @@ def mark_all_previous_msg_for_caching(self):

 @dataclass
 class APIPayload:
-    messages: List[MessageBuilder | ToolCalls] = None
+    messages: List[MessageBuilder] | None = None
     tools: List[Dict[str, Any]] | None = None
     tool_choice: Literal["none", "auto", "any", "required"] | None = None
-    force_call_tool: str = (
+    force_call_tool: str | None = (
         None  # Name of the tool to call # If set, will force the LLM to call this tool.
     )
     use_cache_breakpoints: bool = (
@@ -410,7 +412,7 @@ def __init__(
         self.max_tokens = max_tokens
         super().__init__()

-    def __call__(self, payload: APIPayload) -> dict:
+    def __call__(self, payload: APIPayload) -> LLMOutput:
         """Make a call to the model and return the parsed response."""
         response = self._call_api(payload)
         return self._parse_response(response)
@@ -431,25 +433,29 @@ class AgentlabAction:
     Collection of utility function to convert tool calls to Agentlab action format.
     """

+    @staticmethod
     def convert_toolcall_to_agentlab_action_format(toolcall: ToolCall) -> str:
         """Convert a tool call to an Agentlab environment action string.
+
         Args:
             toolcall: ToolCall object containing the name and arguments of the tool call.

         Returns:
-            str: A string representing the action in Agentlab format i.e. python function call string.
+            A string representing the action in Agentlab format i.e. python function call string.
         """

         tool_name, tool_args = toolcall.name, toolcall.arguments
         return tool_call_to_python_code(tool_name, tool_args)

-    def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str:
+    @staticmethod
+    def convert_multiactions_to_agentlab_action_format(actions: list[str]) -> str | None:
         """Convert multiple actions list to a format that env supports.
+
         Args:
             actions: List of action strings to be joined.

         Returns:
-            str: Joined actions separated by newlines, or None if empty.
+            Joined actions separated by newlines, or None if empty.
         """
         return "\n".join(actions) if actions else None