Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 85 additions & 78 deletions reproducibility_journal.csv

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/agentlab/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@

- TapeAgent: An agent that uses the Tape data structure to perform actions

- VisualAgent: An agent that uses visual observations to perform actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated Word in Documentation category Documentation

Tell me more
What is the issue?

There is a duplicated 'to' in the description of VisualAgent.

Why this matters

The duplicate word makes the documentation incorrect and unprofessional, affecting the clarity of the API documentation for users.

Suggested change ∙ Feature Preview

Replace the line with a correctly formatted description:

- VisualAgent: An agent that uses visual observations to perform actions
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


- VisualWebArenaAgent: An implementation of the agent used in WebArena and VisualWebArena
"""
5 changes: 4 additions & 1 deletion src/agentlab/agents/generic_agent/agent_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .generic_agent import GenericAgentArgs
from .generic_agent_prompt import GenericPromptFlags
from .tmlr_config import BASE_FLAGS
from .tmlr_config import BASE_FLAGS, get_base_agent

FLAGS_CUSTOM = GenericPromptFlags(
obs=dp.ObsFlags(
Expand Down Expand Up @@ -302,6 +302,9 @@
flags=BASE_FLAGS,
)

AGENT_QWEN_2_5_VL_32B = get_base_agent("openrouter/qwen/qwen2.5-vl-32b-instruct")
AGENT_QWEN_3_32B = get_base_agent("openrouter/qwen/qwen3-32b")

DEFAULT_RS_FLAGS = GenericPromptFlags(
flag_group="default_rs",
obs=dp.ObsFlags(
Expand Down
1 change: 1 addition & 0 deletions src/agentlab/agents/generic_agent/tmlr_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@


def get_base_agent(llm_config: str):
assert llm_config in CHAT_MODEL_ARGS_DICT, f"Unsupported LLM config: {llm_config}"
return GenericAgentArgs(
chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config],
flags=BASE_FLAGS,
Expand Down
43 changes: 43 additions & 0 deletions src/agentlab/agents/visual_agent/agent_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,46 @@
chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],
flags=DEFAULT_PROMPT_FLAGS,
)

VISUAL_AGENT_QWEN_2_5_VL_32B = VisualAgentArgs(
chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/qwen/qwen2.5-vl-32b-instruct"],
flags=DEFAULT_PROMPT_FLAGS,
)

def get_som_agent(llm_config: str):
    """Create a basic 1-step vision agent using set-of-marks (SOM) screenshots.

    Args:
        llm_config: Key into CHAT_MODEL_ARGS_DICT identifying the chat model.

    Returns:
        VisualAgentArgs configured with SOM observation flags and an
        ``agent_name`` of the form "VisualAgent-som-<model>" (slashes
        replaced by underscores).

    Raises:
        ValueError: If ``llm_config`` is not a known model configuration.
    """
    # Raise instead of assert: assertions are stripped under `python -O`,
    # which would leave caller input unvalidated in optimized runs.
    if llm_config not in CHAT_MODEL_ARGS_DICT:
        raise ValueError(f"Unsupported LLM config: {llm_config}")

    obs_flags = dp.ObsFlags(
        use_tabs=True,
        use_error_logs=True,
        use_past_error_logs=False,
        use_screenshot=True,
        use_som=True,  # set-of-marks overlay on the screenshot
        openai_vision_detail="auto",
    )
    action_flags = dp.ActionFlags(
        action_set=bgym.HighLevelActionSetArgs(subsets=["bid"]),
        long_description=True,
        individual_examples=False,
    )
    som_prompt_flags = PromptFlags(
        obs=obs_flags,
        action=action_flags,
        use_thinking=True,
        use_concrete_example=False,
        use_abstract_example=True,
        enable_chat=False,
        extra_instructions=None,
    )

    agent_args = VisualAgentArgs(
        chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config],
        flags=som_prompt_flags,
    )
    model_name = agent_args.chat_model_args.model_name
    # Slashes in model names (e.g. "qwen/qwen2.5-vl-32b") are not filesystem-safe.
    agent_args.agent_name = f"VisualAgent-som-{model_name}".replace("/", "_")

    return agent_args


VISUAL_SOM_AGENT_LLAMA4_17B_INSTRUCT = get_som_agent("openrouter/meta-llama/llama-4-maverick")
20 changes: 18 additions & 2 deletions src/agentlab/llm/chat_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,19 +322,35 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
tracking.TRACKER.instance(input_tokens, output_tokens, cost)

if n_samples == 1:
res = AIMessage(completion.choices[0].message.content)
res = AIMessage(self.extract_content_with_reasoning(completion.choices[0].message))
if self.log_probs:
res["log_probs"] = completion.choices[0].log_probs
return res
else:
return [AIMessage(c.message.content) for c in completion.choices]
return [
AIMessage(self.extract_content_with_reasoning(c.message))
for c in completion.choices
]

def get_stats(self):
    """Return per-call LLM statistics (currently only the retry count)."""
    # busted_retry is intentionally not reported; a busted retry is not
    # logged here because it surfaces elsewhere if it occurs.
    stats = {"n_retry_llm": self.retries}
    return stats

# Support for models that return reasoning.
def extract_content_with_reasoning(self, message, wrap_tag="think"):
    """Return the message content, prefixed by the model's reasoning if any.

    A non-empty ``reasoning`` attribute on *message* is wrapped in
    ``<think>...</think>`` tags (tag name configurable via ``wrap_tag``)
    and prepended to the content, for backward compatibility with
    consumers that parse think-tags out of the text.
    """
    reasoning = getattr(message, "reasoning", None)
    if not reasoning:
        # No (or empty) reasoning: pass the content through unchanged.
        return message.content
    return f"<{wrap_tag}>\n{reasoning}\n</{wrap_tag}>\n{message.content}"


class OpenAIChatModel(ChatModel):
def __init__(
Expand Down
16 changes: 16 additions & 0 deletions src/agentlab/llm/llm_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,20 @@
max_new_tokens=64_000,
temperature=1e-1,
),
"openrouter/qwen/qwen2.5-vl-32b-instruct": OpenRouterModelArgs(
model_name="qwen/qwen2.5-vl-32b-instruct",
max_total_tokens=128_000,
max_input_tokens=120_000,
max_new_tokens=8_000,
Comment on lines +212 to +214
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No Token Buffer Safety Margin category Performance

Tell me more
What is the issue?

The sum of max_input_tokens and max_new_tokens equals max_total_tokens exactly, which could lead to token limit errors at runtime.

Why this matters

When the actual input reaches near max_input_tokens, even slightly exceeding it due to tokenization differences could cause failures since there's no buffer in the total token limit.

Suggested change ∙ Feature Preview

Add a small buffer by reducing max_input_tokens or max_new_tokens to ensure total is less than max_total_tokens:

max_total_tokens=128_000,
max_input_tokens=119_000,  # Reduced to provide buffer
max_new_tokens=8_000,
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

temperature=1e-1,
vision_support=True,
),
"openrouter/qwen/qwen3-32b": OpenRouterModelArgs(
model_name="qwen/qwen3-32b",
max_total_tokens=128_000,
max_input_tokens=120_000,
max_new_tokens=8_000,
temperature=1e-1,
vision_support=True,
),
}
Loading