Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
732 changes: 0 additions & 732 deletions ICML2024/script.ipynb

This file was deleted.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


![AgentLab Banner](https://github.com/user-attachments/assets/a23b3cd8-b5c4-4918-817b-654ae6468cb4)

[![pypi](https://badge.fury.io/py/agentlab.svg)](https://pypi.org/project/agentlab/)
Expand All @@ -10,6 +8,7 @@
[![Tests](https://github.com/ServiceNow/AgentLab/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/ServiceNow/AgentLab/actions/workflows/unit_tests.yml)



[🛠️ Setup](#%EF%B8%8F-setup-agentlab)  | 
[🤖 Assistant](#-ui-assistant)  | 
[🚀 Launch Experiments](#-launch-experiments)  | 
Expand Down Expand Up @@ -48,6 +47,7 @@ AgentLab Features:
| [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
| [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |


## 🛠️ Setup AgentLab

```bash
Expand Down
89 changes: 0 additions & 89 deletions src/agentlab/analyze/error_categorization.py

This file was deleted.

47 changes: 2 additions & 45 deletions src/agentlab/analyze/inspect_results.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import fnmatch
import io
import json
import random
import re
import traceback
import warnings
from collections import defaultdict
from datetime import datetime
from logging import warn
from pathlib import Path

Expand All @@ -16,25 +14,14 @@
from IPython.display import display
from tqdm import tqdm

from agentlab.analyze.error_categorization import (
ERR_CLASS_MAP,
is_critical_server_error,
is_minor_server_error,
)
from agentlab.experiments.exp_utils import RESULTS_DIR
from agentlab.utils.bootstrap import bootstrap_matrix, convert_df_to_array

# TODO: find a more portable way to implement set_task_category_as_index, or at
# least handle dynamic imports. We don't want to always import workarena.
# from browsergym.workarena import TASK_CATEGORY_MAP

warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

try:
import pyperclip
except ImportError:
pyperclip = None

pd.set_option("display.multi_sparse", False)

AGENT_NAME_KEY = "agent.agent_name"
Expand Down Expand Up @@ -224,18 +211,6 @@ def report_constant_and_variables(df, show_stack_traces=True):
print(f" ...\n")


def get_bootstrap(df, metric, reduce_fn=np.nanmean, n_bootstrap=100, group_by=TASK_KEY, prior=0.5):
    """Return the stratified bootstrap mean and std of `metric`.

    Args:
        df: Dataframe holding the results; its index is reset before grouping.
        metric: Name of the metric forwarded to `convert_df_to_array`.
        reduce_fn: Reduction forwarded to `bootstrap_matrix`.
        n_bootstrap: Number of bootstrap samples forwarded to `bootstrap_matrix`.
        group_by: Column(s) used to group the rows (the strata).
        prior: Constant appended as one extra column to every row of the
            array before bootstrapping. Pass None to skip it.

    Returns:
        A `(mean, std)` tuple computed with `np.nanmean` / `np.nanstd` over
        the values returned by `bootstrap_matrix`.
    """
    strata = df.reset_index().groupby(group_by)
    # NOTE(review): presumably one row per group; confirm against
    # convert_df_to_array (the meaning of threshold=0.7 is defined there).
    values = convert_df_to_array(strata, metric=metric, threshold=0.7)

    if prior is not None:
        prior_column = prior * np.ones((len(values), 1))
        values = np.concatenate([values, prior_column], axis=1)

    samples = bootstrap_matrix(values, n_bootstrap=n_bootstrap, reduce_fn=reduce_fn)
    return np.nanmean(samples), np.nanstd(samples)


def get_std_err(df, metric):
"""Get the standard error for a binary metric."""
# extract non missing values
Expand All @@ -262,7 +237,7 @@ def get_sample_std_err(df, metric):
return mean, std_err


def summarize(sub_df, use_bootstrap=False):
def summarize(sub_df):
if not "cum_reward" in sub_df:
record = dict(
avg_reward=np.nan,
Expand All @@ -279,10 +254,7 @@ def summarize(sub_df, use_bootstrap=False):
if n_completed == 0:
return None

if use_bootstrap:
_mean_reward, std_reward = get_bootstrap(sub_df, "cum_reward")
else:
_mean_reward, std_reward = get_std_err(sub_df, "cum_reward")
_mean_reward, std_reward = get_std_err(sub_df, "cum_reward")

# sanity check, if there is an error the reward should be zero
assert sub_df[sub_df["err_msg"].notnull()]["cum_reward"].sum() == 0
Expand Down Expand Up @@ -466,21 +438,6 @@ def _rename_bool_flags(report: pd.DataFrame, true_str="✓", false_str="-"):
return report


def to_clipboard(df: pd.DataFrame):
    """Copy the dataframe to the clipboard as a tab separated csv.

    This is a best-effort helper: when the optional `pyperclip` module is not
    available (the module-level name is None) nothing is done, and any failure
    raised while copying is logged as a warning instead of being propagated.

    Args:
        df: Dataframe to serialize. The index is included in the output.

    Returns:
        None.
    """
    # Local import: use `logging.warning` instead of the deprecated
    # `logging.warn` alias without touching the file-level imports.
    import logging

    if pyperclip is None:
        # No clipboard backend installed; skip the serialization entirely.
        return

    # With no target path, `to_csv` returns the CSV content as a string.
    csv_string = df.to_csv(sep="\t", index=True)
    try:
        pyperclip.copy(csv_string)
    except Exception as e:
        # Clipboard access commonly fails on headless machines; the broad
        # catch is deliberate so that this convenience helper never crashes.
        logging.warning(f"Failed to copy to clipboard: {e}")


def flag_report(report: pd.DataFrame, metric: str = "avg_reward", round_digits: int = 2):
# for all index in the multi-index with boolean value, get the average for
# True and the average for False separately. Produce a new dataframe with
Expand Down
Loading