Skip to content

Commit 33fba24

Browse files
authored
Add more agent evals to evals cli (#1422)
# why

We need more evals for the agent.

# what changed

- Added 19 new evals composed primarily of "hard"-level tasks from public datasets such as onlineMind2web.
- Updated evals to use `agent` rather than `v3Agent`; the incorrect import was causing tasks to fail.

# test plan

Ran evals.

<!-- This is an auto-generated description by cubic. -->

---

## Summary by cubic

Added 18 new hard-level agent evals and fixed the agent import to use the correct agent, improving coverage and stability of browser tasks.

- **New Features**
  - Added evals for diverse sites (Amazon cart, KFC order, Redfin rentals, Flipkart filters, WebMD tools, Trustpilot, Uniqlo, Alibaba, NVIDIA drivers, OED search, Radiotimes, TheGamer, Trailhead, etc.).
  - Integrated ScreenshotCollector in new evals to capture journeys for better automated evaluation.
  - Updated evals.config.json to register all new tasks under the agent category.
- **Bug Fixes**
  - Replaced v3Agent with agent across existing evals to prevent task failures.
  - Standardized agent.execute usage and evaluation flow to improve reliability.

<sup>Written for commit b947d97. Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
1 parent cadd192 commit 33fba24

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1524
-53
lines changed

packages/evals/evals.config.json

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,82 @@
548548
{
549549
"name": "agent/onlineMind2Web",
550550
"categories": ["external_agent_benchmarks"]
551+
},
552+
{
553+
"name": "agent/alibaba_supplier_search",
554+
"categories": ["agent"]
555+
},
556+
{
557+
"name": "agent/amazon_shoes_cart",
558+
"categories": ["agent"]
559+
},
560+
{
561+
"name": "agent/columbia_tuition",
562+
"categories": ["agent"]
563+
},
564+
{
565+
"name": "agent/flipkart_laptops",
566+
"categories": ["agent"]
567+
},
568+
{
569+
"name": "agent/google_shopping",
570+
"categories": ["agent"]
571+
},
572+
{
573+
"name": "agent/hotels_paris_amenities",
574+
"categories": ["agent"]
575+
},
576+
{
577+
"name": "agent/instacart_organic_bananas",
578+
"categories": ["agent"]
579+
},
580+
{
581+
"name": "agent/kfc_tenders_combo",
582+
"categories": ["agent"]
583+
},
584+
{
585+
"name": "agent/made_in_china_supplier",
586+
"categories": ["agent"]
587+
},
588+
{
589+
"name": "agent/nvidia_hgx_driver",
590+
"categories": ["agent"]
591+
},
592+
{
593+
"name": "agent/oed_word_search",
594+
"categories": ["agent"]
595+
},
596+
{
597+
"name": "agent/radiotimes_tv_schedule",
598+
"categories": ["agent"]
599+
},
600+
{
601+
"name": "agent/redfin_apartment_rental",
602+
"categories": ["agent"]
603+
},
604+
{
605+
"name": "agent/thegamer_opinion_article",
606+
"categories": ["agent"]
607+
},
608+
{
609+
"name": "agent/trailhead_superbadge",
610+
"categories": ["agent"]
611+
},
612+
{
613+
"name": "agent/trustpilot_hr_companies",
614+
"categories": ["agent"]
615+
},
616+
{
617+
"name": "agent/uniqlo_mens_blazers",
618+
"categories": ["agent"]
619+
},
620+
{
621+
"name": "agent/webmd_audiologist_search",
622+
"categories": ["agent"]
623+
},
624+
{
625+
"name": "agent/webmd_ovulation_calculator",
626+
"categories": ["agent"]
551627
}
552628
]
553629
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { EvalFunction } from "../../types/evals";
2+
import { V3Evaluator } from "@browserbasehq/stagehand";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
5+
/**
 * Agent eval: find solar-panel suppliers on Alibaba.
 *
 * Navigates the first page of the `v3` context to https://www.alibaba.com/,
 * runs the agent with the instruction below while a `ScreenshotCollector`
 * captures the page, then asks a `V3Evaluator` whether the task was completed,
 * passing it the collected screenshots and the agent's final message.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and `logs`.
 *   `_success` is true only when the evaluator answers "YES". On failure,
 *   `message` holds either the evaluator's reasoning or the message of the
 *   error that was thrown.
 */
export const alibaba_supplier_search: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  logger,
  agent,
  v3,
}) => {
  // Declared outside `try` so `finally` can still stop the collector when
  // navigation or the agent run throws before the normal `stop()` call.
  let screenshotCollector: ScreenshotCollector | undefined;

  try {
    const page = v3.context.pages()[0];
    await page.goto("https://www.alibaba.com/");

    // Start collecting screenshots throughout the agent's journey
    screenshotCollector = new ScreenshotCollector(page, {
      interval: 3000,
      maxScreenshots: 15,
    });
    screenshotCollector.start();

    const instruction =
      "Search for 'solar panels' on Alibaba and find 3 suppliers. For each supplier, tell me their company name, minimum order quantity, and price range if available.";
    const agentResult = await agent.execute({
      instruction,
      // Step budget can be overridden via AGENT_EVAL_MAX_STEPS; an unset,
      // non-numeric or zero value falls back to the default.
      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
    });

    // Stop and collect all screenshots from the journey
    const screenshots = screenshotCollector.stop();
    // Already stopped: clear the handle so `finally` does not stop it twice.
    screenshotCollector = undefined;

    logger.log({
      category: "evaluation",
      message: `Collected ${screenshots.length} screenshots for evaluation`,
      level: 1,
    });

    const evaluator = new V3Evaluator(v3);
    const { evaluation, reasoning } = await evaluator.ask({
      question: `did the agent complete this task successfully? ${instruction}`,
      screenshot: screenshots,
      agentReasoning: agentResult.message,
    });

    console.log(`reasoning: ${reasoning}`);

    const success = evaluation === "YES";

    if (!success) {
      return {
        _success: false,
        message: reasoning,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: errorMessage,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Error path: the collector may still be running. Stop it before the
    // session is closed; cleanup is best-effort and must not replace the
    // eval result that is already being returned.
    try {
      screenshotCollector?.stop();
    } catch (stopError) {
      logger.log({
        category: "evaluation",
        message: `Failed to stop screenshot collector: ${
          stopError instanceof Error ? stopError.message : String(stopError)
        }`,
        level: 1,
      });
    }
    await v3.close();
  }
};

packages/evals/tasks/agent/all_recipes.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@ export const all_recipes: EvalFunction = async ({
55
debugUrl,
66
sessionUrl,
77
logger,
8-
v3Agent,
8+
agent,
99
v3,
1010
}) => {
1111
try {
1212
const page = v3.context.pages()[0];
1313
await page.goto("https://www.allrecipes.com/");
1414
const evaluator = new V3Evaluator(v3);
15-
const agentResult = await v3Agent.execute({
15+
const agentResult = await agent.execute({
1616
instruction:
1717
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
1818
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { EvalFunction } from "../../types/evals";
2+
import { V3Evaluator } from "@browserbasehq/stagehand";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
5+
/**
 * Agent eval: add black running shoes (size 14) to the Amazon cart.
 *
 * Navigates the first page of the `v3` context to https://www.amazon.com,
 * runs the agent with the instruction below while a `ScreenshotCollector`
 * captures the page, then asks a `V3Evaluator` whether the task was completed,
 * passing it the collected screenshots and the agent's final message.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and `logs`.
 *   `_success` is true only when the evaluator answers "YES". On failure,
 *   `message` holds either the evaluator's reasoning or the message of the
 *   error that was thrown.
 */
export const amazon_shoes_cart: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  logger,
  agent,
  v3,
}) => {
  // Declared outside `try` so `finally` can still stop the collector when
  // navigation or the agent run throws before the normal `stop()` call.
  let screenshotCollector: ScreenshotCollector | undefined;

  try {
    const page = v3.context.pages()[0];
    await page.goto("https://www.amazon.com");

    // Start collecting screenshots throughout the agent's journey
    screenshotCollector = new ScreenshotCollector(page, {
      interval: 3000,
      maxScreenshots: 15,
    });
    screenshotCollector.start();

    const instruction =
      "go to amazon, and add a pair of black running shoes to cart in size 14. stop after you add the item to cart, and reach the login page";
    const agentResult = await agent.execute({
      instruction,
      // Step budget can be overridden via AGENT_EVAL_MAX_STEPS; an unset,
      // non-numeric or zero value falls back to the default.
      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
    });

    // Stop and collect all screenshots from the journey
    const screenshots = screenshotCollector.stop();
    // Already stopped: clear the handle so `finally` does not stop it twice.
    screenshotCollector = undefined;

    logger.log({
      category: "evaluation",
      message: `Collected ${screenshots.length} screenshots for evaluation`,
      level: 1,
    });

    const evaluator = new V3Evaluator(v3);
    const { evaluation, reasoning } = await evaluator.ask({
      question: `did the agent complete this task successfully? ${instruction}`,
      screenshot: screenshots,
      agentReasoning: agentResult.message,
    });

    console.log(`reasoning: ${reasoning}`);

    const success = evaluation === "YES";

    if (!success) {
      return {
        _success: false,
        message: reasoning,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: errorMessage,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Error path: the collector may still be running. Stop it before the
    // session is closed; cleanup is best-effort and must not replace the
    // eval result that is already being returned.
    try {
      screenshotCollector?.stop();
    } catch (stopError) {
      logger.log({
        category: "evaluation",
        message: `Failed to stop screenshot collector: ${
          stopError instanceof Error ? stopError.message : String(stopError)
        }`,
        level: 1,
      });
    }
    await v3.close();
  }
};

packages/evals/tasks/agent/apple_trade_in.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ export const apple_trade_in: EvalFunction = async ({
66
debugUrl,
77
sessionUrl,
88
logger,
9-
v3Agent,
9+
agent,
1010
v3,
1111
}) => {
1212
try {
1313
const page = v3.context.pages()[0];
1414
await page.goto("https://www.apple.com/shop/trade-in");
1515
const evaluator = new V3Evaluator(v3);
16-
await v3Agent.execute({
16+
await agent.execute({
1717
instruction:
1818
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
1919
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,

packages/evals/tasks/agent/apple_tv.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@ export const apple_tv: EvalFunction = async ({
55
debugUrl,
66
sessionUrl,
77
logger,
8-
v3Agent,
8+
agent,
99
v3,
1010
}) => {
1111
try {
1212
const page = v3.context.pages()[0];
1313
await page.goto("https://www.apple.com/");
1414

15-
const agentResult = await v3Agent.execute({
15+
const agentResult = await agent.execute({
1616
instruction:
1717
"Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
1818
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,

packages/evals/tasks/agent/arxiv_gpt_report.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@ export const arxiv_gpt_report: EvalFunction = async ({
66
debugUrl,
77
sessionUrl,
88
logger,
9-
v3Agent,
9+
agent,
1010
v3,
1111
}) => {
1212
try {
1313
const page = v3.context.pages()[0];
1414
const evaluator = new V3Evaluator(v3);
1515
await page.goto("https://arxiv.org/");
1616

17-
await v3Agent.execute({
17+
await agent.execute({
1818
instruction:
1919
"Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
2020
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { EvalFunction } from "../../types/evals";
2+
import { V3Evaluator } from "@browserbasehq/stagehand";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
5+
/**
 * Agent eval: locate undergraduate tuition and fee information on columbia.edu.
 *
 * Navigates the first page of the `v3` context to https://columbia.edu/,
 * runs the agent with the instruction below while a `ScreenshotCollector`
 * captures the page, then asks a `V3Evaluator` whether the task was completed,
 * passing it the collected screenshots and the agent's final message.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and `logs`.
 *   `_success` is true only when the evaluator answers "YES". On failure,
 *   `message` holds either the evaluator's reasoning or the message of the
 *   error that was thrown.
 */
export const columbia_tuition: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  logger,
  agent,
  v3,
}) => {
  // Declared outside `try` so `finally` can still stop the collector when
  // navigation or the agent run throws before the normal `stop()` call.
  let screenshotCollector: ScreenshotCollector | undefined;

  try {
    const page = v3.context.pages()[0];
    await page.goto("https://columbia.edu/");

    // Start collecting screenshots throughout the agent's journey
    screenshotCollector = new ScreenshotCollector(page, {
      interval: 3000,
      maxScreenshots: 15,
    });
    screenshotCollector.start();

    const instruction =
      "Use the search functionality to locate pages detailing tuition and fees, then extract the published tuition fee information for undergraduate programs. Only use http://columbia.edu to achieve the task. Don't go to any other site. The task is achievable with just navigation from this site.";
    const agentResult = await agent.execute({
      instruction,
      // Step budget can be overridden via AGENT_EVAL_MAX_STEPS; an unset,
      // non-numeric or zero value falls back to the default.
      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
    });

    // Stop and collect all screenshots from the journey
    const screenshots = screenshotCollector.stop();
    // Already stopped: clear the handle so `finally` does not stop it twice.
    screenshotCollector = undefined;

    logger.log({
      category: "evaluation",
      message: `Collected ${screenshots.length} screenshots for evaluation`,
      level: 1,
    });

    const evaluator = new V3Evaluator(v3);
    const { evaluation, reasoning } = await evaluator.ask({
      question: `did the agent complete this task successfully? ${instruction}`,
      screenshot: screenshots,
      agentReasoning: agentResult.message,
    });

    console.log(`reasoning: ${reasoning}`);

    const success = evaluation === "YES";

    if (!success) {
      return {
        _success: false,
        message: reasoning,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: errorMessage,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Error path: the collector may still be running. Stop it before the
    // session is closed; cleanup is best-effort and must not replace the
    // eval result that is already being returned.
    try {
      screenshotCollector?.stop();
    } catch (stopError) {
      logger.log({
        category: "evaluation",
        message: `Failed to stop screenshot collector: ${
          stopError instanceof Error ? stopError.message : String(stopError)
        }`,
        level: 1,
      });
    }
    await v3.close();
  }
};

0 commit comments

Comments
 (0)