browserbase
diff --git a/‎packages/evals/evals.config.json‎
Lines changed: 76 additions & 0 deletions b/‎packages/evals/evals.config.json‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎packages/evals/tasks/agent/alibaba_supplier_search.ts‎
Lines changed: 77 additions & 0 deletions b/‎packages/evals/tasks/agent/alibaba_supplier_search.ts‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎packages/evals/tasks/agent/all_recipes.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/evals/tasks/agent/all_recipes.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/evals/tasks/agent/amazon_shoes_cart.ts‎
Lines changed: 77 additions & 0 deletions b/‎packages/evals/tasks/agent/amazon_shoes_cart.ts‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎packages/evals/tasks/agent/apple_trade_in.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/evals/tasks/agent/apple_trade_in.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/evals/tasks/agent/apple_tv.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/evals/tasks/agent/apple_tv.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/evals/tasks/agent/arxiv_gpt_report.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/evals/tasks/agent/arxiv_gpt_report.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/evals/tasks/agent/columbia_tuition.ts‎
Lines changed: 77 additions & 0 deletions b/‎packages/evals/tasks/agent/columbia_tuition.ts‎
Lines changed: 77 additions & 0 deletions
@@ -548,6 +548,82 @@
     {
       "name": "agent/onlineMind2Web",
       "categories": ["external_agent_benchmarks"]
+    },
+    {
+      "name": "agent/alibaba_supplier_search",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/amazon_shoes_cart",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/columbia_tuition",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/flipkart_laptops",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_shopping",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/hotels_paris_amenities",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/instacart_organic_bananas",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/kfc_tenders_combo",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/made_in_china_supplier",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/nvidia_hgx_driver",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/oed_word_search",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/radiotimes_tv_schedule",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/redfin_apartment_rental",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/thegamer_opinion_article",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/trailhead_superbadge",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/trustpilot_hr_companies",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/uniqlo_mens_blazers",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/webmd_audiologist_search",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/webmd_ovulation_calculator",
+      "categories": ["agent"]
     }
   ]
 }
@@ -0,0 +1,77 @@
+import { EvalFunction } from "../../types/evals";
+import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+
+/**
+ * Agent eval: search Alibaba for solar-panel suppliers.
+ *
+ * Opens https://www.alibaba.com/ on the first page of the `v3` context, lets
+ * `agent` work through the instruction while screenshots are collected, then
+ * asks a `V3Evaluator` whether the task was completed.
+ *
+ * @returns `_success: true` when the evaluator answers exactly "YES";
+ *   otherwise `_success: false` with the evaluator's reasoning (or the
+ *   message of any error thrown along the way) in `message`. `debugUrl`,
+ *   `sessionUrl` and the logger's logs are always included.
+ */
+export const alibaba_supplier_search: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  logger,
+  agent,
+  v3,
+}) => {
+  // Declared outside the try so the finally block can still stop the
+  // collector when the run aborts between start() and the regular stop().
+  let screenshotCollector: ScreenshotCollector | undefined;
+
+  try {
+    const page = v3.context.pages()[0];
+    await page.goto("https://www.alibaba.com/");
+
+    // Start collecting screenshots throughout the agent's journey
+    screenshotCollector = new ScreenshotCollector(page, {
+      interval: 3000,
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Search for 'solar panels' on Alibaba and find 3 suppliers. For each supplier, tell me their company name, minimum order quantity, and price range if available.";
+    const agentResult = await agent.execute({
+      instruction,
+      // AGENT_EVAL_MAX_STEPS overrides the default; an unset, non-numeric or
+      // zero value falls back to 30.
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
+    });
+
+    // Stop and collect all screenshots from the journey
+    const screenshots = screenshotCollector.stop();
+    // Mark as stopped so the finally block does not stop it a second time.
+    screenshotCollector = undefined;
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
+    const { evaluation, reasoning } = await evaluator.ask({
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
+    });
+
+    console.log(`reasoning: ${reasoning}`);
+
+    const success = evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    return {
+      _success: false,
+      message: errorMessage,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    // Still set only when the run failed after start(): stop the collector
+    // before the browser goes away. NOTE(review): presumably the collector
+    // keeps capturing on its interval until stop() is called — confirm
+    // against ScreenshotCollector.
+    try {
+      screenshotCollector?.stop();
+    } catch {
+      // Best-effort cleanup; a failure here must not prevent v3.close().
+    }
+    await v3.close();
+  }
+};
@@ -5,14 +5,14 @@ export const all_recipes: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   logger,
-  v3Agent,
+  agent,
   v3,
 }) => {
   try {
     const page = v3.context.pages()[0];
     await page.goto("https://www.allrecipes.com/");
     const evaluator = new V3Evaluator(v3);
-    const agentResult = await v3Agent.execute({
+    const agentResult = await agent.execute({
       instruction:
         "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
 
@@ -0,0 +1,77 @@
+import { EvalFunction } from "../../types/evals";
+import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+
+/**
+ * Agent eval: add black running shoes (size 14) to the Amazon cart.
+ *
+ * Opens https://www.amazon.com on the first page of the `v3` context, lets
+ * `agent` work through the instruction while screenshots are collected, then
+ * asks a `V3Evaluator` whether the task was completed.
+ *
+ * @returns `_success: true` when the evaluator answers exactly "YES";
+ *   otherwise `_success: false` with the evaluator's reasoning (or the
+ *   message of any error thrown along the way) in `message`. `debugUrl`,
+ *   `sessionUrl` and the logger's logs are always included.
+ */
+export const amazon_shoes_cart: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  logger,
+  agent,
+  v3,
+}) => {
+  // Declared outside the try so the finally block can still stop the
+  // collector when the run aborts between start() and the regular stop().
+  let screenshotCollector: ScreenshotCollector | undefined;
+
+  try {
+    const page = v3.context.pages()[0];
+    await page.goto("https://www.amazon.com");
+
+    // Start collecting screenshots throughout the agent's journey
+    screenshotCollector = new ScreenshotCollector(page, {
+      interval: 3000,
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "go to amazon, and add a pair of black running shoes to cart in size 14. stop after you add the item to cart, and reach the login page";
+    const agentResult = await agent.execute({
+      instruction,
+      // AGENT_EVAL_MAX_STEPS overrides the default; an unset, non-numeric or
+      // zero value falls back to 40.
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
+    });
+
+    // Stop and collect all screenshots from the journey
+    const screenshots = screenshotCollector.stop();
+    // Mark as stopped so the finally block does not stop it a second time.
+    screenshotCollector = undefined;
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
+    const { evaluation, reasoning } = await evaluator.ask({
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
+    });
+
+    console.log(`reasoning: ${reasoning}`);
+
+    const success = evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    return {
+      _success: false,
+      message: errorMessage,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    // Still set only when the run failed after start(): stop the collector
+    // before the browser goes away. NOTE(review): presumably the collector
+    // keeps capturing on its interval until stop() is called — confirm
+    // against ScreenshotCollector.
+    try {
+      screenshotCollector?.stop();
+    } catch {
+      // Best-effort cleanup; a failure here must not prevent v3.close().
+    }
+    await v3.close();
+  }
+};
@@ -6,14 +6,14 @@ export const apple_trade_in: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   logger,
-  v3Agent,
+  agent,
   v3,
 }) => {
   try {
     const page = v3.context.pages()[0];
     await page.goto("https://www.apple.com/shop/trade-in");
     const evaluator = new V3Evaluator(v3);
-    await v3Agent.execute({
+    await agent.execute({
       instruction:
         "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
 
@@ -5,14 +5,14 @@ export const apple_tv: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   logger,
-  v3Agent,
+  agent,
   v3,
 }) => {
   try {
     const page = v3.context.pages()[0];
     await page.goto("https://www.apple.com/");
 
-    const agentResult = await v3Agent.execute({
+    const agentResult = await agent.execute({
       instruction:
         "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
 
@@ -6,15 +6,15 @@ export const arxiv_gpt_report: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   logger,
-  v3Agent,
+  agent,
   v3,
 }) => {
   try {
     const page = v3.context.pages()[0];
     const evaluator = new V3Evaluator(v3);
     await page.goto("https://arxiv.org/");
 
-    await v3Agent.execute({
+    await agent.execute({
       instruction:
         "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
 
@@ -0,0 +1,77 @@
+import { EvalFunction } from "../../types/evals";
+import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+
+/**
+ * Agent eval: find undergraduate tuition and fee information on columbia.edu.
+ *
+ * Opens https://columbia.edu/ on the first page of the `v3` context, lets
+ * `agent` work through the instruction while screenshots are collected, then
+ * asks a `V3Evaluator` whether the task was completed.
+ *
+ * @returns `_success: true` when the evaluator answers exactly "YES";
+ *   otherwise `_success: false` with the evaluator's reasoning (or the
+ *   message of any error thrown along the way) in `message`. `debugUrl`,
+ *   `sessionUrl` and the logger's logs are always included.
+ */
+export const columbia_tuition: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  logger,
+  agent,
+  v3,
+}) => {
+  // Declared outside the try so the finally block can still stop the
+  // collector when the run aborts between start() and the regular stop().
+  let screenshotCollector: ScreenshotCollector | undefined;
+
+  try {
+    const page = v3.context.pages()[0];
+    await page.goto("https://columbia.edu/");
+
+    // Start collecting screenshots throughout the agent's journey
+    screenshotCollector = new ScreenshotCollector(page, {
+      interval: 3000,
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Use the search functionality to locate pages detailing tuition and fees, then extract the published tuition fee information for undergraduate programs. Only use http://columbia.edu to achieve the task. Don't go to any other site. The task is achievable with just navigation from this site.";
+    const agentResult = await agent.execute({
+      instruction,
+      // AGENT_EVAL_MAX_STEPS overrides the default; an unset, non-numeric or
+      // zero value falls back to 50.
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+    });
+
+    // Stop and collect all screenshots from the journey
+    const screenshots = screenshotCollector.stop();
+    // Mark as stopped so the finally block does not stop it a second time.
+    screenshotCollector = undefined;
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
+    const { evaluation, reasoning } = await evaluator.ask({
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
+    });
+
+    console.log(`reasoning: ${reasoning}`);
+
+    const success = evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    return {
+      _success: false,
+      message: errorMessage,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    // Still set only when the run failed after start(): stop the collector
+    // before the browser goes away. NOTE(review): presumably the collector
+    // keeps capturing on its interval until stop() is called — confirm
+    // against ScreenshotCollector.
+    try {
+      screenshotCollector?.stop();
+    } catch {
+      // Best-effort cleanup; a failure here must not prevent v3.close().
+    }
+    await v3.close();
+  }
+};