Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/kind-games-rush.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Export imageResize utility
1 change: 1 addition & 0 deletions packages/core/lib/v3/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export {
loadApiKeyFromEnv,
trimTrailingTextNode,
jsonSchemaToZod,
imageResize,
} from "../utils";
export { isZod4Schema, isZod3Schema, toJsonSchema } from "./zodCompat";

Expand Down
15 changes: 9 additions & 6 deletions packages/evals/suites/onlineMind2Web.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,24 @@ export const buildOnlineMind2WebTestcases = (models: string[]): Testcase[] => {
level: row.level,
},
};
const taskCategories =
tasksConfig.find((t) => t.name === input.name)?.categories || [];
allTestcases.push({
input,
name: input.name,
tags: [
model,
input.name,
...(
tasksConfig.find((t) => t.name === input.name)?.categories || []
).map((x) => `category/${x}`),
`onlineMind2Web/id/${row.task_id}`,
...(row.level ? [`onlineMind2Web/level/${row.level}`] : []),
"mind2web", // Simple dataset tag
],
metadata: {
model: model as AvailableModel,
test: `${input.name}:${row.task_id}`,
category: taskCategories[0] || "agent",
categories: taskCategories,
dataset: "onlineMind2Web",
task_id: row.task_id,
difficulty: row.level,
website: row.website,
},
expected: true,
});
Expand Down
13 changes: 8 additions & 5 deletions packages/evals/suites/webvoyager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,23 @@ export const buildWebVoyagerTestcases = (models: string[]): Testcase[] => {
web_name: row.web_name,
},
};
const taskCategories =
tasksConfig.find((t) => t.name === input.name)?.categories || [];
allTestcases.push({
input,
name: input.name,
tags: [
model,
input.name,
...(
tasksConfig.find((t) => t.name === input.name)?.categories || []
).map((x) => `category/${x}`),
`webvoyager/id/${row.id}`,
"webvoyager", // Simple dataset tag
],
metadata: {
model: model as AvailableModel,
test: `${input.name}:${row.id}`,
category: taskCategories[0] || "agent",
categories: taskCategories,
dataset: "webvoyager",
task_id: row.id,
website: row.web_name || row.web,
},
expected: true,
});
Expand Down
75 changes: 56 additions & 19 deletions packages/evals/tasks/agent/onlineMind2Web.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { V3Evaluator, imageResize } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
import dotenv from "dotenv";
import fs from "fs";
dotenv.config();

export const onlineMind2Web: EvalFunction = async ({
Expand All @@ -13,6 +12,10 @@ export const onlineMind2Web: EvalFunction = async ({
modelName,
input,
}) => {
// Track resources that need cleanup
let screenshotCollector: ScreenshotCollector | null = null;
let screenshotHandler: ((buffer: Buffer) => void) | null = null;

try {
const params = ((input && input.params) || {}) as {
task_id?: string;
Expand All @@ -33,7 +36,7 @@ export const onlineMind2Web: EvalFunction = async ({
}
const page = v3.context.pages()[0];
await page.goto(params.website, {
timeoutMs: 60_000,
timeoutMs: 120_000,
});

const agent = v3.agent({
Expand All @@ -42,31 +45,47 @@ export const onlineMind2Web: EvalFunction = async ({
systemPrompt: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
});

const screenshot = await page.screenshot();
fs.writeFileSync("screenshot.png", screenshot);

// Start collecting screenshots in parallel
const screenshotCollector = new ScreenshotCollector(page, {
maxScreenshots: 5, // Keep up to the last 5 screenshots
captureOnNavigation: true, // Also capture on page navigation
// Set up event-driven screenshot collection via the V3 event bus
screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 7,
});

// Subscribe to screenshot events from the agent via the bus
const screenshotHandler = (buffer: Buffer) => {
screenshotCollector.addScreenshot(buffer);
// Subscribe to screenshot events from the agent
screenshotHandler = (buffer: Buffer) => {
screenshotCollector?.addScreenshot(buffer);
};
v3.bus.on("agent_screensot_taken_event", screenshotHandler);

screenshotCollector.start();

const agentResult = await agent.execute({
instruction: params.confirmed_task,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
});

// Stop collecting, clean up event listener, and get all screenshots
v3.bus.off("agent_screensot_taken_event", screenshotHandler);
const screenshots = screenshotCollector.stop();
// Stop collecting and get all screenshots
let screenshots = screenshotCollector.stop();

// Try to capture final screenshot, but don't fail if CDP is disconnected
try {
const lastPage = await v3.context.awaitActivePage();
const lastScreenshot = await lastPage.screenshot();
screenshots = [...screenshots, lastScreenshot];
} catch (screenshotError) {
logger.warn({
category: "evaluation",
message: `Failed to capture final screenshot (CDP may be disconnected): ${screenshotError}`,
level: 1,
});
// Continue with whatever screenshots we already collected
}

// Resize screenshots if we have any
if (screenshots.length > 0) {
screenshots = await Promise.all(
screenshots.map(async (screenshot) => {
return await imageResize(screenshot, 0.7);
}),
);
}

logger.log({
category: "evaluation",
Expand All @@ -83,10 +102,12 @@ export const onlineMind2Web: EvalFunction = async ({
"no reasoning available, agent potentially hit step limit",
});

// Clear screenshot buffers to free memory
screenshots.length = 0;

return {
_success: evalResult.evaluation === "YES",
reasoning: evalResult.reasoning,
// screenshotCount: screenshots.length,
task_level: params.level,
debugUrl,
sessionUrl,
Expand All @@ -100,5 +121,21 @@ export const onlineMind2Web: EvalFunction = async ({
sessionUrl,
logs: logger.getLogs(),
};
} finally {
// Always clean up event listener and stop collector to prevent hanging
if (screenshotHandler) {
try {
v3.bus.off("agent_screensot_taken_event", screenshotHandler);
} catch {
// Ignore errors during cleanup
}
}
if (screenshotCollector) {
try {
screenshotCollector.stop();
} catch {
// Ignore errors during cleanup
}
}
}
};
68 changes: 59 additions & 9 deletions packages/evals/tasks/agent/webvoyager.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { V3Evaluator, imageResize } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";

export const webvoyager: EvalFunction = async ({
Expand All @@ -10,6 +10,10 @@ export const webvoyager: EvalFunction = async ({
modelName,
input,
}) => {
// Track resources that need cleanup
let screenshotCollector: ScreenshotCollector | null = null;
let screenshotHandler: ((buffer: Buffer) => void) | null = null;

try {
const params = ((input && input.params) || {}) as {
id?: string;
Expand All @@ -29,28 +33,56 @@ export const webvoyager: EvalFunction = async ({
}

const page = v3.context.pages()[0];
await page.goto(params.web);
await page.goto(params.web, {
timeoutMs: 120_000,
});

const agent = v3.agent({
model: modelName,
systemPrompt: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}`,
});

// Start collecting screenshots in parallel
const screenshotCollector = new ScreenshotCollector(page, {
maxScreenshots: 10, // Keep last 10 screenshots
captureOnNavigation: true, // Also capture on page navigation
// Set up event-driven screenshot collection via the V3 event bus
screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 7,
});

screenshotCollector.start();
// Subscribe to screenshot events from the agent
screenshotHandler = (buffer: Buffer) => {
screenshotCollector?.addScreenshot(buffer);
};
v3.bus.on("agent_screensot_taken_event", screenshotHandler);

const agentResult = await agent.execute({
instruction: params.ques,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
});

// Stop collecting and get all screenshots
const screenshots = screenshotCollector.stop();
let screenshots = screenshotCollector.stop();

// Try to capture final screenshot, but don't fail if CDP is disconnected
try {
const lastPage = await v3.context.awaitActivePage();
const lastScreenshot = await lastPage.screenshot();
screenshots = [...screenshots, lastScreenshot];
} catch (screenshotError) {
logger.warn({
category: "evaluation",
message: `Failed to capture final screenshot (CDP may be disconnected): ${screenshotError}`,
level: 1,
});
// Continue with whatever screenshots we already collected
}

// Resize screenshots if we have any
if (screenshots.length > 0) {
screenshots = await Promise.all(
screenshots.map(async (screenshot) => {
return await imageResize(screenshot, 0.7);
}),
);
}

logger.log({
category: "evaluation",
Expand All @@ -67,10 +99,12 @@ export const webvoyager: EvalFunction = async ({
"no reasoning available, agent potentially hit step limit",
});

// Clear screenshot buffers to free memory
screenshots.length = 0;

return {
_success: evalResult.evaluation === "YES",
reasoning: evalResult.reasoning,
screenshotCount: screenshots.length,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand All @@ -83,5 +117,21 @@ export const webvoyager: EvalFunction = async ({
sessionUrl,
logs: logger.getLogs(),
};
} finally {
// Always clean up event listener and stop collector to prevent hanging
if (screenshotHandler) {
try {
v3.bus.off("agent_screensot_taken_event", screenshotHandler);
} catch {
// Ignore errors during cleanup
}
}
if (screenshotCollector) {
try {
screenshotCollector.stop();
} catch {
// Ignore errors during cleanup
}
}
}
};
22 changes: 20 additions & 2 deletions packages/evals/types/evals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,30 @@ export interface Testcase
extends EvalCase<
EvalInput,
unknown,
{ model: AvailableModel; test: string; categories?: string[] }
{
model: AvailableModel;
test: string;
categories?: string[];
category?: string;
dataset?: string;
task_id?: string;
website?: string;
difficulty?: string;
}
> {
input: EvalInput;
name: string;
tags: string[];
metadata: { model: AvailableModel; test: string; categories?: string[] };
metadata: {
model: AvailableModel;
test: string;
categories?: string[];
category?: string;
dataset?: string;
task_id?: string;
website?: string;
difficulty?: string;
};
expected: unknown;
}

Expand Down
7 changes: 5 additions & 2 deletions packages/evals/types/screenshotCollector.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
/**
* Options describing how screenshots are collected: an optional polling
* interval, an optional cap on the number of screenshots, and a deprecated
* navigation-capture flag.
*/
export interface ScreenshotCollectorOptions {
/**
* Interval in ms for polling-based screenshot capture.
* If provided, start() will begin polling at this interval.
* If omitted, use addScreenshot() via the V3 event bus for event-driven collection.
*/
interval?: number;
/**
* Cap on the number of screenshots.
* NOTE(review): presumably only the most recent ones are retained once the
* cap is reached — confirm against the collector implementation.
*/
maxScreenshots?: number;
/** @deprecated for V3, there's a new method to intercept screenshots by injecting into the agent loop */
captureOnNavigation?: boolean;
}

// Minimal page-like interface: supports screenshot() and optional event hooks
Expand Down
Loading
Loading