diff --git a/.changeset/kind-games-rush.md b/.changeset/kind-games-rush.md new file mode 100644 index 000000000..e408e52e6 --- /dev/null +++ b/.changeset/kind-games-rush.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Export imageResize utility diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 8e102cba7..a213dda93 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -17,6 +17,7 @@ export { loadApiKeyFromEnv, trimTrailingTextNode, jsonSchemaToZod, + imageResize, } from "../utils"; export { isZod4Schema, isZod3Schema, toJsonSchema } from "./zodCompat"; diff --git a/packages/evals/suites/onlineMind2Web.ts b/packages/evals/suites/onlineMind2Web.ts index baa922d0c..481a8e22a 100644 --- a/packages/evals/suites/onlineMind2Web.ts +++ b/packages/evals/suites/onlineMind2Web.ts @@ -61,21 +61,24 @@ export const buildOnlineMind2WebTestcases = (models: string[]): Testcase[] => { level: row.level, }, }; + const taskCategories = + tasksConfig.find((t) => t.name === input.name)?.categories || []; allTestcases.push({ input, name: input.name, tags: [ model, - input.name, - ...( - tasksConfig.find((t) => t.name === input.name)?.categories || [] - ).map((x) => `category/${x}`), - `onlineMind2Web/id/${row.task_id}`, - ...(row.level ? [`onlineMind2Web/level/${row.level}`] : []), + "mind2web", // Simple dataset tag ], metadata: { model: model as AvailableModel, test: `${input.name}:${row.task_id}`, + category: taskCategories[0] || "agent", + categories: taskCategories, + dataset: "onlineMind2Web", + task_id: row.task_id, + difficulty: row.level, + website: row.website, }, expected: true, }); diff --git a/packages/evals/suites/webvoyager.ts b/packages/evals/suites/webvoyager.ts index 16f59db2e..fa5138351 100644 --- a/packages/evals/suites/webvoyager.ts +++ b/packages/evals/suites/webvoyager.ts @@ -59,20 +59,23 @@ export const buildWebVoyagerTestcases = (models: string[]): Testcase[] => { web_name: row.web_name, }, }; + const taskCategories = + tasksConfig.find((t) => t.name === input.name)?.categories || []; allTestcases.push({ input, name: input.name, tags: [ model, - input.name, - ...( - tasksConfig.find((t) => t.name === input.name)?.categories || [] - ).map((x) => `category/${x}`), - `webvoyager/id/${row.id}`, + "webvoyager", // Simple dataset tag ], metadata: { model: model as AvailableModel, test: `${input.name}:${row.id}`, + category: taskCategories[0] || "agent", + categories: taskCategories, + dataset: "webvoyager", + task_id: row.id, + website: row.web_name || row.web, }, expected: true, }); diff --git a/packages/evals/tasks/agent/onlineMind2Web.ts b/packages/evals/tasks/agent/onlineMind2Web.ts index 987dd29a4..1eab5d07a 100644 --- a/packages/evals/tasks/agent/onlineMind2Web.ts +++ b/packages/evals/tasks/agent/onlineMind2Web.ts @@ -1,8 +1,7 @@ import { EvalFunction } from "../../types/evals"; -import { V3Evaluator } from "@browserbasehq/stagehand"; +import { V3Evaluator, imageResize } from "@browserbasehq/stagehand"; import { ScreenshotCollector } from "../../utils/ScreenshotCollector"; import dotenv from "dotenv"; -import fs from "fs"; dotenv.config(); export const onlineMind2Web: EvalFunction = async ({ @@ -13,6 +12,10 @@ export const onlineMind2Web: EvalFunction = async ({ modelName, input, }) => { + // Track resources that need cleanup + let screenshotCollector: ScreenshotCollector | null = null; + let screenshotHandler: ((buffer: Buffer) => void) | null = null; + try { const params = ((input && input.params) || {}) as { task_id?: string; @@ -33,7 +36,7 @@ export const onlineMind2Web: EvalFunction = async ({ } const page = v3.context.pages()[0]; await page.goto(params.website, { - timeoutMs: 60_000, + timeoutMs: 120_000, }); const agent = v3.agent({ @@ -42,31 +45,47 @@ export const onlineMind2Web: EvalFunction = async ({ systemPrompt: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: " summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`, }); - const screenshot = await page.screenshot(); - fs.writeFileSync("screenshot.png", screenshot); - - // Start collecting screenshots in parallel - const screenshotCollector = new ScreenshotCollector(page, { - maxScreenshots: 5, // Keep up to the last 5 screenshots - captureOnNavigation: true, // Also capture on page navigation + // Set up event-driven screenshot collection via the V3 event bus + screenshotCollector = new ScreenshotCollector(v3, { + maxScreenshots: 7, }); - // Subscribe to screenshot events from the agent via the bus - const screenshotHandler = (buffer: Buffer) => { - screenshotCollector.addScreenshot(buffer); + // Subscribe to screenshot events from the agent + screenshotHandler = (buffer: Buffer) => { + screenshotCollector?.addScreenshot(buffer); }; v3.bus.on("agent_screensot_taken_event", screenshotHandler); - screenshotCollector.start(); - const agentResult = await agent.execute({ instruction: params.confirmed_task, maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50, }); - // Stop collecting, clean up event listener, and get all screenshots - v3.bus.off("agent_screensot_taken_event", screenshotHandler); - const screenshots = screenshotCollector.stop(); + // Stop collecting and get all screenshots + let screenshots = screenshotCollector.stop(); + + // Try to capture final screenshot, but don't fail if CDP is disconnected + try { + const lastPage = await v3.context.awaitActivePage(); + const lastScreenshot = await lastPage.screenshot(); + screenshots = [...screenshots, lastScreenshot]; + } catch (screenshotError) { + logger.warn({ + category: "evaluation", + message: `Failed to capture final screenshot (CDP may be disconnected): ${screenshotError}`, + level: 1, + }); + // Continue with whatever screenshots we already collected + } + + // Resize screenshots if we have any + if (screenshots.length > 0) { + screenshots = await Promise.all( + screenshots.map(async (screenshot) => { + return await imageResize(screenshot, 0.7); + }), + ); + } logger.log({ category: "evaluation", @@ -83,10 +102,12 @@ export const onlineMind2Web: EvalFunction = async ({ "no reasoning available, agent potentially hit step limit", }); + // Clear screenshot buffers to free memory + screenshots.length = 0; + return { _success: evalResult.evaluation === "YES", reasoning: evalResult.reasoning, - // screenshotCount: screenshots.length, task_level: params.level, debugUrl, sessionUrl, @@ -100,5 +121,21 @@ export const onlineMind2Web: EvalFunction = async ({ sessionUrl, logs: logger.getLogs(), }; + } finally { + // Always clean up event listener and stop collector to prevent hanging + if (screenshotHandler) { + try { + v3.bus.off("agent_screensot_taken_event", screenshotHandler); + } catch { + // Ignore errors during cleanup + } + } + if (screenshotCollector) { + try { + screenshotCollector.stop(); + } catch { + // Ignore errors during cleanup + } + } } }; diff --git a/packages/evals/tasks/agent/webvoyager.ts b/packages/evals/tasks/agent/webvoyager.ts index e3cd89bcd..8146bf3f1 100644 --- a/packages/evals/tasks/agent/webvoyager.ts +++ b/packages/evals/tasks/agent/webvoyager.ts @@ -1,5 +1,5 @@ import { EvalFunction } from "../../types/evals"; -import { V3Evaluator } from "@browserbasehq/stagehand"; +import { V3Evaluator, imageResize } from "@browserbasehq/stagehand"; import { ScreenshotCollector } from "../../utils/ScreenshotCollector"; export const webvoyager: EvalFunction = async ({ @@ -10,6 +10,10 @@ export const webvoyager: EvalFunction = async ({ modelName, input, }) => { + // Track resources that need cleanup + let screenshotCollector: ScreenshotCollector | null = null; + let screenshotHandler: ((buffer: Buffer) => void) | null = null; + try { const params = ((input && input.params) || {}) as { id?: string; @@ -29,20 +33,25 @@ export const webvoyager: EvalFunction = async ({ } const page = v3.context.pages()[0]; - await page.goto(params.web); + await page.goto(params.web, { + timeoutMs: 120_000, + }); const agent = v3.agent({ model: modelName, systemPrompt: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: " summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}`, }); - // Start collecting screenshots in parallel - const screenshotCollector = new ScreenshotCollector(page, { - maxScreenshots: 10, // Keep last 10 screenshots - captureOnNavigation: true, // Also capture on page navigation + // Set up event-driven screenshot collection via the V3 event bus + screenshotCollector = new ScreenshotCollector(v3, { + maxScreenshots: 7, }); - screenshotCollector.start(); + // Subscribe to screenshot events from the agent + screenshotHandler = (buffer: Buffer) => { + screenshotCollector?.addScreenshot(buffer); + }; + v3.bus.on("agent_screensot_taken_event", screenshotHandler); const agentResult = await agent.execute({ instruction: params.ques, @@ -50,7 +59,30 @@ export const webvoyager: EvalFunction = async ({ }); // Stop collecting and get all screenshots - const screenshots = screenshotCollector.stop(); + let screenshots = screenshotCollector.stop(); + + // Try to capture final screenshot, but don't fail if CDP is disconnected + try { + const lastPage = await v3.context.awaitActivePage(); + const lastScreenshot = await lastPage.screenshot(); + screenshots = [...screenshots, lastScreenshot]; + } catch (screenshotError) { + logger.warn({ + category: "evaluation", + message: `Failed to capture final screenshot (CDP may be disconnected): ${screenshotError}`, + level: 1, + }); + // Continue with whatever screenshots we already collected + } + + // Resize screenshots if we have any + if (screenshots.length > 0) { + screenshots = await Promise.all( + screenshots.map(async (screenshot) => { + return await imageResize(screenshot, 0.7); + }), + ); + } logger.log({ category: "evaluation", @@ -67,10 +99,12 @@ export const webvoyager: EvalFunction = async ({ "no reasoning available, agent potentially hit step limit", }); + // Clear screenshot buffers to free memory + screenshots.length = 0; + return { _success: evalResult.evaluation === "YES", reasoning: evalResult.reasoning, - screenshotCount: screenshots.length, debugUrl, sessionUrl, logs: logger.getLogs(), @@ -83,5 +117,21 @@ export const webvoyager: EvalFunction = async ({ sessionUrl, logs: logger.getLogs(), }; + } finally { + // Always clean up event listener and stop collector to prevent hanging + if (screenshotHandler) { + try { + v3.bus.off("agent_screensot_taken_event", screenshotHandler); + } catch { + // Ignore errors during cleanup + } + } + if (screenshotCollector) { + try { + screenshotCollector.stop(); + } catch { + // Ignore errors during cleanup + } + } } }; diff --git a/packages/evals/types/evals.ts b/packages/evals/types/evals.ts index cd52b3d05..b907fe5d6 100644 --- a/packages/evals/types/evals.ts +++ b/packages/evals/types/evals.ts @@ -53,12 +53,30 @@ export interface Testcase extends EvalCase< EvalInput, unknown, - { model: AvailableModel; test: string; categories?: string[] } + { + model: AvailableModel; + test: string; + categories?: string[]; + category?: string; + dataset?: string; + task_id?: string; + website?: string; + difficulty?: string; + } > { input: EvalInput; name: string; tags: string[]; - metadata: { model: AvailableModel; test: string; categories?: string[] }; + metadata: { + model: AvailableModel; + test: string; + categories?: string[]; + category?: string; + dataset?: string; + task_id?: string; + website?: string; + difficulty?: string; + }; expected: unknown; } diff --git a/packages/evals/types/screenshotCollector.ts b/packages/evals/types/screenshotCollector.ts index 16d468521..afd874444 100644 --- a/packages/evals/types/screenshotCollector.ts +++ b/packages/evals/types/screenshotCollector.ts @@ -1,8 +1,11 @@ export interface ScreenshotCollectorOptions { + /** + * Interval in ms for polling-based screenshot capture. + * If provided, start() will begin polling at this interval. + * If omitted, use addScreenshot() via the V3 event bus for event-driven collection. + */ interval?: number; maxScreenshots?: number; - /* @deprecated for V3, there's a new method to intercept screenshots by injecting into the agent loop */ - captureOnNavigation?: boolean; } // Minimal page-like interface: supports screenshot() and optional event hooks diff --git a/packages/evals/utils/ScreenshotCollector.ts b/packages/evals/utils/ScreenshotCollector.ts index 26829c886..05a3ea63d 100644 --- a/packages/evals/utils/ScreenshotCollector.ts +++ b/packages/evals/utils/ScreenshotCollector.ts @@ -1,29 +1,36 @@ -import { Page } from "@browserbasehq/stagehand"; +import { V3 } from "@browserbasehq/stagehand"; import sharp from "sharp"; import { ScreenshotCollectorOptions } from "../types/screenshotCollector"; export class ScreenshotCollector { private screenshots: Buffer[] = []; - private page: Page; - private interval: number; + private v3: V3; + private interval?: number; private maxScreenshots: number; - private captureOnNavigation: boolean; private intervalId?: NodeJS.Timeout; - private navigationListeners: Array<() => void> = []; private isCapturing: boolean = false; private lastScreenshot?: Buffer; private ssimThreshold: number = 0.75; private mseThreshold: number = 30; + private stopped: boolean = false; - constructor(page: Page, options: ScreenshotCollectorOptions = {}) { - this.page = page; - this.interval = options.interval || 5000; + constructor(v3: V3, options: ScreenshotCollectorOptions = {}) { + this.v3 = v3; + this.interval = options.interval; // undefined means event-driven mode this.maxScreenshots = options.maxScreenshots || 10; - // Capture on navigation is deprecated for V3 pages - this.captureOnNavigation = options.captureOnNavigation ?? false; } + /** + * Start interval-based screenshot capture. + * Only activates if interval option was provided in constructor. + * For event-driven collection, use addScreenshot() directly via the V3 event bus. + */ start(): void { + // Only start interval if interval was provided + if (!this.interval) { + return; + } + if (this.intervalId) { return; } @@ -42,29 +49,41 @@ export class ScreenshotCollector { } stop(): Buffer[] { + // Mark as stopped first to prevent any new operations + this.stopped = true; + + // Clear interval if running if (this.intervalId) { clearInterval(this.intervalId); this.intervalId = undefined; } - this.navigationListeners.forEach((removeListener) => removeListener()); - this.navigationListeners = []; + // Reset capturing flag to unblock any pending state + this.isCapturing = false; - // Capture final screenshot without blocking - this.captureScreenshot("final").catch((error) => { - console.error("Failed to capture final screenshot:", error); - }); - return this.getScreenshots(); + // Return a copy and clear internal state to free memory + const result = [...this.screenshots]; + this.screenshots = []; + this.lastScreenshot = undefined; + + return result; } private async captureScreenshot(trigger: string): Promise { - if (this.isCapturing) { + // Don't capture if stopped or already capturing + if (this.stopped || this.isCapturing) { return; } this.isCapturing = true; try { - const screenshot = await this.page.screenshot(); + const page = await this.v3.context.awaitActivePage(); + const screenshot = await page.screenshot({ fullPage: false }); + + // If stopped while awaiting screenshot, don't process further + if (this.stopped) { + return; + } // Check if we should keep this screenshot based on image diff let shouldKeep = true; @@ -118,12 +137,13 @@ export class ScreenshotCollector { } /** - * Manually add a screenshot to the collection + * Manually add a screenshot to the collection. + * Use this with the V3 event bus for event-driven screenshot collection. * @param screenshot The screenshot buffer to add */ async addScreenshot(screenshot: Buffer): Promise { - // Prevent concurrent processing - if (this.isCapturing) { + // Don't add if stopped or already capturing + if (this.stopped || this.isCapturing) { return; } this.isCapturing = true;