diff --git a/src/index.ts b/src/index.ts
index bb014938..66394d33 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -27,6 +27,7 @@ export {
   GLMProvider,
 } from './llm-provider';
 export { SentienceAgent, AgentActResult, HistoryEntry, TokenStats } from './agent';
+export { SentienceVisualAgent } from './visual-agent';
 
 // Conversational Agent Layer (v0.3.0+)
 export {
diff --git a/src/visual-agent.ts b/src/visual-agent.ts
new file mode 100644
index 00000000..8edbdd2a
--- /dev/null
+++ b/src/visual-agent.ts
@@ -0,0 +1,859 @@
+/**
+ * Visual Agent - Uses labeled screenshots with vision-capable LLMs
+ *
+ * This agent extends SentienceAgent to use visual prompts:
+ * 1. Takes a snapshot with screenshot enabled
+ * 2. Draws bounding boxes and labels element IDs on the screenshot
+ * 3. Uses an anti-collision algorithm to position labels (4 sides + 4 corners)
+ * 4. Sends the labeled screenshot to a vision-capable LLM
+ * 5. Extracts the element ID from the LLM response
+ * 6. Clicks the element using click()
+ *
+ * Dependencies:
+ * - sharp: Required for image processing
+ *   Install with: npm install sharp
+ * - canvas: Required for drawing on images
+ *   Install with: npm install canvas
+ * - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
+ */
+
+import { SentienceBrowser } from './browser';
+import { snapshot, SnapshotOptions } from './snapshot';
+import { Snapshot, Element } from './types';
+import { LLMProvider, LLMResponse } from './llm-provider';
+import { Tracer } from './tracing/tracer';
+import { randomUUID } from 'crypto';
+import { TraceEventBuilder } from './utils/trace-event-builder';
+import { SnapshotEventBuilder } from './utils/snapshot-event-builder';
+import { SnapshotProcessor } from './utils/snapshot-processor';
+import { click } from './actions';
+import { SentienceAgent, AgentActResult, HistoryEntry } from './agent';
+import * as path from 'path';
+import * as fs from 'fs';
+import { v4 as uuidv4 } from 'uuid';
+
+// Check for required image processing libraries
+let sharp: any;
+let canvas: any;
+let CANVAS_AVAILABLE = false;
+let SHARP_AVAILABLE = false;
+
+try {
+  // eslint-disable-next-line @typescript-eslint/no-require-imports
+  sharp = require('sharp');
+  SHARP_AVAILABLE = true;
+} catch {
+  sharp = undefined;
+  console.warn('⚠️ Warning: sharp not available. Install with: npm install sharp');
+}
+
+try {
+  // eslint-disable-next-line @typescript-eslint/no-require-imports
+  canvas = require('canvas');
+  CANVAS_AVAILABLE = true;
+} catch {
+  canvas = undefined;
+  console.warn('⚠️ Warning: canvas not available. Install with: npm install canvas');
+}
+
+/**
+ * Visual agent that uses labeled screenshots with vision-capable LLMs.
+ *
+ * Extends SentienceAgent and overrides its act() method with visual prompting.
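+ *
+ * Minimal usage sketch (`browser` and `llm` stand for an already-configured
+ * SentienceBrowser and a vision-capable LLMProvider):
+ *
+ *   const agent = new SentienceVisualAgent(browser, llm);
+ *   const result = await agent.act('Click the "Sign in" button');
+ *   console.log(result.success, result.elementId);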
+ * + * Requirements: + * - sharp: Required for image processing + * Install with: npm install sharp + * - canvas: Required for drawing on images + * Install with: npm install canvas + * - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3) + */ +export class SentienceVisualAgent extends SentienceAgent { + /** + * Initialize Visual Agent + * + * @param browser - SentienceBrowser instance + * @param llm - LLM provider (must support vision, e.g., GPT-4o, Claude 3) + * @param snapshotLimit - Default maximum elements to include + * @param verbose - Print execution logs + * @param tracer - Optional Tracer instance + * @param showOverlay - Show green bbox overlay in browser + */ + constructor( + browser: SentienceBrowser, + llm: LLMProvider, + snapshotLimit: number = 50, + verbose: boolean = true, + tracer?: Tracer, + showOverlay: boolean = false + ) { + super(browser, llm, snapshotLimit, verbose, tracer, showOverlay); + + if (!SHARP_AVAILABLE || !CANVAS_AVAILABLE) { + throw new Error( + 'sharp and canvas are required for SentienceVisualAgent. ' + + 'Install with: npm install sharp canvas' + ); + } + + // Track previous snapshot for diff computation (stored in base class) + } + + /** + * Decode base64 screenshot data URL to image buffer + * + * @param screenshotDataUrl - Base64-encoded data URL (e.g., "data:image/png;base64,...") + * @returns Image buffer + */ + private decodeScreenshot(screenshotDataUrl: string): Buffer { + // Extract base64 data from data URL + if (screenshotDataUrl.startsWith('data:image/')) { + // Format: "data:image/png;base64," + const base64Data = screenshotDataUrl.split(',', 2)[1]; + return Buffer.from(base64Data, 'base64'); + } else { + // Assume it's already base64 + return Buffer.from(screenshotDataUrl, 'base64'); + } + } + + /** + * Find best position for label using anti-collision algorithm. + * + * Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners. + * Returns the first position that doesn't collide with existing labels. 
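+   *
+   * Example: for an element at (100, 100) sized 200x50 and no existing
+   * labels, the first candidate wins and the label is centered above the
+   * element at roughly (200 - labelWidth / 2, 100 - labelHeight - 15).
+   *
+   * Overlap is tested with a simple AABB check: two boxes collide unless
+   * one lies entirely to the left of, right of, above, or below the other.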
+   *
+   * @param elementBbox - Element bounding box {x, y, width, height}
+   * @param existingLabels - List of existing label bounding boxes
+   * @param imageWidth - Image width in pixels
+   * @param imageHeight - Image height in pixels
+   * @param labelWidth - Label width in pixels
+   * @param labelHeight - Label height in pixels
+   * @returns (x, y) position for label
+   */
+  private findLabelPosition(
+    elementBbox: { x: number; y: number; width: number; height: number },
+    existingLabels: Array<{ x: number; y: number; width: number; height: number }>,
+    imageWidth: number,
+    imageHeight: number,
+    labelWidth: number,
+    labelHeight: number
+  ): [number, number] {
+    const { x, y, width, height } = elementBbox;
+
+    // Offset from element edge
+    const labelOffset = 15; // Increased from 5px for better separation
+
+    // Try 8 positions: top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
+    const positions: Array<[number, number]> = [
+      [Math.floor(x + width / 2 - labelWidth / 2), Math.floor(y - labelHeight - labelOffset)], // Top
+      [Math.floor(x + width / 2 - labelWidth / 2), Math.floor(y + height + labelOffset)], // Bottom
+      [Math.floor(x - labelWidth - labelOffset), Math.floor(y + height / 2 - labelHeight / 2)], // Left
+      [Math.floor(x + width + labelOffset), Math.floor(y + height / 2 - labelHeight / 2)], // Right
+      [Math.floor(x - labelWidth - labelOffset), Math.floor(y - labelHeight - labelOffset)], // Top-left
+      [Math.floor(x + width + labelOffset), Math.floor(y - labelHeight - labelOffset)], // Top-right
+      [Math.floor(x - labelWidth - labelOffset), Math.floor(y + height + labelOffset)], // Bottom-left
+      [Math.floor(x + width + labelOffset), Math.floor(y + height + labelOffset)], // Bottom-right
+    ];
+
+    // Check each position for collisions
+    for (const [posX, posY] of positions) {
+      // Check bounds
+      if (
+        posX < 0 ||
+        posY < 0 ||
+        posX + labelWidth > imageWidth ||
+        posY + labelHeight > imageHeight
+      ) {
+        continue;
+      }
+
+      // Check collision with existing labels
+      const labelBbox = {
+        x: posX,
+        y: posY,
+        width: labelWidth,
+        height: labelHeight,
+      };
+
+      let collision = false;
+      for (const existing of existingLabels) {
+        // Simple AABB collision detection
+        if (
+          !(
+            labelBbox.x + labelBbox.width < existing.x ||
+            labelBbox.x > existing.x + existing.width ||
+            labelBbox.y + labelBbox.height < existing.y ||
+            labelBbox.y > existing.y + existing.height
+          )
+        ) {
+          collision = true;
+          break;
+        }
+      }
+
+      if (!collision) {
+        return [posX, posY];
+      }
+    }
+
+    // If all positions collide, use top position with increased offset
+    return [
+      Math.floor(x + width / 2 - labelWidth / 2),
+      Math.floor(y - labelHeight - labelOffset * 2),
+    ];
+  }
+
+  /**
+   * Draw labeled screenshot with bounding boxes and element IDs.
+   *
+   * @param snapshot - Snapshot with screenshot data
+   * @param elements - List of elements to label
+   * @returns Image buffer with labels drawn
+   */
+  private async drawLabeledScreenshot(snapshot: Snapshot, elements: Element[]): Promise<Buffer> {
+    if (!snapshot.screenshot) {
+      throw new Error('Screenshot not available in snapshot');
+    }
+
+    // Decode screenshot
+    const imageBuffer = this.decodeScreenshot(snapshot.screenshot);
+    if (!sharp) {
+      throw new Error('sharp is not available. Install with: npm install sharp');
+    }
+    const img = sharp(imageBuffer);
+    const metadata = await img.metadata();
+    const imageWidth = metadata.width || 0;
+    const imageHeight = metadata.height || 0;
+
+    // Create canvas for drawing
+    if (!canvas) {
+      throw new Error('canvas is not available. Install with: npm install canvas');
+    }
+    const { createCanvas, loadImage } = canvas;
+    const canvasElement = createCanvas(imageWidth, imageHeight);
+    const ctx = canvasElement.getContext('2d');
+
+    // Draw original image on canvas
+    const image = await loadImage(imageBuffer);
+    ctx.drawImage(image, 0, 0);
+
+    // Use a widely available system font; node-canvas falls back to a
+    // default face if it is not installed.
+    const font = '16px Helvetica';
+
+    const existingLabels: Array<{ x: number; y: number; width: number; height: number }> = [];
+
+    // Neon green color: #39FF14 (bright, vibrant green)
+    const neonGreen = '#39FF14';
+
+    // Draw bounding boxes and labels for each element
+    for (const element of elements) {
+      const bbox = element.bbox;
+      const x = bbox.x;
+      const y = bbox.y;
+      const width = bbox.width;
+      const height = bbox.height;
+
+      // Draw bounding box rectangle (neon green with 2px width)
+      ctx.strokeStyle = neonGreen;
+      ctx.lineWidth = 2;
+      ctx.strokeRect(x, y, width, height);
+
+      // Prepare label text (just the number - keep it simple and compact)
+      const labelText = String(element.id);
+
+      // Measure label text size
+      ctx.font = font;
+      const textMetrics = ctx.measureText(labelText);
+      const labelWidth = textMetrics.width;
+      const labelHeight = 16; // Approximate height for 16px font
+
+      // Find best position for label (anti-collision)
+      const [labelX, labelY] = this.findLabelPosition(
+        { x, y, width, height },
+        existingLabels,
+        imageWidth,
+        imageHeight,
+        labelWidth + 8, // Add padding
+        labelHeight + 4 // Add padding
+      );
+
+      // Calculate connection points for a clearer visual link
+      const elementCenterX = x + width / 2;
+      const elementCenterY = y + height / 2;
+      const labelCenterX = labelX + labelWidth / 2;
+      const labelCenterY = labelY + labelHeight / 2;
+
+      // Determine which edge of the element is closest to the label
+      const distTop = Math.abs(labelCenterY - y);
+      const distBottom = Math.abs(labelCenterY - (y + height));
+      const distLeft = Math.abs(labelCenterX - x);
+      const distRight = Math.abs(labelCenterX - (x + width));
+
+      const minDist = Math.min(distTop, distBottom, distLeft, distRight);
+
+      let lineStart: [number, number];
+      if (minDist === distTop) {
+        lineStart = [elementCenterX, y];
+      } else if (minDist === distBottom) {
+        lineStart = [elementCenterX, y + height];
+      } else if (minDist === distLeft) {
+        lineStart = [x, elementCenterY];
+      } else {
+        lineStart = [x + width, elementCenterY];
+      }
+
+      // Draw connecting line from element edge to label
+      ctx.strokeStyle = neonGreen;
+      ctx.lineWidth = 2;
+      ctx.beginPath();
+      ctx.moveTo(lineStart[0], lineStart[1]);
+      ctx.lineTo(labelCenterX, labelCenterY);
+      ctx.stroke();
+
+      // Draw label background (white with neon green border)
+      const labelBgX1 = labelX - 4;
+      const labelBgY1 = labelY - 2;
+      const labelBgX2 = labelX + labelWidth + 4;
+      const labelBgY2 = labelY + labelHeight + 2;
+
+      // Draw white background
+      ctx.fillStyle = 'white';
+      ctx.fillRect(labelBgX1, labelBgY1, labelBgX2 - labelBgX1, labelBgY2 - labelBgY1);
+
+      // Draw neon green border
+      ctx.strokeStyle = neonGreen;
+      ctx.lineWidth = 2;
+      ctx.strokeRect(labelBgX1, labelBgY1, labelBgX2 - labelBgX1, labelBgY2 - labelBgY1);
+
+      // Draw label text (black for high contrast)
+      ctx.fillStyle = 'black';
+      ctx.font = font;
+      ctx.fillText(labelText, labelX, labelY + labelHeight);
+
+      // Record label position for collision detection
+      existingLabels.push({
+        x: labelBgX1,
+        y: labelBgY1,
+        width: labelBgX2 - labelBgX1,
+        height: labelBgY2 - labelBgY1,
+      });
+    }
+
+    // Convert canvas to buffer
+    return canvasElement.toBuffer('image/png');
+  }
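+
+  // A note on the size backoff used by encodeImageToBase64() below: quality
+  // starts at 95 and drops by 15 per attempt with a floor of 70, so an
+  // oversized JPEG is retried at qualities 95, 80, and 70 before giving up.
+  // The PNG path always uses maximum compression and ignores the quality knob.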
+
+  /**
+   * Encode image buffer to base64 data URL with size optimization.
+   *
+   * Vision LLM APIs typically have size limits (e.g., 20MB for OpenAI).
+   * This function automatically compresses images if they're too large.
+   *
+   * @param imageBuffer - Image buffer
+   * @param format - Image format ('PNG' or 'JPEG')
+   * @param maxSizeMb - Maximum size in MB before compression (default: 20MB)
+   * @returns Base64-encoded data URL
+   */
+  private async encodeImageToBase64(
+    imageBuffer: Buffer,
+    format: 'PNG' | 'JPEG' = 'PNG',
+    maxSizeMb: number = 20.0
+  ): Promise<string> {
+    if (!sharp) {
+      throw new Error('sharp is not available. Install with: npm install sharp');
+    }
+
+    let quality = 95; // Start with high quality
+    let outputBuffer = imageBuffer;
+
+    // Try to fit within size limit
+    for (let attempt = 0; attempt < 3; attempt++) {
+      if (format === 'JPEG') {
+        outputBuffer = await sharp(imageBuffer).jpeg({ quality, mozjpeg: true }).toBuffer();
+      } else {
+        outputBuffer = await sharp(imageBuffer).png({ compressionLevel: 9 }).toBuffer();
+      }
+
+      const sizeMb = outputBuffer.length / (1024 * 1024);
+
+      if (sizeMb <= maxSizeMb) {
+        break;
+      }
+
+      // Reduce quality for next attempt
+      quality = Math.max(70, quality - 15);
+      if ((this as any).verbose && attempt === 0) {
+        console.log(`   ⚠️ Image size ${sizeMb.toFixed(2)}MB exceeds limit, compressing...`);
+      }
+    }
+
+    const finalSizeMb = outputBuffer.length / (1024 * 1024);
+    if ((this as any).verbose) {
+      console.log(
+        `   📸 Image encoded: ${finalSizeMb.toFixed(2)}MB (${outputBuffer.length} bytes)`
+      );
+    }
+
+    const base64Data = outputBuffer.toString('base64');
+    const mimeType = format === 'PNG' ? 'image/png' : 'image/jpeg';
+    return `data:${mimeType};base64,${base64Data}`;
+  }
+
+  /**
+   * Query LLM with vision (labeled screenshot).
+   *
+   * @param imageDataUrl - Base64-encoded image data URL
+   * @param goal - User's goal/task
+   * @returns LLMResponse with element ID
+   */
+  private async queryLLMWithVision(imageDataUrl: string, goal: string): Promise<LLMResponse> {
+    const systemPrompt = `You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
+Each clickable element has:
+- A bright neon green (#39FF14) bounding box around the element
+- A white label box with a number (the element ID) connected by a green line
+- The label is clearly separate from the element (not part of the UI)
+
+CRITICAL INSTRUCTIONS:
+1. Look at the screenshot carefully
+2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
+3. Follow the green line from that element to find its label box with the ID number
+4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
+5. Do NOT include any explanation, reasoning, or other text
+6. Do NOT say "element 1" or "the first element" - just return the number
+7. 
Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines + +Example responses: +- Correct: "42" +- Correct: "1567" +- Wrong: "I see element 42" +- Wrong: "The element ID is 42" +- Wrong: "42 (the search box)"`; + + const userPrompt = `Goal: ${goal} + +Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number. +Find the element that should be clicked to accomplish this goal. +Return ONLY the integer ID number from the label, nothing else.`; + + // Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.) + // For now, we'll use a fallback approach - try to pass image via the generate method + // Individual LLM providers should implement vision support in their generate methods + + try { + // Try to use vision API if available + // This is a placeholder - actual implementation depends on LLM provider + const response = await (this as any).llm.generate(systemPrompt, userPrompt, { + image: imageDataUrl, + temperature: 0.0, + }); + + return response; + } catch { + // Fallback: Try to pass image via text description + const fallbackPrompt = `${userPrompt}\n\n[Image data: ${imageDataUrl.substring(0, 200)}...]`; + const fallbackResponse = await (this as any).llm.generate(systemPrompt, fallbackPrompt, { + temperature: 0.0, + }); + + if ((this as any).verbose) { + console.log(' ⚠️ Using fallback method (may not support vision)'); + } + + return fallbackResponse; + } + } + + /** + * Extract element ID integer from LLM response. + * + * @param llmResponse - LLM response text + * @returns Element ID as integer, or undefined if not found + */ + private extractElementId(llmResponse: string): number | undefined { + if ((this as any).verbose) { + console.log(`🔍 Raw LLM response: ${JSON.stringify(llmResponse)}`); + } + + // Clean the response - remove leading/trailing whitespace + let cleaned = llmResponse.trim(); + + if ((this as any).verbose) { + console.log(` 🧹 After strip: ${JSON.stringify(cleaned)}`); + } + + // Remove common prefixes that LLMs might add + const prefixesToRemove = [ + 'element', + 'id', + 'the element', + 'element id', + 'the id', + 'click', + 'click on', + 'select', + 'choose', + ]; + + for (const prefix of prefixesToRemove) { + if (cleaned.toLowerCase().startsWith(prefix)) { + cleaned = cleaned.substring(prefix.length).trim(); + // Remove any remaining punctuation + cleaned = cleaned.replace(/^[:.,;!?()[\]{}]+/, '').trim(); + if ((this as any).verbose) { + console.log(` 🧹 After removing prefix '${prefix}': ${JSON.stringify(cleaned)}`); + } + } + } + + // Try to find all integers in the cleaned response + const numbers = cleaned.match(/\d+/g); + + if ((this as any).verbose) { + console.log(` 🔢 Numbers found: ${numbers}`); + } + + if (numbers && numbers.length > 0) { + // If multiple numbers found, prefer the largest one (likely the actual element ID) + // Element IDs are typically larger numbers, not small ones like "1" + try { + const intNumbers = numbers.map(n => parseInt(n, 10)); + if ((this as any).verbose) { + console.log(` 🔢 As integers: ${intNumbers}`); + } + + // Prefer larger numbers (element IDs are usually > 10) + // But if only small numbers exist, use the first one + const largeNumbers = intNumbers.filter(n => n > 10); + let elementId: number; + if (largeNumbers.length > 0) { + elementId = Math.max(...largeNumbers); // Take the largest + if ((this as any).verbose) { + console.log(` ✅ Selected largest number > 10: ${elementId}`); + } + } else { + elementId = 
intNumbers[0]; // Fallback to first if all are small
+          if ((this as any).verbose) {
+            console.log(`   ⚠️ All numbers ≤ 10, using first: ${elementId}`);
+          }
+        }
+
+        if ((this as any).verbose) {
+          console.log(`✅ Extracted element ID: ${elementId} (from ${numbers})`);
+        }
+        return elementId;
+      } catch {
+        if ((this as any).verbose) {
+          console.log('   ❌ Failed to convert numbers to integers');
+        }
+      }
+    }
+
+    if ((this as any).verbose) {
+      console.log(`⚠️ Could not extract element ID from response: ${llmResponse}`);
+    }
+    return undefined;
+  }
+
+  /**
+   * Override act() method to use visual prompting with full tracing support.
+   *
+   * @param goal - User's goal/task
+   * @param _maxRetries - Unused; retained to match SentienceAgent.act()'s signature
+   * @param snapshotOptions - Optional snapshot options (screenshot will be enabled)
+   * @returns AgentActResult
+   */
+  async act(
+    goal: string,
+    _maxRetries: number = 2,
+    snapshotOptions?: SnapshotOptions
+  ): Promise<AgentActResult> {
+    if ((this as any).verbose) {
+      console.log('\n' + '='.repeat(70));
+      console.log(`🤖 Visual Agent Goal: ${goal}`);
+      console.log('='.repeat(70));
+    }
+
+    // Increment step counter and generate step ID
+    const stepCount = (this as any).stepCount + 1;
+    (this as any).stepCount = stepCount;
+    const stepId = randomUUID();
+
+    // Emit step_start event
+    const tracer = (this as any).tracer as Tracer | undefined;
+    if (tracer) {
+      const page = (this as any).browser.getPage();
+      const currentUrl = page ? page.url() : 'unknown';
+      tracer.emitStepStart(stepId, stepCount, goal, 0, currentUrl);
+    }
+
+    const startTime = Date.now();
+
+    try {
+      // Ensure screenshot is enabled
+      const snapOpts: SnapshotOptions = {
+        ...snapshotOptions,
+        screenshot: snapshotOptions?.screenshot ?? true,
+        goal: snapshotOptions?.goal ?? goal,
+        limit: snapshotOptions?.limit || (this as any).snapshotLimit,
+      };
+
+      if ((this as any).verbose) {
+        console.log(`🎯 Goal: ${goal}`);
+        console.log('📸 Taking snapshot with screenshot...');
+      }
+
+      // 1. Take snapshot with screenshot
+      const snap = await snapshot((this as any).browser, snapOpts);
+
+      if (snap.status !== 'success') {
+        throw new Error(`Snapshot failed: ${snap.error}`);
+      }
+
+      if (!snap.screenshot) {
+        throw new Error('Screenshot not available in snapshot');
+      }
+
+      // Process snapshot: compute diff status and filter elements
+      const processed = SnapshotProcessor.process(
+        snap,
+        (this as any).previousSnapshot,
+        goal,
+        (this as any).snapshotLimit
+      );
+
+      // Update previous snapshot for next comparison
+      (this as any).previousSnapshot = snap;
+
+      const snapWithDiff = processed.withDiff;
+
+      // Emit snapshot event
+      if (tracer) {
+        const snapshotData = SnapshotEventBuilder.buildSnapshotEventData(snapWithDiff, stepId);
+        tracer.emit('snapshot', snapshotData, stepId);
+      }
+
+      if ((this as any).verbose) {
+        console.log(`✅ Snapshot taken: ${snap.elements.length} elements`);
+      }
+
+      // 2. 
Draw labeled screenshot + if ((this as any).verbose) { + console.log('🎨 Drawing bounding boxes and labels...'); + console.log(` Elements to label: ${snap.elements.length}`); + if (snap.elements.length > 0) { + const elementIds = snap.elements.slice(0, 10).map(el => el.id); // Show first 10 + console.log(` Sample element IDs: ${elementIds}`); + } + } + + const labeledImageBuffer = await this.drawLabeledScreenshot(snap, snap.elements); + + // Save labeled image to disk for debugging + try { + const cwd = process.cwd(); + let playgroundPath: string | undefined; + + // Check if current working directory contains playground + if (fs.existsSync(path.join(cwd, 'playground'))) { + playgroundPath = path.join(cwd, 'playground', 'images'); + } else { + // Check if we're in a playground context via module path + const modulePaths = require.resolve.paths('sentienceapi') || []; + for (const modulePath of modulePaths) { + const potentialPlayground = path.join(modulePath, '..', 'playground', 'images'); + if (fs.existsSync(path.dirname(potentialPlayground))) { + playgroundPath = potentialPlayground; + break; + } + } + } + + if (!playgroundPath) { + // Fallback: use current working directory + playgroundPath = path.join(cwd, 'playground', 'images'); + } + + const imagesDir = playgroundPath; + if (!fs.existsSync(imagesDir)) { + fs.mkdirSync(imagesDir, { recursive: true }); + } + + const imageUuid = uuidv4(); + const imageFilename = `labeled_screenshot_${imageUuid}.png`; + const imagePath = path.join(imagesDir, imageFilename); + fs.writeFileSync(imagePath, labeledImageBuffer); + if ((this as any).verbose) { + console.log(` 💾 Saved labeled screenshot: ${path.resolve(imagePath)}`); + } + } catch (saveError: any) { + // Don't fail if image save fails - it's just for debugging + if ((this as any).verbose) { + console.log(` ⚠️ Could not save labeled screenshot: ${saveError.message}`); + } + } + + // Use JPEG for better compression (smaller file size for vision APIs) + const labeledImageDataUrl = await this.encodeImageToBase64(labeledImageBuffer, 'JPEG', 20.0); + + // 3. Query LLM with vision + if ((this as any).verbose) { + console.log('🧠 Querying LLM with labeled screenshot...'); + } + + const llmResponse = await this.queryLLMWithVision(labeledImageDataUrl, goal); + + // Emit LLM query event + if (tracer) { + tracer.emit( + 'llm_query', + { + prompt_tokens: llmResponse.promptTokens, + completion_tokens: llmResponse.completionTokens, + model: llmResponse.modelName, + response_text: llmResponse.content.substring(0, 200), // Truncate for brevity + }, + stepId + ); + } + + if ((this as any).verbose) { + console.log(`💭 LLM Response: ${llmResponse.content}`); + } + + // Track token usage + (this as any).trackTokens(goal, llmResponse); + + // 4. Extract element ID + const elementId = this.extractElementId(llmResponse.content); + + if (elementId === undefined) { + throw new Error(`Could not extract element ID from LLM response: ${llmResponse.content}`); + } + + if ((this as any).verbose) { + console.log(`🎯 Extracted Element ID: ${elementId}`); + } + + // 5. 
Click the element
+      if ((this as any).verbose) {
+        console.log(`🖱️ Clicking element ${elementId}...`);
+      }
+
+      const clickResult = await click((this as any).browser, elementId);
+
+      const durationMs = Date.now() - startTime;
+
+      // Create AgentActResult from click result
+      const result: AgentActResult = {
+        success: clickResult.success,
+        action: 'click',
+        goal,
+        durationMs,
+        attempt: 0,
+        elementId,
+        outcome: clickResult.outcome,
+        urlChanged: clickResult.url_changed || false,
+        error: clickResult.error?.reason,
+      };
+
+      // Emit action execution event
+      if (tracer) {
+        const page = (this as any).browser.getPage();
+        const postUrl = page ? page.url() : null;
+
+        // Include element data for live overlay visualization
+        const elementsData = snap.elements.slice(0, 50).map(el => ({
+          id: el.id,
+          bbox: {
+            x: el.bbox.x,
+            y: el.bbox.y,
+            width: el.bbox.width,
+            height: el.bbox.height,
+          },
+          role: el.role,
+          text: el.text ? el.text.substring(0, 50) : '',
+        }));
+
+        tracer.emit(
+          'action',
+          {
+            action_type: result.action,
+            action: result.action,
+            element_id: result.elementId,
+            success: result.success,
+            // Note: duration_ms and other custom fields are not in TraceEventData type
+            // but are accepted at runtime for custom visualization
+            post_url: postUrl,
+            elements: elementsData, // Add element data for overlay
+            target_element_id: result.elementId, // Highlight target in red
+          } as any, // Type assertion needed for custom visualization fields
+          stepId
+        );
+      }
+
+      // Record history
+      const history = (this as any).history as Array<HistoryEntry>;
+      history.push({
+        goal,
+        action: `CLICK(${elementId})`,
+        result,
+        success: result.success,
+        attempt: 0,
+        durationMs,
+      });
+
+      if ((this as any).verbose) {
+        const status = result.success ? '✅' : '❌';
+        console.log(`${status} Completed in ${durationMs}ms`);
+      }
+
+      // Emit step completion event
+      if (tracer) {
+        const preUrl = snap.url;
+        const page = (this as any).browser.getPage();
+        const postUrl = page ? page.url() || null : null;
+
+        // Build complete step_end event
+        // Note: snapshotDigest, llmResponseText, execData, and verifyData are computed
+        // inside TraceEventBuilder.buildStepEndData, so we don't need them here
+        const stepEndData = TraceEventBuilder.buildStepEndData({
+          stepId,
+          stepIndex: stepCount,
+          goal,
+          attempt: 0,
+          preUrl,
+          postUrl: postUrl || preUrl,
+          snapshot: snapWithDiff,
+          llmResponse,
+          result,
+        });
+
+        tracer.emit('step_end', stepEndData, stepId);
+      }
+
+      return result;
+    } catch (error: any) {
+      // Emit error event
+      if (tracer) {
+        tracer.emitError(stepId, error.message, 0);
+      }
+
+      if ((this as any).verbose) {
+        console.log(`❌ Error: ${error.message}`);
+      }
+
+      // Re-throw so callers can handle the failure
+      throw error;
+    }
+  }
+}
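+
+// Vision support contract (informal): queryLLMWithVision() passes the labeled
+// screenshot to the provider as an `image` option on generate(); if that call
+// throws, the agent falls back to a text-only prompt. A vision-capable
+// provider is therefore expected to accept roughly the following shape
+// (a sketch only -- the authoritative signature lives in ./llm-provider):
+//
+//   generate(system: string, user: string, options?: { image?: string; temperature?: number }): Promise<LLMResponse>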