diff --git a/src/agent.ts b/src/agent.ts index 32c9323..a6f2754 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -211,10 +211,17 @@ export class SentienceAgent { this.tracer.emitStepStart(stepId, this.stepCount, goal, 0, currentUrl); } + // Track data collected during step execution for step_end emission on failure + let stepSnapWithDiff: Snapshot | null = null; + let stepPreUrl: string | null = null; + let stepLlmResponse: LLMResponse | null = null; + let stepStartTime: number = Date.now(); + for (let attempt = 0; attempt <= maxRetries; attempt++) { try { // 1. OBSERVE: Get refined semantic snapshot const startTime = Date.now(); + stepStartTime = startTime; const snapOpts: SnapshotOptions = { ...snapshotOptions, @@ -246,6 +253,10 @@ export class SentienceAgent { const snapWithDiff = processed.withDiff; const filteredSnap = processed.filtered; + // Track for step_end emission on failure + stepSnapWithDiff = snapWithDiff; + stepPreUrl = snap.url; + // Emit snapshot event if (this.tracer) { const snapshotData = SnapshotEventBuilder.buildSnapshotEventData(snapWithDiff, stepId); @@ -258,6 +269,9 @@ export class SentienceAgent { // 3. THINK: Query LLM for next action const llmResponse = await this.llmHandler.queryLLM(context, goal); + // Track for step_end emission on failure + stepLlmResponse = llmResponse; + if (this.verbose) { console.log(`🧠 LLM Decision: ${llmResponse.content}`); } @@ -357,6 +371,28 @@ export class SentienceAgent { await new Promise(resolve => setTimeout(resolve, 1000)); continue; } else { + // Emit step_end with whatever data we collected before failure + // This ensures diff_status and other fields are preserved in traces + if (this.tracer && stepSnapWithDiff) { + const postUrl = this.browser.getPage()?.url() || null; + const durationMs = Date.now() - stepStartTime; + + const stepEndData = TraceEventBuilder.buildPartialStepEndData({ + stepId, + stepIndex: this.stepCount, + goal, + attempt, + preUrl: stepPreUrl, + postUrl, + snapshot: stepSnapWithDiff, + llmResponse: stepLlmResponse, + error: error.message, + durationMs, + }); + + this.tracer.emit('step_end', stepEndData, stepId); + } + const errorResult: AgentActResult = { success: false, goal, diff --git a/src/failure-artifacts.ts b/src/failure-artifacts.ts index e5ba04b..b9f6aab 100644 --- a/src/failure-artifacts.ts +++ b/src/failure-artifacts.ts @@ -104,6 +104,31 @@ function isFfmpegAvailable(): boolean { } } +/** + * Get ffmpeg version as a tuple [major, minor] or null if unable to determine. + * Used to determine which flags to use (e.g., -vsync vs -fps_mode). + */ +function getFfmpegVersion(): [number, number] | null { + try { + const result = spawnSync('ffmpeg', ['-version'], { + timeout: 5000, + stdio: 'pipe', + }); + if (result.status !== 0) { + return null; + } + const output = result.stdout?.toString('utf-8') || ''; + // Parse version from output like "ffmpeg version 7.0.1 ..." or "ffmpeg version n7.0.1 ..." + const match = output.match(/ffmpeg version [n]?(\d+)\.(\d+)/i); + if (match) { + return [parseInt(match[1], 10), parseInt(match[2], 10)]; + } + return null; + } catch { + return null; + } +} + /** * Generate an MP4 video clip from a directory of frames using ffmpeg. */ @@ -123,7 +148,9 @@ function generateClipFromFrames(framesDir: string, outputPath: string, fps: numb } // Create a temporary file list for ffmpeg concat demuxer - const listFile = path.join(framesDir, 'frames_list.txt'); + // Use relative path (just filename) since we run ffmpeg with cwd=framesDir + const listFile = 'frames_list.txt'; + const listFilePath = path.join(framesDir, listFile); const frameDuration = 1.0 / fps; try { @@ -132,7 +159,19 @@ function generateClipFromFrames(framesDir: string, outputPath: string, fps: numb files.map(f => `file '${f}'\nduration ${frameDuration}`).join('\n') + `\nfile '${files[files.length - 1]}'`; // ffmpeg concat quirk - fs.writeFileSync(listFile, listContent); + fs.writeFileSync(listFilePath, listContent); + + // Determine which vsync/fps_mode flag to use based on ffmpeg version + // -vsync is deprecated in ffmpeg 7.0+, use -fps_mode instead (available since 5.1) + const version = getFfmpegVersion(); + let syncArgs: string[]; + if (version && (version[0] > 5 || (version[0] === 5 && version[1] >= 1))) { + // ffmpeg 5.1+: use -fps_mode + syncArgs = ['-fps_mode', 'vfr']; + } else { + // ffmpeg < 5.1: use legacy -vsync + syncArgs = ['-vsync', 'vfr']; + } // Run ffmpeg to generate the clip const result = spawnSync( @@ -145,8 +184,7 @@ function generateClipFromFrames(framesDir: string, outputPath: string, fps: numb '0', '-i', listFile, - '-vsync', - 'vfr', + ...syncArgs, '-pix_fmt', 'yuv420p', '-c:v', @@ -175,7 +213,7 @@ function generateClipFromFrames(framesDir: string, outputPath: string, fps: numb } finally { // Clean up the list file try { - fs.unlinkSync(listFile); + fs.unlinkSync(listFilePath); } catch { // ignore } diff --git a/src/utils/trace-event-builder.ts b/src/utils/trace-event-builder.ts index fbdd4c3..ce0204f 100644 --- a/src/utils/trace-event-builder.ts +++ b/src/utils/trace-event-builder.ts @@ -279,4 +279,119 @@ export class TraceEventBuilder { return data; } + + /** + * Build partial step_end event data for failed steps + * + * This is used when a step fails after collecting some data (snapshot, LLM response, etc.) + * but before completing execution. It ensures diff_status and other fields are preserved + * in traces even when the agent run fails. + * + * @param params - Parameters for building partial step_end event + * @returns Partial step_end event data + */ + static buildPartialStepEndData(params: { + stepId: string; + stepIndex: number; + goal: string; + attempt: number; + preUrl: string | null; + postUrl: string | null; + snapshot?: Snapshot | null; + llmResponse?: LLMResponse | null; + error: string; + durationMs: number; + }): TraceEventData { + const { + stepId, + stepIndex, + goal, + attempt, + preUrl, + postUrl, + snapshot, + llmResponse, + error, + durationMs, + } = params; + + // Build pre data + const preData: TraceEventData['pre'] = { + url: preUrl || undefined, + snapshot_digest: snapshot ? this.buildSnapshotDigest(snapshot) : undefined, + }; + + // Add elements with diff_status if snapshot is available + if (snapshot && snapshot.elements.length > 0) { + const importanceValues = snapshot.elements.map(el => el.importance); + const minImportance = importanceValues.length > 0 ? Math.min(...importanceValues) : 0; + const maxImportance = importanceValues.length > 0 ? Math.max(...importanceValues) : 0; + const importanceRange = maxImportance - minImportance; + + preData.elements = snapshot.elements.map(el => { + let importanceScore: number; + if (importanceRange > 0) { + importanceScore = (el.importance - minImportance) / importanceRange; + } else { + importanceScore = 0.5; + } + + return { + id: el.id, + role: el.role, + text: el.text, + bbox: el.bbox, + importance: el.importance, + importance_score: importanceScore, + visual_cues: el.visual_cues, + in_viewport: el.in_viewport, + is_occluded: el.is_occluded, + z_index: el.z_index, + rerank_index: el.rerank_index, + heuristic_index: el.heuristic_index, + ml_probability: el.ml_probability, + ml_score: el.ml_score, + diff_status: el.diff_status, + }; + }); + } + + // Build LLM data if available + let llmData: TraceEventData['llm'] | undefined; + if (llmResponse) { + llmData = this.buildLLMData(llmResponse); + } + + // Build exec data for failure + const execData: TraceEventData['exec'] = { + success: false, + action: 'error', + outcome: error, + duration_ms: durationMs, + error: error, + }; + + // Build verify data for failure + const verifyData: TraceEventData['verify'] = { + passed: false, + signals: { + error: error, + }, + }; + + return { + v: 1, + step_id: stepId, + step_index: stepIndex, + goal: goal, + attempt: attempt, + pre: preData, + llm: llmData, + exec: execData, + post: { + url: postUrl || undefined, + }, + verify: verifyData, + }; + } } diff --git a/src/visual-agent.ts b/src/visual-agent.ts index 8edbdd2..f620e84 100644 --- a/src/visual-agent.ts +++ b/src/visual-agent.ts @@ -596,6 +596,11 @@ Return ONLY the integer ID number from the label, nothing else.`; const startTime = Date.now(); + // Track data collected during step execution for step_end emission on failure + let stepSnapWithDiff: Snapshot | null = null; + let stepPreUrl: string | null = null; + let stepLlmResponse: LLMResponse | null = null; + try { // Ensure screenshot is enabled const snapOpts: SnapshotOptions = { @@ -634,6 +639,10 @@ Return ONLY the integer ID number from the label, nothing else.`; const snapWithDiff = processed.withDiff; + // Track for step_end emission on failure + stepSnapWithDiff = snapWithDiff; + stepPreUrl = snap.url; + // Emit snapshot event if (tracer) { const snapshotData = SnapshotEventBuilder.buildSnapshotEventData(snapWithDiff, stepId); @@ -710,6 +719,9 @@ Return ONLY the integer ID number from the label, nothing else.`; const llmResponse = await this.queryLLMWithVision(labeledImageDataUrl, goal); + // Track for step_end emission on failure + stepLlmResponse = llmResponse; + // Emit LLM query event if (tracer) { tracer.emit( @@ -848,6 +860,29 @@ Return ONLY the integer ID number from the label, nothing else.`; tracer.emitError(stepId, error.message, 0); } + // Emit step_end with whatever data we collected before failure + // This ensures diff_status and other fields are preserved in traces + if (tracer && stepSnapWithDiff) { + const page = (this as any).browser.getPage(); + const postUrl = page ? page.url() || null : null; + const durationMs = Date.now() - startTime; + + const stepEndData = TraceEventBuilder.buildPartialStepEndData({ + stepId, + stepIndex: stepCount, + goal, + attempt: 0, + preUrl: stepPreUrl, + postUrl, + snapshot: stepSnapWithDiff, + llmResponse: stepLlmResponse, + error: error.message, + durationMs, + }); + + tracer.emit('step_end', stepEndData, stepId); + } + if ((this as any).verbose) { console.log(`❌ Error: ${error.message}`); }