diff --git a/examples/runtime-agent-minimal.ts b/examples/runtime-agent-minimal.ts
new file mode 100644
index 0000000..077235b
--- /dev/null
+++ b/examples/runtime-agent-minimal.ts
@@ -0,0 +1,114 @@
+/**
+ * Example: RuntimeAgent (AgentRuntime-backed) minimal demo.
+ *
+ * This demonstrates the verification-first loop:
+ * snapshot -> propose action (structured executor) -> execute -> verify (AgentRuntime predicates)
+ *
+ * Requirements:
+ * - SENTIENCE_API_KEY (needed to start SentienceBrowser)
+ *
+ * Usage:
+ *   ts-node examples/runtime-agent-minimal.ts
+ */
+
+import { Page } from 'playwright';
+import {
+  AgentRuntime,
+  RuntimeAgent,
+  RuntimeStep,
+  StepVerification,
+  SentienceBrowser,
+  exists,
+  urlContains,
+} from '../src';
+import { createTracer } from '../src/tracing/tracer-factory';
+import { LLMProvider, LLMResponse } from '../src/llm-provider';
+import type { Snapshot } from '../src/types';
+
+/**
+ * Adapter to make SentienceBrowser compatible with AgentRuntime's BrowserLike interface.
+ * AgentRuntime expects snapshot(page, options) but SentienceBrowser has snapshot(options).
+ */
+function createBrowserAdapter(browser: SentienceBrowser) {
+  return {
+    snapshot: async (_page: Page, options?: Record<string, any>): Promise<Snapshot> => {
+      return await browser.snapshot(options);
+    },
+  };
+}
+
+/** LLM provider stub that always answers with the same action string. */
+class FixedActionProvider extends LLMProvider {
+  constructor(private action: string) {
+    super();
+  }
+  get modelName(): string {
+    return 'fixed-action';
+  }
+  supportsJsonMode(): boolean {
+    return false;
+  }
+  async generate(
+    _systemPrompt: string,
+    _userPrompt: string,
+    _options: Record<string, any> = {}
+  ): Promise<LLMResponse> {
+    return { content: this.action, modelName: this.modelName };
+  }
+}
+
+/** Runs one verification-first step against example.com and prints the result. */
+async function main() {
+  const sentienceKey = process.env.SENTIENCE_API_KEY;
+  if (!sentienceKey) {
+    console.error('Error: SENTIENCE_API_KEY not set');
+    process.exit(1);
+  }
+
+  const runId = 'runtime-agent-minimal';
+  const tracer = await createTracer({ apiKey: sentienceKey, runId, uploadTrace: false });
+  const browser = new SentienceBrowser(sentienceKey, undefined, false);
+  let browserStarted = false;
+
+  try {
+    await browser.start();
+    browserStarted = true;
+    const page = browser.getPage();
+
+    await page.goto('https://example.com');
+    await page.waitForLoadState('networkidle');
+
+    const runtime = new AgentRuntime(createBrowserAdapter(browser), page, tracer);
+
+    // Structured executor (for demo, we just return FINISH()).
+    const executor = new FixedActionProvider('FINISH()');
+    const agent = new RuntimeAgent({ runtime, executor });
+
+    const step: RuntimeStep = {
+      goal: 'Confirm Example Domain page is loaded',
+      verifications: [
+        { predicate: urlContains('example.com'), label: 'url_contains_example', required: true } satisfies StepVerification,
+        { predicate: exists('role=heading'), label: 'has_heading', required: true } satisfies StepVerification,
+      ],
+      maxSnapshotAttempts: 2,
+      snapshotLimitBase: 60,
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'Open example.com and verify', step });
+    console.log(`step ok: ${ok}`);
+  } finally {
+    // Close the tracer first, but never let a tracer failure leak the browser.
+    try {
+      await tracer.close(true);
+    } finally {
+      if (browserStarted) await browser.close();
+    }
+  }
+}
+
+// Report failures through the exit code so callers can detect them.
+main().catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
+
diff --git a/src/index.ts b/src/index.ts
index 7f6478f..6a84a30 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -70,6 +70,8 @@ export {
   isCollapsed,
 } from './verification';
 export { AgentRuntime, AssertionHandle, AssertionRecord, EventuallyOptions } from './agent-runtime';
+export { RuntimeAgent } from './runtime-agent';
+export type { RuntimeStep, StepVerification } from './runtime-agent';
 export * from './captcha/types';
 export * from './captcha/strategies';
 
diff --git a/src/runtime-agent.ts b/src/runtime-agent.ts
new file mode 100644
index 0000000..ead51c0
--- /dev/null
+++ b/src/runtime-agent.ts
@@ -0,0 +1,472 @@
+/**
+ * AgentRuntime-backed agent with optional vision executor fallback.
+ *
+ * This keeps the control plane verification-first:
+ * - Actions may be proposed by either a structured executor (DOM snapshot prompt)
+ *   or a vision executor (screenshot prompt).
+ * - Verification is always executed via AgentRuntime predicates.
+ */
+
+import { AgentRuntime } from './agent-runtime';
+import { LLMProvider } from './llm-provider';
+import { LLMInteractionHandler } from './utils/llm-interaction-handler';
+import type { Snapshot, Element, BBox } from './types';
+import type { Predicate } from './verification';
+
+/** One post-action check, evaluated through AgentRuntime. */
+export interface StepVerification {
+  predicate: Predicate;
+  label: string;
+  /** When false, a failure is recorded but does not fail the step. Default: true. */
+  required?: boolean;
+  /** Poll with `check(...).eventually(...)` instead of asserting once. Default: true. */
+  eventually?: boolean;
+  /** Budget for `eventually` polling. Default: 10_000. */
+  timeoutMs?: number;
+  /** Interval for `eventually` polling. Default: 250. */
+  pollMs?: number;
+  maxSnapshotAttempts?: number;
+  minConfidence?: number;
+}
+
+/** A single goal plus the policies used while executing and verifying it. */
+export interface RuntimeStep {
+  goal: string;
+  intent?: string;
+  verifications?: StepVerification[];
+
+  // Snapshot quality policy (handled at agent layer; SDK core unchanged).
+  snapshotLimitBase?: number;
+  snapshotLimitStep?: number;
+  snapshotLimitMax?: number;
+  maxSnapshotAttempts?: number;
+  minConfidence?: number;
+  minActionables?: number;
+
+  // Vision executor fallback (bounded).
+  visionExecutorEnabled?: boolean;
+  /** Upper bound on vision attempts per step; 0 disables them. Default: 1. */
+  maxVisionExecutorAttempts?: number;
+}
+
+type ParsedAction =
+  | { kind: 'finish' }
+  | { kind: 'press'; key: string }
+  | { kind: 'click_id'; id: number }
+  | { kind: 'type_id'; id: number; text: string }
+  | { kind: 'click_xy'; x: number; y: number }
+  | { kind: 'click_rect'; x: number; y: number; w: number; h: number };
+
+// Action grammar, compiled once. The quoted payloads of TYPE/PRESS are matched
+// greedily up to the closing quote before ")", so text such as "it's" parses.
+const FINISH_RE = /^FINISH\s*\(\s*\)\s*$/i;
+const CLICK_XY_RE = /^CLICK_XY\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i;
+const CLICK_RECT_RE =
+  /^CLICK_RECT\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i;
+const CLICK_ID_RE = /^CLICK\s*\(\s*(\d+)\s*\)\s*$/i;
+const TYPE_ID_RE = /^TYPE\s*\(\s*(\d+)\s*,\s*["']([\s\S]*)["']\s*\)\s*$/i;
+const PRESS_RE = /^PRESS\s*\(\s*["']([\s\S]+)["']\s*\)\s*$/i;
+// Finds the first action embedded in free-form model output.
+const ACTION_IN_TEXT_RE =
+  /(CLICK_XY\s*\(\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*\)|CLICK_RECT\s*\(\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*\)|CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["'].*?["']\s*\)|PRESS\s*\(\s*["'].*?["']\s*\)|FINISH\s*\(\s*\))/i;
+
+/**
+ * Runs one step at a time: snapshot, propose an action, execute it, then verify
+ * the outcome with AgentRuntime predicates. A vision executor may replace or
+ * retry the structured executor, within `maxVisionExecutorAttempts`.
+ */
+export class RuntimeAgent {
+  readonly runtime: AgentRuntime;
+  readonly executor: LLMProvider;
+  readonly visionExecutor?: LLMProvider;
+  readonly visionVerifier?: LLMProvider;
+  readonly shortCircuitCanvas: boolean;
+
+  private structuredLLM: LLMInteractionHandler;
+
+  constructor(opts: {
+    runtime: AgentRuntime;
+    executor: LLMProvider;
+    visionExecutor?: LLMProvider;
+    visionVerifier?: LLMProvider;
+    shortCircuitCanvas?: boolean;
+  }) {
+    this.runtime = opts.runtime;
+    this.executor = opts.executor;
+    this.visionExecutor = opts.visionExecutor;
+    this.visionVerifier = opts.visionVerifier;
+    this.shortCircuitCanvas = opts.shortCircuitCanvas ?? true;
+    this.structuredLLM = new LLMInteractionHandler(this.executor, false);
+  }
+
+  /**
+   * Executes `step` once and reports whether its required verifications passed.
+   *
+   * @returns true when every required verification passed after the structured
+   *   action or after a vision attempt; false otherwise.
+   * @throws Error when a proposed action cannot be parsed or targets an element
+   *   id that is missing from the snapshot.
+   */
+  async runStep(opts: { taskGoal: string; step: RuntimeStep }): Promise<boolean> {
+    const { taskGoal, step } = opts;
+    this.runtime.beginStep(step.goal);
+
+    const snap = await this.snapshotWithRamp(step);
+
+    if (await this.shouldShortCircuitToVision(step, snap)) {
+      return await this.runVisionAttempts({ taskGoal, step, snap });
+    }
+
+    // 1) Structured executor attempt.
+    const action = await this.proposeStructuredAction({ taskGoal, step, snap });
+    await this.executeAction(action, snap);
+    if (await this.applyVerifications(step)) return true;
+
+    // 2) Optional vision executor fallback (bounded).
+    return await this.runVisionAttempts({ taskGoal, step, snap });
+  }
+
+  /**
+   * Re-requests the snapshot with a growing element limit until it meets the
+   * step's confidence / actionable-count thresholds or attempts run out.
+   */
+  private async snapshotWithRamp(step: RuntimeStep): Promise<Snapshot> {
+    const base = step.snapshotLimitBase ?? 60;
+    const stepInc = step.snapshotLimitStep ?? 40;
+    const max = step.snapshotLimitMax ?? 220;
+    const attempts = Math.max(1, step.maxSnapshotAttempts ?? 3);
+    const minConf = step.minConfidence;
+    const minActionables = step.minActionables;
+
+    let limit = base;
+    let last: Snapshot | null = null;
+
+    for (let i = 0; i < attempts; i++) {
+      last = await this.runtime.snapshot({ limit, goal: step.goal });
+
+      const conf = last?.diagnostics?.confidence;
+      const lowConfidence =
+        typeof minConf === 'number' &&
+        typeof conf === 'number' &&
+        Number.isFinite(conf) &&
+        conf < minConf;
+      const tooFewActionables =
+        typeof minActionables === 'number' && this.countActionables(last) < minActionables;
+
+      if (!lowConfidence && !tooFewActionables) return last;
+      limit = Math.min(max, limit + stepInc);
+    }
+
+    if (!last) throw new Error('snapshot() returned null/undefined repeatedly');
+    return last;
+  }
+
+  /** Asks the structured executor for the next action given the DOM snapshot. */
+  private async proposeStructuredAction(opts: {
+    taskGoal: string;
+    step: RuntimeStep;
+    snap: Snapshot;
+  }): Promise<string> {
+    const { taskGoal, step, snap } = opts;
+    const domContext = this.structuredLLM.buildContext(snap, step.goal);
+    const combinedGoal = `${taskGoal}\n\nSTEP: ${step.goal}`;
+    const resp = await this.structuredLLM.queryLLM(domContext, combinedGoal);
+    return this.extractActionFromText(resp.content);
+  }
+
+  /** Number of vision attempts allowed for `step`; 0 when vision is unavailable. */
+  private visionAttemptBudget(step: RuntimeStep): number {
+    const enabled = step.visionExecutorEnabled ?? true;
+    if (!enabled) return 0;
+    if (!this.visionExecutor || !this.visionExecutor.supportsVision?.()) return 0;
+    const budget = step.maxVisionExecutorAttempts ?? 1;
+    return budget > 0 ? budget : 0;
+  }
+
+  /**
+   * Runs up to `maxVisionExecutorAttempts` vision attempts, stopping at the first
+   * one whose verifications pass.
+   */
+  private async runVisionAttempts(opts: {
+    taskGoal: string;
+    step: RuntimeStep;
+    snap: Snapshot | null;
+  }): Promise<boolean> {
+    const budget = this.visionAttemptBudget(opts.step);
+    for (let attempt = 0; attempt < budget; attempt++) {
+      if (await this.visionExecutorAttempt(opts)) return true;
+    }
+    return false;
+  }
+
+  /** One screenshot -> action -> verify round using the vision executor. */
+  private async visionExecutorAttempt(opts: {
+    taskGoal: string;
+    step: RuntimeStep;
+    snap: Snapshot | null;
+  }): Promise<boolean> {
+    const { taskGoal, step, snap } = opts;
+    const provider = this.visionExecutor;
+    if (!provider || !provider.supportsVision?.()) return false;
+
+    const url = this.runtime.page?.url?.() ?? snap?.url ?? '(unknown)';
+    const buf = (await (this.runtime.page as any).screenshot({ type: 'png' })) as Buffer;
+    const imageBase64 = Buffer.from(buf).toString('base64');
+
+    const { systemPrompt, userPrompt } = this.visionExecutorPrompts({
+      taskGoal,
+      step,
+      url,
+      snap,
+    });
+
+    const resp = await provider.generateWithImage(systemPrompt, userPrompt, imageBase64, {
+      temperature: 0.0,
+    });
+
+    const action = this.extractActionFromText(resp.content);
+    await this.executeAction(action, snap ?? undefined);
+
+    // This is a retry of the same step; clear prior step assertions.
+    this.runtime.flushAssertions();
+    return await this.applyVerifications(step);
+  }
+
+  /**
+   * Evaluates every verification of `step`. Optional (`required: false`) checks
+   * are still evaluated and recorded, but only required ones decide the result.
+   */
+  private async applyVerifications(step: RuntimeStep): Promise<boolean> {
+    const verifications = step.verifications ?? [];
+    if (verifications.length === 0) return true;
+
+    let requiredOk = true;
+    for (const v of verifications) {
+      const required = v.required ?? true;
+      const eventually = v.eventually ?? true;
+      let ok: boolean;
+      if (eventually) {
+        ok = await this.runtime.check(v.predicate, v.label, required).eventually({
+          timeoutMs: v.timeoutMs ?? 10_000,
+          pollMs: v.pollMs ?? 250,
+          minConfidence: v.minConfidence,
+          maxSnapshotAttempts: v.maxSnapshotAttempts,
+          visionProvider: this.visionVerifier,
+        });
+      } else {
+        ok = this.runtime.assert(v.predicate, v.label, required);
+      }
+      // An optional check must not fail the step; it is only recorded.
+      if (required && !ok) requiredOk = false;
+    }
+
+    return this.runtime.requiredAssertionsPassed() && requiredOk;
+  }
+
+  /**
+   * Records `action`, then performs it on the page.
+   *
+   * @throws Error for an unknown action format, or for CLICK(id)/TYPE(id, ...)
+   *   when no snapshot is available or the id is not in it.
+   */
+  private async executeAction(action: string, snap?: Snapshot): Promise<void> {
+    const url = this.runtime.page?.url?.() ?? snap?.url;
+    await this.runtime.recordAction(action, url);
+
+    const parsed = this.parseAction(action);
+    const page = this.runtime.page;
+
+    switch (parsed.kind) {
+      case 'finish':
+        return;
+      case 'press':
+        await page.keyboard.press(parsed.key);
+        break;
+      case 'click_xy':
+        await page.mouse.click(parsed.x, parsed.y);
+        break;
+      case 'click_rect':
+        // Click the centre of the rectangle.
+        await page.mouse.click(parsed.x + parsed.w / 2, parsed.y + parsed.h / 2);
+        break;
+      case 'click_id':
+        await this.clickBBox(this.requireElement(snap, parsed.id).bbox);
+        break;
+      case 'type_id':
+        // Focus the element by clicking it before typing.
+        await this.clickBBox(this.requireElement(snap, parsed.id).bbox);
+        await page.keyboard.type(parsed.text);
+        break;
+      default: {
+        const unhandled: never = parsed;
+        throw new Error(`Unhandled action: ${JSON.stringify(unhandled)}`);
+      }
+    }
+
+    await this.stabilizeBestEffort();
+  }
+
+  /** Looks up element `id` in `snap`, throwing when either is missing. */
+  private requireElement(snap: Snapshot | undefined, id: number): Element {
+    if (!snap) throw new Error('Cannot execute CLICK(id)/TYPE(id, ...) without a snapshot');
+    const el = this.findElement(snap, id);
+    if (!el) throw new Error(`Element id ${id} not found in snapshot`);
+    return el;
+  }
+
+  /** Short pause after an action; failures are ignored on purpose. */
+  private async stabilizeBestEffort(): Promise<void> {
+    try {
+      await this.runtime.page.waitForTimeout(50);
+    } catch {
+      // best-effort
+    }
+  }
+
+  /** Clicks the centre of `bbox`. */
+  private clickBBox(bbox: BBox): Promise<void> {
+    const x = bbox.x + bbox.width / 2;
+    const y = bbox.y + bbox.height / 2;
+    return this.runtime.page.mouse.click(x, y);
+  }
+
+  /** Finds the snapshot element with the given id, if any. */
+  private findElement(snap: Snapshot, id: number): Element | undefined {
+    return snap.elements.find(e => e.id === id);
+  }
+
+  /** Counts elements whose visual cues mark them as clickable. */
+  private countActionables(snap: Snapshot): number {
+    let n = 0;
+    for (const el of snap.elements ?? []) {
+      if (el.visual_cues?.is_clickable) n += 1;
+    }
+    return n;
+  }
+
+  /**
+   * Decides whether to skip the structured executor: true when vision is usable
+   * and either no snapshot exists or the page has too few actionables and
+   * contains a <canvas> (when `shortCircuitCanvas` is on).
+   */
+  private async shouldShortCircuitToVision(
+    step: RuntimeStep,
+    snap: Snapshot | null
+  ): Promise<boolean> {
+    // With no vision budget the structured executor must get its turn.
+    if (this.visionAttemptBudget(step) === 0) return false;
+    if (!snap) return true;
+
+    const minActionables = step.minActionables;
+    if (typeof minActionables !== 'number') return false;
+    if (this.countActionables(snap) >= minActionables) return false;
+    if (!this.shortCircuitCanvas) return false;
+
+    try {
+      const n = await this.runtime.page.evaluate("document.querySelectorAll('canvas').length");
+      return typeof n === 'number' && n > 0;
+    } catch {
+      // Probe failed; stay on the structured path.
+      return false;
+    }
+  }
+
+  /** Builds the system/user prompts sent with the screenshot. */
+  private visionExecutorPrompts(opts: {
+    taskGoal: string;
+    step: RuntimeStep;
+    url: string;
+    snap: Snapshot | null;
+  }): { systemPrompt: string; userPrompt: string } {
+    const verifyTargets = this.verificationTargetsHuman(opts.step.verifications ?? []);
+    const snapshotSummary = opts.snap
+      ? `\n\nStructured snapshot summary:\n- url: ${opts.snap.url}\n- elements: ${opts.snap.elements?.length ?? 0}\n`
+      : '';
+
+    const systemPrompt = `You are a vision-capable web automation executor.
+
+TASK GOAL:
+${opts.taskGoal}
+
+STEP GOAL:
+${opts.step.goal}
+
+CURRENT URL (text):
+${opts.url || '(unknown)'}
+
+VERIFICATION TARGETS (text):
+${verifyTargets || '(none provided)'}${snapshotSummary}
+
+RESPONSE FORMAT:
+Return ONLY ONE of:
+- CLICK(id)
+- TYPE(id, "text")
+- CLICK_XY(x, y)
+- CLICK_RECT(x, y, w, h)
+- PRESS("key")
+- FINISH()
+
+No explanations, no markdown.
+`;
+
+    return {
+      systemPrompt,
+      userPrompt: 'From the screenshot, return the single best next action:',
+    };
+  }
+
+  /** Renders verification labels as a bullet list for the vision prompt. */
+  private verificationTargetsHuman(verifications: StepVerification[]): string {
+    if (!verifications.length) return '';
+    return verifications
+      .map(v => `- ${v.label} (${(v.required ?? true) ? 'required' : 'optional'})`)
+      .join('\n');
+  }
+
+  /**
+   * Parses one action string into a typed action.
+   *
+   * @throws Error when `action` matches none of the supported formats.
+   */
+  private parseAction(action: string): ParsedAction {
+    const s = action.trim();
+
+    if (FINISH_RE.test(s)) return { kind: 'finish' };
+
+    const mXY = s.match(CLICK_XY_RE);
+    if (mXY) return { kind: 'click_xy', x: Number(mXY[1]), y: Number(mXY[2]) };
+
+    const mRect = s.match(CLICK_RECT_RE);
+    if (mRect) {
+      return {
+        kind: 'click_rect',
+        x: Number(mRect[1]),
+        y: Number(mRect[2]),
+        w: Number(mRect[3]),
+        h: Number(mRect[4]),
+      };
+    }
+
+    const mClick = s.match(CLICK_ID_RE);
+    if (mClick) return { kind: 'click_id', id: Number(mClick[1]) };
+
+    const mType = s.match(TYPE_ID_RE);
+    if (mType) {
+      // Models often escape inner quotes; type the quote itself, not the backslash.
+      const text = mType[2].replace(/\\(["'])/g, '$1');
+      return { kind: 'type_id', id: Number(mType[1]), text };
+    }
+
+    const mPress = s.match(PRESS_RE);
+    if (mPress) return { kind: 'press', key: mPress[1] };
+
+    throw new Error(`Unknown action format: ${action}`);
+  }
+
+  /** Strips code fences and returns the first recognizable action, else the cleaned text. */
+  private extractActionFromText(text: string): string {
+    const cleaned = (text || '').replace(/```[\w]*\n?/g, '').trim();
+    const m = cleaned.match(ACTION_IN_TEXT_RE);
+    return m ? m[1] : cleaned;
+  }
+}
diff --git a/tests/runtime-agent.test.ts b/tests/runtime-agent.test.ts
new file mode 100644
index 0000000..2aba8c5
--- /dev/null
+++ b/tests/runtime-agent.test.ts
@@ -0,0 +1,409 @@
+import { RuntimeAgent, RuntimeStep, StepVerification } from '../src/runtime-agent';
+import { AgentRuntime } from '../src/agent-runtime';
+import { Tracer } from '../src/tracing/tracer';
+import { TraceSink } from '../src/tracing/sink';
+import { MockPage } from './mocks/browser-mock';
+import { LLMProvider } from '../src/llm-provider';
+import type { LLMResponse } from '../src/llm-provider';
+import type { Element, Snapshot } from '../src/types';
+import type { Predicate } from '../src/verification';
+
+class MockSink extends TraceSink {
+  public events: any[] = [];
+  emit(event: Record<string, any>): void {
+    this.events.push(event);
+  }
+  async close(): Promise<void> {
+    // no-op
+  }
+  getSinkType(): string {
+    return 'MockSink';
+  }
+}
+
+/** Text-only provider stub that replays queued responses, then 'FINISH()'. */
+class ProviderStub extends LLMProvider {
+  // Protected (not private) so VisionProviderStub can drain the same queue.
+  protected responses: string[];
+  public calls: Array<{ system: string; user: string; options?: any }> = [];
+
+  constructor(responses: string[] = []) {
+    super();
+    this.responses = [...responses];
+  }
+
+  get modelName(): string {
+    return 'stub';
+  }
+
+  supportsJsonMode(): boolean {
+    return true;
+  }
+
+  async generate(
+    systemPrompt: string,
+    userPrompt: string,
+    options: Record<string, any> = {}
+  ): Promise<LLMResponse> {
+    this.calls.push({ system: systemPrompt, user: userPrompt, options });
+    const content = this.responses.length ? (this.responses.shift() as string) : 'FINISH()';
+    return { content, modelName: this.modelName };
+  }
+}
+
+/** Vision-capable stub; records image calls separately from text calls. */
+class VisionProviderStub extends ProviderStub {
+  supportsVision(): boolean {
+    return true;
+  }
+
+  public visionCalls: Array<{ system: string; user: string; image: string; options?: any }> = [];
+
+  async generateWithImage(
+    systemPrompt: string,
+    userPrompt: string,
+    imageBase64: string,
+    options: Record<string, any> = {}
+  ): Promise<LLMResponse> {
+    this.visionCalls.push({ system: systemPrompt, user: userPrompt, image: imageBase64, options });
+    const content = this.responses.length ? (this.responses.shift() as string) : 'FINISH()';
+    return { content, modelName: this.modelName };
+  }
+}
+
+function makeClickableElement(id: number): Element {
+  return {
+    id,
+    role: 'button',
+    text: 'OK',
+    importance: 100,
+    bbox: { x: 10, y: 20, width: 100, height: 40 },
+    visual_cues: { is_primary: true, is_clickable: true, background_color_name: null },
+    in_viewport: true,
+    is_occluded: false,
+    z_index: 1,
+  };
+}
+
+describe('RuntimeAgent (runtime-backed agent)', () => {
+  it('structured executor succeeds without vision', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    const snapshots: Snapshot[] = [
+      {
+        status: 'success',
+        url: 'https://example.com/start',
+        elements: [makeClickableElement(1)],
+        timestamp: 't1',
+      },
+      {
+        status: 'success',
+        url: 'https://example.com/done',
+        elements: [makeClickableElement(1)],
+        timestamp: 't2',
+      },
+    ];
+
+    const browserLike = {
+      snapshot: async () => snapshots.shift() as Snapshot,
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['CLICK(1)']);
+    const agent = new RuntimeAgent({ runtime, executor });
+
+    const pred: Predicate = ctx => ({
+      passed: (ctx.url || '').endsWith('/done'),
+      reason: '',
+      details: {},
+    });
+
+    const step: RuntimeStep = {
+      goal: 'Click OK',
+      maxSnapshotAttempts: 1,
+      verifications: [
+        {
+          predicate: pred,
+          label: 'url_done',
+          required: true,
+          eventually: true,
+          timeoutMs: 2000,
+          pollMs: 0,
+          maxSnapshotAttempts: 1,
+        } satisfies StepVerification,
+      ],
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'test', step });
+    expect(ok).toBe(true);
+    expect(executor.calls.length).toBe(1);
+    expect(page.mouseClickCalls.length).toBeGreaterThan(0);
+  });
+
+  it('vision executor fallback is used after verification fail', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    const snapshots: Snapshot[] = [
+      // ramp snapshot
+      {
+        status: 'success',
+        url: 'https://example.com/start',
+        elements: [makeClickableElement(1)],
+        timestamp: 't1',
+      },
+      // verification attempt #1: fail
+      {
+        status: 'success',
+        url: 'https://example.com/still',
+        elements: [makeClickableElement(1)],
+        timestamp: 't2',
+      },
+      // verification after vision retry: pass
+      {
+        status: 'success',
+        url: 'https://example.com/done',
+        elements: [makeClickableElement(1)],
+        timestamp: 't3',
+      },
+    ];
+
+    const browserLike = {
+      snapshot: async () => snapshots.shift() as Snapshot,
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['CLICK(1)']);
+    const vision = new VisionProviderStub(['CLICK(1)']);
+    const agent = new RuntimeAgent({ runtime, executor, visionExecutor: vision });
+
+    const pred: Predicate = ctx => ({
+      passed: (ctx.url || '').endsWith('/done'),
+      reason: (ctx.url || '').endsWith('/done') ? '' : 'not done',
+      details: {},
+    });
+
+    const step: RuntimeStep = {
+      goal: 'Try click; fallback if needed',
+      maxSnapshotAttempts: 1,
+      visionExecutorEnabled: true,
+      maxVisionExecutorAttempts: 1,
+      verifications: [
+        {
+          predicate: pred,
+          label: 'url_done',
+          required: true,
+          eventually: true,
+          // Force structured attempt to FAIL fast so fallback triggers.
+          timeoutMs: 0,
+          pollMs: 0,
+          maxSnapshotAttempts: 1,
+        },
+      ],
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'test', step });
+    expect(ok).toBe(true);
+    expect(executor.calls.length).toBe(1);
+    expect(vision.visionCalls.length).toBe(1);
+  });
+
+  it('vision executor retries up to maxVisionExecutorAttempts', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    const snapAt = (path: string, timestamp: string): Snapshot => ({
+      status: 'success',
+      url: `https://example.com/${path}`,
+      elements: [makeClickableElement(1)],
+      timestamp,
+    });
+    const snapshots: Snapshot[] = [
+      snapAt('start', 't1'), // ramp snapshot
+      snapAt('still', 't2'), // structured attempt: fail
+      snapAt('still', 't3'), // vision attempt #1: fail
+      snapAt('done', 't4'), // vision attempt #2: pass
+    ];
+
+    const browserLike = {
+      snapshot: async () => snapshots.shift() as Snapshot,
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['CLICK(1)']);
+    const vision = new VisionProviderStub(['CLICK(1)', 'CLICK(1)']);
+    const agent = new RuntimeAgent({ runtime, executor, visionExecutor: vision });
+
+    const pred: Predicate = ctx => ({
+      passed: (ctx.url || '').endsWith('/done'),
+      reason: (ctx.url || '').endsWith('/done') ? '' : 'not done',
+      details: {},
+    });
+
+    const step: RuntimeStep = {
+      goal: 'Retry with vision until verified',
+      maxSnapshotAttempts: 1,
+      visionExecutorEnabled: true,
+      maxVisionExecutorAttempts: 2,
+      verifications: [
+        {
+          predicate: pred,
+          label: 'url_done',
+          required: true,
+          eventually: true,
+          // Fail fast on every round, as in the fallback test above.
+          timeoutMs: 0,
+          pollMs: 0,
+          maxSnapshotAttempts: 1,
+        },
+      ],
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'test', step });
+    expect(ok).toBe(true);
+    expect(executor.calls.length).toBe(1);
+    expect(vision.visionCalls.length).toBe(2);
+  });
+
+  it('snapshot limit ramp increases limit on low confidence', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    const seenLimits: number[] = [];
+    const snapshots: Snapshot[] = [
+      {
+        status: 'success',
+        url: 'https://example.com/start',
+        elements: [makeClickableElement(1)],
+        timestamp: 't1',
+        diagnostics: { confidence: 0.1, reasons: [], metrics: { quiet_ms: 10 } } as any,
+      },
+      {
+        status: 'success',
+        url: 'https://example.com/start',
+        elements: [makeClickableElement(1)],
+        timestamp: 't2',
+        diagnostics: { confidence: 0.9, reasons: [], metrics: { quiet_ms: 10 } } as any,
+      },
+      {
+        status: 'success',
+        url: 'https://example.com/done',
+        elements: [makeClickableElement(1)],
+        timestamp: 't3',
+      },
+    ];
+
+    const browserLike = {
+      snapshot: async (_page: any, options?: any) => {
+        if (options?.limit !== undefined) {
+          seenLimits.push(Number(options.limit));
+        }
+        return snapshots.shift() as Snapshot;
+      },
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['CLICK(1)']);
+    const agent = new RuntimeAgent({ runtime, executor });
+
+    const pred: Predicate = ctx => ({
+      passed: (ctx.url || '').endsWith('/done'),
+      reason: '',
+      details: {},
+    });
+
+    const step: RuntimeStep = {
+      goal: 'ramp',
+      minConfidence: 0.7,
+      snapshotLimitBase: 60,
+      snapshotLimitStep: 40,
+      snapshotLimitMax: 220,
+      maxSnapshotAttempts: 2,
+      verifications: [
+        {
+          predicate: pred,
+          label: 'url_done',
+          required: true,
+          eventually: true,
+          timeoutMs: 2000,
+          pollMs: 0,
+          maxSnapshotAttempts: 1,
+        },
+      ],
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'test', step });
+    expect(ok).toBe(true);
+    expect(seenLimits.slice(0, 2)).toEqual([60, 100]);
+  });
+
+  it('short-circuits to vision on canvas + low actionables', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    // Make page.evaluate("document.querySelectorAll('canvas').length") return 1
+    const originalEvaluate = page.evaluate.bind(page);
+    page.evaluate = async (script: any, ...args: any[]) => {
+      if (typeof script === 'string' && script.includes("querySelectorAll('canvas')")) {
+        return 1 as any;
+      }
+      return originalEvaluate(script, ...args);
+    };
+
+    const snapshots: Snapshot[] = [
+      { status: 'success', url: 'https://example.com/start', elements: [], timestamp: 't1' },
+      { status: 'success', url: 'https://example.com/done', elements: [], timestamp: 't2' },
+    ];
+
+    const browserLike = {
+      snapshot: async () => snapshots.shift() as Snapshot,
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['CLICK(999)']);
+    const vision = new VisionProviderStub(['CLICK_XY(100, 200)']);
+    const agent = new RuntimeAgent({
+      runtime,
+      executor,
+      visionExecutor: vision,
+      shortCircuitCanvas: true,
+    });
+
+    const pred: Predicate = ctx => ({
+      passed: (ctx.url || '').endsWith('/done'),
+      reason: '',
+      details: {},
+    });
+
+    const step: RuntimeStep = {
+      goal: 'canvas step',
+      minActionables: 1,
+      maxSnapshotAttempts: 1,
+      visionExecutorEnabled: true,
+      maxVisionExecutorAttempts: 1,
+      verifications: [
+        {
+          predicate: pred,
+          label: 'url_done',
+          required: true,
+          eventually: true,
+          timeoutMs: 2000,
+          pollMs: 0,
+          maxSnapshotAttempts: 1,
+        },
+      ],
+    };
+
+    const ok = await agent.runStep({ taskGoal: 'test', step });
+    expect(ok).toBe(true);
+    expect(executor.calls.length).toBe(0);
+    expect(vision.visionCalls.length).toBe(1);
+    expect(page.mouseClickCalls).toEqual([{ x: 100, y: 200 }]);
+  });
+});