diff --git a/README.md b/README.md index d93f401c..774744a4 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,16 @@ npm run build Use `AgentRuntime` to add Jest-style assertions to your agent loops. Verify browser state, check task completion, and get clear feedback on what's working: ```typescript -import { SentienceBrowser, AgentRuntime, urlContains, exists, allOf } from 'sentienceapi'; +import { + SentienceBrowser, + AgentRuntime, + urlContains, + exists, + allOf, + isEnabled, + isChecked, + valueEquals, +} from 'sentienceapi'; import { createTracer } from 'sentienceapi'; import { Page } from 'playwright'; @@ -52,6 +61,20 @@ runtime.assert(urlContains('example.com'), 'on_correct_domain'); runtime.assert(exists('role=heading'), 'has_heading'); runtime.assert(allOf([exists('role=button'), exists('role=link')]), 'has_interactive_elements'); +// v1: state-aware assertions (when Gateway refinement is enabled) +runtime.assert(isEnabled('role=button'), 'button_enabled'); +runtime.assert(isChecked("role=checkbox name~'subscribe'"), 'subscribe_checked_if_present'); +runtime.assert( + valueEquals("role=textbox name~'email'", 'user@example.com'), + 'email_value_if_present' +); + +// v2: retry loop with snapshot confidence gating + exhaustion +const ok = await runtime + .check(exists('role=heading'), 'heading_eventually_visible', true) + .eventually({ timeoutMs: 10_000, pollMs: 250, minConfidence: 0.7, maxSnapshotAttempts: 3 }); +console.log('eventually() result:', ok); + // Check task completion if (runtime.assertDone(exists("text~'Example'"), 'task_complete')) { console.log('✅ Task completed!'); @@ -60,7 +83,7 @@ if (runtime.assertDone(exists("text~'Example'"), 'task_complete')) { console.log(`Task done: ${runtime.isTaskDone}`); ``` -**See example:** [examples/agent-runtime-verification.ts](examples/agent-runtime-verification.ts) +**See examples:** [`examples/asserts/`](examples/asserts/) ## 🚀 Quick Start: Choose Your Abstraction Level @@ -316,69 +339,41 @@ 
console.log(`Task done: ${runtime.isTaskDone}`); ---
-<summary><h2>💼 Real-World Example: Amazon Shopping Bot</h2></summary>
+<summary><h2>💼 Real-World Example: Assertion-driven navigation</h2></summary>

-This example demonstrates navigating Amazon, finding products, and adding items to cart: +This example shows how to use **assertions + `.eventually()`** to make an agent loop resilient: ```typescript -import { SentienceBrowser, snapshot, find, click } from './src'; +import { SentienceBrowser, AgentRuntime, urlContains, exists } from 'sentienceapi'; +import { createTracer } from 'sentienceapi'; async function main() { - const browser = new SentienceBrowser(undefined, undefined, false); - - try { - await browser.start(); - - // Navigate to Amazon Best Sellers - await browser.goto('https://www.amazon.com/gp/bestsellers/'); - await browser.getPage().waitForLoadState('networkidle'); - await new Promise(resolve => setTimeout(resolve, 2000)); - - // Take snapshot and find products - const snap = await snapshot(browser); - console.log(`Found ${snap.elements.length} elements`); - - // Find first product in viewport using spatial filtering - const products = snap.elements.filter( - el => - el.role === 'link' && - el.visual_cues.is_clickable && - el.in_viewport && - !el.is_occluded && - el.bbox.y < 600 // First row - ); - - if (products.length > 0) { - // Sort by position (left to right, top to bottom) - products.sort((a, b) => a.bbox.y - b.bbox.y || a.bbox.x - b.bbox.x); - const firstProduct = products[0]; - - console.log(`Clicking: ${firstProduct.text}`); - const result = await click(browser, firstProduct.id); - - // Wait for product page - await browser.getPage().waitForLoadState('networkidle'); - await new Promise(resolve => setTimeout(resolve, 2000)); - - // Find and click "Add to Cart" button - const productSnap = await snapshot(browser); - const addToCart = find(productSnap, 'role=button text~"add to cart"'); - - if (addToCart) { - const cartResult = await click(browser, addToCart.id); - console.log(`Added to cart: ${cartResult.success}`); - } - } - } finally { - await browser.close(); - } + const browser = await SentienceBrowser.create({ apiKey: 
process.env.SENTIENCE_API_KEY }); + const tracer = await createTracer({ runId: 'verified-run', uploadTrace: false }); + + const adapter = { + snapshot: async (_page: any, options?: Record) => { + return await browser.snapshot(options); + }, + }; + const runtime = new AgentRuntime(adapter as any, browser.getPage() as any, tracer); + + await browser.getPage().goto('https://example.com'); + runtime.beginStep('Verify we are on the right page'); + + await runtime + .check(urlContains('example.com'), 'on_domain', true) + .eventually({ timeoutMs: 10_000, pollMs: 250, minConfidence: 0.7, maxSnapshotAttempts: 3 }); + + runtime.assert(exists('role=heading'), 'heading_present'); + + await tracer.close(); + await browser.close(); } -main(); +main().catch(console.error); ``` -**📖 See the complete tutorial:** [Amazon Shopping Guide](../docs/AMAZON_SHOPPING_GUIDE.md) -
--- diff --git a/examples/asserts/README.md b/examples/asserts/README.md new file mode 100644 index 00000000..2ec9fbe0 --- /dev/null +++ b/examples/asserts/README.md @@ -0,0 +1,16 @@ +# Assertions examples (v1 + v2) + +These examples focus on **AgentRuntime assertions**: + +- **v1**: deterministic, state-aware assertions (enabled/checked/value/expanded) + failure intelligence +- **v2**: `.check(...).eventually(...)` retry loops with `minConfidence` gating + snapshot exhaustion + +Run examples: + +```bash +cd sdk-ts +npm run build +node dist/examples/asserts/state-assertions.js +node dist/examples/asserts/eventually-min-confidence.js +``` + diff --git a/examples/asserts/eventually-min-confidence.ts b/examples/asserts/eventually-min-confidence.ts new file mode 100644 index 00000000..967a3aa7 --- /dev/null +++ b/examples/asserts/eventually-min-confidence.ts @@ -0,0 +1,46 @@ +/** + * v2: `.check(...).eventually(...)` with snapshot confidence gating + exhaustion. + */ + +import { SentienceBrowser } from '../../src/browser'; +import { AgentRuntime } from '../../src/agent-runtime'; +import { createTracer } from '../../src/tracing/tracer-factory'; +import { exists } from '../../src/verification'; + +async function main(): Promise { + const browser = new SentienceBrowser(process.env.SENTIENCE_API_KEY); + await browser.start(); + + const tracer = await createTracer({ runId: 'asserts-v2', uploadTrace: false }); + const adapter = { + snapshot: async (_page: any, options?: Record) => { + return await browser.snapshot(options); + }, + }; + const runtime = new AgentRuntime(adapter as any, browser.getPage() as any, tracer); + + await browser.getPage().goto('https://example.com'); + runtime.beginStep('Assert v2 eventually'); + + const ok = await runtime + .check(exists("text~'Example Domain'"), 'example_domain_text', true) + .eventually({ + timeoutMs: 10_000, + pollMs: 250, + minConfidence: 0.7, + maxSnapshotAttempts: 3, + snapshotOptions: { use_api: true }, + }); + + 
console.log('eventually() result:', ok); + console.log('Final assertions:', runtime.getAssertionsForStepEnd().assertions); + + await tracer.close(); + await browser.close(); +} + +main().catch(err => { + console.error(err); + process.exit(1); +}); + diff --git a/examples/asserts/state-assertions.ts b/examples/asserts/state-assertions.ts new file mode 100644 index 00000000..4d777d92 --- /dev/null +++ b/examples/asserts/state-assertions.ts @@ -0,0 +1,51 @@ +/** + * v1: State-aware assertions with AgentRuntime. + * + * This example is meant to be run with a Pro/Enterprise API key so the Gateway + * can refine raw elements into SmartElements with state fields (enabled/checked/value/etc). + * + * Env vars: + * - SENTIENCE_API_KEY (optional but recommended for v1 state assertions) + */ + +import { SentienceBrowser } from '../../src/browser'; +import { AgentRuntime } from '../../src/agent-runtime'; +import { createTracer } from '../../src/tracing/tracer-factory'; +import { exists, isChecked, isDisabled, isEnabled, isExpanded, valueContains } from '../../src/verification'; + +async function main(): Promise { + const browser = new SentienceBrowser(process.env.SENTIENCE_API_KEY); + await browser.start(); + + const tracer = await createTracer({ runId: 'asserts-v1', uploadTrace: false }); + + // AgentRuntime in TS expects a minimal adapter with snapshot(page, options). 
+ const adapter = { + snapshot: async (_page: any, options?: Record) => { + return await browser.snapshot(options); + }, + }; + + const runtime = new AgentRuntime(adapter as any, browser.getPage() as any, tracer); + + await browser.getPage().goto('https://example.com'); + runtime.beginStep('Assert v1 state'); + await runtime.snapshot({ use_api: true }); // Pro tier (Gateway refinement) if api key is present + + runtime.assert(exists('role=heading'), 'has_heading'); + runtime.assert(isEnabled('role=link'), 'some_link_enabled'); + runtime.assert(isDisabled("role=button text~'continue'"), 'continue_disabled_if_present'); + runtime.assert(isChecked("role=checkbox name~'subscribe'"), 'subscribe_checked_if_present'); + runtime.assert(isExpanded("role=button name~'more'"), 'more_is_expanded_if_present'); + runtime.assert(valueContains("role=textbox name~'email'", '@'), 'email_has_at_if_present'); + + console.log('Assertions recorded:', runtime.getAssertionsForStepEnd().assertions); + await tracer.close(); + await browser.close(); +} + +main().catch(err => { + console.error(err); + process.exit(1); +}); + diff --git a/src/agent-runtime.ts b/src/agent-runtime.ts index 845ca35a..1b5a5cf4 100644 --- a/src/agent-runtime.ts +++ b/src/agent-runtime.ts @@ -43,6 +43,7 @@ import { v4 as uuidv4 } from 'uuid'; import { Snapshot } from './types'; import { AssertContext, Predicate } from './verification'; import { Tracer } from './tracing/tracer'; +import { LLMProvider } from './llm-provider'; // Define a minimal browser interface to avoid circular dependencies interface BrowserLike { @@ -60,6 +61,225 @@ export interface AssertionRecord { details: Record; } +export interface EventuallyOptions { + timeoutMs?: number; + pollMs?: number; + snapshotOptions?: Record; + /** If set, `.eventually()` will treat snapshots below this confidence as failures and resnapshot. */ + minConfidence?: number; + /** Max number of snapshot attempts to get above minConfidence before declaring exhaustion. 
*/ + maxSnapshotAttempts?: number; + /** Optional: vision fallback provider used after snapshot exhaustion (last resort). */ + visionProvider?: LLMProvider; + /** Optional: override vision system prompt (YES/NO only). */ + visionSystemPrompt?: string; + /** Optional: override vision user prompt (YES/NO only). */ + visionUserPrompt?: string; +} + +export class AssertionHandle { + private runtime: AgentRuntime; + private predicate: Predicate; + private label: string; + private required: boolean; + + constructor(runtime: AgentRuntime, predicate: Predicate, label: string, required: boolean) { + this.runtime = runtime; + this.predicate = predicate; + this.label = label; + this.required = required; + } + + once(): boolean { + return this.runtime.assert(this.predicate, this.label, this.required); + } + + async eventually(options: EventuallyOptions = {}): Promise { + const timeoutMs = options.timeoutMs ?? 10_000; + const pollMs = options.pollMs ?? 250; + const snapshotOptions = options.snapshotOptions; + const minConfidence = options.minConfidence; + const maxSnapshotAttempts = options.maxSnapshotAttempts ?? 
3; + const visionProvider = options.visionProvider; + const visionSystemPrompt = options.visionSystemPrompt; + const visionUserPrompt = options.visionUserPrompt; + + const deadline = Date.now() + timeoutMs; + let attempt = 0; + let snapshotAttempt = 0; + let lastOutcome: ReturnType | null = null; + + while (true) { + attempt += 1; + await this.runtime.snapshot(snapshotOptions); + snapshotAttempt += 1; + + const diagnostics = this.runtime.lastSnapshot?.diagnostics; + const confidence = diagnostics?.confidence; + if ( + typeof minConfidence === 'number' && + typeof confidence === 'number' && + Number.isFinite(confidence) && + confidence < minConfidence + ) { + lastOutcome = { + passed: false, + reason: `Snapshot confidence ${confidence.toFixed(3)} < minConfidence ${minConfidence.toFixed(3)}`, + details: { + reason_code: 'snapshot_low_confidence', + confidence, + min_confidence: minConfidence, + snapshot_attempt: snapshotAttempt, + diagnostics, + }, + }; + + (this.runtime as any)._recordOutcome( + lastOutcome, + this.label, + this.required, + { eventually: true, attempt, snapshot_attempt: snapshotAttempt, final: false }, + false + ); + + if (snapshotAttempt >= maxSnapshotAttempts) { + // Optional: vision fallback after snapshot exhaustion (last resort). + // Keeps the assertion surface invariant; only perception changes. + if (visionProvider && visionProvider.supportsVision?.()) { + try { + const buf = (await (this.runtime.page as any).screenshot({ type: 'png' })) as Buffer; + const imageBase64 = Buffer.from(buf).toString('base64'); + const sys = + visionSystemPrompt ?? 'You are a strict visual verifier. Answer only YES or NO.'; + const user = + visionUserPrompt ?? 
+ `Given the screenshot, is the following condition satisfied?\n\n${this.label}\n\nAnswer YES or NO.`; + + const resp = await visionProvider.generateWithImage(sys, user, imageBase64, { + temperature: 0.0, + }); + const text = (resp.content || '').trim().toLowerCase(); + const passed = text.startsWith('yes'); + + const finalOutcome = { + passed, + reason: passed ? 'vision_fallback_yes' : 'vision_fallback_no', + details: { + reason_code: passed ? 'vision_fallback_pass' : 'vision_fallback_fail', + vision_response: resp.content, + min_confidence: minConfidence, + snapshot_attempts: snapshotAttempt, + }, + }; + + (this.runtime as any)._recordOutcome( + finalOutcome, + this.label, + this.required, + { + eventually: true, + attempt, + snapshot_attempt: snapshotAttempt, + final: true, + vision_fallback: true, + }, + true + ); + return passed; + } catch { + // fall through to snapshot_exhausted + } + } + + const finalOutcome = { + passed: false, + reason: `Snapshot exhausted after ${snapshotAttempt} attempt(s) below minConfidence ${minConfidence.toFixed(3)}`, + details: { + reason_code: 'snapshot_exhausted', + confidence, + min_confidence: minConfidence, + snapshot_attempts: snapshotAttempt, + diagnostics, + }, + }; + + (this.runtime as any)._recordOutcome( + finalOutcome, + this.label, + this.required, + { + eventually: true, + attempt, + snapshot_attempt: snapshotAttempt, + final: true, + exhausted: true, + }, + true + ); + return false; + } + + if (Date.now() >= deadline) { + (this.runtime as any)._recordOutcome( + lastOutcome, + this.label, + this.required, + { + eventually: true, + attempt, + snapshot_attempt: snapshotAttempt, + final: true, + timeout: true, + }, + true + ); + return false; + } + + await new Promise(resolve => setTimeout(resolve, pollMs)); + continue; + } + + lastOutcome = this.predicate((this.runtime as any).ctx()); + + // Emit attempt event (not recorded in step_end) + (this.runtime as any)._recordOutcome( + lastOutcome, + this.label, + 
this.required, + { eventually: true, attempt, final: false }, + false + ); + + if (lastOutcome.passed) { + // Record final success once + (this.runtime as any)._recordOutcome( + lastOutcome, + this.label, + this.required, + { eventually: true, attempt, final: true }, + true + ); + return true; + } + + if (Date.now() >= deadline) { + // Record final failure once + (this.runtime as any)._recordOutcome( + lastOutcome, + this.label, + this.required, + { eventually: true, attempt, final: true, timeout: true }, + true + ); + return false; + } + + await new Promise(resolve => setTimeout(resolve, pollMs)); + } + } +} + /** * Runtime wrapper for agent verification loops. * @@ -92,6 +312,82 @@ export class AgentRuntime { private taskDone: boolean = false; private taskDoneLabel: string | null = null; + private static similarity(a: string, b: string): number { + const s1 = a.toLowerCase(); + const s2 = b.toLowerCase(); + if (!s1 || !s2) return 0; + if (s1 === s2) return 1; + + // Bigram overlap (cheap, robust enough for suggestions) + const bigrams = (s: string): string[] => { + const out: string[] = []; + for (let i = 0; i < s.length - 1; i++) out.push(s.slice(i, i + 2)); + return out; + }; + const a2 = bigrams(s1); + const b2 = bigrams(s2); + const setB = new Set(b2); + let common = 0; + for (const g of a2) if (setB.has(g)) common += 1; + return (2 * common) / (a2.length + b2.length + 1e-9); + } + + _recordOutcome( + outcome: ReturnType, + label: string, + required: boolean, + extra: Record | null, + recordInStep: boolean + ): void { + const details = { ...(outcome.details || {}) } as Record; + + // Failure intelligence: nearest matches for selector-driven assertions + if (!outcome.passed && this.lastSnapshot && typeof details.selector === 'string') { + const selector = details.selector; + const scored: Array<{ score: number; el: any }> = []; + for (const el of this.lastSnapshot.elements) { + const hay = el.name ?? el.text ?? 
''; + if (!hay) continue; + const score = AgentRuntime.similarity(selector, hay); + scored.push({ score, el }); + } + scored.sort((x, y) => y.score - x.score); + details.nearest_matches = scored.slice(0, 3).map(({ score, el }) => ({ + id: el.id, + role: el.role, + text: (el.text ?? '').toString().slice(0, 80), + name: (el.name ?? '').toString().slice(0, 80), + score: Math.round(score * 10_000) / 10_000, + })); + } + + const record: AssertionRecord & Record = { + label, + passed: outcome.passed, + required, + reason: outcome.reason, + details, + ...(extra || {}), + }; + + if (recordInStep) { + this.assertionsThisStep.push(record); + } + + this.tracer.emit( + 'verification', + { + kind: 'assert', + ...record, + }, + this.stepId || undefined + ); + } + + check(predicate: Predicate, label: string, required: boolean = false): AssertionHandle { + return new AssertionHandle(this, predicate, label, required); + } + /** * Create a new AgentRuntime. * @@ -179,31 +475,7 @@ export class AgentRuntime { */ assert(predicate: Predicate, label: string, required: boolean = false): boolean { const outcome = predicate(this.ctx()); - - const record: AssertionRecord = { - label, - passed: outcome.passed, - required, - reason: outcome.reason, - details: outcome.details, - }; - this.assertionsThisStep.push(record); - - // Emit dedicated verification event (Option B from design doc) - // This makes assertions visible in Studio timeline - this.tracer.emit( - 'verification', - { - kind: 'assert', - passed: outcome.passed, - label, - required, - reason: outcome.reason, - details: outcome.details, - }, - this.stepId || undefined - ); - + this._recordOutcome(outcome, label, required, null, true); return outcome.passed; } diff --git a/src/index.ts b/src/index.ts index 74eb3c36..ad80abb0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -57,8 +57,16 @@ export { allOf, anyOf, custom, + isEnabled, + isDisabled, + isChecked, + isUnchecked, + valueEquals, + valueContains, + isExpanded, + isCollapsed, } 
from './verification'; -export { AgentRuntime, AssertionRecord } from './agent-runtime'; +export { AgentRuntime, AssertionHandle, AssertionRecord, EventuallyOptions } from './agent-runtime'; // Ordinal Support (Phase 3) export { diff --git a/src/llm-provider.ts b/src/llm-provider.ts index 91399e91..9816c24b 100644 --- a/src/llm-provider.ts +++ b/src/llm-provider.ts @@ -41,6 +41,31 @@ export abstract class LLMProvider { * Get the model name/identifier */ abstract get modelName(): string; + + /** + * Whether this provider supports image input for vision tasks. + * Override in subclasses that support vision-capable models. + */ + supportsVision(): boolean { + return false; + } + + /** + * Generate with image input (vision-capable models only). + * Override in subclasses that support vision. + */ + + async generateWithImage( + systemPrompt: string, + userPrompt: string, + imageBase64: string, + options: Record = {} + ): Promise { + throw new Error( + `${this.constructor.name} does not support vision. ` + + `Use a vision-capable provider (e.g., OpenAIProvider with GPT-4o, AnthropicProvider with Claude 3).` + ); + } } /** @@ -95,6 +120,42 @@ export class OpenAIProvider extends LLMProvider { return true; } + supportsVision(): boolean { + return true; + } + + async generateWithImage( + systemPrompt: string, + userPrompt: string, + imageBase64: string, + options: Record = {} + ): Promise { + const response = await this.client.chat.completions.create({ + model: this._modelName, + messages: [ + { role: 'system', content: systemPrompt }, + { + role: 'user', + content: [ + { type: 'text', text: userPrompt }, + { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } }, + ], + }, + ], + temperature: options.temperature ?? 
0.0, + ...options, + }); + + const choice = response.choices[0]; + return { + content: choice.message.content || '', + promptTokens: response.usage?.prompt_tokens, + completionTokens: response.usage?.completion_tokens, + totalTokens: response.usage?.total_tokens, + modelName: this._modelName, + }; + } + get modelName(): string { return this._modelName; } @@ -151,6 +212,50 @@ export class AnthropicProvider extends LLMProvider { return false; } + supportsVision(): boolean { + return true; + } + + async generateWithImage( + systemPrompt: string, + userPrompt: string, + imageBase64: string, + options: Record = {} + ): Promise { + const response = await this.client.messages.create({ + model: this._modelName, + max_tokens: options.max_tokens ?? 1024, + system: systemPrompt, + messages: [ + { + role: 'user', + content: [ + { type: 'text', text: userPrompt }, + { + type: 'image', + source: { + type: 'base64', + media_type: options.media_type ?? 'image/png', + data: imageBase64, + }, + }, + ], + }, + ], + temperature: options.temperature ?? 
0.0, + ...options, + }); + + const content = response.content[0].text; + return { + content, + promptTokens: response.usage?.input_tokens, + completionTokens: response.usage?.output_tokens, + totalTokens: (response.usage?.input_tokens || 0) + (response.usage?.output_tokens || 0), + modelName: this._modelName, + }; + } + get modelName(): string { return this._modelName; } diff --git a/src/query.ts b/src/query.ts index 2a8dc93c..ba933692 100644 --- a/src/query.ts +++ b/src/query.ts @@ -25,6 +25,12 @@ export function parseSelector(selector: string): QuerySelectorObject { text_contains?: string; text_prefix?: string; text_suffix?: string; + name_contains?: string; + name_prefix?: string; + name_suffix?: string; + value_contains?: string; + value_prefix?: string; + value_suffix?: string; visible?: boolean; tag?: string; importance?: number; @@ -71,18 +77,30 @@ export function parseSelector(selector: string): QuerySelectorObject { } } else if (op === '~') { // Substring match (case-insensitive) - if (key === 'text' || key === 'name') { + if (key === 'text') { query.text_contains = value; + } else if (key === 'name') { + query.name_contains = value; + } else if (key === 'value') { + query.value_contains = value; } } else if (op === '^=') { // Prefix match - if (key === 'text' || key === 'name') { + if (key === 'text') { query.text_prefix = value; + } else if (key === 'name') { + query.name_prefix = value; + } else if (key === 'value') { + query.value_prefix = value; } } else if (op === '$=') { // Suffix match - if (key === 'text' || key === 'name') { + if (key === 'text') { query.text_suffix = value; + } else if (key === 'name') { + query.name_suffix = value; + } else if (key === 'value') { + query.value_suffix = value; } } else if (op === '>') { // Greater than @@ -146,8 +164,14 @@ export function parseSelector(selector: string): QuerySelectorObject { query.visible = value.toLowerCase() === 'true'; } else if (key === 'tag') { query.tag = value; - } else if (key === 
'name' || key === 'text') { + } else if (key === 'text') { query.text = value; + } else if (key === 'name') { + query.name = value; + } else if (key === 'value') { + query.value = value; + } else if (key === 'checked' || key === 'disabled' || key === 'expanded') { + (query as any)[key] = value.toLowerCase() === 'true'; } else if (key === 'importance' && isNumeric) { query.importance = numericValue; } else if (key.startsWith('attr.')) { @@ -178,6 +202,12 @@ function matchElement( text_contains?: string; text_prefix?: string; text_suffix?: string; + name_contains?: string; + name_prefix?: string; + name_suffix?: string; + value_contains?: string; + value_prefix?: string; + value_suffix?: string; visible?: boolean; tag?: string; importance?: number; @@ -262,6 +292,69 @@ function matchElement( } } + // Name matching (best-effort; fallback to text for backward compatibility) + const nameVal = element.name ?? element.text ?? ''; + if (query.name !== undefined) { + if (!nameVal || nameVal !== query.name) { + return false; + } + } + if (query.name_contains !== undefined) { + if (!nameVal || !nameVal.toLowerCase().includes(query.name_contains.toLowerCase())) { + return false; + } + } + if (query.name_prefix !== undefined) { + if (!nameVal || !nameVal.toLowerCase().startsWith(query.name_prefix.toLowerCase())) { + return false; + } + } + if (query.name_suffix !== undefined) { + if (!nameVal || !nameVal.toLowerCase().endsWith(query.name_suffix.toLowerCase())) { + return false; + } + } + + // Value matching (inputs/textarea/select) + const valueVal = element.value ?? 
null; + if ((query as any).value !== undefined) { + if (valueVal === null || valueVal !== (query as any).value) { + return false; + } + } + if (query.value_contains !== undefined) { + if (valueVal === null || !valueVal.toLowerCase().includes(query.value_contains.toLowerCase())) { + return false; + } + } + if (query.value_prefix !== undefined) { + if (valueVal === null || !valueVal.toLowerCase().startsWith(query.value_prefix.toLowerCase())) { + return false; + } + } + if (query.value_suffix !== undefined) { + if (valueVal === null || !valueVal.toLowerCase().endsWith(query.value_suffix.toLowerCase())) { + return false; + } + } + + // State matching (best-effort) + if ((query as any).checked !== undefined) { + if ((element.checked === true) !== (query as any).checked) { + return false; + } + } + if ((query as any).disabled !== undefined) { + if ((element.disabled === true) !== (query as any).disabled) { + return false; + } + } + if ((query as any).expanded !== undefined) { + if ((element.expanded === true) !== (query as any).expanded) { + return false; + } + } + // Importance filtering if (query.importance !== undefined) { if (element.importance !== query.importance) { diff --git a/src/snapshot.ts b/src/snapshot.ts index 053852d1..821cd5d7 100644 --- a/src/snapshot.ts +++ b/src/snapshot.ts @@ -238,6 +238,7 @@ async function snapshotViaApi( // Step 2: Send to server for smart ranking/filtering // Use raw_elements (raw data) instead of elements (processed data) // Server validates API key and applies proprietary ranking logic + const clientMetrics = rawResult?.diagnostics?.metrics; const payload = { raw_elements: rawResult.raw_elements || [], // Raw data needed for server processing url: rawResult.url || '', @@ -247,6 +248,7 @@ async function snapshotViaApi( limit: options.limit, filter: options.filter, }, + client_metrics: clientMetrics || undefined, }; // Check payload size before sending (server has 10MB limit) @@ -307,6 +309,8 @@ async function snapshotViaApi( 
error: apiResult.error, // Phase 2: Ordinal support - dominant group key from Gateway dominant_group_key: apiResult.dominant_group_key, + // Phase 2: Runtime stability/debug info + diagnostics: apiResult.diagnostics || rawResult.diagnostics, }; // Show visual overlay if requested (use API-ranked elements) diff --git a/src/types.ts b/src/types.ts index 0ce8e589..761af962 100644 --- a/src/types.ts +++ b/src/types.ts @@ -50,6 +50,24 @@ export interface Element { // Hyperlink URL (for link elements) href?: string; + // ===== v1 state-aware assertion fields (optional) ===== + /** Best-effort accessible name/label for controls (distinct from visible text) */ + name?: string | null; + /** Current value for inputs/textarea/select (PII-aware: may be omitted/redacted) */ + value?: string | null; + /** Input type (e.g., "text", "email", "password") */ + input_type?: string | null; + /** Whether value was redacted for privacy */ + value_redacted?: boolean | null; + /** Normalized boolean states (best-effort) */ + checked?: boolean | null; + disabled?: boolean | null; + expanded?: boolean | null; + /** Raw ARIA state strings (tri-state / debugging) */ + aria_checked?: string | null; + aria_disabled?: string | null; + aria_expanded?: string | null; + // Phase 3.2: Pre-computed dominant group membership (uses fuzzy matching) // This field is computed by the gateway so downstream consumers don't need to // implement fuzzy matching logic themselves. 
@@ -123,6 +141,22 @@ export interface Snapshot { requires_license?: boolean; // Phase 2: Dominant group key for ordinal selection dominant_group_key?: string; // The most common group_key (main content group) + // Phase 2: Runtime stability/debug info (confidence/reasons/metrics) + diagnostics?: SnapshotDiagnostics; +} + +export interface SnapshotDiagnosticsMetrics { + ready_state?: string | null; + quiet_ms?: number | null; + node_count?: number | null; + interactive_count?: number | null; + raw_elements_count?: number | null; +} + +export interface SnapshotDiagnostics { + confidence?: number | null; + reasons?: string[]; + metrics?: SnapshotDiagnosticsMetrics; } /** @@ -161,6 +195,10 @@ export interface QuerySelectorObject { role?: string; text?: string; name?: string; + value?: string; + checked?: boolean; + disabled?: boolean; + expanded?: boolean; clickable?: boolean; isPrimary?: boolean; importance?: number | { min?: number; max?: number }; diff --git a/src/verification.ts b/src/verification.ts index 90ea3e73..1e0c8a70 100644 --- a/src/verification.ts +++ b/src/verification.ts @@ -144,7 +144,7 @@ export function exists(selector: QuerySelector): Predicate { return { passed: false, reason: 'no snapshot available', - details: { selector: selectorStr }, + details: { selector: selectorStr, reason_code: 'no_snapshot' }, }; } @@ -153,7 +153,11 @@ export function exists(selector: QuerySelector): Predicate { return { passed: ok, reason: ok ? '' : `no elements matched selector: ${selectorStr}`, - details: { selector: selectorStr, matched: matches.length }, + details: { + selector: selectorStr, + matched: matches.length, + reason_code: ok ? 
// ============================================================================
// v1 state-aware predicates (deterministic, schema-driven)
// ============================================================================

/** Element type as produced by `query`, derived here so no extra import is required. */
type StateMatch = ReturnType<typeof query>[number];

/**
 * Shared driver for the state-aware predicates in this section.
 *
 * The returned predicate passes when at least one element matched by
 * `selector` satisfies `accepts`. Failures are reported with a uniform set of
 * reason codes, mirroring `exists` / `notExists`:
 *
 * - `no_snapshot`    – the assert context carries no snapshot
 * - `no_match`       – the selector matched zero elements
 * - `state_mismatch` – elements matched, but none satisfied `accepts`
 *
 * @param selector       Selector whose matches are inspected.
 * @param accepts        State test applied to each matched element.
 * @param mismatchReason Builds the failure text used for `state_mismatch`.
 * @param extraDetails   Extra keys merged into `details` right after `selector`
 *                       (e.g. the expected value).
 * @returns A predicate suitable for the runtime's assert/check calls.
 */
function stateAwarePredicate(
  selector: QuerySelector,
  accepts: (el: StateMatch) => boolean,
  mismatchReason: (selectorStr: string) => string,
  extraDetails: Record<string, unknown> = {}
): Predicate {
  // Stringify once, when the predicate is created, not on every evaluation.
  const selectorStr = selectorToString(selector);

  return (ctx: AssertContext): AssertOutcome => {
    const snap = ctx.snapshot;
    if (!snap) {
      return {
        passed: false,
        reason: 'no snapshot available',
        details: { selector: selectorStr, reason_code: 'no_snapshot' },
      };
    }

    const matches = query(snap, selector);
    if (matches.length === 0) {
      // Without this guard `[].some(...)` is false and the failure would be
      // reported as a state mismatch ("all matched elements are ...") even
      // though nothing matched at all.
      return {
        passed: false,
        reason: `no elements matched selector: ${selectorStr}`,
        details: { selector: selectorStr, ...extraDetails, matched: 0, reason_code: 'no_match' },
      };
    }

    const ok = matches.some(el => accepts(el));
    return {
      passed: ok,
      reason: ok ? '' : mismatchReason(selectorStr),
      details: {
        selector: selectorStr,
        ...extraDetails,
        matched: matches.length,
        reason_code: ok ? 'ok' : 'state_mismatch',
      },
    };
  };
}

/**
 * Passes when at least one matched element is not disabled.
 * An element whose `disabled` field is missing or null counts as enabled.
 */
export function isEnabled(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.disabled !== true,
    selectorStr => `all matched elements are disabled: ${selectorStr}`
  );
}

/** Passes when at least one matched element has `disabled === true`. */
export function isDisabled(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.disabled === true,
    selectorStr => `no matched elements are disabled: ${selectorStr}`
  );
}

/** Passes when at least one matched element has `checked === true`. */
export function isChecked(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.checked === true,
    selectorStr => `no matched elements are checked: ${selectorStr}`
  );
}

/**
 * Passes when at least one matched element is not checked.
 * An element whose `checked` field is missing or null counts as unchecked.
 */
export function isUnchecked(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.checked !== true,
    selectorStr => `all matched elements are checked: ${selectorStr}`
  );
}

/**
 * Passes when at least one matched element's value equals `expected` exactly
 * (case-sensitive). A missing or null value is compared as the empty string.
 */
export function valueEquals(selector: QuerySelector, expected: string): Predicate {
  return stateAwarePredicate(
    selector,
    el => (el.value ?? '') === expected,
    () => `no matched elements had value == '${expected}'`,
    { expected }
  );
}

/**
 * Passes when at least one matched element's value contains `substring`,
 * compared case-insensitively. A missing or null value is treated as the
 * empty string.
 */
export function valueContains(selector: QuerySelector, substring: string): Predicate {
  // Lower-case the needle once instead of once per matched element.
  const needle = substring.toLowerCase();
  return stateAwarePredicate(
    selector,
    el => (el.value ?? '').toLowerCase().includes(needle),
    () => `no matched elements had value containing '${substring}'`,
    { substring }
  );
}

/** Passes when at least one matched element has `expanded === true`. */
export function isExpanded(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.expanded === true,
    selectorStr => `no matched elements are expanded: ${selectorStr}`
  );
}

/**
 * Passes when at least one matched element is not expanded.
 * An element whose `expanded` field is missing or null counts as collapsed.
 */
export function isCollapsed(selector: QuerySelector): Predicate {
  return stateAwarePredicate(
    selector,
    el => el.expanded !== true,
    selectorStr => `all matched elements are expanded: ${selectorStr}`
  );
}
elements: [], timestamp: 't2' }, + { status: 'success', url: 'https://example.com/done', elements: [], timestamp: 't3' }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + runtime.beginStep('Test'); + + const pred: Predicate = ctx => { + const ok = (ctx.url || '').endsWith('/done'); + return { + passed: ok, + reason: ok ? '' : 'not done', + details: { selector: "text~'Done'", reason_code: ok ? 'ok' : 'no_match' }, + }; + }; + + const ok = await runtime.check(pred, 'eventually_done').eventually({ + timeoutMs: 2000, + pollMs: 0, + }); + + expect(ok).toBe(true); + + const stepEnd = runtime.getAssertionsForStepEnd(); + expect(stepEnd.assertions.length).toBe(1); + expect(stepEnd.assertions[0].label).toBe('eventually_done'); + expect(stepEnd.assertions[0].passed).toBe(true); + expect((stepEnd.assertions[0] as any).final).toBe(true); + + // emitted attempt events + final event (at least 3) + const verificationEvents = sink.events.filter(e => e.type === 'verification'); + expect(verificationEvents.length).toBeGreaterThanOrEqual(3); + }); + + it('can gate on minConfidence and stop with snapshot_exhausted', async () => { + const sink = new MockSink(); + const tracer = new Tracer('test-run', sink); + const page = new MockPage('https://example.com') as any; + + const snapshots: Snapshot[] = [ + { + status: 'success', + url: 'https://example.com', + elements: [], + timestamp: 't1', + diagnostics: { + confidence: 0.1, + reasons: ['dom_unstable'], + metrics: { quiet_ms: 50 }, + } as any, + }, + { + status: 'success', + url: 'https://example.com', + elements: [], + timestamp: 't2', + diagnostics: { + confidence: 0.1, + reasons: ['dom_unstable'], + metrics: { quiet_ms: 50 }, + } as any, + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + 
runtime.beginStep('Test'); + + const pred: Predicate = _ctx => ({ + passed: true, + reason: 'would pass', + details: {}, + }); + + const ok = await runtime.check(pred, 'min_confidence_gate').eventually({ + timeoutMs: 2000, + pollMs: 0, + minConfidence: 0.7, + maxSnapshotAttempts: 2, + }); + + expect(ok).toBe(false); + + const stepEnd = runtime.getAssertionsForStepEnd(); + expect(stepEnd.assertions.length).toBe(1); + expect(stepEnd.assertions[0].label).toBe('min_confidence_gate'); + expect(stepEnd.assertions[0].passed).toBe(false); + expect((stepEnd.assertions[0] as any).details.reason_code).toBe('snapshot_exhausted'); + }); + + it('can use vision fallback after snapshot_exhausted (YES/NO)', async () => { + const sink = new MockSink(); + const tracer = new Tracer('test-run', sink); + const page = new MockPage('https://example.com') as any; + + const snapshots: Snapshot[] = [ + { + status: 'success', + url: 'https://example.com', + elements: [], + timestamp: 't1', + diagnostics: { + confidence: 0.1, + reasons: ['dom_unstable'], + metrics: { quiet_ms: 50 }, + } as any, + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + runtime.beginStep('Test'); + + const visionProvider = { + supportsVision: () => true, + generateWithImage: async () => ({ content: 'YES' }), + } as any; + + const pred: Predicate = _ctx => ({ + passed: false, + reason: 'should not run', + details: {}, + }); + + const ok = await runtime.check(pred, 'vision_fallback_check').eventually({ + timeoutMs: 2000, + pollMs: 0, + minConfidence: 0.7, + maxSnapshotAttempts: 1, + visionProvider, + }); + + expect(ok).toBe(true); + + const stepEnd = runtime.getAssertionsForStepEnd(); + expect(stepEnd.assertions.length).toBe(1); + expect(stepEnd.assertions[0].passed).toBe(true); + expect((stepEnd.assertions[0] as any).vision_fallback).toBe(true); + expect((stepEnd.assertions[0] as 
any).details.reason_code).toBe('vision_fallback_pass'); + }); +}); diff --git a/tests/mocks/browser-mock.ts b/tests/mocks/browser-mock.ts index 9faa2a0e..cc1d4c30 100644 --- a/tests/mocks/browser-mock.ts +++ b/tests/mocks/browser-mock.ts @@ -6,7 +6,7 @@ */ import { IBrowser, IPage } from '../../src/protocols/browser-protocol'; -import { Snapshot, SnapshotOptions } from '../../src/types'; +import { Snapshot } from '../../src/types'; import { Page } from 'playwright'; /** @@ -21,6 +21,7 @@ export class MockPage implements IPage { public mouseClickCalls: Array<{ x: number; y: number }> = []; public keyboardTypeCalls: string[] = []; public keyboardPressCalls: string[] = []; + public screenshotCalls: Array<{ options?: any }> = []; constructor(url?: string) { if (url) { @@ -88,6 +89,12 @@ export class MockPage implements IPage { this.keyboardPressCalls.push(key); }, }; + + // Playwright Page API (subset): used by vision fallback in AgentRuntime eventually() + async screenshot(options?: any): Promise { + this.screenshotCalls.push({ options }); + return Buffer.from('mock-png'); + } } /** @@ -108,7 +115,7 @@ export class MockBrowser implements IBrowser { await this.mockPage.goto(url); } - async snapshot(options?: SnapshotOptions): Promise { + async snapshot(options?: any): Promise { // Mock snapshot - return empty snapshot return { status: 'success', diff --git a/tests/query.test.ts b/tests/query.test.ts index 5b7e6ea9..f969d72e 100644 --- a/tests/query.test.ts +++ b/tests/query.test.ts @@ -73,6 +73,19 @@ describe('parseSelector', () => { const q = parseSelector('tag=button'); expect((q as any).tag).toBe('button'); }); + + it('should parse name/value/state selectors', () => { + const q1 = parseSelector("name~'Email'"); + expect((q1 as any).name_contains).toBe('Email'); + + const q2 = parseSelector("value~'@example.com'"); + expect((q2 as any).value_contains).toBe('@example.com'); + + const q3 = parseSelector('disabled=true checked=false expanded=true'); + expect((q3 as 
any).disabled).toBe(true); + expect((q3 as any).checked).toBe(false); + expect((q3 as any).expanded).toBe(true); + }); }); describe('query', () => { @@ -82,6 +95,7 @@ describe('query', () => { id: 1, role: 'button', text: 'Sign In', + name: 'Sign In', importance: 1000, bbox: { x: 10, y: 20, width: 100, height: 40 }, visual_cues: { is_primary: true, background_color_name: null, is_clickable: true }, @@ -111,6 +125,20 @@ describe('query', () => { is_occluded: false, z_index: 1, }, + { + id: 4, + role: 'textbox', + text: null, + name: 'Email', + value: 'user@example.com', + disabled: false, + importance: 100, + bbox: { x: 10, y: 120, width: 300, height: 40 }, + visual_cues: { is_primary: false, background_color_name: null, is_clickable: true }, + in_viewport: true, + is_occluded: false, + z_index: 1, + }, ]; return { @@ -130,8 +158,8 @@ describe('query', () => { it('should filter by importance less than', () => { const snap = createTestSnapshot(); const results = query(snap, 'importance<300'); - expect(results.length).toBe(1); - expect(results[0].id).toBe(3); + expect(results.length).toBe(2); + expect(results.map(el => el.id)).toEqual([3, 4]); }); it('should filter by text prefix', () => { @@ -165,7 +193,7 @@ describe('query', () => { it('should filter by visible', () => { const snap = createTestSnapshot(); const results = query(snap, 'visible=true'); - expect(results.length).toBe(3); // All are visible + expect(results.length).toBe(4); // All are visible }); it('should filter by z-index', () => { @@ -178,13 +206,20 @@ describe('query', () => { it('should filter by in_viewport', () => { const snap = createTestSnapshot(); const results = query(snap, 'in_viewport=true'); - expect(results.length).toBe(3); + expect(results.length).toBe(4); }); it('should filter by is_occluded', () => { const snap = createTestSnapshot(); const results = query(snap, 'is_occluded=false'); - expect(results.length).toBe(3); + expect(results.length).toBe(4); + }); + + it('should filter by 
describe('state-aware predicates', () => {
  /** Wrap a list of elements in a snapshot and the assert context the predicates expect. */
  const contextFor = (elements: Element[]): AssertContext => {
    const snap = makeSnapshot(elements);
    return { snapshot: snap, url: snap.url, stepId: null };
  };

  it('isEnabled/isDisabled', () => {
    const enabledButton: Element = { ...makeElement(1, 'button', 'Submit'), disabled: false };
    const disabledButton: Element = { ...makeElement(2, 'button', 'Disabled'), disabled: true };
    const ctx = contextFor([enabledButton, disabledButton]);

    expect(isEnabled('role=button')(ctx).passed).toBe(true);
    expect(isDisabled("text~'Disabled'")(ctx).passed).toBe(true);

    // Negative cases: a predicate that always passes must not satisfy this suite.
    expect(isEnabled("text~'Disabled'")(ctx).passed).toBe(false);
    expect(isDisabled("text~'Submit'")(ctx).passed).toBe(false);
  });

  it('isChecked/isUnchecked', () => {
    const checkedBox: Element = { ...makeElement(1, 'checkbox', 'Opt in'), checked: true };
    const uncheckedBox: Element = { ...makeElement(2, 'checkbox', 'Opt out'), checked: false };
    const ctx = contextFor([checkedBox, uncheckedBox]);

    expect(isChecked("text~'Opt in'")(ctx).passed).toBe(true);
    expect(isUnchecked("text~'Opt out'")(ctx).passed).toBe(true);

    expect(isChecked("text~'Opt out'")(ctx).passed).toBe(false);
    expect(isUnchecked("text~'Opt in'")(ctx).passed).toBe(false);
  });

  it('valueEquals/valueContains', () => {
    const emailBox: Element = { ...makeElement(1, 'textbox', null), value: 'user@example.com' };
    const ctx = contextFor([emailBox]);

    expect(valueEquals('role=textbox', 'user@example.com')(ctx).passed).toBe(true);
    expect(valueContains('role=textbox', '@example.com')(ctx).passed).toBe(true);

    expect(valueEquals('role=textbox', 'other@example.com')(ctx).passed).toBe(false);
    expect(valueContains('role=textbox', '@other.test')(ctx).passed).toBe(false);
  });

  it('isExpanded/isCollapsed', () => {
    const openMenu: Element = { ...makeElement(1, 'button', 'Menu'), expanded: true };
    const closedDetails: Element = { ...makeElement(2, 'button', 'Details'), expanded: false };
    const ctx = contextFor([openMenu, closedDetails]);

    expect(isExpanded("text~'Menu'")(ctx).passed).toBe(true);
    expect(isCollapsed("text~'Details'")(ctx).passed).toBe(true);

    expect(isExpanded("text~'Details'")(ctx).passed).toBe(false);
    expect(isCollapsed("text~'Menu'")(ctx).passed).toBe(false);
  });
});