From cd2a6983b070d36f8eef430d35f34b11c02a4a53 Mon Sep 17 00:00:00 2001 From: rcholic Date: Sat, 3 Jan 2026 12:36:58 -0800 Subject: [PATCH 1/2] optimize LLM agent efficiency --- src/utils/element-filter.ts | 11 +- src/utils/llm-interaction-handler.ts | 111 ++++++++++++++++---- tests/utils/llm-interaction-handler.test.ts | 91 ++++++++++++++++ 3 files changed, 191 insertions(+), 22 deletions(-) diff --git a/src/utils/element-filter.ts b/src/utils/element-filter.ts index ec2e7891..f88441f3 100644 --- a/src/utils/element-filter.ts +++ b/src/utils/element-filter.ts @@ -32,7 +32,8 @@ export class ElementFilter { * ``` */ static filterByImportance(snapshot: Snapshot, maxElements: number = 50): Element[] { - const elements = [...snapshot.elements]; + // Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context + const elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED'); // Sort by importance (descending) elements.sort((a, b) => b.importance - a.importance); @@ -60,13 +61,16 @@ export class ElementFilter { return this.filterByImportance(snapshot, maxElements); } + // Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context + const elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED'); + const goalLower = goal.toLowerCase(); const keywords = this.extractKeywords(goalLower); // Score elements based on keyword matches const scoredElements: Array<[number, Element]> = []; - for (const element of snapshot.elements) { + for (const element of elements) { let score = element.importance; // Start with base importance // Boost score for keyword matches in text @@ -115,7 +119,8 @@ export class ElementFilter { * ``` */ static filter(snapshot: Snapshot, options: FilterOptions = {}): Element[] { - let elements = [...snapshot.elements]; + // Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context + let elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED'); // Apply filters if (options.minImportance !== undefined) { diff --git a/src/utils/llm-interaction-handler.ts b/src/utils/llm-interaction-handler.ts index 8983ed44..9017d166 100644 --- a/src/utils/llm-interaction-handler.ts +++ b/src/utils/llm-interaction-handler.ts @@ -20,14 +20,20 @@ export class LLMInteractionHandler { /** * Build context string from snapshot for LLM prompt * + * Format: [ID] "text" {cues} @ (x,y) size:WxH importance:score [status] + * * @param snap - Snapshot containing elements - * @param goal - Goal/task description + * @param goal - Goal/task description (unused but kept for API consistency) * @returns Formatted context string */ buildContext(snap: Snapshot, _goal: string): string { const lines: string[] = []; for (const el of snap.elements) { + // Skip REMOVED elements - they're not actionable and shouldn't be in LLM context + if (el.diff_status === 'REMOVED') { + continue; + } // Extract visual cues const cues: string[] = []; if (el.visual_cues.is_primary) cues.push('PRIMARY'); @@ -36,14 +42,44 @@ export class LLMInteractionHandler { cues.push(`color:${el.visual_cues.background_color_name}`); } - // Format element line + // Format element line with improved readability const cuesStr = cues.length > 0 ? ` {${cues.join(',')}}` : ''; - const text = el.text || ''; - const textPreview = text.length > 50 ? text.substring(0, 50) + '...' : text; + // Better text handling - show truncation indicator + let textPreview = ''; + if (el.text) { + if (el.text.length > 50) { + textPreview = `"${el.text.substring(0, 50)}..."`; + } else { + textPreview = `"${el.text}"`; + } + } + + // Build position and size info + const x = Math.floor(el.bbox.x); + const y = Math.floor(el.bbox.y); + const width = Math.floor(el.bbox.width); + const height = Math.floor(el.bbox.height); + const positionStr = `@ (${x},${y})`; + const sizeStr = `size:${width}x${height}`; + + // Build status indicators (only include if relevant) + const statusParts: string[] = []; + if (!el.in_viewport) { + statusParts.push('not_in_viewport'); + } + if (el.is_occluded) { + statusParts.push('occluded'); + } + if (el.diff_status) { + statusParts.push(`diff:${el.diff_status}`); + } + const statusStr = statusParts.length > 0 ? ` [${statusParts.join(',')}]` : ''; + + // Format: [ID] "text" {cues} @ (x,y) size:WxH importance:score [status] lines.push( - `[${el.id}] <${el.role}> "${textPreview}"${cuesStr} ` + - `@ (${Math.floor(el.bbox.x)},${Math.floor(el.bbox.y)}) (Imp:${el.importance})` + `[${el.id}] <${el.role}> ${textPreview}${cuesStr} ` + + `${positionStr} ${sizeStr} importance:${el.importance}${statusStr}` ); } @@ -59,23 +95,60 @@ export class LLMInteractionHandler { */ async queryLLM(domContext: string, goal: string): Promise { const systemPrompt = `You are an AI web automation agent. -Your job is to analyze the current page state and decide the next action to take. - -Available actions: -- CLICK(id) - Click element with ID -- TYPE(id, "text") - Type text into element with ID -- PRESS("key") - Press keyboard key (e.g., "Enter", "Escape", "Tab") -- FINISH() - Task is complete - -Format your response as a single action command on one line. -Example: CLICK(42) or TYPE(5, "search query") or PRESS("Enter")`; - const userPrompt = `Goal: ${goal} +GOAL: ${goal} -Current page elements: +VISIBLE ELEMENTS (sorted by importance): ${domContext} -What action should I take next? Respond with only the action command (e.g., CLICK(42)).`; +VISUAL CUES EXPLAINED: +After the text, you may see visual cues in curly braces like {CLICKABLE} or {PRIMARY,CLICKABLE,color:white}: +- PRIMARY: Main call-to-action element on the page +- CLICKABLE: Element is clickable/interactive +- color:X: Background color name (e.g., color:white, color:blue) +Multiple cues are comma-separated inside the braces: {CLICKABLE,color:white} + +ELEMENT FORMAT EXPLAINED: +Each element line follows this format: +[ID] "text" {cues} @ (x,y) size:WxH importance:score [status] + +Example: [346]