Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ai-sidebar-extension",
"version": "0.6.1",
"version": "0.6.2",
"description": "Chrome extension AI sidebar with LLM providers and MCP support",
"private": true,
"type": "module",
Expand Down
60 changes: 60 additions & 0 deletions src/background/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,66 @@ chrome.runtime.onMessage.addListener((request: ExtensionMessage, sender, sendRes
return true; // Now async to support getBoundTabIdForSidebar
}

case 'EVAL_JUDGE': {
  // Eval judge: ask an LLM to verify the post-conditions of an eval scenario,
  // then reply with a normalized { verdict, score, reasoning } result.
  (async () => {
    try {
      const { agentId, prompt, assistantResponse, toolCalls, postConditions } = request;

      const systemPrompt = `You are an evaluation judge. Given a user prompt, assistant response, tool calls made, and expected post-conditions, determine whether the post-conditions were met.

Respond ONLY with a JSON object (no markdown fences, no extra text):
{"verdict": "pass" or "fail", "score": 0.0 to 1.0, "reasoning": "brief explanation"}`;

      // toolCalls may be absent on the message; guard before mapping so a
      // scenario with no tool activity doesn't crash the judge.
      const toolCallsSummary = (toolCalls ?? [])
        .map(
          (tc: { toolName: string; input: unknown; output: unknown; status: string }) =>
            `- ${tc.toolName}(${JSON.stringify(tc.input)}) → ${tc.status}: ${JSON.stringify(tc.output)}`
        )
        .join('\n');

      const userMsg = `## User Prompt
${prompt}

## Assistant Response
${assistantResponse}

## Tool Calls
${toolCallsSummary || '(none)'}

## Post-Conditions to Verify
${postConditions}`;

      // temperature 0: the judge should be as deterministic as the provider allows.
      const responseText = await aiClient.generateTextForAgent(agentId, systemPrompt, userMsg, {
        temperature: 0,
      });

      // Parse JSON from response (strip markdown fences if present).
      const cleaned = responseText
        .replace(/```(?:json)?\s*/g, '')
        .replace(/```\s*/g, '')
        .trim();
      const parsed = JSON.parse(cleaned);

      // Normalize the verdict: anything other than an explicit "pass" counts as
      // "fail", so unexpected judge output can never be mistaken for success.
      const verdict = parsed.verdict === 'pass' ? 'pass' : 'fail';
      // Clamp the score into [0, 1]; NaN/Infinity or a non-number falls back to 0.
      const rawScore =
        typeof parsed.score === 'number' && Number.isFinite(parsed.score) ? parsed.score : 0;
      const score = Math.min(1, Math.max(0, rawScore));

      sendResponse({
        success: true,
        verdict,
        score,
        reasoning: parsed.reasoning || '',
      });
    } catch (error) {
      log.error('[Background] EVAL_JUDGE error:', error);
      // success: true is deliberate here — a judge failure is reported as a
      // scored "fail" result rather than a transport error, so the eval runner
      // still records an outcome for the scenario.
      sendResponse({
        success: true,
        score: 0,
        verdict: 'fail',
        // The catch covers agent lookup, the LLM call, AND JSON parsing — keep
        // the message generic instead of claiming a parse failure.
        reasoning: `Judge error: ${error instanceof Error ? error.message : 'Unknown error'}`,
      });
    }
  })();
  return true; // Keep the message channel open for the async sendResponse.
}

default:
log.debug('Unknown message type:', request.type);
return false; // No async response needed
Expand Down
39 changes: 38 additions & 1 deletion src/lib/ai/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import log from '../logger';
import { createOpenAI } from '@ai-sdk/openai';
import { createAnthropic } from '@ai-sdk/anthropic';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
import { streamText, type LanguageModel, CoreMessage, type JSONValue } from 'ai';
import { streamText, generateText, type LanguageModel, CoreMessage, type JSONValue } from 'ai';
import type { AgentConfig } from '../storage/config';
import type { AIProvider, ToolCall } from '../../types';
import { ConfigStorage } from '../storage/config';
Expand Down Expand Up @@ -758,6 +758,43 @@ export class AIClient {
this.abortController?.abort();
}

/**
* Non-streaming text generation for a specific agent.
* Used for eval judge and other non-interactive use cases.
*/
/**
 * Generate a single, non-streaming completion for the given agent.
 * Intended for non-interactive flows (e.g. the eval judge) where the full
 * response text is needed at once.
 *
 * @param agentId - ID of the configured agent to run the request against.
 * @param systemPrompt - System instructions for the model.
 * @param userMessage - Content of the single user-role message.
 * @param options - Optional generation settings; temperature defaults to 0.
 * @returns The model's complete text response.
 * @throws If the agent is unknown, or has neither an API key nor an endpoint.
 */
async generateTextForAgent(
  agentId: string,
  systemPrompt: string,
  userMessage: string,
  options?: { temperature?: number }
): Promise<string> {
  const agent = await this.configStorage.getAgent(agentId);
  if (!agent) {
    throw new Error(`Agent ${agentId} not found`);
  }
  if (!agent.apiKey && !agent.endpoint) {
    throw new Error(`No API key or endpoint configured for agent "${agent.name}"`);
  }

  // Build the provider-specific model instance for this agent.
  const model = this.createProviderForAgent(agent)();

  const messages: CoreMessage[] = [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: userMessage },
  ];

  const { text } = await generateText({
    model,
    messages,
    temperature: options?.temperature ?? 0,
    maxRetries: 2,
  });

  return text;
}

/**
* Test connection with provided agent details (for new agents before saving)
*/
Expand Down
6 changes: 6 additions & 0 deletions src/lib/commands/builtins.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,5 +317,11 @@ export function createBuiltinCommands(
// Clears the current chat transcript.
clear: () => {
  clearChat();
},

eval: () => {
  // Not a no-op: broadcasts 'start-eval' so listeners can react. The real
  // command handling is intercepted in the sidebar's handleSendMessage;
  // the command is registered here so /help lists it.
  window.dispatchEvent(new CustomEvent('start-eval'));
},
};
}
Loading