diff --git a/package.json b/package.json index 2b2fd8e..68ea274 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ai-sidebar-extension", - "version": "0.6.1", + "version": "0.6.2", "description": "Chrome extension AI sidebar with LLM providers and MCP support", "private": true, "type": "module", diff --git a/src/background/index.ts b/src/background/index.ts index f389761..531a182 100644 --- a/src/background/index.ts +++ b/src/background/index.ts @@ -537,6 +537,66 @@ chrome.runtime.onMessage.addListener((request: ExtensionMessage, sender, sendRes return true; // Now async to support getBoundTabIdForSidebar } + case 'EVAL_JUDGE': { + // Eval judge: use LLM to verify post-conditions of an eval scenario + (async () => { + try { + const { agentId, prompt, assistantResponse, toolCalls, postConditions } = request; + + const systemPrompt = `You are an evaluation judge. Given a user prompt, assistant response, tool calls made, and expected post-conditions, determine whether the post-conditions were met. 
+ +Respond ONLY with a JSON object (no markdown fences, no extra text): +{"verdict": "pass" or "fail", "score": 0.0 to 1.0, "reasoning": "brief explanation"}`; + + const toolCallsSummary = toolCalls + .map( + (tc: { toolName: string; input: unknown; output: unknown; status: string }) => + `- ${tc.toolName}(${JSON.stringify(tc.input)}) → ${tc.status}: ${JSON.stringify(tc.output)}` + ) + .join('\n'); + + const userMsg = `## User Prompt +${prompt} + +## Assistant Response +${assistantResponse} + +## Tool Calls +${toolCallsSummary || '(none)'} + +## Post-Conditions to Verify +${postConditions}`; + + const responseText = await aiClient.generateTextForAgent(agentId, systemPrompt, userMsg, { + temperature: 0, + }); + + // Parse JSON from response (strip markdown fences if present) + const cleaned = responseText + .replace(/```(?:json)?\s*/g, '') + .replace(/```\s*/g, '') + .trim(); + const parsed = JSON.parse(cleaned); + + sendResponse({ + success: true, + verdict: parsed.verdict || 'fail', + score: typeof parsed.score === 'number' ? parsed.score : 0, + reasoning: parsed.reasoning || '', + }); + } catch (error) { + log.error('[Background] EVAL_JUDGE error:', error); + sendResponse({ + success: true, + score: 0, + verdict: 'fail', + reasoning: `Failed to parse judge response: ${error instanceof Error ? 
error.message : 'Unknown error'}`, + }); + } + })(); + return true; + } + default: log.debug('Unknown message type:', request.type); return false; // No async response needed diff --git a/src/lib/ai/client.ts b/src/lib/ai/client.ts index d2dc693..afd53c9 100644 --- a/src/lib/ai/client.ts +++ b/src/lib/ai/client.ts @@ -9,7 +9,7 @@ import log from '../logger'; import { createOpenAI } from '@ai-sdk/openai'; import { createAnthropic } from '@ai-sdk/anthropic'; import { createGoogleGenerativeAI } from '@ai-sdk/google'; -import { streamText, type LanguageModel, CoreMessage, type JSONValue } from 'ai'; +import { streamText, generateText, type LanguageModel, CoreMessage, type JSONValue } from 'ai'; import type { AgentConfig } from '../storage/config'; import type { AIProvider, ToolCall } from '../../types'; import { ConfigStorage } from '../storage/config'; @@ -758,6 +758,43 @@ export class AIClient { this.abortController?.abort(); } + /** + * Non-streaming text generation for a specific agent. + * Used for eval judge and other non-interactive use cases. + */ + async generateTextForAgent( + agentId: string, + systemPrompt: string, + userMessage: string, + options?: { temperature?: number } + ): Promise { + const agent = await this.configStorage.getAgent(agentId); + if (!agent) { + throw new Error(`Agent ${agentId} not found`); + } + + if (!agent.apiKey && !agent.endpoint) { + throw new Error(`No API key or endpoint configured for agent "${agent.name}"`); + } + + const modelFactory = this.createProviderForAgent(agent); + const model = modelFactory(); + + const messages: CoreMessage[] = [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userMessage }, + ]; + + const result = await generateText({ + model, + messages, + temperature: options?.temperature ?? 
0, + maxRetries: 2, + }); + + return result.text; + } + /** * Test connection with provided agent details (for new agents before saving) */ diff --git a/src/lib/commands/builtins.ts b/src/lib/commands/builtins.ts index d24431e..05e47e0 100644 --- a/src/lib/commands/builtins.ts +++ b/src/lib/commands/builtins.ts @@ -317,5 +317,11 @@ export function createBuiltinCommands( clear: () => { clearChat(); }, + + eval: () => { + // No-op: actual handling is intercepted in sidebar's handleSendMessage. + // Registered here so /help lists it. + window.dispatchEvent(new CustomEvent('start-eval')); + }, }; } diff --git a/src/lib/eval/runner.ts b/src/lib/eval/runner.ts new file mode 100644 index 0000000..0cc613c --- /dev/null +++ b/src/lib/eval/runner.ts @@ -0,0 +1,350 @@ +/** + * EvalRunner orchestrates batch evaluation of an eval suite. + * Runs each scenario sequentially: navigate → clear → prompt → extract → score → judge. + */ + +import type { ChatMessage, ToolCall } from '../../types'; +import type { + EvalSuite, + EvalScenario, + EvalScenarioResult, + EvalSuiteResult, + EvalProgress, + EvalPhase, + JudgeScore, +} from './types'; +import { scoreToolCalls, combinedScore } from './scoring'; + +const DEFAULT_TIMEOUT_MS = 60_000; +const NAVIGATION_TIMEOUT_MS = 15_000; +const SETTLE_MS = 200; + +/** + * Interface for sidebar internals to avoid circular deps. + * The sidebar creates an adapter implementing this. + */ +export interface SidebarInterface { + attachedTabId: number | null; + currentAgentId: string | null; + getMessageHistory(): ChatMessage[]; + clearConversation(): void; + sendMessage(text: string): Promise; +} + +export class EvalRunner { + private sidebar: SidebarInterface; + private aborted = false; + private onProgress: ((progress: EvalProgress) => void) | null = null; + + constructor(sidebar: SidebarInterface) { + this.sidebar = sidebar; + } + + /** + * Set progress callback (called after each scenario completes). 
+ */ + setProgressCallback(cb: (progress: EvalProgress) => void): void { + this.onProgress = cb; + } + + /** + * Abort the current eval run. + */ + abort(): void { + this.aborted = true; + } + + /** + * Run all scenarios in the suite sequentially. + */ + async run(suite: EvalSuite): Promise { + this.aborted = false; + const startTime = Date.now(); + const scenarioResults: EvalScenarioResult[] = []; + + for (let i = 0; i < suite.scenarios.length; i++) { + if (this.aborted) break; + + const scenario = suite.scenarios[i]; + this.emitProgress('loading', i, suite.scenarios.length, scenario.id, scenarioResults); + + const result = await this.runScenario(scenario, suite, i); + scenarioResults.push(result); + + this.emitProgress('scoring', i + 1, suite.scenarios.length, scenario.id, scenarioResults); + } + + const endTime = Date.now(); + this.emitProgress( + 'complete', + suite.scenarios.length, + suite.scenarios.length, + '', + scenarioResults + ); + + return this.buildSuiteResult(suite.name, startTime, endTime, scenarioResults); + } + + private async runScenario( + scenario: EvalScenario, + suite: EvalSuite, + index: number + ): Promise { + const scenarioStart = Date.now(); + const timeoutMs = scenario.timeoutMs || DEFAULT_TIMEOUT_MS; + + try { + return await this.withTimeout( + this.executeScenario(scenario, suite, index), + timeoutMs, + `Scenario "${scenario.id}" timed out after ${timeoutMs}ms` + ); + } catch (error) { + return { + scenarioId: scenario.id, + prompt: scenario.prompt, + status: 'error', + durationMs: Date.now() - scenarioStart, + toolCallScore: { + score: 0, + expectedCalled: [], + expectedMissed: [], + forbiddenCalled: [], + unexpectedCalls: [], + }, + judgeScore: null, + combinedScore: 0, + actualToolCalls: [], + assistantResponse: '', + error: error instanceof Error ? 
error.message : String(error), + }; + } + } + + private async executeScenario( + scenario: EvalScenario, + suite: EvalSuite, + index: number + ): Promise { + const scenarioStart = Date.now(); + + // 1. Navigate if startPage is set + if (scenario.startPage && this.sidebar.attachedTabId) { + this.emitProgress('navigating', index, suite.scenarios.length, scenario.id, []); + const baseUrl = suite.baseUrl || ''; + const fullUrl = scenario.startPage.startsWith('http') + ? scenario.startPage + : baseUrl + scenario.startPage; + + await this.navigateTab(this.sidebar.attachedTabId, fullUrl); + } + + // 2. Clear conversation + this.emitProgress('clearing', index, suite.scenarios.length, scenario.id, []); + this.sidebar.clearConversation(); + await this.sleep(SETTLE_MS); + + // 3. Send prompt and wait for completion + this.emitProgress('prompting', index, suite.scenarios.length, scenario.id, []); + await this.sidebar.sendMessage(scenario.prompt); + + // 4. Extract results from message history + const history = this.sidebar.getMessageHistory(); + const lastAssistant = [...history].reverse().find((m) => m.role === 'assistant'); + const assistantResponse = + typeof lastAssistant?.content === 'string' + ? lastAssistant.content + : lastAssistant?.content + ?.filter((p) => p.type === 'text') + .map((p) => p.text) + .join('') || ''; + + // Collect all tool calls from assistant messages + const allToolCalls: ToolCall[] = []; + for (const msg of history) { + if (msg.role === 'assistant' && msg.toolCalls) { + allToolCalls.push(...msg.toolCalls); + } + } + + const actualToolCallNames = allToolCalls.map((tc) => tc.toolName); + const actualToolCallDetails = allToolCalls.map((tc) => ({ + toolName: tc.toolName, + input: tc.input, + output: tc.output, + status: tc.status, + })); + + // 5. Score tool calls + const toolScore = scoreToolCalls( + actualToolCallNames, + scenario.expectations.toolCalls, + scenario.expectations.forbiddenToolCalls + ); + + // 6. 
Judge (if post-conditions defined) + let judgeResult: JudgeScore | null = null; + if (scenario.expectations.postConditions && this.sidebar.currentAgentId) { + this.emitProgress('judging', index, suite.scenarios.length, scenario.id, []); + judgeResult = await this.runJudge( + scenario.prompt, + assistantResponse, + actualToolCallDetails, + scenario.expectations.postConditions + ); + } + + // 7. Compute combined score + const combined = combinedScore(toolScore, judgeResult); + const status = combined >= 0.5 ? 'pass' : 'fail'; + + return { + scenarioId: scenario.id, + prompt: scenario.prompt, + status, + durationMs: Date.now() - scenarioStart, + toolCallScore: toolScore, + judgeScore: judgeResult, + combinedScore: combined, + actualToolCalls: actualToolCallDetails, + assistantResponse, + }; + } + + private async navigateTab(tabId: number, url: string): Promise { + await chrome.tabs.update(tabId, { url }); + + // Wait for WEBMCP_TOOLS_CHANGED or fall back after webNavigation completes + await new Promise((resolve) => { + let resolved = false; + const done = () => { + if (!resolved) { + resolved = true; + chrome.runtime.onMessage.removeListener(onMessage); + resolve(); + } + }; + + // Listen for tools changed signal (ideal) + const onMessage = (msg: { type: string; tabId?: number }) => { + if (msg.type === 'WEBMCP_TOOLS_CHANGED' && msg.tabId === tabId) { + done(); + } + }; + chrome.runtime.onMessage.addListener(onMessage); + + // Fallback: wait for tab to finish loading + 3s settle + chrome.tabs.onUpdated.addListener(function onUpdated( + updatedTabId: number, + changeInfo: chrome.tabs.TabChangeInfo + ) { + if (updatedTabId === tabId && changeInfo.status === 'complete') { + chrome.tabs.onUpdated.removeListener(onUpdated); + setTimeout(done, 3000); + } + }); + + // Hard timeout + setTimeout(done, NAVIGATION_TIMEOUT_MS); + }); + } + + private async runJudge( + prompt: string, + assistantResponse: string, + toolCalls: Array<{ toolName: string; input: unknown; output: 
unknown; status: string }>, + postConditions: string + ): Promise { + try { + const response = await chrome.runtime.sendMessage({ + type: 'EVAL_JUDGE', + agentId: this.sidebar.currentAgentId, + prompt, + assistantResponse, + toolCalls, + postConditions, + }); + + if (response?.success) { + return { + score: typeof response.score === 'number' ? response.score : 0, + verdict: response.verdict || 'fail', + reasoning: response.reasoning || '', + }; + } + + return { score: 0, verdict: 'fail', reasoning: 'Judge call failed' }; + } catch (error) { + return { + score: 0, + verdict: 'fail', + reasoning: `Judge error: ${error instanceof Error ? error.message : String(error)}`, + }; + } + } + + private buildSuiteResult( + suiteName: string, + startTime: number, + endTime: number, + scenarios: EvalScenarioResult[] + ): EvalSuiteResult { + const passed = scenarios.filter((s) => s.status === 'pass').length; + const failed = scenarios.filter((s) => s.status === 'fail').length; + const errored = scenarios.filter((s) => s.status === 'error').length; + + const toolScores = scenarios.map((s) => s.toolCallScore.score); + const judgeScores = scenarios + .filter((s): s is EvalScenarioResult & { judgeScore: JudgeScore } => s.judgeScore !== null) + .map((s) => s.judgeScore.score); + const combinedScores = scenarios.map((s) => s.combinedScore); + + const avg = (arr: number[]) => + arr.length > 0 ? 
arr.reduce((a, b) => a + b, 0) / arr.length : 0; + + return { + suiteName, + startTime, + endTime, + totalDurationMs: endTime - startTime, + scenarios, + summary: { + total: scenarios.length, + passed, + failed, + errored, + avgToolCallScore: Math.round(avg(toolScores) * 1000) / 1000, + avgJudgeScore: Math.round(avg(judgeScores) * 1000) / 1000, + avgCombinedScore: Math.round(avg(combinedScores) * 1000) / 1000, + }, + }; + } + + private emitProgress( + phase: EvalPhase, + currentIndex: number, + total: number, + scenarioId: string, + results: EvalScenarioResult[] + ): void { + this.onProgress?.({ + phase, + currentScenarioIndex: currentIndex, + totalScenarios: total, + currentScenarioId: scenarioId, + scenarioResults: [...results], + }); + } + + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); + } + + private withTimeout(promise: Promise, ms: number, message: string): Promise { + return Promise.race([ + promise, + new Promise((_, reject) => setTimeout(() => reject(new Error(message)), ms)), + ]); + } +} diff --git a/src/lib/eval/scoring.ts b/src/lib/eval/scoring.ts new file mode 100644 index 0000000..6e556a6 --- /dev/null +++ b/src/lib/eval/scoring.ts @@ -0,0 +1,82 @@ +/** + * Pure scoring functions for eval scenarios. + * No side effects — suitable for unit testing. + */ + +import type { ToolCallScore, JudgeScore } from './types'; + +/** + * Score tool call accuracy against expectations. + * + * Score = (expected tools called / total expected) minus 0.5 penalty if any forbidden tool called. + * Empty expectations = 1.0 (vacuously true). 
+ */ +export function scoreToolCalls( + actualCalls: string[], + expectedToolCalls?: string[], + forbiddenToolCalls?: string[] +): ToolCallScore { + const actualSet = new Set(actualCalls); + + const expectedCalled: string[] = []; + const expectedMissed: string[] = []; + + if (expectedToolCalls && expectedToolCalls.length > 0) { + for (const expected of expectedToolCalls) { + if (actualSet.has(expected)) { + expectedCalled.push(expected); + } else { + expectedMissed.push(expected); + } + } + } + + const forbiddenCalled: string[] = []; + if (forbiddenToolCalls) { + for (const forbidden of forbiddenToolCalls) { + if (actualSet.has(forbidden)) { + forbiddenCalled.push(forbidden); + } + } + } + + // Unexpected = actual calls not in expected (informational, no penalty) + const expectedSet = new Set(expectedToolCalls || []); + const unexpectedCalls = actualCalls.filter((c) => !expectedSet.has(c)); + + // Calculate score + let score: number; + if (!expectedToolCalls || expectedToolCalls.length === 0) { + // No expectations → vacuously true + score = 1.0; + } else { + score = expectedCalled.length / expectedToolCalls.length; + } + + // Apply forbidden penalty + if (forbiddenCalled.length > 0) { + score = Math.max(0, score - 0.5); + } + + return { + score: Math.round(score * 1000) / 1000, + expectedCalled, + expectedMissed, + forbiddenCalled, + unexpectedCalls, + }; +} + +/** + * Combine tool call score and judge score into a single number. + * + * Both present: 40% tool calls + 60% judge. + * Only one present: 100% of that score. 
+ */ +export function combinedScore(toolCallScore: ToolCallScore, judgeScore: JudgeScore | null): number { + if (judgeScore !== null) { + const combined = 0.4 * toolCallScore.score + 0.6 * judgeScore.score; + return Math.round(combined * 1000) / 1000; + } + return toolCallScore.score; +} diff --git a/src/lib/eval/types.ts b/src/lib/eval/types.ts new file mode 100644 index 0000000..f7622a9 --- /dev/null +++ b/src/lib/eval/types.ts @@ -0,0 +1,102 @@ +/** + * Type definitions for the eval system. + * Eval suites test whether tool definitions lead to correct agent behavior. + */ + +// --- Suite & Scenario Definitions --- + +export interface EvalSuite { + name: string; + description?: string; + baseUrl?: string; + scenarios: EvalScenario[]; +} + +export interface EvalScenario { + id: string; + prompt: string; + startPage?: string; + expectations: EvalExpectations; + tags?: string[]; + timeoutMs?: number; +} + +export interface EvalExpectations { + toolCalls?: string[]; + forbiddenToolCalls?: string[]; + postConditions?: string; +} + +export interface StoredEvalSuite extends EvalSuite { + id: string; + fileName?: string; + importedAt: number; +} + +// --- Scoring --- + +export interface ToolCallScore { + score: number; + expectedCalled: string[]; + expectedMissed: string[]; + forbiddenCalled: string[]; + unexpectedCalls: string[]; +} + +export interface JudgeScore { + score: number; + verdict: string; + reasoning: string; +} + +// --- Results --- + +export interface EvalScenarioResult { + scenarioId: string; + prompt: string; + status: 'pass' | 'fail' | 'error'; + durationMs: number; + toolCallScore: ToolCallScore; + judgeScore: JudgeScore | null; + combinedScore: number; + actualToolCalls: Array<{ toolName: string; input: unknown; output: unknown; status: string }>; + assistantResponse: string; + error?: string; +} + +export interface EvalSuiteResult { + suiteName: string; + startTime: number; + endTime: number; + totalDurationMs: number; + scenarios: 
EvalScenarioResult[]; + summary: { + total: number; + passed: number; + failed: number; + errored: number; + avgToolCallScore: number; + avgJudgeScore: number; + avgCombinedScore: number; + }; +} + +// --- Progress Tracking --- + +export type EvalPhase = + | 'loading' + | 'navigating' + | 'clearing' + | 'prompting' + | 'judging' + | 'scoring' + | 'complete' + | 'error'; + +export interface EvalProgress { + phase: EvalPhase; + currentScenarioIndex: number; + totalScenarios: number; + currentScenarioId: string; + scenarioResults: EvalScenarioResult[]; +} diff --git a/src/lib/storage/config.ts b/src/lib/storage/config.ts index fb0c053..497a884 100644 --- a/src/lib/storage/config.ts +++ b/src/lib/storage/config.ts @@ -3,6 +3,8 @@ * Agent-centric configuration for AI assistants */ +import type { StoredEvalSuite, EvalSuite } from '../eval/types'; + export type AIProvider = 'openai' | 'anthropic' | 'google'; export interface ReasoningConfig { @@ -45,6 +47,7 @@ export interface StorageConfig { mcpConfig?: MCPConfig; userScripts?: UserScript[]; // WebMCP user-defined tool scripts builtinScripts?: BuiltinScript[]; // Built-in tool state (only stores user overrides) + evalSuites?: StoredEvalSuite[]; // Imported eval suite definitions logLevel?: string; // Global log level: 'trace' | 'debug' | 'info' | 'warn' | 'error' | 'silent' } @@ -404,4 +407,35 @@ export class ConfigStorage { await this.set({ builtinScripts: scripts }); } + + // Eval Suite management methods + async getEvalSuites(): Promise { + const config = await this.get(); + return config.evalSuites || []; + } + + async addEvalSuite(suite: EvalSuite, fileName?: string): Promise { + const suites = await this.getEvalSuites(); + const id = globalThis.crypto.randomUUID(); + const stored: StoredEvalSuite = { + ...suite, + id, + fileName, + importedAt: Date.now(), + }; + + await this.set({ evalSuites: [...suites, stored] }); + return id; + } + + async deleteEvalSuite(id: string): Promise { + const suites = await 
this.getEvalSuites(); + const filtered = suites.filter((s) => s.id !== id); + + if (filtered.length === suites.length) { + throw new Error(`Eval suite ${id} not found`); + } + + await this.set({ evalSuites: filtered }); + } } diff --git a/src/options/eval-suites.ts b/src/options/eval-suites.ts new file mode 100644 index 0000000..dc794b5 --- /dev/null +++ b/src/options/eval-suites.ts @@ -0,0 +1,253 @@ +/** + * Eval Suites management for the options page. + * Handles importing, displaying, and deleting eval suite JSON files. + */ + +import log from '../lib/logger'; +import { ConfigStorage } from '../lib/storage/config'; +import type { EvalSuite, StoredEvalSuite } from '../lib/eval/types'; + +const configStorage = ConfigStorage.getInstance(); + +/** + * Initialize eval suites UI + */ +export async function initEvalSuites(): Promise { + const importBtn = document.getElementById('import-evalsuite'); + const fileInput = document.getElementById('import-evalsuite-file') as HTMLInputElement; + + if (!importBtn || !fileInput) { + log.warn('[EvalSuites] UI elements not found, skipping initialization'); + return; + } + + importBtn.addEventListener('click', () => { + fileInput.click(); + }); + + fileInput.addEventListener('change', async (event) => { + const input = event.target as HTMLInputElement; + const file = input.files?.[0]; + + if (file) { + await importEvalSuite(file); + input.value = ''; + } + }); + + await renderEvalSuites(); +} + +/** + * Render eval suite cards from storage + */ +async function renderEvalSuites(): Promise { + const listEl = document.getElementById('evalsuites-list'); + const emptyEl = document.getElementById('no-evalsuites'); + + if (!listEl || !emptyEl) return; + + const suites = await configStorage.getEvalSuites(); + + listEl.innerHTML = ''; + + if (suites.length === 0) { + listEl.classList.add('hidden'); + emptyEl.classList.remove('hidden'); + return; + } + + listEl.classList.remove('hidden'); + emptyEl.classList.add('hidden'); + + for (const 
suite of suites) { + listEl.appendChild(createSuiteCard(suite)); + } +} + +/** + * Create a card element for an eval suite + */ +function createSuiteCard(suite: StoredEvalSuite): HTMLElement { + const card = document.createElement('div'); + card.className = 'card card-clickable'; + card.dataset.id = suite.id; + + // Header + const header = document.createElement('div'); + header.className = 'card-header'; + + const info = document.createElement('div'); + info.className = 'card-info'; + + const title = document.createElement('div'); + title.className = 'card-title'; + title.textContent = suite.name; + info.appendChild(title); + + if (suite.description) { + const subtitle = document.createElement('div'); + subtitle.className = 'card-subtitle'; + subtitle.textContent = suite.description; + info.appendChild(subtitle); + } + + header.appendChild(info); + card.appendChild(header); + + // Details + const body = document.createElement('div'); + body.className = 'card-body'; + + const scenarioDetail = document.createElement('div'); + scenarioDetail.className = 'card-detail'; + scenarioDetail.innerHTML = `Scenarios: ${suite.scenarios.length}`; + body.appendChild(scenarioDetail); + + if (suite.baseUrl) { + const urlDetail = document.createElement('div'); + urlDetail.className = 'card-detail'; + urlDetail.innerHTML = `Base URL: ${escapeHtml(suite.baseUrl)}`; + body.appendChild(urlDetail); + } + + if (suite.fileName) { + const fileDetail = document.createElement('div'); + fileDetail.className = 'card-detail'; + fileDetail.innerHTML = `File: ${escapeHtml(suite.fileName)}`; + body.appendChild(fileDetail); + } + + const dateDetail = document.createElement('div'); + dateDetail.className = 'card-detail'; + dateDetail.innerHTML = `Imported: ${new Date(suite.importedAt).toLocaleDateString()}`; + body.appendChild(dateDetail); + + // Tags from all scenarios + const allTags = new Set(); + for (const scenario of suite.scenarios) { + if (scenario.tags) { + for (const tag of scenario.tags) 
{ + allTags.add(tag); + } + } + } + if (allTags.size > 0) { + const tagsDetail = document.createElement('div'); + tagsDetail.className = 'card-detail'; + tagsDetail.innerHTML = `Tags: ${[...allTags].map((t) => `${escapeHtml(t)}`).join(' ')}`; + body.appendChild(tagsDetail); + } + + card.appendChild(body); + + // Delete button + const actions = document.createElement('div'); + actions.className = 'card-header-actions'; + const deleteBtn = document.createElement('button'); + deleteBtn.className = 'button button-danger button-small'; + deleteBtn.textContent = 'Delete'; + deleteBtn.addEventListener('click', async (e) => { + e.stopPropagation(); + if (window.confirm(`Delete eval suite "${suite.name}"?`)) { + await deleteEvalSuite(suite.id); + } + }); + actions.appendChild(deleteBtn); + header.appendChild(actions); + + return card; +} + +/** + * Import an eval suite from a JSON file + */ +async function importEvalSuite(file: File): Promise { + try { + const text = await readFile(file); + const parsed = JSON.parse(text); + + // Validate structure + if (!parsed.name || typeof parsed.name !== 'string') { + throw new Error('Suite must have a "name" field'); + } + if (!Array.isArray(parsed.scenarios) || parsed.scenarios.length === 0) { + throw new Error('Suite must have at least one scenario'); + } + + for (const scenario of parsed.scenarios) { + if (!scenario.id || !scenario.prompt) { + throw new Error(`Each scenario must have "id" and "prompt" fields`); + } + if (!scenario.expectations || typeof scenario.expectations !== 'object') { + throw new Error(`Scenario "${scenario.id}" must have "expectations" object`); + } + } + + const suite: EvalSuite = { + name: parsed.name, + description: parsed.description, + baseUrl: parsed.baseUrl, + scenarios: parsed.scenarios, + }; + + await configStorage.addEvalSuite(suite, file.name); + showStatus( + `Imported eval suite "${suite.name}" (${suite.scenarios.length} scenarios)`, + 'success' + ); + await renderEvalSuites(); + } catch (error) 
{ + log.error('[EvalSuites] Import failed:', error); + if (error instanceof SyntaxError) { + showStatus('Invalid JSON file', 'error'); + } else if (error instanceof Error) { + showStatus(`Import failed: ${error.message}`, 'error'); + } else { + showStatus('Import failed: Unknown error', 'error'); + } + } +} + +/** + * Delete an eval suite by ID + */ +async function deleteEvalSuite(id: string): Promise { + try { + await configStorage.deleteEvalSuite(id); + showStatus('Eval suite deleted', 'success'); + await renderEvalSuites(); + } catch (error) { + log.error('[EvalSuites] Delete failed:', error); + showStatus('Failed to delete eval suite', 'error'); + } +} + +function readFile(file: File): Promise { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve(reader.result as string); + reader.onerror = () => reject(new Error('Failed to read file')); + reader.readAsText(file); + }); +} + +function escapeHtml(text: string): string { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} + +function showStatus(message: string, type: 'success' | 'error' | 'info'): void { + const statusEl = document.getElementById('status-message'); + if (!statusEl) return; + + statusEl.textContent = message; + statusEl.className = `status-message ${type}`; + statusEl.style.display = 'block'; + + const delay = type === 'error' ? 5000 : 3000; + setTimeout(() => { + statusEl.style.display = 'none'; + }, delay); +} diff --git a/src/options/index.html b/src/options/index.html index 31efdcd..144f3a5 100644 --- a/src/options/index.html +++ b/src/options/index.html @@ -96,6 +96,26 @@

Server Status

+ +
+
+

Eval Suites

+ +
+ +
+ +
+ + +
+ +

Settings

diff --git a/src/options/index.ts b/src/options/index.ts index 2efbd04..0d9a4c1 100644 --- a/src/options/index.ts +++ b/src/options/index.ts @@ -18,6 +18,7 @@ import { initializeWebMCPScripts } from './webmcp-scripts'; import { initializeCommands } from './commands'; import { openModal, closeModal, setupBackdropHandler } from './modal-manager'; import { initializeBackupRestore } from './backup-restore'; +import { initEvalSuites } from './eval-suites'; import { createCard, setupModalFooter, @@ -41,6 +42,7 @@ document.addEventListener('DOMContentLoaded', async () => { await initializeWebMCPScripts(); await initializeCommands(); await initializeBackupRestore(); + await initEvalSuites(); }); async function renderAgents() { diff --git a/src/options/styles.css b/src/options/styles.css index 1fc5fe2..bcea2c3 100644 --- a/src/options/styles.css +++ b/src/options/styles.css @@ -47,7 +47,8 @@ header h1 { .agents-section, .commands-section, .webmcp-section, -.mcp-section { +.mcp-section, +.eval-section { background: white; border-radius: 12px; padding: 24px; @@ -58,7 +59,8 @@ header h1 { .agents-header, .commands-header, .webmcp-header, -.mcp-header { +.mcp-header, +.eval-header { display: flex; justify-content: space-between; align-items: center; @@ -70,6 +72,7 @@ header h1 { .commands-header h2, .webmcp-header h2, .mcp-header h2, +.eval-header h2, .settings-section h2 { font-size: 20px; font-weight: 600; @@ -77,10 +80,11 @@ header h1 { margin: 0; } -/* Grid lists for agents, commands, and scripts */ +/* Grid lists for agents, commands, scripts, and eval suites */ .agents-list, .commands-list, -.scripts-list { +.scripts-list, +.evalsuites-list { display: grid; grid-template-columns: repeat(auto-fill, minmax(350px, 1fr)); gap: 20px; @@ -177,13 +181,15 @@ header h1 { /* Empty state - unified styling */ .no-agents, .no-commands, -.no-scripts { +.no-scripts, +.no-evalsuites { margin: 0; } .no-agents-content, .no-commands-content, -.no-scripts-content { +.no-scripts-content, 
+.no-evalsuites-content { text-align: center; padding: 48px 32px; background: #f8f9fa; @@ -193,7 +199,8 @@ header h1 { .no-agents-content h3, .no-commands-content h3, -.no-scripts-content h3 { +.no-scripts-content h3, +.no-evalsuites-content h3 { font-size: 16px; font-weight: 500; color: #6c757d; @@ -202,7 +209,8 @@ header h1 { .no-agents-content p, .no-commands-content p, -.no-scripts-content p { +.no-scripts-content p, +.no-evalsuites-content p { color: #6c757d; font-size: 14px; margin: 0; @@ -213,7 +221,8 @@ header h1 { } .no-agents-content code, -.no-commands-content code { +.no-commands-content code, +.no-evalsuites-content code { background: rgba(0, 0, 0, 0.06); padding: 2px 6px; border-radius: 3px; diff --git a/src/sidebar/EvalReportBox.ts b/src/sidebar/EvalReportBox.ts new file mode 100644 index 0000000..0462a74 --- /dev/null +++ b/src/sidebar/EvalReportBox.ts @@ -0,0 +1,231 @@ +/** + * EvalReportBox — UI component for displaying eval progress and results. + * Rendered into the #messages container in the sidebar. 
+ */ + +import type { EvalProgress, EvalScenarioResult, EvalSuiteResult } from '../lib/eval/types'; + +export class EvalReportBox { + private container: HTMLElement; + private progressSection: HTMLElement; + private progressBar: HTMLElement; + private progressFill: HTMLElement; + private progressLabel: HTMLElement; + private scenarioList: HTMLElement; + private summarySection: HTMLElement; + + constructor() { + this.container = document.createElement('div'); + this.container.className = 'eval-report-box'; + + // Progress section + this.progressSection = document.createElement('div'); + this.progressSection.className = 'eval-progress'; + + this.progressLabel = document.createElement('div'); + this.progressLabel.className = 'eval-progress-label'; + this.progressLabel.textContent = 'Starting eval...'; + this.progressSection.appendChild(this.progressLabel); + + this.progressBar = document.createElement('div'); + this.progressBar.className = 'eval-progress-bar'; + this.progressFill = document.createElement('div'); + this.progressFill.className = 'eval-progress-fill'; + this.progressBar.appendChild(this.progressFill); + this.progressSection.appendChild(this.progressBar); + + this.container.appendChild(this.progressSection); + + // Scenario list + this.scenarioList = document.createElement('div'); + this.scenarioList.className = 'eval-scenario-list'; + this.container.appendChild(this.scenarioList); + + // Summary section (hidden until complete) + this.summarySection = document.createElement('div'); + this.summarySection.className = 'eval-summary hidden'; + this.container.appendChild(this.summarySection); + } + + getElement(): HTMLElement { + return this.container; + } + + /** + * Update the progress display. + */ + updateProgress(progress: EvalProgress): void { + const { phase, currentScenarioIndex, totalScenarios, currentScenarioId, scenarioResults } = + progress; + + // Update progress bar + const pct = totalScenarios > 0 ? 
(currentScenarioIndex / totalScenarios) * 100 : 0; + this.progressFill.style.width = `${pct}%`; + + // Update label + if (phase === 'complete') { + this.progressLabel.textContent = `${totalScenarios}/${totalScenarios} — Complete`; + } else { + this.progressLabel.textContent = `${currentScenarioIndex}/${totalScenarios} — ${phase} ${currentScenarioId}`; + } + + // Append newly completed scenarios + const existingCount = this.scenarioList.children.length; + for (let i = existingCount; i < scenarioResults.length; i++) { + this.scenarioList.appendChild(this.createScenarioRow(scenarioResults[i])); + } + } + + /** + * Show final summary. + */ + showSummary(result: EvalSuiteResult): void { + this.progressFill.style.width = '100%'; + this.progressLabel.textContent = `${result.summary.total}/${result.summary.total} — Complete`; + + const { summary } = result; + + this.summarySection.classList.remove('hidden'); + this.summarySection.innerHTML = ` +
Summary — ${escapeHtml(result.suiteName)}
+
+
+ ${summary.total} + Total +
+
+ ${summary.passed} + Passed +
+
+ ${summary.failed} + Failed +
+
+ ${summary.errored} + Errors +
+
+
+
Avg Tool Score: ${(summary.avgToolCallScore * 100).toFixed(0)}%
+
Avg Judge Score: ${(summary.avgJudgeScore * 100).toFixed(0)}%
+
Avg Combined: ${(summary.avgCombinedScore * 100).toFixed(0)}%
+
Duration: ${(result.totalDurationMs / 1000).toFixed(1)}s
+
+ `; + } + + /** + * Show an error that prevented the eval from running. + */ + showError(message: string): void { + this.progressLabel.textContent = 'Error'; + this.progressFill.style.width = '0%'; + + const errorEl = document.createElement('div'); + errorEl.className = 'eval-error'; + errorEl.textContent = message; + this.container.appendChild(errorEl); + } + + private createScenarioRow(result: EvalScenarioResult): HTMLElement { + const row = document.createElement('div'); + row.className = `eval-scenario-row eval-scenario-${result.status}`; + + // Header (always visible) + const header = document.createElement('div'); + header.className = 'eval-scenario-header'; + header.style.cursor = 'pointer'; + + const badge = document.createElement('span'); + badge.className = `eval-status-badge eval-badge-${result.status}`; + badge.textContent = result.status.toUpperCase(); + header.appendChild(badge); + + // ID + prompt stacked vertically + const idBlock = document.createElement('div'); + idBlock.className = 'eval-scenario-id-block'; + + const id = document.createElement('div'); + id.className = 'eval-scenario-id'; + id.textContent = result.scenarioId; + idBlock.appendChild(id); + + const prompt = document.createElement('div'); + prompt.className = 'eval-scenario-prompt'; + prompt.textContent = result.prompt; + idBlock.appendChild(prompt); + + header.appendChild(idBlock); + + const score = document.createElement('span'); + score.className = 'eval-scenario-score'; + score.textContent = `${(result.combinedScore * 100).toFixed(0)}%`; + header.appendChild(score); + + const duration = document.createElement('span'); + duration.className = 'eval-scenario-duration'; + duration.textContent = `${(result.durationMs / 1000).toFixed(1)}s`; + header.appendChild(duration); + + const chevron = document.createElement('span'); + chevron.className = 'eval-chevron'; + chevron.innerHTML = ` + + `; + header.appendChild(chevron); + + row.appendChild(header); + + // Details (collapsible) + 
const details = document.createElement('div'); + details.className = 'eval-scenario-details'; + + // Tool calls + if (result.actualToolCalls.length > 0) { + const toolCallsHtml = result.actualToolCalls + .map((tc) => `${escapeHtml(tc.toolName)} (${escapeHtml(tc.status)})`) + .join(', '); + details.innerHTML += `
Tool Calls: ${toolCallsHtml}
`; + } else { + details.innerHTML += `
Tool Calls: none
`; + } + + // Tool score details + const ts = result.toolCallScore; + if (ts.expectedMissed.length > 0) { + details.innerHTML += `
Missing: ${ts.expectedMissed.map((t) => `${escapeHtml(t)}`).join(', ')}
`; + } + if (ts.forbiddenCalled.length > 0) { + details.innerHTML += `
Forbidden: ${ts.forbiddenCalled.map((t) => `${escapeHtml(t)}`).join(', ')}
`; + } + + // Judge verdict + if (result.judgeScore) { + const js = result.judgeScore; + details.innerHTML += `
Judge: ${escapeHtml(js.verdict)} (${(js.score * 100).toFixed(0)}%) — ${escapeHtml(js.reasoning)}
`; + } + + // Error + if (result.error) { + details.innerHTML += `
Error: ${escapeHtml(result.error)}
`; + } + + row.appendChild(details); + + // Toggle expand/collapse + let expanded = false; + header.addEventListener('click', () => { + expanded = !expanded; + row.classList.toggle('expanded', expanded); + }); + + return row; + } +} + +function escapeHtml(text: string): string { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} diff --git a/src/sidebar/index.ts b/src/sidebar/index.ts index 79c0569..7637e5c 100644 --- a/src/sidebar/index.ts +++ b/src/sidebar/index.ts @@ -12,6 +12,8 @@ import { ReasoningBox } from './ReasoningBox'; import { TextBox } from './TextBox'; import { StreamingMarkdownRenderer } from './StreamingMarkdownRenderer'; import { CommandRegistry, CommandProcessor, createBuiltinCommands } from '../lib/commands'; +import { EvalRunner, type SidebarInterface } from '../lib/eval/runner'; +import { EvalReportBox } from './EvalReportBox'; // Streaming session interface to encapsulate all streaming state interface StreamingSession { @@ -59,6 +61,9 @@ let pendingAttachments: ImageAttachment[] = []; let commandRegistry: CommandRegistry; let commandProcessor: CommandProcessor; +// Eval mode state +let activeEvalRunner: EvalRunner | null = null; + // Scroll management state let isUserAtBottom = true; // Initially at bottom @@ -532,6 +537,15 @@ async function handleSendMessage() { if (!text && pendingAttachments.length === 0) return; if (isLoading) return; + // Intercept /eval command before the command processor + if (text === '/eval' || text.startsWith('/eval ')) { + messageInput.value = ''; + messageInput.style.height = '88px'; + const suiteName = text.startsWith('/eval ') ? 
text.slice(6).trim() : undefined; + startEvalMode(suiteName || undefined); + return; + } + // Clear input immediately for better UX messageInput.value = ''; messageInput.style.height = '88px'; @@ -1204,11 +1218,19 @@ function cancelCurrentStream() { } } -// Cancel current stream on Escape OR clear attachments +// Cancel current stream on Escape OR clear attachments OR abort eval document.addEventListener('keydown', (e) => { if (e.key === 'Escape') { + // Priority 0: Abort active eval + if (activeEvalRunner) { + activeEvalRunner.abort(); + activeEvalRunner = null; + isLoading = false; + updateSendButton(); + log.info('[Sidebar] Eval run aborted'); + } // Priority 1: Cancel active stream - if (currentSession) { + else if (currentSession) { cancelCurrentStream(); } // Priority 2: Clear pending attachments @@ -1220,4 +1242,104 @@ document.addEventListener('keydown', (e) => { } }); +/** + * Start eval mode: load suite, run all scenarios, show report. + */ +async function startEvalMode(suiteName?: string): Promise<void> { + if (isLoading) { + addMessage('error', 'Cannot start eval while a message is loading.'); + return; + } + + // Load eval suites from storage + const suites = await configStorage.getEvalSuites(); + if (suites.length === 0) { + addMessage('error', 'No eval suites imported. Upload one in Settings.'); + return; + } + + // Pick suite: by name or first available + const suite = suiteName + ? suites.find((s) => s.name.toLowerCase() === suiteName.toLowerCase()) + : suites[0]; + + if (!suite) { + addMessage( + 'error', + `Eval suite "${suiteName}" not found. Available: ${suites.map((s) => s.name).join(', ')}` + ); + return; + } + + if (!currentAgentId) { + addMessage('error', 'No agent selected. 
Select an agent before running evals.'); + return; + } + + // Clear conversation and set up report UI + clearConversation(); + isLoading = true; + updateSendButton(); + + const reportBox = new EvalReportBox(); + messagesContainer.appendChild(reportBox.getElement()); + + // Create sidebar interface adapter + const sidebarAdapter: SidebarInterface = { + attachedTabId, + currentAgentId, + getMessageHistory: () => messageHistory, + clearConversation: () => { + // Clear without adding welcome message (we manage UI ourselves during eval) + messageHistory = []; + // Remove everything except the report box + const children = Array.from(messagesContainer.children); + for (const child of children) { + if (!child.classList.contains('eval-report-box')) { + child.remove(); + } + } + if (currentSession) { + currentSession.port.disconnect(); + currentSession = null; + } + isUserAtBottom = true; + }, + sendMessage: async (text: string) => { + // Create and send message, waiting for stream completion + const content = text; + const userMsg: ChatMessage = { + id: globalThis.crypto.randomUUID(), + role: 'user', + content, + timestamp: Date.now(), + }; + messageHistory.push(userMsg); + await streamAIResponse(); + }, + }; + + // Run eval + const runner = new EvalRunner(sidebarAdapter); + activeEvalRunner = runner; + + runner.setProgressCallback((progress) => { + reportBox.updateProgress(progress); + messagesContainer.scrollTop = messagesContainer.scrollHeight; + }); + + try { + const result = await runner.run(suite); + reportBox.showSummary(result); + } catch (error) { + log.error('[Sidebar] Eval run failed:', error); + reportBox.showError(error instanceof Error ? 
error.message : 'Eval run failed'); + } finally { + activeEvalRunner = null; + isLoading = false; + updateSendButton(); + messagesContainer.scrollTop = messagesContainer.scrollHeight; + } +} + export {}; diff --git a/src/sidebar/styles.css b/src/sidebar/styles.css index 2714798..6d9d136 100644 --- a/src/sidebar/styles.css +++ b/src/sidebar/styles.css @@ -1392,3 +1392,258 @@ body { .tool-item-box.expanded .chevron { transform: rotate(90deg); } + +/* ============================================ + Eval Report Box Styles + ============================================ */ + +.eval-report-box { + margin: 12px 16px; + padding: 16px; + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 12px; +} + +/* Progress */ +.eval-progress { + margin-bottom: 12px; +} + +.eval-progress-label { + font-size: 13px; + font-weight: 500; + color: var(--text-secondary); + margin-bottom: 6px; +} + +.eval-progress-bar { + height: 6px; + background: var(--bg-tertiary); + border-radius: 3px; + overflow: hidden; +} + +.eval-progress-fill { + height: 100%; + background: var(--accent-primary); + border-radius: 3px; + transition: width 0.3s ease; + width: 0%; +} + +/* Scenario list */ +.eval-scenario-list { + display: flex; + flex-direction: column; + gap: 4px; +} + +.eval-scenario-row { + border: 1px solid var(--border-color); + border-radius: 8px; + overflow: hidden; + background: var(--bg-primary); +} + +.eval-scenario-row.eval-scenario-pass { + border-left: 3px solid var(--success-color); +} + +.eval-scenario-row.eval-scenario-fail { + border-left: 3px solid var(--error-color); +} + +.eval-scenario-row.eval-scenario-error { + border-left: 3px solid #f59e0b; +} + +.eval-scenario-header { + display: flex; + align-items: center; + gap: 8px; + padding: 8px 10px; + font-size: 13px; +} + +.eval-status-badge { + display: inline-block; + padding: 1px 6px; + border-radius: 4px; + font-size: 10px; + font-weight: 700; + letter-spacing: 0.5px; + text-transform: 
uppercase; +} + +.eval-badge-pass { + background: rgba(16, 185, 129, 0.15); + color: var(--success-color); +} + +.eval-badge-fail { + background: rgba(239, 68, 68, 0.15); + color: var(--error-color); +} + +.eval-badge-error { + background: rgba(245, 158, 11, 0.15); + color: #f59e0b; +} + +.eval-scenario-id-block { + flex: 1; + min-width: 0; + overflow: hidden; +} + +.eval-scenario-id { + font-weight: 500; + color: var(--text-primary); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.eval-scenario-prompt { + font-size: 12px; + color: var(--text-secondary); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + margin-top: 1px; +} + +.eval-scenario-score { + font-weight: 600; + color: var(--accent-primary); + font-size: 12px; +} + +.eval-scenario-duration { + font-size: 11px; + color: var(--text-secondary); +} + +.eval-chevron { + color: var(--text-secondary); + transition: transform 0.2s ease; + display: flex; + align-items: center; +} + +.eval-scenario-row.expanded .eval-chevron { + transform: rotate(90deg); +} + +.eval-scenario-details { + display: none; + padding: 8px 10px 10px; + border-top: 1px solid var(--border-color); + font-size: 12px; + line-height: 1.5; +} + +.eval-scenario-row.expanded .eval-scenario-details { + display: block; +} + +.eval-detail-row { + margin-bottom: 4px; + color: var(--text-secondary); +} + +.eval-detail-row strong { + color: var(--text-primary); +} + +.eval-detail-row code { + background: var(--code-bg); + padding: 1px 4px; + border-radius: 3px; + font-size: 11px; +} + +.eval-detail-warn { + color: #f59e0b; +} + +.eval-detail-error { + color: var(--error-color); +} + +/* Summary */ +.eval-summary { + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid var(--border-color); +} + +.eval-summary-title { + font-size: 14px; + font-weight: 600; + color: var(--text-primary); + margin-bottom: 10px; +} + +.eval-summary-grid { + display: grid; + grid-template-columns: repeat(4, 1fr); + 
gap: 8px; + margin-bottom: 10px; +} + +.eval-summary-stat { + text-align: center; + padding: 8px; + background: var(--bg-tertiary); + border-radius: 8px; +} + +.eval-stat-value { + display: block; + font-size: 20px; + font-weight: 700; + color: var(--text-primary); +} + +.eval-stat-label { + display: block; + font-size: 11px; + color: var(--text-secondary); + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.eval-stat-pass .eval-stat-value { + color: var(--success-color); +} + +.eval-stat-fail .eval-stat-value { + color: var(--error-color); +} + +.eval-stat-error .eval-stat-value { + color: #f59e0b; +} + +.eval-summary-scores { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 4px; + font-size: 12px; + color: var(--text-secondary); +} + +.eval-summary-scores strong { + color: var(--text-primary); +} + +.eval-error { + margin-top: 8px; + padding: 8px 12px; + background: var(--error-bg); + color: var(--error-text); + border-radius: 6px; + font-size: 13px; +} diff --git a/src/types/index.ts b/src/types/index.ts index 8e7026e..f0c8f7d 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -130,6 +130,15 @@ export interface WebMCPScriptsUpdatedMessage { type: 'WEBMCP_SCRIPTS_UPDATED'; } +export interface EvalJudgeMessage { + type: 'EVAL_JUDGE'; + agentId: string; + prompt: string; + assistantResponse: string; + toolCalls: Array<{ toolName: string; input: unknown; output: unknown; status: string }>; + postConditions: string; +} + // Union type for all possible extension messages export type ExtensionMessage = | GetConfigMessage @@ -150,7 +159,8 @@ export type ExtensionMessage = | WebMCPCallToolMessage | WebMCPGetToolsMessage | WebMCPToolsChangedMessage - | WebMCPScriptsUpdatedMessage; + | WebMCPScriptsUpdatedMessage + | EvalJudgeMessage; // Response wrapper for message handlers export interface MessageResponse {