diff --git a/package.json b/package.json
index 2b2fd8e..68ea274 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "ai-sidebar-extension",
-  "version": "0.6.1",
+  "version": "0.6.2",
   "description": "Chrome extension AI sidebar with LLM providers and MCP support",
   "private": true,
   "type": "module",
diff --git a/src/background/index.ts b/src/background/index.ts
index f389761..531a182 100644
--- a/src/background/index.ts
+++ b/src/background/index.ts
@@ -537,6 +537,66 @@ chrome.runtime.onMessage.addListener((request: ExtensionMessage, sender, sendRes
       return true; // Now async to support getBoundTabIdForSidebar
     }
 
+    case 'EVAL_JUDGE': {
+      // Eval judge: ask the agent's LLM whether a scenario's post-conditions were met
+      (async () => {
+        try {
+          const { agentId, prompt, assistantResponse, toolCalls, postConditions } = request;
+
+          const systemPrompt = `You are an evaluation judge. Given a user prompt, assistant response, tool calls made, and expected post-conditions, determine whether the post-conditions were met.
+
+Respond ONLY with a JSON object (no markdown fences, no extra text):
+{"verdict": "pass" or "fail", "score": 0.0 to 1.0, "reasoning": "brief explanation"}`;
+
+          const toolCallsSummary = toolCalls
+            .map(
+              (tc: { toolName: string; input: unknown; output: unknown; status: string }) =>
+                `- ${tc.toolName}(${JSON.stringify(tc.input)}) → ${tc.status}: ${JSON.stringify(tc.output)}`
+            )
+            .join('\n');
+
+          const userMsg = `## User Prompt
+${prompt}
+
+## Assistant Response
+${assistantResponse}
+
+## Tool Calls
+${toolCallsSummary || '(none)'}
+
+## Post-Conditions to Verify
+${postConditions}`;
+
+          const responseText = await aiClient.generateTextForAgent(agentId, systemPrompt, userMsg, {
+            temperature: 0,
+          });
+
+          // Keep only the outermost {...} span so markdown fences or stray prose around the
+          // object cannot break JSON.parse; an empty string (no object found) throws below.
+          const first = responseText.indexOf('{');
+          const last = responseText.lastIndexOf('}');
+          const cleaned = first >= 0 && last > first ? responseText.slice(first, last + 1) : '';
+          const parsed = JSON.parse(cleaned);
+
+          sendResponse({
+            success: true,
+            verdict: String(parsed.verdict).toLowerCase() === 'pass' ? 'pass' : 'fail',
+            score: typeof parsed.score === 'number' ? Math.min(1, Math.max(0, parsed.score)) : 0,
+            reasoning: typeof parsed.reasoning === 'string' ? parsed.reasoning : '',
+          });
+        } catch (error) {
+          log.error('[Background] EVAL_JUDGE error:', error);
+          sendResponse({
+            success: true, // reported as a failing judgment so the caller keeps the reasoning
+            score: 0,
+            verdict: 'fail',
+            reasoning: `Failed to parse judge response: ${error instanceof Error ? error.message : 'Unknown error'}`,
+          });
+        }
+      })();
+      return true;
+    }
+
     default:
       log.debug('Unknown message type:', request.type);
       return false; // No async response needed
diff --git a/src/lib/ai/client.ts b/src/lib/ai/client.ts
index d2dc693..afd53c9 100644
--- a/src/lib/ai/client.ts
+++ b/src/lib/ai/client.ts
@@ -9,7 +9,7 @@ import log from '../logger';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createAnthropic } from '@ai-sdk/anthropic';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
-import { streamText, type LanguageModel, CoreMessage, type JSONValue } from 'ai';
+import { streamText, generateText, type LanguageModel, CoreMessage, type JSONValue } from 'ai';
 import type { AgentConfig } from '../storage/config';
 import type { AIProvider, ToolCall } from '../../types';
 import { ConfigStorage } from '../storage/config';
@@ -758,6 +758,43 @@ export class AIClient {
     this.abortController?.abort();
   }
 
+  /**
+   * One-shot, non-streaming generation for a stored agent; resolves to the reply text.
+   * Throws when the agent is missing or has neither an API key nor an endpoint (eval judge use).
+   */
+  async generateTextForAgent(
+    agentId: string,
+    systemPrompt: string,
+    userMessage: string,
+    options?: { temperature?: number }
+  ): Promise<string> {
+    const agent = await this.configStorage.getAgent(agentId);
+    if (!agent) {
+      throw new Error(`Agent ${agentId} not found`);
+    }
+
+    if (!agent.apiKey && !agent.endpoint) {
+      throw new Error(`No API key or endpoint configured for agent "${agent.name}"`);
+    }
+
+    const modelFactory = this.createProviderForAgent(agent);
+    const model = modelFactory();
+
+    const messages: CoreMessage[] = [
+      { role: 'system', content: systemPrompt },
+      { role: 'user', content: userMessage },
+    ];
+    // NOTE(review): no abortSignal is passed, so this call presumably cannot be cancelled — confirm.
+    const result = await generateText({
+      model,
+      messages,
+      temperature: options?.temperature ?? 0, // defaults to 0 when the caller gives none
+      maxRetries: 2,
+    });
+
+    return result.text;
+  }
+
   /**
    * Test connection with provided agent details (for new agents before saving)
    */
diff --git a/src/lib/commands/builtins.ts b/src/lib/commands/builtins.ts
index d24431e..05e47e0 100644
--- a/src/lib/commands/builtins.ts
+++ b/src/lib/commands/builtins.ts
@@ -317,5 +317,11 @@ export function createBuiltinCommands(
     clear: () => {
       clearChat();
     },
+
+    eval: () => {
+      // Dispatches a 'start-eval' window event; registered here so /help lists the command.
+      // NOTE(review): the sidebar's handleSendMessage reportedly intercepts /eval first — confirm.
+      window.dispatchEvent(new CustomEvent('start-eval'));
+    },
   };
 }
diff --git a/src/lib/eval/runner.ts b/src/lib/eval/runner.ts
new file mode 100644
index 0000000..0cc613c
--- /dev/null
+++ b/src/lib/eval/runner.ts
@@ -0,0 +1,350 @@
+/**
+ * EvalRunner orchestrates batch evaluation of an eval suite.
+ * Runs each scenario sequentially: navigate → clear → prompt → extract → score → judge.
+ */
+
+import type { ChatMessage, ToolCall } from '../../types';
+import type {
+  EvalSuite,
+  EvalScenario,
+  EvalScenarioResult,
+  EvalSuiteResult,
+  EvalProgress,
+  EvalPhase,
+  JudgeScore,
+} from './types';
+import { scoreToolCalls, combinedScore } from './scoring';
+
+const DEFAULT_TIMEOUT_MS = 60_000;
+const NAVIGATION_TIMEOUT_MS = 15_000;
+const SETTLE_MS = 200;
+
+/**
+ * Interface for sidebar internals to avoid circular deps.
+ * The sidebar creates an adapter implementing this.
+ */
+export interface SidebarInterface {
+  attachedTabId: number | null; // tab the runner navigates; navigation is skipped when null
+  currentAgentId: string | null; // agent used for the judge call; judging is skipped when null
+  getMessageHistory(): ChatMessage[]; // read after each prompt to extract reply and tool calls
+  clearConversation(): void; // called before every scenario's prompt
+  sendMessage(text: string): Promise<void>; // runner reads the history as soon as this resolves
+}
+
+/** Drives an eval suite through the sidebar one scenario at a time and scores each result. */
+export class EvalRunner {
+  private sidebar: SidebarInterface;
+  private aborted = false;
+  private onProgress: ((progress: EvalProgress) => void) | null = null;
+  private completed: EvalScenarioResult[] = []; // results so far in the current run
+
+  constructor(sidebar: SidebarInterface) {
+    this.sidebar = sidebar;
+  }
+
+  /** Set progress callback (invoked on every phase change, with the results gathered so far). */
+  setProgressCallback(cb: (progress: EvalProgress) => void): void {
+    this.onProgress = cb;
+  }
+
+  /** Abort the current eval run; checked before each scenario starts. */
+  abort(): void {
+    this.aborted = true;
+  }
+
+  /** Run all scenarios in the suite sequentially and return the aggregated suite result. */
+  async run(suite: EvalSuite): Promise<EvalSuiteResult> {
+    this.aborted = false;
+    this.completed = [];
+    const scenarioResults = this.completed;
+    const total = suite.scenarios.length;
+    const startTime = Date.now();
+
+    for (let i = 0; i < total; i++) {
+      if (this.aborted) break;
+
+      const scenario = suite.scenarios[i];
+      this.emitProgress('loading', i, total, scenario.id, scenarioResults);
+
+      const result = await this.runScenario(scenario, suite, i);
+      scenarioResults.push(result);
+
+      this.emitProgress('scoring', i + 1, total, scenario.id, scenarioResults);
+    }
+
+    const endTime = Date.now();
+    this.emitProgress('complete', total, total, '', scenarioResults);
+
+    return this.buildSuiteResult(suite.name, startTime, endTime, scenarioResults);
+  }
+
+  /** Run one scenario under its timeout; a timeout or thrown error yields an 'error' result. */
+  private async runScenario(
+    scenario: EvalScenario,
+    suite: EvalSuite,
+    index: number
+  ): Promise<EvalScenarioResult> {
+    const scenarioStart = Date.now();
+    const timeoutMs = scenario.timeoutMs || DEFAULT_TIMEOUT_MS;
+
+    try {
+      return await this.withTimeout(
+        this.executeScenario(scenario, suite, index),
+        timeoutMs,
+        `Scenario "${scenario.id}" timed out after ${timeoutMs}ms`
+      );
+    } catch (error) {
+      return {
+        scenarioId: scenario.id,
+        prompt: scenario.prompt,
+        status: 'error',
+        durationMs: Date.now() - scenarioStart,
+        toolCallScore: {
+          score: 0,
+          expectedCalled: [],
+          expectedMissed: [],
+          forbiddenCalled: [],
+          unexpectedCalls: [],
+        },
+        judgeScore: null,
+        combinedScore: 0,
+        actualToolCalls: [],
+        assistantResponse: '',
+        error: error instanceof Error ? error.message : String(error),
+      };
+    }
+  }
+
+  /** Navigate → clear → prompt → extract → score → judge for a single scenario. */
+  private async executeScenario(
+    scenario: EvalScenario,
+    suite: EvalSuite,
+    index: number
+  ): Promise<EvalScenarioResult> {
+    const scenarioStart = Date.now();
+
+    // 1. Navigate if startPage is set
+    if (scenario.startPage && this.sidebar.attachedTabId) {
+      this.emitProgress('navigating', index, suite.scenarios.length, scenario.id, this.completed);
+      const baseUrl = suite.baseUrl || '';
+      const fullUrl = scenario.startPage.startsWith('http')
+        ? scenario.startPage
+        : baseUrl + scenario.startPage;
+
+      await this.navigateTab(this.sidebar.attachedTabId, fullUrl);
+    }
+
+    // 2. Clear conversation
+    this.emitProgress('clearing', index, suite.scenarios.length, scenario.id, this.completed);
+    this.sidebar.clearConversation();
+    await this.sleep(SETTLE_MS);
+
+    // 3. Send prompt and wait for completion
+    this.emitProgress('prompting', index, suite.scenarios.length, scenario.id, this.completed);
+    await this.sidebar.sendMessage(scenario.prompt);
+
+    // 4. Extract results from message history
+    const history = this.sidebar.getMessageHistory();
+    const lastAssistant = [...history].reverse().find((m) => m.role === 'assistant');
+    const assistantResponse =
+      typeof lastAssistant?.content === 'string'
+        ? lastAssistant.content
+        : lastAssistant?.content
+            ?.filter((p) => p.type === 'text')
+            .map((p) => p.text)
+            .join('') || '';
+
+    // Collect all tool calls from assistant messages
+    const allToolCalls: ToolCall[] = [];
+    for (const msg of history) {
+      if (msg.role === 'assistant' && msg.toolCalls) {
+        allToolCalls.push(...msg.toolCalls);
+      }
+    }
+
+    const actualToolCallNames = allToolCalls.map((tc) => tc.toolName);
+    const actualToolCallDetails = allToolCalls.map((tc) => ({
+      toolName: tc.toolName,
+      input: tc.input,
+      output: tc.output,
+      status: tc.status,
+    }));
+
+    // 5. Score tool calls
+    const toolScore = scoreToolCalls(
+      actualToolCallNames,
+      scenario.expectations.toolCalls,
+      scenario.expectations.forbiddenToolCalls
+    );
+
+    // 6. Judge (if post-conditions defined)
+    let judgeResult: JudgeScore | null = null;
+    if (scenario.expectations.postConditions && this.sidebar.currentAgentId) {
+      this.emitProgress('judging', index, suite.scenarios.length, scenario.id, this.completed);
+      judgeResult = await this.runJudge(
+        scenario.prompt,
+        assistantResponse,
+        actualToolCallDetails,
+        scenario.expectations.postConditions
+      );
+    }
+
+    // 7. Compute combined score
+    const combined = combinedScore(toolScore, judgeResult);
+    const status = combined >= 0.5 ? 'pass' : 'fail';
+
+    return {
+      scenarioId: scenario.id,
+      prompt: scenario.prompt,
+      status,
+      durationMs: Date.now() - scenarioStart,
+      toolCallScore: toolScore,
+      judgeScore: judgeResult,
+      combinedScore: combined,
+      actualToolCalls: actualToolCallDetails,
+      assistantResponse,
+    };
+  }
+
+  /** Load `url`, then wait for tools-changed, load-complete + 3s, or the hard timeout. */
+  private async navigateTab(tabId: number, url: string): Promise<void> {
+    await chrome.tabs.update(tabId, { url });
+
+    // Wait for WEBMCP_TOOLS_CHANGED or fall back after the tab reports status 'complete'
+    await new Promise<void>((resolve) => {
+      let resolved = false;
+      const timers: Array<ReturnType<typeof setTimeout>> = [];
+      const done = () => {
+        if (resolved) return;
+        resolved = true;
+        // Release every listener and timer so nothing leaks into later scenarios
+        chrome.runtime.onMessage.removeListener(onMessage);
+        chrome.tabs.onUpdated.removeListener(onUpdated);
+        timers.forEach((timer) => clearTimeout(timer));
+        resolve();
+      };
+
+      // Listen for tools changed signal (ideal)
+      const onMessage = (msg: { type: string; tabId?: number }) => {
+        if (msg.type === 'WEBMCP_TOOLS_CHANGED' && msg.tabId === tabId) {
+          done();
+        }
+      };
+      chrome.runtime.onMessage.addListener(onMessage);
+
+      // Fallback: wait for tab to finish loading + 3s settle
+      const onUpdated = (updatedTabId: number, changeInfo: chrome.tabs.TabChangeInfo) => {
+        if (updatedTabId === tabId && changeInfo.status === 'complete') {
+          chrome.tabs.onUpdated.removeListener(onUpdated);
+          timers.push(setTimeout(done, 3000));
+        }
+      };
+      chrome.tabs.onUpdated.addListener(onUpdated);
+
+      // Hard timeout
+      timers.push(setTimeout(done, NAVIGATION_TIMEOUT_MS));
+    });
+  }
+
+  /** Ask the background EVAL_JUDGE handler to judge; any failure becomes a score-0 'fail'. */
+  private async runJudge(
+    prompt: string,
+    assistantResponse: string,
+    toolCalls: Array<{ toolName: string; input: unknown; output: unknown; status: string }>,
+    postConditions: string
+  ): Promise<JudgeScore> {
+    try {
+      const response = await chrome.runtime.sendMessage({
+        type: 'EVAL_JUDGE',
+        agentId: this.sidebar.currentAgentId,
+        prompt,
+        assistantResponse,
+        toolCalls,
+        postConditions,
+      });
+
+      if (response?.success) {
+        return {
+          score: typeof response.score === 'number' ? response.score : 0,
+          verdict: response.verdict || 'fail',
+          reasoning: response.reasoning || '',
+        };
+      }
+
+      return { score: 0, verdict: 'fail', reasoning: 'Judge call failed' };
+    } catch (error) {
+      return {
+        score: 0,
+        verdict: 'fail',
+        reasoning: `Judge error: ${error instanceof Error ? error.message : String(error)}`,
+      };
+    }
+  }
+
+  /** Aggregate results into pass/fail/error counts and 3-decimal score averages. */
+  private buildSuiteResult(
+    suiteName: string,
+    startTime: number,
+    endTime: number,
+    scenarios: EvalScenarioResult[]
+  ): EvalSuiteResult {
+    const passed = scenarios.filter((s) => s.status === 'pass').length;
+    const failed = scenarios.filter((s) => s.status === 'fail').length;
+    const errored = scenarios.filter((s) => s.status === 'error').length;
+
+    const toolScores = scenarios.map((s) => s.toolCallScore.score);
+    const judgeScores = scenarios
+      .filter((s): s is EvalScenarioResult & { judgeScore: JudgeScore } => s.judgeScore !== null)
+      .map((s) => s.judgeScore.score);
+    const combinedScores = scenarios.map((s) => s.combinedScore);
+
+    const avg = (arr: number[]) =>
+      arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
+
+    return {
+      suiteName,
+      startTime,
+      endTime,
+      totalDurationMs: endTime - startTime,
+      scenarios,
+      summary: {
+        total: scenarios.length,
+        passed,
+        failed,
+        errored,
+        avgToolCallScore: Math.round(avg(toolScores) * 1000) / 1000,
+        avgJudgeScore: Math.round(avg(judgeScores) * 1000) / 1000,
+        avgCombinedScore: Math.round(avg(combinedScores) * 1000) / 1000,
+      },
+    };
+  }
+
+  private emitProgress(
+    phase: EvalPhase,
+    currentIndex: number,
+    total: number,
+    scenarioId: string,
+    results: EvalScenarioResult[]
+  ): void {
+    this.onProgress?.({
+      phase,
+      currentScenarioIndex: currentIndex,
+      totalScenarios: total,
+      currentScenarioId: scenarioId,
+      scenarioResults: [...results],
+    });
+  }
+
+  private sleep(ms: number): Promise<void> {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+  }
+
+  /** Reject with `message` after `ms`; the raced promise is not cancelled, only ignored. */
+  private withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const timeout = new Promise<T>((_, reject) => {
+      timer = setTimeout(() => reject(new Error(message)), ms);
+    });
+    return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
+  }
+}
diff --git a/src/lib/eval/scoring.ts b/src/lib/eval/scoring.ts
new file mode 100644
index 0000000..6e556a6
--- /dev/null
+++ b/src/lib/eval/scoring.ts
@@ -0,0 +1,82 @@
+/**
+ * Pure scoring functions for eval scenarios.
+ * No side effects — suitable for unit testing.
+ */
+
+import type { ToolCallScore, JudgeScore } from './types';
+
+/**
+ * Score tool call accuracy against expectations, matching by tool name only (order/count ignored).
+ *
+ * Score = (expected tools called / total expected) minus 0.5 penalty if any forbidden tool called,
+ * floored at 0 and rounded to 3 decimals. Empty expectations = 1.0 before the penalty.
+ */
+export function scoreToolCalls(
+  actualCalls: string[],
+  expectedToolCalls?: string[],
+  forbiddenToolCalls?: string[]
+): ToolCallScore {
+  const actualSet = new Set(actualCalls);
+
+  const expectedCalled: string[] = [];
+  const expectedMissed: string[] = [];
+
+  if (expectedToolCalls && expectedToolCalls.length > 0) {
+    for (const expected of expectedToolCalls) {
+      if (actualSet.has(expected)) {
+        expectedCalled.push(expected);
+      } else {
+        expectedMissed.push(expected);
+      }
+    }
+  }
+
+  const forbiddenCalled: string[] = [];
+  if (forbiddenToolCalls) {
+    for (const forbidden of forbiddenToolCalls) {
+      if (actualSet.has(forbidden)) {
+        forbiddenCalled.push(forbidden);
+      }
+    }
+  }
+
+  // Unexpected = actual calls not in expected (informational, no penalty; repeats are kept)
+  const expectedSet = new Set(expectedToolCalls || []);
+  const unexpectedCalls = actualCalls.filter((c) => !expectedSet.has(c));
+
+  // Calculate score
+  let score: number;
+  if (!expectedToolCalls || expectedToolCalls.length === 0) {
+    // No expectations → vacuously true
+    score = 1.0;
+  } else {
+    score = expectedCalled.length / expectedToolCalls.length;
+  }
+
+  // Apply forbidden penalty (a flat 0.5 once, however many forbidden tools were called)
+  if (forbiddenCalled.length > 0) {
+    score = Math.max(0, score - 0.5);
+  }
+
+  return {
+    score: Math.round(score * 1000) / 1000,
+    expectedCalled,
+    expectedMissed,
+    forbiddenCalled,
+    unexpectedCalls,
+  };
+}
+
+/**
+ * Combine tool call score and judge score into a single number.
+ *
+ * Judge score present: 40% tool calls + 60% judge, rounded to 3 decimals.
+ * Judge score null: the tool call score is returned unchanged.
+ */
+export function combinedScore(toolCallScore: ToolCallScore, judgeScore: JudgeScore | null): number {
+  if (judgeScore !== null) {
+    const combined = 0.4 * toolCallScore.score + 0.6 * judgeScore.score;
+    return Math.round(combined * 1000) / 1000;
+  }
+  return toolCallScore.score;
+}
diff --git a/src/lib/eval/types.ts b/src/lib/eval/types.ts
new file mode 100644
index 0000000..f7622a9
--- /dev/null
+++ b/src/lib/eval/types.ts
@@ -0,0 +1,102 @@
+/**
+ * Type definitions for the eval system.
+ * Eval suites test whether tool definitions lead to correct agent behavior.
+ */
+
+// --- Suite & Scenario Definitions ---
+
+export interface EvalSuite {
+  name: string;
+  description?: string;
+  baseUrl?: string; // prepended to a scenario's startPage unless that starts with "http"
+  scenarios: EvalScenario[];
+}
+
+export interface EvalScenario {
+  id: string;
+  prompt: string; // sent to the sidebar as the user message
+  startPage?: string; // attached tab is navigated here before the prompt is sent
+  expectations: EvalExpectations;
+  tags?: string[];
+  timeoutMs?: number; // per-scenario limit; the runner uses 60000 ms when unset or 0
+}
+
+export interface EvalExpectations {
+  toolCalls?: string[]; // tool names expected to be called (matched by name)
+  forbiddenToolCalls?: string[]; // calling any of these subtracts 0.5 from the tool score
+  postConditions?: string; // free text handed to the LLM judge; no judge run when absent
+}
+
+export interface StoredEvalSuite extends EvalSuite {
+  id: string; // UUID assigned when the suite is imported
+  fileName?: string;
+  importedAt: number; // Date.now() timestamp (ms) taken at import
+}
+
+// --- Scoring ---
+
+export interface ToolCallScore {
+  score: number; // 0..1, rounded to 3 decimals
+  expectedCalled: string[]; // expected tools that were called
+  expectedMissed: string[]; // expected tools that were never called
+  forbiddenCalled: string[]; // forbidden tools that were called
+  unexpectedCalls: string[]; // actual calls not in the expected list (informational)
+}
+
+export interface JudgeScore {
+  score: number; // judge is asked for a value between 0.0 and 1.0
+  verdict: string; // judge is asked for "pass" or "fail"
+  reasoning: string;
+}
+
+// --- Results ---
+
+export interface EvalScenarioResult {
+  scenarioId: string;
+  prompt: string;
+  status: 'pass' | 'fail' | 'error'; // pass when combinedScore >= 0.5; error on throw/timeout
+  durationMs: number;
+  toolCallScore: ToolCallScore;
+  judgeScore: JudgeScore | null; // null when no judge was run
+  combinedScore: number;
+  actualToolCalls: Array<{ toolName: string; input: unknown; output: unknown; status: string }>;
+  assistantResponse: string; // text of the last assistant message
+  error?: string; // set only on 'error' results
+}
+
+export interface EvalSuiteResult {
+  suiteName: string;
+  startTime: number;
+  endTime: number;
+  totalDurationMs: number;
+  scenarios: EvalScenarioResult[];
+  summary: {
+    total: number;
+    passed: number;
+    failed: number;
+    errored: number;
+    avgToolCallScore: number;
+    avgJudgeScore: number; // averaged over judged scenarios only; 0 when none were judged
+    avgCombinedScore: number;
+  };
+}
+
+// --- Progress Tracking ---
+
+export type EvalPhase =
+  | 'loading'
+  | 'navigating'
+  | 'clearing'
+  | 'prompting'
+  | 'judging'
+  | 'scoring'
+  | 'complete'
+  | 'error'; // NOTE(review): not emitted by EvalRunner in this change — confirm who uses it
+
+export interface EvalProgress {
+  phase: EvalPhase;
+  currentScenarioIndex: number;
+  totalScenarios: number;
+  currentScenarioId: string; // empty string on the final 'complete' event
+  scenarioResults: EvalScenarioResult[]; // copy of the results passed to the emitter
+}
diff --git a/src/lib/storage/config.ts b/src/lib/storage/config.ts
index fb0c053..497a884 100644
--- a/src/lib/storage/config.ts
+++ b/src/lib/storage/config.ts
@@ -3,6 +3,8 @@
  * Agent-centric configuration for AI assistants
  */
 
+import type { StoredEvalSuite, EvalSuite } from '../eval/types';
+
 export type AIProvider = 'openai' | 'anthropic' | 'google';
 
 export interface ReasoningConfig {
@@ -45,6 +47,7 @@ export interface StorageConfig {
   mcpConfig?: MCPConfig;
   userScripts?: UserScript[]; // WebMCP user-defined tool scripts
   builtinScripts?: BuiltinScript[]; // Built-in tool state (only stores user overrides)
+  evalSuites?: StoredEvalSuite[]; // Imported eval suite definitions
   logLevel?: string; // Global log level: 'trace' | 'debug' | 'info' | 'warn' | 'error' | 'silent'
 }
 
@@ -404,4 +407,35 @@ export class ConfigStorage {
 
     await this.set({ builtinScripts: scripts });
   }
+
+  // Eval Suite management methods
+  async getEvalSuites(): Promise<StoredEvalSuite[]> {
+    const config = await this.get();
+    return config.evalSuites || []; // no suites stored yet → empty list
+  }
+
+  async addEvalSuite(suite: EvalSuite, fileName?: string): Promise<string> {
+    const suites = await this.getEvalSuites();
+    const id = globalThis.crypto.randomUUID();
+    const stored: StoredEvalSuite = {
+      ...suite,
+      id,
+      fileName,
+      importedAt: Date.now(),
+    };
+    // NOTE(review): read-then-write — overlapping calls could drop a suite; confirm acceptable.
+    await this.set({ evalSuites: [...suites, stored] });
+    return id; // id generated for the newly stored suite
+  }
+
+  async deleteEvalSuite(id: string): Promise<void> {
+    const suites = await this.getEvalSuites();
+    const filtered = suites.filter((s) => s.id !== id);
+
+    if (filtered.length === suites.length) { // nothing was removed → the id is unknown
+      throw new Error(`Eval suite ${id} not found`);
+    }
+
+    await this.set({ evalSuites: filtered });
+  }
 }
diff --git a/src/options/eval-suites.ts b/src/options/eval-suites.ts
new file mode 100644
index 0000000..dc794b5
--- /dev/null
+++ b/src/options/eval-suites.ts
@@ -0,0 +1,253 @@
+/**
+ * Eval Suites management for the options page.
+ * Handles importing, displaying, and deleting eval suite JSON files.
+ */
+
+import log from '../lib/logger';
+import { ConfigStorage } from '../lib/storage/config';
+import type { EvalSuite, StoredEvalSuite } from '../lib/eval/types';
+
+const configStorage = ConfigStorage.getInstance();
+
+/**
+ * Wire up the import button and hidden file input, then render the stored suites.
+ */
+export async function initEvalSuites(): Promise<void> {
+  const importBtn = document.getElementById('import-evalsuite');
+  const fileInput = document.getElementById('import-evalsuite-file') as HTMLInputElement;
+
+  if (!importBtn || !fileInput) {
+    log.warn('[EvalSuites] UI elements not found, skipping initialization');
+    return;
+  }
+
+  importBtn.addEventListener('click', () => {
+    fileInput.click(); // the visible button only opens the hidden file picker
+  });
+
+  fileInput.addEventListener('change', async (event) => {
+    const input = event.target as HTMLInputElement;
+    const file = input.files?.[0];
+
+    if (file) {
+      await importEvalSuite(file);
+      input.value = ''; // reset so picking the same file again fires "change"
+    }
+  });
+
+  await renderEvalSuites();
+}
+
+/**
+ * Rebuild the suite card list from storage; show the empty-state element when none exist.
+ */
+async function renderEvalSuites(): Promise<void> {
+  const listEl = document.getElementById('evalsuites-list');
+  const emptyEl = document.getElementById('no-evalsuites');
+
+  if (!listEl || !emptyEl) return;
+
+  const suites = await configStorage.getEvalSuites();
+
+  listEl.innerHTML = ''; // drop previously rendered cards
+
+  if (suites.length === 0) {
+    listEl.classList.add('hidden');
+    emptyEl.classList.remove('hidden');
+    return;
+  }
+
+  listEl.classList.remove('hidden');
+  emptyEl.classList.add('hidden');
+
+  for (const suite of suites) {
+    listEl.appendChild(createSuiteCard(suite));
+  }
+}
+
+/**
+ * Build the card for one suite: title, detail rows, aggregated tags, and a Delete button.
+ */
+function createSuiteCard(suite: StoredEvalSuite): HTMLElement {
+  const card = document.createElement('div');
+  card.className = 'card card-clickable';
+  card.dataset.id = suite.id;
+
+  // Header
+  const header = document.createElement('div');
+  header.className = 'card-header';
+
+  const info = document.createElement('div');
+  info.className = 'card-info';
+
+  const title = document.createElement('div');
+  title.className = 'card-title';
+  title.textContent = suite.name;
+  info.appendChild(title);
+
+  if (suite.description) {
+    const subtitle = document.createElement('div');
+    subtitle.className = 'card-subtitle';
+    subtitle.textContent = suite.description;
+    info.appendChild(subtitle);
+  }
+
+  header.appendChild(info);
+  card.appendChild(header);
+
+  // Details (string values from the imported file are escaped before going into innerHTML)
+  const body = document.createElement('div');
+  body.className = 'card-body';
+
+  const scenarioDetail = document.createElement('div');
+  scenarioDetail.className = 'card-detail';
+  scenarioDetail.innerHTML = `<span class="detail-label">Scenarios:</span> <span class="detail-value">${suite.scenarios.length}</span>`;
+  body.appendChild(scenarioDetail);
+
+  if (suite.baseUrl) {
+    const urlDetail = document.createElement('div');
+    urlDetail.className = 'card-detail';
+    urlDetail.innerHTML = `<span class="detail-label">Base URL:</span> <span class="detail-value monospace">${escapeHtml(suite.baseUrl)}</span>`;
+    body.appendChild(urlDetail);
+  }
+
+  if (suite.fileName) {
+    const fileDetail = document.createElement('div');
+    fileDetail.className = 'card-detail';
+    fileDetail.innerHTML = `<span class="detail-label">File:</span> <span class="detail-value monospace">${escapeHtml(suite.fileName)}</span>`;
+    body.appendChild(fileDetail);
+  }
+
+  const dateDetail = document.createElement('div');
+  dateDetail.className = 'card-detail';
+  dateDetail.innerHTML = `<span class="detail-label">Imported:</span> <span class="detail-value">${new Date(suite.importedAt).toLocaleDateString()}</span>`;
+  body.appendChild(dateDetail);
+
+  // Tags from all scenarios (deduplicated across the suite)
+  const allTags = new Set<string>();
+  for (const scenario of suite.scenarios) {
+    if (scenario.tags) {
+      for (const tag of scenario.tags) {
+        allTags.add(tag);
+      }
+    }
+  }
+  if (allTags.size > 0) {
+    const tagsDetail = document.createElement('div');
+    tagsDetail.className = 'card-detail';
+    tagsDetail.innerHTML = `<span class="detail-label">Tags:</span> <span class="detail-value">${[...allTags].map((t) => `<code>${escapeHtml(t)}</code>`).join(' ')}</span>`;
+    body.appendChild(tagsDetail);
+  }
+
+  card.appendChild(body);
+
+  // Delete button (placed in the header, asks for confirmation first)
+  const actions = document.createElement('div');
+  actions.className = 'card-header-actions';
+  const deleteBtn = document.createElement('button');
+  deleteBtn.className = 'button button-danger button-small';
+  deleteBtn.textContent = 'Delete';
+  deleteBtn.addEventListener('click', async (e) => {
+    e.stopPropagation(); // keep the click from bubbling to the card
+    if (window.confirm(`Delete eval suite "${suite.name}"?`)) {
+      await deleteEvalSuite(suite.id);
+    }
+  });
+  actions.appendChild(deleteBtn);
+  header.appendChild(actions);
+
+  return card;
+}
+
+/**
+ * Parse and validate a suite JSON file, store it, and re-render; failures go to the status bar.
+ */
+async function importEvalSuite(file: File): Promise<void> {
+  try {
+    const text = await readFile(file);
+    const parsed = JSON.parse(text);
+
+    // Validate structure (shallow: field types inside scenarios/expectations are not checked)
+    if (!parsed.name || typeof parsed.name !== 'string') {
+      throw new Error('Suite must have a "name" field');
+    }
+    if (!Array.isArray(parsed.scenarios) || parsed.scenarios.length === 0) {
+      throw new Error('Suite must have at least one scenario');
+    }
+
+    for (const scenario of parsed.scenarios) {
+      if (!scenario.id || !scenario.prompt) {
+        throw new Error(`Each scenario must have "id" and "prompt" fields`);
+      }
+      if (!scenario.expectations || typeof scenario.expectations !== 'object') {
+        throw new Error(`Scenario "${scenario.id}" must have "expectations" object`);
+      }
+    }
+
+    const suite: EvalSuite = {
+      name: parsed.name,
+      description: parsed.description,
+      baseUrl: parsed.baseUrl,
+      scenarios: parsed.scenarios, // stored as parsed; unknown extra fields are kept
+    };
+
+    await configStorage.addEvalSuite(suite, file.name);
+    showStatus(
+      `Imported eval suite "${suite.name}" (${suite.scenarios.length} scenarios)`,
+      'success'
+    );
+    await renderEvalSuites();
+  } catch (error) {
+    log.error('[EvalSuites] Import failed:', error);
+    if (error instanceof SyntaxError) { // JSON.parse failure
+      showStatus('Invalid JSON file', 'error');
+    } else if (error instanceof Error) {
+      showStatus(`Import failed: ${error.message}`, 'error');
+    } else {
+      showStatus('Import failed: Unknown error', 'error');
+    }
+  }
+}
+
+/**
+ * Delete an eval suite by ID and re-render; errors are logged and shown as a status message.
+ */
+async function deleteEvalSuite(id: string): Promise<void> {
+  try {
+    await configStorage.deleteEvalSuite(id);
+    showStatus('Eval suite deleted', 'success');
+    await renderEvalSuites();
+  } catch (error) {
+    log.error('[EvalSuites] Delete failed:', error);
+    showStatus('Failed to delete eval suite', 'error');
+  }
+}
+
+function readFile(file: File): Promise<string> {
+  return new Promise((resolve, reject) => { // promise wrapper around the callback-based FileReader
+    const reader = new FileReader();
+    reader.onload = () => resolve(reader.result as string);
+    reader.onerror = () => reject(new Error('Failed to read file'));
+    reader.readAsText(file); // no encoding argument, so the FileReader default applies
+  });
+}
+
+function escapeHtml(text: string): string {
+  const div = document.createElement('div'); // detached element, never added to the page
+  div.textContent = text; // assigned as plain text, not parsed as markup
+  return div.innerHTML; // serialized form of that text, safe to embed in an HTML string
+}
+
+function showStatus(message: string, type: 'success' | 'error' | 'info'): void {
+  const statusEl = document.getElementById('status-message');
+  if (!statusEl) return;
+
+  statusEl.textContent = message;
+  statusEl.className = `status-message ${type}`;
+  statusEl.style.display = 'block';
+  // NOTE(review): earlier hide timers are not cancelled; a newer message may vanish early.
+  const delay = type === 'error' ? 5000 : 3000; // errors stay visible longer
+  setTimeout(() => {
+    statusEl.style.display = 'none';
+  }, delay);
+}
diff --git a/src/options/index.html b/src/options/index.html
index 31efdcd..144f3a5 100644
--- a/src/options/index.html
+++ b/src/options/index.html
@@ -96,6 +96,26 @@ <h3>Server Status</h3>
         </div>
       </section>
 
+      <!-- Eval Suites Section -->
+      <section class="eval-section">
+        <div class="eval-header">
+          <h2>Eval Suites</h2>
+          <button id="import-evalsuite" class="button button-primary">+ Import Suite</button>
+        </div>
+
+        <div id="evalsuites-list" class="evalsuites-list">
+          <!-- Eval suite cards will be populated dynamically -->
+        </div>
+
+        <div id="no-evalsuites" class="no-evalsuites hidden">
+          <div class="no-evalsuites-content">
+            <h3>No eval suites imported</h3>
+            <p>Import a JSON eval suite file to test your WebMCP tool definitions. Run evals from the sidebar with <code>/eval</code>.</p>
+          </div>
+        </div>
+      </section>
+      <input type="file" id="import-evalsuite-file" class="hidden" accept=".json">
+
       <!-- Settings Section -->
       <section class="settings-section">
         <h2>Settings</h2>
diff --git a/src/options/index.ts b/src/options/index.ts
index 2efbd04..0d9a4c1 100644
--- a/src/options/index.ts
+++ b/src/options/index.ts
@@ -18,6 +18,7 @@ import { initializeWebMCPScripts } from './webmcp-scripts';
 import { initializeCommands } from './commands';
 import { openModal, closeModal, setupBackdropHandler } from './modal-manager';
 import { initializeBackupRestore } from './backup-restore';
+import { initEvalSuites } from './eval-suites';
 import {
   createCard,
   setupModalFooter,
@@ -41,6 +42,7 @@ document.addEventListener('DOMContentLoaded', async () => {
   await initializeWebMCPScripts();
   await initializeCommands();
   await initializeBackupRestore();
+  await initEvalSuites();
 });
 
 async function renderAgents() {
diff --git a/src/options/styles.css b/src/options/styles.css
index 1fc5fe2..bcea2c3 100644
--- a/src/options/styles.css
+++ b/src/options/styles.css
@@ -47,7 +47,8 @@ header h1 {
 .agents-section,
 .commands-section,
 .webmcp-section,
-.mcp-section {
+.mcp-section,
+.eval-section {
   background: white;
   border-radius: 12px;
   padding: 24px;
@@ -58,7 +59,8 @@ header h1 {
 .agents-header,
 .commands-header,
 .webmcp-header,
-.mcp-header {
+.mcp-header,
+.eval-header {
   display: flex;
   justify-content: space-between;
   align-items: center;
@@ -70,6 +72,7 @@ header h1 {
 .commands-header h2,
 .webmcp-header h2,
 .mcp-header h2,
+.eval-header h2,
 .settings-section h2 {
   font-size: 20px;
   font-weight: 600;
@@ -77,10 +80,11 @@ header h1 {
   margin: 0;
 }
 
-/* Grid lists for agents, commands, and scripts */
+/* Grid lists for agents, commands, scripts, and eval suites */
 .agents-list,
 .commands-list,
-.scripts-list {
+.scripts-list,
+.evalsuites-list {
   display: grid;
   grid-template-columns: repeat(auto-fill, minmax(350px, 1fr));
   gap: 20px;
@@ -177,13 +181,15 @@ header h1 {
 /* Empty state - unified styling */
 .no-agents,
 .no-commands,
-.no-scripts {
+.no-scripts,
+.no-evalsuites {
   margin: 0;
 }
 
 .no-agents-content,
 .no-commands-content,
-.no-scripts-content {
+.no-scripts-content,
+.no-evalsuites-content {
   text-align: center;
   padding: 48px 32px;
   background: #f8f9fa;
@@ -193,7 +199,8 @@ header h1 {
 
 .no-agents-content h3,
 .no-commands-content h3,
-.no-scripts-content h3 {
+.no-scripts-content h3,
+.no-evalsuites-content h3 {
   font-size: 16px;
   font-weight: 500;
   color: #6c757d;
@@ -202,7 +209,8 @@ header h1 {
 
 .no-agents-content p,
 .no-commands-content p,
-.no-scripts-content p {
+.no-scripts-content p,
+.no-evalsuites-content p {
   color: #6c757d;
   font-size: 14px;
   margin: 0;
@@ -213,7 +221,8 @@ header h1 {
 }
 
 .no-agents-content code,
-.no-commands-content code {
+.no-commands-content code,
+.no-evalsuites-content code {
   background: rgba(0, 0, 0, 0.06);
   padding: 2px 6px;
   border-radius: 3px;
diff --git a/src/sidebar/EvalReportBox.ts b/src/sidebar/EvalReportBox.ts
new file mode 100644
index 0000000..0462a74
--- /dev/null
+++ b/src/sidebar/EvalReportBox.ts
@@ -0,0 +1,231 @@
+/**
+ * EvalReportBox — UI component for displaying eval progress and results.
+ * Rendered into the #messages container in the sidebar.
+ */
+
+import type { EvalProgress, EvalScenarioResult, EvalSuiteResult } from '../lib/eval/types';
+
+export class EvalReportBox {
+  private readonly container: HTMLElement; // root node handed out by getElement()
+  private readonly progressSection: HTMLElement; // wraps the label and the bar
+  private readonly progressBar: HTMLElement; // track behind the fill
+  private readonly progressFill: HTMLElement; // inline width = percent complete
+  private readonly progressLabel: HTMLElement; // "<index>/<total> — <phase> <scenario id>"
+  private readonly scenarioList: HTMLElement; // one row per reported scenario result
+  private readonly summarySection: HTMLElement; // revealed by showSummary()
+
+  constructor() {
+    this.container = document.createElement('div');
+    this.container.className = 'eval-report-box';
+
+    // Progress section
+    this.progressSection = document.createElement('div');
+    this.progressSection.className = 'eval-progress';
+
+    this.progressLabel = document.createElement('div');
+    this.progressLabel.className = 'eval-progress-label';
+    this.progressLabel.textContent = 'Starting eval...';
+    this.progressSection.appendChild(this.progressLabel);
+
+    this.progressBar = document.createElement('div');
+    this.progressBar.className = 'eval-progress-bar';
+    this.progressFill = document.createElement('div');
+    this.progressFill.className = 'eval-progress-fill';
+    this.progressBar.appendChild(this.progressFill);
+    this.progressSection.appendChild(this.progressBar);
+
+    this.container.appendChild(this.progressSection);
+
+    // Scenario list
+    this.scenarioList = document.createElement('div');
+    this.scenarioList.className = 'eval-scenario-list';
+    this.container.appendChild(this.scenarioList);
+
+    // Summary section (hidden until complete)
+    this.summarySection = document.createElement('div');
+    this.summarySection.className = 'eval-summary hidden';
+    this.container.appendChild(this.summarySection);
+  }
+
+  getElement(): HTMLElement {
+    return this.container; // not attached anywhere yet; the caller inserts it into the page
+  }
+
+  /**
+   * Refresh the bar, the label and the scenario rows from a progress snapshot.
+   */
+  updateProgress(progress: EvalProgress): void {
+    const { phase, currentScenarioIndex, totalScenarios, currentScenarioId, scenarioResults } =
+      progress;
+
+    // Bar width is the share of scenarios reached so far (0% for an empty suite)
+    const pct = totalScenarios > 0 ? (currentScenarioIndex / totalScenarios) * 100 : 0;
+    this.progressFill.style.width = `${pct}%`;
+
+    // Update label
+    if (phase === 'complete') {
+      this.progressLabel.textContent = `${totalScenarios}/${totalScenarios} — Complete`;
+    } else {
+      this.progressLabel.textContent = `${currentScenarioIndex}/${totalScenarios} — ${phase} ${currentScenarioId}`;
+    }
+
+    // Rows are append-only: the existing row count marks which results are already rendered
+    const existingCount = this.scenarioList.children.length;
+    for (let i = existingCount; i < scenarioResults.length; i++) {
+      this.scenarioList.appendChild(this.createScenarioRow(scenarioResults[i]));
+    }
+  }
+
+  /**
+   * Fill the bar, reveal the summary panel and render the aggregate counts and scores.
+   */
+  showSummary(result: EvalSuiteResult): void {
+    this.progressFill.style.width = '100%';
+    this.progressLabel.textContent = `${result.summary.total}/${result.summary.total} — Complete`;
+
+    const { summary } = result;
+    // suiteName is free text (presumably from the imported suite), so escape it for innerHTML.
+    this.summarySection.classList.remove('hidden');
+    this.summarySection.innerHTML = `
+      <div class="eval-summary-title">Summary — ${escapeHtml(result.suiteName)}</div>
+      <div class="eval-summary-grid">
+        <div class="eval-summary-stat">
+          <span class="eval-stat-value">${summary.total}</span>
+          <span class="eval-stat-label">Total</span>
+        </div>
+        <div class="eval-summary-stat eval-stat-pass">
+          <span class="eval-stat-value">${summary.passed}</span>
+          <span class="eval-stat-label">Passed</span>
+        </div>
+        <div class="eval-summary-stat eval-stat-fail">
+          <span class="eval-stat-value">${summary.failed}</span>
+          <span class="eval-stat-label">Failed</span>
+        </div>
+        <div class="eval-summary-stat eval-stat-error">
+          <span class="eval-stat-value">${summary.errored}</span>
+          <span class="eval-stat-label">Errors</span>
+        </div>
+      </div>
+      <div class="eval-summary-scores">
+        <div>Avg Tool Score: <strong>${(summary.avgToolCallScore * 100).toFixed(0)}%</strong></div>
+        <div>Avg Judge Score: <strong>${(summary.avgJudgeScore * 100).toFixed(0)}%</strong></div>
+        <div>Avg Combined: <strong>${(summary.avgCombinedScore * 100).toFixed(0)}%</strong></div>
+        <div>Duration: <strong>${(result.totalDurationMs / 1000).toFixed(1)}s</strong></div>
+      </div>
+    `;
+  }
+
+  /**
+   * Show an error that prevented the eval from running (set as text, never parsed as HTML).
+   */
+  showError(message: string): void {
+    this.progressLabel.textContent = 'Error';
+    this.progressFill.style.width = '0%';
+
+    const errorEl = document.createElement('div');
+    errorEl.className = 'eval-error';
+    errorEl.textContent = message;
+    this.container.appendChild(errorEl);
+  }
+
+  private createScenarioRow(result: EvalScenarioResult): HTMLElement {
+    const row = document.createElement('div');
+    row.className = `eval-scenario-row eval-scenario-${result.status}`;
+    // One collapsible row per scenario: a clickable header plus a details panel.
+    // Header (always visible)
+    const header = document.createElement('div');
+    header.className = 'eval-scenario-header';
+    header.style.cursor = 'pointer';
+
+    const badge = document.createElement('span');
+    badge.className = `eval-status-badge eval-badge-${result.status}`;
+    badge.textContent = result.status.toUpperCase();
+    header.appendChild(badge);
+
+    // ID + prompt stacked vertically
+    const idBlock = document.createElement('div');
+    idBlock.className = 'eval-scenario-id-block';
+
+    const id = document.createElement('div');
+    id.className = 'eval-scenario-id';
+    id.textContent = result.scenarioId;
+    idBlock.appendChild(id);
+
+    const prompt = document.createElement('div');
+    prompt.className = 'eval-scenario-prompt';
+    prompt.textContent = result.prompt;
+    idBlock.appendChild(prompt);
+
+    header.appendChild(idBlock);
+
+    const score = document.createElement('span');
+    score.className = 'eval-scenario-score';
+    score.textContent = `${(result.combinedScore * 100).toFixed(0)}%`;
+    header.appendChild(score);
+
+    const duration = document.createElement('span');
+    duration.className = 'eval-scenario-duration';
+    duration.textContent = `${(result.durationMs / 1000).toFixed(1)}s`;
+    header.appendChild(duration);
+
+    const chevron = document.createElement('span');
+    chevron.className = 'eval-chevron';
+    chevron.innerHTML = `<svg width="14" height="14" viewBox="0 0 16 16" fill="none">
+      <path d="M6 4L10 8L6 12" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </svg>`;
+    header.appendChild(chevron);
+
+    row.appendChild(header);
+
+    // Details (collapsible)
+    const details = document.createElement('div');
+    details.className = 'eval-scenario-details';
+
+    // Tool calls: name and status are both escaped because they are interpolated into innerHTML
+    if (result.actualToolCalls.length > 0) {
+      const toolCallsHtml = result.actualToolCalls
+        .map((tc) => `<code>${escapeHtml(tc.toolName)}</code> (${escapeHtml(String(tc.status))})`)
+        .join(', ');
+      details.innerHTML += `<div class="eval-detail-row"><strong>Tool Calls:</strong> ${toolCallsHtml}</div>`;
+    } else {
+      details.innerHTML += `<div class="eval-detail-row"><strong>Tool Calls:</strong> <em>none</em></div>`;
+    }
+
+    // Tool score details
+    const ts = result.toolCallScore;
+    if (ts.expectedMissed.length > 0) {
+      details.innerHTML += `<div class="eval-detail-row eval-detail-warn"><strong>Missing:</strong> ${ts.expectedMissed.map((t) => `<code>${escapeHtml(t)}</code>`).join(', ')}</div>`;
+    }
+    if (ts.forbiddenCalled.length > 0) {
+      details.innerHTML += `<div class="eval-detail-row eval-detail-error"><strong>Forbidden:</strong> ${ts.forbiddenCalled.map((t) => `<code>${escapeHtml(t)}</code>`).join(', ')}</div>`;
+    }
+
+    // Judge verdict
+    if (result.judgeScore) {
+      const js = result.judgeScore;
+      details.innerHTML += `<div class="eval-detail-row"><strong>Judge:</strong> ${escapeHtml(js.verdict)} (${(js.score * 100).toFixed(0)}%) — ${escapeHtml(js.reasoning)}</div>`;
+    }
+
+    // Error
+    if (result.error) {
+      details.innerHTML += `<div class="eval-detail-row eval-detail-error"><strong>Error:</strong> ${escapeHtml(result.error)}</div>`;
+    }
+
+    row.appendChild(details);
+
+    // Toggle the "expanded" class on click; the stylesheet shows the details panel based on it
+    let expanded = false;
+    header.addEventListener('click', () => {
+      expanded = !expanded;
+      row.classList.toggle('expanded', expanded);
+    });
+
+    return row;
+  }
+}
+
+function escapeHtml(text: string): string {
+  // Round-trip via a detached <div>: write as text, read back as entity-encoded HTML.
+  const holder = Object.assign(document.createElement('div'), { textContent: text });
+  return holder.innerHTML;
+}
diff --git a/src/sidebar/index.ts b/src/sidebar/index.ts
index 79c0569..7637e5c 100644
--- a/src/sidebar/index.ts
+++ b/src/sidebar/index.ts
@@ -12,6 +12,8 @@ import { ReasoningBox } from './ReasoningBox';
 import { TextBox } from './TextBox';
 import { StreamingMarkdownRenderer } from './StreamingMarkdownRenderer';
 import { CommandRegistry, CommandProcessor, createBuiltinCommands } from '../lib/commands';
+import { EvalRunner, type SidebarInterface } from '../lib/eval/runner';
+import { EvalReportBox } from './EvalReportBox';
 
 // Streaming session interface to encapsulate all streaming state
 interface StreamingSession {
@@ -59,6 +61,9 @@ let pendingAttachments: ImageAttachment[] = [];
 let commandRegistry: CommandRegistry;
 let commandProcessor: CommandProcessor;
 
+// Eval mode state
+let activeEvalRunner: EvalRunner | null = null;
+
 // Scroll management state
 let isUserAtBottom = true; // Initially at bottom
 
@@ -532,6 +537,15 @@ async function handleSendMessage() {
   if (!text && pendingAttachments.length === 0) return;
   if (isLoading) return;
 
+  // Intercept /eval command before the command processor
+  if (text === '/eval' || text.startsWith('/eval ')) {
+    messageInput.value = '';
+    messageInput.style.height = '88px';
+    const suiteName = text.startsWith('/eval ') ? text.slice(6).trim() : undefined;
+    startEvalMode(suiteName || undefined);
+    return;
+  }
+
   // Clear input immediately for better UX
   messageInput.value = '';
   messageInput.style.height = '88px';
@@ -1204,11 +1218,19 @@ function cancelCurrentStream() {
   }
 }
 
-// Cancel current stream on Escape OR clear attachments
+// Cancel current stream on Escape OR clear attachments OR abort eval
 document.addEventListener('keydown', (e) => {
   if (e.key === 'Escape') {
+    // Priority 0: Abort active eval
+    if (activeEvalRunner) {
+      activeEvalRunner.abort();
+      activeEvalRunner = null;
+      isLoading = false;
+      updateSendButton();
+      log.info('[Sidebar] Eval run aborted');
+    }
     // Priority 1: Cancel active stream
-    if (currentSession) {
+    else if (currentSession) {
       cancelCurrentStream();
     }
     // Priority 2: Clear pending attachments
@@ -1220,4 +1242,104 @@ document.addEventListener('keydown', (e) => {
   }
 });
 
+/**
+ * Start eval mode: load a suite, run all of its scenarios through the chat pipeline, show a report.
+ * Never rejects, because the caller does not await it: every failure is surfaced in the UI instead.
+ * @param suiteName - Suite to run (case-insensitive match); omit to run the first stored suite.
+ */
+async function startEvalMode(suiteName?: string): Promise<void> {
+  if (isLoading) {
+    addMessage('error', 'Cannot start eval while a message is loading.');
+    return;
+  }
+
+  // Load eval suites from storage; a storage failure must not become an unhandled rejection.
+  let suites: Awaited<ReturnType<typeof configStorage.getEvalSuites>>;
+  try {
+    suites = await configStorage.getEvalSuites();
+  } catch (error) {
+    log.error('[Sidebar] Failed to load eval suites:', error);
+    addMessage('error', 'Failed to load eval suites from storage.');
+    return;
+  }
+  if (suites.length === 0) {
+    addMessage('error', 'No eval suites imported. Upload one in Settings.');
+    return;
+  }
+
+  // Pick suite: by name or first available
+  const suite = suiteName
+    ? suites.find((s) => s.name.toLowerCase() === suiteName.toLowerCase())
+    : suites[0];
+  if (!suite) {
+    const available = suites.map((s) => s.name).join(', ');
+    addMessage('error', `Eval suite "${suiteName}" not found. Available: ${available}`);
+    return;
+  }
+  if (!currentAgentId) {
+    addMessage('error', 'No agent selected. Select an agent before running evals.');
+    return;
+  }
+  // Clear conversation and set up report UI
+  clearConversation();
+  isLoading = true;
+  updateSendButton();
+  const reportBox = new EvalReportBox();
+  messagesContainer.appendChild(reportBox.getElement());
+
+  // Adapter through which the runner drives the sidebar; tab and agent are captured once, here.
+  const sidebarAdapter: SidebarInterface = {
+    attachedTabId,
+    currentAgentId,
+    getMessageHistory: () => messageHistory,
+    clearConversation: () => {
+      // Clear without adding welcome message (we manage UI ourselves during eval)
+      messageHistory = [];
+      // Remove everything except the report box
+      for (const child of Array.from(messagesContainer.children)) {
+        if (!child.classList.contains('eval-report-box')) child.remove();
+      }
+      if (currentSession) {
+        currentSession.port.disconnect();
+        currentSession = null;
+      }
+      isUserAtBottom = true;
+    },
+    sendMessage: async (text: string) => {
+      // Append the prompt as a user message, then wait for streamAIResponse() to settle
+      const userMsg: ChatMessage = {
+        id: globalThis.crypto.randomUUID(),
+        role: 'user',
+        content: text,
+        timestamp: Date.now(),
+      };
+      messageHistory.push(userMsg);
+      await streamAIResponse();
+    },
+  };
+
+  // Publish the runner so the Escape handler can abort it
+  const runner = new EvalRunner(sidebarAdapter);
+  activeEvalRunner = runner;
+  runner.setProgressCallback((progress) => {
+    reportBox.updateProgress(progress);
+    messagesContainer.scrollTop = messagesContainer.scrollHeight;
+  });
+  try {
+    const result = await runner.run(suite);
+    reportBox.showSummary(result);
+  } catch (error) {
+    log.error('[Sidebar] Eval run failed:', error);
+    reportBox.showError(error instanceof Error ? error.message : 'Eval run failed');
+  } finally {
+    // Escape frees the UI early; reset shared state only if no newer run has taken it over.
+    if (activeEvalRunner === null || activeEvalRunner === runner) {
+      activeEvalRunner = null;
+      isLoading = false;
+      updateSendButton();
+    }
+    messagesContainer.scrollTop = messagesContainer.scrollHeight;
+  }
+}
+
 export {};
diff --git a/src/sidebar/styles.css b/src/sidebar/styles.css
index 2714798..6d9d136 100644
--- a/src/sidebar/styles.css
+++ b/src/sidebar/styles.css
@@ -1392,3 +1392,258 @@ body {
 .tool-item-box.expanded .chevron {
   transform: rotate(90deg);
 }
+
+/* ============================================
+   Eval Report Box Styles (markup is built by src/sidebar/EvalReportBox.ts)
+   ============================================ */
+
+.eval-report-box {
+  margin: 12px 16px;
+  padding: 16px;
+  background: var(--bg-secondary);
+  border: 1px solid var(--border-color);
+  border-radius: 12px;
+}
+
+/* Progress: text label above a thin track; the fill's width is driven from script */
+.eval-progress {
+  margin-bottom: 12px;
+}
+
+.eval-progress-label {
+  font-size: 13px;
+  font-weight: 500;
+  color: var(--text-secondary);
+  margin-bottom: 6px;
+}
+
+.eval-progress-bar {
+  height: 6px;
+  background: var(--bg-tertiary);
+  border-radius: 3px;
+  overflow: hidden;
+}
+
+.eval-progress-fill {
+  height: 100%;
+  background: var(--accent-primary);
+  border-radius: 3px;
+  transition: width 0.3s ease; /* animates each change of the inline width */
+  width: 0%; /* starting width; EvalReportBox overrides it through the inline style */
+}
+
+/* Scenario list: one row per scenario result; the pass/fail/error suffix comes from result.status */
+.eval-scenario-list {
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+.eval-scenario-row {
+  border: 1px solid var(--border-color);
+  border-radius: 8px;
+  overflow: hidden;
+  background: var(--bg-primary);
+}
+
+.eval-scenario-row.eval-scenario-pass {
+  border-left: 3px solid var(--success-color);
+}
+
+.eval-scenario-row.eval-scenario-fail {
+  border-left: 3px solid var(--error-color);
+}
+
+.eval-scenario-row.eval-scenario-error {
+  border-left: 3px solid #f59e0b; /* literal amber: these rules use no theme variable for the error state */
+}
+
+.eval-scenario-header {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  padding: 8px 10px;
+  font-size: 13px;
+}
+
+.eval-status-badge {
+  display: inline-block;
+  padding: 1px 6px;
+  border-radius: 4px;
+  font-size: 10px;
+  font-weight: 700;
+  letter-spacing: 0.5px;
+  text-transform: uppercase;
+}
+
+.eval-badge-pass {
+  background: rgba(16, 185, 129, 0.15);
+  color: var(--success-color);
+}
+
+.eval-badge-fail {
+  background: rgba(239, 68, 68, 0.15);
+  color: var(--error-color);
+}
+
+.eval-badge-error {
+  background: rgba(245, 158, 11, 0.15);
+  color: #f59e0b;
+}
+
+.eval-scenario-id-block {
+  flex: 1;
+  min-width: 0; /* lets this flex item shrink so the ellipsis rules below can take effect */
+  overflow: hidden;
+}
+
+.eval-scenario-id {
+  font-weight: 500;
+  color: var(--text-primary);
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.eval-scenario-prompt {
+  font-size: 12px;
+  color: var(--text-secondary);
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+  margin-top: 1px;
+}
+
+.eval-scenario-score {
+  font-weight: 600;
+  color: var(--accent-primary);
+  font-size: 12px;
+}
+
+.eval-scenario-duration {
+  font-size: 11px;
+  color: var(--text-secondary);
+}
+
+.eval-chevron {
+  color: var(--text-secondary);
+  transition: transform 0.2s ease;
+  display: flex;
+  align-items: center;
+}
+
+.eval-scenario-row.expanded .eval-chevron {
+  transform: rotate(90deg); /* quarter turn while the row carries .expanded */
+}
+
+.eval-scenario-details {
+  display: none; /* collapsed by default; the .expanded rule below reveals it */
+  padding: 8px 10px 10px;
+  border-top: 1px solid var(--border-color);
+  font-size: 12px;
+  line-height: 1.5;
+}
+
+.eval-scenario-row.expanded .eval-scenario-details {
+  display: block;
+}
+
+.eval-detail-row {
+  margin-bottom: 4px;
+  color: var(--text-secondary);
+}
+
+.eval-detail-row strong {
+  color: var(--text-primary);
+}
+
+.eval-detail-row code {
+  background: var(--code-bg);
+  padding: 1px 4px;
+  border-radius: 3px;
+  font-size: 11px;
+}
+
+.eval-detail-warn {
+  color: #f59e0b;
+}
+
+.eval-detail-error {
+  color: var(--error-color);
+}
+
+/* Summary: the element also carries .hidden until EvalReportBox.showSummary() removes it */
+.eval-summary {
+  margin-top: 12px;
+  padding-top: 12px;
+  border-top: 1px solid var(--border-color);
+}
+
+.eval-summary-title {
+  font-size: 14px;
+  font-weight: 600;
+  color: var(--text-primary);
+  margin-bottom: 10px;
+}
+
+.eval-summary-grid {
+  display: grid;
+  grid-template-columns: repeat(4, 1fr); /* Total, Passed, Failed, Errors */
+  gap: 8px;
+  margin-bottom: 10px;
+}
+
+.eval-summary-stat {
+  text-align: center;
+  padding: 8px;
+  background: var(--bg-tertiary);
+  border-radius: 8px;
+}
+
+.eval-stat-value {
+  display: block;
+  font-size: 20px;
+  font-weight: 700;
+  color: var(--text-primary);
+}
+
+.eval-stat-label {
+  display: block;
+  font-size: 11px;
+  color: var(--text-secondary);
+  text-transform: uppercase;
+  letter-spacing: 0.5px;
+}
+
+.eval-stat-pass .eval-stat-value {
+  color: var(--success-color);
+}
+
+.eval-stat-fail .eval-stat-value {
+  color: var(--error-color);
+}
+
+.eval-stat-error .eval-stat-value {
+  color: #f59e0b; /* same literal amber as the error badge and row border */
+}
+
+.eval-summary-scores {
+  display: grid;
+  grid-template-columns: 1fr 1fr; /* four score/duration lines laid out two per row */
+  gap: 4px;
+  font-size: 12px;
+  color: var(--text-secondary);
+}
+
+.eval-summary-scores strong {
+  color: var(--text-primary);
+}
+
+.eval-error { /* box appended by EvalReportBox.showError() */
+  margin-top: 8px;
+  padding: 8px 12px;
+  background: var(--error-bg);
+  color: var(--error-text);
+  border-radius: 6px;
+  font-size: 13px;
+}
diff --git a/src/types/index.ts b/src/types/index.ts
index 8e7026e..f0c8f7d 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -130,6 +130,15 @@ export interface WebMCPScriptsUpdatedMessage {
   type: 'WEBMCP_SCRIPTS_UPDATED';
 }
 
+export interface EvalJudgeMessage {
+  type: 'EVAL_JUDGE'; // discriminant matched by the background message switch
+  agentId: string; // agent whose model is asked to act as judge
+  prompt: string; // scenario prompt, echoed to the judge
+  assistantResponse: string; // assistant reply under evaluation
+  toolCalls: { toolName: string; input: unknown; output: unknown; status: string }[]; // summarized one per line
+  postConditions: string; // expectations the judge must verify
+}
+
 // Union type for all possible extension messages
 export type ExtensionMessage =
   | GetConfigMessage
@@ -150,7 +159,8 @@ export type ExtensionMessage =
   | WebMCPCallToolMessage
   | WebMCPGetToolsMessage
   | WebMCPToolsChangedMessage
-  | WebMCPScriptsUpdatedMessage;
+  | WebMCPScriptsUpdatedMessage
+  | EvalJudgeMessage;
 
 // Response wrapper for message handlers
 export interface MessageResponse<T = unknown> {