SpillwaveSolutions · RichardHightower · Feb 26, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/.github/workflows/e2e-cli.yml b/.github/workflows/e2e-cli.yml
@@ -0,0 +1,141 @@
+# End-to-end CLI test workflow.
+#
+# Builds the memory-daemon and memory-ingest binaries, then runs the bats
+# suite in tests/cli/<cli>/ once per (cli, os) matrix cell. A cell whose
+# test directory does not exist is skipped with a notice instead of failing.
+name: E2E CLI Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# Least privilege: the steps below only read the repository.
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_BACKTRACE: 1
+
+jobs:
+  e2e-cli:
+    name: E2E CLI - ${{ matrix.cli }} (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    environment: e2e-cli
+    strategy:
+      # Let every matrix cell run to completion even when another cell fails.
+      fail-fast: false
+      matrix:
+        cli: [claude-code, gemini, opencode, copilot, codex]
+        os: [ubuntu-24.04, macos-latest]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y protobuf-compiler libclang-dev
+
+      - name: Install system dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          brew install protobuf llvm
+          # Export the Homebrew libclang location to all later steps.
+          echo "LIBCLANG_PATH=$(brew --prefix llvm)/lib" >> "$GITHUB_ENV"
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Cache cargo registry
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "e2e-cli-${{ matrix.os }}"
+
+      - name: Build daemon and ingest binaries
+        run: cargo build -p memory-daemon -p memory-ingest
+
+      - name: Install bats-core (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get install -y bats
+
+      - name: Install bats-core (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          brew install bats-core
+
+      - name: Install bats helper libraries
+        run: |
+          mkdir -p tests/cli/lib
+          git clone --depth 1 https://github.com/bats-core/bats-support.git tests/cli/lib/bats-support
+          git clone --depth 1 https://github.com/bats-core/bats-assert.git tests/cli/lib/bats-assert
+
+      - name: Verify jq is available
+        run: jq --version
+
+      - name: Run bats tests
+        id: bats_run
+        # A failing run is recorded in steps.bats_run.outcome without stopping
+        # the job here; "Check bats test result" fails the job afterwards.
+        continue-on-error: true
+        env:
+          BATS_LIB_PATH: tests/cli/lib
+          MEMORY_DAEMON_BIN: target/debug/memory-daemon
+          MEMORY_INGEST_BIN: target/debug/memory-ingest
+        run: |
+          # The default run shell is "bash -e {0}" with no pipefail, so without
+          # this the pipeline below exits with tee's status and a bats failure
+          # would never reach steps.bats_run.outcome.
+          set -o pipefail
+          mkdir -p tests/cli/.runs
+          if [ -d "tests/cli/${{ matrix.cli }}" ]; then
+            bats --report-formatter junit --output tests/cli/.runs tests/cli/${{ matrix.cli }}/ 2>&1 | tee e2e-cli-results.txt
+          else
+            echo "No tests found for ${{ matrix.cli }} — skipping"
+            echo "::notice::No bats tests found for ${{ matrix.cli }}, skipping"
+            exit 0
+          fi
+
+      - name: Upload JUnit XML report
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: junit-${{ matrix.cli }}-${{ matrix.os }}
+          path: tests/cli/.runs/report.xml
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Upload failure artifacts
+        if: failure() || steps.bats_run.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: failure-artifacts-${{ matrix.cli }}-${{ matrix.os }}
+          path: |
+            tests/cli/.runs/
+            e2e-cli-results.txt
+          if-no-files-found: ignore
+          retention-days: 7
+
+      - name: Report summary
+        if: always()
+        run: |
+          echo "## E2E CLI Results: ${{ matrix.cli }} (${{ matrix.os }})" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          if [ -f e2e-cli-results.txt ]; then
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+            tail -20 e2e-cli-results.txt >> "$GITHUB_STEP_SUMMARY"
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "No test results file found." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Check bats test result
+        if: always() && steps.bats_run.outcome == 'failure'
+        run: |
+          echo "Bats tests failed for ${{ matrix.cli }}"
+          exit 1
diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
@@ -1,5 +1,9 @@
 # Agent Memory
 
+## Current Milestone: v2.4 Headless CLI Testing
+
+**Goal:** Build a shell-based E2E test harness that spawns real CLI processes (Claude Code, OpenCode, Gemini, Copilot, Codex) in headless mode, validating integration behavior in isolated workspaces with matrix reporting.
+
 ## Current State
 
 **Version:** v2.3 (Shipped 2026-02-12)
@@ -180,7 +184,18 @@ Agent Memory implements a layered cognitive architecture:
 - [x] Performance benchmark harness with ingest, TOC, BM25, vector, topic graph latency — v2.3
 - [x] Baseline metrics for all tier/mode combinations with p50/p90/p99 percentiles — v2.3
 
-### Active (future)
+### Active (v2.4 — Headless CLI Testing)
+
+**Headless Multi-CLI E2E Harness**
+- [ ] Codex CLI adapter (new — no hook support, commands/skills only)
+- [ ] Shell-based E2E harness with an isolated workspace per test file
+- [ ] Claude Code CLI headless tests (framework phase — builds isolation, reporting, fixtures)
+- [ ] OpenCode CLI headless tests
+- [ ] Gemini CLI headless tests
+- [ ] Copilot CLI headless tests
+- [ ] Codex CLI headless tests (hooks excluded)
+- [ ] Matrix reporting: CLI × scenario → pass/fail/skipped
+- [ ] CI integration with artifact retention on failure
 
 **Deferred**
 - Cross-project unified memory
@@ -267,6 +282,11 @@ CLI client and agent skill query the daemon. Agent receives TOC navigation tools
 | Wizard-style setup skills | Confirm before edits, verification-only commands | ✓ Validated v2.3 |
 | perf_bench as binary | Standalone binary in e2e-tests crate; not unit tests | ✓ Validated v2.3 |
 | Baseline JSON with thresholds | warning/severe thresholds per step for regression detection | ✓ Validated v2.3 |
+| Shell-first E2E harness | Fits CLI testing model; Python/Bun for validation only | — v2.4 |
+| Real CLI processes | Spawn actual CLIs headless, not simulated behavior | — v2.4 |
+| One phase per CLI | Each CLI gets own harness phase; Claude Code first builds framework | — v2.4 |
+| Keep both test layers | Existing cargo E2E tests stay; CLI harness is separate layer | — v2.4 |
+| Codex adapter (no hooks) | Codex lacks hook support; skip hook-dependent tests | — v2.4 |
 
 ---
-*Last updated: 2026-02-12 after v2.3 milestone completion*
+*Last updated: 2026-02-22 after v2.4 milestone start*
diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
@@ -0,0 +1,117 @@
+# Requirements: Agent Memory v2.4
+
+**Defined:** 2026-02-22
+**Core Value:** Agent can answer "what were we talking about last week?" without scanning everything
+
+## v2.4 Requirements
+
+Requirements for v2.4 Headless CLI Testing milestone. Each maps to roadmap phases.
+
+### Harness Framework
+
+- [ ] **HARN-01**: Shell-based E2E harness using bats-core with isolated workspace per test file
+- [ ] **HARN-02**: Daemon lifecycle management (start/stop/health check per workspace, OS-assigned port)
+- [ ] **HARN-03**: CLI availability detection with graceful skip when binary not installed
+- [ ] **HARN-04**: Common helper library (common.bash) with workspace, daemon, CLI wrapper functions
+- [ ] **HARN-05**: JUnit XML reporting via bats native formatter
+- [ ] **HARN-06**: CI integration with GitHub Actions matrix (CLI x category) and artifact retention on failure
+- [ ] **HARN-07**: Fixture data directory with predefined JSON payloads and expected outputs
+
+### Claude Code Tests
+
+- [ ] **CLDE-01**: Claude Code headless smoke tests (binary detection, `-p` invocation, JSON output)
+- [ ] **CLDE-02**: Claude Code hook capture tests (SessionStart, UserPrompt, PostToolUse, Stop payloads)
+- [ ] **CLDE-03**: Claude Code E2E pipeline test (hook fire -> daemon ingest -> gRPC query verification)
+- [ ] **CLDE-04**: Claude Code negative tests (daemon down, malformed input, timeout enforcement)
+
+### Gemini CLI Tests
+
+- [ ] **GEMI-01**: Gemini CLI headless smoke tests (binary detection, positional args, JSON output)
+- [ ] **GEMI-02**: Gemini CLI hook capture tests (JSON stdin format, agent field verification)
+- [ ] **GEMI-03**: Gemini CLI E2E pipeline test (hook -> ingest -> query)
+- [ ] **GEMI-04**: Gemini CLI negative tests
+
+### OpenCode Tests
+
+- [ ] **OPEN-01**: OpenCode headless smoke tests (binary detection, `-p -q -f json` invocation)
+- [ ] **OPEN-02**: OpenCode hook capture tests
+- [ ] **OPEN-03**: OpenCode E2E pipeline test (hook -> ingest -> query)
+- [ ] **OPEN-04**: OpenCode negative tests
+
+### Copilot CLI Tests
+
+- [ ] **CPLT-01**: Copilot CLI headless smoke tests (binary detection, `-p --yes --allow-all-tools`)
+- [ ] **CPLT-02**: Copilot CLI hook capture tests (session ID synthesis verification)
+- [ ] **CPLT-03**: Copilot CLI E2E pipeline test (hook -> ingest -> query)
+- [ ] **CPLT-04**: Copilot CLI negative tests
+
+### Codex CLI
+
+- [ ] **CDEX-01**: Codex CLI adapter (commands + skills, no hooks, sandbox workaround docs)
+- [ ] **CDEX-02**: Codex CLI headless smoke tests (binary detection, `codex exec -q --full-auto`)
+- [ ] **CDEX-03**: Codex CLI command invocation tests (hooks skipped)
+- [ ] **CDEX-04**: Codex CLI negative tests (hooks skipped)
+- [ ] **CDEX-05**: Cross-CLI matrix report aggregation (CLI x scenario -> pass/fail/skipped)
+
+## Future Requirements
+
+### Post-v2.4
+
+- Windows CLI testing support
+- Performance regression tracking in shell tests
+- GUI/dashboard for test results
+- Cross-project shared harness (Agent RuleZ, Agent Cron, Agent CLOD)
+
+## Out of Scope
+
+| Feature | Reason |
+|---------|--------|
+| Mock CLI simulators | Defeats the E2E purpose; tests the mock, not the CLI |
+| Interactive/TUI testing | Brittle keystroke simulation; headless only |
+| Full LLM round-trip tests | Slow, expensive, non-deterministic; test mechanical pipeline |
+| API key management in tests | Use CI secrets; skip locally when absent |
+| Custom test framework | Use bats-core instead; avoids the maintenance burden of a bespoke framework |
+| Windows support | macOS/Linux only for v2.4 |
+| Shared state between tests | Each test file gets own workspace and daemon |
+
+## Traceability
+
+| Requirement | Phase | Status |
+|-------------|-------|--------|
+| HARN-01 | Phase 30 | Pending |
+| HARN-02 | Phase 30 | Pending |
+| HARN-03 | Phase 30 | Pending |
+| HARN-04 | Phase 30 | Pending |
+| HARN-05 | Phase 30 | Pending |
+| HARN-06 | Phase 30 | Pending |
+| HARN-07 | Phase 30 | Pending |
+| CLDE-01 | Phase 30 | Pending |
+| CLDE-02 | Phase 30 | Pending |
+| CLDE-03 | Phase 30 | Pending |
+| CLDE-04 | Phase 30 | Pending |
+| GEMI-01 | Phase 31 | Pending |
+| GEMI-02 | Phase 31 | Pending |
+| GEMI-03 | Phase 31 | Pending |
+| GEMI-04 | Phase 31 | Pending |
+| OPEN-01 | Phase 32 | Pending |
+| OPEN-02 | Phase 32 | Pending |
+| OPEN-03 | Phase 32 | Pending |
+| OPEN-04 | Phase 32 | Pending |
+| CPLT-01 | Phase 33 | Pending |
+| CPLT-02 | Phase 33 | Pending |
+| CPLT-03 | Phase 33 | Pending |
+| CPLT-04 | Phase 33 | Pending |
+| CDEX-01 | Phase 34 | Pending |
+| CDEX-02 | Phase 34 | Pending |
+| CDEX-03 | Phase 34 | Pending |
+| CDEX-04 | Phase 34 | Pending |
+| CDEX-05 | Phase 34 | Pending |
+
+**Coverage:**
+- v2.4 requirements: 28 total
+- Mapped to phases: 28
+- Unmapped: 0 ✓
+
+---
+*Requirements defined: 2026-02-22*
+*Last updated: 2026-02-22 after initial definition*
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
@@ -7,6 +7,7 @@
 - ✅ **v2.1 Multi-Agent Ecosystem** — Phases 18-23 (shipped 2026-02-10)
 - ✅ **v2.2 Production Hardening** — Phases 24-27 (shipped 2026-02-11)
 - ✅ **v2.3 Install & Setup Experience** — Phases 28-29 (shipped 2026-02-12)
+- 🚧 **v2.4 Headless CLI Testing** — Phases 30-34 (in progress)
 
 ## Phases
 
@@ -79,6 +80,81 @@ See: `.planning/milestones/v2.3-ROADMAP.md`
 
 </details>
 
+### v2.4 Headless CLI Testing (In Progress)
+
+**Milestone Goal:** Build a shell-based E2E test harness that spawns real CLI processes in headless mode, validating integration behavior across 5 AI coding CLIs with isolated workspaces and matrix reporting.
+
+- [ ] **Phase 30: Claude Code CLI Harness** - Build bats-core framework + all Claude Code headless tests
+- [ ] **Phase 31: Gemini CLI Tests** - Apply harness to Gemini CLI with JSON stdin hooks
+- [ ] **Phase 32: OpenCode CLI Tests** - Apply harness to OpenCode CLI with headless quirk handling
+- [ ] **Phase 33: Copilot CLI Tests** - Apply harness to Copilot CLI with session ID synthesis
+- [ ] **Phase 34: Codex CLI Adapter + Tests + Matrix Report** - New adapter, hook-excluded tests, cross-CLI matrix
+
+## Phase Details
+
+### Phase 30: Claude Code CLI Harness
+**Goal**: Developers can run isolated shell-based E2E tests for Claude Code that validate the full hook-to-query pipeline, with reusable framework infrastructure for all subsequent CLI phases
+**Depends on**: Phase 29 (v2.3 complete)
+**Requirements**: HARN-01, HARN-02, HARN-03, HARN-04, HARN-05, HARN-06, HARN-07, CLDE-01, CLDE-02, CLDE-03, CLDE-04
+**Success Criteria** (what must be TRUE):
+  1. Running `bats tests/cli/claude-code/` executes all Claude Code tests in isolated temp workspaces, each with its own daemon on an OS-assigned port
+  2. Tests that require `claude` binary skip gracefully with informative message when binary is not installed
+  3. Claude Code hook fires produce events visible via gRPC query in the same test workspace
+  4. JUnit XML report is generated and CI matrix job uploads failure artifacts (logs, workspace tarballs)
+  5. A `tests/cli/lib/common.bash` library exists that other CLI test phases can source (via `load ../lib/common`) for workspace setup, daemon lifecycle, and CLI wrappers
+**Plans:** 6 plans
+Plans:
+- [x] 30-01-PLAN.md — Common helper library (common.bash + cli_wrappers.bash) + workspace/daemon lifecycle
+- [x] 30-02-PLAN.md — Fixture JSON payloads + e2e-cli.yml CI workflow with 5-CLI matrix
+- [x] 30-03-PLAN.md — Smoke tests + hook capture tests (all event types via stdin pipe)
+- [x] 30-04-PLAN.md — E2E pipeline tests + negative tests (daemon down, malformed, timeout)
+- [x] 30-05-PLAN.md — Fix memory-ingest MEMORY_DAEMON_ADDR env var support
+- [x] 30-06-PLAN.md — Fix hooks.bats Layer 2 assertions + ROADMAP path correction
+
+### Phase 31: Gemini CLI Tests
+**Goal**: Developers can run isolated shell-based E2E tests for Gemini CLI that validate hook capture and the full ingest-to-query pipeline
+**Depends on**: Phase 30 (framework)
+**Requirements**: GEMI-01, GEMI-02, GEMI-03, GEMI-04
+**Success Criteria** (what must be TRUE):
+  1. Running `bats tests/cli/gemini/` executes all Gemini tests in isolated workspaces, reusing Phase 30 common helpers
+  2. Gemini CLI binary detection and graceful skip works when `gemini` is not installed
+  3. Gemini hook handler correctly captures events with agent field set to "gemini" and events are queryable via gRPC
+  4. Negative tests verify daemon-down and malformed-input handling without test failures leaking
+**Plans**: TBD
+
+### Phase 32: OpenCode CLI Tests
+**Goal**: Developers can run isolated shell-based E2E tests for OpenCode CLI, handling its less mature headless mode with appropriate skip/warn patterns
+**Depends on**: Phase 30 (framework)
+**Requirements**: OPEN-01, OPEN-02, OPEN-03, OPEN-04
+**Success Criteria** (what must be TRUE):
+  1. Running `bats tests/cli/opencode/` executes all OpenCode tests in isolated workspaces, reusing Phase 30 common helpers
+  2. OpenCode invocation uses `-p -q -f json` flags and timeout guards prevent hangs from headless mode quirks
+  3. OpenCode hook capture produces events with agent field "opencode" queryable via gRPC pipeline test
+  4. Negative tests cover daemon-down and timeout scenarios specific to OpenCode's headless behavior
+**Plans**: TBD
+
+### Phase 33: Copilot CLI Tests
+**Goal**: Developers can run isolated shell-based E2E tests for Copilot CLI that validate session ID synthesis and the hook-to-query pipeline
+**Depends on**: Phase 30 (framework)
+**Requirements**: CPLT-01, CPLT-02, CPLT-03, CPLT-04
+**Success Criteria** (what must be TRUE):
+  1. Running `bats tests/cli/copilot/` executes all Copilot tests in isolated workspaces, reusing Phase 30 common helpers
+  2. Copilot binary detection uses correct binary name and `--yes --allow-all-tools` prevents interactive prompts
+  3. Copilot session ID synthesis produces deterministic session IDs from workspace context, verified in captured events
+  4. Negative tests verify daemon-down and malformed-input handling for Copilot-specific edge cases
+**Plans**: TBD
+
+### Phase 34: Codex CLI Adapter + Tests + Matrix Report
+**Goal**: Codex CLI adapter exists with commands and skills (no hooks), Codex headless tests pass with hook tests skipped, and a cross-CLI matrix report aggregates results from all 5 CLIs
+**Depends on**: Phase 30 (framework), Phases 31-33 (all CLI tests for matrix)
+**Requirements**: CDEX-01, CDEX-02, CDEX-03, CDEX-04, CDEX-05
+**Success Criteria** (what must be TRUE):
+  1. A Codex CLI adapter directory exists under `adapters/codex-cli/` with commands, skills, and sandbox workaround documentation (no hook handler)
+  2. Running `bats tests/cli/codex/` executes Codex tests with hook-dependent scenarios explicitly skipped and annotated
+  3. Codex command invocation tests use `codex exec -q --full-auto` with timeout guards
+  4. A matrix report script aggregates JUnit XML from all 5 CLIs into a CLI x scenario pass/fail/skipped summary viewable in CI
+**Plans**: TBD
+
 ## Progress
 
 | Phase | Milestone | Plans | Status | Completed |
@@ -88,7 +164,12 @@ See: `.planning/milestones/v2.3-ROADMAP.md`
 | 18-23 | v2.1 | 22/22 | Complete | 2026-02-10 |
 | 24-27 | v2.2 | 10/10 | Complete | 2026-02-11 |
 | 28-29 | v2.3 | 2/2 | Complete | 2026-02-12 |
+| 30 | v2.4 | 6/6 | In Progress | - |
+| 31 | v2.4 | 0/TBD | Not started | - |
+| 32 | v2.4 | 0/TBD | Not started | - |
+| 33 | v2.4 | 0/TBD | Not started | - |
+| 34 | v2.4 | 0/TBD | Not started | - |
 
 ---
 
-*Updated: 2026-02-12 after v2.3 milestone completion*
+*Updated: 2026-02-22 after v2.4 roadmap creation*