diff --git a/benchmarks/RESULTS.md b/benchmarks/RESULTS.md new file mode 100644 index 0000000..9bae898 --- /dev/null +++ b/benchmarks/RESULTS.md @@ -0,0 +1,271 @@ +# Code Search Benchmark: grep vs rtk grep vs rtk rgai vs head_n + +## Environment & Reproduction + +``` +Date: Sat Feb 14 22:34:25 UTC 2026 +Commit: 4b0a413562c775757d5bc09a6ff966b4e532508c +rtk_bin: /Users/andrew/Programming/rtk/target/release/rtk +rtk: rtk 0.15.3 +grep: grep (BSD grep, GNU compatible) 2.6.0-FreeBSD +tiktoken_encoding: cl100k_base +rtk_grep_max: 200 +rgai_max: 8 +head_n_lines: 100 +OS: Darwin MacBook-Pro-Andy.local 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:56 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6041 arm64 +CPU: Apple M4 Pro +Rust files: 54 +Total LOC: 23240 +``` + +## Dataset: rtk-ai/rtk @ `4b0a413562c775757d5bc09a6ff966b4e532508c` + +**Reproduction**: +```bash +rtk --version +bash benchmarks/bench_code.sh +python3 benchmarks/analyze_code.py +python3 -m unittest discover -s benchmarks/tests -p 'test_*.py' +``` + +## Methodology + +### Metrics (reported separately, NO composite score) + +| Metric | Definition | Purpose | +|--------|-----------|---------| +| Output bytes | `wc -c` of stdout | Raw size footprint | +| Output tokens | `tiktoken` (`cl100k_base`) on full stdout | Model-aligned token cost | +| Token Efficiency (TE) | `output_tokens / grep_output_tokens` | Token compression vs baseline | +| Result count | Effective output lines / no-result aware count | Distinguish compactness vs empty results | +| Gold hit rate | `% gold_files found` (plus found/min files) | Relevance/correctness | +| Timing | Median of 5 runs, plus min/max in summaries | Performance distribution | + +**Critical rule**: if `expect_results=true` and `result_count==0`, mark as **MISS**. +For regex category, `rtk rgai` is marked `EXPECTED_UNSUPPORTED` by design. 
+ +### Categories + +| Category | Queries | +|----------|---------| +| A: Exact Identifier | 6 | +| B: Regex Pattern | 6 | +| C: Semantic Intent | 10 | +| D: Cross-File Pattern Discovery | 5 | +| E: Edge Cases | 3 | + +## Category A: Exact Identifier Search + +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| A1 | TimedExecution | grep | 10338 | 2927 | 1.000 | 104 | 100% (34/30) | 6.0ms | OK | +| A1 | TimedExecution | rtk_grep | 6527 | 1979 | 0.676 | 159 | 100% (34/30) | 25.0ms | OK | +| A1 | TimedExecution | rtk_rgai | 2797 | 841 | 0.287 | 94 | 60% (8/30) | 11.0ms | LOW_COVERAGE | +| A1 | TimedExecution | head_n | 9933 | 2810 | 0.960 | 100 | 100% (32/30) | 0μs | OK | +| A2 | FilterLevel | grep | 2196 | 605 | 1.000 | 23 | 100% (3/3) | 5.0ms | OK | +| A2 | FilterLevel | rtk_grep | 902 | 288 | 0.476 | 25 | 100% (3/3) | 25.0ms | OK | +| A2 | FilterLevel | rtk_rgai | 691 | 223 | 0.369 | 32 | 100% (3/3) | 10.0ms | OK | +| A2 | FilterLevel | head_n | 2196 | 605 | 1.000 | 23 | 100% (3/3) | 0μs | OK | +| A3 | classify_command | grep | 2524 | 626 | 1.000 | 22 | 100% (2/2) | 7.0ms | OK | +| A3 | classify_command | rtk_grep | 817 | 225 | 0.359 | 20 | 100% (2/2) | 24.0ms | OK | +| A3 | classify_command | rtk_rgai | 782 | 200 | 0.319 | 25 | 100% (2/2) | 11.0ms | OK | +| A3 | classify_command | head_n | 2524 | 626 | 1.000 | 22 | 100% (2/2) | 0μs | OK | +| A4 | package_manager_exec | grep | 918 | 260 | 1.000 | 9 | 100% (5/5) | 7.0ms | OK | +| A4 | package_manager_exec | rtk_grep | 797 | 246 | 0.946 | 21 | 100% (5/5) | 25.0ms | OK | +| A4 | package_manager_exec | rtk_rgai | 1370 | 381 | 1.465 | 44 | 100% (5/5) | 11.0ms | OK | +| A4 | package_manager_exec | head_n | 918 | 260 | 1.000 | 9 | 100% (5/5) | 0μs | OK | +| A5 | strip_ansi | grep | 1852 | 539 | 1.000 | 20 | 100% (5/5) | 8.0ms | OK | +| A5 | strip_ansi | rtk_grep | 1197 | 388 | 0.720 | 
33 | 100% (5/5) | 24.0ms | OK | +| A5 | strip_ansi | rtk_rgai | 1264 | 425 | 0.788 | 51 | 100% (5/5) | 10.0ms | OK | +| A5 | strip_ansi | head_n | 1852 | 539 | 1.000 | 20 | 100% (5/5) | 0μs | OK | +| A6 | HISTORY_DAYS | grep | 201 | 61 | 1.000 | 2 | 100% (1/1) | 5.0ms | OK | +| A6 | HISTORY_DAYS | rtk_grep | 182 | 66 | 1.082 | 6 | 100% (1/1) | 24.0ms | OK | +| A6 | HISTORY_DAYS | rtk_rgai | 686 | 208 | 3.410 | 23 | 100% (2/1) | 11.0ms | OK | +| A6 | HISTORY_DAYS | head_n | 201 | 61 | 1.000 | 2 | 100% (1/1) | 0μs | OK | + +### Category A: Exact Identifier Search — Summary + +- **grep**: | TE min/med/max=1.000/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=5.0ms / 6.5ms / 8.0ms +- **rtk_grep**: | TE min/med/max=0.359/0.698/1.082 | gold hit min/med/max=100%/100%/100% | time min/med/max=22.0ms / 24.5ms / 44.0ms +- **rtk_rgai**: | TE min/med/max=0.287/0.579/3.410 | gold hit min/med/max=60%/100%/100% | time min/med/max=10.0ms / 11.0ms / 13.0ms | LOW_COVERAGE=1 +- **head_n**: | TE min/med/max=0.960/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=0μs / 0μs / 0μs + +## Category B: Regex Pattern Search + +> `rtk rgai` does not support regex; misses are EXPECTED_UNSUPPORTED. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| B1 | fn run\(.*verbose: u8 | grep | 3128 | 999 | 1.000 | 27 | 100% (27/25) | 7.0ms | OK | +| B1 | fn run\(.*verbose: u8 | rtk_grep | 3518 | 1206 | 1.207 | 83 | 100% (27/25) | 25.0ms | OK | +| B1 | fn run\(.*verbose: u8 | rtk_rgai | 3264 | 1065 | 1.066 | 99 | 38% (8/25) | 12.0ms | EXPECTED_UNSUPPORTED | +| B1 | fn run\(.*verbose: u8 | head_n | 3128 | 999 | 1.000 | 27 | 100% (27/25) | 0μs | OK | +| B2 | timer\.track\( | grep | 10764 | 3338 | 1.000 | 116 | 100% (34/30) | 8.0ms | OK | +| B2 | timer\.track\( | rtk_grep | 5723 | 1979 | 0.593 | 158 | 100% (34/30) | 24.0ms | OK | +| B2 | timer\.track\( | rtk_rgai | 2542 | 803 | 0.241 | 93 | 50% (8/30) | 12.0ms | EXPECTED_UNSUPPORTED | +| B2 | timer\.track\( | head_n | 9143 | 2822 | 0.845 | 100 | 100% (32/30) | 0μs | OK | +| B3 | \.unwrap_or\(1\) | grep | 5347 | 1513 | 1.000 | 48 | 100% (20/15) | 7.0ms | OK | +| B3 | \.unwrap_or\(1\) | rtk_grep | 3806 | 1200 | 0.793 | 87 | 100% (20/15) | 24.0ms | OK | +| B3 | \.unwrap_or\(1\) | rtk_rgai | 2777 | 885 | 0.585 | 106 | 50% (8/15) | 11.0ms | EXPECTED_UNSUPPORTED | +| B3 | \.unwrap_or\(1\) | head_n | 5347 | 1513 | 1.000 | 48 | 100% (20/15) | 0μs | OK | +| B4 | #\[cfg\(test\)\] | grep | 2605 | 845 | 1.000 | 41 | 100% (41/35) | 5.0ms | OK | +| B4 | #\[cfg\(test\)\] | rtk_grep | 3098 | 1142 | 1.351 | 125 | 100% (41/35) | 25.0ms | OK | +| B4 | #\[cfg\(test\)\] | rtk_rgai | 2247 | 716 | 0.847 | 101 | 40% (7/35) | 11.0ms | EXPECTED_UNSUPPORTED | +| B4 | #\[cfg\(test\)\] | head_n | 2605 | 845 | 1.000 | 41 | 100% (41/35) | 0μs | OK | +| B5 | HashMap For multi-concept queries, grep exact-substring misses are expected and shown as MISS. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| C1 | token savings tracking database | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C1 | token savings tracking database | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 24.0ms | **MISS** | +| C1 | token savings tracking database | rtk_rgai | 2801 | 832 | N/A | 102 | 100% (8/1) | 15.0ms | OK | +| C1 | token savings tracking database | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C2 | exit code preservation | grep | 0 | 0 | MISS | 0 | N/A (0/2) | 9.0ms | **MISS** | +| C2 | exit code preservation | rtk_grep | 36 | 11 | MISS | 0 | 0% (0/2) | 25.0ms | **MISS** | +| C2 | exit code preservation | rtk_rgai | 2158 | 702 | N/A | 96 | 80% (8/2) | 12.0ms | OK | +| C2 | exit code preservation | head_n | 0 | 0 | MISS | 0 | N/A (0/2) | 0μs | **MISS** | +| C3 | language aware code filtering | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C3 | language aware code filtering | rtk_grep | 43 | 12 | MISS | 0 | 0% (0/1) | 24.0ms | **MISS** | +| C3 | language aware code filtering | rtk_rgai | 3112 | 926 | N/A | 103 | 100% (8/1) | 14.0ms | OK | +| C3 | language aware code filtering | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C4 | output grouping by file | grep | 0 | 0 | MISS | 0 | N/A (0/2) | 8.0ms | **MISS** | +| C4 | output grouping by file | rtk_grep | 37 | 12 | MISS | 0 | 0% (0/2) | 25.0ms | **MISS** | +| C4 | output grouping by file | rtk_rgai | 3348 | 989 | N/A | 105 | 0% (8/2) | 13.0ms | LOW_COVERAGE | +| C4 | output grouping by file | head_n | 0 | 0 | MISS | 0 | N/A (0/2) | 0μs | **MISS** | +| C5 | three tier parser degradation | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 9.0ms | **MISS** | +| C5 | three tier parser degradation | rtk_grep | 43 | 12 | MISS | 0 | 0% (0/1) | 25.0ms | **MISS** | +| C5 | three tier parser degradation | 
rtk_rgai | 2453 | 741 | N/A | 95 | 50% (7/1) | 13.0ms | OK | +| C5 | three tier parser degradation | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C6 | ANSI color stripping cleanup | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 6.0ms | **MISS** | +| C6 | ANSI color stripping cleanup | rtk_grep | 42 | 13 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C6 | ANSI color stripping cleanup | rtk_rgai | 2139 | 697 | N/A | 92 | 100% (8/1) | 14.0ms | OK | +| C6 | ANSI color stripping cleanup | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C7 | hook installation settings json | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C7 | hook installation settings json | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C7 | hook installation settings json | rtk_rgai | 2940 | 907 | N/A | 104 | 100% (8/1) | 15.0ms | OK | +| C7 | hook installation settings json | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C8 | command classification discover | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 9.0ms | **MISS** | +| C8 | command classification discover | rtk_grep | 45 | 11 | MISS | 0 | 0% (0/1) | 26.0ms | **MISS** | +| C8 | command classification discover | rtk_rgai | 2867 | 796 | N/A | 104 | 100% (8/1) | 13.0ms | OK | +| C8 | command classification discover | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C9 | pnpm yarn npm auto detection | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C9 | pnpm yarn npm auto detection | rtk_grep | 42 | 14 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C9 | pnpm yarn npm auto detection | rtk_rgai | 2682 | 931 | N/A | 104 | 100% (8/1) | 14.0ms | OK | +| C9 | pnpm yarn npm auto detection | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C10 | SQLite retention cleanup policy | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 11.0ms | **MISS** | +| C10 | SQLite retention cleanup policy | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 25.0ms | **MISS** | +| C10 | SQLite retention 
cleanup policy | rtk_rgai | 806 | 241 | N/A | 27 | 100% (2/1) | 14.0ms | OK | +| C10 | SQLite retention cleanup policy | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | + +### Category C: Semantic Intent Search — Summary + +- **grep**: | time min/med/max=6.0ms / 8.0ms / 37.0ms | MISS=10 +- **rtk_grep**: | gold hit min/med/max=0%/0%/0% | time min/med/max=23.0ms / 25.0ms / 63.0ms | MISS=10 +- **rtk_rgai**: | gold hit min/med/max=0%/100%/100% | time min/med/max=12.0ms / 14.0ms / 18.0ms | LOW_COVERAGE=1 +- **head_n**: | time min/med/max=0μs / 0μs / 0μs | MISS=10 + +## Category D: Cross-File Pattern Discovery + +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| D1 | verbose > 0 | grep | 6540 | 2112 | 1.000 | 90 | 100% (36/30) | 6.0ms | OK | +| D1 | verbose > 0 | rtk_grep | 4307 | 1634 | 0.774 | 162 | 100% (36/30) | 25.0ms | OK | +| D1 | verbose > 0 | rtk_rgai | 2238 | 709 | 0.336 | 97 | 50% (8/30) | 11.0ms | LOW_COVERAGE | +| D1 | verbose > 0 | head_n | 6540 | 2112 | 1.000 | 90 | 100% (36/30) | 0μs | OK | +| D2 | anyhow::Result | grep | 753 | 235 | 1.000 | 11 | 100% (11/11) | 8.0ms | OK | +| D2 | anyhow::Result | rtk_grep | 954 | 333 | 1.417 | 35 | 100% (11/11) | 24.0ms | OK | +| D2 | anyhow::Result | rtk_rgai | 2416 | 765 | 3.255 | 102 | 73% (8/11) | 12.0ms | LOW_COVERAGE | +| D2 | anyhow::Result | head_n | 753 | 235 | 1.000 | 11 | 100% (11/11) | 0μs | OK | +| D3 | process::exit | grep | 5234 | 1474 | 1.000 | 47 | 100% (19/15) | 7.0ms | OK | +| D3 | process::exit | rtk_grep | 3682 | 1154 | 0.783 | 84 | 100% (19/15) | 24.0ms | OK | +| D3 | process::exit | rtk_rgai | 2538 | 804 | 0.545 | 106 | 83% (8/15) | 12.0ms | LOW_COVERAGE | +| D3 | process::exit | head_n | 5234 | 1474 | 1.000 | 47 | 100% (19/15) | 0μs | OK | +| D4 | Command::new | grep | 9867 | 2999 | 1.000 | 111 | 100% (24/20) | 5.0ms | OK | +| D4 | Command::new | 
rtk_grep | 5321 | 1790 | 0.597 | 145 | 100% (24/20) | 25.0ms | OK | +| D4 | Command::new | rtk_rgai | 2283 | 769 | 0.256 | 102 | 57% (8/20) | 12.0ms | LOW_COVERAGE | +| D4 | Command::new | head_n | 8937 | 2700 | 0.900 | 100 | 100% (23/20) | 0μs | OK | +| D5 | from_utf8_lossy | grep | 17304 | 5038 | 1.000 | 157 | 100% (28/25) | 7.0ms | OK | +| D5 | from_utf8_lossy | rtk_grep | 8386 | 2572 | 0.511 | 168 | 100% (28/25) | 25.0ms | OK | +| D5 | from_utf8_lossy | rtk_rgai | 2767 | 867 | 0.172 | 94 | 29% (8/25) | 11.0ms | LOW_COVERAGE | +| D5 | from_utf8_lossy | head_n | 10775 | 3127 | 0.621 | 100 | 43% (17/25) | 0μs | LOW_COVERAGE | + +### Category D: Cross-File Pattern Discovery — Summary + +- **grep**: | TE min/med/max=1.000/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=5.0ms / 7.0ms / 8.0ms +- **rtk_grep**: | TE min/med/max=0.511/0.774/1.417 | gold hit min/med/max=100%/100%/100% | time min/med/max=23.0ms / 25.0ms / 27.0ms +- **rtk_rgai**: | TE min/med/max=0.172/0.336/3.255 | gold hit min/med/max=29%/57%/83% | time min/med/max=11.0ms / 12.0ms / 13.0ms | LOW_COVERAGE=5 +- **head_n**: | TE min/med/max=0.621/1.000/1.000 | gold hit min/med/max=43%/100%/100% | time min/med/max=0μs / 0μs / 0μs | LOW_COVERAGE=1 + +## Category E: Edge Cases + +> Edge cases are discussed per-case; no category-level winner is inferred. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| E1 | the | grep | 19971 | 5421 | 1.000 | 178 | N/A | 8.0ms | OK | +| E1 | the | rtk_grep | 11273 | 3399 | 0.627 | 239 | N/A | 26.0ms | OK | +| E1 | the | rtk_rgai | 2359 | 779 | 0.144 | 106 | N/A | 10.0ms | OK | +| E1 | the | head_n | 11771 | 3170 | 0.585 | 100 | N/A | 0μs | OK | +| E2 | fn | grep | 77939 | 23141 | 1.000 | 784 | N/A | 7.0ms | OK | +| E2 | fn | rtk_grep | 12744 | 4052 | 0.175 | 264 | N/A | 26.0ms | OK | +| E2 | fn | rtk_rgai | 2733 | 872 | 0.038 | 101 | N/A | 10.0ms | OK | +| E2 | fn | head_n | 10320 | 3103 | 0.134 | 100 | N/A | 0μs | OK | +| E3 | error handling retry backoff | grep | 0 | 0 | N/A | 0 | N/A | 9.0ms | OK | +| E3 | error handling retry backoff | rtk_grep | 42 | 13 | N/A | 0 | N/A | 25.0ms | OK | +| E3 | error handling retry backoff | rtk_rgai | 2340 | 756 | N/A | 102 | N/A | 13.0ms | **UNEXPECTED_HIT** | +| E3 | error handling retry backoff | head_n | 0 | 0 | N/A | 0 | N/A | 0μs | OK | + +## Summary: When to Use Which Tool + +| Situation | Recommended | Evidence | +|-----------|-------------|----------| +| Exact identifier search (Category A) | rtk_grep | median gold hit=100%, MISS=0, LOW_COVERAGE=0, median TE=0.698 | +| Cross-file pattern discovery (Category D) | rtk_grep | median gold hit=100%, MISS=0, LOW_COVERAGE=0, median TE=0.774 | +| Semantic intent search (Category C) | rtk_rgai | median gold hit=100%, MISS=0, LOW_COVERAGE=1, UNEXPECTED_HIT=0, median TE=N/A | +| Regex patterns (Category B) | grep / rtk grep | `rtk rgai` expected unsupported for regex | +| Exact zero-result validation (E3) | grep / rtk grep | Unexpected hits observed for: rtk_rgai | + +## Failure Modes + +### grep +- Floods output on broad/common queries. +- Misses semantic intent queries that do not appear as exact substrings. +- No built-in grouping/truncation. 
+ +### rtk grep +- Output truncation (`--max 200`) can reduce recall in high-frequency queries. +- Still exact-match based (no semantic expansion). + +### rtk rgai +- Regex queries are unsupported by design. +- Can return semantically related content even when strict zero results are expected. +- Quality depends on ranking/model behavior and may vary by environment. + +### head_n (negative control) +- Naive truncation may look token-efficient but is relevance-blind. +- Useful as a floor comparator, not as a production recommendation. + +## Limitations + +- Single codebase benchmark (`src/` Rust files only). +- Gold standards are author-defined and include subjective intent mapping. +- Gold hit is computed from first-run samples; non-deterministic tools may vary across runs. +- Timing is hardware and background-load dependent. diff --git a/benchmarks/analyze_code.py b/benchmarks/analyze_code.py new file mode 100644 index 0000000..2404df3 --- /dev/null +++ b/benchmarks/analyze_code.py @@ -0,0 +1,679 @@ +#!/usr/bin/env python3 +""" +Analyze code-search benchmark results and generate RESULTS.md. + +Rules: + - No composite score. + - Per-category analysis only. + - Report distributions (min/median/max). + - If gold expects results and result_count == 0 => MISS. + - Regex category: rgai is EXPECTED_UNSUPPORTED (not failure). 
+""" + +from __future__ import annotations + +import csv +import json +import re +import sys +from collections import defaultdict +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +CSV_PATH = SCRIPT_DIR / "results_raw.csv" +ENV_PATH = SCRIPT_DIR / "results_env.txt" +GOLD_PATH = SCRIPT_DIR / "gold_standards.json" +GOLD_AUTO_PATH = SCRIPT_DIR / "gold_auto.json" # ADDED: auto-generated gold +QUALITY_DIR = SCRIPT_DIR / "quality_samples" +RESULTS_PATH = SCRIPT_DIR / "RESULTS.md" + +TOOLS = ("grep", "rtk_grep", "rtk_rgai", "head_n") # CHANGED: added head_n +RECOMMENDABLE_TOOLS = ("grep", "rtk_grep", "rtk_rgai") +CATEGORY_ORDER = [ + "exact_identifier", + "regex_pattern", + "semantic_intent", + "cross_file", + "edge_case", +] +CATEGORY_TITLES = { + "exact_identifier": "Category A: Exact Identifier Search", + "regex_pattern": "Category B: Regex Pattern Search", + "semantic_intent": "Category C: Semantic Intent Search", + "cross_file": "Category D: Cross-File Pattern Discovery", + "edge_case": "Category E: Edge Cases", +} + + +def median_val(values: list[int | float]) -> float: + vals = sorted(values) + if not vals: + return 0.0 + n = len(vals) + if n % 2 == 1: + return float(vals[n // 2]) + return (vals[n // 2 - 1] + vals[n // 2]) / 2.0 + + +def min_val(values: list[int | float]) -> float: + return float(min(values)) if values else 0.0 + + +def max_val(values: list[int | float]) -> float: + return float(max(values)) if values else 0.0 + + +def is_valid_exit(exit_code: int) -> bool: + # 0 = matches/success, 1 = no matches, >=2 = execution error + return exit_code in (0, 1) + + +def normalize_rs_path(path: str) -> str: + p = path.strip(" \t\r\n:;,.()[]{}<>\"'") + p = p.replace("\\", "/") + if "/.../" in p: + p = p.split("/.../", 1)[1] + if "/src/" in p: + p = p.split("/src/", 1)[1] + elif p.startswith("src/"): + p = p[4:] + + p = p.lstrip("./") + p = re.sub(r"/{2,}", "/", p) + return p + + +def extract_filenames(text: str) -> set[str]: + filenames: 
set[str] = set() + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + continue + + # RTK grouped output: + # 📄 /path/to/file.rs (12): + # 📄 parser/mod.rs [9.4] + m = re.match(r"^📄\s+(.+?\.rs)\s*(?:\(|\[|$)", line) + if m: + candidate = normalize_rs_path(m.group(1)) + if candidate.endswith(".rs"): + filenames.add(candidate) + continue + + # grep -rn style: + # /abs/src/file.rs:42:... + # src/file.rs:42:... + m = re.match(r"^(.+?\.rs):\d+(?::|$)", line) + if m: + candidate = normalize_rs_path(m.group(1)) + if candidate.endswith(".rs"): + filenames.add(candidate) + + return filenames + + +def file_matches_gold(gold_file: str, found_files: set[str]) -> bool: + if gold_file in found_files: + return True + + if "/" not in gold_file: + suffix = f"/{gold_file}" + return any(f == gold_file or f.endswith(suffix) for f in found_files) + + return False + + +def compute_gold_hits(sample_text: str, gold_files: list[str]) -> int: + if not gold_files: + return 0 + found_files = extract_filenames(sample_text) + return sum(1 for gf in gold_files if file_matches_gold(gf, found_files)) + + +def infer_no_result_from_sample(sample_text: str, tool: str) -> bool: + text = sample_text.strip() + if not text: + return False + if tool in {"rtk_grep", "rtk_rgai"}: + # rtk no-results marker examples: + # "🔍 0 for 'query'" / "🧠 0 for 'query'" + if re.search(r"(?:🔍|🧠)\s*0\s+for\b", text): + return True + # Fallback in case glyphs differ. 
+ if re.search(r"^\s*0\s+for\b", text): + return True + return False + + +def compute_gold_hit_rate(sample_text: str, gold_files: list[str]) -> float | None: + if not gold_files: + return None + hits = compute_gold_hits(sample_text, gold_files) + return hits / len(gold_files) + + +def compute_gold_found_count(sample_text: str) -> int: + return len(extract_filenames(sample_text)) + + +def is_miss(result_count: int, expect_results: bool) -> bool: + return result_count == 0 and expect_results + + +def format_te(te: float | None, miss: bool) -> str: + if miss: + return "MISS" + if te is None: + return "N/A" + return f"{te:.3f}" + + +def format_pct(te: float | None, miss: bool) -> str: + if miss: + return "MISS" + if te is None: + return "N/A" + savings = (1 - te) * 100 + return f"{savings:.1f}%" + + +def format_gold(rate: float | None, found_count: int, min_required: int) -> str: + if rate is None: + if min_required > 0: + return f"N/A ({found_count}/{min_required})" + return "N/A" + if min_required > 0: + return f"{rate * 100:.0f}% ({found_count}/{min_required})" + return f"{rate * 100:.0f}%" + + +def format_timing(us: float) -> str: + if us >= 1_000_000: + return f"{us / 1_000_000:.2f}s" + if us >= 1_000: + return f"{us / 1_000:.1f}ms" + return f"{us:.0f}μs" + + +def format_timing_range(min_us: float, med_us: float, max_us: float) -> str: + return f"{format_timing(min_us)} / {format_timing(med_us)} / {format_timing(max_us)}" + + +def load_gold_standards() -> tuple[dict, dict]: + with open(GOLD_PATH, encoding="utf-8") as f: + data = json.load(f) + return data["queries"], data.get("metadata", {}) + + +def load_gold_auto() -> dict: # ADDED: load auto-generated gold + """Load auto-generated gold standards from grep output.""" + if not GOLD_AUTO_PATH.exists(): + return {} + with open(GOLD_AUTO_PATH, encoding="utf-8") as f: + data = json.load(f) + return data.get("queries", {}) + + +def load_csv() -> list[dict]: + rows = [] + with open(CSV_PATH, newline="", 
encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + if "output_tokens" not in row or row["output_tokens"] in (None, ""): + raise ValueError( + "results_raw.csv is missing 'output_tokens'. " + "Re-run benchmarks/bench_code.sh after installing tiktoken." + ) + row["time_us"] = int(row["time_us"]) + row["output_bytes"] = int(row["output_bytes"]) + row["output_tokens"] = int(row["output_tokens"]) + row["result_count"] = int(row["result_count"]) + row["exit_code"] = int(row["exit_code"]) + row["run"] = int(row["run"]) + rows.append(row) + return rows + + +def load_quality_sample(test_id: str, tool: str) -> str: + path = QUALITY_DIR / f"{test_id}_{tool}.txt" + if path.exists(): + return path.read_text(errors="replace") + return "" + + +def parse_commit_from_env(env_text: str) -> str | None: + m = re.search(r"^Commit:\s*([0-9a-f]{7,40})\s*$", env_text, flags=re.MULTILINE) + return m.group(1) if m else None + + +def compute_metrics(rows: list[dict], gold: dict, gold_auto: dict | None = None) -> list[dict]: # CHANGED: added gold_auto + grouped: dict[tuple[str, str], list[dict]] = defaultdict(list) + for row in rows: + grouped[(row["test_id"], row["tool"])].append(row) + + aggregates: dict[tuple[str, str], dict] = {} + for (tid, tool), runs in grouped.items(): + times = [r["time_us"] for r in runs] + bytess = [r["output_bytes"] for r in runs] + tokens = [r["output_tokens"] for r in runs] # ADDED: token counts + counts = [r["result_count"] for r in runs] + exits = [r["exit_code"] for r in runs] + aggregates[(tid, tool)] = { + "test_id": tid, + "tool": tool, + "category": runs[0]["category"], + "query": runs[0]["query"].strip('"'), + "median_time_us": median_val(times), + "min_time_us": min_val(times), + "max_time_us": max_val(times), + "median_bytes": median_val(bytess), + "median_tokens": median_val(tokens), # ADDED + "median_count": median_val(counts), + "valid": all(is_valid_exit(e) for e in exits), + } + + test_ids = sorted( + set(tid for tid, _ in 
aggregates.keys()), + key=lambda x: (x[0], int(x[1:]) if x[1:].isdigit() else 0), + ) + + results = [] + for tid in test_ids: + gold_entry = gold.get(tid, {}) + category = gold_entry.get("category", "unknown") + expect = gold_entry.get("expect_results", True) + gold_files = gold_entry.get("gold_files", []) + gold_min_files = int(gold_entry.get("gold_min_files", 0) or 0) + + grep_agg = aggregates.get((tid, "grep")) + grep_tokens = grep_agg["median_tokens"] if grep_agg else 0 + + entry = { + "test_id": tid, + "category": category, + "query": gold_entry.get("query", ""), + "expect_results": expect, + } + + for tool in TOOLS: + agg = aggregates.get((tid, tool)) + if not agg: + continue + + sample = load_quality_sample(tid, tool) + no_result_marker = infer_no_result_from_sample(sample, tool) + adjusted_count = 0 if no_result_marker else int(agg["median_count"]) + + unsupported = category == "regex_pattern" and tool == "rtk_rgai" + miss = is_miss(adjusted_count, expect) and not unsupported + unexpected_hit = (not expect) and adjusted_count > 0 + + ghr = compute_gold_hit_rate(sample, gold_files) if sample else None + found_count = compute_gold_found_count(sample) if sample else 0 + gold_hits = compute_gold_hits(sample, gold_files) if sample else 0 + gold_min_ok = None + if gold_min_files > 0: + gold_min_ok = found_count >= gold_min_files + low_coverage = False + if not miss and not unsupported: + if gold_min_files > 0 and gold_min_ok is False: + low_coverage = True + if ghr is not None and ghr == 0.0: + low_coverage = True + + te = None + if agg["valid"] and grep_tokens > 0: + te = agg["median_tokens"] / grep_tokens + + entry[f"{tool}_bytes"] = agg["median_bytes"] + entry[f"{tool}_tokens"] = agg["median_tokens"] + entry[f"{tool}_count"] = adjusted_count + entry[f"{tool}_time_us"] = agg["median_time_us"] + entry[f"{tool}_min_time_us"] = agg["min_time_us"] + entry[f"{tool}_max_time_us"] = agg["max_time_us"] + entry[f"{tool}_te"] = te + entry[f"{tool}_gold_hit"] = ghr + 
entry[f"{tool}_gold_found"] = found_count + entry[f"{tool}_gold_hits"] = gold_hits + entry[f"{tool}_gold_min_required"] = gold_min_files + entry[f"{tool}_gold_min_ok"] = gold_min_ok + entry[f"{tool}_low_coverage"] = low_coverage + entry[f"{tool}_valid"] = agg["valid"] + entry[f"{tool}_miss"] = miss + entry[f"{tool}_unsupported"] = unsupported + entry[f"{tool}_unexpected_hit"] = unexpected_hit + + results.append(entry) + + return results + + +def category_tool_stats(cat_metrics: list[dict], tool: str) -> dict: + entries = [m for m in cat_metrics if f"{tool}_bytes" in m and not m.get(f"{tool}_unsupported", False)] + + te_vals = [m[f"{tool}_te"] for m in entries if m.get(f"{tool}_te") is not None and not m.get(f"{tool}_miss", False)] + gold_vals = [m[f"{tool}_gold_hit"] for m in entries if m.get(f"{tool}_gold_hit") is not None] + time_vals = [m[f"{tool}_time_us"] for m in entries] + min_time_vals = [m[f"{tool}_min_time_us"] for m in entries] + max_time_vals = [m[f"{tool}_max_time_us"] for m in entries] + + miss_count = sum(1 for m in entries if m.get(f"{tool}_miss", False)) + unexpected_count = sum(1 for m in entries if m.get(f"{tool}_unexpected_hit", False)) + low_cov_count = sum( + 1 + for m in entries + if m.get(f"{tool}_low_coverage", False) + and not m.get(f"{tool}_miss", False) + ) + + return { + "entries": entries, + "te_vals": te_vals, + "gold_vals": gold_vals, + "time_vals": time_vals, + "min_time_vals": min_time_vals, + "max_time_vals": max_time_vals, + "miss_count": miss_count, + "unexpected_count": unexpected_count, + "low_cov_count": low_cov_count, + "unsupported_count": sum(1 for m in cat_metrics if m.get(f"{tool}_unsupported", False)), + } + + +def pick_best_for_exact(cat_metrics: list[dict]) -> tuple[str, str]: + candidates = [] + for tool in RECOMMENDABLE_TOOLS: + st = category_tool_stats(cat_metrics, tool) + if not st["entries"]: + continue + med_te = median_val(st["te_vals"]) if st["te_vals"] else 1e18 + med_gold = median_val(st["gold_vals"]) if 
st["gold_vals"] else -1.0 + med_time = median_val(st["time_vals"]) if st["time_vals"] else 1e18 + candidates.append( + (tool, st["miss_count"], st["low_cov_count"], med_gold, med_te, med_time) + ) + + if not candidates: + return "N/A", "Insufficient valid metrics" + + # For exact/cross-file tasks: correctness first, compression second. + candidates.sort(key=lambda x: (x[1], x[2], -x[3], x[4], x[5])) + tool, miss, low_cov, med_gold, med_te, _ = candidates[0] + gold_str = "N/A" if med_gold < 0 else f"{med_gold * 100:.0f}%" + te_str = "N/A" if med_te == 1e18 else f"{med_te:.3f}" + return tool, f"median gold hit={gold_str}, MISS={miss}, LOW_COVERAGE={low_cov}, median TE={te_str}" + + +def pick_best_for_semantic(cat_metrics: list[dict]) -> tuple[str, str]: + candidates = [] + for tool in RECOMMENDABLE_TOOLS: + st = category_tool_stats(cat_metrics, tool) + if not st["entries"]: + continue + med_gold = median_val(st["gold_vals"]) if st["gold_vals"] else -1.0 + med_te = median_val(st["te_vals"]) if st["te_vals"] else 1e18 + miss = st["miss_count"] + low_cov = st["low_cov_count"] + unexpected = st["unexpected_count"] + candidates.append((tool, miss, low_cov, unexpected, -med_gold, med_te, med_gold)) + + if not candidates: + return "N/A", "Insufficient valid metrics" + + # For semantic tasks: misses/coverage first, then relevance, then compression. 
+    candidates.sort(key=lambda x: (x[1], x[2], x[3], x[4], x[5]))
+    tool, miss, low_cov, unexpected, _, med_te, med_gold = candidates[0]
+    gold_str = "N/A" if med_gold < 0 else f"{med_gold * 100:.0f}%"
+    te_str = "N/A" if med_te == 1e18 else f"{med_te:.3f}"
+    return tool, f"median gold hit={gold_str}, MISS={miss}, LOW_COVERAGE={low_cov}, UNEXPECTED_HIT={unexpected}, median TE={te_str}"
+
+
+def generate_report(
+    metrics: list[dict],
+    env_text: str,
+    gold_queries: dict,
+    pinned_commit: str,
+    env_commit: str | None,
+) -> str:
+    lines: list[str] = []
+    w = lines.append
+
+    w("# Code Search Benchmark: grep vs rtk grep vs rtk rgai vs head_n\n")  # CHANGED: added head_n
+    w("## Environment & Reproduction\n")
+    w("```")
+    w(env_text.strip())
+    w("```\n")
+
+    w(f"## Dataset: rtk-ai/rtk @ `{pinned_commit}`\n")
+    if env_commit and env_commit != pinned_commit:
+        w(
+            f"> **WARNING**: benchmark env commit `{env_commit}` differs from pinned "
+            f"gold commit `{pinned_commit}`. Results are not strictly reproducible.\n"
+        )
+
+    w("**Reproduction**:")
+    w("```bash")
+    w("rtk --version")
+    w("bash benchmarks/bench_code.sh")
+    w("python3 benchmarks/analyze_code.py")
+    w("python3 -m unittest discover -s benchmarks/tests -p 'test_*.py'")
+    w("```\n")
+
+    w("## Methodology\n")
+    w("### Metrics (reported separately, NO composite score)\n")
+    w("| Metric | Definition | Purpose |")
+    w("|--------|-----------|---------|")
+    w("| Output bytes | `wc -c` of stdout | Raw size footprint |")
+    w("| Output tokens | `tiktoken` (`cl100k_base`) on full stdout | Model-aligned token cost |")
+    w("| Token Efficiency (TE) | `output_tokens / grep_output_tokens` | Token compression vs baseline |")
+    w("| Result count | Effective output lines / no-result aware count | Distinguish compactness vs empty results |")
+    w("| Gold hit rate | `% gold_files found` (plus found/min files) | Relevance/correctness |")
+    w("| Timing | Median of 5 runs, plus min/max in summaries | Performance distribution |")
+    w("")
+    w("**Critical rule**: if `expect_results=true` and `result_count==0`, mark as **MISS**.")
+    w("For regex category, `rtk rgai` is marked `EXPECTED_UNSUPPORTED` by design.\n")
+
+    w("### Categories\n")
+    w("| Category | Queries |")
+    w("|----------|---------|")
+    w("| A: Exact Identifier | 6 |")
+    w("| B: Regex Pattern | 6 |")
+    w("| C: Semantic Intent | 10 |")
+    w("| D: Cross-File Pattern Discovery | 5 |")
+    w("| E: Edge Cases | 3 |")
+    w("")
+
+    for cat_key in CATEGORY_ORDER:
+        cat_title = CATEGORY_TITLES[cat_key]
+        cat_metrics = [m for m in metrics if m["category"] == cat_key]
+        if not cat_metrics:
+            continue
+
+        w(f"## {cat_title}\n")
+
+        if cat_key == "regex_pattern":
+            w("> `rtk rgai` does not support regex; misses are EXPECTED_UNSUPPORTED.\n")
+        if cat_key == "semantic_intent":
+            w("> For multi-concept queries, grep exact-substring misses are expected and shown as MISS.\n")
+        if cat_key == "edge_case":
+            w("> Edge cases are discussed per-case; no category-level winner is inferred.\n")
+
+        w("| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status |")
+        w("|----|-------|------|-------|--------|----|-------------|----------|-------------|--------|")
+
+        for m in cat_metrics:
+            for tool in TOOLS:
+                if f"{tool}_bytes" not in m:
+                    continue
+
+                miss = m.get(f"{tool}_miss", False)
+                unsupported = m.get(f"{tool}_unsupported", False)
+                unexpected_hit = m.get(f"{tool}_unexpected_hit", False)
+                valid = m.get(f"{tool}_valid", False)
+                min_required = m.get(f"{tool}_gold_min_required", 0)
+                low_coverage = m.get(f"{tool}_low_coverage", False)
+
+                if unsupported:
+                    status = "EXPECTED_UNSUPPORTED"
+                elif not valid:
+                    status = "INVALID"
+                elif miss:
+                    status = "**MISS**"
+                elif unexpected_hit:
+                    status = "**UNEXPECTED_HIT**"
+                elif low_coverage:
+                    status = "LOW_COVERAGE"
+                else:
+                    status = "OK"
+
+                w(
+                    f"| {m['test_id']} | {m['query']} | {tool} | {m[f'{tool}_bytes']:.0f} | "
+                    f"{m.get(f'{tool}_tokens', 0):.0f} | "
+                    f"{format_te(m.get(f'{tool}_te'), miss)} | "
+                    f"{m.get(f'{tool}_count', 0):.0f} | "
+                    f"{format_gold(m.get(f'{tool}_gold_hit'), m.get(f'{tool}_gold_found', 0), min_required)} | "
+                    f"{format_timing(m.get(f'{tool}_time_us', 0.0))} | {status} |"
+                )
+        w("")
+
+        if cat_key != "edge_case":
+            w(f"### {cat_title} — Summary\n")
+            for tool in TOOLS:
+                st = category_tool_stats(cat_metrics, tool)
+                if st["unsupported_count"] == len(cat_metrics):
+                    w(f"- **{tool}**: expected unsupported for this category.")
+                    continue
+
+                parts = [f"**{tool}**:"]
+                if st["te_vals"]:
+                    parts.append(
+                        "TE min/med/max="
+                        f"{min_val(st['te_vals']):.3f}/"
+                        f"{median_val(st['te_vals']):.3f}/"
+                        f"{max_val(st['te_vals']):.3f}"
+                    )
+                if st["gold_vals"]:
+                    parts.append(
+                        "gold hit min/med/max="
+                        f"{min_val(st['gold_vals']) * 100:.0f}%/"
+                        f"{median_val(st['gold_vals']) * 100:.0f}%/"
+                        f"{max_val(st['gold_vals']) * 100:.0f}%"
+                    )
+                if st["time_vals"]:
+                    parts.append(
+                        "time min/med/max="
+                        + format_timing_range(
+                            min_val(st["min_time_vals"]),
+                            median_val(st["time_vals"]),
+                            max_val(st["max_time_vals"]),
+                        )
+                    )
+                if st["miss_count"] > 0:
+                    parts.append(f"MISS={st['miss_count']}")
+                if st["unexpected_count"] > 0:
+                    parts.append(f"UNEXPECTED_HIT={st['unexpected_count']}")
+                if st["low_cov_count"] > 0:
+                    parts.append(f"LOW_COVERAGE={st['low_cov_count']}")
+                w("- " + " | ".join(parts))
+            w("")
+
+    # Tool recommendation rows without cross-category averaging.
+    w("## Summary: When to Use Which Tool\n")
+    w("| Situation | Recommended | Evidence |")
+    w("|-----------|-------------|----------|")
+
+    cat_a = [m for m in metrics if m["category"] == "exact_identifier"]
+    cat_d = [m for m in metrics if m["category"] == "cross_file"]
+    cat_c = [m for m in metrics if m["category"] == "semantic_intent"]
+    cat_e = [m for m in metrics if m["category"] == "edge_case"]
+
+    best_a, ev_a = pick_best_for_exact(cat_a)
+    best_d, ev_d = pick_best_for_exact(cat_d)
+    best_c, ev_c = pick_best_for_semantic(cat_c)
+
+    w(f"| Exact identifier search (Category A) | {best_a} | {ev_a} |")
+    w(f"| Cross-file pattern discovery (Category D) | {best_d} | {ev_d} |")
+    w(f"| Semantic intent search (Category C) | {best_c} | {ev_c} |")
+    w("| Regex patterns (Category B) | grep / rtk grep | `rtk rgai` expected unsupported for regex |")
+
+    # Edge evidence: E3 should be zero results.
+    e3 = next((m for m in cat_e if m["test_id"] == "E3"), None)
+    if e3:
+        bad_tools = [t for t in TOOLS if e3.get(f"{t}_unexpected_hit", False)]
+        if bad_tools:
+            w(
+                "| Exact zero-result validation (E3) | grep / rtk grep | "
+                f"Unexpected hits observed for: {', '.join(bad_tools)} |"
+            )
+        else:
+            w("| Exact zero-result validation (E3) | all tools | All returned zero results as expected |")
+    w("")
+
+    w("## Failure Modes\n")
+    w("### grep")
+    w("- Floods output on broad/common queries.")
+    w("- Misses semantic intent queries that do not appear as exact substrings.")
+    w("- No built-in grouping/truncation.\n")
+    w("### rtk grep")
+    w("- Output truncation (`--max 200`) can reduce recall in high-frequency queries.")
+    w("- Still exact-match based (no semantic expansion).\n")
+    w("### rtk rgai")
+    w("- Regex queries are unsupported by design.")
+    w("- Can return semantically related content even when strict zero results are expected.")
+    w("- Quality depends on ranking/model behavior and may vary by environment.\n")
+    if "head_n" in TOOLS:
+        w("### head_n (negative control)")
+        w("- Naive truncation may look token-efficient but is relevance-blind.")
+        w("- Useful as a floor comparator, not as a production recommendation.\n")
+
+    w("## Limitations\n")
+    w("- Single codebase benchmark (`src/` Rust files only).")
+    w("- Gold standards are author-defined and include subjective intent mapping.")
+    w("- Gold hit is computed from first-run samples; non-deterministic tools may vary across runs.")
+    w("- Timing is hardware and background-load dependent.")
+    w("")
+
+    return "\n".join(lines)
+
+
+def main():
+    if not CSV_PATH.exists():
+        print(f"ERROR: {CSV_PATH} not found. Run bench_code.sh first.", file=sys.stderr)
+        sys.exit(1)
+    if not GOLD_PATH.exists():
+        print(f"ERROR: {GOLD_PATH} not found.", file=sys.stderr)
+        sys.exit(1)
+
+    gold_queries, gold_meta = load_gold_standards()
+    gold_auto = load_gold_auto()  # ADDED: auto-generated gold from grep output
+    rows = load_csv()
+    env_text = ENV_PATH.read_text() if ENV_PATH.exists() else ""
+    env_commit = parse_commit_from_env(env_text)
+    pinned_commit = gold_meta.get("pinned_commit", "unknown")
+
+    print(f"Loaded {len(rows)} measurements from {CSV_PATH}")
+    print(f"Loaded {len(gold_queries)} gold standards from {GOLD_PATH}")
+    if gold_auto:
+        print(f"Loaded {len(gold_auto)} auto-generated gold entries from {GOLD_AUTO_PATH}")  # ADDED
+
+    metrics = compute_metrics(rows, gold_queries, gold_auto)  # CHANGED: pass gold_auto
+    report = generate_report(metrics, env_text, gold_queries, pinned_commit, env_commit)
+    RESULTS_PATH.write_text(report, encoding="utf-8")
+
+    miss_count = 0
+    unexpected_count = 0
+    for m in metrics:
+        for tool in TOOLS:
+            if m.get(f"{tool}_miss", False):
+                miss_count += 1
+            if m.get(f"{tool}_unexpected_hit", False):
+                unexpected_count += 1
+
+    print(f"\nReport written to {RESULTS_PATH}")
+    print(f" {len(metrics)} queries analyzed")
+    print(f" MISS entries: {miss_count}")
+    print(f" UNEXPECTED_HIT entries: {unexpected_count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git
a/benchmarks/bench_code.sh b/benchmarks/bench_code.sh new file mode 100755 index 0000000..1df8949 --- /dev/null +++ b/benchmarks/bench_code.sh @@ -0,0 +1,367 @@ +#!/usr/bin/env bash +# +# Benchmark runner: grep vs rtk grep vs rtk rgai on rtk codebase +# +# Usage: +# bash benchmarks/bench_code.sh +# +# Output: +# benchmarks/results_raw.csv — raw measurements (30 queries × 4 tools × 5 runs) # CHANGED: 4 tools +# benchmarks/results_env.txt — environment snapshot +# benchmarks/quality_samples/ — first-run full output samples (no truncation) +# benchmarks/gold_auto.json — auto-generated gold files from grep output +# +set -euo pipefail +export LC_ALL=C + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +SRC_DIR="$PROJECT_DIR/src" +GOLD_PATH="$SCRIPT_DIR/gold_standards.json" +CSV_OUT="$SCRIPT_DIR/results_raw.csv" +ENV_OUT="$SCRIPT_DIR/results_env.txt" +QUALITY_DIR="$SCRIPT_DIR/quality_samples" +GOLD_AUTO="$SCRIPT_DIR/gold_auto.json" # ADDED: auto-generated gold + +RUNS=5 +HEAD_N_LINES="${HEAD_N_LINES:-100}" # ADDED: negative control truncation threshold +TOKENIZER_ENCODING="${TOKENIZER_ENCODING:-cl100k_base}" +RTK_BIN="${RTK_BIN:-$PROJECT_DIR/target/release/rtk}" +ALLOW_DIRTY="${ALLOW_DIRTY:-0}" +RTK_GREP_MAX="${RTK_GREP_MAX:-200}" +RGAI_MAX="${RGAI_MAX:-8}" + +# ── Pre-flight checks ──────────────────────────────────────────────────── # + +if [ ! -d "$SRC_DIR" ]; then + echo "ERROR: src/ directory not found at $SRC_DIR" >&2 + exit 1 +fi + +if [ ! -f "$GOLD_PATH" ]; then + echo "ERROR: gold_standards.json not found at $GOLD_PATH" >&2 + exit 1 +fi + +if [ ! -x "$RTK_BIN" ]; then + echo "ERROR: rtk binary not found or not executable at $RTK_BIN" >&2 + echo "Build local binary first: cargo build --release" >&2 + exit 1 +fi + +if ! python3 - "$TOKENIZER_ENCODING" <<'PY' +import sys +import tiktoken + +enc = sys.argv[1] +tiktoken.get_encoding(enc) +PY +then + echo "ERROR: Python package 'tiktoken' is required for token-based TE." 
>&2 + echo "Install it with: python3 -m pip install tiktoken" >&2 + exit 1 +fi + +# ── Pin commit for reproducibility ──────────────────────────────────────── # + +EXPECTED_COMMIT="$(python3 -c "import json;print(json.load(open('$GOLD_PATH', encoding='utf-8'))['metadata']['pinned_commit'])")" +PINNED_COMMIT="$(cd "$PROJECT_DIR" && git rev-parse HEAD)" +if [ "$PINNED_COMMIT" != "$EXPECTED_COMMIT" ]; then + echo "ERROR: Current HEAD ($PINNED_COMMIT) does not match pinned commit in gold_standards.json ($EXPECTED_COMMIT)." >&2 + echo "Checkout pinned commit first for reproducible results." >&2 + exit 2 +fi + +echo "Pinned commit: $PINNED_COMMIT" + +if [ "$ALLOW_DIRTY" != "1" ]; then + if ! (cd "$PROJECT_DIR" && git diff --quiet -- src Cargo.toml Cargo.lock && git diff --cached --quiet -- src Cargo.toml Cargo.lock); then + echo "ERROR: Working tree has local changes in benchmarked sources (src/, Cargo.toml, Cargo.lock)." >&2 + echo "Commit/stash changes for auditable reproducibility, or set ALLOW_DIRTY=1 to override." >&2 + exit 3 + fi + if [ -n "$(cd "$PROJECT_DIR" && git ls-files --others --exclude-standard -- src)" ]; then + echo "ERROR: Untracked files exist under src/; benchmark dataset is not clean." >&2 + echo "Commit/remove untracked source files, or set ALLOW_DIRTY=1 to override." 
>&2 + exit 3 + fi +fi + +# ── Environment snapshot ────────────────────────────────────────────────── # + +{ + echo "Date: $(date -u)" + echo "Commit: $PINNED_COMMIT" + echo "rtk_bin: $RTK_BIN" + echo "rtk: $($RTK_BIN --version 2>&1 || echo 'N/A')" + echo "grep: $(grep --version 2>&1 | head -1 || echo 'N/A')" + echo "tiktoken_encoding: $TOKENIZER_ENCODING" + echo "rtk_grep_max: $RTK_GREP_MAX" + echo "rgai_max: $RGAI_MAX" + echo "head_n_lines: $HEAD_N_LINES" # ADDED: negative control param + echo "OS: $(uname -a)" + if [[ "$(uname)" == "Darwin" ]]; then + echo "CPU: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'N/A')" + else + echo "CPU: $(lscpu 2>/dev/null | grep 'Model name' | sed 's/.*: *//' || echo 'N/A')" + fi + echo "Rust files: $(find "$SRC_DIR" -name '*.rs' | wc -l | tr -d ' ')" + echo "Total LOC: $(find "$SRC_DIR" -name '*.rs' -exec cat {} + | wc -l | tr -d ' ')" +} > "$ENV_OUT" + +echo "=== Environment ===" +cat "$ENV_OUT" +echo "" + +# ── Warmup (populate OS page cache) ────────────────────────────────────── # + +echo "Warming up filesystem cache..." +find "$SRC_DIR" -name '*.rs' -exec cat {} + > /dev/null 2>&1 +echo "" + +# ── Command runner helper ───────────────────────────────────────────────── # +# Prints: time_us,output_bytes,output_tokens,result_count,exit_code + +count_tokens_tiktoken() { + local out_file="$1" + python3 - "$out_file" "$TOKENIZER_ENCODING" <<'PY' +import sys +from pathlib import Path +import tiktoken + +path = Path(sys.argv[1]) +encoding_name = sys.argv[2] +enc = tiktoken.get_encoding(encoding_name) +text = path.read_text(encoding="utf-8", errors="replace") +print(len(enc.encode(text))) +PY +} + +run_command_capture() { + local out_file="$1" + shift + + local time_file elapsed_s time_us bytes tokens lines exit_code + time_file="$(mktemp)" + + TIMEFORMAT='%R' + set +e + { time "$@" > "$out_file" 2>/dev/null; } 2> "$time_file" + exit_code=$? 
+ set -e + + elapsed_s="$(tr -d ' \t\r\n' < "$time_file")" + rm -f "$time_file" + + if [[ "$elapsed_s" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + time_us="$(awk -v s="$elapsed_s" 'BEGIN { printf "%.0f", s * 1000000 }')" + else + time_us=0 + fi + + bytes=$(wc -c < "$out_file" | tr -d ' ') + tokens=$(count_tokens_tiktoken "$out_file") + lines=$(wc -l < "$out_file" | tr -d ' ') + echo "${time_us},${bytes},${tokens},${lines},${exit_code}" +} + +# ── Quality sample capture ──────────────────────────────────────────────── # + +mkdir -p "$QUALITY_DIR" +rm -f "$QUALITY_DIR"/*.txt + +# ── Test matrix ─────────────────────────────────────────────────────────── # +# Fields: test_id category query grep_flags + +declare -a TEST_IDS=() +declare -a TEST_CATEGORIES=() +declare -a TEST_QUERIES=() +declare -a TEST_GREP_FLAGS=() + +add_test() { + TEST_IDS+=("$1") + TEST_CATEGORIES+=("$2") + TEST_QUERIES+=("$3") + TEST_GREP_FLAGS+=("$4") +} + +# Category A: Exact Identifier +add_test "A1" "exact_identifier" "TimedExecution" "" +add_test "A2" "exact_identifier" "FilterLevel" "" +add_test "A3" "exact_identifier" "classify_command" "" +add_test "A4" "exact_identifier" "package_manager_exec" "" +add_test "A5" "exact_identifier" "strip_ansi" "" +add_test "A6" "exact_identifier" "HISTORY_DAYS" "" + +# Category B: Regex Pattern +add_test "B1" "regex_pattern" 'fn run\(.*verbose: u8' "-E" +add_test "B2" "regex_pattern" 'timer\.track\(' "-E" +add_test "B3" "regex_pattern" '\.unwrap_or\(1\)' "-E" +add_test "B4" "regex_pattern" '#\[cfg\(test\)\]' "-E" +add_test "B5" "regex_pattern" 'HashMap 0" "" +add_test "D2" "cross_file" "anyhow::Result" "" +add_test "D3" "cross_file" "process::exit" "" +add_test "D4" "cross_file" "Command::new" "" +add_test "D5" "cross_file" "from_utf8_lossy" "" + +# Category E: Edge Cases +add_test "E1" "edge_case" "the" "" +add_test "E2" "edge_case" "fn" "" +add_test "E3" "edge_case" "error handling retry backoff" "" + +# ── CSV header 
──────────────────────────────────────────────────────────── # + +echo "test_id,category,query,tool,run,time_us,output_bytes,output_tokens,result_count,exit_code" > "$CSV_OUT" + +# ── Run matrix ──────────────────────────────────────────────────────────── # + +NUM_TESTS=${#TEST_IDS[@]} +echo "Running $NUM_TESTS tests × 4 tools × $RUNS runs = $(( NUM_TESTS * 4 * RUNS )) measurements" # CHANGED: 4 tools +echo "" + +for idx in $(seq 0 $(( NUM_TESTS - 1 ))); do + tid="${TEST_IDS[$idx]}" + category="${TEST_CATEGORIES[$idx]}" + query="${TEST_QUERIES[$idx]}" + grep_flags="${TEST_GREP_FLAGS[$idx]}" + + echo "[$tid] query=\"$query\"" + + for run in $(seq 1 $RUNS); do + tmp_grep="$(mktemp)" + tmp_rtk_grep="$(mktemp)" + tmp_rtk_rgai="$(mktemp)" + tmp_head_n="$(mktemp)" # ADDED: negative control + + grep_flag_arr=() + if [ -n "$grep_flags" ]; then + read -r -a grep_flag_arr <<< "$grep_flags" + fi + + # grep + grep_cmd=(grep -rn "${grep_flag_arr[@]}" -- "$query" "$SRC_DIR") + IFS=',' read -r grep_time grep_bytes grep_tokens grep_lines grep_exit < <(run_command_capture "$tmp_grep" "${grep_cmd[@]}") + echo "$tid,$category,\"$query\",grep,$run,$grep_time,$grep_bytes,$grep_tokens,$grep_lines,$grep_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_grep" "$QUALITY_DIR/${tid}_grep.txt" 2>/dev/null || true + fi + + # rtk grep + rtk_grep_cmd=("$RTK_BIN" grep "$query" "$SRC_DIR" --max "$RTK_GREP_MAX") + IFS=',' read -r rtk_grep_time rtk_grep_bytes rtk_grep_tokens rtk_grep_lines rtk_grep_exit < <(run_command_capture "$tmp_rtk_grep" "${rtk_grep_cmd[@]}") + echo "$tid,$category,\"$query\",rtk_grep,$run,$rtk_grep_time,$rtk_grep_bytes,$rtk_grep_tokens,$rtk_grep_lines,$rtk_grep_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_rtk_grep" "$QUALITY_DIR/${tid}_rtk_grep.txt" 2>/dev/null || true + fi + + # rtk rgai + rtk_rgai_cmd=("$RTK_BIN" rgai --path "$SRC_DIR" --max "$RGAI_MAX" -- "$query") + IFS=',' read -r rtk_rgai_time rtk_rgai_bytes rtk_rgai_tokens rtk_rgai_lines 
rtk_rgai_exit < <(run_command_capture "$tmp_rtk_rgai" "${rtk_rgai_cmd[@]}") + echo "$tid,$category,\"$query\",rtk_rgai,$run,$rtk_rgai_time,$rtk_rgai_bytes,$rtk_rgai_tokens,$rtk_rgai_lines,$rtk_rgai_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_rtk_rgai" "$QUALITY_DIR/${tid}_rtk_rgai.txt" 2>/dev/null || true + fi + + # head_n (NEGATIVE CONTROL) ────────────────────────────────── # ADDED: entire section + # Naive truncation baseline: just take first N lines of grep output + head -n "$HEAD_N_LINES" "$tmp_grep" > "$tmp_head_n" 2>/dev/null || true + head_n_tokens=$(count_tokens_tiktoken "$tmp_head_n") + head_n_bytes=$(wc -c < "$tmp_head_n" | tr -d ' ') + head_n_lines=$(wc -l < "$tmp_head_n" | tr -d ' ') + # Timing is negligible for head, use 0 + echo "$tid,$category,\"$query\",head_n,$run,0,$head_n_bytes,$head_n_tokens,$head_n_lines,0" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_head_n" "$QUALITY_DIR/${tid}_head_n.txt" 2>/dev/null || true + fi + + rm -f "$tmp_grep" "$tmp_rtk_grep" "$tmp_rtk_rgai" "$tmp_head_n" # CHANGED: added tmp_head_n + done + echo " done ($RUNS runs)" +done + +echo "" +echo "=== Generating Auto Gold Standards ===" # ADDED: entire section + +# Generate gold_auto.json from grep output (automatic verification) +python3 - "$QUALITY_DIR" "$GOLD_AUTO" "$PINNED_COMMIT" << 'PYEOF' +import json +import re +import sys +from pathlib import Path + +quality_dir = Path(sys.argv[1]) +output_path = Path(sys.argv[2]) +pinned_commit = sys.argv[3] + +def extract_rs_files(text: str) -> list[str]: + """Extract unique .rs filenames from grep output.""" + files = set() + for match in re.finditer(r"([A-Za-z0-9_./-]+\.rs)", text): + path = match.group(1) + # Normalize: strip src/ prefix, keep nested paths + if "/src/" in path: + path = path.split("/src/", 1)[1] + elif path.startswith("src/"): + path = path[4:] + path = path.lstrip("./") + if path.endswith(".rs"): + files.add(path) + return sorted(files) + +gold_auto = { + "metadata": { + 
"description": "Auto-generated gold standards from grep output", + "pinned_commit": pinned_commit, + "generated": "auto", + "notes": "Gold files extracted automatically from grep results - no manual curation" + }, + "queries": {} +} + +# Process each grep sample +for grep_file in sorted(quality_dir.glob("*_grep.txt")): + tid = grep_file.stem.replace("_grep", "") + text = grep_file.read_text(errors="replace") + gold_files = extract_rs_files(text) + + gold_auto["queries"][tid] = { + "gold_files_auto": gold_files, + "gold_file_count": len(gold_files), + "grep_lines": len(text.splitlines()), + "grep_bytes": len(text.encode("utf-8")) + } + +output_path.write_text(json.dumps(gold_auto, indent=2), encoding="utf-8") +print(f"Generated {output_path} with {len(gold_auto['queries'])} queries") +PYEOF + +echo "" +echo "=== Benchmark Complete ===" +echo "Raw results: $CSV_OUT" +echo "Quality samples: $QUALITY_DIR/" +echo "Auto gold: $GOLD_AUTO" # ADDED +echo "Environment: $ENV_OUT" +echo "" +echo "Total measurements: $(( $(wc -l < "$CSV_OUT") - 1 ))" +echo "" +echo "Next step: python3 benchmarks/analyze_code.py" diff --git a/benchmarks/gold_standards.json b/benchmarks/gold_standards.json new file mode 100644 index 0000000..d74610d --- /dev/null +++ b/benchmarks/gold_standards.json @@ -0,0 +1,285 @@ +{ + "metadata": { + "description": "Gold standards for code-search benchmark on rtk codebase", + "pinned_commit": "4b0a413562c775757d5bc09a6ff966b4e532508c", + "codebase": "rtk-ai/rtk", + "generated": "2026-02-15", + "notes": "Gold files verified by grep on pinned commit. gold_min_files = minimum unique .rs files expected in output." 
+ }, + "queries": { + "A1": { + "query": "TimedExecution", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["tracking.rs", "main.rs", "cargo_cmd.rs", "git.rs", "container.rs", "grep_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "gh_cmd.rs", "ls.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "Struct def + all usages across 34 command modules" + }, + "A2": { + "query": "FilterLevel", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["filter.rs", "main.rs", "read.rs"], + "gold_min_files": 3, + "expect_results": true, + "notes": "Enum def + variants in 3 files" + }, + "A3": { + "query": "classify_command", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["discover/registry.rs", "discover/mod.rs"], + "gold_min_files": 2, + "expect_results": true, + "notes": "Function def + caller in discover module" + }, + "A4": { + "query": "package_manager_exec", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["utils.rs", "vitest_cmd.rs", "playwright_cmd.rs", "prettier_cmd.rs", "lint_cmd.rs"], + "gold_min_files": 5, + "expect_results": true, + "notes": "Function def in utils.rs + callers in JS tooling modules" + }, + "A5": { + "query": "strip_ansi", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["utils.rs", "vitest_cmd.rs", "playwright_cmd.rs", "next_cmd.rs", "cargo_cmd.rs"], + "gold_min_files": 5, + "expect_results": true, + "notes": "Function def in utils.rs + callers" + }, + "A6": { + "query": "HISTORY_DAYS", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": true, + "notes": "Const def + usage in cleanup logic, single file" + }, + + "B1": { + "query": "fn run\\(.*verbose: u8", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["git.rs", "cargo_cmd.rs", "npm_cmd.rs", "container.rs", "ls.rs", "tsc_cmd.rs", "vitest_cmd.rs", "pnpm_cmd.rs"], + "gold_min_files": 25, + 
"expect_results": true, + "notes": "All run() signatures with verbose param on same line; 27 files match. NB: go_cmd/grep_cmd/runner use run_test()/run_err() so don't match this regex." + }, + "B2": { + "query": "timer\\.track\\(", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "gh_cmd.rs", "container.rs", "curl_cmd.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "All tracking calls; 34 files match (same scope as TimedExecution users)" + }, + "B3": { + "query": "\\.unwrap_or\\(1\\)", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "container.rs", "gh_cmd.rs", "tree.rs", "npm_cmd.rs", "gain.rs"], + "gold_min_files": 15, + "expect_results": true, + "notes": "Exit code fallback pattern .code().unwrap_or(1); 20 files match" + }, + "B4": { + "query": "#\\[cfg\\(test\\)\\]", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["filter.rs", "utils.rs", "discover/registry.rs", "vitest_cmd.rs", "next_cmd.rs", "playwright_cmd.rs", "runner.rs", "tsc_cmd.rs", "git.rs", "cargo_cmd.rs"], + "gold_min_files": 35, + "expect_results": true, + "notes": "Test module declarations; 41 files have #[cfg(test)] (nearly all modules)" + }, + "B5": { + "query": "HashMap 0", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "container.rs", "grep_cmd.rs", "pnpm_cmd.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "All verbose debug logging points; 90 occurrences across 37 files (includes parser/README.md)" + }, + "D2": { + "query": "anyhow::Result", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["tracking.rs", "config.rs", "discover/mod.rs", "log_cmd.rs", "diff_cmd.rs", "deps.rs", "find_cmd.rs", "env_cmd.rs", "local_llm.rs", "learn/mod.rs", "learn/report.rs"], + "gold_min_files": 11, + "expect_results": 
true, + "notes": "Literal 'anyhow::Result' only in 11 files with standalone import. Most files use 'use anyhow::{Context, Result}' which doesn't contain the literal substring." + }, + "D3": { + "query": "process::exit", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "container.rs", "gh_cmd.rs"], + "gold_min_files": 15, + "expect_results": true, + "notes": "All exit points across 19 files" + }, + "D4": { + "query": "Command::new", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "utils.rs", "go_cmd.rs", "grep_cmd.rs"], + "gold_min_files": 20, + "expect_results": true, + "notes": "All subprocess spawns across 25 files" + }, + "D5": { + "query": "from_utf8_lossy", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "utils.rs", "container.rs", "grep_cmd.rs"], + "gold_min_files": 25, + "expect_results": true, + "notes": "All lossy UTF-8 conversions across 29 files" + }, + + "E1": { + "query": "the", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": true, + "notes": "Stop word: grep/rtk grep flood output; rgai should filter or return minimal" + }, + "E2": { + "query": "fn", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": true, + "notes": "Ultra-common 2-char token: every .rs file matches; tests truncation limits" + }, + "E3": { + "query": "error handling retry backoff", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": false, + "notes": "Zero results expected from all tools — nothing in codebase matches this phrase" + } + } +} diff --git a/benchmarks/tests/__init__.py b/benchmarks/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/tests/test_analyze_code.py 
b/benchmarks/tests/test_analyze_code.py new file mode 100644 index 0000000..d769e7b --- /dev/null +++ b/benchmarks/tests/test_analyze_code.py @@ -0,0 +1,466 @@ +""" +Tests for analyze_code.py benchmark analyzer. + +Covers: + - median_val computation + - is_valid_exit semantics + - extract_filenames from various output formats + - compute_gold_hit_rate accuracy + - is_miss detection (critical rule) + - format_te / format_pct output + - compute_metrics end-to-end with mock data +""" + +from __future__ import annotations + +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +import sys + +# Add parent dir to path so we can import analyze_code +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import analyze_code as ac + + +class TestMedianVal(unittest.TestCase): + def test_odd_count(self): + self.assertEqual(ac.median_val([3, 1, 2]), 2.0) + + def test_even_count(self): + self.assertEqual(ac.median_val([1, 2, 3, 4]), 2.5) + + def test_single(self): + self.assertEqual(ac.median_val([42]), 42.0) + + def test_empty(self): + self.assertEqual(ac.median_val([]), 0.0) + + def test_already_sorted(self): + self.assertEqual(ac.median_val([10, 20, 30, 40, 50]), 30.0) + + def test_duplicates(self): + self.assertEqual(ac.median_val([5, 5, 5, 5, 5]), 5.0) + + +class TestIsValidExit(unittest.TestCase): + def test_exit_0_valid(self): + self.assertTrue(ac.is_valid_exit(0)) + + def test_exit_1_valid(self): + """exit=1 means 'no matches' for grep — still valid.""" + self.assertTrue(ac.is_valid_exit(1)) + + def test_exit_2_invalid(self): + """exit>=2 means execution error.""" + self.assertFalse(ac.is_valid_exit(2)) + + def test_exit_127_invalid(self): + self.assertFalse(ac.is_valid_exit(127)) + + +class TestExtractFilenames(unittest.TestCase): + def test_grep_output(self): + """Standard grep -rn output format.""" + text = ( + "src/tracking.rs:42:pub struct TimedExecution {\n" + "src/main.rs:100: let timer = 
TimedExecution::new();\n" + "src/discover/registry.rs:77:const RULES: &[RtkRule] = &[\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("main.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_rtk_file_headers(self): + """RTK grouped output headers.""" + text = ( + "📄 /Users/andrew/Programming/rtk/src/tracking.rs (1):\n" + "📄 /.../discover/registry.rs [9.1]\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_no_filenames(self): + """Text with no .rs files.""" + text = "no rust files here\njust plain text" + filenames = ac.extract_filenames(text) + self.assertEqual(len(filenames), 0) + + def test_nested_path(self): + """Nested directory paths.""" + text = "src/parser/mod.rs:1:pub mod types;" + filenames = ac.extract_filenames(text) + self.assertIn("parser/mod.rs", filenames) + + def test_absolute_path_normalization(self): + text = ( + "/Users/andrew/Programming/rtk/src/tracking.rs:42:code\n" + "/Users/andrew/Programming/rtk/src/discover/registry.rs:77:code\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_deduplication(self): + """Same file appearing multiple times.""" + text = ( + "src/git.rs:10:code\n" + "src/git.rs:20:more code\n" + "src/git.rs:30:even more\n" + ) + filenames = ac.extract_filenames(text) + self.assertEqual(filenames.count("git.rs") if isinstance(filenames, list) else 1, 1) + self.assertIn("git.rs", filenames) + + def test_does_not_parse_rs_inside_code_snippet(self): + text = ' 565: classify_command("cat src/main.rs"),' + filenames = ac.extract_filenames(text) + self.assertEqual(len(filenames), 0) + + +class TestComputeGoldHitRate(unittest.TestCase): + def test_all_found(self): + sample = "src/tracking.rs:1:code\nsrc/main.rs:2:code\n" + gold = ["tracking.rs", 
"main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 1.0) + + def test_partial_found(self): + sample = "src/tracking.rs:1:code\nsrc/utils.rs:2:code\n" + gold = ["tracking.rs", "main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 0.5) + + def test_none_found(self): + sample = "src/utils.rs:1:code\n" + gold = ["tracking.rs", "main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 0.0) + + def test_empty_gold_files(self): + """No gold files => N/A for hit-rate.""" + sample = "src/anything.rs:1:code\n" + self.assertIsNone(ac.compute_gold_hit_rate(sample, [])) + + def test_empty_sample(self): + gold = ["tracking.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate("", gold), 0.0) + + def test_nested_gold_files(self): + sample = "src/discover/registry.rs:77:const RULES\n" + gold = ["discover/registry.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 1.0) + + +class TestIsMiss(unittest.TestCase): + def test_zero_results_expects_results(self): + """0 results when gold expects results → MISS.""" + self.assertTrue(ac.is_miss(0, True)) + + def test_zero_results_expects_nothing(self): + """0 results when gold expects nothing → NOT miss.""" + self.assertFalse(ac.is_miss(0, False)) + + def test_has_results_expects_results(self): + """Has results when expected → NOT miss.""" + self.assertFalse(ac.is_miss(42, True)) + + def test_has_results_expects_nothing(self): + """Has results when nothing expected → NOT miss (unexpected but not MISS).""" + self.assertFalse(ac.is_miss(5, False)) + + +class TestFormatFunctions(unittest.TestCase): + def test_format_te_normal(self): + self.assertEqual(ac.format_te(0.123, False), "0.123") + + def test_format_te_miss(self): + self.assertEqual(ac.format_te(0.5, True), "MISS") + + def test_format_te_none(self): + self.assertEqual(ac.format_te(None, False), "N/A") + + def test_format_pct_savings(self): + self.assertEqual(ac.format_pct(0.3, False), "70.0%") + + def 
test_format_pct_miss(self): + self.assertEqual(ac.format_pct(0.3, True), "MISS") + + def test_format_pct_expansion(self): + """TE > 1.0 means output is larger than grep baseline.""" + self.assertEqual(ac.format_pct(1.5, False), "-50.0%") + + def test_format_gold_full(self): + self.assertEqual(ac.format_gold(1.0, 10, 5), "100% (10/5)") + + def test_format_gold_partial(self): + self.assertEqual(ac.format_gold(0.6, 3, 8), "60% (3/8)") + + def test_format_gold_none(self): + self.assertEqual(ac.format_gold(None, 0, 0), "N/A") + + def test_format_gold_none_with_min(self): + self.assertEqual(ac.format_gold(None, 2, 10), "N/A (2/10)") + + def test_format_timing_microseconds(self): + self.assertEqual(ac.format_timing(500), "500μs") + + def test_format_timing_milliseconds(self): + self.assertEqual(ac.format_timing(5000), "5.0ms") + + def test_format_timing_seconds(self): + self.assertEqual(ac.format_timing(2_500_000), "2.50s") + + +class TestComputeMetrics(unittest.TestCase): + """End-to-end test of compute_metrics with synthetic data.""" + + def _make_rows(self, tid, tool, runs=5, time=1000, output_bytes=500, + output_tokens=100, result_count=10, exit_code=0): + """Helper to generate mock CSV rows.""" + return [ + { + "test_id": tid, + "category": "exact_identifier", + "query": f'"{tid} query"', + "tool": tool, + "run": i + 1, + "time_us": time + i * 10, + "output_bytes": output_bytes, + "output_tokens": output_tokens, + "result_count": result_count, + "exit_code": exit_code, + } + for i in range(runs) + ] + + def test_basic_metrics(self): + """Verify TE computation for a simple case.""" + gold = { + "T1": { + "query": "test", + "category": "exact_identifier", + "gold_files": [], + "gold_min_files": 0, + "expect_results": True, + } + } + rows = ( + self._make_rows("T1", "grep", output_bytes=1000, output_tokens=1000, result_count=50) + + self._make_rows("T1", "rtk_grep", output_bytes=300, output_tokens=300, result_count=20) + + self._make_rows("T1", "rtk_rgai", 
output_bytes=200, output_tokens=200, result_count=10) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + self.assertEqual(len(metrics), 1) + m = metrics[0] + # TE = rtk_grep_tokens / grep_tokens = 300/1000 = 0.3 + self.assertAlmostEqual(m["rtk_grep_te"], 0.3) + # TE = rtk_rgai_tokens / grep_tokens = 200/1000 = 0.2 + self.assertAlmostEqual(m["rtk_rgai_te"], 0.2) + + def test_miss_detection(self): + """0 result count with expect_results=True → MISS.""" + gold = { + "T2": { + "query": "test", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T2", "grep", output_bytes=0, result_count=0) + + self._make_rows("T2", "rtk_grep", output_bytes=0, result_count=0) + + self._make_rows("T2", "rtk_rgai", output_bytes=500, result_count=5) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertTrue(m["grep_miss"]) + self.assertTrue(m["rtk_grep_miss"]) + self.assertFalse(m["rtk_rgai_miss"]) + + def test_miss_detection_with_rtk_zero_marker(self): + """rtk '0 for' marker should force effective result_count=0.""" + gold = { + "T2B": { + "query": "semantic query", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T2B", "grep", output_bytes=0, result_count=0) + + self._make_rows("T2B", "rtk_grep", output_bytes=42, result_count=1) + + self._make_rows("T2B", "rtk_rgai", output_bytes=2400, result_count=80) + ) + + def fake_sample(tid, tool): + if tid == "T2B" and tool == "rtk_grep": + return "🔍 0 for 'semantic query'\n" + return "" + + with patch.object(ac, "load_quality_sample", side_effect=fake_sample): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertEqual(m["rtk_grep_count"], 0) + 
self.assertTrue(m["rtk_grep_miss"]) + + def test_no_miss_when_not_expected(self): + """0 results with expect_results=False → NOT miss.""" + gold = { + "T3": { + "query": "nonexistent", + "category": "edge_case", + "gold_files": [], + "gold_min_files": 0, + "expect_results": False, + } + } + rows = ( + self._make_rows("T3", "grep", output_bytes=0, result_count=0) + + self._make_rows("T3", "rtk_grep", output_bytes=0, result_count=0) + + self._make_rows("T3", "rtk_rgai", output_bytes=0, result_count=0) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertFalse(m["grep_miss"]) + self.assertFalse(m["rtk_grep_miss"]) + self.assertFalse(m["rtk_rgai_miss"]) + + def test_grep_baseline_zero_te_none(self): + """When grep baseline is 0 bytes, TE should be None.""" + gold = { + "T4": { + "query": "rare", + "category": "exact_identifier", + "gold_files": [], + "gold_min_files": 0, + "expect_results": False, + } + } + rows = ( + self._make_rows("T4", "grep", output_bytes=0, output_tokens=0, result_count=0) + + self._make_rows("T4", "rtk_grep", output_bytes=100, result_count=2) + + self._make_rows("T4", "rtk_rgai", output_bytes=50, result_count=1) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertIsNone(m["rtk_grep_te"]) + self.assertIsNone(m["rtk_rgai_te"]) + + def test_zero_gold_hit_marks_low_coverage(self): + gold = { + "T5": { + "query": "semantic", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T5", "grep", output_tokens=100, result_count=10) + + self._make_rows("T5", "rtk_rgai", output_tokens=20, result_count=5) + ) + + def fake_sample(tid, tool): + if tool == "grep": + return "src/tracking.rs:1:code\n" + if tool == "rtk_rgai": + return "📄 src/utils.rs [10.0]\n" + return "" + + with 
patch.object(ac, "load_quality_sample", side_effect=fake_sample): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertTrue(m["rtk_rgai_low_coverage"]) + + +class TestGoldStandardsIntegrity(unittest.TestCase): + """Verify gold_standards.json is well-formed.""" + + def setUp(self): + gold_path = Path(__file__).resolve().parent.parent / "gold_standards.json" + with open(gold_path, encoding="utf-8") as f: + self.data = json.load(f) + self.queries = self.data["queries"] + + def test_has_metadata(self): + self.assertIn("metadata", self.data) + self.assertIn("pinned_commit", self.data["metadata"]) + + def test_query_count(self): + """Should have exactly 30 queries.""" + self.assertEqual(len(self.queries), 30) + + def test_category_distribution(self): + """A=6, B=6, C=10, D=5, E=3.""" + cats = [q["category"] for q in self.queries.values()] + self.assertEqual(cats.count("exact_identifier"), 6) + self.assertEqual(cats.count("regex_pattern"), 6) + self.assertEqual(cats.count("semantic_intent"), 10) + self.assertEqual(cats.count("cross_file"), 5) + self.assertEqual(cats.count("edge_case"), 3) + + def test_required_fields(self): + """Every query has required fields.""" + required = {"query", "category", "grep_flags", "gold_files", + "gold_min_files", "expect_results", "notes"} + for tid, q in self.queries.items(): + for field in required: + self.assertIn( + field, q, + f"Query {tid} missing field '{field}'" + ) + + def test_id_prefix_matches_category(self): + """A* → exact_identifier, B* → regex_pattern, etc.""" + prefix_map = { + "A": "exact_identifier", + "B": "regex_pattern", + "C": "semantic_intent", + "D": "cross_file", + "E": "edge_case", + } + for tid, q in self.queries.items(): + expected_cat = prefix_map.get(tid[0]) + self.assertEqual( + q["category"], expected_cat, + f"Query {tid} has category '{q['category']}' " + f"but expected '{expected_cat}'" + ) + + def test_e3_expects_no_results(self): + """E3 (nonexistent phrase) should expect no 
results.""" + self.assertFalse(self.queries["E3"]["expect_results"]) + + def test_gold_files_are_lists(self): + for tid, q in self.queries.items(): + self.assertIsInstance( + q["gold_files"], list, + f"Query {tid} gold_files is not a list" + ) + + +if __name__ == "__main__": + unittest.main()