diff --git a/benchmarks/RESULTS.md b/benchmarks/RESULTS.md new file mode 100644 index 0000000..9bae898 --- /dev/null +++ b/benchmarks/RESULTS.md @@ -0,0 +1,271 @@ +# Code Search Benchmark: grep vs rtk grep vs rtk rgai vs head_n + +## Environment & Reproduction + +``` +Date: Sat Feb 14 22:34:25 UTC 2026 +Commit: 4b0a413562c775757d5bc09a6ff966b4e532508c +rtk_bin: /Users/andrew/Programming/rtk/target/release/rtk +rtk: rtk 0.15.3 +grep: grep (BSD grep, GNU compatible) 2.6.0-FreeBSD +tiktoken_encoding: cl100k_base +rtk_grep_max: 200 +rgai_max: 8 +head_n_lines: 100 +OS: Darwin MacBook-Pro-Andy.local 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:56 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6041 arm64 +CPU: Apple M4 Pro +Rust files: 54 +Total LOC: 23240 +``` + +## Dataset: rtk-ai/rtk @ `4b0a413562c775757d5bc09a6ff966b4e532508c` + +**Reproduction**: +```bash +rtk --version +bash benchmarks/bench_code.sh +python3 benchmarks/analyze_code.py +python3 -m unittest discover -s benchmarks/tests -p 'test_*.py' +``` + +## Methodology + +### Metrics (reported separately, NO composite score) + +| Metric | Definition | Purpose | +|--------|-----------|---------| +| Output bytes | `wc -c` of stdout | Raw size footprint | +| Output tokens | `tiktoken` (`cl100k_base`) on full stdout | Model-aligned token cost | +| Token Efficiency (TE) | `output_tokens / grep_output_tokens` | Token compression vs baseline | +| Result count | Effective output lines / no-result aware count | Distinguish compactness vs empty results | +| Gold hit rate | `% gold_files found` (plus found/min files) | Relevance/correctness | +| Timing | Median of 5 runs, plus min/max in summaries | Performance distribution | + +**Critical rule**: if `expect_results=true` and `result_count==0`, mark as **MISS**. +For regex category, `rtk rgai` is marked `EXPECTED_UNSUPPORTED` by design. 
+ +### Categories + +| Category | Queries | +|----------|---------| +| A: Exact Identifier | 6 | +| B: Regex Pattern | 6 | +| C: Semantic Intent | 10 | +| D: Cross-File Pattern Discovery | 5 | +| E: Edge Cases | 3 | + +## Category A: Exact Identifier Search + +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| A1 | TimedExecution | grep | 10338 | 2927 | 1.000 | 104 | 100% (34/30) | 6.0ms | OK | +| A1 | TimedExecution | rtk_grep | 6527 | 1979 | 0.676 | 159 | 100% (34/30) | 25.0ms | OK | +| A1 | TimedExecution | rtk_rgai | 2797 | 841 | 0.287 | 94 | 60% (8/30) | 11.0ms | LOW_COVERAGE | +| A1 | TimedExecution | head_n | 9933 | 2810 | 0.960 | 100 | 100% (32/30) | 0μs | OK | +| A2 | FilterLevel | grep | 2196 | 605 | 1.000 | 23 | 100% (3/3) | 5.0ms | OK | +| A2 | FilterLevel | rtk_grep | 902 | 288 | 0.476 | 25 | 100% (3/3) | 25.0ms | OK | +| A2 | FilterLevel | rtk_rgai | 691 | 223 | 0.369 | 32 | 100% (3/3) | 10.0ms | OK | +| A2 | FilterLevel | head_n | 2196 | 605 | 1.000 | 23 | 100% (3/3) | 0μs | OK | +| A3 | classify_command | grep | 2524 | 626 | 1.000 | 22 | 100% (2/2) | 7.0ms | OK | +| A3 | classify_command | rtk_grep | 817 | 225 | 0.359 | 20 | 100% (2/2) | 24.0ms | OK | +| A3 | classify_command | rtk_rgai | 782 | 200 | 0.319 | 25 | 100% (2/2) | 11.0ms | OK | +| A3 | classify_command | head_n | 2524 | 626 | 1.000 | 22 | 100% (2/2) | 0μs | OK | +| A4 | package_manager_exec | grep | 918 | 260 | 1.000 | 9 | 100% (5/5) | 7.0ms | OK | +| A4 | package_manager_exec | rtk_grep | 797 | 246 | 0.946 | 21 | 100% (5/5) | 25.0ms | OK | +| A4 | package_manager_exec | rtk_rgai | 1370 | 381 | 1.465 | 44 | 100% (5/5) | 11.0ms | OK | +| A4 | package_manager_exec | head_n | 918 | 260 | 1.000 | 9 | 100% (5/5) | 0μs | OK | +| A5 | strip_ansi | grep | 1852 | 539 | 1.000 | 20 | 100% (5/5) | 8.0ms | OK | +| A5 | strip_ansi | rtk_grep | 1197 | 388 | 0.720 | 
33 | 100% (5/5) | 24.0ms | OK | +| A5 | strip_ansi | rtk_rgai | 1264 | 425 | 0.788 | 51 | 100% (5/5) | 10.0ms | OK | +| A5 | strip_ansi | head_n | 1852 | 539 | 1.000 | 20 | 100% (5/5) | 0μs | OK | +| A6 | HISTORY_DAYS | grep | 201 | 61 | 1.000 | 2 | 100% (1/1) | 5.0ms | OK | +| A6 | HISTORY_DAYS | rtk_grep | 182 | 66 | 1.082 | 6 | 100% (1/1) | 24.0ms | OK | +| A6 | HISTORY_DAYS | rtk_rgai | 686 | 208 | 3.410 | 23 | 100% (2/1) | 11.0ms | OK | +| A6 | HISTORY_DAYS | head_n | 201 | 61 | 1.000 | 2 | 100% (1/1) | 0μs | OK | + +### Category A: Exact Identifier Search — Summary + +- **grep**: | TE min/med/max=1.000/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=5.0ms / 6.5ms / 8.0ms +- **rtk_grep**: | TE min/med/max=0.359/0.698/1.082 | gold hit min/med/max=100%/100%/100% | time min/med/max=22.0ms / 24.5ms / 44.0ms +- **rtk_rgai**: | TE min/med/max=0.287/0.579/3.410 | gold hit min/med/max=60%/100%/100% | time min/med/max=10.0ms / 11.0ms / 13.0ms | LOW_COVERAGE=1 +- **head_n**: | TE min/med/max=0.960/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=0μs / 0μs / 0μs + +## Category B: Regex Pattern Search + +> `rtk rgai` does not support regex; misses are EXPECTED_UNSUPPORTED. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| B1 | fn run\(.*verbose: u8 | grep | 3128 | 999 | 1.000 | 27 | 100% (27/25) | 7.0ms | OK | +| B1 | fn run\(.*verbose: u8 | rtk_grep | 3518 | 1206 | 1.207 | 83 | 100% (27/25) | 25.0ms | OK | +| B1 | fn run\(.*verbose: u8 | rtk_rgai | 3264 | 1065 | 1.066 | 99 | 38% (8/25) | 12.0ms | EXPECTED_UNSUPPORTED | +| B1 | fn run\(.*verbose: u8 | head_n | 3128 | 999 | 1.000 | 27 | 100% (27/25) | 0μs | OK | +| B2 | timer\.track\( | grep | 10764 | 3338 | 1.000 | 116 | 100% (34/30) | 8.0ms | OK | +| B2 | timer\.track\( | rtk_grep | 5723 | 1979 | 0.593 | 158 | 100% (34/30) | 24.0ms | OK | +| B2 | timer\.track\( | rtk_rgai | 2542 | 803 | 0.241 | 93 | 50% (8/30) | 12.0ms | EXPECTED_UNSUPPORTED | +| B2 | timer\.track\( | head_n | 9143 | 2822 | 0.845 | 100 | 100% (32/30) | 0μs | OK | +| B3 | \.unwrap_or\(1\) | grep | 5347 | 1513 | 1.000 | 48 | 100% (20/15) | 7.0ms | OK | +| B3 | \.unwrap_or\(1\) | rtk_grep | 3806 | 1200 | 0.793 | 87 | 100% (20/15) | 24.0ms | OK | +| B3 | \.unwrap_or\(1\) | rtk_rgai | 2777 | 885 | 0.585 | 106 | 50% (8/15) | 11.0ms | EXPECTED_UNSUPPORTED | +| B3 | \.unwrap_or\(1\) | head_n | 5347 | 1513 | 1.000 | 48 | 100% (20/15) | 0μs | OK | +| B4 | #\[cfg\(test\)\] | grep | 2605 | 845 | 1.000 | 41 | 100% (41/35) | 5.0ms | OK | +| B4 | #\[cfg\(test\)\] | rtk_grep | 3098 | 1142 | 1.351 | 125 | 100% (41/35) | 25.0ms | OK | +| B4 | #\[cfg\(test\)\] | rtk_rgai | 2247 | 716 | 0.847 | 101 | 40% (7/35) | 11.0ms | EXPECTED_UNSUPPORTED | +| B4 | #\[cfg\(test\)\] | head_n | 2605 | 845 | 1.000 | 41 | 100% (41/35) | 0μs | OK | +| B5 | HashMap For multi-concept queries, grep exact-substring misses are expected and shown as MISS. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| C1 | token savings tracking database | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C1 | token savings tracking database | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 24.0ms | **MISS** | +| C1 | token savings tracking database | rtk_rgai | 2801 | 832 | N/A | 102 | 100% (8/1) | 15.0ms | OK | +| C1 | token savings tracking database | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C2 | exit code preservation | grep | 0 | 0 | MISS | 0 | N/A (0/2) | 9.0ms | **MISS** | +| C2 | exit code preservation | rtk_grep | 36 | 11 | MISS | 0 | 0% (0/2) | 25.0ms | **MISS** | +| C2 | exit code preservation | rtk_rgai | 2158 | 702 | N/A | 96 | 80% (8/2) | 12.0ms | OK | +| C2 | exit code preservation | head_n | 0 | 0 | MISS | 0 | N/A (0/2) | 0μs | **MISS** | +| C3 | language aware code filtering | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C3 | language aware code filtering | rtk_grep | 43 | 12 | MISS | 0 | 0% (0/1) | 24.0ms | **MISS** | +| C3 | language aware code filtering | rtk_rgai | 3112 | 926 | N/A | 103 | 100% (8/1) | 14.0ms | OK | +| C3 | language aware code filtering | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C4 | output grouping by file | grep | 0 | 0 | MISS | 0 | N/A (0/2) | 8.0ms | **MISS** | +| C4 | output grouping by file | rtk_grep | 37 | 12 | MISS | 0 | 0% (0/2) | 25.0ms | **MISS** | +| C4 | output grouping by file | rtk_rgai | 3348 | 989 | N/A | 105 | 0% (8/2) | 13.0ms | LOW_COVERAGE | +| C4 | output grouping by file | head_n | 0 | 0 | MISS | 0 | N/A (0/2) | 0μs | **MISS** | +| C5 | three tier parser degradation | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 9.0ms | **MISS** | +| C5 | three tier parser degradation | rtk_grep | 43 | 12 | MISS | 0 | 0% (0/1) | 25.0ms | **MISS** | +| C5 | three tier parser degradation | 
rtk_rgai | 2453 | 741 | N/A | 95 | 50% (7/1) | 13.0ms | OK | +| C5 | three tier parser degradation | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C6 | ANSI color stripping cleanup | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 6.0ms | **MISS** | +| C6 | ANSI color stripping cleanup | rtk_grep | 42 | 13 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C6 | ANSI color stripping cleanup | rtk_rgai | 2139 | 697 | N/A | 92 | 100% (8/1) | 14.0ms | OK | +| C6 | ANSI color stripping cleanup | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C7 | hook installation settings json | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C7 | hook installation settings json | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C7 | hook installation settings json | rtk_rgai | 2940 | 907 | N/A | 104 | 100% (8/1) | 15.0ms | OK | +| C7 | hook installation settings json | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C8 | command classification discover | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 9.0ms | **MISS** | +| C8 | command classification discover | rtk_grep | 45 | 11 | MISS | 0 | 0% (0/1) | 26.0ms | **MISS** | +| C8 | command classification discover | rtk_rgai | 2867 | 796 | N/A | 104 | 100% (8/1) | 13.0ms | OK | +| C8 | command classification discover | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C9 | pnpm yarn npm auto detection | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 8.0ms | **MISS** | +| C9 | pnpm yarn npm auto detection | rtk_grep | 42 | 14 | MISS | 0 | 0% (0/1) | 27.0ms | **MISS** | +| C9 | pnpm yarn npm auto detection | rtk_rgai | 2682 | 931 | N/A | 104 | 100% (8/1) | 14.0ms | OK | +| C9 | pnpm yarn npm auto detection | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | +| C10 | SQLite retention cleanup policy | grep | 0 | 0 | MISS | 0 | N/A (0/1) | 11.0ms | **MISS** | +| C10 | SQLite retention cleanup policy | rtk_grep | 45 | 12 | MISS | 0 | 0% (0/1) | 25.0ms | **MISS** | +| C10 | SQLite retention 
cleanup policy | rtk_rgai | 806 | 241 | N/A | 27 | 100% (2/1) | 14.0ms | OK | +| C10 | SQLite retention cleanup policy | head_n | 0 | 0 | MISS | 0 | N/A (0/1) | 0μs | **MISS** | + +### Category C: Semantic Intent Search — Summary + +- **grep**: | time min/med/max=6.0ms / 8.0ms / 37.0ms | MISS=10 +- **rtk_grep**: | gold hit min/med/max=0%/0%/0% | time min/med/max=23.0ms / 25.0ms / 63.0ms | MISS=10 +- **rtk_rgai**: | gold hit min/med/max=0%/100%/100% | time min/med/max=12.0ms / 14.0ms / 18.0ms | LOW_COVERAGE=1 +- **head_n**: | time min/med/max=0μs / 0μs / 0μs | MISS=10 + +## Category D: Cross-File Pattern Discovery + +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| D1 | verbose > 0 | grep | 6540 | 2112 | 1.000 | 90 | 100% (36/30) | 6.0ms | OK | +| D1 | verbose > 0 | rtk_grep | 4307 | 1634 | 0.774 | 162 | 100% (36/30) | 25.0ms | OK | +| D1 | verbose > 0 | rtk_rgai | 2238 | 709 | 0.336 | 97 | 50% (8/30) | 11.0ms | LOW_COVERAGE | +| D1 | verbose > 0 | head_n | 6540 | 2112 | 1.000 | 90 | 100% (36/30) | 0μs | OK | +| D2 | anyhow::Result | grep | 753 | 235 | 1.000 | 11 | 100% (11/11) | 8.0ms | OK | +| D2 | anyhow::Result | rtk_grep | 954 | 333 | 1.417 | 35 | 100% (11/11) | 24.0ms | OK | +| D2 | anyhow::Result | rtk_rgai | 2416 | 765 | 3.255 | 102 | 73% (8/11) | 12.0ms | LOW_COVERAGE | +| D2 | anyhow::Result | head_n | 753 | 235 | 1.000 | 11 | 100% (11/11) | 0μs | OK | +| D3 | process::exit | grep | 5234 | 1474 | 1.000 | 47 | 100% (19/15) | 7.0ms | OK | +| D3 | process::exit | rtk_grep | 3682 | 1154 | 0.783 | 84 | 100% (19/15) | 24.0ms | OK | +| D3 | process::exit | rtk_rgai | 2538 | 804 | 0.545 | 106 | 83% (8/15) | 12.0ms | LOW_COVERAGE | +| D3 | process::exit | head_n | 5234 | 1474 | 1.000 | 47 | 100% (19/15) | 0μs | OK | +| D4 | Command::new | grep | 9867 | 2999 | 1.000 | 111 | 100% (24/20) | 5.0ms | OK | +| D4 | Command::new | 
rtk_grep | 5321 | 1790 | 0.597 | 145 | 100% (24/20) | 25.0ms | OK | +| D4 | Command::new | rtk_rgai | 2283 | 769 | 0.256 | 102 | 57% (8/20) | 12.0ms | LOW_COVERAGE | +| D4 | Command::new | head_n | 8937 | 2700 | 0.900 | 100 | 100% (23/20) | 0μs | OK | +| D5 | from_utf8_lossy | grep | 17304 | 5038 | 1.000 | 157 | 100% (28/25) | 7.0ms | OK | +| D5 | from_utf8_lossy | rtk_grep | 8386 | 2572 | 0.511 | 168 | 100% (28/25) | 25.0ms | OK | +| D5 | from_utf8_lossy | rtk_rgai | 2767 | 867 | 0.172 | 94 | 29% (8/25) | 11.0ms | LOW_COVERAGE | +| D5 | from_utf8_lossy | head_n | 10775 | 3127 | 0.621 | 100 | 43% (17/25) | 0μs | LOW_COVERAGE | + +### Category D: Cross-File Pattern Discovery — Summary + +- **grep**: | TE min/med/max=1.000/1.000/1.000 | gold hit min/med/max=100%/100%/100% | time min/med/max=5.0ms / 7.0ms / 8.0ms +- **rtk_grep**: | TE min/med/max=0.511/0.774/1.417 | gold hit min/med/max=100%/100%/100% | time min/med/max=23.0ms / 25.0ms / 27.0ms +- **rtk_rgai**: | TE min/med/max=0.172/0.336/3.255 | gold hit min/med/max=29%/57%/83% | time min/med/max=11.0ms / 12.0ms / 13.0ms | LOW_COVERAGE=5 +- **head_n**: | TE min/med/max=0.621/1.000/1.000 | gold hit min/med/max=43%/100%/100% | time min/med/max=0μs / 0μs / 0μs | LOW_COVERAGE=1 + +## Category E: Edge Cases + +> Edge cases are discussed per-case; no category-level winner is inferred. 
+ +| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status | +|----|-------|------|-------|--------|----|-------------|----------|-------------|--------| +| E1 | the | grep | 19971 | 5421 | 1.000 | 178 | N/A | 8.0ms | OK | +| E1 | the | rtk_grep | 11273 | 3399 | 0.627 | 239 | N/A | 26.0ms | OK | +| E1 | the | rtk_rgai | 2359 | 779 | 0.144 | 106 | N/A | 10.0ms | OK | +| E1 | the | head_n | 11771 | 3170 | 0.585 | 100 | N/A | 0μs | OK | +| E2 | fn | grep | 77939 | 23141 | 1.000 | 784 | N/A | 7.0ms | OK | +| E2 | fn | rtk_grep | 12744 | 4052 | 0.175 | 264 | N/A | 26.0ms | OK | +| E2 | fn | rtk_rgai | 2733 | 872 | 0.038 | 101 | N/A | 10.0ms | OK | +| E2 | fn | head_n | 10320 | 3103 | 0.134 | 100 | N/A | 0μs | OK | +| E3 | error handling retry backoff | grep | 0 | 0 | N/A | 0 | N/A | 9.0ms | OK | +| E3 | error handling retry backoff | rtk_grep | 42 | 13 | N/A | 0 | N/A | 25.0ms | OK | +| E3 | error handling retry backoff | rtk_rgai | 2340 | 756 | N/A | 102 | N/A | 13.0ms | **UNEXPECTED_HIT** | +| E3 | error handling retry backoff | head_n | 0 | 0 | N/A | 0 | N/A | 0μs | OK | + +## Summary: When to Use Which Tool + +| Situation | Recommended | Evidence | +|-----------|-------------|----------| +| Exact identifier search (Category A) | rtk_grep | median gold hit=100%, MISS=0, LOW_COVERAGE=0, median TE=0.698 | +| Cross-file pattern discovery (Category D) | rtk_grep | median gold hit=100%, MISS=0, LOW_COVERAGE=0, median TE=0.774 | +| Semantic intent search (Category C) | rtk_rgai | median gold hit=100%, MISS=0, LOW_COVERAGE=1, UNEXPECTED_HIT=0, median TE=N/A | +| Regex patterns (Category B) | grep / rtk grep | `rtk rgai` expected unsupported for regex | +| Exact zero-result validation (E3) | grep / rtk grep | Unexpected hits observed for: rtk_rgai | + +## Failure Modes + +### grep +- Floods output on broad/common queries. +- Misses semantic intent queries that do not appear as exact substrings. +- No built-in grouping/truncation. 
+ +### rtk grep +- Output truncation (`--max 200`) can reduce recall in high-frequency queries. +- Still exact-match based (no semantic expansion). + +### rtk rgai +- Regex queries are unsupported by design. +- Can return semantically related content even when strict zero results are expected. +- Quality depends on ranking/model behavior and may vary by environment. + +### head_n (negative control) +- Naive truncation may look token-efficient but is relevance-blind. +- Useful as a floor comparator, not as a production recommendation. + +## Limitations + +- Single codebase benchmark (`src/` Rust files only). +- Gold standards are author-defined and include subjective intent mapping. +- Gold hit is computed from first-run samples; non-deterministic tools may vary across runs. +- Timing is hardware and background-load dependent. diff --git a/benchmarks/analyze_code.py b/benchmarks/analyze_code.py new file mode 100644 index 0000000..2404df3 --- /dev/null +++ b/benchmarks/analyze_code.py @@ -0,0 +1,679 @@ +#!/usr/bin/env python3 +""" +Analyze code-search benchmark results and generate RESULTS.md. + +Rules: + - No composite score. + - Per-category analysis only. + - Report distributions (min/median/max). + - If gold expects results and result_count == 0 => MISS. + - Regex category: rgai is EXPECTED_UNSUPPORTED (not failure). 
+""" + +from __future__ import annotations + +import csv +import json +import re +import sys +from collections import defaultdict +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +CSV_PATH = SCRIPT_DIR / "results_raw.csv" +ENV_PATH = SCRIPT_DIR / "results_env.txt" +GOLD_PATH = SCRIPT_DIR / "gold_standards.json" +GOLD_AUTO_PATH = SCRIPT_DIR / "gold_auto.json" # ADDED: auto-generated gold +QUALITY_DIR = SCRIPT_DIR / "quality_samples" +RESULTS_PATH = SCRIPT_DIR / "RESULTS.md" + +TOOLS = ("grep", "rtk_grep", "rtk_rgai", "head_n") # CHANGED: added head_n +RECOMMENDABLE_TOOLS = ("grep", "rtk_grep", "rtk_rgai") +CATEGORY_ORDER = [ + "exact_identifier", + "regex_pattern", + "semantic_intent", + "cross_file", + "edge_case", +] +CATEGORY_TITLES = { + "exact_identifier": "Category A: Exact Identifier Search", + "regex_pattern": "Category B: Regex Pattern Search", + "semantic_intent": "Category C: Semantic Intent Search", + "cross_file": "Category D: Cross-File Pattern Discovery", + "edge_case": "Category E: Edge Cases", +} + + +def median_val(values: list[int | float]) -> float: + vals = sorted(values) + if not vals: + return 0.0 + n = len(vals) + if n % 2 == 1: + return float(vals[n // 2]) + return (vals[n // 2 - 1] + vals[n // 2]) / 2.0 + + +def min_val(values: list[int | float]) -> float: + return float(min(values)) if values else 0.0 + + +def max_val(values: list[int | float]) -> float: + return float(max(values)) if values else 0.0 + + +def is_valid_exit(exit_code: int) -> bool: + # 0 = matches/success, 1 = no matches, >=2 = execution error + return exit_code in (0, 1) + + +def normalize_rs_path(path: str) -> str: + p = path.strip(" \t\r\n:;,.()[]{}<>\"'") + p = p.replace("\\", "/") + if "/.../" in p: + p = p.split("/.../", 1)[1] + if "/src/" in p: + p = p.split("/src/", 1)[1] + elif p.startswith("src/"): + p = p[4:] + + p = p.lstrip("./") + p = re.sub(r"/{2,}", "/", p) + return p + + +def extract_filenames(text: str) -> set[str]: + filenames: 
set[str] = set() + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + continue + + # RTK grouped output: + # 📄 /path/to/file.rs (12): + # 📄 parser/mod.rs [9.4] + m = re.match(r"^📄\s+(.+?\.rs)\s*(?:\(|\[|$)", line) + if m: + candidate = normalize_rs_path(m.group(1)) + if candidate.endswith(".rs"): + filenames.add(candidate) + continue + + # grep -rn style: + # /abs/src/file.rs:42:... + # src/file.rs:42:... + m = re.match(r"^(.+?\.rs):\d+(?::|$)", line) + if m: + candidate = normalize_rs_path(m.group(1)) + if candidate.endswith(".rs"): + filenames.add(candidate) + + return filenames + + +def file_matches_gold(gold_file: str, found_files: set[str]) -> bool: + if gold_file in found_files: + return True + + if "/" not in gold_file: + suffix = f"/{gold_file}" + return any(f == gold_file or f.endswith(suffix) for f in found_files) + + return False + + +def compute_gold_hits(sample_text: str, gold_files: list[str]) -> int: + if not gold_files: + return 0 + found_files = extract_filenames(sample_text) + return sum(1 for gf in gold_files if file_matches_gold(gf, found_files)) + + +def infer_no_result_from_sample(sample_text: str, tool: str) -> bool: + text = sample_text.strip() + if not text: + return False + if tool in {"rtk_grep", "rtk_rgai"}: + # rtk no-results marker examples: + # "🔍 0 for 'query'" / "🧠 0 for 'query'" + if re.search(r"(?:🔍|🧠)\s*0\s+for\b", text): + return True + # Fallback in case glyphs differ. 
+ if re.search(r"^\s*0\s+for\b", text): + return True + return False + + +def compute_gold_hit_rate(sample_text: str, gold_files: list[str]) -> float | None: + if not gold_files: + return None + hits = compute_gold_hits(sample_text, gold_files) + return hits / len(gold_files) + + +def compute_gold_found_count(sample_text: str) -> int: + return len(extract_filenames(sample_text)) + + +def is_miss(result_count: int, expect_results: bool) -> bool: + return result_count == 0 and expect_results + + +def format_te(te: float | None, miss: bool) -> str: + if miss: + return "MISS" + if te is None: + return "N/A" + return f"{te:.3f}" + + +def format_pct(te: float | None, miss: bool) -> str: + if miss: + return "MISS" + if te is None: + return "N/A" + savings = (1 - te) * 100 + return f"{savings:.1f}%" + + +def format_gold(rate: float | None, found_count: int, min_required: int) -> str: + if rate is None: + if min_required > 0: + return f"N/A ({found_count}/{min_required})" + return "N/A" + if min_required > 0: + return f"{rate * 100:.0f}% ({found_count}/{min_required})" + return f"{rate * 100:.0f}%" + + +def format_timing(us: float) -> str: + if us >= 1_000_000: + return f"{us / 1_000_000:.2f}s" + if us >= 1_000: + return f"{us / 1_000:.1f}ms" + return f"{us:.0f}μs" + + +def format_timing_range(min_us: float, med_us: float, max_us: float) -> str: + return f"{format_timing(min_us)} / {format_timing(med_us)} / {format_timing(max_us)}" + + +def load_gold_standards() -> tuple[dict, dict]: + with open(GOLD_PATH, encoding="utf-8") as f: + data = json.load(f) + return data["queries"], data.get("metadata", {}) + + +def load_gold_auto() -> dict: # ADDED: load auto-generated gold + """Load auto-generated gold standards from grep output.""" + if not GOLD_AUTO_PATH.exists(): + return {} + with open(GOLD_AUTO_PATH, encoding="utf-8") as f: + data = json.load(f) + return data.get("queries", {}) + + +def load_csv() -> list[dict]: + rows = [] + with open(CSV_PATH, newline="", 
encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + if "output_tokens" not in row or row["output_tokens"] in (None, ""): + raise ValueError( + "results_raw.csv is missing 'output_tokens'. " + "Re-run benchmarks/bench_code.sh after installing tiktoken." + ) + row["time_us"] = int(row["time_us"]) + row["output_bytes"] = int(row["output_bytes"]) + row["output_tokens"] = int(row["output_tokens"]) + row["result_count"] = int(row["result_count"]) + row["exit_code"] = int(row["exit_code"]) + row["run"] = int(row["run"]) + rows.append(row) + return rows + + +def load_quality_sample(test_id: str, tool: str) -> str: + path = QUALITY_DIR / f"{test_id}_{tool}.txt" + if path.exists(): + return path.read_text(errors="replace") + return "" + + +def parse_commit_from_env(env_text: str) -> str | None: + m = re.search(r"^Commit:\s*([0-9a-f]{7,40})\s*$", env_text, flags=re.MULTILINE) + return m.group(1) if m else None + + +def compute_metrics(rows: list[dict], gold: dict, gold_auto: dict | None = None) -> list[dict]: # CHANGED: added gold_auto + grouped: dict[tuple[str, str], list[dict]] = defaultdict(list) + for row in rows: + grouped[(row["test_id"], row["tool"])].append(row) + + aggregates: dict[tuple[str, str], dict] = {} + for (tid, tool), runs in grouped.items(): + times = [r["time_us"] for r in runs] + bytess = [r["output_bytes"] for r in runs] + tokens = [r["output_tokens"] for r in runs] # ADDED: token counts + counts = [r["result_count"] for r in runs] + exits = [r["exit_code"] for r in runs] + aggregates[(tid, tool)] = { + "test_id": tid, + "tool": tool, + "category": runs[0]["category"], + "query": runs[0]["query"].strip('"'), + "median_time_us": median_val(times), + "min_time_us": min_val(times), + "max_time_us": max_val(times), + "median_bytes": median_val(bytess), + "median_tokens": median_val(tokens), # ADDED + "median_count": median_val(counts), + "valid": all(is_valid_exit(e) for e in exits), + } + + test_ids = sorted( + set(tid for tid, _ in 
aggregates.keys()), + key=lambda x: (x[0], int(x[1:]) if x[1:].isdigit() else 0), + ) + + results = [] + for tid in test_ids: + gold_entry = gold.get(tid, {}) + category = gold_entry.get("category", "unknown") + expect = gold_entry.get("expect_results", True) + gold_files = gold_entry.get("gold_files", []) + gold_min_files = int(gold_entry.get("gold_min_files", 0) or 0) + + grep_agg = aggregates.get((tid, "grep")) + grep_tokens = grep_agg["median_tokens"] if grep_agg else 0 + + entry = { + "test_id": tid, + "category": category, + "query": gold_entry.get("query", ""), + "expect_results": expect, + } + + for tool in TOOLS: + agg = aggregates.get((tid, tool)) + if not agg: + continue + + sample = load_quality_sample(tid, tool) + no_result_marker = infer_no_result_from_sample(sample, tool) + adjusted_count = 0 if no_result_marker else int(agg["median_count"]) + + unsupported = category == "regex_pattern" and tool == "rtk_rgai" + miss = is_miss(adjusted_count, expect) and not unsupported + unexpected_hit = (not expect) and adjusted_count > 0 + + ghr = compute_gold_hit_rate(sample, gold_files) if sample else None + found_count = compute_gold_found_count(sample) if sample else 0 + gold_hits = compute_gold_hits(sample, gold_files) if sample else 0 + gold_min_ok = None + if gold_min_files > 0: + gold_min_ok = found_count >= gold_min_files + low_coverage = False + if not miss and not unsupported: + if gold_min_files > 0 and gold_min_ok is False: + low_coverage = True + if ghr is not None and ghr == 0.0: + low_coverage = True + + te = None + if agg["valid"] and grep_tokens > 0: + te = agg["median_tokens"] / grep_tokens + + entry[f"{tool}_bytes"] = agg["median_bytes"] + entry[f"{tool}_tokens"] = agg["median_tokens"] + entry[f"{tool}_count"] = adjusted_count + entry[f"{tool}_time_us"] = agg["median_time_us"] + entry[f"{tool}_min_time_us"] = agg["min_time_us"] + entry[f"{tool}_max_time_us"] = agg["max_time_us"] + entry[f"{tool}_te"] = te + entry[f"{tool}_gold_hit"] = ghr + 
entry[f"{tool}_gold_found"] = found_count + entry[f"{tool}_gold_hits"] = gold_hits + entry[f"{tool}_gold_min_required"] = gold_min_files + entry[f"{tool}_gold_min_ok"] = gold_min_ok + entry[f"{tool}_low_coverage"] = low_coverage + entry[f"{tool}_valid"] = agg["valid"] + entry[f"{tool}_miss"] = miss + entry[f"{tool}_unsupported"] = unsupported + entry[f"{tool}_unexpected_hit"] = unexpected_hit + + results.append(entry) + + return results + + +def category_tool_stats(cat_metrics: list[dict], tool: str) -> dict: + entries = [m for m in cat_metrics if f"{tool}_bytes" in m and not m.get(f"{tool}_unsupported", False)] + + te_vals = [m[f"{tool}_te"] for m in entries if m.get(f"{tool}_te") is not None and not m.get(f"{tool}_miss", False)] + gold_vals = [m[f"{tool}_gold_hit"] for m in entries if m.get(f"{tool}_gold_hit") is not None] + time_vals = [m[f"{tool}_time_us"] for m in entries] + min_time_vals = [m[f"{tool}_min_time_us"] for m in entries] + max_time_vals = [m[f"{tool}_max_time_us"] for m in entries] + + miss_count = sum(1 for m in entries if m.get(f"{tool}_miss", False)) + unexpected_count = sum(1 for m in entries if m.get(f"{tool}_unexpected_hit", False)) + low_cov_count = sum( + 1 + for m in entries + if m.get(f"{tool}_low_coverage", False) + and not m.get(f"{tool}_miss", False) + ) + + return { + "entries": entries, + "te_vals": te_vals, + "gold_vals": gold_vals, + "time_vals": time_vals, + "min_time_vals": min_time_vals, + "max_time_vals": max_time_vals, + "miss_count": miss_count, + "unexpected_count": unexpected_count, + "low_cov_count": low_cov_count, + "unsupported_count": sum(1 for m in cat_metrics if m.get(f"{tool}_unsupported", False)), + } + + +def pick_best_for_exact(cat_metrics: list[dict]) -> tuple[str, str]: + candidates = [] + for tool in RECOMMENDABLE_TOOLS: + st = category_tool_stats(cat_metrics, tool) + if not st["entries"]: + continue + med_te = median_val(st["te_vals"]) if st["te_vals"] else 1e18 + med_gold = median_val(st["gold_vals"]) if 
st["gold_vals"] else -1.0 + med_time = median_val(st["time_vals"]) if st["time_vals"] else 1e18 + candidates.append( + (tool, st["miss_count"], st["low_cov_count"], med_gold, med_te, med_time) + ) + + if not candidates: + return "N/A", "Insufficient valid metrics" + + # For exact/cross-file tasks: correctness first, compression second. + candidates.sort(key=lambda x: (x[1], x[2], -x[3], x[4], x[5])) + tool, miss, low_cov, med_gold, med_te, _ = candidates[0] + gold_str = "N/A" if med_gold < 0 else f"{med_gold * 100:.0f}%" + te_str = "N/A" if med_te == 1e18 else f"{med_te:.3f}" + return tool, f"median gold hit={gold_str}, MISS={miss}, LOW_COVERAGE={low_cov}, median TE={te_str}" + + +def pick_best_for_semantic(cat_metrics: list[dict]) -> tuple[str, str]: + candidates = [] + for tool in RECOMMENDABLE_TOOLS: + st = category_tool_stats(cat_metrics, tool) + if not st["entries"]: + continue + med_gold = median_val(st["gold_vals"]) if st["gold_vals"] else -1.0 + med_te = median_val(st["te_vals"]) if st["te_vals"] else 1e18 + miss = st["miss_count"] + low_cov = st["low_cov_count"] + unexpected = st["unexpected_count"] + candidates.append((tool, miss, low_cov, unexpected, -med_gold, med_te, med_gold)) + + if not candidates: + return "N/A", "Insufficient valid metrics" + + # For semantic tasks: misses/coverage first, then relevance, then compression. 
+    candidates.sort(key=lambda x: (x[1], x[2], x[3], x[4], x[5]))
+    tool, miss, low_cov, unexpected, _, med_te, med_gold = candidates[0]
+    gold_str = "N/A" if med_gold < 0 else f"{med_gold * 100:.0f}%"
+    te_str = "N/A" if med_te == 1e18 else f"{med_te:.3f}"
+    return tool, f"median gold hit={gold_str}, MISS={miss}, LOW_COVERAGE={low_cov}, UNEXPECTED_HIT={unexpected}, median TE={te_str}"
+
+
+def generate_report(
+    metrics: list[dict],
+    env_text: str,
+    gold_queries: dict,
+    pinned_commit: str,
+    env_commit: str | None,
+) -> str:
+    lines: list[str] = []
+    w = lines.append
+
+    w("# Code Search Benchmark: grep vs rtk grep vs rtk rgai vs head_n\n")  # CHANGED: added head_n
+    w("## Environment & Reproduction\n")
+    w("```")
+    w(env_text.strip())
+    w("```\n")
+
+    w(f"## Dataset: rtk-ai/rtk @ `{pinned_commit}`\n")
+    if env_commit and env_commit != pinned_commit:
+        w(
+            f"> **WARNING**: benchmark env commit `{env_commit}` differs from pinned "
+            f"gold commit `{pinned_commit}`. Results are not strictly reproducible.\n"
+        )
+
+    w("**Reproduction**:")
+    w("```bash")
+    w("rtk --version")
+    w("bash benchmarks/bench_code.sh")
+    w("python3 benchmarks/analyze_code.py")
+    w("python3 -m unittest discover -s benchmarks/tests -p 'test_*.py'")
+    w("```\n")
+
+    w("## Methodology\n")
+    w("### Metrics (reported separately, NO composite score)\n")
+    w("| Metric | Definition | Purpose |")
+    w("|--------|-----------|---------|")
+    w("| Output bytes | `wc -c` of stdout | Raw size footprint |")
+    w("| Output tokens | `tiktoken` (`cl100k_base`) on full stdout | Model-aligned token cost |")
+    w("| Token Efficiency (TE) | `output_tokens / grep_output_tokens` | Token compression vs baseline |")
+    w("| Result count | Effective output lines / no-result aware count | Distinguish compactness vs empty results |")
+    w("| Gold hit rate | `% gold_files found` (plus found/min files) | Relevance/correctness |")
+    w("| Timing | Median of 5 runs, plus min/max in summaries | Performance distribution |")
+    w("")
+    w("**Critical rule**: if `expect_results=true` and `result_count==0`, mark as **MISS**.")
+    w("For regex category, `rtk rgai` is marked `EXPECTED_UNSUPPORTED` by design.\n")
+
+    w("### Categories\n")
+    w("| Category | Queries |")
+    w("|----------|---------|")
+    w("| A: Exact Identifier | 6 |")
+    w("| B: Regex Pattern | 6 |")
+    w("| C: Semantic Intent | 10 |")
+    w("| D: Cross-File Pattern Discovery | 5 |")
+    w("| E: Edge Cases | 3 |")
+    w("")
+
+    for cat_key in CATEGORY_ORDER:
+        cat_title = CATEGORY_TITLES[cat_key]
+        cat_metrics = [m for m in metrics if m["category"] == cat_key]
+        if not cat_metrics:
+            continue
+
+        w(f"## {cat_title}\n")
+
+        if cat_key == "regex_pattern":
+            w("> `rtk rgai` does not support regex; misses are EXPECTED_UNSUPPORTED.\n")
+        if cat_key == "semantic_intent":
+            w("> For multi-concept queries, grep exact-substring misses are expected and shown as MISS.\n")
+        if cat_key == "edge_case":
+            w("> Edge cases are discussed per-case; no category-level winner is inferred.\n")
+
+        w("| ID | Query | Tool | Bytes | Tokens | TE | Result Count | Gold Hit | Timing (med) | Status |")
+        w("|----|-------|------|-------|--------|----|-------------|----------|-------------|--------|")
+
+        for m in cat_metrics:
+            for tool in TOOLS:
+                if f"{tool}_bytes" not in m:
+                    continue
+
+                miss = m.get(f"{tool}_miss", False)
+                unsupported = m.get(f"{tool}_unsupported", False)
+                unexpected_hit = m.get(f"{tool}_unexpected_hit", False)
+                valid = m.get(f"{tool}_valid", False)
+                min_required = m.get(f"{tool}_gold_min_required", 0)
+                low_coverage = m.get(f"{tool}_low_coverage", False)
+
+                if unsupported:
+                    status = "EXPECTED_UNSUPPORTED"
+                elif not valid:
+                    status = "INVALID"
+                elif miss:
+                    status = "**MISS**"
+                elif unexpected_hit:
+                    status = "**UNEXPECTED_HIT**"
+                elif low_coverage:
+                    status = "LOW_COVERAGE"
+                else:
+                    status = "OK"
+
+                w(
+                    f"| {m['test_id']} | {m['query']} | {tool} | {m[f'{tool}_bytes']:.0f} | "
+                    f"{m.get(f'{tool}_tokens', 0):.0f} | "
+                    f"{format_te(m.get(f'{tool}_te'), miss)} | "
+                    f"{m.get(f'{tool}_count', 0):.0f} | "
+                    f"{format_gold(m.get(f'{tool}_gold_hit'), m.get(f'{tool}_gold_found', 0), min_required)} | "
+                    f"{format_timing(m.get(f'{tool}_time_us', 0.0))} | {status} |"
+                )
+        w("")
+
+        if cat_key != "edge_case":
+            w(f"### {cat_title} — Summary\n")
+            for tool in TOOLS:
+                st = category_tool_stats(cat_metrics, tool)
+                if st["unsupported_count"] == len(cat_metrics):
+                    w(f"- **{tool}**: expected unsupported for this category.")
+                    continue
+
+                parts = [f"**{tool}**:"]
+                if st["te_vals"]:
+                    parts.append(
+                        "TE min/med/max="
+                        f"{min_val(st['te_vals']):.3f}/"
+                        f"{median_val(st['te_vals']):.3f}/"
+                        f"{max_val(st['te_vals']):.3f}"
+                    )
+                if st["gold_vals"]:
+                    parts.append(
+                        "gold hit min/med/max="
+                        f"{min_val(st['gold_vals']) * 100:.0f}%/"
+                        f"{median_val(st['gold_vals']) * 100:.0f}%/"
+                        f"{max_val(st['gold_vals']) * 100:.0f}%"
+                    )
+                if st["time_vals"]:
+                    parts.append(
+                        "time min/med/max="
+                        + format_timing_range(
+                            min_val(st["min_time_vals"]),
+                            median_val(st["time_vals"]),
+                            max_val(st["max_time_vals"]),
+                        )
+                    )
+                if st["miss_count"] > 0:
+                    parts.append(f"MISS={st['miss_count']}")
+                if st["unexpected_count"] > 0:
+                    parts.append(f"UNEXPECTED_HIT={st['unexpected_count']}")
+                if st["low_cov_count"] > 0:
+                    parts.append(f"LOW_COVERAGE={st['low_cov_count']}")
+                w("- " + " | ".join(parts))
+            w("")
+
+    # Tool recommendation rows without cross-category averaging.
+    w("## Summary: When to Use Which Tool\n")
+    w("| Situation | Recommended | Evidence |")
+    w("|-----------|-------------|----------|")
+
+    cat_a = [m for m in metrics if m["category"] == "exact_identifier"]
+    cat_d = [m for m in metrics if m["category"] == "cross_file"]
+    cat_c = [m for m in metrics if m["category"] == "semantic_intent"]
+    cat_e = [m for m in metrics if m["category"] == "edge_case"]
+
+    best_a, ev_a = pick_best_for_exact(cat_a)
+    best_d, ev_d = pick_best_for_exact(cat_d)
+    best_c, ev_c = pick_best_for_semantic(cat_c)
+
+    w(f"| Exact identifier search (Category A) | {best_a} | {ev_a} |")
+    w(f"| Cross-file pattern discovery (Category D) | {best_d} | {ev_d} |")
+    w(f"| Semantic intent search (Category C) | {best_c} | {ev_c} |")
+    w("| Regex patterns (Category B) | grep / rtk grep | `rtk rgai` expected unsupported for regex |")
+
+    # Edge evidence: E3 should be zero results.
+    e3 = next((m for m in cat_e if m["test_id"] == "E3"), None)
+    if e3:
+        bad_tools = [t for t in TOOLS if e3.get(f"{t}_unexpected_hit", False)]
+        if bad_tools:
+            w(
+                "| Exact zero-result validation (E3) | grep / rtk grep | "
+                f"Unexpected hits observed for: {', '.join(bad_tools)} |"
+            )
+        else:
+            w("| Exact zero-result validation (E3) | all tools | All returned zero results as expected |")
+    w("")
+
+    w("## Failure Modes\n")
+    w("### grep")
+    w("- Floods output on broad/common queries.")
+    w("- Misses semantic intent queries that do not appear as exact substrings.")
+    w("- No built-in grouping/truncation.\n")
+    w("### rtk grep")
+    w("- Output truncation (`--max 200`) can reduce recall in high-frequency queries.")
+    w("- Still exact-match based (no semantic expansion).\n")
+    w("### rtk rgai")
+    w("- Regex queries are unsupported by design.")
+    w("- Can return semantically related content even when strict zero results are expected.")
+    w("- Quality depends on ranking/model behavior and may vary by environment.\n")
+    if "head_n" in TOOLS:
+        w("### head_n (negative control)")
+        w("- Naive truncation may look token-efficient but is relevance-blind.")
+        w("- Useful as a floor comparator, not as a production recommendation.\n")
+
+    w("## Limitations\n")
+    w("- Single codebase benchmark (`src/` Rust files only).")
+    w("- Gold standards are author-defined and include subjective intent mapping.")
+    w("- Gold hit is computed from first-run samples; non-deterministic tools may vary across runs.")
+    w("- Timing is hardware and background-load dependent.")
+    w("")
+
+    return "\n".join(lines)
+
+
+def main():
+    if not CSV_PATH.exists():
+        print(f"ERROR: {CSV_PATH} not found. Run bench_code.sh first.", file=sys.stderr)
+        sys.exit(1)
+    if not GOLD_PATH.exists():
+        print(f"ERROR: {GOLD_PATH} not found.", file=sys.stderr)
+        sys.exit(1)
+
+    gold_queries, gold_meta = load_gold_standards()
+    gold_auto = load_gold_auto()  # ADDED: auto-generated gold from grep output
+    rows = load_csv()
+    env_text = ENV_PATH.read_text() if ENV_PATH.exists() else ""
+    env_commit = parse_commit_from_env(env_text)
+    pinned_commit = gold_meta.get("pinned_commit", "unknown")
+
+    print(f"Loaded {len(rows)} measurements from {CSV_PATH}")
+    print(f"Loaded {len(gold_queries)} gold standards from {GOLD_PATH}")
+    if gold_auto:
+        print(f"Loaded {len(gold_auto)} auto-generated gold entries from {GOLD_AUTO_PATH}")  # ADDED
+
+    metrics = compute_metrics(rows, gold_queries, gold_auto)  # CHANGED: pass gold_auto
+    report = generate_report(metrics, env_text, gold_queries, pinned_commit, env_commit)
+    RESULTS_PATH.write_text(report, encoding="utf-8")
+
+    miss_count = 0
+    unexpected_count = 0
+    for m in metrics:
+        for tool in TOOLS:
+            if m.get(f"{tool}_miss", False):
+                miss_count += 1
+            if m.get(f"{tool}_unexpected_hit", False):
+                unexpected_count += 1
+
+    print(f"\nReport written to {RESULTS_PATH}")
+    print(f" {len(metrics)} queries analyzed")
+    print(f" MISS entries: {miss_count}")
+    print(f" UNEXPECTED_HIT entries: {unexpected_count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git
a/benchmarks/bench_code.sh b/benchmarks/bench_code.sh new file mode 100755 index 0000000..1df8949 --- /dev/null +++ b/benchmarks/bench_code.sh @@ -0,0 +1,367 @@ +#!/usr/bin/env bash +# +# Benchmark runner: grep vs rtk grep vs rtk rgai on rtk codebase +# +# Usage: +# bash benchmarks/bench_code.sh +# +# Output: +# benchmarks/results_raw.csv — raw measurements (30 queries × 4 tools × 5 runs) # CHANGED: 4 tools +# benchmarks/results_env.txt — environment snapshot +# benchmarks/quality_samples/ — first-run full output samples (no truncation) +# benchmarks/gold_auto.json — auto-generated gold files from grep output +# +set -euo pipefail +export LC_ALL=C + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +SRC_DIR="$PROJECT_DIR/src" +GOLD_PATH="$SCRIPT_DIR/gold_standards.json" +CSV_OUT="$SCRIPT_DIR/results_raw.csv" +ENV_OUT="$SCRIPT_DIR/results_env.txt" +QUALITY_DIR="$SCRIPT_DIR/quality_samples" +GOLD_AUTO="$SCRIPT_DIR/gold_auto.json" # ADDED: auto-generated gold + +RUNS=5 +HEAD_N_LINES="${HEAD_N_LINES:-100}" # ADDED: negative control truncation threshold +TOKENIZER_ENCODING="${TOKENIZER_ENCODING:-cl100k_base}" +RTK_BIN="${RTK_BIN:-$PROJECT_DIR/target/release/rtk}" +ALLOW_DIRTY="${ALLOW_DIRTY:-0}" +RTK_GREP_MAX="${RTK_GREP_MAX:-200}" +RGAI_MAX="${RGAI_MAX:-8}" + +# ── Pre-flight checks ──────────────────────────────────────────────────── # + +if [ ! -d "$SRC_DIR" ]; then + echo "ERROR: src/ directory not found at $SRC_DIR" >&2 + exit 1 +fi + +if [ ! -f "$GOLD_PATH" ]; then + echo "ERROR: gold_standards.json not found at $GOLD_PATH" >&2 + exit 1 +fi + +if [ ! -x "$RTK_BIN" ]; then + echo "ERROR: rtk binary not found or not executable at $RTK_BIN" >&2 + echo "Build local binary first: cargo build --release" >&2 + exit 1 +fi + +if ! python3 - "$TOKENIZER_ENCODING" <<'PY' +import sys +import tiktoken + +enc = sys.argv[1] +tiktoken.get_encoding(enc) +PY +then + echo "ERROR: Python package 'tiktoken' is required for token-based TE." 
>&2 + echo "Install it with: python3 -m pip install tiktoken" >&2 + exit 1 +fi + +# ── Pin commit for reproducibility ──────────────────────────────────────── # + +EXPECTED_COMMIT="$(python3 -c "import json;print(json.load(open('$GOLD_PATH', encoding='utf-8'))['metadata']['pinned_commit'])")" +PINNED_COMMIT="$(cd "$PROJECT_DIR" && git rev-parse HEAD)" +if [ "$PINNED_COMMIT" != "$EXPECTED_COMMIT" ]; then + echo "ERROR: Current HEAD ($PINNED_COMMIT) does not match pinned commit in gold_standards.json ($EXPECTED_COMMIT)." >&2 + echo "Checkout pinned commit first for reproducible results." >&2 + exit 2 +fi + +echo "Pinned commit: $PINNED_COMMIT" + +if [ "$ALLOW_DIRTY" != "1" ]; then + if ! (cd "$PROJECT_DIR" && git diff --quiet -- src Cargo.toml Cargo.lock && git diff --cached --quiet -- src Cargo.toml Cargo.lock); then + echo "ERROR: Working tree has local changes in benchmarked sources (src/, Cargo.toml, Cargo.lock)." >&2 + echo "Commit/stash changes for auditable reproducibility, or set ALLOW_DIRTY=1 to override." >&2 + exit 3 + fi + if [ -n "$(cd "$PROJECT_DIR" && git ls-files --others --exclude-standard -- src)" ]; then + echo "ERROR: Untracked files exist under src/; benchmark dataset is not clean." >&2 + echo "Commit/remove untracked source files, or set ALLOW_DIRTY=1 to override." 
>&2 + exit 3 + fi +fi + +# ── Environment snapshot ────────────────────────────────────────────────── # + +{ + echo "Date: $(date -u)" + echo "Commit: $PINNED_COMMIT" + echo "rtk_bin: $RTK_BIN" + echo "rtk: $($RTK_BIN --version 2>&1 || echo 'N/A')" + echo "grep: $(grep --version 2>&1 | head -1 || echo 'N/A')" + echo "tiktoken_encoding: $TOKENIZER_ENCODING" + echo "rtk_grep_max: $RTK_GREP_MAX" + echo "rgai_max: $RGAI_MAX" + echo "head_n_lines: $HEAD_N_LINES" # ADDED: negative control param + echo "OS: $(uname -a)" + if [[ "$(uname)" == "Darwin" ]]; then + echo "CPU: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'N/A')" + else + echo "CPU: $(lscpu 2>/dev/null | grep 'Model name' | sed 's/.*: *//' || echo 'N/A')" + fi + echo "Rust files: $(find "$SRC_DIR" -name '*.rs' | wc -l | tr -d ' ')" + echo "Total LOC: $(find "$SRC_DIR" -name '*.rs' -exec cat {} + | wc -l | tr -d ' ')" +} > "$ENV_OUT" + +echo "=== Environment ===" +cat "$ENV_OUT" +echo "" + +# ── Warmup (populate OS page cache) ────────────────────────────────────── # + +echo "Warming up filesystem cache..." +find "$SRC_DIR" -name '*.rs' -exec cat {} + > /dev/null 2>&1 +echo "" + +# ── Command runner helper ───────────────────────────────────────────────── # +# Prints: time_us,output_bytes,output_tokens,result_count,exit_code + +count_tokens_tiktoken() { + local out_file="$1" + python3 - "$out_file" "$TOKENIZER_ENCODING" <<'PY' +import sys +from pathlib import Path +import tiktoken + +path = Path(sys.argv[1]) +encoding_name = sys.argv[2] +enc = tiktoken.get_encoding(encoding_name) +text = path.read_text(encoding="utf-8", errors="replace") +print(len(enc.encode(text))) +PY +} + +run_command_capture() { + local out_file="$1" + shift + + local time_file elapsed_s time_us bytes tokens lines exit_code + time_file="$(mktemp)" + + TIMEFORMAT='%R' + set +e + { time "$@" > "$out_file" 2>/dev/null; } 2> "$time_file" + exit_code=$? 
+ set -e + + elapsed_s="$(tr -d ' \t\r\n' < "$time_file")" + rm -f "$time_file" + + if [[ "$elapsed_s" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + time_us="$(awk -v s="$elapsed_s" 'BEGIN { printf "%.0f", s * 1000000 }')" + else + time_us=0 + fi + + bytes=$(wc -c < "$out_file" | tr -d ' ') + tokens=$(count_tokens_tiktoken "$out_file") + lines=$(wc -l < "$out_file" | tr -d ' ') + echo "${time_us},${bytes},${tokens},${lines},${exit_code}" +} + +# ── Quality sample capture ──────────────────────────────────────────────── # + +mkdir -p "$QUALITY_DIR" +rm -f "$QUALITY_DIR"/*.txt + +# ── Test matrix ─────────────────────────────────────────────────────────── # +# Fields: test_id category query grep_flags + +declare -a TEST_IDS=() +declare -a TEST_CATEGORIES=() +declare -a TEST_QUERIES=() +declare -a TEST_GREP_FLAGS=() + +add_test() { + TEST_IDS+=("$1") + TEST_CATEGORIES+=("$2") + TEST_QUERIES+=("$3") + TEST_GREP_FLAGS+=("$4") +} + +# Category A: Exact Identifier +add_test "A1" "exact_identifier" "TimedExecution" "" +add_test "A2" "exact_identifier" "FilterLevel" "" +add_test "A3" "exact_identifier" "classify_command" "" +add_test "A4" "exact_identifier" "package_manager_exec" "" +add_test "A5" "exact_identifier" "strip_ansi" "" +add_test "A6" "exact_identifier" "HISTORY_DAYS" "" + +# Category B: Regex Pattern +add_test "B1" "regex_pattern" 'fn run\(.*verbose: u8' "-E" +add_test "B2" "regex_pattern" 'timer\.track\(' "-E" +add_test "B3" "regex_pattern" '\.unwrap_or\(1\)' "-E" +add_test "B4" "regex_pattern" '#\[cfg\(test\)\]' "-E" +add_test "B5" "regex_pattern" 'HashMap 0" "" +add_test "D2" "cross_file" "anyhow::Result" "" +add_test "D3" "cross_file" "process::exit" "" +add_test "D4" "cross_file" "Command::new" "" +add_test "D5" "cross_file" "from_utf8_lossy" "" + +# Category E: Edge Cases +add_test "E1" "edge_case" "the" "" +add_test "E2" "edge_case" "fn" "" +add_test "E3" "edge_case" "error handling retry backoff" "" + +# ── CSV header 
──────────────────────────────────────────────────────────── # + +echo "test_id,category,query,tool,run,time_us,output_bytes,output_tokens,result_count,exit_code" > "$CSV_OUT" + +# ── Run matrix ──────────────────────────────────────────────────────────── # + +NUM_TESTS=${#TEST_IDS[@]} +echo "Running $NUM_TESTS tests × 4 tools × $RUNS runs = $(( NUM_TESTS * 4 * RUNS )) measurements" # CHANGED: 4 tools +echo "" + +for idx in $(seq 0 $(( NUM_TESTS - 1 ))); do + tid="${TEST_IDS[$idx]}" + category="${TEST_CATEGORIES[$idx]}" + query="${TEST_QUERIES[$idx]}" + grep_flags="${TEST_GREP_FLAGS[$idx]}" + + echo "[$tid] query=\"$query\"" + + for run in $(seq 1 $RUNS); do + tmp_grep="$(mktemp)" + tmp_rtk_grep="$(mktemp)" + tmp_rtk_rgai="$(mktemp)" + tmp_head_n="$(mktemp)" # ADDED: negative control + + grep_flag_arr=() + if [ -n "$grep_flags" ]; then + read -r -a grep_flag_arr <<< "$grep_flags" + fi + + # grep + grep_cmd=(grep -rn "${grep_flag_arr[@]}" -- "$query" "$SRC_DIR") + IFS=',' read -r grep_time grep_bytes grep_tokens grep_lines grep_exit < <(run_command_capture "$tmp_grep" "${grep_cmd[@]}") + echo "$tid,$category,\"$query\",grep,$run,$grep_time,$grep_bytes,$grep_tokens,$grep_lines,$grep_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_grep" "$QUALITY_DIR/${tid}_grep.txt" 2>/dev/null || true + fi + + # rtk grep + rtk_grep_cmd=("$RTK_BIN" grep "$query" "$SRC_DIR" --max "$RTK_GREP_MAX") + IFS=',' read -r rtk_grep_time rtk_grep_bytes rtk_grep_tokens rtk_grep_lines rtk_grep_exit < <(run_command_capture "$tmp_rtk_grep" "${rtk_grep_cmd[@]}") + echo "$tid,$category,\"$query\",rtk_grep,$run,$rtk_grep_time,$rtk_grep_bytes,$rtk_grep_tokens,$rtk_grep_lines,$rtk_grep_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_rtk_grep" "$QUALITY_DIR/${tid}_rtk_grep.txt" 2>/dev/null || true + fi + + # rtk rgai + rtk_rgai_cmd=("$RTK_BIN" rgai --path "$SRC_DIR" --max "$RGAI_MAX" -- "$query") + IFS=',' read -r rtk_rgai_time rtk_rgai_bytes rtk_rgai_tokens rtk_rgai_lines 
rtk_rgai_exit < <(run_command_capture "$tmp_rtk_rgai" "${rtk_rgai_cmd[@]}") + echo "$tid,$category,\"$query\",rtk_rgai,$run,$rtk_rgai_time,$rtk_rgai_bytes,$rtk_rgai_tokens,$rtk_rgai_lines,$rtk_rgai_exit" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_rtk_rgai" "$QUALITY_DIR/${tid}_rtk_rgai.txt" 2>/dev/null || true + fi + + # head_n (NEGATIVE CONTROL) ────────────────────────────────── # ADDED: entire section + # Naive truncation baseline: just take first N lines of grep output + head -n "$HEAD_N_LINES" "$tmp_grep" > "$tmp_head_n" 2>/dev/null || true + head_n_tokens=$(count_tokens_tiktoken "$tmp_head_n") + head_n_bytes=$(wc -c < "$tmp_head_n" | tr -d ' ') + head_n_lines=$(wc -l < "$tmp_head_n" | tr -d ' ') + # Timing is negligible for head, use 0 + echo "$tid,$category,\"$query\",head_n,$run,0,$head_n_bytes,$head_n_tokens,$head_n_lines,0" >> "$CSV_OUT" + + if [ "$run" -eq 1 ]; then + cp "$tmp_head_n" "$QUALITY_DIR/${tid}_head_n.txt" 2>/dev/null || true + fi + + rm -f "$tmp_grep" "$tmp_rtk_grep" "$tmp_rtk_rgai" "$tmp_head_n" # CHANGED: added tmp_head_n + done + echo " done ($RUNS runs)" +done + +echo "" +echo "=== Generating Auto Gold Standards ===" # ADDED: entire section + +# Generate gold_auto.json from grep output (automatic verification) +python3 - "$QUALITY_DIR" "$GOLD_AUTO" "$PINNED_COMMIT" << 'PYEOF' +import json +import re +import sys +from pathlib import Path + +quality_dir = Path(sys.argv[1]) +output_path = Path(sys.argv[2]) +pinned_commit = sys.argv[3] + +def extract_rs_files(text: str) -> list[str]: + """Extract unique .rs filenames from grep output.""" + files = set() + for match in re.finditer(r"([A-Za-z0-9_./-]+\.rs)", text): + path = match.group(1) + # Normalize: strip src/ prefix, keep nested paths + if "/src/" in path: + path = path.split("/src/", 1)[1] + elif path.startswith("src/"): + path = path[4:] + path = path.lstrip("./") + if path.endswith(".rs"): + files.add(path) + return sorted(files) + +gold_auto = { + "metadata": { + 
"description": "Auto-generated gold standards from grep output", + "pinned_commit": pinned_commit, + "generated": "auto", + "notes": "Gold files extracted automatically from grep results - no manual curation" + }, + "queries": {} +} + +# Process each grep sample +for grep_file in sorted(quality_dir.glob("*_grep.txt")): + tid = grep_file.stem.replace("_grep", "") + text = grep_file.read_text(errors="replace") + gold_files = extract_rs_files(text) + + gold_auto["queries"][tid] = { + "gold_files_auto": gold_files, + "gold_file_count": len(gold_files), + "grep_lines": len(text.splitlines()), + "grep_bytes": len(text.encode("utf-8")) + } + +output_path.write_text(json.dumps(gold_auto, indent=2), encoding="utf-8") +print(f"Generated {output_path} with {len(gold_auto['queries'])} queries") +PYEOF + +echo "" +echo "=== Benchmark Complete ===" +echo "Raw results: $CSV_OUT" +echo "Quality samples: $QUALITY_DIR/" +echo "Auto gold: $GOLD_AUTO" # ADDED +echo "Environment: $ENV_OUT" +echo "" +echo "Total measurements: $(( $(wc -l < "$CSV_OUT") - 1 ))" +echo "" +echo "Next step: python3 benchmarks/analyze_code.py" diff --git a/benchmarks/gold_standards.json b/benchmarks/gold_standards.json new file mode 100644 index 0000000..d74610d --- /dev/null +++ b/benchmarks/gold_standards.json @@ -0,0 +1,285 @@ +{ + "metadata": { + "description": "Gold standards for code-search benchmark on rtk codebase", + "pinned_commit": "4b0a413562c775757d5bc09a6ff966b4e532508c", + "codebase": "rtk-ai/rtk", + "generated": "2026-02-15", + "notes": "Gold files verified by grep on pinned commit. gold_min_files = minimum unique .rs files expected in output." 
+ }, + "queries": { + "A1": { + "query": "TimedExecution", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["tracking.rs", "main.rs", "cargo_cmd.rs", "git.rs", "container.rs", "grep_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "gh_cmd.rs", "ls.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "Struct def + all usages across 34 command modules" + }, + "A2": { + "query": "FilterLevel", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["filter.rs", "main.rs", "read.rs"], + "gold_min_files": 3, + "expect_results": true, + "notes": "Enum def + variants in 3 files" + }, + "A3": { + "query": "classify_command", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["discover/registry.rs", "discover/mod.rs"], + "gold_min_files": 2, + "expect_results": true, + "notes": "Function def + caller in discover module" + }, + "A4": { + "query": "package_manager_exec", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["utils.rs", "vitest_cmd.rs", "playwright_cmd.rs", "prettier_cmd.rs", "lint_cmd.rs"], + "gold_min_files": 5, + "expect_results": true, + "notes": "Function def in utils.rs + callers in JS tooling modules" + }, + "A5": { + "query": "strip_ansi", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["utils.rs", "vitest_cmd.rs", "playwright_cmd.rs", "next_cmd.rs", "cargo_cmd.rs"], + "gold_min_files": 5, + "expect_results": true, + "notes": "Function def in utils.rs + callers" + }, + "A6": { + "query": "HISTORY_DAYS", + "category": "exact_identifier", + "grep_flags": "", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": true, + "notes": "Const def + usage in cleanup logic, single file" + }, + + "B1": { + "query": "fn run\\(.*verbose: u8", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["git.rs", "cargo_cmd.rs", "npm_cmd.rs", "container.rs", "ls.rs", "tsc_cmd.rs", "vitest_cmd.rs", "pnpm_cmd.rs"], + "gold_min_files": 25, + 
"expect_results": true, + "notes": "All run() signatures with verbose param on same line; 27 files match. NB: go_cmd/grep_cmd/runner use run_test()/run_err() so don't match this regex." + }, + "B2": { + "query": "timer\\.track\\(", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "gh_cmd.rs", "container.rs", "curl_cmd.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "All tracking calls; 34 files match (same scope as TimedExecution users)" + }, + "B3": { + "query": "\\.unwrap_or\\(1\\)", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "container.rs", "gh_cmd.rs", "tree.rs", "npm_cmd.rs", "gain.rs"], + "gold_min_files": 15, + "expect_results": true, + "notes": "Exit code fallback pattern .code().unwrap_or(1); 20 files match" + }, + "B4": { + "query": "#\\[cfg\\(test\\)\\]", + "category": "regex_pattern", + "grep_flags": "-E", + "gold_files": ["filter.rs", "utils.rs", "discover/registry.rs", "vitest_cmd.rs", "next_cmd.rs", "playwright_cmd.rs", "runner.rs", "tsc_cmd.rs", "git.rs", "cargo_cmd.rs"], + "gold_min_files": 35, + "expect_results": true, + "notes": "Test module declarations; 41 files have #[cfg(test)] (nearly all modules)" + }, + "B5": { + "query": "HashMap 0", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "go_cmd.rs", "container.rs", "grep_cmd.rs", "pnpm_cmd.rs"], + "gold_min_files": 30, + "expect_results": true, + "notes": "All verbose debug logging points; 90 occurrences across 37 files (includes parser/README.md)" + }, + "D2": { + "query": "anyhow::Result", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["tracking.rs", "config.rs", "discover/mod.rs", "log_cmd.rs", "diff_cmd.rs", "deps.rs", "find_cmd.rs", "env_cmd.rs", "local_llm.rs", "learn/mod.rs", "learn/report.rs"], + "gold_min_files": 11, + "expect_results": 
true, + "notes": "Literal 'anyhow::Result' only in 11 files with standalone import. Most files use 'use anyhow::{Context, Result}' which doesn't contain the literal substring." + }, + "D3": { + "query": "process::exit", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "container.rs", "gh_cmd.rs"], + "gold_min_files": 15, + "expect_results": true, + "notes": "All exit points across 19 files" + }, + "D4": { + "query": "Command::new", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "utils.rs", "go_cmd.rs", "grep_cmd.rs"], + "gold_min_files": 20, + "expect_results": true, + "notes": "All subprocess spawns across 25 files" + }, + "D5": { + "query": "from_utf8_lossy", + "category": "cross_file", + "grep_flags": "", + "gold_files": ["main.rs", "git.rs", "cargo_cmd.rs", "npm_cmd.rs", "utils.rs", "container.rs", "grep_cmd.rs"], + "gold_min_files": 25, + "expect_results": true, + "notes": "All lossy UTF-8 conversions across 29 files" + }, + + "E1": { + "query": "the", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": true, + "notes": "Stop word: grep/rtk grep flood output; rgai should filter or return minimal" + }, + "E2": { + "query": "fn", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": true, + "notes": "Ultra-common 2-char token: every .rs file matches; tests truncation limits" + }, + "E3": { + "query": "error handling retry backoff", + "category": "edge_case", + "grep_flags": "", + "gold_files": [], + "gold_min_files": 0, + "expect_results": false, + "notes": "Zero results expected from all tools — nothing in codebase matches this phrase" + } + } +} diff --git a/benchmarks/tests/__init__.py b/benchmarks/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/tests/test_analyze_code.py 
b/benchmarks/tests/test_analyze_code.py new file mode 100644 index 0000000..d769e7b --- /dev/null +++ b/benchmarks/tests/test_analyze_code.py @@ -0,0 +1,466 @@ +""" +Tests for analyze_code.py benchmark analyzer. + +Covers: + - median_val computation + - is_valid_exit semantics + - extract_filenames from various output formats + - compute_gold_hit_rate accuracy + - is_miss detection (critical rule) + - format_te / format_pct output + - compute_metrics end-to-end with mock data +""" + +from __future__ import annotations + +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +import sys + +# Add parent dir to path so we can import analyze_code +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import analyze_code as ac + + +class TestMedianVal(unittest.TestCase): + def test_odd_count(self): + self.assertEqual(ac.median_val([3, 1, 2]), 2.0) + + def test_even_count(self): + self.assertEqual(ac.median_val([1, 2, 3, 4]), 2.5) + + def test_single(self): + self.assertEqual(ac.median_val([42]), 42.0) + + def test_empty(self): + self.assertEqual(ac.median_val([]), 0.0) + + def test_already_sorted(self): + self.assertEqual(ac.median_val([10, 20, 30, 40, 50]), 30.0) + + def test_duplicates(self): + self.assertEqual(ac.median_val([5, 5, 5, 5, 5]), 5.0) + + +class TestIsValidExit(unittest.TestCase): + def test_exit_0_valid(self): + self.assertTrue(ac.is_valid_exit(0)) + + def test_exit_1_valid(self): + """exit=1 means 'no matches' for grep — still valid.""" + self.assertTrue(ac.is_valid_exit(1)) + + def test_exit_2_invalid(self): + """exit>=2 means execution error.""" + self.assertFalse(ac.is_valid_exit(2)) + + def test_exit_127_invalid(self): + self.assertFalse(ac.is_valid_exit(127)) + + +class TestExtractFilenames(unittest.TestCase): + def test_grep_output(self): + """Standard grep -rn output format.""" + text = ( + "src/tracking.rs:42:pub struct TimedExecution {\n" + "src/main.rs:100: let timer = 
TimedExecution::new();\n" + "src/discover/registry.rs:77:const RULES: &[RtkRule] = &[\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("main.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_rtk_file_headers(self): + """RTK grouped output headers.""" + text = ( + "📄 /Users/andrew/Programming/rtk/src/tracking.rs (1):\n" + "📄 /.../discover/registry.rs [9.1]\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_no_filenames(self): + """Text with no .rs files.""" + text = "no rust files here\njust plain text" + filenames = ac.extract_filenames(text) + self.assertEqual(len(filenames), 0) + + def test_nested_path(self): + """Nested directory paths.""" + text = "src/parser/mod.rs:1:pub mod types;" + filenames = ac.extract_filenames(text) + self.assertIn("parser/mod.rs", filenames) + + def test_absolute_path_normalization(self): + text = ( + "/Users/andrew/Programming/rtk/src/tracking.rs:42:code\n" + "/Users/andrew/Programming/rtk/src/discover/registry.rs:77:code\n" + ) + filenames = ac.extract_filenames(text) + self.assertIn("tracking.rs", filenames) + self.assertIn("discover/registry.rs", filenames) + + def test_deduplication(self): + """Same file appearing multiple times.""" + text = ( + "src/git.rs:10:code\n" + "src/git.rs:20:more code\n" + "src/git.rs:30:even more\n" + ) + filenames = ac.extract_filenames(text) + self.assertEqual(filenames.count("git.rs") if isinstance(filenames, list) else 1, 1) + self.assertIn("git.rs", filenames) + + def test_does_not_parse_rs_inside_code_snippet(self): + text = ' 565: classify_command("cat src/main.rs"),' + filenames = ac.extract_filenames(text) + self.assertEqual(len(filenames), 0) + + +class TestComputeGoldHitRate(unittest.TestCase): + def test_all_found(self): + sample = "src/tracking.rs:1:code\nsrc/main.rs:2:code\n" + gold = ["tracking.rs", 
"main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 1.0) + + def test_partial_found(self): + sample = "src/tracking.rs:1:code\nsrc/utils.rs:2:code\n" + gold = ["tracking.rs", "main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 0.5) + + def test_none_found(self): + sample = "src/utils.rs:1:code\n" + gold = ["tracking.rs", "main.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 0.0) + + def test_empty_gold_files(self): + """No gold files => N/A for hit-rate.""" + sample = "src/anything.rs:1:code\n" + self.assertIsNone(ac.compute_gold_hit_rate(sample, [])) + + def test_empty_sample(self): + gold = ["tracking.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate("", gold), 0.0) + + def test_nested_gold_files(self): + sample = "src/discover/registry.rs:77:const RULES\n" + gold = ["discover/registry.rs"] + self.assertAlmostEqual(ac.compute_gold_hit_rate(sample, gold), 1.0) + + +class TestIsMiss(unittest.TestCase): + def test_zero_results_expects_results(self): + """0 results when gold expects results → MISS.""" + self.assertTrue(ac.is_miss(0, True)) + + def test_zero_results_expects_nothing(self): + """0 results when gold expects nothing → NOT miss.""" + self.assertFalse(ac.is_miss(0, False)) + + def test_has_results_expects_results(self): + """Has results when expected → NOT miss.""" + self.assertFalse(ac.is_miss(42, True)) + + def test_has_results_expects_nothing(self): + """Has results when nothing expected → NOT miss (unexpected but not MISS).""" + self.assertFalse(ac.is_miss(5, False)) + + +class TestFormatFunctions(unittest.TestCase): + def test_format_te_normal(self): + self.assertEqual(ac.format_te(0.123, False), "0.123") + + def test_format_te_miss(self): + self.assertEqual(ac.format_te(0.5, True), "MISS") + + def test_format_te_none(self): + self.assertEqual(ac.format_te(None, False), "N/A") + + def test_format_pct_savings(self): + self.assertEqual(ac.format_pct(0.3, False), "70.0%") + + def 
test_format_pct_miss(self): + self.assertEqual(ac.format_pct(0.3, True), "MISS") + + def test_format_pct_expansion(self): + """TE > 1.0 means output is larger than grep baseline.""" + self.assertEqual(ac.format_pct(1.5, False), "-50.0%") + + def test_format_gold_full(self): + self.assertEqual(ac.format_gold(1.0, 10, 5), "100% (10/5)") + + def test_format_gold_partial(self): + self.assertEqual(ac.format_gold(0.6, 3, 8), "60% (3/8)") + + def test_format_gold_none(self): + self.assertEqual(ac.format_gold(None, 0, 0), "N/A") + + def test_format_gold_none_with_min(self): + self.assertEqual(ac.format_gold(None, 2, 10), "N/A (2/10)") + + def test_format_timing_microseconds(self): + self.assertEqual(ac.format_timing(500), "500μs") + + def test_format_timing_milliseconds(self): + self.assertEqual(ac.format_timing(5000), "5.0ms") + + def test_format_timing_seconds(self): + self.assertEqual(ac.format_timing(2_500_000), "2.50s") + + +class TestComputeMetrics(unittest.TestCase): + """End-to-end test of compute_metrics with synthetic data.""" + + def _make_rows(self, tid, tool, runs=5, time=1000, output_bytes=500, + output_tokens=100, result_count=10, exit_code=0): + """Helper to generate mock CSV rows.""" + return [ + { + "test_id": tid, + "category": "exact_identifier", + "query": f'"{tid} query"', + "tool": tool, + "run": i + 1, + "time_us": time + i * 10, + "output_bytes": output_bytes, + "output_tokens": output_tokens, + "result_count": result_count, + "exit_code": exit_code, + } + for i in range(runs) + ] + + def test_basic_metrics(self): + """Verify TE computation for a simple case.""" + gold = { + "T1": { + "query": "test", + "category": "exact_identifier", + "gold_files": [], + "gold_min_files": 0, + "expect_results": True, + } + } + rows = ( + self._make_rows("T1", "grep", output_bytes=1000, output_tokens=1000, result_count=50) + + self._make_rows("T1", "rtk_grep", output_bytes=300, output_tokens=300, result_count=20) + + self._make_rows("T1", "rtk_rgai", 
output_bytes=200, output_tokens=200, result_count=10) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + self.assertEqual(len(metrics), 1) + m = metrics[0] + # TE = rtk_grep_tokens / grep_tokens = 300/1000 = 0.3 + self.assertAlmostEqual(m["rtk_grep_te"], 0.3) + # TE = rtk_rgai_tokens / grep_tokens = 200/1000 = 0.2 + self.assertAlmostEqual(m["rtk_rgai_te"], 0.2) + + def test_miss_detection(self): + """0 result count with expect_results=True → MISS.""" + gold = { + "T2": { + "query": "test", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T2", "grep", output_bytes=0, result_count=0) + + self._make_rows("T2", "rtk_grep", output_bytes=0, result_count=0) + + self._make_rows("T2", "rtk_rgai", output_bytes=500, result_count=5) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertTrue(m["grep_miss"]) + self.assertTrue(m["rtk_grep_miss"]) + self.assertFalse(m["rtk_rgai_miss"]) + + def test_miss_detection_with_rtk_zero_marker(self): + """rtk '0 for' marker should force effective result_count=0.""" + gold = { + "T2B": { + "query": "semantic query", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T2B", "grep", output_bytes=0, result_count=0) + + self._make_rows("T2B", "rtk_grep", output_bytes=42, result_count=1) + + self._make_rows("T2B", "rtk_rgai", output_bytes=2400, result_count=80) + ) + + def fake_sample(tid, tool): + if tid == "T2B" and tool == "rtk_grep": + return "🔍 0 for 'semantic query'\n" + return "" + + with patch.object(ac, "load_quality_sample", side_effect=fake_sample): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertEqual(m["rtk_grep_count"], 0) + 
self.assertTrue(m["rtk_grep_miss"]) + + def test_no_miss_when_not_expected(self): + """0 results with expect_results=False → NOT miss.""" + gold = { + "T3": { + "query": "nonexistent", + "category": "edge_case", + "gold_files": [], + "gold_min_files": 0, + "expect_results": False, + } + } + rows = ( + self._make_rows("T3", "grep", output_bytes=0, result_count=0) + + self._make_rows("T3", "rtk_grep", output_bytes=0, result_count=0) + + self._make_rows("T3", "rtk_rgai", output_bytes=0, result_count=0) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertFalse(m["grep_miss"]) + self.assertFalse(m["rtk_grep_miss"]) + self.assertFalse(m["rtk_rgai_miss"]) + + def test_grep_baseline_zero_te_none(self): + """When grep baseline is 0 bytes, TE should be None.""" + gold = { + "T4": { + "query": "rare", + "category": "exact_identifier", + "gold_files": [], + "gold_min_files": 0, + "expect_results": False, + } + } + rows = ( + self._make_rows("T4", "grep", output_bytes=0, output_tokens=0, result_count=0) + + self._make_rows("T4", "rtk_grep", output_bytes=100, result_count=2) + + self._make_rows("T4", "rtk_rgai", output_bytes=50, result_count=1) + ) + + with patch.object(ac, "load_quality_sample", return_value=""): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertIsNone(m["rtk_grep_te"]) + self.assertIsNone(m["rtk_rgai_te"]) + + def test_zero_gold_hit_marks_low_coverage(self): + gold = { + "T5": { + "query": "semantic", + "category": "semantic_intent", + "gold_files": ["tracking.rs"], + "gold_min_files": 1, + "expect_results": True, + } + } + rows = ( + self._make_rows("T5", "grep", output_tokens=100, result_count=10) + + self._make_rows("T5", "rtk_rgai", output_tokens=20, result_count=5) + ) + + def fake_sample(tid, tool): + if tool == "grep": + return "src/tracking.rs:1:code\n" + if tool == "rtk_rgai": + return "📄 src/utils.rs [10.0]\n" + return "" + + with 
patch.object(ac, "load_quality_sample", side_effect=fake_sample): + metrics = ac.compute_metrics(rows, gold) + + m = metrics[0] + self.assertTrue(m["rtk_rgai_low_coverage"]) + + +class TestGoldStandardsIntegrity(unittest.TestCase): + """Verify gold_standards.json is well-formed.""" + + def setUp(self): + gold_path = Path(__file__).resolve().parent.parent / "gold_standards.json" + with open(gold_path, encoding="utf-8") as f: + self.data = json.load(f) + self.queries = self.data["queries"] + + def test_has_metadata(self): + self.assertIn("metadata", self.data) + self.assertIn("pinned_commit", self.data["metadata"]) + + def test_query_count(self): + """Should have exactly 30 queries.""" + self.assertEqual(len(self.queries), 30) + + def test_category_distribution(self): + """A=6, B=6, C=10, D=5, E=3.""" + cats = [q["category"] for q in self.queries.values()] + self.assertEqual(cats.count("exact_identifier"), 6) + self.assertEqual(cats.count("regex_pattern"), 6) + self.assertEqual(cats.count("semantic_intent"), 10) + self.assertEqual(cats.count("cross_file"), 5) + self.assertEqual(cats.count("edge_case"), 3) + + def test_required_fields(self): + """Every query has required fields.""" + required = {"query", "category", "grep_flags", "gold_files", + "gold_min_files", "expect_results", "notes"} + for tid, q in self.queries.items(): + for field in required: + self.assertIn( + field, q, + f"Query {tid} missing field '{field}'" + ) + + def test_id_prefix_matches_category(self): + """A* → exact_identifier, B* → regex_pattern, etc.""" + prefix_map = { + "A": "exact_identifier", + "B": "regex_pattern", + "C": "semantic_intent", + "D": "cross_file", + "E": "edge_case", + } + for tid, q in self.queries.items(): + expected_cat = prefix_map.get(tid[0]) + self.assertEqual( + q["category"], expected_cat, + f"Query {tid} has category '{q['category']}' " + f"but expected '{expected_cat}'" + ) + + def test_e3_expects_no_results(self): + """E3 (nonexistent phrase) should expect no 
results.""" + self.assertFalse(self.queries["E3"]["expect_results"]) + + def test_gold_files_are_lists(self): + for tid, q in self.queries.items(): + self.assertIsInstance( + q["gold_files"], list, + f"Query {tid} gold_files is not a list" + ) + + +if __name__ == "__main__": + unittest.main()