Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dataset/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ Pre-defined task subsets are available in `subsets/` for quick evaluation:

| Subset | Tasks | Pairs | Repos | Description |
|--------|-------|-------|-------|-------------|
| `spark` | 13 | 20 | 10 | Smoke-test subset (sampled from flash) |
| `flash` | 20 | 50 | 11 | Dev subset for rapid iteration (sampled from lite) |
| `lite` | 26 | 100 | 12 | Quick evaluation subset |

All subsets are generated via **uniform pair-level sampling** and form a strict hierarchy: `lite` (100 pairs) → `flash` (50 pairs) → `spark` (20 pairs):

```python
random.seed(42) # fixed seed for reproducibility
Expand Down
169 changes: 169 additions & 0 deletions dataset/subsets/spark.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"name": "spark",
"description": "20-pair dev subset for smoke testing (uniform pair-level sampling from flash, seed=42)",
"stats": {
"tasks": 13,
"pairs": 20,
"repos": 10
},
"tasks": [
{
"repo": "dottxt_ai_outlines_task",
"task_id": 1655,
"pairs": [
[
7,
10
]
]
},
{
"repo": "dottxt_ai_outlines_task",
"task_id": 1706,
"pairs": [
[
4,
6
]
]
},
{
"repo": "dspy_task",
"task_id": 8394,
"pairs": [
[
3,
5
]
]
},
{
"repo": "go_chi_task",
"task_id": 26,
"pairs": [
[
1,
2
]
]
},
{
"repo": "go_chi_task",
"task_id": 56,
"pairs": [
[
1,
5
]
]
},
{
"repo": "huggingface_datasets_task",
"task_id": 3997,
"pairs": [
[
2,
4
]
]
},
{
"repo": "openai_tiktoken_task",
"task_id": 0,
"pairs": [
[
3,
6
],
[
6,
8
]
]
},
{
"repo": "pallets_click_task",
"task_id": 2068,
"pairs": [
[
1,
6
],
[
5,
7
]
]
},
{
"repo": "pallets_jinja_task",
"task_id": 1465,
"pairs": [
[
2,
6
]
]
},
{
"repo": "pallets_jinja_task",
"task_id": 1621,
"pairs": [
[
1,
6
],
[
3,
5
],
[
4,
10
],
[
6,
7
]
]
},
{
"repo": "pillow_task",
"task_id": 68,
"pairs": [
[
1,
5
]
]
},
{
"repo": "samuelcolvin_dirty_equals_task",
"task_id": 43,
"pairs": [
[
2,
3
]
]
},
{
"repo": "typst_task",
"task_id": 6554,
"pairs": [
[
1,
9
],
[
4,
9
],
[
7,
8
]
]
}
]
}
20 changes: 20 additions & 0 deletions tests/runner/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,26 @@ def test_flash_benchmark_split_is_stable(self):
lite_subset = load_subset("lite")
assert subset["tasks"].issubset(lite_subset["tasks"]), "Flash tasks should be subset of lite"

def test_spark_benchmark_split_is_stable(self):
    """The spark split is a frozen evaluation set; guard against accidental regeneration."""
    spark = load_subset("spark")

    # Structural sanity: both top-level keys must be present.
    for key in ("tasks", "pairs"):
        assert key in spark, f"Subset should have {key!r} key"

    # Frozen stats: 13 tasks, each with an explicit pair list (sampled from flash).
    assert len(spark["tasks"]) == 13, "Spark should have 13 tasks"
    assert len(spark["pairs"]) == 13, "All 13 tasks should have specific pairs"

    # Expanding the subset must yield exactly the 20 frozen feature pairs.
    discovered = discover_tasks(subset="spark")
    assert len(discovered) == 20, "Spark subset should generate exactly 20 feature pairs"

    # Hierarchy invariant: every spark task also appears in flash.
    flash = load_subset("flash")
    assert spark["tasks"].issubset(flash["tasks"]), "Spark tasks should be subset of flash"

def test_load_subset_returns_dict_with_pairs(self):
"""Test that load_subset returns dict with tasks and pairs."""
subset = load_subset("lite")
Expand Down
Loading