From f4b610b195052a8a7594d7621bc42e4b4baa96c1 Mon Sep 17 00:00:00 2001 From: Priyank-Shethia3 Date: Mon, 16 Feb 2026 12:58:12 -0800 Subject: [PATCH] create spark subset of 20 pairs from flash --- dataset/README.md | 3 +- dataset/subsets/spark.json | 169 +++++++++++++++++++++++++++++++++++++ tests/runner/test_tasks.py | 20 +++++ 3 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 dataset/subsets/spark.json diff --git a/dataset/README.md b/dataset/README.md index 8d1d421..4ab84b0 100644 --- a/dataset/README.md +++ b/dataset/README.md @@ -65,10 +65,11 @@ Pre-defined task subsets are available in `subsets/` for quick evaluation: | Subset | Tasks | Pairs | Repos | Description | |--------|-------|-------|-------|-------------| +| `spark` | 13 | 20 | 10 | Smoke-test subset (sampled from flash) | | `flash` | 20 | 50 | 11 | Dev subset for rapid iteration (sampled from lite) | | `lite` | 26 | 100 | 12 | Quick evaluation subset | -Both subsets are generated via **uniform pair-level sampling**: +All subsets are generated via **uniform pair-level sampling** and form a strict hierarchy: `lite` (100 pairs) → `flash` (50 pairs) → `spark` (20 pairs). 
```python random.seed(42) # fixed seed for reproducibility diff --git a/dataset/subsets/spark.json b/dataset/subsets/spark.json new file mode 100644 index 0000000..f13bea4 --- /dev/null +++ b/dataset/subsets/spark.json @@ -0,0 +1,169 @@ +{ + "name": "spark", + "description": "20-pair dev subset for smoke testing (uniform pair-level sampling from flash, seed=42)", + "stats": { + "tasks": 13, + "pairs": 20, + "repos": 10 + }, + "tasks": [ + { + "repo": "dottxt_ai_outlines_task", + "task_id": 1655, + "pairs": [ + [ + 7, + 10 + ] + ] + }, + { + "repo": "dottxt_ai_outlines_task", + "task_id": 1706, + "pairs": [ + [ + 4, + 6 + ] + ] + }, + { + "repo": "dspy_task", + "task_id": 8394, + "pairs": [ + [ + 3, + 5 + ] + ] + }, + { + "repo": "go_chi_task", + "task_id": 26, + "pairs": [ + [ + 1, + 2 + ] + ] + }, + { + "repo": "go_chi_task", + "task_id": 56, + "pairs": [ + [ + 1, + 5 + ] + ] + }, + { + "repo": "huggingface_datasets_task", + "task_id": 3997, + "pairs": [ + [ + 2, + 4 + ] + ] + }, + { + "repo": "openai_tiktoken_task", + "task_id": 0, + "pairs": [ + [ + 3, + 6 + ], + [ + 6, + 8 + ] + ] + }, + { + "repo": "pallets_click_task", + "task_id": 2068, + "pairs": [ + [ + 1, + 6 + ], + [ + 5, + 7 + ] + ] + }, + { + "repo": "pallets_jinja_task", + "task_id": 1465, + "pairs": [ + [ + 2, + 6 + ] + ] + }, + { + "repo": "pallets_jinja_task", + "task_id": 1621, + "pairs": [ + [ + 1, + 6 + ], + [ + 3, + 5 + ], + [ + 4, + 10 + ], + [ + 6, + 7 + ] + ] + }, + { + "repo": "pillow_task", + "task_id": 68, + "pairs": [ + [ + 1, + 5 + ] + ] + }, + { + "repo": "samuelcolvin_dirty_equals_task", + "task_id": 43, + "pairs": [ + [ + 2, + 3 + ] + ] + }, + { + "repo": "typst_task", + "task_id": 6554, + "pairs": [ + [ + 1, + 9 + ], + [ + 4, + 9 + ], + [ + 7, + 8 + ] + ] + } + ] +} diff --git a/tests/runner/test_tasks.py b/tests/runner/test_tasks.py index 1412cac..b436de1 100644 --- a/tests/runner/test_tasks.py +++ b/tests/runner/test_tasks.py @@ -158,6 +158,26 @@ def 
test_flash_benchmark_split_is_stable(self): lite_subset = load_subset("lite") assert subset["tasks"].issubset(lite_subset["tasks"]), "Flash tasks should be subset of lite" + def test_spark_benchmark_split_is_stable(self): + """Verify spark benchmark split hasn't changed - this is a frozen evaluation set.""" + subset = load_subset("spark") + + # Check structure + assert "tasks" in subset, "Subset should have 'tasks' key" + assert "pairs" in subset, "Subset should have 'pairs' key" + + # Spark is a 20-pair subset sampled from flash + assert len(subset["tasks"]) == 13, "Spark should have 13 tasks" + assert len(subset["pairs"]) == 13, "All 13 tasks should have specific pairs" + + # Verify discover_tasks generates expected 20 pairs + discovered = discover_tasks(subset="spark") + assert len(discovered) == 20, "Spark subset should generate exactly 20 feature pairs" + + # Every spark task must also appear in flash (task-level, non-strict check; pair membership is not verified here) + flash_subset = load_subset("flash") + assert subset["tasks"].issubset(flash_subset["tasks"]), "Spark tasks should be subset of flash" + def test_load_subset_returns_dict_with_pairs(self): """Test that load_subset returns dict with tasks and pairs.""" subset = load_subset("lite")