1 change: 1 addition & 0 deletions .gitignore
@@ -41,6 +41,7 @@ site/
logs*/
*.log
cooperbench_results.xlsx
generated/

# Cache
.cooperbench_cache/
4 changes: 2 additions & 2 deletions dataset/huggingface_datasets_task/task7309/runner.sh
@@ -17,10 +17,10 @@ trap cleanup EXIT INT TERM
# Get input params
TEST_PATCH="$1"
FEATURE_PATCH="$2"
TEST_PATH="tests/io/test_parquet.py"
TEST_PATH="${3:-tests/io/test_parquet.py}" # Optional 3rd param, default to original

if [[ -z "$TEST_PATCH" ]]; then
echo "Usage: docker run -v \$(pwd):/patches <image> <test_patch> [feature_patch]"
echo "Usage: docker run -v \$(pwd):/patches <image> <test_patch> [feature_patch] [test_path]"
exit 1
fi

1 change: 1 addition & 0 deletions pyproject.toml
@@ -107,6 +107,7 @@ exclude = [

[tool.hatch.build.targets.wheel]
packages = ["src/cooperbench"]
exclude = ["src/cooperbench/generation"]

[tool.ruff]
line-length = 120
22 changes: 17 additions & 5 deletions src/cooperbench/agents/mini_swe_agent/adapter.py
@@ -61,9 +61,14 @@ def run(
with open(config_path) as f:
default_config = yaml.safe_load(f)

# Merge passed config overrides into default config
# Deep merge passed config overrides into default config
if config is not None:
default_config.update(config)
for key, value in config.items():
if key in default_config and isinstance(default_config[key], dict) and isinstance(value, dict):
# Deep merge nested dicts (like "agent")
default_config[key].update(value)
else:
default_config[key] = value

agent_config = default_config.get("agent", {})
backend = default_config.get("backend", "modal")
@@ -175,9 +180,16 @@ def run(
def _get_patch(self, env: "ModalEnvironment | DockerEnvironment", base_commit: str) -> str:
"""Extract git diff from base commit to current working tree state."""
try:
# Single diff from base commit to working tree (includes both
# committed and uncommitted changes)
result = env.execute(f"git diff {base_commit}", timeout=30)
# Stage all changes (including new untracked files)
env.execute("git add -A", timeout=10)
# Configure git identity (required for commit in fresh sandbox environments)
env.execute("git config user.email 'agent@cooperbench.local'", timeout=10)
env.execute("git config user.name 'CooperBench Agent'", timeout=10)
# Commit everything so committed + staged + unstaged changes are all in HEAD
# This ensures we capture changes even if the agent made commits
env.execute("git commit --allow-empty -m 'Agent changes'", timeout=10)
# Diff from base commit to HEAD captures all changes
result = env.execute(f"git diff {base_commit} HEAD", timeout=30)
return result.get("output", "").strip()
except Exception:
return ""
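
The stage-commit-diff sequence in `_get_patch` above can be reproduced against a local checkout; a minimal sketch (the function name and plain-`subprocess` repo handling are illustrative, not part of the adapter, which runs these commands inside a sandbox):

```python
# Sketch of the patch-capture sequence: stage everything, commit, then diff
# HEAD against the base commit so committed and uncommitted changes are both
# included. Identity values mirror the placeholders used in the adapter.
import subprocess

def capture_patch(repo: str, base_commit: str) -> str:
    def git(*args: str) -> str:
        out = subprocess.run(["git", "-C", repo, *args], capture_output=True, text=True)
        return out.stdout

    git("add", "-A")  # stage everything, including new untracked files
    git("config", "user.email", "agent@cooperbench.local")
    git("config", "user.name", "CooperBench Agent")
    git("commit", "--allow-empty", "-m", "Agent changes")
    # Diff from base commit to HEAD captures all changes since the base.
    return git("diff", base_commit, "HEAD").strip()
```
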
131 changes: 131 additions & 0 deletions src/cooperbench/generation/README.md
@@ -0,0 +1,131 @@
# Feature Generation Pipeline

Automated generation of new benchmark features using LLM agents running on Modal.

## Quick Start

```bash
# From project root
cd /path/to/CooperBench

# Generate a single feature
python -m cooperbench.generation --task dspy_task/task8394

# Just see the prompt (no agent run)
python -m cooperbench.generation --task dspy_task/task8394 --prompt-only

# Validate existing patches
python -m cooperbench.generation --task dspy_task/task8394 --validate feature.patch tests.patch
```

## Usage

### Generate Features

```bash
# Single attempt with Gemini 3 Flash (default)
python -m cooperbench.generation --task dspy_task/task8394

# Multiple attempts with output directory
python -m cooperbench.generation --task dspy_task/task8394 --attempts 5 --output ./generated

# Use different model
python -m cooperbench.generation --task dspy_task/task8394 --model claude-3-opus

# Use local Docker instead of Modal
python -m cooperbench.generation --task dspy_task/task8394 --backend docker
```

### Validate Patches

```bash
# Check if patches pass tests and conflict with existing features
python -m cooperbench.generation \
--task dspy_task/task8394 \
--validate ./generated/feature.patch ./generated/tests.patch
```

## How It Works

### 1. Prompt Building (`prompt.py`)

Analyzes existing features in a task to build a generation prompt:
- Reads all `feature.md` files to understand the format
- Parses `feature.patch` files to identify "hot spots" (frequently modified files/lines)
- Instructs the agent to create conflicting features
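
The hot-spot idea can be sketched as counting how often each file appears across the existing `feature.patch` files; this is an illustrative standalone snippet, not the actual `prompt.py` API:

```python
# Count how many feature patches touch each file: files modified by many
# existing features are the most promising targets for new conflicts.
import re
from collections import Counter

def find_hot_files(patches: list[str]) -> Counter:
    """Count how many of the given unified-diff patches touch each file."""
    counts: Counter = Counter()
    for patch in patches:
        # Unified diffs name the modified file on "+++ b/<path>" lines.
        files = set(re.findall(r"^\+\+\+ b/(\S+)", patch, flags=re.MULTILINE))
        counts.update(files)
    return counts
```
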

### 2. Agent Execution (`generator.py`)

Runs `mini_swe_agent` on Modal with the task's Docker image:
- Agent explores the codebase
- Implements a new feature that modifies similar code regions
- Writes tests
- Verifies tests pass

### 3. Patch Splitting (`splitter.py`)

Separates the agent's output into:
- `feature.patch` - Source code changes
- `tests.patch` - Test file changes
- `feature.md` - Feature description extracted from agent output
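
The splitting step amounts to routing each per-file chunk of the combined diff by its path; a minimal sketch (the real `splitter.py` may use different heuristics, and `is_test_path` here is an assumption):

```python
# Split one combined unified diff into feature and test patches by checking
# whether each file's path looks like a test file.
def split_patch_sketch(patch: str) -> tuple[str, str]:
    """Return (feature_patch, tests_patch) from one combined diff."""
    def is_test_path(header: str) -> bool:
        return "/tests/" in header or "/test_" in header

    feature_chunks: list[str] = []
    test_chunks: list[str] = []
    # Each file's section of a git diff starts with a "diff --git" line.
    chunks = ["diff --git" + c for c in patch.split("diff --git") if c.strip()]
    for chunk in chunks:
        target = test_chunks if is_test_path(chunk.splitlines()[0]) else feature_chunks
        target.append(chunk)
    return "".join(feature_chunks), "".join(test_chunks)
```
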

### 4. Validation (`validator.py`)

All validation runs in Modal sandboxes:
- **Test validation**: Runs tests using existing `runner.sh`
- **Conflict detection**: Applies patches to git branches and attempts merge

A generated feature is **valid** if:
- ✅ All tests pass
- ✅ It conflicts with at least one existing feature
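
The git mechanics of the conflict check can be illustrated locally: apply each patch on its own branch from the same base, then attempt a merge. This sketch only shows the idea (branch names and the function itself are hypothetical; real validation runs in Modal sandboxes):

```python
# True if two patch files conflict when merged on top of the same base commit.
import subprocess

def conflicts(repo: str, base: str, patch_a: str, patch_b: str) -> bool:
    def git(*args: str) -> subprocess.CompletedProcess:
        return subprocess.run(["git", "-C", repo, *args], capture_output=True, text=True)

    for branch, patch in (("gen-a", patch_a), ("gen-b", patch_b)):
        git("checkout", "-B", branch, base)   # fresh branch from the base commit
        git("apply", "--index", patch)        # apply and stage the patch
        git("commit", "-m", f"apply {patch}")
    git("checkout", "-B", "merge-check", "gen-a")
    merged = git("merge", "--no-ff", "--no-edit", "gen-b")
    git("merge", "--abort")                   # leave the repo clean either way
    return merged.returncode != 0             # non-zero exit means a conflict
```
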

## Module Structure

```
generation/
├── __init__.py # Package exports
├── __main__.py # CLI entry point
├── generator.py # Main orchestrator
├── prompt.py # Prompt building
├── splitter.py # Patch splitting
├── validator.py # Modal-based validation
└── README.md # This file
```

## Programmatic Usage

```python
from cooperbench.generation import generate_feature, validate_generated_feature

# Generate a new feature
result = generate_feature(
task_dir="dataset/dspy_task/task8394",
model_name="gpt-4o",
backend="modal",
)

if result.success:
print(f"Feature patch:\n{result.feature_patch}")
print(f"Tests patch:\n{result.tests_patch}")
print(f"Cost: ${result.agent_cost:.4f}")

# Validate patches
validation = validate_generated_feature(
repo_name="dspy_task",
task_id=8394,
feature_patch=result.feature_patch,
tests_patch=result.tests_patch,
)

print(f"Valid: {validation['valid']}")
print(f"Conflicts with features: {validation['conflict_result']['conflicts']}")
```

## Success Criteria

A generated feature is considered **successful** if:

1. **Tests Pass**: The feature implementation is correct and all tests (including new tests) pass
2. **Has Conflicts**: The feature conflicts with at least one existing feature when merged

The conflict requirement ensures the generated feature is useful for testing multi-agent coordination: features that merge cleanly don't exercise the coordination aspects of the benchmark.
17 changes: 17 additions & 0 deletions src/cooperbench/generation/__init__.py
@@ -0,0 +1,17 @@
"""Task generation package - automated creation of new benchmark features."""

from cooperbench.generation.generator import generate_feature
from cooperbench.generation.prompt import build_prompt
from cooperbench.generation.splitter import split_patch
from cooperbench.generation.validator import (
check_conflicts_in_sandbox,
validate_generated_feature,
)

__all__ = [
"generate_feature",
"build_prompt",
"split_patch",
"check_conflicts_in_sandbox",
"validate_generated_feature",
]