From a781eefbb8a09c7df3d5d6c9b94828f4a8908840 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Sun, 1 Feb 2026 16:00:25 -0800 Subject: [PATCH 1/3] Adding data gen using llm --- .gitignore | 1 + .../task7309/runner.sh | 4 +- .../agents/mini_swe_agent/adapter.py | 16 +- src/cooperbench/generation/README.md | 131 +++++ src/cooperbench/generation/__init__.py | 17 + src/cooperbench/generation/__main__.py | 270 +++++++++ src/cooperbench/generation/generator.py | 512 ++++++++++++++++++ src/cooperbench/generation/prompt.py | 310 +++++++++++ src/cooperbench/generation/splitter.py | 198 +++++++ src/cooperbench/generation/validator.py | 400 ++++++++++++++ 10 files changed, 1852 insertions(+), 7 deletions(-) create mode 100644 src/cooperbench/generation/README.md create mode 100644 src/cooperbench/generation/__init__.py create mode 100644 src/cooperbench/generation/__main__.py create mode 100644 src/cooperbench/generation/generator.py create mode 100644 src/cooperbench/generation/prompt.py create mode 100644 src/cooperbench/generation/splitter.py create mode 100644 src/cooperbench/generation/validator.py diff --git a/.gitignore b/.gitignore index 9e70e61..eebb634 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ site/ logs*/ *.log cooperbench_results.xlsx +generated/ # Cache .cooperbench_cache/ diff --git a/dataset/huggingface_datasets_task/task7309/runner.sh b/dataset/huggingface_datasets_task/task7309/runner.sh index d77750b..e1cff66 100644 --- a/dataset/huggingface_datasets_task/task7309/runner.sh +++ b/dataset/huggingface_datasets_task/task7309/runner.sh @@ -17,10 +17,10 @@ trap cleanup EXIT INT TERM # Get input params TEST_PATCH="$1" FEATURE_PATCH="$2" -TEST_PATH="tests/io/test_parquet.py" +TEST_PATH="${3:-tests/io/test_parquet.py}" # Optional 3rd param, default to original if [[ -z "$TEST_PATCH" ]]; then - echo "Usage: docker run -v \$(pwd):/patches [feature_patch]" + echo "Usage: docker run -v \$(pwd):/patches [feature_patch] [test_path]" exit 1 fi diff 
--git a/src/cooperbench/agents/mini_swe_agent/adapter.py b/src/cooperbench/agents/mini_swe_agent/adapter.py index 995102f..6a21024 100644 --- a/src/cooperbench/agents/mini_swe_agent/adapter.py +++ b/src/cooperbench/agents/mini_swe_agent/adapter.py @@ -61,9 +61,14 @@ def run( with open(config_path) as f: default_config = yaml.safe_load(f) - # Merge passed config overrides into default config + # Deep merge passed config overrides into default config if config is not None: - default_config.update(config) + for key, value in config.items(): + if key in default_config and isinstance(default_config[key], dict) and isinstance(value, dict): + # Deep merge nested dicts (like "agent") + default_config[key].update(value) + else: + default_config[key] = value agent_config = default_config.get("agent", {}) backend = default_config.get("backend", "modal") @@ -175,9 +180,10 @@ def run( def _get_patch(self, env: "ModalEnvironment | DockerEnvironment", base_commit: str) -> str: """Extract git diff from base commit to current working tree state.""" try: - # Single diff from base commit to working tree (includes both - # committed and uncommitted changes) - result = env.execute(f"git diff {base_commit}", timeout=30) + # Stage all changes (including new untracked files) so they appear in diff + env.execute("git add -A", timeout=10) + # Diff from base commit to staged changes (includes new files) + result = env.execute(f"git diff --cached {base_commit}", timeout=30) return result.get("output", "").strip() except Exception: return "" diff --git a/src/cooperbench/generation/README.md b/src/cooperbench/generation/README.md new file mode 100644 index 0000000..1caff18 --- /dev/null +++ b/src/cooperbench/generation/README.md @@ -0,0 +1,131 @@ +# Feature Generation Pipeline + +Automated generation of new benchmark features using LLM agents running on Modal. 
+ +## Quick Start + +```bash +# From project root +cd /path/to/CooperBench + +# Generate a single feature +python -m cooperbench.generation --task dspy_task/task8394 + +# Just see the prompt (no agent run) +python -m cooperbench.generation --task dspy_task/task8394 --prompt-only + +# Validate existing patches +python -m cooperbench.generation --task dspy_task/task8394 --validate feature.patch tests.patch +``` + +## Usage + +### Generate Features + +```bash +# Single attempt with Gemini 3 Flash (default) +python -m cooperbench.generation --task dspy_task/task8394 + +# Multiple attempts with output directory +python -m cooperbench.generation --task dspy_task/task8394 --attempts 5 --output ./generated + +# Use different model +python -m cooperbench.generation --task dspy_task/task8394 --model claude-3-opus + +# Use local Docker instead of Modal +python -m cooperbench.generation --task dspy_task/task8394 --backend docker +``` + +### Validate Patches + +```bash +# Check if patches pass tests and conflict with existing features +python -m cooperbench.generation \ + --task dspy_task/task8394 \ + --validate ./generated/feature.patch ./generated/tests.patch +``` + +## How It Works + +### 1. Prompt Building (`prompt.py`) + +Analyzes existing features in a task to build a generation prompt: +- Reads all `feature.md` files to understand the format +- Parses `feature.patch` files to identify "hot spots" (frequently modified files/lines) +- Instructs agent to create conflicting features + +### 2. Agent Execution (`generator.py`) + +Runs `mini_swe_agent` on Modal with the task's Docker image: +- Agent explores the codebase +- Implements a new feature that modifies similar code regions +- Writes tests +- Verifies tests pass + +### 3. Patch Splitting (`splitter.py`) + +Separates agent's output into: +- `feature.patch` - Source code changes +- `tests.patch` - Test file changes +- `feature.md` - Feature description extracted from agent output + +### 4. 
Validation (`validator.py`) + +All validation runs in Modal sandboxes: +- **Test validation**: Runs tests using existing `runner.sh` +- **Conflict detection**: Applies patches to git branches and attempts merge + +A generated feature is **valid** if: +- ✅ All tests pass +- ✅ Conflicts with at least 1 existing feature + +## Module Structure + +``` +generation/ +├── __init__.py # Package exports +├── __main__.py # CLI entry point +├── generator.py # Main orchestrator +├── prompt.py # Prompt building +├── splitter.py # Patch splitting +├── validator.py # Modal-based validation +└── README.md # This file +``` + +## Programmatic Usage + +```python +from cooperbench.generation import generate_feature, validate_generated_feature + +# Generate a new feature +result = generate_feature( + task_dir="dataset/dspy_task/task8394", + model_name="gpt-4o", + backend="modal", +) + +if result.success: + print(f"Feature patch:\n{result.feature_patch}") + print(f"Tests patch:\n{result.tests_patch}") + print(f"Cost: ${result.agent_cost:.4f}") + +# Validate patches +validation = validate_generated_feature( + repo_name="dspy_task", + task_id=8394, + feature_patch=result.feature_patch, + tests_patch=result.tests_patch, +) + +print(f"Valid: {validation['valid']}") +print(f"Conflicts with features: {validation['conflict_result']['conflicts']}") +``` + +## Success Criteria + +A generated feature is considered **successful** if: + +1. **Tests Pass**: The feature implementation is correct and all tests (including new tests) pass +2. **Has Conflicts**: The feature conflicts with at least one existing feature when merging + +The conflict requirement ensures the generated feature is useful for testing multi-agent coordination - features that merge cleanly don't test the coordination aspects of the benchmark. 
diff --git a/src/cooperbench/generation/__init__.py b/src/cooperbench/generation/__init__.py new file mode 100644 index 0000000..2c42499 --- /dev/null +++ b/src/cooperbench/generation/__init__.py @@ -0,0 +1,17 @@ +"""Task generation package - automated creation of new benchmark features.""" + +from cooperbench.generation.generator import generate_feature +from cooperbench.generation.prompt import build_prompt +from cooperbench.generation.splitter import split_patch +from cooperbench.generation.validator import ( + check_conflicts_in_sandbox, + validate_generated_feature, +) + +__all__ = [ + "generate_feature", + "build_prompt", + "split_patch", + "check_conflicts_in_sandbox", + "validate_generated_feature", +] diff --git a/src/cooperbench/generation/__main__.py b/src/cooperbench/generation/__main__.py new file mode 100644 index 0000000..00f7202 --- /dev/null +++ b/src/cooperbench/generation/__main__.py @@ -0,0 +1,270 @@ +"""Direct execution entry point for generation module. + +Usage: + python -m cooperbench.generation --task dspy_task/task8394 --model gpt-4o +""" + +import argparse +import json +import logging +import sys +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate new features for CooperBench tasks", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate a single feature + python -m cooperbench.generation --task dspy_task/task8394 + + # Generate multiple attempts + python -m cooperbench.generation --task dspy_task/task8394 --attempts 5 --output ./generated + + # Just build and print the prompt (no agent run) + python -m cooperbench.generation --task dspy_task/task8394 --prompt-only + + # Validate an existing patch + python -m cooperbench.generation --task dspy_task/task8394 --validate feature.patch tests.patch +""", + ) + + 
parser.add_argument( + "--task", + required=True, + help="Task path relative to dataset/ (e.g., dspy_task/task8394)", + ) + parser.add_argument( + "--model", + default="gemini/gemini-3-flash-preview", + help="LLM model to use (default: gemini/gemini-3-flash-preview)", + ) + parser.add_argument( + "--backend", + choices=["modal", "docker"], + default="modal", + help="Execution backend (default: modal)", + ) + parser.add_argument( + "--attempts", + type=int, + default=1, + help="Number of generation attempts (default: 1)", + ) + parser.add_argument( + "--output", + type=Path, + help="Output directory for generated features", + ) + parser.add_argument( + "--feature", + type=int, + help="Target a specific feature ID for conflicts (default: first feature)", + ) + parser.add_argument( + "--prompt-only", + action="store_true", + help="Just print the prompt without running the agent", + ) + parser.add_argument( + "--list-features", + action="store_true", + help="List all feature IDs in the task and exit", + ) + parser.add_argument( + "--validate", + nargs=2, + metavar=("FEATURE_PATCH", "TESTS_PATCH"), + help="Validate existing patches instead of generating", + ) + parser.add_argument( + "--step-limit", + type=int, + default=75, + help="Maximum agent steps (default: 75)", + ) + parser.add_argument( + "--cost-limit", + type=float, + default=2.0, + help="Maximum cost in USD (default: 2.0)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Save full agent trajectory for debugging", + ) + + args = parser.parse_args() + + # Resolve task directory + task_dir = Path("dataset") / args.task + if not task_dir.exists(): + logger.error(f"Task directory not found: {task_dir}") + sys.exit(1) + + # Parse repo_name and task_id from path + parts = args.task.split("/") + if len(parts) != 2: + logger.error(f"Invalid task path format. 
Expected: repo_name/taskID (e.g., dspy_task/task8394)") + sys.exit(1) + + repo_name = parts[0] + task_id = int(parts[1].replace("task", "")) + + # Mode 0: List features + if args.list_features: + from cooperbench.generation.prompt import list_features + features = list_features(task_dir) + print(f"Features in {args.task}: {features}") + return + + # Mode 1: Prompt only + if args.prompt_only: + from cooperbench.generation.prompt import build_prompt + prompt = build_prompt(task_dir, feature_id=args.feature) + print(prompt) + return + + # Mode 2: Validate existing patches + if args.validate: + feature_patch_path, tests_patch_path = args.validate + feature_patch = Path(feature_patch_path).read_text() + tests_patch = Path(tests_patch_path).read_text() + + from cooperbench.generation.validator import validate_generated_feature + + logger.info(f"Validating patches for {args.task}") + result = validate_generated_feature( + repo_name=repo_name, + task_id=task_id, + feature_patch=feature_patch, + tests_patch=tests_patch, + backend=args.backend, + ) + + print(json.dumps(result, indent=2, default=str)) + sys.exit(0 if result["valid"] else 1) + + # Mode 3: Generate features + from cooperbench.generation.generator import generate_feature, generate_features_batch + import hashlib + import re as re_module + + def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) -> Path: + """Create output directory named after the feature title + short hash.""" + # Extract title from feature_md if available + title_slug = "unknown" + if feature_md: + # Look for **Title**: ... 
pattern + match = re_module.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md) + if match: + title = match.group(1).strip() + # Convert to slug: lowercase, replace spaces with underscores, remove special chars + title_slug = re_module.sub(r"[^a-z0-9]+", "_", title.lower()).strip("_")[:40] + + # Add short hash for uniqueness + content_hash = hashlib.md5((feature_md or fallback_hash).encode()).hexdigest()[:5] + folder_name = f"{title_slug}_{content_hash}" + + output_dir = base_dir / folder_name + output_dir.mkdir(parents=True, exist_ok=True) + return output_dir + + # Base directory for this task + base_output_dir = args.output or (Path("generated") / repo_name / f"task{task_id}") + + if args.attempts == 1: + logger.info(f"Generating feature for {args.task} with {args.model} (target: feature {args.feature or 'first'})") + logger.info(f"Limits: {args.step_limit} steps, ${args.cost_limit} cost") + + # Generate first (to temp location for trajectory) + import tempfile + temp_dir = Path(tempfile.mkdtemp()) + + result = generate_feature( + task_dir=task_dir, + feature_id=args.feature, + model_name=args.model, + backend=args.backend, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + debug=args.debug, + output_dir=temp_dir, + ) + + # Create final output dir based on feature title + from datetime import datetime + fallback = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = make_output_dir(base_output_dir, result.feature_md, fallback) + logger.info(f"Output directory: {output_dir}") + + # Move trajectory files from temp to final + import shutil + for f in temp_dir.glob("trajectory_*"): + shutil.move(str(f), output_dir / f.name) + + # Save outputs (ensure patches end with newline for git compatibility) + if result.feature_patch: + patch = result.feature_patch.rstrip() + "\n" + (output_dir / "feature.patch").write_text(patch) + logger.info(f"Saved feature.patch") + if result.tests_patch: + patch = result.tests_patch.rstrip() + "\n" + (output_dir / 
"tests.patch").write_text(patch) + logger.info(f"Saved tests.patch") + if result.feature_md: + (output_dir / "feature.md").write_text(result.feature_md) + logger.info(f"Saved feature.md") + + # Save full result as JSON + (output_dir / "result.json").write_text(json.dumps(result.to_dict(), indent=2, default=str)) + logger.info(f"Saved result.json") + + # Print summary + print(f"\n{'='*60}") + print(f"Result: {'SUCCESS' if result.success else 'FAILED'}") + print(f"Output saved to: {output_dir}") + if result.errors: + print(f"Errors: {result.errors}") + print(f"Agent: {result.agent_steps} steps, ${result.agent_cost:.4f}") + print(f"{'='*60}") + + sys.exit(0 if result.success else 1) + else: + logger.info(f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})") + logger.info(f"Output directory: {base_output_dir}") + + results = generate_features_batch( + task_dir=task_dir, + feature_id=args.feature, + num_attempts=args.attempts, + model_name=args.model, + backend=args.backend, + output_dir=base_output_dir, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + debug=args.debug, + ) + + # Summary + successful = sum(1 for r in results if r.success) + print(f"\n{'='*60}") + print(f"Summary: {successful}/{args.attempts} successful") + print(f"Output saved to: {base_output_dir}") + + for i, r in enumerate(results, 1): + status = "✓" if r.success else "✗" + print(f" {status} Attempt {i}: {r.errors or 'OK'}") + + print(f"{'='*60}") + sys.exit(0 if successful > 0 else 1) + + +if __name__ == "__main__": + main() diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py new file mode 100644 index 0000000..ef8da3b --- /dev/null +++ b/src/cooperbench/generation/generator.py @@ -0,0 +1,512 @@ +"""Main generator - orchestrates feature generation using agents.""" + +import json +import logging +import time +from dataclasses import dataclass, field +from pathlib import Path + +from 
cooperbench.generation.prompt import build_prompt, list_features +from cooperbench.generation.splitter import extract_feature_description, split_patch +from cooperbench.generation.validator import ( + check_conflicts_in_sandbox, + run_tests_in_sandbox, +) +from cooperbench.utils import get_image_name + +logger = logging.getLogger(__name__) + + +def _extract_feature_md_from_patch(patch: str) -> str | None: + """Extract .feature_description.md content from a patch.""" + if not patch or ".feature_description.md" not in patch: + return None + + lines = patch.split("\n") + in_feature_file = False + content_lines = [] + + for line in lines: + if line.startswith("diff --git") and ".feature_description.md" in line: + in_feature_file = True + content_lines = [] + elif in_feature_file and line.startswith("diff --git"): + # End of the feature file + break + elif in_feature_file and line.startswith("+") and not line.startswith("+++"): + # Added line (strip the leading +) + content_lines.append(line[1:]) + + if content_lines: + return "\n".join(content_lines).strip() + return None + + +def _remove_feature_md_from_patch(patch: str) -> str: + """Remove .feature_description.md from a patch (it's metadata, not code).""" + if not patch or ".feature_description.md" not in patch: + return patch + + lines = patch.split("\n") + result_lines = [] + skip_file = False + + for line in lines: + if line.startswith("diff --git") and ".feature_description.md" in line: + skip_file = True + elif line.startswith("diff --git"): + skip_file = False + + if not skip_file: + result_lines.append(line) + + return "\n".join(result_lines) + + +@dataclass +class GenerationResult: + """Result of a feature generation attempt.""" + + success: bool + feature_md: str | None = None + feature_patch: str | None = None + tests_patch: str | None = None + conflicts: list[int] = field(default_factory=list) + conflicts_info: list[dict] = field(default_factory=list) # [{id, title}, ...] 
+ errors: list[str] = field(default_factory=list) + agent_cost: float = 0.0 + agent_steps: int = 0 + duration_seconds: float = 0.0 + # Validation details + tests_passed: bool | None = None + tests_output: str | None = None + validation_run: bool = False + + def to_dict(self) -> dict: + return { + "success": self.success, + "feature_md": self.feature_md, + "feature_patch": self.feature_patch, + "tests_patch": self.tests_patch, + "conflicts": self.conflicts, + "conflicts_info": self.conflicts_info, + "errors": self.errors, + "agent_cost": self.agent_cost, + "agent_steps": self.agent_steps, + "duration_seconds": self.duration_seconds, + "tests_passed": self.tests_passed, + "tests_output": self.tests_output, + "validation_run": self.validation_run, + } + + +def _get_task_image(task_dir: Path) -> str: + """Get the Docker image for a task using existing naming convention.""" + task_id = int(task_dir.name.replace("task", "")) + repo_name = task_dir.parent.name + return get_image_name(repo_name, task_id) + + +def _get_repo_and_task_id(task_dir: Path) -> tuple[str, int]: + """Extract repo_name and task_id from task directory.""" + task_id = int(task_dir.name.replace("task", "")) + repo_name = task_dir.parent.name + return repo_name, task_id + + +def generate_feature( + task_dir: str | Path, + feature_id: int | None = None, + model_name: str = "gemini/gemini-3-flash-preview", + backend: str = "modal", + timeout: int = 3600, + validate: bool = True, + step_limit: int = 75, + cost_limit: float = 2.0, + debug: bool = False, + output_dir: Path | None = None, +) -> GenerationResult: + """Generate a new feature for a task using an agent. 
+ + Args: + task_dir: Path to the task directory (e.g., dataset/dspy_task/task8394) + feature_id: ID of the specific feature to target for conflicts (default: first) + model_name: LLM model to use for the agent + backend: Execution backend ("modal", "docker", or "gcp") + timeout: Maximum time for generation in seconds + validate: Whether to validate (run tests + check conflicts) after generation + step_limit: Maximum number of agent steps (default: 75) + cost_limit: Maximum cost in USD (default: 2.0) + debug: Save full agent trajectory to file for inspection + output_dir: Directory to save debug output (default: current dir) + + Returns: + GenerationResult with the generated feature or errors. + """ + task_dir = Path(task_dir) + start_time = time.time() + + if not task_dir.exists(): + return GenerationResult( + success=False, + errors=[f"Task directory not found: {task_dir}"], + ) + + repo_name, task_id = _get_repo_and_task_id(task_dir) + + # Build the prompt + logger.info(f"Building prompt for {task_dir} (target feature: {feature_id or 'first'})") + prompt = build_prompt(task_dir, feature_id=feature_id) + + # Get the Docker image for this task + image = _get_task_image(task_dir) + logger.info(f"Using image: {image}") + + # Get existing feature IDs for conflict checking + existing_feature_ids = list_features(task_dir) + logger.info(f"Found {len(existing_feature_ids)} existing features: {existing_feature_ids}") + + # Run the agent + logger.info(f"Running agent with model {model_name} on {backend}") + + try: + from cooperbench.agents import get_runner + + agent = get_runner("mini_swe_agent") + + result = agent.run( + task=prompt, + image=image, + model_name=model_name, + config={ + "backend": backend, + "agent": { + "step_limit": step_limit, + "cost_limit": cost_limit, + }, + }, + ) + + agent_cost = result.cost + agent_steps = result.steps + + # Save/log agent trajectory for debugging + if result.messages: + logger.info(f"Agent trajectory: {len(result.messages)} 
messages, {agent_steps} steps, ${agent_cost:.4f}") + + # Save full trajectory to file if debug mode or output_dir specified + if debug or output_dir: + import re as re_module + save_dir = output_dir or Path(".") + save_dir.mkdir(parents=True, exist_ok=True) + + traj_file = save_dir / f"trajectory_{repo_name}_{task_id}.json" + traj_data = { + "task": f"{repo_name}/task{task_id}", + "model": model_name, + "steps": agent_steps, + "cost": agent_cost, + "status": result.status, + "messages": result.messages, + "patch": result.patch, + } + with open(traj_file, "w") as f: + json.dump(traj_data, f, indent=2, default=str) + logger.info(f"Saved trajectory to: {traj_file}") + + # Also save a human-readable version + readable_file = save_dir / f"trajectory_{repo_name}_{task_id}.txt" + with open(readable_file, "w") as f: + f.write(f"=== Agent Trajectory ===\n") + f.write(f"Task: {repo_name}/task{task_id}\n") + f.write(f"Model: {model_name}\n") + f.write(f"Steps: {agent_steps}, Cost: ${agent_cost:.4f}\n") + f.write(f"Status: {result.status}\n\n") + + for i, msg in enumerate(result.messages): + role = msg.get("role", "?").upper() + content = msg.get("content", "") + f.write(f"\n{'='*60}\n") + f.write(f"[{i}] {role}\n") + f.write(f"{'='*60}\n") + f.write(content) + f.write("\n") + logger.info(f"Saved readable trajectory to: {readable_file}") + + # Log summary to console + import re as re_module + for i, msg in enumerate(result.messages): + role = msg.get("role", "?") + content = msg.get("content", "")[:500] + if role == "assistant": + actions = re_module.findall(r"```bash\s*\n(.*?)\n```", content, re_module.DOTALL) + if actions: + logger.info(f" [{i}] AGENT: {actions[0][:200]}") + elif role == "user" and "returncode" in content: + rc_match = re_module.search(r"(\d+)", content) + rc = rc_match.group(1) if rc_match else "?" 
+ logger.info(f" [{i}] RESULT: exit={rc}") + + # Check for agent errors + if result.status == "Error" or result.error: + return GenerationResult( + success=False, + errors=[f"Agent error: {result.error or result.status}"], + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Get the patch from agent + full_patch = result.patch + + if not full_patch: + return GenerationResult( + success=False, + errors=["Agent produced no changes"], + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Extract feature description from .feature_description.md in the patch (before removing it) + feature_md = _extract_feature_md_from_patch(full_patch) + + # Remove .feature_description.md from patch (it's metadata, not code) + clean_patch = _remove_feature_md_from_patch(full_patch) + + # Split patch into feature and tests + logger.info("Splitting patch into feature and tests...") + feature_patch, tests_patch = split_patch(clean_patch) + + # Fallback: try extracting from agent messages if not in patch + if not feature_md and result.messages: + for msg in result.messages: + if msg.get("role") == "assistant": + content = msg.get("content", "") + if isinstance(content, str): + extracted = extract_feature_description(content) + if extracted: + feature_md = extracted + break + + # Basic validation + errors = [] + if not feature_patch: + errors.append("No feature changes in patch (only test files modified)") + + if not tests_patch: + errors.append("No test changes in patch") + + # If basic validation fails, return early + if errors: + return GenerationResult( + success=False, + feature_md=feature_md, + feature_patch=feature_patch, + tests_patch=tests_patch, + errors=errors, + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Run full validation if requested + tests_passed = None + tests_output = None + conflicts = [] + conflicts_info 
= [] + validation_run = False + + if validate: + logger.info("Running validation...") + validation_run = True + + # Step 1: Run tests + logger.info("Step 1/2: Running tests in sandbox...") + test_result = run_tests_in_sandbox( + repo_name=repo_name, + task_id=task_id, + feature_patch=feature_patch, + tests_patch=tests_patch, + timeout=600, + backend=backend, + ) + + tests_passed = test_result["passed"] + tests_output = test_result.get("output", "") + + if test_result.get("error"): + errors.append(f"Test error: {test_result['error']}") + + if not tests_passed: + errors.append(f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed") + logger.warning(f"Tests failed: {test_result['tests_failed']} failed") + else: + logger.info(f"Tests passed: {test_result['tests_passed']} passed") + + # Step 2: Check conflicts (only if tests pass) + if tests_passed: + logger.info("Step 2/2: Checking conflicts with existing features...") + conflict_result = check_conflicts_in_sandbox( + repo_name=repo_name, + task_id=task_id, + new_feature_patch=feature_patch, + existing_feature_ids=existing_feature_ids, + timeout=300, + backend=backend, + ) + + conflicts = conflict_result["conflicts"] + conflicts_info = conflict_result.get("conflicts_info", []) + + if conflict_result.get("errors"): + for err in conflict_result["errors"]: + errors.append(f"Conflict check: {err}") + + if not conflicts: + errors.append("No conflicts with any existing feature - feature may be too independent") + logger.warning("No conflicts detected with existing features") + else: + conflict_titles = [c.get("title", f"Feature {c['id']}") for c in conflicts_info] + logger.info(f"Conflicts detected with features: {conflict_titles}") + + # Determine success + success = ( + len(errors) == 0 + and feature_patch + and tests_patch + and (not validate or (tests_passed and len(conflicts) > 0)) + ) + + return GenerationResult( + success=success, + feature_md=feature_md, + 
feature_patch=feature_patch, + tests_patch=tests_patch, + conflicts=conflicts, + conflicts_info=conflicts_info, + errors=errors, + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + tests_passed=tests_passed, + tests_output=tests_output, + validation_run=validation_run, + ) + + except Exception as e: + logger.exception("Generation failed") + return GenerationResult( + success=False, + errors=[f"Generation failed: {e!s}"], + duration_seconds=time.time() - start_time, + ) + + +def generate_features_batch( + task_dir: str | Path, + feature_id: int | None = None, + num_attempts: int = 5, + model_name: str = "gemini/gemini-3-flash-preview", + backend: str = "modal", + output_dir: str | Path | None = None, + validate: bool = True, + step_limit: int = 75, + cost_limit: float = 2.0, + debug: bool = False, +) -> list[GenerationResult]: + """Generate multiple feature candidates for a task. + + Args: + task_dir: Path to the task directory + feature_id: Target feature ID (default: first) + num_attempts: Number of generation attempts + model_name: LLM model to use + backend: Execution backend + output_dir: Directory to save results (optional) + validate: Whether to run validation after each generation + debug: Save full trajectory for each attempt + + Returns: + List of GenerationResults (including failures). 
+ """ + import hashlib + import re as re_module + import tempfile + import shutil + + def make_feature_dir(base_dir: Path, feature_md: str | None, attempt_num: int) -> Path: + """Create output directory named after the feature title + short hash.""" + title_slug = "unknown" + if feature_md: + match = re_module.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md) + if match: + title = match.group(1).strip() + title_slug = re_module.sub(r"[^a-z0-9]+", "_", title.lower()).strip("_")[:40] + + content_hash = hashlib.md5((feature_md or f"attempt_{attempt_num}").encode()).hexdigest()[:5] + folder_name = f"{title_slug}_{content_hash}" + + feature_dir = base_dir / folder_name + feature_dir.mkdir(parents=True, exist_ok=True) + return feature_dir + + task_dir = Path(task_dir) + base_output_dir = Path(output_dir) if output_dir else None + results = [] + + for i in range(num_attempts): + logger.info(f"=== Generation attempt {i + 1}/{num_attempts} ===") + + # Use temp dir for trajectory during generation + temp_dir = Path(tempfile.mkdtemp()) if base_output_dir else None + + result = generate_feature( + task_dir=task_dir, + feature_id=feature_id, + model_name=model_name, + backend=backend, + validate=validate, + step_limit=step_limit, + cost_limit=cost_limit, + debug=debug, + output_dir=temp_dir, + ) + + results.append(result) + + # Save to named directory based on feature title + if base_output_dir: + attempt_dir = make_feature_dir(base_output_dir, result.feature_md, i + 1) + + # Move trajectory files from temp + if temp_dir: + for f in temp_dir.glob("trajectory_*"): + shutil.move(str(f), attempt_dir / f.name) + + # Save result JSON + with open(attempt_dir / "result.json", "w") as f: + json.dump(result.to_dict(), f, indent=2) + + # Save patches if available (ensure trailing newline for git compatibility) + if result.feature_patch: + patch = result.feature_patch.rstrip() + "\n" + (attempt_dir / "feature.patch").write_text(patch) + if result.tests_patch: + patch = 
result.tests_patch.rstrip() + "\n" + (attempt_dir / "tests.patch").write_text(patch) + if result.feature_md: + (attempt_dir / "feature.md").write_text(result.feature_md) + + logger.info(f"Saved attempt {i + 1} to {attempt_dir}") + + # Log result + status = "✓ SUCCESS" if result.success else "✗ FAILED" + logger.info(f"Attempt {i + 1} {status}: {result.errors or 'OK'}") + + # Summary + successful = sum(1 for r in results if r.success) + logger.info(f"=== Generation complete: {successful}/{num_attempts} successful ===") + + return results diff --git a/src/cooperbench/generation/prompt.py b/src/cooperbench/generation/prompt.py new file mode 100644 index 0000000..c8d5fff --- /dev/null +++ b/src/cooperbench/generation/prompt.py @@ -0,0 +1,310 @@ +"""Prompt building for feature generation.""" + +from pathlib import Path + +from unidiff import PatchSet + + +def _extract_patch_info(patch_path: Path) -> dict: + """Extract file and line information from a patch file.""" + try: + content = patch_path.read_text() + patchset = PatchSet(content) + except Exception: + return {"files": [], "raw": "", "error": "Failed to parse patch"} + + files_info = [] + for patched_file in patchset: + file_info = { + "path": patched_file.path, + "added": patched_file.added, + "removed": patched_file.removed, + "hunks": [], + } + for hunk in patched_file: + file_info["hunks"].append({ + "source_start": hunk.source_start, + "source_length": hunk.source_length, + "target_start": hunk.target_start, + "target_length": hunk.target_length, + }) + files_info.append(file_info) + + return {"files": files_info, "raw": content} + + +def _read_feature_md(feature_dir: Path) -> str: + """Read and return contents of feature.md.""" + feature_md = feature_dir / "feature.md" + if feature_md.exists(): + return feature_md.read_text() + return "" + + +def _get_feature_info(task_dir: Path, feature_id: int) -> dict | None: + """Get full information about a specific feature.""" + feature_dir = task_dir / 
f"feature{feature_id}" + + if not feature_dir.exists(): + return None + + feature_info = { + "id": feature_id, + "name": f"feature{feature_id}", + "description": _read_feature_md(feature_dir), + "patch_info": None, + } + + # Extract patch information + feature_patch = feature_dir / "feature.patch" + if feature_patch.exists(): + feature_info["patch_info"] = _extract_patch_info(feature_patch) + + return feature_info + + +def _get_existing_feature_ids(task_dir: Path) -> list[int]: + """Get IDs of all existing features in a task.""" + ids = [] + for d in task_dir.iterdir(): + if d.is_dir() and d.name.startswith("feature"): + try: + fid = int(d.name.replace("feature", "")) + ids.append(fid) + except ValueError: + pass + return sorted(ids) + + +def _get_test_command(task_dir: Path) -> str: + """Extract the test command from runner.sh, resolving variables.""" + runner_sh = task_dir / "runner.sh" + if not runner_sh.exists(): + return "# Test command not found - check runner.sh" + + content = runner_sh.read_text() + + # First, try to find variable definitions like TEST_PATH="..." 
+ variables = {} + for line in content.split("\n"): + line = line.strip() + # Match patterns like: TEST_PATH="tests/io/test_parquet.py" + if "=" in line and not line.startswith("#"): + parts = line.split("=", 1) + if len(parts) == 2: + var_name = parts[0].strip() + var_value = parts[1].strip().strip('"').strip("'") + variables[var_name] = var_value + + # Look for pytest or cargo test commands + for line in content.split("\n"): + line = line.strip() + if "pytest" in line and not line.startswith("#"): + if "python -m pytest" in line: + # Resolve variables in the command + resolved = line + for var_name, var_value in variables.items(): + resolved = resolved.replace(f"${var_name}", var_value) + resolved = resolved.replace(f"${{{var_name}}}", var_value) + resolved = resolved.replace(f'"${var_name}"', f'"{var_value}"') + resolved = resolved.replace(f'"${{{var_name}}}"', f'"{var_value}"') + # Clean up timeout wrapper if present + if resolved.startswith("timeout"): + # Extract just the pytest part + if "python -m pytest" in resolved: + idx = resolved.find("python -m pytest") + resolved = resolved[idx:] + return resolved + if "cargo test" in line and not line.startswith("#"): + return line + + return "# See runner.sh for test commands" + + +def _extract_test_file(test_command: str) -> str: + """Extract the test file path from a test command.""" + import re + + # Try to extract pytest test file path + # Matches patterns like: pytest "tests/foo/test_bar.py" or pytest tests/foo/test_bar.py + pytest_match = re.search(r'pytest\s+["\']?([^\s"\']+(?:test[^\s"\']*\.py|tests?/[^\s"\']+))["\']?', test_command) + if pytest_match: + return pytest_match.group(1) + + # Try to find any .py file path that looks like a test file + test_file_match = re.search(r'([^\s"\']+(?:test[^\s"\']*\.py|tests?/[^\s"\']+\.py))', test_command) + if test_file_match: + return test_file_match.group(1) + + # For cargo test, return the tests directory + if "cargo test" in test_command: + return "tests/" + + 
return "the existing test file" + + +def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: + """Format patch content for display, limiting length.""" + lines = patch_content.split("\n") + if len(lines) <= max_lines: + return patch_content + + # Take first half and last quarter + first_part = lines[:max_lines // 2] + last_part = lines[-(max_lines // 4):] + + return "\n".join(first_part) + "\n\n... (truncated) ...\n\n" + "\n".join(last_part) + + +GENERATION_PROMPT_TEMPLATE = '''Create a NEW feature that will CONFLICT with an existing feature during git merge. + +## Existing Feature (your feature must conflict with this) + +{feature_description} + +### Code Changes: +{files_summary} + +```diff +{code_snippet} +``` + +## Requirements + +Your new feature must: +1. **Cause merge conflicts** - modify some of the same lines/regions (around lines {hot_lines}) +2. **Be a real enhancement** - not random changes, but a legitimate useful feature +3. **MUST include tests** - write NEW test functions that verify your feature works +4. **Pass all tests** - run tests to verify everything works before submitting + +**IMPORTANT TEST REQUIREMENTS**: +- You MUST write tests before submitting. A feature without tests is incomplete. +- **Add your tests to the SAME test file** that the existing tests use: `{test_file}` +- Do NOT create new test files. Add new test functions/classes to the existing test file. +- This ensures test changes can create merge conflicts with other features' tests. + +Existing tests: `{test_command}` + +You CAN modify other files too, but at least some changes must overlap with the existing feature to create conflicts. + +## Output Format + +**IMPORTANT**: Before submitting, you MUST create a feature description file at `.feature_description.md` in the repo root. + +This description must be **detailed enough that another developer could implement the same feature** without seeing your code. 
Include: +- What the feature does and why it's useful +- The API/interface changes (new parameters, functions, classes) +- Key implementation details (algorithms, data structures, edge cases handled) +- How it integrates with the existing code + +```bash +cat << 'FEATURE_EOF' > .feature_description.md +**Title**: [Descriptive feature title] + +**Description**: [2-3 sentences explaining what the feature does and its purpose] + +**API Changes**: +- [New function/method signatures with parameters] +- [New parameters added to existing functions] +- [New classes or data structures] + +**Implementation Details**: +- [Key algorithms or logic used] +- [How it modifies existing behavior] +- [Edge cases handled] + +**Files Modified**: [List each file and what was changed in it] +FEATURE_EOF +``` + +This file is required for the submission to be valid. Create it right before you submit. + +Start by exploring the modified files to understand the code structure. +''' + + +def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: + """Build the generation prompt for a task, targeting a specific feature. + + Args: + task_dir: Path to the task directory (e.g., dataset/dspy_task/task8394) + feature_id: ID of the specific feature to target for conflicts. + If None, uses the first feature. + + Returns: + The formatted prompt string + """ + task_dir = Path(task_dir) + + # Get existing feature IDs + existing_ids = _get_existing_feature_ids(task_dir) + if not existing_ids: + return "Error: No existing features found in task" + + # Default to first feature if not specified + if feature_id is None: + feature_id = existing_ids[0] + + if feature_id not in existing_ids: + return f"Error: Feature {feature_id} not found. 
Available: {existing_ids}" + + # Get full feature info + feature = _get_feature_info(task_dir, feature_id) + if not feature: + return f"Error: Could not load feature {feature_id}" + + # Format feature description + feature_description = feature["description"] or f"(No description for feature {feature_id})" + + # Format files summary + files_summary = "" + hot_lines = [] + if feature["patch_info"] and feature["patch_info"].get("files"): + for f in feature["patch_info"]["files"]: + files_summary += f"- `{f['path']}` (+{f['added']}/-{f['removed']} lines)\n" + for hunk in f["hunks"]: + start = hunk["source_start"] + end = start + hunk["source_length"] + files_summary += f" - Lines {start}-{end}\n" + hot_lines.extend([start, end]) + + # Format hot lines + if hot_lines: + hot_lines_str = ", ".join(str(l) for l in sorted(set(hot_lines))[:5]) + else: + hot_lines_str = "the modified sections" + + # Get code snippet + code_snippet = "" + if feature["patch_info"] and feature["patch_info"].get("raw"): + code_snippet = _format_code_snippet(feature["patch_info"]["raw"]) + else: + code_snippet = "(patch content not available)" + + # Get test command and extract test file path + test_command = _get_test_command(task_dir) + test_file = _extract_test_file(test_command) + + # Build final prompt + prompt = GENERATION_PROMPT_TEMPLATE.format( + feature_description=feature_description, + files_summary=files_summary or "(no files info)", + code_snippet=code_snippet, + hot_lines=hot_lines_str, + test_command=test_command, + test_file=test_file, + ) + + return prompt + + +def list_features(task_dir: Path) -> list[int]: + """List all feature IDs in a task. 
+ + Args: + task_dir: Path to the task directory + + Returns: + List of feature IDs + """ + return _get_existing_feature_ids(Path(task_dir)) diff --git a/src/cooperbench/generation/splitter.py b/src/cooperbench/generation/splitter.py new file mode 100644 index 0000000..e607225 --- /dev/null +++ b/src/cooperbench/generation/splitter.py @@ -0,0 +1,198 @@ +"""Patch splitting - separate feature changes from test changes.""" + +from unidiff import PatchSet + +# Common patterns for test files across different languages/frameworks +DEFAULT_TEST_PATTERNS = [ + "test_", + "_test.", + "/tests/", + "/test/", + ".test.", + ".spec.", + "_spec.", + "tests.py", + "test.py", + # Rust + "#[cfg(test)]", + "mod tests", +] + +# Files to exclude from patches (agent helper scripts, junk files) +JUNK_FILE_PATTERNS = [ + "fix_", # Helper scripts like fix_parquet.py + "temp_", # Temporary files + "tmp_", + "debug_", + "scratch_", + "helper_", + ".pyc", + "__pycache__", + ".egg-info", +] + + +def split_patch( + patch: str, + test_patterns: list[str] | None = None, +) -> tuple[str, str]: + """Split a patch into feature.patch and tests.patch. + + Args: + patch: The full git diff as a string + test_patterns: List of patterns to identify test files. + Defaults to common test file patterns. + + Returns: + Tuple of (feature_patch, tests_patch) as strings. + Either may be empty if no matching files found. 
+ """ + if test_patterns is None: + test_patterns = DEFAULT_TEST_PATTERNS + + if not patch or not patch.strip(): + return "", "" + + try: + patchset = PatchSet(patch) + except Exception: + # If we can't parse the patch, return it all as feature + return patch, "" + + feature_hunks = [] + test_hunks = [] + + for patched_file in patchset: + path = patched_file.path + + # Skip junk/helper files + if _is_junk_file(path): + continue + + # Check if this is a test file + is_test = _is_test_file(path, test_patterns) + + if is_test: + test_hunks.append(str(patched_file)) + else: + feature_hunks.append(str(patched_file)) + + feature_patch = "\n".join(feature_hunks) if feature_hunks else "" + tests_patch = "\n".join(test_hunks) if test_hunks else "" + + # Ensure patches end with newline (required for git apply) + # Strip first to remove excess whitespace, then add exactly one newline + feature_patch = feature_patch.strip() + "\n" if feature_patch.strip() else "" + tests_patch = tests_patch.strip() + "\n" if tests_patch.strip() else "" + + return feature_patch, tests_patch + + +def _is_junk_file(path: str) -> bool: + """Check if a file should be excluded from patches.""" + path_lower = path.lower() + filename = path.split("/")[-1].lower() + + # Check filename patterns + for pattern in JUNK_FILE_PATTERNS: + if filename.startswith(pattern) or pattern in path_lower: + return True + + # Exclude root-level Python scripts that aren't in src/ or proper package structure + # These are usually helper scripts the agent created + if "/" not in path and path.endswith(".py"): + return True + + return False + + +def _is_test_file(path: str, patterns: list[str]) -> bool: + """Check if a file path matches test file patterns.""" + path_lower = path.lower() + + for pattern in patterns: + if pattern.lower() in path_lower: + return True + + return False + + +def extract_feature_description(agent_output: str) -> str | None: + """Extract the feature.md content from agent's output. 
+ + The agent is instructed to output the feature description in a specific + markdown format. This function extracts that content. + + Args: + agent_output: The full agent conversation/output + + Returns: + The extracted feature description, or None if not found. + """ + # Look for the feature description block + # The agent outputs it in markdown format starting with **Title**: + + # Only match structured feature description markers, not bash comments + markers = [ + "**Title**:", + "**Title:**", # Without space variant + "# Feature:", + "## Feature", + ] + + # Find the start of the feature description + start_idx = -1 + for marker in markers: + idx = agent_output.find(marker) + if idx != -1: + if start_idx == -1 or idx < start_idx: + start_idx = idx + + if start_idx == -1: + return None + + # Extract from the marker to end of that block + # Look for common end markers or take until end + content = agent_output[start_idx:] + + # Try to find where the description ends + # Usually followed by code blocks or action outputs + end_markers = [ + "\n```bash", + "\n```python", + "\n", + "\n## Steps", + "\nBegin by", + ] + + end_idx = len(content) + for marker in end_markers: + idx = content.find(marker) + if idx != -1 and idx < end_idx: + end_idx = idx + + description = content[:end_idx].strip() + + # Clean up any markdown code block wrappers + if description.startswith("```markdown"): + description = description[len("```markdown"):].strip() + if description.startswith("```"): + description = description[3:].strip() + if description.endswith("```"): + description = description[:-3].strip() + + # Remove agent submission markers + cleanup_patterns = [ + "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT", + "SUBMIT_FINAL_OUTPUT", + "END_OF_FEATURE", + "", + "", + ] + for pattern in cleanup_patterns: + description = description.replace(pattern, "").strip() + + # Remove trailing whitespace on each line + description = "\n".join(line.rstrip() for line in description.split("\n")) + + 
"""Validation - check conflicts and test results using Modal sandboxes."""

import re
from pathlib import Path

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name
# NOTE(review): importing private helpers from another module; consider
# promoting _parse_results/_write_patch to a public API.
from cooperbench.eval.sandbox import _parse_results, _write_patch


def _extract_feature_title(feature_md_path: Path) -> str | None:
    """Extract title from a feature.md file."""
    if not feature_md_path.exists():
        return None

    content = feature_md_path.read_text()
    # Look for **Title**: pattern (first match wins; rest of line is title).
    match = re.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", content)
    if match:
        return match.group(1).strip()
    return None


def check_conflicts_in_sandbox(
    repo_name: str,
    task_id: int,
    new_feature_patch: str,
    existing_feature_ids: list[int],
    timeout: int = 300,
    backend: str = "modal",
) -> dict:
    """Check which existing features conflict with a new feature.

    Runs inside a Modal sandbox with the task's Docker image.

    Args:
        repo_name: Repository name (e.g., "dspy_task")
        task_id: Task ID
        new_feature_patch: The new feature patch as a string
        existing_feature_ids: List of existing feature IDs to check against
        timeout: Sandbox timeout in seconds
        backend: Execution backend ("modal", "docker")

    Returns:
        Dict with:
        - conflicts: list[int] - feature IDs that conflict
        - clean: list[int] - feature IDs that merge cleanly
        - errors: list[str] - any errors encountered
        - output: str - raw output from sandbox
    """
    task_dir = Path("dataset") / repo_name / f"task{task_id}"

    if not task_dir.exists():
        return {"conflicts": [], "clean": [], "errors": [f"Task dir not found: {task_dir}"], "output": ""}

    image = get_image_name(repo_name, task_id)
    eval_backend = get_backend(backend)
    sb = eval_backend.create_sandbox(image, timeout)

    try:
        # Write the new patch to sandbox
        _write_patch(sb, "new_feature.patch", new_feature_patch)

        # Write existing feature patches (features without a feature.patch
        # are silently skipped and will surface as apply errors in-script).
        for fid in existing_feature_ids:
            feature_patch_path = task_dir / f"feature{fid}" / "feature.patch"
            if feature_patch_path.exists():
                content = feature_patch_path.read_text()
                _write_patch(sb, f"feature{fid}.patch", content)

        # Run conflict checking script
        # NOTE(review): feature_ids_str is unused - dead code.
        feature_ids_str = " ".join(str(fid) for fid in existing_feature_ids)
        script = _build_conflict_check_script(existing_feature_ids)

        result = sb.exec("bash", "-c", script)
        # stdout and stderr are concatenated; the markers below are parsed
        # line-by-line from this combined stream.
        output = result.stdout_read() + result.stderr_read()

        # Parse results and collect feature info
        conflicts = []
        conflicts_info = []
        clean = []
        errors = []

        for line in output.split("\n"):
            if line.startswith("CONFLICT:"):
                fid = int(line.split(":")[1].strip())
                conflicts.append(fid)
                # Get feature title for human-readable reporting.
                feature_md_path = task_dir / f"feature{fid}" / "feature.md"
                title = _extract_feature_title(feature_md_path)
                conflicts_info.append({
                    "id": fid,
                    "title": title or f"Feature {fid}",
                })
            elif line.startswith("CLEAN:"):
                fid = int(line.split(":")[1].strip())
                clean.append(fid)
            elif line.startswith("ERROR:"):
                errors.append(line)

        return {
            "conflicts": conflicts,
            "conflicts_info": conflicts_info,
            "clean": clean,
            "errors": errors,
            "output": output,
        }

    except Exception as e:
        # Best-effort: any sandbox failure is reported, never raised.
        return {"conflicts": [], "clean": [], "errors": [str(e)], "output": ""}
    finally:
        sb.terminate()


def run_tests_in_sandbox(
    repo_name: str,
    task_id: int,
    feature_patch: str,
    tests_patch: str,
    timeout: int = 600,
    backend: str = "modal",
) -> dict:
    """Run the NEW tests for a generated feature in a Modal sandbox.

    Uses runner.sh which handles task-specific environment setup (deps, etc).
    Passes the new test files as the 3rd param (requires updated runner.sh).

    Args:
        repo_name: Repository name
        task_id: Task ID
        feature_patch: The feature implementation patch
        tests_patch: The tests patch
        timeout: Sandbox timeout
        backend: Execution backend

    Returns:
        Dict with: passed, tests_passed, tests_failed, output, error
    """
    import logging
    logger = logging.getLogger(__name__)

    image = get_image_name(repo_name, task_id)
    logger.debug(f"Creating sandbox with image: {image}")
    eval_backend = get_backend(backend)
    sb = eval_backend.create_sandbox(image, timeout)
    logger.debug("Sandbox created successfully")

    try:
        # Write patches to /patches/ directory
        logger.debug(f"Writing tests.patch ({len(tests_patch)} bytes)")
        _write_patch(sb, "tests.patch", tests_patch)
        logger.debug(f"Writing feature.patch ({len(feature_patch)} bytes)")
        _write_patch(sb, "feature.patch", feature_patch)

        # Extract NEW test function names from the patch (not just files)
        # This ensures we only run tests added by the agent, not pre-existing tests
        new_test_specs = _extract_new_test_functions(tests_patch)
        test_path = " ".join(new_test_specs) if new_test_specs else ""
        logger.debug(f"New test functions to run: {test_path}")

        # Use runner.sh with: tests.patch feature.patch [test_path]
        # - Old images: 3rd param ignored, runs default tests
        # - New images: runs the specific new test files
        if test_path:
            logger.debug(f"Running: bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}")
            result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch", test_path)
        else:
            logger.debug("Running: bash /usr/local/bin/runner.sh tests.patch feature.patch")
            result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch")
        logger.debug(f"Runner completed with exit code: {result.returncode}")

        output = result.stdout_read() + result.stderr_read()
        exit_code = result.returncode

        # Parse test results (reuse existing parser that handles pytest, go, cargo, jest)
        parsed = _parse_results(output)

        return {
            # "passed" requires at least one test to have actually run;
            # a clean exit with zero tests still counts as a failure.
            "passed": exit_code == 0 and parsed["passed"] > 0,
            "tests_passed": parsed["passed"],
            "tests_failed": parsed["failed"],
            "output": output,
            "error": None,
        }
    except Exception as e:
        return {
            "passed": False,
            "tests_passed": 0,
            "tests_failed": 0,
            "output": "",
            "error": str(e),
        }
    finally:
        sb.terminate()


def _extract_test_files_from_patch(patch: str) -> list[str]:
    """Extract new/modified file paths from a patch ("+++ b/..." headers)."""
    import re
    files = []
    for match in re.finditer(r"^\+\+\+ b/(.+)$", patch, re.MULTILINE):
        path = match.group(1)
        if path and not path.startswith("/dev/null"):
            files.append(path)
    return files


def _extract_new_test_functions(patch: str) -> list[str]:
    """Extract new test function names from a patch with their file paths.

    Returns paths in pytest format: path/to/test.py::test_function_name
    """
    import re

    test_specs = []
    current_file = None

    for line in patch.split("\n"):
        # Track which file we're in
        if line.startswith("+++ b/"):
            current_file = line[6:]  # Remove "+++ b/" prefix
        # Find new test function definitions (lines starting with +def test_)
        # NOTE(review): only matches module-level "+def test_" - test methods
        # added inside classes (indented defs) are not collected; presumably
        # fine for the targeted repos - confirm.
        elif line.startswith("+def test_") and current_file:
            # Extract function name: +def test_foo(args): -> test_foo
            match = re.match(r"\+def (test_\w+)\s*\(", line)
            if match:
                func_name = match.group(1)
                test_specs.append(f"{current_file}::{func_name}")

    return test_specs


def validate_generated_feature(
    repo_name: str,
    task_id: int,
    feature_patch: str,
    tests_patch: str,
    min_conflicts: int = 1,
    timeout: int = 600,
    backend: str = "modal",
) -> dict:
    """Full validation of a generated feature.

    Checks:
    1. Tests pass
    2. Conflicts with at least min_conflicts existing features

    Args:
        repo_name: Repository name
        task_id: Task ID
        feature_patch: The feature implementation patch
        tests_patch: The tests patch
        min_conflicts: Minimum required conflicts (default: 1)
        timeout: Sandbox timeout
        backend: Execution backend

    Returns:
        Dict with validation results
    """
    task_dir = Path("dataset") / repo_name / f"task{task_id}"

    # Get existing feature IDs
    existing_ids = _get_existing_feature_ids(task_dir)

    # Step 1: Run tests
    test_result = run_tests_in_sandbox(
        repo_name=repo_name,
        task_id=task_id,
        feature_patch=feature_patch,
        tests_patch=tests_patch,
        timeout=timeout,
        backend=backend,
    )

    if not test_result["passed"]:
        return {
            "valid": False,
            "reason": "tests_failed",
            "test_result": test_result,
            "conflict_result": None,
        }

    # Step 2: Check conflicts (only the implementation patch is merged;
    # test-file conflicts are not considered here).
    conflict_result = check_conflicts_in_sandbox(
        repo_name=repo_name,
        task_id=task_id,
        new_feature_patch=feature_patch,
        existing_feature_ids=existing_ids,
        timeout=timeout,
        backend=backend,
    )

    num_conflicts = len(conflict_result["conflicts"])

    if num_conflicts < min_conflicts:
        return {
            "valid": False,
            "reason": f"insufficient_conflicts ({num_conflicts} < {min_conflicts})",
            "test_result": test_result,
            "conflict_result": conflict_result,
        }

    return {
        "valid": True,
        "reason": None,
        "test_result": test_result,
        "conflict_result": conflict_result,
    }


# NOTE(review): the generated script inlines one check per feature
# sequentially; the `continue` statements inside it sit outside any bash
# loop, so on a failed `git apply` bash only warns and falls through to the
# remaining steps for that feature, which can emit a bogus CLEAN/CONFLICT
# line. Consider wrapping each check in a function or for-loop.
def _build_conflict_check_script(feature_ids: list[int]) -> str:
    """Build bash script for checking REAL git merge conflicts.

    For each existing feature:
    1. Create branch A from base, apply existing feature, commit
    2. Create branch B from base, apply new feature, commit
    3. Try git merge --no-commit from A
    4. Check if merge has conflicts (git merge --abort needed)
    """
    feature_checks = "\n".join(f'''
# Check feature {fid} for REAL merge conflicts
echo "Checking feature {fid}..."
git checkout --quiet $BASE_SHA
git clean -fd >/dev/null 2>&1

# Branch A: existing feature {fid}
git checkout --quiet -b __existing_{fid}
if ! git apply /patches/feature{fid}.patch 2>/dev/null; then
    echo "ERROR:feature{fid} patch failed to apply"
    git checkout --quiet $BASE_SHA 2>/dev/null || true
    git branch -D __existing_{fid} 2>/dev/null || true
    continue
fi
git add -A
git commit -qm "existing feature{fid}" --allow-empty

# Branch B: new feature (from base)
git checkout --quiet $BASE_SHA
git checkout --quiet -b __new_{fid}
if ! git apply /patches/new_feature.patch 2>/dev/null; then
    echo "ERROR:new_feature patch failed to apply for check {fid}"
    git checkout --quiet $BASE_SHA 2>/dev/null || true
    git branch -D __existing_{fid} 2>/dev/null || true
    git branch -D __new_{fid} 2>/dev/null || true
    continue
fi
git add -A
git commit -qm "new feature" --allow-empty

# Try to merge existing feature into new feature branch
# --no-commit so we can check for conflicts without auto-commit
if git merge --no-commit --no-ff __existing_{fid} 2>/dev/null; then
    # Merge succeeded cleanly
    echo "CLEAN:{fid}"
    git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1
else
    # Merge has conflicts!
    echo "CONFLICT:{fid}"
    git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1
fi

# Cleanup branches
git checkout --quiet $BASE_SHA 2>/dev/null || true
git branch -D __existing_{fid} 2>/dev/null || true
git branch -D __new_{fid} 2>/dev/null || true
''' for fid in feature_ids)

    return f'''
cd /workspace/repo

# Get base commit
BASE_SHA=$(git rev-parse HEAD)

# Ensure clean state
git reset --hard HEAD >/dev/null 2>&1
git clean -fd >/dev/null 2>&1

# Configure git for commits
git config user.email "test@test.com" 2>/dev/null || true
git config user.name "Test" 2>/dev/null || true

{feature_checks}

# Final cleanup
git checkout --quiet $BASE_SHA 2>/dev/null || true
git reset --hard HEAD >/dev/null 2>&1
'''


# NOTE(review): duplicated in generation/prompt.py - consider a shared helper.
def _get_existing_feature_ids(task_dir: Path) -> list[int]:
    """Get IDs of existing features in a task."""
    ids = []
    for d in task_dir.iterdir():
        if d.is_dir() and d.name.startswith("feature"):
            try:
                fid = int(d.name.replace("feature", ""))
                ids.append(fid)
            except ValueError:
                pass
    return sorted(ids)

From d143a83bdfbf7475b20ebbfef27edab8cbc6d6a1 Mon Sep 17 00:00:00 2001
From: Arpandeep Khatua
Date: Sun, 1 Feb 2026 16:08:22 -0800
Subject: [PATCH 2/3] Capture the conflict

---
 src/cooperbench/generation/generator.py |  2 +-
src/cooperbench/generation/validator.py | 62 +++++++++++++++++++------ 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py index ef8da3b..01f3c60 100644 --- a/src/cooperbench/generation/generator.py +++ b/src/cooperbench/generation/generator.py @@ -72,7 +72,7 @@ class GenerationResult: feature_patch: str | None = None tests_patch: str | None = None conflicts: list[int] = field(default_factory=list) - conflicts_info: list[dict] = field(default_factory=list) # [{id, title}, ...] + conflicts_info: list[dict] = field(default_factory=list) # [{id, title, conflict_diff}, ...] errors: list[str] = field(default_factory=list) agent_cost: float = 0.0 agent_steps: int = 0 diff --git a/src/cooperbench/generation/validator.py b/src/cooperbench/generation/validator.py index 4ff40fd..52bc813 100644 --- a/src/cooperbench/generation/validator.py +++ b/src/cooperbench/generation/validator.py @@ -61,6 +61,13 @@ def check_conflicts_in_sandbox( # Write the new patch to sandbox _write_patch(sb, "new_feature.patch", new_feature_patch) + # Extract feature titles for commit messages + feature_titles = {} + for fid in existing_feature_ids: + feature_md_path = task_dir / f"feature{fid}" / "feature.md" + title = _extract_feature_title(feature_md_path) + feature_titles[fid] = title or f"Feature {fid}" + # Write existing feature patches for fid in existing_feature_ids: feature_patch_path = task_dir / f"feature{fid}" / "feature.patch" @@ -69,8 +76,7 @@ def check_conflicts_in_sandbox( _write_patch(sb, f"feature{fid}.patch", content) # Run conflict checking script - feature_ids_str = " ".join(str(fid) for fid in existing_feature_ids) - script = _build_conflict_check_script(existing_feature_ids) + script = _build_conflict_check_script(existing_feature_ids, feature_titles) result = sb.exec("bash", "-c", script) output = result.stdout_read() + result.stderr_read() @@ -81,22 +87,34 @@ def 
check_conflicts_in_sandbox( clean = [] errors = [] - for line in output.split("\n"): - if line.startswith("CONFLICT:"): + # Parse output line by line, capturing conflict content + lines = output.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + if line.startswith("CONFLICT_START:"): + # Format: CONFLICT_START:fid then content until CONFLICT_END:fid fid = int(line.split(":")[1].strip()) + conflict_content = [] + i += 1 + while i < len(lines) and not lines[i].startswith(f"CONFLICT_END:{fid}"): + conflict_content.append(lines[i]) + i += 1 conflicts.append(fid) - # Get feature title - feature_md_path = task_dir / f"feature{fid}" / "feature.md" - title = _extract_feature_title(feature_md_path) + # Get title from feature_titles we extracted earlier + title = feature_titles.get(fid, f"Feature {fid}") conflicts_info.append({ "id": fid, - "title": title or f"Feature {fid}", + "title": title, + "conflict_diff": "\n".join(conflict_content), }) elif line.startswith("CLEAN:"): + # Format: CLEAN:fid fid = int(line.split(":")[1].strip()) clean.append(fid) elif line.startswith("ERROR:"): errors.append(line) + i += 1 return { "conflicts": conflicts, @@ -308,7 +326,7 @@ def validate_generated_feature( } -def _build_conflict_check_script(feature_ids: list[int]) -> str: +def _build_conflict_check_script(feature_ids: list[int], feature_titles: dict[int, str]) -> str: """Build bash script for checking REAL git merge conflicts. For each existing feature: @@ -317,7 +335,10 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: 3. Try git merge --no-commit from A 4. Check if merge has conflicts (git merge --abort needed) """ - feature_checks = "\n".join(f''' + def _build_feature_check(fid: int, title: str) -> str: + # Escape title for shell - replace : with space to avoid parsing issues + safe_title = title.replace(":", " -").replace("'", "\\'").replace('"', '\\"') + return f''' # Check feature {fid} for REAL merge conflicts echo "Checking feature {fid}..." 
git checkout --quiet $BASE_SHA @@ -332,7 +353,7 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: continue fi git add -A -git commit -qm "existing feature{fid}" --allow-empty +git commit -qm "{safe_title}" --allow-empty # Branch B: new feature (from base) git checkout --quiet $BASE_SHA @@ -352,10 +373,16 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: if git merge --no-commit --no-ff __existing_{fid} 2>/dev/null; then # Merge succeeded cleanly echo "CLEAN:{fid}" - git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1 + git reset --hard HEAD >/dev/null 2>&1 else - # Merge has conflicts! - echo "CONFLICT:{fid}" + # Merge has conflicts! Capture the actual conflict content + echo "CONFLICT_START:{fid}" + # Show files with conflict markers (<<<<<<< ======= >>>>>>>) + for f in $(git diff --name-only --diff-filter=U 2>/dev/null); do + echo "--- $f ---" + cat "$f" | grep -A 50 -B 5 "<<<<<<" | head -100 + done + echo "CONFLICT_END:{fid}" git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1 fi @@ -363,7 +390,12 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: git checkout --quiet $BASE_SHA 2>/dev/null || true git branch -D __existing_{fid} 2>/dev/null || true git branch -D __new_{fid} 2>/dev/null || true -''' for fid in feature_ids) +''' + + feature_checks = "\n".join( + _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) + for fid in feature_ids + ) return f''' cd /workspace/repo From 00b82e5cfeb0f939a1e6e7de784c0f482f49ce08 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Sun, 1 Feb 2026 16:27:37 -0800 Subject: [PATCH 3/3] Fixing git issues --- pyproject.toml | 1 + .../agents/mini_swe_agent/adapter.py | 12 +++- src/cooperbench/generation/__main__.py | 30 ++++++--- src/cooperbench/generation/generator.py | 14 ++-- src/cooperbench/generation/prompt.py | 67 +++++++++++++++---- src/cooperbench/generation/splitter.py | 6 +- src/cooperbench/generation/validator.py | 
32 +++++---- 7 files changed, 116 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3e5f50..c48ecec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ exclude = [ [tool.hatch.build.targets.wheel] packages = ["src/cooperbench"] +exclude = ["src/cooperbench/generation"] [tool.ruff] line-length = 120 diff --git a/src/cooperbench/agents/mini_swe_agent/adapter.py b/src/cooperbench/agents/mini_swe_agent/adapter.py index 6a21024..641c413 100644 --- a/src/cooperbench/agents/mini_swe_agent/adapter.py +++ b/src/cooperbench/agents/mini_swe_agent/adapter.py @@ -180,10 +180,16 @@ def run( def _get_patch(self, env: "ModalEnvironment | DockerEnvironment", base_commit: str) -> str: """Extract git diff from base commit to current working tree state.""" try: - # Stage all changes (including new untracked files) so they appear in diff + # Stage all changes (including new untracked files) env.execute("git add -A", timeout=10) - # Diff from base commit to staged changes (includes new files) - result = env.execute(f"git diff --cached {base_commit}", timeout=30) + # Configure git identity (required for commit in fresh sandbox environments) + env.execute("git config user.email 'agent@cooperbench.local'", timeout=10) + env.execute("git config user.name 'CooperBench Agent'", timeout=10) + # Commit everything so committed + staged + unstaged changes are all in HEAD + # This ensures we capture changes even if the agent made commits + env.execute("git commit --allow-empty -m 'Agent changes'", timeout=10) + # Diff from base commit to HEAD captures all changes + result = env.execute(f"git diff {base_commit} HEAD", timeout=30) return result.get("output", "").strip() except Exception: return "" diff --git a/src/cooperbench/generation/__main__.py b/src/cooperbench/generation/__main__.py index 00f7202..1af4932 100644 --- a/src/cooperbench/generation/__main__.py +++ b/src/cooperbench/generation/__main__.py @@ -111,7 +111,7 @@ def main(): # Parse 
repo_name and task_id from path parts = args.task.split("/") if len(parts) != 2: - logger.error(f"Invalid task path format. Expected: repo_name/taskID (e.g., dspy_task/task8394)") + logger.error("Invalid task path format. Expected: repo_name/taskID (e.g., dspy_task/task8394)") sys.exit(1) repo_name = parts[0] @@ -120,6 +120,7 @@ def main(): # Mode 0: List features if args.list_features: from cooperbench.generation.prompt import list_features + features = list_features(task_dir) print(f"Features in {args.task}: {features}") return @@ -127,6 +128,7 @@ def main(): # Mode 1: Prompt only if args.prompt_only: from cooperbench.generation.prompt import build_prompt + prompt = build_prompt(task_dir, feature_id=args.feature) print(prompt) return @@ -152,10 +154,11 @@ def main(): sys.exit(0 if result["valid"] else 1) # Mode 3: Generate features - from cooperbench.generation.generator import generate_feature, generate_features_batch import hashlib import re as re_module + from cooperbench.generation.generator import generate_feature, generate_features_batch + def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) -> Path: """Create output directory named after the feature title + short hash.""" # Extract title from feature_md if available @@ -185,6 +188,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Generate first (to temp location for trajectory) import tempfile + temp_dir = Path(tempfile.mkdtemp()) result = generate_feature( @@ -200,12 +204,14 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Create final output dir based on feature title from datetime import datetime + fallback = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = make_output_dir(base_output_dir, result.feature_md, fallback) logger.info(f"Output directory: {output_dir}") # Move trajectory files from temp to final import shutil + for f in temp_dir.glob("trajectory_*"): shutil.move(str(f), output_dir / 
f.name) @@ -213,31 +219,33 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) if result.feature_patch: patch = result.feature_patch.rstrip() + "\n" (output_dir / "feature.patch").write_text(patch) - logger.info(f"Saved feature.patch") + logger.info("Saved feature.patch") if result.tests_patch: patch = result.tests_patch.rstrip() + "\n" (output_dir / "tests.patch").write_text(patch) - logger.info(f"Saved tests.patch") + logger.info("Saved tests.patch") if result.feature_md: (output_dir / "feature.md").write_text(result.feature_md) - logger.info(f"Saved feature.md") + logger.info("Saved feature.md") # Save full result as JSON (output_dir / "result.json").write_text(json.dumps(result.to_dict(), indent=2, default=str)) - logger.info(f"Saved result.json") + logger.info("Saved result.json") # Print summary - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Result: {'SUCCESS' if result.success else 'FAILED'}") print(f"Output saved to: {output_dir}") if result.errors: print(f"Errors: {result.errors}") print(f"Agent: {result.agent_steps} steps, ${result.agent_cost:.4f}") - print(f"{'='*60}") + print(f"{'=' * 60}") sys.exit(0 if result.success else 1) else: - logger.info(f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})") + logger.info( + f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})" + ) logger.info(f"Output directory: {base_output_dir}") results = generate_features_batch( @@ -254,7 +262,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Summary successful = sum(1 for r in results if r.success) - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Summary: {successful}/{args.attempts} successful") print(f"Output saved to: {base_output_dir}") @@ -262,7 +270,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) status = "✓" if r.success else "✗" print(f" 
{status} Attempt {i}: {r.errors or 'OK'}") - print(f"{'='*60}") + print(f"{'=' * 60}") sys.exit(0 if successful > 0 else 1) diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py index 01f3c60..c04befe 100644 --- a/src/cooperbench/generation/generator.py +++ b/src/cooperbench/generation/generator.py @@ -197,6 +197,7 @@ def generate_feature( # Save full trajectory to file if debug mode or output_dir specified if debug or output_dir: import re as re_module + save_dir = output_dir or Path(".") save_dir.mkdir(parents=True, exist_ok=True) @@ -217,7 +218,7 @@ def generate_feature( # Also save a human-readable version readable_file = save_dir / f"trajectory_{repo_name}_{task_id}.txt" with open(readable_file, "w") as f: - f.write(f"=== Agent Trajectory ===\n") + f.write("=== Agent Trajectory ===\n") f.write(f"Task: {repo_name}/task{task_id}\n") f.write(f"Model: {model_name}\n") f.write(f"Steps: {agent_steps}, Cost: ${agent_cost:.4f}\n") @@ -226,15 +227,16 @@ def generate_feature( for i, msg in enumerate(result.messages): role = msg.get("role", "?").upper() content = msg.get("content", "") - f.write(f"\n{'='*60}\n") + f.write(f"\n{'=' * 60}\n") f.write(f"[{i}] {role}\n") - f.write(f"{'='*60}\n") + f.write(f"{'=' * 60}\n") f.write(content) f.write("\n") logger.info(f"Saved readable trajectory to: {readable_file}") # Log summary to console import re as re_module + for i, msg in enumerate(result.messages): role = msg.get("role", "?") content = msg.get("content", "")[:500] @@ -340,7 +342,9 @@ def generate_feature( errors.append(f"Test error: {test_result['error']}") if not tests_passed: - errors.append(f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed") + errors.append( + f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed" + ) logger.warning(f"Tests failed: {test_result['tests_failed']} failed") else: logger.info(f"Tests passed: {test_result['tests_passed']} 
passed") @@ -433,8 +437,8 @@ def generate_features_batch( """ import hashlib import re as re_module - import tempfile import shutil + import tempfile def make_feature_dir(base_dir: Path, feature_md: str | None, attempt_num: int) -> Path: """Create output directory named after the feature title + short hash.""" diff --git a/src/cooperbench/generation/prompt.py b/src/cooperbench/generation/prompt.py index c8d5fff..32b17db 100644 --- a/src/cooperbench/generation/prompt.py +++ b/src/cooperbench/generation/prompt.py @@ -22,12 +22,14 @@ def _extract_patch_info(patch_path: Path) -> dict: "hunks": [], } for hunk in patched_file: - file_info["hunks"].append({ - "source_start": hunk.source_start, - "source_length": hunk.source_length, - "target_start": hunk.target_start, - "target_length": hunk.target_length, - }) + file_info["hunks"].append( + { + "source_start": hunk.source_start, + "source_length": hunk.source_length, + "target_start": hunk.target_start, + "target_length": hunk.target_length, + } + ) files_info.append(file_info) return {"files": files_info, "raw": content} @@ -41,6 +43,23 @@ def _read_feature_md(feature_dir: Path) -> str: return "" +def _extract_feature_title(feature_md_content: str) -> str | None: + """Extract just the title from feature.md content.""" + import re + + # Look for **Title**: pattern + match = re.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md_content) + if match: + return match.group(1).strip() + + # Fallback: look for # Title or ## Title + match = re.search(r"^#+ (.+?)$", feature_md_content, re.MULTILINE) + if match: + return match.group(1).strip() + + return None + + def _get_feature_info(task_dir: Path, feature_id: int) -> dict | None: """Get full information about a specific feature.""" feature_dir = task_dir / f"feature{feature_id}" @@ -150,13 +169,13 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: return patch_content # Take first half and last quarter - first_part = lines[:max_lines // 2] - 
last_part = lines[-(max_lines // 4):] + first_part = lines[: max_lines // 2] + last_part = lines[-(max_lines // 4) :] return "\n".join(first_part) + "\n\n... (truncated) ...\n\n" + "\n".join(last_part) -GENERATION_PROMPT_TEMPLATE = '''Create a NEW feature that will CONFLICT with an existing feature during git merge. +GENERATION_PROMPT_TEMPLATE = """Create a NEW feature that will CONFLICT with an existing feature during git merge. ## Existing Feature (your feature must conflict with this) @@ -168,7 +187,7 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: ```diff {code_snippet} ``` - +{other_features_section} ## Requirements Your new feature must: @@ -220,7 +239,7 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: This file is required for the submission to be valid. Create it right before you submit. Start by exploring the modified files to understand the code structure. -''' +""" def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: @@ -270,7 +289,7 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: # Format hot lines if hot_lines: - hot_lines_str = ", ".join(str(l) for l in sorted(set(hot_lines))[:5]) + hot_lines_str = ", ".join(str(line) for line in sorted(set(hot_lines))[:5]) else: hot_lines_str = "the modified sections" @@ -285,6 +304,29 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: test_command = _get_test_command(task_dir) test_file = _extract_test_file(test_command) + # Collect titles from other features (to avoid duplicates) + other_features_section = "" + other_ids = [fid for fid in existing_ids if fid != feature_id] + if other_ids: + other_titles = [] + for fid in other_ids: + feature_dir = task_dir / f"feature{fid}" + md_content = _read_feature_md(feature_dir) + title = _extract_feature_title(md_content) if md_content else None + if title: + other_titles.append(f'- Feature {fid}: "{title}"') + else: + other_titles.append(f"- Feature 
{fid}: (no title)") + + if other_titles: + other_features_section = ( + "\n## Other Existing Features\n\n" + "The following features already exist in this task. " + "Make sure your proposed feature is different but compatible with these. \n" + + "\n".join(other_titles) + + "\n" + ) + # Build final prompt prompt = GENERATION_PROMPT_TEMPLATE.format( feature_description=feature_description, @@ -293,6 +335,7 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: hot_lines=hot_lines_str, test_command=test_command, test_file=test_file, + other_features_section=other_features_section, ) return prompt diff --git a/src/cooperbench/generation/splitter.py b/src/cooperbench/generation/splitter.py index e607225..6c95042 100644 --- a/src/cooperbench/generation/splitter.py +++ b/src/cooperbench/generation/splitter.py @@ -20,8 +20,8 @@ # Files to exclude from patches (agent helper scripts, junk files) JUNK_FILE_PATTERNS = [ - "fix_", # Helper scripts like fix_parquet.py - "temp_", # Temporary files + "fix_", # Helper scripts like fix_parquet.py + "temp_", # Temporary files "tmp_", "debug_", "scratch_", @@ -175,7 +175,7 @@ def extract_feature_description(agent_output: str) -> str | None: # Clean up any markdown code block wrappers if description.startswith("```markdown"): - description = description[len("```markdown"):].strip() + description = description[len("```markdown") :].strip() if description.startswith("```"): description = description[3:].strip() if description.endswith("```"): diff --git a/src/cooperbench/generation/validator.py b/src/cooperbench/generation/validator.py index 52bc813..5016337 100644 --- a/src/cooperbench/generation/validator.py +++ b/src/cooperbench/generation/validator.py @@ -4,8 +4,8 @@ from pathlib import Path from cooperbench.eval.backends import get_backend -from cooperbench.utils import get_image_name from cooperbench.eval.sandbox import _parse_results, _write_patch +from cooperbench.utils import get_image_name def 
_extract_feature_title(feature_md_path: Path) -> str | None: @@ -103,11 +103,13 @@ def check_conflicts_in_sandbox( conflicts.append(fid) # Get title from feature_titles we extracted earlier title = feature_titles.get(fid, f"Feature {fid}") - conflicts_info.append({ - "id": fid, - "title": title, - "conflict_diff": "\n".join(conflict_content), - }) + conflicts_info.append( + { + "id": fid, + "title": title, + "conflict_diff": "\n".join(conflict_content), + } + ) elif line.startswith("CLEAN:"): # Format: CLEAN:fid fid = int(line.split(":")[1].strip()) @@ -155,6 +157,7 @@ def run_tests_in_sandbox( Dict with: passed, tests_passed, tests_failed, output, error """ import logging + logger = logging.getLogger(__name__) image = get_image_name(repo_name, task_id) @@ -179,9 +182,13 @@ def run_tests_in_sandbox( # Use runner.sh with: tests.patch feature.patch [test_path] # - Old images: 3rd param ignored, runs default tests # - New images: runs the specific new test files + # NOTE: We use bash -c to allow shell word splitting on test_path. + # Passing test_path directly as an argument would treat the space-separated + # specs as a single argument, which pytest can't parse. 
if test_path: - logger.debug(f"Running: bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}") - result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch", test_path) + cmd = f"bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}" + logger.debug(f"Running: {cmd}") + result = sb.exec("bash", "-c", cmd) else: logger.debug("Running: bash /usr/local/bin/runner.sh tests.patch feature.patch") result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch") @@ -215,6 +222,7 @@ def run_tests_in_sandbox( def _extract_test_files_from_patch(patch: str) -> list[str]: """Extract new/modified file paths from a patch.""" import re + files = [] for match in re.finditer(r"^\+\+\+ b/(.+)$", patch, re.MULTILINE): path = match.group(1) @@ -335,6 +343,7 @@ def _build_conflict_check_script(feature_ids: list[int], feature_titles: dict[in 3. Try git merge --no-commit from A 4. Check if merge has conflicts (git merge --abort needed) """ + def _build_feature_check(fid: int, title: str) -> str: # Escape title for shell - replace : with space to avoid parsing issues safe_title = title.replace(":", " -").replace("'", "\\'").replace('"', '\\"') @@ -393,11 +402,10 @@ def _build_feature_check(fid: int, title: str) -> str: ''' feature_checks = "\n".join( - _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) - for fid in feature_ids + _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) for fid in feature_ids ) - return f''' + return f""" cd /workspace/repo # Get base commit @@ -416,7 +424,7 @@ def _build_feature_check(fid: int, title: str) -> str: # Final cleanup git checkout --quiet $BASE_SHA 2>/dev/null || true git reset --hard HEAD >/dev/null 2>&1 -''' +""" def _get_existing_feature_ids(task_dir: Path) -> list[int]: