From a781eefbb8a09c7df3d5d6c9b94828f4a8908840 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Sun, 1 Feb 2026 16:00:25 -0800 Subject: [PATCH 1/3] Adding data gen using llm --- .gitignore | 1 + .../task7309/runner.sh | 4 +- .../agents/mini_swe_agent/adapter.py | 16 +- src/cooperbench/generation/README.md | 131 +++++ src/cooperbench/generation/__init__.py | 17 + src/cooperbench/generation/__main__.py | 270 +++++++++ src/cooperbench/generation/generator.py | 512 ++++++++++++++++++ src/cooperbench/generation/prompt.py | 310 +++++++++++ src/cooperbench/generation/splitter.py | 198 +++++++ src/cooperbench/generation/validator.py | 400 ++++++++++++++ 10 files changed, 1852 insertions(+), 7 deletions(-) create mode 100644 src/cooperbench/generation/README.md create mode 100644 src/cooperbench/generation/__init__.py create mode 100644 src/cooperbench/generation/__main__.py create mode 100644 src/cooperbench/generation/generator.py create mode 100644 src/cooperbench/generation/prompt.py create mode 100644 src/cooperbench/generation/splitter.py create mode 100644 src/cooperbench/generation/validator.py diff --git a/.gitignore b/.gitignore index 9e70e61..eebb634 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ site/ logs*/ *.log cooperbench_results.xlsx +generated/ # Cache .cooperbench_cache/ diff --git a/dataset/huggingface_datasets_task/task7309/runner.sh b/dataset/huggingface_datasets_task/task7309/runner.sh index d77750b..e1cff66 100644 --- a/dataset/huggingface_datasets_task/task7309/runner.sh +++ b/dataset/huggingface_datasets_task/task7309/runner.sh @@ -17,10 +17,10 @@ trap cleanup EXIT INT TERM # Get input params TEST_PATCH="$1" FEATURE_PATCH="$2" -TEST_PATH="tests/io/test_parquet.py" +TEST_PATH="${3:-tests/io/test_parquet.py}" # Optional 3rd param, default to original if [[ -z "$TEST_PATCH" ]]; then - echo "Usage: docker run -v \$(pwd):/patches [feature_patch]" + echo "Usage: docker run -v \$(pwd):/patches [feature_patch] [test_path]" exit 1 fi diff 
--git a/src/cooperbench/agents/mini_swe_agent/adapter.py b/src/cooperbench/agents/mini_swe_agent/adapter.py index 995102f..6a21024 100644 --- a/src/cooperbench/agents/mini_swe_agent/adapter.py +++ b/src/cooperbench/agents/mini_swe_agent/adapter.py @@ -61,9 +61,14 @@ def run( with open(config_path) as f: default_config = yaml.safe_load(f) - # Merge passed config overrides into default config + # Deep merge passed config overrides into default config if config is not None: - default_config.update(config) + for key, value in config.items(): + if key in default_config and isinstance(default_config[key], dict) and isinstance(value, dict): + # Deep merge nested dicts (like "agent") + default_config[key].update(value) + else: + default_config[key] = value agent_config = default_config.get("agent", {}) backend = default_config.get("backend", "modal") @@ -175,9 +180,10 @@ def run( def _get_patch(self, env: "ModalEnvironment | DockerEnvironment", base_commit: str) -> str: """Extract git diff from base commit to current working tree state.""" try: - # Single diff from base commit to working tree (includes both - # committed and uncommitted changes) - result = env.execute(f"git diff {base_commit}", timeout=30) + # Stage all changes (including new untracked files) so they appear in diff + env.execute("git add -A", timeout=10) + # Diff from base commit to staged changes (includes new files) + result = env.execute(f"git diff --cached {base_commit}", timeout=30) return result.get("output", "").strip() except Exception: return "" diff --git a/src/cooperbench/generation/README.md b/src/cooperbench/generation/README.md new file mode 100644 index 0000000..1caff18 --- /dev/null +++ b/src/cooperbench/generation/README.md @@ -0,0 +1,131 @@ +# Feature Generation Pipeline + +Automated generation of new benchmark features using LLM agents running on Modal. 
+ +## Quick Start + +```bash +# From project root +cd /path/to/CooperBench + +# Generate a single feature +python -m cooperbench.generation --task dspy_task/task8394 + +# Just see the prompt (no agent run) +python -m cooperbench.generation --task dspy_task/task8394 --prompt-only + +# Validate existing patches +python -m cooperbench.generation --task dspy_task/task8394 --validate feature.patch tests.patch +``` + +## Usage + +### Generate Features + +```bash +# Single attempt with Gemini 3 Flash (default) +python -m cooperbench.generation --task dspy_task/task8394 + +# Multiple attempts with output directory +python -m cooperbench.generation --task dspy_task/task8394 --attempts 5 --output ./generated + +# Use different model +python -m cooperbench.generation --task dspy_task/task8394 --model claude-3-opus + +# Use local Docker instead of Modal +python -m cooperbench.generation --task dspy_task/task8394 --backend docker +``` + +### Validate Patches + +```bash +# Check if patches pass tests and conflict with existing features +python -m cooperbench.generation \ + --task dspy_task/task8394 \ + --validate ./generated/feature.patch ./generated/tests.patch +``` + +## How It Works + +### 1. Prompt Building (`prompt.py`) + +Analyzes existing features in a task to build a generation prompt: +- Reads all `feature.md` files to understand the format +- Parses `feature.patch` files to identify "hot spots" (frequently modified files/lines) +- Instructs agent to create conflicting features + +### 2. Agent Execution (`generator.py`) + +Runs `mini_swe_agent` on Modal with the task's Docker image: +- Agent explores the codebase +- Implements a new feature that modifies similar code regions +- Writes tests +- Verifies tests pass + +### 3. Patch Splitting (`splitter.py`) + +Separates agent's output into: +- `feature.patch` - Source code changes +- `tests.patch` - Test file changes +- `feature.md` - Feature description extracted from agent output + +### 4. 
Validation (`validator.py`) + +All validation runs in Modal sandboxes: +- **Test validation**: Runs tests using existing `runner.sh` +- **Conflict detection**: Applies patches to git branches and attempts merge + +A generated feature is **valid** if: +- ✅ All tests pass +- ✅ Conflicts with at least 1 existing feature + +## Module Structure + +``` +generation/ +├── __init__.py # Package exports +├── __main__.py # CLI entry point +├── generator.py # Main orchestrator +├── prompt.py # Prompt building +├── splitter.py # Patch splitting +├── validator.py # Modal-based validation +└── README.md # This file +``` + +## Programmatic Usage + +```python +from cooperbench.generation import generate_feature, validate_generated_feature + +# Generate a new feature +result = generate_feature( + task_dir="dataset/dspy_task/task8394", + model_name="gpt-4o", + backend="modal", +) + +if result.success: + print(f"Feature patch:\n{result.feature_patch}") + print(f"Tests patch:\n{result.tests_patch}") + print(f"Cost: ${result.agent_cost:.4f}") + +# Validate patches +validation = validate_generated_feature( + repo_name="dspy_task", + task_id=8394, + feature_patch=result.feature_patch, + tests_patch=result.tests_patch, +) + +print(f"Valid: {validation['valid']}") +print(f"Conflicts with features: {validation['conflict_result']['conflicts']}") +``` + +## Success Criteria + +A generated feature is considered **successful** if: + +1. **Tests Pass**: The feature implementation is correct and all tests (including new tests) pass +2. **Has Conflicts**: The feature conflicts with at least one existing feature when merging + +The conflict requirement ensures the generated feature is useful for testing multi-agent coordination - features that merge cleanly don't test the coordination aspects of the benchmark. 
diff --git a/src/cooperbench/generation/__init__.py b/src/cooperbench/generation/__init__.py new file mode 100644 index 0000000..2c42499 --- /dev/null +++ b/src/cooperbench/generation/__init__.py @@ -0,0 +1,17 @@ +"""Task generation package - automated creation of new benchmark features.""" + +from cooperbench.generation.generator import generate_feature +from cooperbench.generation.prompt import build_prompt +from cooperbench.generation.splitter import split_patch +from cooperbench.generation.validator import ( + check_conflicts_in_sandbox, + validate_generated_feature, +) + +__all__ = [ + "generate_feature", + "build_prompt", + "split_patch", + "check_conflicts_in_sandbox", + "validate_generated_feature", +] diff --git a/src/cooperbench/generation/__main__.py b/src/cooperbench/generation/__main__.py new file mode 100644 index 0000000..00f7202 --- /dev/null +++ b/src/cooperbench/generation/__main__.py @@ -0,0 +1,270 @@ +"""Direct execution entry point for generation module. + +Usage: + python -m cooperbench.generation --task dspy_task/task8394 --model gpt-4o +""" + +import argparse +import json +import logging +import sys +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate new features for CooperBench tasks", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate a single feature + python -m cooperbench.generation --task dspy_task/task8394 + + # Generate multiple attempts + python -m cooperbench.generation --task dspy_task/task8394 --attempts 5 --output ./generated + + # Just build and print the prompt (no agent run) + python -m cooperbench.generation --task dspy_task/task8394 --prompt-only + + # Validate an existing patch + python -m cooperbench.generation --task dspy_task/task8394 --validate feature.patch tests.patch +""", + ) + + 
parser.add_argument( + "--task", + required=True, + help="Task path relative to dataset/ (e.g., dspy_task/task8394)", + ) + parser.add_argument( + "--model", + default="gemini/gemini-3-flash-preview", + help="LLM model to use (default: gemini/gemini-3-flash-preview)", + ) + parser.add_argument( + "--backend", + choices=["modal", "docker"], + default="modal", + help="Execution backend (default: modal)", + ) + parser.add_argument( + "--attempts", + type=int, + default=1, + help="Number of generation attempts (default: 1)", + ) + parser.add_argument( + "--output", + type=Path, + help="Output directory for generated features", + ) + parser.add_argument( + "--feature", + type=int, + help="Target a specific feature ID for conflicts (default: first feature)", + ) + parser.add_argument( + "--prompt-only", + action="store_true", + help="Just print the prompt without running the agent", + ) + parser.add_argument( + "--list-features", + action="store_true", + help="List all feature IDs in the task and exit", + ) + parser.add_argument( + "--validate", + nargs=2, + metavar=("FEATURE_PATCH", "TESTS_PATCH"), + help="Validate existing patches instead of generating", + ) + parser.add_argument( + "--step-limit", + type=int, + default=75, + help="Maximum agent steps (default: 75)", + ) + parser.add_argument( + "--cost-limit", + type=float, + default=2.0, + help="Maximum cost in USD (default: 2.0)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Save full agent trajectory for debugging", + ) + + args = parser.parse_args() + + # Resolve task directory + task_dir = Path("dataset") / args.task + if not task_dir.exists(): + logger.error(f"Task directory not found: {task_dir}") + sys.exit(1) + + # Parse repo_name and task_id from path + parts = args.task.split("/") + if len(parts) != 2: + logger.error(f"Invalid task path format. 
Expected: repo_name/taskID (e.g., dspy_task/task8394)") + sys.exit(1) + + repo_name = parts[0] + task_id = int(parts[1].replace("task", "")) + + # Mode 0: List features + if args.list_features: + from cooperbench.generation.prompt import list_features + features = list_features(task_dir) + print(f"Features in {args.task}: {features}") + return + + # Mode 1: Prompt only + if args.prompt_only: + from cooperbench.generation.prompt import build_prompt + prompt = build_prompt(task_dir, feature_id=args.feature) + print(prompt) + return + + # Mode 2: Validate existing patches + if args.validate: + feature_patch_path, tests_patch_path = args.validate + feature_patch = Path(feature_patch_path).read_text() + tests_patch = Path(tests_patch_path).read_text() + + from cooperbench.generation.validator import validate_generated_feature + + logger.info(f"Validating patches for {args.task}") + result = validate_generated_feature( + repo_name=repo_name, + task_id=task_id, + feature_patch=feature_patch, + tests_patch=tests_patch, + backend=args.backend, + ) + + print(json.dumps(result, indent=2, default=str)) + sys.exit(0 if result["valid"] else 1) + + # Mode 3: Generate features + from cooperbench.generation.generator import generate_feature, generate_features_batch + import hashlib + import re as re_module + + def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) -> Path: + """Create output directory named after the feature title + short hash.""" + # Extract title from feature_md if available + title_slug = "unknown" + if feature_md: + # Look for **Title**: ... 
pattern + match = re_module.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md) + if match: + title = match.group(1).strip() + # Convert to slug: lowercase, replace spaces with underscores, remove special chars + title_slug = re_module.sub(r"[^a-z0-9]+", "_", title.lower()).strip("_")[:40] + + # Add short hash for uniqueness + content_hash = hashlib.md5((feature_md or fallback_hash).encode()).hexdigest()[:5] + folder_name = f"{title_slug}_{content_hash}" + + output_dir = base_dir / folder_name + output_dir.mkdir(parents=True, exist_ok=True) + return output_dir + + # Base directory for this task + base_output_dir = args.output or (Path("generated") / repo_name / f"task{task_id}") + + if args.attempts == 1: + logger.info(f"Generating feature for {args.task} with {args.model} (target: feature {args.feature or 'first'})") + logger.info(f"Limits: {args.step_limit} steps, ${args.cost_limit} cost") + + # Generate first (to temp location for trajectory) + import tempfile + temp_dir = Path(tempfile.mkdtemp()) + + result = generate_feature( + task_dir=task_dir, + feature_id=args.feature, + model_name=args.model, + backend=args.backend, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + debug=args.debug, + output_dir=temp_dir, + ) + + # Create final output dir based on feature title + from datetime import datetime + fallback = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = make_output_dir(base_output_dir, result.feature_md, fallback) + logger.info(f"Output directory: {output_dir}") + + # Move trajectory files from temp to final + import shutil + for f in temp_dir.glob("trajectory_*"): + shutil.move(str(f), output_dir / f.name) + + # Save outputs (ensure patches end with newline for git compatibility) + if result.feature_patch: + patch = result.feature_patch.rstrip() + "\n" + (output_dir / "feature.patch").write_text(patch) + logger.info(f"Saved feature.patch") + if result.tests_patch: + patch = result.tests_patch.rstrip() + "\n" + (output_dir / 
"tests.patch").write_text(patch) + logger.info(f"Saved tests.patch") + if result.feature_md: + (output_dir / "feature.md").write_text(result.feature_md) + logger.info(f"Saved feature.md") + + # Save full result as JSON + (output_dir / "result.json").write_text(json.dumps(result.to_dict(), indent=2, default=str)) + logger.info(f"Saved result.json") + + # Print summary + print(f"\n{'='*60}") + print(f"Result: {'SUCCESS' if result.success else 'FAILED'}") + print(f"Output saved to: {output_dir}") + if result.errors: + print(f"Errors: {result.errors}") + print(f"Agent: {result.agent_steps} steps, ${result.agent_cost:.4f}") + print(f"{'='*60}") + + sys.exit(0 if result.success else 1) + else: + logger.info(f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})") + logger.info(f"Output directory: {base_output_dir}") + + results = generate_features_batch( + task_dir=task_dir, + feature_id=args.feature, + num_attempts=args.attempts, + model_name=args.model, + backend=args.backend, + output_dir=base_output_dir, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + debug=args.debug, + ) + + # Summary + successful = sum(1 for r in results if r.success) + print(f"\n{'='*60}") + print(f"Summary: {successful}/{args.attempts} successful") + print(f"Output saved to: {base_output_dir}") + + for i, r in enumerate(results, 1): + status = "✓" if r.success else "✗" + print(f" {status} Attempt {i}: {r.errors or 'OK'}") + + print(f"{'='*60}") + sys.exit(0 if successful > 0 else 1) + + +if __name__ == "__main__": + main() diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py new file mode 100644 index 0000000..ef8da3b --- /dev/null +++ b/src/cooperbench/generation/generator.py @@ -0,0 +1,512 @@ +"""Main generator - orchestrates feature generation using agents.""" + +import json +import logging +import time +from dataclasses import dataclass, field +from pathlib import Path + +from 
cooperbench.generation.prompt import build_prompt, list_features +from cooperbench.generation.splitter import extract_feature_description, split_patch +from cooperbench.generation.validator import ( + check_conflicts_in_sandbox, + run_tests_in_sandbox, +) +from cooperbench.utils import get_image_name + +logger = logging.getLogger(__name__) + + +def _extract_feature_md_from_patch(patch: str) -> str | None: + """Extract .feature_description.md content from a patch.""" + if not patch or ".feature_description.md" not in patch: + return None + + lines = patch.split("\n") + in_feature_file = False + content_lines = [] + + for line in lines: + if line.startswith("diff --git") and ".feature_description.md" in line: + in_feature_file = True + content_lines = [] + elif in_feature_file and line.startswith("diff --git"): + # End of the feature file + break + elif in_feature_file and line.startswith("+") and not line.startswith("+++"): + # Added line (strip the leading +) + content_lines.append(line[1:]) + + if content_lines: + return "\n".join(content_lines).strip() + return None + + +def _remove_feature_md_from_patch(patch: str) -> str: + """Remove .feature_description.md from a patch (it's metadata, not code).""" + if not patch or ".feature_description.md" not in patch: + return patch + + lines = patch.split("\n") + result_lines = [] + skip_file = False + + for line in lines: + if line.startswith("diff --git") and ".feature_description.md" in line: + skip_file = True + elif line.startswith("diff --git"): + skip_file = False + + if not skip_file: + result_lines.append(line) + + return "\n".join(result_lines) + + +@dataclass +class GenerationResult: + """Result of a feature generation attempt.""" + + success: bool + feature_md: str | None = None + feature_patch: str | None = None + tests_patch: str | None = None + conflicts: list[int] = field(default_factory=list) + conflicts_info: list[dict] = field(default_factory=list) # [{id, title}, ...] 
+ errors: list[str] = field(default_factory=list) + agent_cost: float = 0.0 + agent_steps: int = 0 + duration_seconds: float = 0.0 + # Validation details + tests_passed: bool | None = None + tests_output: str | None = None + validation_run: bool = False + + def to_dict(self) -> dict: + return { + "success": self.success, + "feature_md": self.feature_md, + "feature_patch": self.feature_patch, + "tests_patch": self.tests_patch, + "conflicts": self.conflicts, + "conflicts_info": self.conflicts_info, + "errors": self.errors, + "agent_cost": self.agent_cost, + "agent_steps": self.agent_steps, + "duration_seconds": self.duration_seconds, + "tests_passed": self.tests_passed, + "tests_output": self.tests_output, + "validation_run": self.validation_run, + } + + +def _get_task_image(task_dir: Path) -> str: + """Get the Docker image for a task using existing naming convention.""" + task_id = int(task_dir.name.replace("task", "")) + repo_name = task_dir.parent.name + return get_image_name(repo_name, task_id) + + +def _get_repo_and_task_id(task_dir: Path) -> tuple[str, int]: + """Extract repo_name and task_id from task directory.""" + task_id = int(task_dir.name.replace("task", "")) + repo_name = task_dir.parent.name + return repo_name, task_id + + +def generate_feature( + task_dir: str | Path, + feature_id: int | None = None, + model_name: str = "gemini/gemini-3-flash-preview", + backend: str = "modal", + timeout: int = 3600, + validate: bool = True, + step_limit: int = 75, + cost_limit: float = 2.0, + debug: bool = False, + output_dir: Path | None = None, +) -> GenerationResult: + """Generate a new feature for a task using an agent. 
+ + Args: + task_dir: Path to the task directory (e.g., dataset/dspy_task/task8394) + feature_id: ID of the specific feature to target for conflicts (default: first) + model_name: LLM model to use for the agent + backend: Execution backend ("modal", "docker", or "gcp") + timeout: Maximum time for generation in seconds + validate: Whether to validate (run tests + check conflicts) after generation + step_limit: Maximum number of agent steps (default: 75) + cost_limit: Maximum cost in USD (default: 2.0) + debug: Save full agent trajectory to file for inspection + output_dir: Directory to save debug output (default: current dir) + + Returns: + GenerationResult with the generated feature or errors. + """ + task_dir = Path(task_dir) + start_time = time.time() + + if not task_dir.exists(): + return GenerationResult( + success=False, + errors=[f"Task directory not found: {task_dir}"], + ) + + repo_name, task_id = _get_repo_and_task_id(task_dir) + + # Build the prompt + logger.info(f"Building prompt for {task_dir} (target feature: {feature_id or 'first'})") + prompt = build_prompt(task_dir, feature_id=feature_id) + + # Get the Docker image for this task + image = _get_task_image(task_dir) + logger.info(f"Using image: {image}") + + # Get existing feature IDs for conflict checking + existing_feature_ids = list_features(task_dir) + logger.info(f"Found {len(existing_feature_ids)} existing features: {existing_feature_ids}") + + # Run the agent + logger.info(f"Running agent with model {model_name} on {backend}") + + try: + from cooperbench.agents import get_runner + + agent = get_runner("mini_swe_agent") + + result = agent.run( + task=prompt, + image=image, + model_name=model_name, + config={ + "backend": backend, + "agent": { + "step_limit": step_limit, + "cost_limit": cost_limit, + }, + }, + ) + + agent_cost = result.cost + agent_steps = result.steps + + # Save/log agent trajectory for debugging + if result.messages: + logger.info(f"Agent trajectory: {len(result.messages)} 
messages, {agent_steps} steps, ${agent_cost:.4f}") + + # Save full trajectory to file if debug mode or output_dir specified + if debug or output_dir: + import re as re_module + save_dir = output_dir or Path(".") + save_dir.mkdir(parents=True, exist_ok=True) + + traj_file = save_dir / f"trajectory_{repo_name}_{task_id}.json" + traj_data = { + "task": f"{repo_name}/task{task_id}", + "model": model_name, + "steps": agent_steps, + "cost": agent_cost, + "status": result.status, + "messages": result.messages, + "patch": result.patch, + } + with open(traj_file, "w") as f: + json.dump(traj_data, f, indent=2, default=str) + logger.info(f"Saved trajectory to: {traj_file}") + + # Also save a human-readable version + readable_file = save_dir / f"trajectory_{repo_name}_{task_id}.txt" + with open(readable_file, "w") as f: + f.write(f"=== Agent Trajectory ===\n") + f.write(f"Task: {repo_name}/task{task_id}\n") + f.write(f"Model: {model_name}\n") + f.write(f"Steps: {agent_steps}, Cost: ${agent_cost:.4f}\n") + f.write(f"Status: {result.status}\n\n") + + for i, msg in enumerate(result.messages): + role = msg.get("role", "?").upper() + content = msg.get("content", "") + f.write(f"\n{'='*60}\n") + f.write(f"[{i}] {role}\n") + f.write(f"{'='*60}\n") + f.write(content) + f.write("\n") + logger.info(f"Saved readable trajectory to: {readable_file}") + + # Log summary to console + import re as re_module + for i, msg in enumerate(result.messages): + role = msg.get("role", "?") + content = msg.get("content", "")[:500] + if role == "assistant": + actions = re_module.findall(r"```bash\s*\n(.*?)\n```", content, re_module.DOTALL) + if actions: + logger.info(f" [{i}] AGENT: {actions[0][:200]}") + elif role == "user" and "returncode" in content: + rc_match = re_module.search(r"(\d+)", content) + rc = rc_match.group(1) if rc_match else "?" 
+ logger.info(f" [{i}] RESULT: exit={rc}") + + # Check for agent errors + if result.status == "Error" or result.error: + return GenerationResult( + success=False, + errors=[f"Agent error: {result.error or result.status}"], + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Get the patch from agent + full_patch = result.patch + + if not full_patch: + return GenerationResult( + success=False, + errors=["Agent produced no changes"], + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Extract feature description from .feature_description.md in the patch (before removing it) + feature_md = _extract_feature_md_from_patch(full_patch) + + # Remove .feature_description.md from patch (it's metadata, not code) + clean_patch = _remove_feature_md_from_patch(full_patch) + + # Split patch into feature and tests + logger.info("Splitting patch into feature and tests...") + feature_patch, tests_patch = split_patch(clean_patch) + + # Fallback: try extracting from agent messages if not in patch + if not feature_md and result.messages: + for msg in result.messages: + if msg.get("role") == "assistant": + content = msg.get("content", "") + if isinstance(content, str): + extracted = extract_feature_description(content) + if extracted: + feature_md = extracted + break + + # Basic validation + errors = [] + if not feature_patch: + errors.append("No feature changes in patch (only test files modified)") + + if not tests_patch: + errors.append("No test changes in patch") + + # If basic validation fails, return early + if errors: + return GenerationResult( + success=False, + feature_md=feature_md, + feature_patch=feature_patch, + tests_patch=tests_patch, + errors=errors, + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + ) + + # Run full validation if requested + tests_passed = None + tests_output = None + conflicts = [] + conflicts_info 
= [] + validation_run = False + + if validate: + logger.info("Running validation...") + validation_run = True + + # Step 1: Run tests + logger.info("Step 1/2: Running tests in sandbox...") + test_result = run_tests_in_sandbox( + repo_name=repo_name, + task_id=task_id, + feature_patch=feature_patch, + tests_patch=tests_patch, + timeout=600, + backend=backend, + ) + + tests_passed = test_result["passed"] + tests_output = test_result.get("output", "") + + if test_result.get("error"): + errors.append(f"Test error: {test_result['error']}") + + if not tests_passed: + errors.append(f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed") + logger.warning(f"Tests failed: {test_result['tests_failed']} failed") + else: + logger.info(f"Tests passed: {test_result['tests_passed']} passed") + + # Step 2: Check conflicts (only if tests pass) + if tests_passed: + logger.info("Step 2/2: Checking conflicts with existing features...") + conflict_result = check_conflicts_in_sandbox( + repo_name=repo_name, + task_id=task_id, + new_feature_patch=feature_patch, + existing_feature_ids=existing_feature_ids, + timeout=300, + backend=backend, + ) + + conflicts = conflict_result["conflicts"] + conflicts_info = conflict_result.get("conflicts_info", []) + + if conflict_result.get("errors"): + for err in conflict_result["errors"]: + errors.append(f"Conflict check: {err}") + + if not conflicts: + errors.append("No conflicts with any existing feature - feature may be too independent") + logger.warning("No conflicts detected with existing features") + else: + conflict_titles = [c.get("title", f"Feature {c['id']}") for c in conflicts_info] + logger.info(f"Conflicts detected with features: {conflict_titles}") + + # Determine success + success = ( + len(errors) == 0 + and feature_patch + and tests_patch + and (not validate or (tests_passed and len(conflicts) > 0)) + ) + + return GenerationResult( + success=success, + feature_md=feature_md, + 
feature_patch=feature_patch, + tests_patch=tests_patch, + conflicts=conflicts, + conflicts_info=conflicts_info, + errors=errors, + agent_cost=agent_cost, + agent_steps=agent_steps, + duration_seconds=time.time() - start_time, + tests_passed=tests_passed, + tests_output=tests_output, + validation_run=validation_run, + ) + + except Exception as e: + logger.exception("Generation failed") + return GenerationResult( + success=False, + errors=[f"Generation failed: {e!s}"], + duration_seconds=time.time() - start_time, + ) + + +def generate_features_batch( + task_dir: str | Path, + feature_id: int | None = None, + num_attempts: int = 5, + model_name: str = "gemini/gemini-3-flash-preview", + backend: str = "modal", + output_dir: str | Path | None = None, + validate: bool = True, + step_limit: int = 75, + cost_limit: float = 2.0, + debug: bool = False, +) -> list[GenerationResult]: + """Generate multiple feature candidates for a task. + + Args: + task_dir: Path to the task directory + feature_id: Target feature ID (default: first) + num_attempts: Number of generation attempts + model_name: LLM model to use + backend: Execution backend + output_dir: Directory to save results (optional) + validate: Whether to run validation after each generation + debug: Save full trajectory for each attempt + + Returns: + List of GenerationResults (including failures). 
+ """ + import hashlib + import re as re_module + import tempfile + import shutil + + def make_feature_dir(base_dir: Path, feature_md: str | None, attempt_num: int) -> Path: + """Create output directory named after the feature title + short hash.""" + title_slug = "unknown" + if feature_md: + match = re_module.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md) + if match: + title = match.group(1).strip() + title_slug = re_module.sub(r"[^a-z0-9]+", "_", title.lower()).strip("_")[:40] + + content_hash = hashlib.md5((feature_md or f"attempt_{attempt_num}").encode()).hexdigest()[:5] + folder_name = f"{title_slug}_{content_hash}" + + feature_dir = base_dir / folder_name + feature_dir.mkdir(parents=True, exist_ok=True) + return feature_dir + + task_dir = Path(task_dir) + base_output_dir = Path(output_dir) if output_dir else None + results = [] + + for i in range(num_attempts): + logger.info(f"=== Generation attempt {i + 1}/{num_attempts} ===") + + # Use temp dir for trajectory during generation + temp_dir = Path(tempfile.mkdtemp()) if base_output_dir else None + + result = generate_feature( + task_dir=task_dir, + feature_id=feature_id, + model_name=model_name, + backend=backend, + validate=validate, + step_limit=step_limit, + cost_limit=cost_limit, + debug=debug, + output_dir=temp_dir, + ) + + results.append(result) + + # Save to named directory based on feature title + if base_output_dir: + attempt_dir = make_feature_dir(base_output_dir, result.feature_md, i + 1) + + # Move trajectory files from temp + if temp_dir: + for f in temp_dir.glob("trajectory_*"): + shutil.move(str(f), attempt_dir / f.name) + + # Save result JSON + with open(attempt_dir / "result.json", "w") as f: + json.dump(result.to_dict(), f, indent=2) + + # Save patches if available (ensure trailing newline for git compatibility) + if result.feature_patch: + patch = result.feature_patch.rstrip() + "\n" + (attempt_dir / "feature.patch").write_text(patch) + if result.tests_patch: + patch = 
result.tests_patch.rstrip() + "\n" + (attempt_dir / "tests.patch").write_text(patch) + if result.feature_md: + (attempt_dir / "feature.md").write_text(result.feature_md) + + logger.info(f"Saved attempt {i + 1} to {attempt_dir}") + + # Log result + status = "✓ SUCCESS" if result.success else "✗ FAILED" + logger.info(f"Attempt {i + 1} {status}: {result.errors or 'OK'}") + + # Summary + successful = sum(1 for r in results if r.success) + logger.info(f"=== Generation complete: {successful}/{num_attempts} successful ===") + + return results diff --git a/src/cooperbench/generation/prompt.py b/src/cooperbench/generation/prompt.py new file mode 100644 index 0000000..c8d5fff --- /dev/null +++ b/src/cooperbench/generation/prompt.py @@ -0,0 +1,310 @@ +"""Prompt building for feature generation.""" + +from pathlib import Path + +from unidiff import PatchSet + + +def _extract_patch_info(patch_path: Path) -> dict: + """Extract file and line information from a patch file.""" + try: + content = patch_path.read_text() + patchset = PatchSet(content) + except Exception: + return {"files": [], "raw": "", "error": "Failed to parse patch"} + + files_info = [] + for patched_file in patchset: + file_info = { + "path": patched_file.path, + "added": patched_file.added, + "removed": patched_file.removed, + "hunks": [], + } + for hunk in patched_file: + file_info["hunks"].append({ + "source_start": hunk.source_start, + "source_length": hunk.source_length, + "target_start": hunk.target_start, + "target_length": hunk.target_length, + }) + files_info.append(file_info) + + return {"files": files_info, "raw": content} + + +def _read_feature_md(feature_dir: Path) -> str: + """Read and return contents of feature.md.""" + feature_md = feature_dir / "feature.md" + if feature_md.exists(): + return feature_md.read_text() + return "" + + +def _get_feature_info(task_dir: Path, feature_id: int) -> dict | None: + """Get full information about a specific feature.""" + feature_dir = task_dir / 
f"feature{feature_id}" + + if not feature_dir.exists(): + return None + + feature_info = { + "id": feature_id, + "name": f"feature{feature_id}", + "description": _read_feature_md(feature_dir), + "patch_info": None, + } + + # Extract patch information + feature_patch = feature_dir / "feature.patch" + if feature_patch.exists(): + feature_info["patch_info"] = _extract_patch_info(feature_patch) + + return feature_info + + +def _get_existing_feature_ids(task_dir: Path) -> list[int]: + """Get IDs of all existing features in a task.""" + ids = [] + for d in task_dir.iterdir(): + if d.is_dir() and d.name.startswith("feature"): + try: + fid = int(d.name.replace("feature", "")) + ids.append(fid) + except ValueError: + pass + return sorted(ids) + + +def _get_test_command(task_dir: Path) -> str: + """Extract the test command from runner.sh, resolving variables.""" + runner_sh = task_dir / "runner.sh" + if not runner_sh.exists(): + return "# Test command not found - check runner.sh" + + content = runner_sh.read_text() + + # First, try to find variable definitions like TEST_PATH="..." 
+ variables = {} + for line in content.split("\n"): + line = line.strip() + # Match patterns like: TEST_PATH="tests/io/test_parquet.py" + if "=" in line and not line.startswith("#"): + parts = line.split("=", 1) + if len(parts) == 2: + var_name = parts[0].strip() + var_value = parts[1].strip().strip('"').strip("'") + variables[var_name] = var_value + + # Look for pytest or cargo test commands + for line in content.split("\n"): + line = line.strip() + if "pytest" in line and not line.startswith("#"): + if "python -m pytest" in line: + # Resolve variables in the command + resolved = line + for var_name, var_value in variables.items(): + resolved = resolved.replace(f"${var_name}", var_value) + resolved = resolved.replace(f"${{{var_name}}}", var_value) + resolved = resolved.replace(f'"${var_name}"', f'"{var_value}"') + resolved = resolved.replace(f'"${{{var_name}}}"', f'"{var_value}"') + # Clean up timeout wrapper if present + if resolved.startswith("timeout"): + # Extract just the pytest part + if "python -m pytest" in resolved: + idx = resolved.find("python -m pytest") + resolved = resolved[idx:] + return resolved + if "cargo test" in line and not line.startswith("#"): + return line + + return "# See runner.sh for test commands" + + +def _extract_test_file(test_command: str) -> str: + """Extract the test file path from a test command.""" + import re + + # Try to extract pytest test file path + # Matches patterns like: pytest "tests/foo/test_bar.py" or pytest tests/foo/test_bar.py + pytest_match = re.search(r'pytest\s+["\']?([^\s"\']+(?:test[^\s"\']*\.py|tests?/[^\s"\']+))["\']?', test_command) + if pytest_match: + return pytest_match.group(1) + + # Try to find any .py file path that looks like a test file + test_file_match = re.search(r'([^\s"\']+(?:test[^\s"\']*\.py|tests?/[^\s"\']+\.py))', test_command) + if test_file_match: + return test_file_match.group(1) + + # For cargo test, return the tests directory + if "cargo test" in test_command: + return "tests/" + + 
return "the existing test file" + + +def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: + """Format patch content for display, limiting length.""" + lines = patch_content.split("\n") + if len(lines) <= max_lines: + return patch_content + + # Take first half and last quarter + first_part = lines[:max_lines // 2] + last_part = lines[-(max_lines // 4):] + + return "\n".join(first_part) + "\n\n... (truncated) ...\n\n" + "\n".join(last_part) + + +GENERATION_PROMPT_TEMPLATE = '''Create a NEW feature that will CONFLICT with an existing feature during git merge. + +## Existing Feature (your feature must conflict with this) + +{feature_description} + +### Code Changes: +{files_summary} + +```diff +{code_snippet} +``` + +## Requirements + +Your new feature must: +1. **Cause merge conflicts** - modify some of the same lines/regions (around lines {hot_lines}) +2. **Be a real enhancement** - not random changes, but a legitimate useful feature +3. **MUST include tests** - write NEW test functions that verify your feature works +4. **Pass all tests** - run tests to verify everything works before submitting + +**IMPORTANT TEST REQUIREMENTS**: +- You MUST write tests before submitting. A feature without tests is incomplete. +- **Add your tests to the SAME test file** that the existing tests use: `{test_file}` +- Do NOT create new test files. Add new test functions/classes to the existing test file. +- This ensures test changes can create merge conflicts with other features' tests. + +Existing tests: `{test_command}` + +You CAN modify other files too, but at least some changes must overlap with the existing feature to create conflicts. + +## Output Format + +**IMPORTANT**: Before submitting, you MUST create a feature description file at `.feature_description.md` in the repo root. + +This description must be **detailed enough that another developer could implement the same feature** without seeing your code. 
Include: +- What the feature does and why it's useful +- The API/interface changes (new parameters, functions, classes) +- Key implementation details (algorithms, data structures, edge cases handled) +- How it integrates with the existing code + +```bash +cat << 'FEATURE_EOF' > .feature_description.md +**Title**: [Descriptive feature title] + +**Description**: [2-3 sentences explaining what the feature does and its purpose] + +**API Changes**: +- [New function/method signatures with parameters] +- [New parameters added to existing functions] +- [New classes or data structures] + +**Implementation Details**: +- [Key algorithms or logic used] +- [How it modifies existing behavior] +- [Edge cases handled] + +**Files Modified**: [List each file and what was changed in it] +FEATURE_EOF +``` + +This file is required for the submission to be valid. Create it right before you submit. + +Start by exploring the modified files to understand the code structure. +''' + + +def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: + """Build the generation prompt for a task, targeting a specific feature. + + Args: + task_dir: Path to the task directory (e.g., dataset/dspy_task/task8394) + feature_id: ID of the specific feature to target for conflicts. + If None, uses the first feature. + + Returns: + The formatted prompt string + """ + task_dir = Path(task_dir) + + # Get existing feature IDs + existing_ids = _get_existing_feature_ids(task_dir) + if not existing_ids: + return "Error: No existing features found in task" + + # Default to first feature if not specified + if feature_id is None: + feature_id = existing_ids[0] + + if feature_id not in existing_ids: + return f"Error: Feature {feature_id} not found. 
Available: {existing_ids}" + + # Get full feature info + feature = _get_feature_info(task_dir, feature_id) + if not feature: + return f"Error: Could not load feature {feature_id}" + + # Format feature description + feature_description = feature["description"] or f"(No description for feature {feature_id})" + + # Format files summary + files_summary = "" + hot_lines = [] + if feature["patch_info"] and feature["patch_info"].get("files"): + for f in feature["patch_info"]["files"]: + files_summary += f"- `{f['path']}` (+{f['added']}/-{f['removed']} lines)\n" + for hunk in f["hunks"]: + start = hunk["source_start"] + end = start + hunk["source_length"] + files_summary += f" - Lines {start}-{end}\n" + hot_lines.extend([start, end]) + + # Format hot lines + if hot_lines: + hot_lines_str = ", ".join(str(l) for l in sorted(set(hot_lines))[:5]) + else: + hot_lines_str = "the modified sections" + + # Get code snippet + code_snippet = "" + if feature["patch_info"] and feature["patch_info"].get("raw"): + code_snippet = _format_code_snippet(feature["patch_info"]["raw"]) + else: + code_snippet = "(patch content not available)" + + # Get test command and extract test file path + test_command = _get_test_command(task_dir) + test_file = _extract_test_file(test_command) + + # Build final prompt + prompt = GENERATION_PROMPT_TEMPLATE.format( + feature_description=feature_description, + files_summary=files_summary or "(no files info)", + code_snippet=code_snippet, + hot_lines=hot_lines_str, + test_command=test_command, + test_file=test_file, + ) + + return prompt + + +def list_features(task_dir: Path) -> list[int]: + """List all feature IDs in a task. 
+ + Args: + task_dir: Path to the task directory + + Returns: + List of feature IDs + """ + return _get_existing_feature_ids(Path(task_dir)) diff --git a/src/cooperbench/generation/splitter.py b/src/cooperbench/generation/splitter.py new file mode 100644 index 0000000..e607225 --- /dev/null +++ b/src/cooperbench/generation/splitter.py @@ -0,0 +1,198 @@ +"""Patch splitting - separate feature changes from test changes.""" + +from unidiff import PatchSet + +# Common patterns for test files across different languages/frameworks +DEFAULT_TEST_PATTERNS = [ + "test_", + "_test.", + "/tests/", + "/test/", + ".test.", + ".spec.", + "_spec.", + "tests.py", + "test.py", + # Rust + "#[cfg(test)]", + "mod tests", +] + +# Files to exclude from patches (agent helper scripts, junk files) +JUNK_FILE_PATTERNS = [ + "fix_", # Helper scripts like fix_parquet.py + "temp_", # Temporary files + "tmp_", + "debug_", + "scratch_", + "helper_", + ".pyc", + "__pycache__", + ".egg-info", +] + + +def split_patch( + patch: str, + test_patterns: list[str] | None = None, +) -> tuple[str, str]: + """Split a patch into feature.patch and tests.patch. + + Args: + patch: The full git diff as a string + test_patterns: List of patterns to identify test files. + Defaults to common test file patterns. + + Returns: + Tuple of (feature_patch, tests_patch) as strings. + Either may be empty if no matching files found. 
+ """ + if test_patterns is None: + test_patterns = DEFAULT_TEST_PATTERNS + + if not patch or not patch.strip(): + return "", "" + + try: + patchset = PatchSet(patch) + except Exception: + # If we can't parse the patch, return it all as feature + return patch, "" + + feature_hunks = [] + test_hunks = [] + + for patched_file in patchset: + path = patched_file.path + + # Skip junk/helper files + if _is_junk_file(path): + continue + + # Check if this is a test file + is_test = _is_test_file(path, test_patterns) + + if is_test: + test_hunks.append(str(patched_file)) + else: + feature_hunks.append(str(patched_file)) + + feature_patch = "\n".join(feature_hunks) if feature_hunks else "" + tests_patch = "\n".join(test_hunks) if test_hunks else "" + + # Ensure patches end with newline (required for git apply) + # Strip first to remove excess whitespace, then add exactly one newline + feature_patch = feature_patch.strip() + "\n" if feature_patch.strip() else "" + tests_patch = tests_patch.strip() + "\n" if tests_patch.strip() else "" + + return feature_patch, tests_patch + + +def _is_junk_file(path: str) -> bool: + """Check if a file should be excluded from patches.""" + path_lower = path.lower() + filename = path.split("/")[-1].lower() + + # Check filename patterns + for pattern in JUNK_FILE_PATTERNS: + if filename.startswith(pattern) or pattern in path_lower: + return True + + # Exclude root-level Python scripts that aren't in src/ or proper package structure + # These are usually helper scripts the agent created + if "/" not in path and path.endswith(".py"): + return True + + return False + + +def _is_test_file(path: str, patterns: list[str]) -> bool: + """Check if a file path matches test file patterns.""" + path_lower = path.lower() + + for pattern in patterns: + if pattern.lower() in path_lower: + return True + + return False + + +def extract_feature_description(agent_output: str) -> str | None: + """Extract the feature.md content from agent's output. 
+ + The agent is instructed to output the feature description in a specific + markdown format. This function extracts that content. + + Args: + agent_output: The full agent conversation/output + + Returns: + The extracted feature description, or None if not found. + """ + # Look for the feature description block + # The agent outputs it in markdown format starting with **Title**: + + # Only match structured feature description markers, not bash comments + markers = [ + "**Title**:", + "**Title:**", # Without space variant + "# Feature:", + "## Feature", + ] + + # Find the start of the feature description + start_idx = -1 + for marker in markers: + idx = agent_output.find(marker) + if idx != -1: + if start_idx == -1 or idx < start_idx: + start_idx = idx + + if start_idx == -1: + return None + + # Extract from the marker to end of that block + # Look for common end markers or take until end + content = agent_output[start_idx:] + + # Try to find where the description ends + # Usually followed by code blocks or action outputs + end_markers = [ + "\n```bash", + "\n```python", + "\n", + "\n## Steps", + "\nBegin by", + ] + + end_idx = len(content) + for marker in end_markers: + idx = content.find(marker) + if idx != -1 and idx < end_idx: + end_idx = idx + + description = content[:end_idx].strip() + + # Clean up any markdown code block wrappers + if description.startswith("```markdown"): + description = description[len("```markdown"):].strip() + if description.startswith("```"): + description = description[3:].strip() + if description.endswith("```"): + description = description[:-3].strip() + + # Remove agent submission markers + cleanup_patterns = [ + "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT", + "SUBMIT_FINAL_OUTPUT", + "END_OF_FEATURE", + "", + "", + ] + for pattern in cleanup_patterns: + description = description.replace(pattern, "").strip() + + # Remove trailing whitespace on each line + description = "\n".join(line.rstrip() for line in description.split("\n")) + + 
"""Validation - check conflicts and test results using Modal sandboxes."""

import re
from pathlib import Path

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name
# NOTE(review): importing private helpers from another module; consider
# promoting _parse_results/_write_patch to a public API.
from cooperbench.eval.sandbox import _parse_results, _write_patch


def _extract_feature_title(feature_md_path: Path) -> str | None:
    """Extract title from a feature.md file."""
    if not feature_md_path.exists():
        return None

    content = feature_md_path.read_text()
    # Look for **Title**: pattern (first match wins; rest of line is title).
    match = re.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", content)
    if match:
        return match.group(1).strip()
    return None


def check_conflicts_in_sandbox(
    repo_name: str,
    task_id: int,
    new_feature_patch: str,
    existing_feature_ids: list[int],
    timeout: int = 300,
    backend: str = "modal",
) -> dict:
    """Check which existing features conflict with a new feature.

    Runs inside a Modal sandbox with the task's Docker image.

    Args:
        repo_name: Repository name (e.g., "dspy_task")
        task_id: Task ID
        new_feature_patch: The new feature patch as a string
        existing_feature_ids: List of existing feature IDs to check against
        timeout: Sandbox timeout in seconds
        backend: Execution backend ("modal", "docker")

    Returns:
        Dict with:
        - conflicts: list[int] - feature IDs that conflict
        - clean: list[int] - feature IDs that merge cleanly
        - errors: list[str] - any errors encountered
        - output: str - raw output from sandbox
    """
    task_dir = Path("dataset") / repo_name / f"task{task_id}"

    if not task_dir.exists():
        return {"conflicts": [], "clean": [], "errors": [f"Task dir not found: {task_dir}"], "output": ""}

    image = get_image_name(repo_name, task_id)
    eval_backend = get_backend(backend)
    sb = eval_backend.create_sandbox(image, timeout)

    try:
        # Write the new patch to sandbox
        _write_patch(sb, "new_feature.patch", new_feature_patch)

        # Write existing feature patches (features without a feature.patch
        # are silently skipped and will surface as apply errors in-script).
        for fid in existing_feature_ids:
            feature_patch_path = task_dir / f"feature{fid}" / "feature.patch"
            if feature_patch_path.exists():
                content = feature_patch_path.read_text()
                _write_patch(sb, f"feature{fid}.patch", content)

        # Run conflict checking script
        # NOTE(review): feature_ids_str is unused - dead code.
        feature_ids_str = " ".join(str(fid) for fid in existing_feature_ids)
        script = _build_conflict_check_script(existing_feature_ids)

        result = sb.exec("bash", "-c", script)
        # stdout and stderr are concatenated; the markers below are parsed
        # line-by-line from this combined stream.
        output = result.stdout_read() + result.stderr_read()

        # Parse results and collect feature info
        conflicts = []
        conflicts_info = []
        clean = []
        errors = []

        for line in output.split("\n"):
            if line.startswith("CONFLICT:"):
                fid = int(line.split(":")[1].strip())
                conflicts.append(fid)
                # Get feature title for human-readable reporting.
                feature_md_path = task_dir / f"feature{fid}" / "feature.md"
                title = _extract_feature_title(feature_md_path)
                conflicts_info.append({
                    "id": fid,
                    "title": title or f"Feature {fid}",
                })
            elif line.startswith("CLEAN:"):
                fid = int(line.split(":")[1].strip())
                clean.append(fid)
            elif line.startswith("ERROR:"):
                errors.append(line)

        return {
            "conflicts": conflicts,
            "conflicts_info": conflicts_info,
            "clean": clean,
            "errors": errors,
            "output": output,
        }

    except Exception as e:
        # Best-effort: any sandbox failure is reported, never raised.
        return {"conflicts": [], "clean": [], "errors": [str(e)], "output": ""}
    finally:
        sb.terminate()


def run_tests_in_sandbox(
    repo_name: str,
    task_id: int,
    feature_patch: str,
    tests_patch: str,
    timeout: int = 600,
    backend: str = "modal",
) -> dict:
    """Run the NEW tests for a generated feature in a Modal sandbox.

    Uses runner.sh which handles task-specific environment setup (deps, etc).
    Passes the new test files as the 3rd param (requires updated runner.sh).

    Args:
        repo_name: Repository name
        task_id: Task ID
        feature_patch: The feature implementation patch
        tests_patch: The tests patch
        timeout: Sandbox timeout
        backend: Execution backend

    Returns:
        Dict with: passed, tests_passed, tests_failed, output, error
    """
    import logging
    logger = logging.getLogger(__name__)

    image = get_image_name(repo_name, task_id)
    logger.debug(f"Creating sandbox with image: {image}")
    eval_backend = get_backend(backend)
    sb = eval_backend.create_sandbox(image, timeout)
    logger.debug("Sandbox created successfully")

    try:
        # Write patches to /patches/ directory
        logger.debug(f"Writing tests.patch ({len(tests_patch)} bytes)")
        _write_patch(sb, "tests.patch", tests_patch)
        logger.debug(f"Writing feature.patch ({len(feature_patch)} bytes)")
        _write_patch(sb, "feature.patch", feature_patch)

        # Extract NEW test function names from the patch (not just files)
        # This ensures we only run tests added by the agent, not pre-existing tests
        new_test_specs = _extract_new_test_functions(tests_patch)
        test_path = " ".join(new_test_specs) if new_test_specs else ""
        logger.debug(f"New test functions to run: {test_path}")

        # Use runner.sh with: tests.patch feature.patch [test_path]
        # - Old images: 3rd param ignored, runs default tests
        # - New images: runs the specific new test files
        if test_path:
            logger.debug(f"Running: bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}")
            result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch", test_path)
        else:
            logger.debug("Running: bash /usr/local/bin/runner.sh tests.patch feature.patch")
            result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch")
        logger.debug(f"Runner completed with exit code: {result.returncode}")

        output = result.stdout_read() + result.stderr_read()
        exit_code = result.returncode

        # Parse test results (reuse existing parser that handles pytest, go, cargo, jest)
        parsed = _parse_results(output)

        return {
            # "passed" requires at least one test to have actually run;
            # a clean exit with zero tests still counts as a failure.
            "passed": exit_code == 0 and parsed["passed"] > 0,
            "tests_passed": parsed["passed"],
            "tests_failed": parsed["failed"],
            "output": output,
            "error": None,
        }
    except Exception as e:
        return {
            "passed": False,
            "tests_passed": 0,
            "tests_failed": 0,
            "output": "",
            "error": str(e),
        }
    finally:
        sb.terminate()


def _extract_test_files_from_patch(patch: str) -> list[str]:
    """Extract new/modified file paths from a patch ("+++ b/..." headers)."""
    import re
    files = []
    for match in re.finditer(r"^\+\+\+ b/(.+)$", patch, re.MULTILINE):
        path = match.group(1)
        if path and not path.startswith("/dev/null"):
            files.append(path)
    return files


def _extract_new_test_functions(patch: str) -> list[str]:
    """Extract new test function names from a patch with their file paths.

    Returns paths in pytest format: path/to/test.py::test_function_name
    """
    import re

    test_specs = []
    current_file = None

    for line in patch.split("\n"):
        # Track which file we're in
        if line.startswith("+++ b/"):
            current_file = line[6:]  # Remove "+++ b/" prefix
        # Find new test function definitions (lines starting with +def test_)
        # NOTE(review): only matches module-level "+def test_" - test methods
        # added inside classes (indented defs) are not collected; presumably
        # fine for the targeted repos - confirm.
        elif line.startswith("+def test_") and current_file:
            # Extract function name: +def test_foo(args): -> test_foo
            match = re.match(r"\+def (test_\w+)\s*\(", line)
            if match:
                func_name = match.group(1)
                test_specs.append(f"{current_file}::{func_name}")

    return test_specs


def validate_generated_feature(
    repo_name: str,
    task_id: int,
    feature_patch: str,
    tests_patch: str,
    min_conflicts: int = 1,
    timeout: int = 600,
    backend: str = "modal",
) -> dict:
    """Full validation of a generated feature.

    Checks:
    1. Tests pass
    2. Conflicts with at least min_conflicts existing features

    Args:
        repo_name: Repository name
        task_id: Task ID
        feature_patch: The feature implementation patch
        tests_patch: The tests patch
        min_conflicts: Minimum required conflicts (default: 1)
        timeout: Sandbox timeout
        backend: Execution backend

    Returns:
        Dict with validation results
    """
    task_dir = Path("dataset") / repo_name / f"task{task_id}"

    # Get existing feature IDs
    existing_ids = _get_existing_feature_ids(task_dir)

    # Step 1: Run tests
    test_result = run_tests_in_sandbox(
        repo_name=repo_name,
        task_id=task_id,
        feature_patch=feature_patch,
        tests_patch=tests_patch,
        timeout=timeout,
        backend=backend,
    )

    if not test_result["passed"]:
        return {
            "valid": False,
            "reason": "tests_failed",
            "test_result": test_result,
            "conflict_result": None,
        }

    # Step 2: Check conflicts (only the implementation patch is merged;
    # test-file conflicts are not considered here).
    conflict_result = check_conflicts_in_sandbox(
        repo_name=repo_name,
        task_id=task_id,
        new_feature_patch=feature_patch,
        existing_feature_ids=existing_ids,
        timeout=timeout,
        backend=backend,
    )

    num_conflicts = len(conflict_result["conflicts"])

    if num_conflicts < min_conflicts:
        return {
            "valid": False,
            "reason": f"insufficient_conflicts ({num_conflicts} < {min_conflicts})",
            "test_result": test_result,
            "conflict_result": conflict_result,
        }

    return {
        "valid": True,
        "reason": None,
        "test_result": test_result,
        "conflict_result": conflict_result,
    }


# NOTE(review): the generated script inlines one check per feature
# sequentially; the `continue` statements inside it sit outside any bash
# loop, so on a failed `git apply` bash only warns and falls through to the
# remaining steps for that feature, which can emit a bogus CLEAN/CONFLICT
# line. Consider wrapping each check in a function or for-loop.
def _build_conflict_check_script(feature_ids: list[int]) -> str:
    """Build bash script for checking REAL git merge conflicts.

    For each existing feature:
    1. Create branch A from base, apply existing feature, commit
    2. Create branch B from base, apply new feature, commit
    3. Try git merge --no-commit from A
    4. Check if merge has conflicts (git merge --abort needed)
    """
    feature_checks = "\n".join(f'''
# Check feature {fid} for REAL merge conflicts
echo "Checking feature {fid}..."
git checkout --quiet $BASE_SHA
git clean -fd >/dev/null 2>&1

# Branch A: existing feature {fid}
git checkout --quiet -b __existing_{fid}
if ! git apply /patches/feature{fid}.patch 2>/dev/null; then
    echo "ERROR:feature{fid} patch failed to apply"
    git checkout --quiet $BASE_SHA 2>/dev/null || true
    git branch -D __existing_{fid} 2>/dev/null || true
    continue
fi
git add -A
git commit -qm "existing feature{fid}" --allow-empty

# Branch B: new feature (from base)
git checkout --quiet $BASE_SHA
git checkout --quiet -b __new_{fid}
if ! git apply /patches/new_feature.patch 2>/dev/null; then
    echo "ERROR:new_feature patch failed to apply for check {fid}"
    git checkout --quiet $BASE_SHA 2>/dev/null || true
    git branch -D __existing_{fid} 2>/dev/null || true
    git branch -D __new_{fid} 2>/dev/null || true
    continue
fi
git add -A
git commit -qm "new feature" --allow-empty

# Try to merge existing feature into new feature branch
# --no-commit so we can check for conflicts without auto-commit
if git merge --no-commit --no-ff __existing_{fid} 2>/dev/null; then
    # Merge succeeded cleanly
    echo "CLEAN:{fid}"
    git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1
else
    # Merge has conflicts!
    echo "CONFLICT:{fid}"
    git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1
fi

# Cleanup branches
git checkout --quiet $BASE_SHA 2>/dev/null || true
git branch -D __existing_{fid} 2>/dev/null || true
git branch -D __new_{fid} 2>/dev/null || true
''' for fid in feature_ids)

    return f'''
cd /workspace/repo

# Get base commit
BASE_SHA=$(git rev-parse HEAD)

# Ensure clean state
git reset --hard HEAD >/dev/null 2>&1
git clean -fd >/dev/null 2>&1

# Configure git for commits
git config user.email "test@test.com" 2>/dev/null || true
git config user.name "Test" 2>/dev/null || true

{feature_checks}

# Final cleanup
git checkout --quiet $BASE_SHA 2>/dev/null || true
git reset --hard HEAD >/dev/null 2>&1
'''


# NOTE(review): duplicated in generation/prompt.py - consider a shared helper.
def _get_existing_feature_ids(task_dir: Path) -> list[int]:
    """Get IDs of existing features in a task."""
    ids = []
    for d in task_dir.iterdir():
        if d.is_dir() and d.name.startswith("feature"):
            try:
                fid = int(d.name.replace("feature", ""))
                ids.append(fid)
            except ValueError:
                pass
    return sorted(ids)

From d143a83bdfbf7475b20ebbfef27edab8cbc6d6a1 Mon Sep 17 00:00:00 2001
From: Arpandeep Khatua
Date: Sun, 1 Feb 2026 16:08:22 -0800
Subject: [PATCH 2/3] Capture the conflict

---
 src/cooperbench/generation/generator.py |  2 +-
src/cooperbench/generation/validator.py | 62 +++++++++++++++++++------ 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py index ef8da3b..01f3c60 100644 --- a/src/cooperbench/generation/generator.py +++ b/src/cooperbench/generation/generator.py @@ -72,7 +72,7 @@ class GenerationResult: feature_patch: str | None = None tests_patch: str | None = None conflicts: list[int] = field(default_factory=list) - conflicts_info: list[dict] = field(default_factory=list) # [{id, title}, ...] + conflicts_info: list[dict] = field(default_factory=list) # [{id, title, conflict_diff}, ...] errors: list[str] = field(default_factory=list) agent_cost: float = 0.0 agent_steps: int = 0 diff --git a/src/cooperbench/generation/validator.py b/src/cooperbench/generation/validator.py index 4ff40fd..52bc813 100644 --- a/src/cooperbench/generation/validator.py +++ b/src/cooperbench/generation/validator.py @@ -61,6 +61,13 @@ def check_conflicts_in_sandbox( # Write the new patch to sandbox _write_patch(sb, "new_feature.patch", new_feature_patch) + # Extract feature titles for commit messages + feature_titles = {} + for fid in existing_feature_ids: + feature_md_path = task_dir / f"feature{fid}" / "feature.md" + title = _extract_feature_title(feature_md_path) + feature_titles[fid] = title or f"Feature {fid}" + # Write existing feature patches for fid in existing_feature_ids: feature_patch_path = task_dir / f"feature{fid}" / "feature.patch" @@ -69,8 +76,7 @@ def check_conflicts_in_sandbox( _write_patch(sb, f"feature{fid}.patch", content) # Run conflict checking script - feature_ids_str = " ".join(str(fid) for fid in existing_feature_ids) - script = _build_conflict_check_script(existing_feature_ids) + script = _build_conflict_check_script(existing_feature_ids, feature_titles) result = sb.exec("bash", "-c", script) output = result.stdout_read() + result.stderr_read() @@ -81,22 +87,34 @@ def 
check_conflicts_in_sandbox( clean = [] errors = [] - for line in output.split("\n"): - if line.startswith("CONFLICT:"): + # Parse output line by line, capturing conflict content + lines = output.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + if line.startswith("CONFLICT_START:"): + # Format: CONFLICT_START:fid then content until CONFLICT_END:fid fid = int(line.split(":")[1].strip()) + conflict_content = [] + i += 1 + while i < len(lines) and not lines[i].startswith(f"CONFLICT_END:{fid}"): + conflict_content.append(lines[i]) + i += 1 conflicts.append(fid) - # Get feature title - feature_md_path = task_dir / f"feature{fid}" / "feature.md" - title = _extract_feature_title(feature_md_path) + # Get title from feature_titles we extracted earlier + title = feature_titles.get(fid, f"Feature {fid}") conflicts_info.append({ "id": fid, - "title": title or f"Feature {fid}", + "title": title, + "conflict_diff": "\n".join(conflict_content), }) elif line.startswith("CLEAN:"): + # Format: CLEAN:fid fid = int(line.split(":")[1].strip()) clean.append(fid) elif line.startswith("ERROR:"): errors.append(line) + i += 1 return { "conflicts": conflicts, @@ -308,7 +326,7 @@ def validate_generated_feature( } -def _build_conflict_check_script(feature_ids: list[int]) -> str: +def _build_conflict_check_script(feature_ids: list[int], feature_titles: dict[int, str]) -> str: """Build bash script for checking REAL git merge conflicts. For each existing feature: @@ -317,7 +335,10 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: 3. Try git merge --no-commit from A 4. Check if merge has conflicts (git merge --abort needed) """ - feature_checks = "\n".join(f''' + def _build_feature_check(fid: int, title: str) -> str: + # Escape title for shell - replace : with space to avoid parsing issues + safe_title = title.replace(":", " -").replace("'", "\\'").replace('"', '\\"') + return f''' # Check feature {fid} for REAL merge conflicts echo "Checking feature {fid}..." 
git checkout --quiet $BASE_SHA @@ -332,7 +353,7 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: continue fi git add -A -git commit -qm "existing feature{fid}" --allow-empty +git commit -qm "{safe_title}" --allow-empty # Branch B: new feature (from base) git checkout --quiet $BASE_SHA @@ -352,10 +373,16 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: if git merge --no-commit --no-ff __existing_{fid} 2>/dev/null; then # Merge succeeded cleanly echo "CLEAN:{fid}" - git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1 + git reset --hard HEAD >/dev/null 2>&1 else - # Merge has conflicts! - echo "CONFLICT:{fid}" + # Merge has conflicts! Capture the actual conflict content + echo "CONFLICT_START:{fid}" + # Show files with conflict markers (<<<<<<< ======= >>>>>>>) + for f in $(git diff --name-only --diff-filter=U 2>/dev/null); do + echo "--- $f ---" + cat "$f" | grep -A 50 -B 5 "<<<<<<" | head -100 + done + echo "CONFLICT_END:{fid}" git merge --abort 2>/dev/null || git reset --hard HEAD >/dev/null 2>&1 fi @@ -363,7 +390,12 @@ def _build_conflict_check_script(feature_ids: list[int]) -> str: git checkout --quiet $BASE_SHA 2>/dev/null || true git branch -D __existing_{fid} 2>/dev/null || true git branch -D __new_{fid} 2>/dev/null || true -''' for fid in feature_ids) +''' + + feature_checks = "\n".join( + _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) + for fid in feature_ids + ) return f''' cd /workspace/repo From 00b82e5cfeb0f939a1e6e7de784c0f482f49ce08 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Sun, 1 Feb 2026 16:27:37 -0800 Subject: [PATCH 3/3] Fixing git issues --- pyproject.toml | 1 + .../agents/mini_swe_agent/adapter.py | 12 +++- src/cooperbench/generation/__main__.py | 30 ++++++--- src/cooperbench/generation/generator.py | 14 ++-- src/cooperbench/generation/prompt.py | 67 +++++++++++++++---- src/cooperbench/generation/splitter.py | 6 +- src/cooperbench/generation/validator.py | 
32 +++++---- 7 files changed, 116 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3e5f50..c48ecec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ exclude = [ [tool.hatch.build.targets.wheel] packages = ["src/cooperbench"] +exclude = ["src/cooperbench/generation"] [tool.ruff] line-length = 120 diff --git a/src/cooperbench/agents/mini_swe_agent/adapter.py b/src/cooperbench/agents/mini_swe_agent/adapter.py index 6a21024..641c413 100644 --- a/src/cooperbench/agents/mini_swe_agent/adapter.py +++ b/src/cooperbench/agents/mini_swe_agent/adapter.py @@ -180,10 +180,16 @@ def run( def _get_patch(self, env: "ModalEnvironment | DockerEnvironment", base_commit: str) -> str: """Extract git diff from base commit to current working tree state.""" try: - # Stage all changes (including new untracked files) so they appear in diff + # Stage all changes (including new untracked files) env.execute("git add -A", timeout=10) - # Diff from base commit to staged changes (includes new files) - result = env.execute(f"git diff --cached {base_commit}", timeout=30) + # Configure git identity (required for commit in fresh sandbox environments) + env.execute("git config user.email 'agent@cooperbench.local'", timeout=10) + env.execute("git config user.name 'CooperBench Agent'", timeout=10) + # Commit everything so committed + staged + unstaged changes are all in HEAD + # This ensures we capture changes even if the agent made commits + env.execute("git commit --allow-empty -m 'Agent changes'", timeout=10) + # Diff from base commit to HEAD captures all changes + result = env.execute(f"git diff {base_commit} HEAD", timeout=30) return result.get("output", "").strip() except Exception: return "" diff --git a/src/cooperbench/generation/__main__.py b/src/cooperbench/generation/__main__.py index 00f7202..1af4932 100644 --- a/src/cooperbench/generation/__main__.py +++ b/src/cooperbench/generation/__main__.py @@ -111,7 +111,7 @@ def main(): # Parse 
repo_name and task_id from path parts = args.task.split("/") if len(parts) != 2: - logger.error(f"Invalid task path format. Expected: repo_name/taskID (e.g., dspy_task/task8394)") + logger.error("Invalid task path format. Expected: repo_name/taskID (e.g., dspy_task/task8394)") sys.exit(1) repo_name = parts[0] @@ -120,6 +120,7 @@ def main(): # Mode 0: List features if args.list_features: from cooperbench.generation.prompt import list_features + features = list_features(task_dir) print(f"Features in {args.task}: {features}") return @@ -127,6 +128,7 @@ def main(): # Mode 1: Prompt only if args.prompt_only: from cooperbench.generation.prompt import build_prompt + prompt = build_prompt(task_dir, feature_id=args.feature) print(prompt) return @@ -152,10 +154,11 @@ def main(): sys.exit(0 if result["valid"] else 1) # Mode 3: Generate features - from cooperbench.generation.generator import generate_feature, generate_features_batch import hashlib import re as re_module + from cooperbench.generation.generator import generate_feature, generate_features_batch + def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) -> Path: """Create output directory named after the feature title + short hash.""" # Extract title from feature_md if available @@ -185,6 +188,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Generate first (to temp location for trajectory) import tempfile + temp_dir = Path(tempfile.mkdtemp()) result = generate_feature( @@ -200,12 +204,14 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Create final output dir based on feature title from datetime import datetime + fallback = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = make_output_dir(base_output_dir, result.feature_md, fallback) logger.info(f"Output directory: {output_dir}") # Move trajectory files from temp to final import shutil + for f in temp_dir.glob("trajectory_*"): shutil.move(str(f), output_dir / 
f.name) @@ -213,31 +219,33 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) if result.feature_patch: patch = result.feature_patch.rstrip() + "\n" (output_dir / "feature.patch").write_text(patch) - logger.info(f"Saved feature.patch") + logger.info("Saved feature.patch") if result.tests_patch: patch = result.tests_patch.rstrip() + "\n" (output_dir / "tests.patch").write_text(patch) - logger.info(f"Saved tests.patch") + logger.info("Saved tests.patch") if result.feature_md: (output_dir / "feature.md").write_text(result.feature_md) - logger.info(f"Saved feature.md") + logger.info("Saved feature.md") # Save full result as JSON (output_dir / "result.json").write_text(json.dumps(result.to_dict(), indent=2, default=str)) - logger.info(f"Saved result.json") + logger.info("Saved result.json") # Print summary - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Result: {'SUCCESS' if result.success else 'FAILED'}") print(f"Output saved to: {output_dir}") if result.errors: print(f"Errors: {result.errors}") print(f"Agent: {result.agent_steps} steps, ${result.agent_cost:.4f}") - print(f"{'='*60}") + print(f"{'=' * 60}") sys.exit(0 if result.success else 1) else: - logger.info(f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})") + logger.info( + f"Running {args.attempts} generation attempts for {args.task} (target: feature {args.feature or 'first'})" + ) logger.info(f"Output directory: {base_output_dir}") results = generate_features_batch( @@ -254,7 +262,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) # Summary successful = sum(1 for r in results if r.success) - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Summary: {successful}/{args.attempts} successful") print(f"Output saved to: {base_output_dir}") @@ -262,7 +270,7 @@ def make_output_dir(base_dir: Path, feature_md: str | None, fallback_hash: str) status = "✓" if r.success else "✗" print(f" 
{status} Attempt {i}: {r.errors or 'OK'}") - print(f"{'='*60}") + print(f"{'=' * 60}") sys.exit(0 if successful > 0 else 1) diff --git a/src/cooperbench/generation/generator.py b/src/cooperbench/generation/generator.py index 01f3c60..c04befe 100644 --- a/src/cooperbench/generation/generator.py +++ b/src/cooperbench/generation/generator.py @@ -197,6 +197,7 @@ def generate_feature( # Save full trajectory to file if debug mode or output_dir specified if debug or output_dir: import re as re_module + save_dir = output_dir or Path(".") save_dir.mkdir(parents=True, exist_ok=True) @@ -217,7 +218,7 @@ def generate_feature( # Also save a human-readable version readable_file = save_dir / f"trajectory_{repo_name}_{task_id}.txt" with open(readable_file, "w") as f: - f.write(f"=== Agent Trajectory ===\n") + f.write("=== Agent Trajectory ===\n") f.write(f"Task: {repo_name}/task{task_id}\n") f.write(f"Model: {model_name}\n") f.write(f"Steps: {agent_steps}, Cost: ${agent_cost:.4f}\n") @@ -226,15 +227,16 @@ def generate_feature( for i, msg in enumerate(result.messages): role = msg.get("role", "?").upper() content = msg.get("content", "") - f.write(f"\n{'='*60}\n") + f.write(f"\n{'=' * 60}\n") f.write(f"[{i}] {role}\n") - f.write(f"{'='*60}\n") + f.write(f"{'=' * 60}\n") f.write(content) f.write("\n") logger.info(f"Saved readable trajectory to: {readable_file}") # Log summary to console import re as re_module + for i, msg in enumerate(result.messages): role = msg.get("role", "?") content = msg.get("content", "")[:500] @@ -340,7 +342,9 @@ def generate_feature( errors.append(f"Test error: {test_result['error']}") if not tests_passed: - errors.append(f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed") + errors.append( + f"Tests failed: {test_result['tests_failed']} failed, {test_result['tests_passed']} passed" + ) logger.warning(f"Tests failed: {test_result['tests_failed']} failed") else: logger.info(f"Tests passed: {test_result['tests_passed']} 
passed") @@ -433,8 +437,8 @@ def generate_features_batch( """ import hashlib import re as re_module - import tempfile import shutil + import tempfile def make_feature_dir(base_dir: Path, feature_md: str | None, attempt_num: int) -> Path: """Create output directory named after the feature title + short hash.""" diff --git a/src/cooperbench/generation/prompt.py b/src/cooperbench/generation/prompt.py index c8d5fff..32b17db 100644 --- a/src/cooperbench/generation/prompt.py +++ b/src/cooperbench/generation/prompt.py @@ -22,12 +22,14 @@ def _extract_patch_info(patch_path: Path) -> dict: "hunks": [], } for hunk in patched_file: - file_info["hunks"].append({ - "source_start": hunk.source_start, - "source_length": hunk.source_length, - "target_start": hunk.target_start, - "target_length": hunk.target_length, - }) + file_info["hunks"].append( + { + "source_start": hunk.source_start, + "source_length": hunk.source_length, + "target_start": hunk.target_start, + "target_length": hunk.target_length, + } + ) files_info.append(file_info) return {"files": files_info, "raw": content} @@ -41,6 +43,23 @@ def _read_feature_md(feature_dir: Path) -> str: return "" +def _extract_feature_title(feature_md_content: str) -> str | None: + """Extract just the title from feature.md content.""" + import re + + # Look for **Title**: pattern + match = re.search(r"\*\*Title\*\*:\s*(.+?)(?:\n|$)", feature_md_content) + if match: + return match.group(1).strip() + + # Fallback: look for # Title or ## Title + match = re.search(r"^#+ (.+?)$", feature_md_content, re.MULTILINE) + if match: + return match.group(1).strip() + + return None + + def _get_feature_info(task_dir: Path, feature_id: int) -> dict | None: """Get full information about a specific feature.""" feature_dir = task_dir / f"feature{feature_id}" @@ -150,13 +169,13 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: return patch_content # Take first half and last quarter - first_part = lines[:max_lines // 2] - 
last_part = lines[-(max_lines // 4):] + first_part = lines[: max_lines // 2] + last_part = lines[-(max_lines // 4) :] return "\n".join(first_part) + "\n\n... (truncated) ...\n\n" + "\n".join(last_part) -GENERATION_PROMPT_TEMPLATE = '''Create a NEW feature that will CONFLICT with an existing feature during git merge. +GENERATION_PROMPT_TEMPLATE = """Create a NEW feature that will CONFLICT with an existing feature during git merge. ## Existing Feature (your feature must conflict with this) @@ -168,7 +187,7 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: ```diff {code_snippet} ``` - +{other_features_section} ## Requirements Your new feature must: @@ -220,7 +239,7 @@ def _format_code_snippet(patch_content: str, max_lines: int = 80) -> str: This file is required for the submission to be valid. Create it right before you submit. Start by exploring the modified files to understand the code structure. -''' +""" def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: @@ -270,7 +289,7 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: # Format hot lines if hot_lines: - hot_lines_str = ", ".join(str(l) for l in sorted(set(hot_lines))[:5]) + hot_lines_str = ", ".join(str(line) for line in sorted(set(hot_lines))[:5]) else: hot_lines_str = "the modified sections" @@ -285,6 +304,29 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: test_command = _get_test_command(task_dir) test_file = _extract_test_file(test_command) + # Collect titles from other features (to avoid duplicates) + other_features_section = "" + other_ids = [fid for fid in existing_ids if fid != feature_id] + if other_ids: + other_titles = [] + for fid in other_ids: + feature_dir = task_dir / f"feature{fid}" + md_content = _read_feature_md(feature_dir) + title = _extract_feature_title(md_content) if md_content else None + if title: + other_titles.append(f'- Feature {fid}: "{title}"') + else: + other_titles.append(f"- Feature 
{fid}: (no title)") + + if other_titles: + other_features_section = ( + "\n## Other Existing Features\n\n" + "The following features already exist in this task. " + "Make sure your proposed feature is different but compatible with these. \n" + + "\n".join(other_titles) + + "\n" + ) + # Build final prompt prompt = GENERATION_PROMPT_TEMPLATE.format( feature_description=feature_description, @@ -293,6 +335,7 @@ def build_prompt(task_dir: Path, feature_id: int | None = None) -> str: hot_lines=hot_lines_str, test_command=test_command, test_file=test_file, + other_features_section=other_features_section, ) return prompt diff --git a/src/cooperbench/generation/splitter.py b/src/cooperbench/generation/splitter.py index e607225..6c95042 100644 --- a/src/cooperbench/generation/splitter.py +++ b/src/cooperbench/generation/splitter.py @@ -20,8 +20,8 @@ # Files to exclude from patches (agent helper scripts, junk files) JUNK_FILE_PATTERNS = [ - "fix_", # Helper scripts like fix_parquet.py - "temp_", # Temporary files + "fix_", # Helper scripts like fix_parquet.py + "temp_", # Temporary files "tmp_", "debug_", "scratch_", @@ -175,7 +175,7 @@ def extract_feature_description(agent_output: str) -> str | None: # Clean up any markdown code block wrappers if description.startswith("```markdown"): - description = description[len("```markdown"):].strip() + description = description[len("```markdown") :].strip() if description.startswith("```"): description = description[3:].strip() if description.endswith("```"): diff --git a/src/cooperbench/generation/validator.py b/src/cooperbench/generation/validator.py index 52bc813..5016337 100644 --- a/src/cooperbench/generation/validator.py +++ b/src/cooperbench/generation/validator.py @@ -4,8 +4,8 @@ from pathlib import Path from cooperbench.eval.backends import get_backend -from cooperbench.utils import get_image_name from cooperbench.eval.sandbox import _parse_results, _write_patch +from cooperbench.utils import get_image_name def 
_extract_feature_title(feature_md_path: Path) -> str | None: @@ -103,11 +103,13 @@ def check_conflicts_in_sandbox( conflicts.append(fid) # Get title from feature_titles we extracted earlier title = feature_titles.get(fid, f"Feature {fid}") - conflicts_info.append({ - "id": fid, - "title": title, - "conflict_diff": "\n".join(conflict_content), - }) + conflicts_info.append( + { + "id": fid, + "title": title, + "conflict_diff": "\n".join(conflict_content), + } + ) elif line.startswith("CLEAN:"): # Format: CLEAN:fid fid = int(line.split(":")[1].strip()) @@ -155,6 +157,7 @@ def run_tests_in_sandbox( Dict with: passed, tests_passed, tests_failed, output, error """ import logging + logger = logging.getLogger(__name__) image = get_image_name(repo_name, task_id) @@ -179,9 +182,13 @@ def run_tests_in_sandbox( # Use runner.sh with: tests.patch feature.patch [test_path] # - Old images: 3rd param ignored, runs default tests # - New images: runs the specific new test files + # NOTE: We use bash -c to allow shell word splitting on test_path. + # Passing test_path directly as an argument would treat the space-separated + # specs as a single argument, which pytest can't parse. 
if test_path: - logger.debug(f"Running: bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}") - result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch", test_path) + cmd = f"bash /usr/local/bin/runner.sh tests.patch feature.patch {test_path}" + logger.debug(f"Running: {cmd}") + result = sb.exec("bash", "-c", cmd) else: logger.debug("Running: bash /usr/local/bin/runner.sh tests.patch feature.patch") result = sb.exec("bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch") @@ -215,6 +222,7 @@ def run_tests_in_sandbox( def _extract_test_files_from_patch(patch: str) -> list[str]: """Extract new/modified file paths from a patch.""" import re + files = [] for match in re.finditer(r"^\+\+\+ b/(.+)$", patch, re.MULTILINE): path = match.group(1) @@ -335,6 +343,7 @@ def _build_conflict_check_script(feature_ids: list[int], feature_titles: dict[in 3. Try git merge --no-commit from A 4. Check if merge has conflicts (git merge --abort needed) """ + def _build_feature_check(fid: int, title: str) -> str: # Escape title for shell - replace : with space to avoid parsing issues safe_title = title.replace(":", " -").replace("'", "\\'").replace('"', '\\"') @@ -393,11 +402,10 @@ def _build_feature_check(fid: int, title: str) -> str: ''' feature_checks = "\n".join( - _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) - for fid in feature_ids + _build_feature_check(fid, feature_titles.get(fid, f"Feature {fid}")) for fid in feature_ids ) - return f''' + return f""" cd /workspace/repo # Get base commit @@ -416,7 +424,7 @@ def _build_feature_check(fid: int, title: str) -> str: # Final cleanup git checkout --quiet $BASE_SHA 2>/dev/null || true git reset --hard HEAD >/dev/null 2>&1 -''' +""" def _get_existing_feature_ids(task_dir: Path) -> list[int]: