From f7ac4c78cf771b56b0cfa3120179773eed5d1041 Mon Sep 17 00:00:00 2001
From: Test User
Date: Wed, 11 Feb 2026 22:01:48 -0700
Subject: [PATCH 1/3] fix: conservative success detection + CLI exit codes for BLOCKED/FAILED (#374)

_detect_success() now returns False when no patterns match (was True),
preventing false positives when the agent produces no output. Failure
patterns are checked before success patterns so mixed output fails.

CLI work start --execute now raises typer.Exit(1) for BLOCKED/FAILED
agent states, giving test automation a reliable exit code signal.
---
 codeframe/cli/app.py                 |  2 +
 tests/cli/test_work_exit_codes.py    | 82 ++++++++++++++++++++++++++++
 tests/e2e/cli/golden_path_runner.py  | 17 +++---
 tests/e2e/cli/test_detect_success.py | 63 +++++++++++++++++++++
 4 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100644 tests/cli/test_work_exit_codes.py
 create mode 100644 tests/e2e/cli/test_detect_success.py

diff --git a/codeframe/cli/app.py b/codeframe/cli/app.py
index 386a03da..b8cbb890 100644
--- a/codeframe/cli/app.py
+++ b/codeframe/cli/app.py
@@ -2063,12 +2063,14 @@ def work_start(
             if state.blocker:
                 console.print(f"  Question: {state.blocker.question}")
             console.print("  Use 'codeframe blocker list' to see blockers")
+            raise typer.Exit(1)
         elif state.status == AgentStatus.FAILED:
             console.print("[red]Task execution failed[/red]")
             if state.step_results:
                 last_result = state.step_results[-1]
                 if last_result.error:
                     console.print(f"  Error: {last_result.error[:200]}")
+            raise typer.Exit(1)

         # Show debug log location if debugging was enabled
         if debug:
diff --git a/tests/cli/test_work_exit_codes.py b/tests/cli/test_work_exit_codes.py
new file mode 100644
index 00000000..f32cb327
--- /dev/null
+++ b/tests/cli/test_work_exit_codes.py
@@ -0,0 +1,82 @@
+"""Tests that `cf work start --execute` returns non-zero exit codes for BLOCKED/FAILED.
+
+Issue #374: CLI was returning exit code 0 even when the agent state was
+BLOCKED or FAILED, causing false positives in test automation.
+"""
+
+from dataclasses import dataclass, field
+from unittest.mock import patch
+
+import pytest
+from typer.testing import CliRunner
+
+from codeframe.cli.app import app
+from codeframe.core.agent import AgentStatus
+from codeframe.core import tasks
+from codeframe.core.state_machine import TaskStatus
+from codeframe.core.workspace import create_or_load_workspace
+
+pytestmark = pytest.mark.v2
+
+runner = CliRunner()
+
+
+@dataclass
+class _FakeAgentState:
+    status: AgentStatus
+    blocker: object = None
+    step_results: list = field(default_factory=list)
+
+
+@pytest.fixture
+def workspace_with_ready_task(tmp_path, monkeypatch):
+    """Workspace with one READY task, API key set."""
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test-fake")
+
+    ws = create_or_load_workspace(repo)
+    task = tasks.create(ws, title="Test task", description="A test task",
+                        status=TaskStatus.READY)
+
+    return repo, task.id[:8]
+
+
+class TestWorkStartExitCodes:
+    """Verify CLI exit codes match agent execution outcomes."""
+
+    def test_completed_returns_exit_zero(self, workspace_with_ready_task):
+        repo, tid = workspace_with_ready_task
+        fake_state = _FakeAgentState(status=AgentStatus.COMPLETED)
+
+        with patch("codeframe.core.runtime.execute_agent", return_value=fake_state):
+            result = runner.invoke(
+                app, ["work", "start", tid, "--execute", "-w", str(repo)]
+            )
+
+        assert result.exit_code == 0, f"Expected 0 for COMPLETED: {result.output}"
+        assert "completed successfully" in result.output.lower()
+
+    def test_failed_returns_exit_one(self, workspace_with_ready_task):
+        repo, tid = workspace_with_ready_task
+        fake_state = _FakeAgentState(status=AgentStatus.FAILED)
+
+        with patch("codeframe.core.runtime.execute_agent", return_value=fake_state):
+            result = runner.invoke(
+                app, ["work", "start", tid, "--execute", "-w", str(repo)]
+            )
+
+        assert result.exit_code == 1, f"Expected 1 for FAILED: {result.output}"
+        assert "failed" in result.output.lower()
+
+    def test_blocked_returns_exit_one(self, workspace_with_ready_task):
+        repo, tid = workspace_with_ready_task
+        fake_state = _FakeAgentState(status=AgentStatus.BLOCKED)
+
+        with patch("codeframe.core.runtime.execute_agent", return_value=fake_state):
+            result = runner.invoke(
+                app, ["work", "start", tid, "--execute", "-w", str(repo)]
+            )
+
+        assert result.exit_code == 1, f"Expected 1 for BLOCKED: {result.output}"
+        assert "blocked" in result.output.lower()
diff --git a/tests/e2e/cli/golden_path_runner.py b/tests/e2e/cli/golden_path_runner.py
index fc31754a..9d356268 100644
--- a/tests/e2e/cli/golden_path_runner.py
+++ b/tests/e2e/cli/golden_path_runner.py
@@ -315,8 +315,9 @@ def run_execute_task(self, task_id: str, task_title: str) -> TaskExecutionResult
     def _detect_success(self, exit_code: int, output: str) -> bool:
         """Determine actual success from CLI output patterns.

-        The CLI currently returns exit code 0 even for failed tasks,
-        so we inspect the output for success/failure markers.
+        Uses conservative detection: only returns True when an explicit
+        success pattern is found AND no failure patterns are present.
+        Returns False when output is ambiguous (no patterns matched).
         """
         if exit_code != 0:
             return False
@@ -331,15 +332,17 @@ def _detect_success(self, exit_code: int, output: str) -> bool:
             "Task completed successfully",
         ]

-        for pattern in success_patterns:
-            if pattern in output:
-                return True
+        # Check failure patterns first — failure takes precedence
         for pattern in failure_patterns:
             if pattern in output:
                 return False

-        # If no clear signal, fall back to exit code
-        return exit_code == 0
+        for pattern in success_patterns:
+            if pattern in output:
+                return True
+
+        # No clear signal — conservative default: treat as failure
+        return False

     def _extract_iterations(self, output: str) -> Optional[int]:
         """Extract iteration count from agent output."""
diff --git a/tests/e2e/cli/test_detect_success.py b/tests/e2e/cli/test_detect_success.py
new file mode 100644
index 00000000..43be49f7
--- /dev/null
+++ b/tests/e2e/cli/test_detect_success.py
@@ -0,0 +1,63 @@
+"""Unit tests for GoldenPathRunner._detect_success().
+
+Validates that the success detection logic uses conservative defaults:
+- Explicit success patterns → True
+- Explicit failure patterns → False
+- No patterns matched → False (not True!)
+- Non-zero exit code → always False
+"""
+
+import pytest
+
+from tests.e2e.cli.golden_path_runner import GoldenPathRunner
+
+
+@pytest.fixture
+def runner(tmp_path):
+    """Create a GoldenPathRunner instance for testing."""
+    return GoldenPathRunner(project_path=tmp_path, engine="react")
+
+
+class TestDetectSuccess:
+    """Tests for _detect_success() conservative detection logic."""
+
+    def test_explicit_success_pattern_returns_true(self, runner):
+        output = "Some output\nTask completed successfully!\nDone."
+        assert runner._detect_success(exit_code=0, output=output) is True
+
+    def test_explicit_failure_pattern_returns_false(self, runner):
+        output = "Task execution failed\nSome error occurred"
+        assert runner._detect_success(exit_code=0, output=output) is False
+
+    def test_api_key_missing_returns_false(self, runner):
+        output = "ANTHROPIC_API_KEY environment variable is required"
+        assert runner._detect_success(exit_code=0, output=output) is False
+
+    def test_error_pattern_returns_false(self, runner):
+        output = "Error: something went wrong"
+        assert runner._detect_success(exit_code=0, output=output) is False
+
+    def test_blocked_pattern_returns_false(self, runner):
+        output = "Task blocked - needs human input"
+        assert runner._detect_success(exit_code=0, output=output) is False
+
+    def test_empty_output_exit_zero_returns_false(self, runner):
+        """The core bug: empty output should NOT be treated as success."""
+        assert runner._detect_success(exit_code=0, output="") is False
+
+    def test_no_patterns_exit_zero_returns_false(self, runner):
+        """Output with no matching patterns should be failure (conservative)."""
+        output = "Some random output that matches nothing"
+        assert runner._detect_success(exit_code=0, output=output) is False
+
+    def test_nonzero_exit_code_always_returns_false(self, runner):
+        output = "Task completed successfully!"
+        assert runner._detect_success(exit_code=1, output=output) is False
+
+    def test_nonzero_exit_code_with_no_output(self, runner):
+        assert runner._detect_success(exit_code=1, output="") is False
+
+    def test_mixed_success_and_failure_returns_false(self, runner):
+        """When both success and failure patterns present, failure wins."""
+        output = "Task completed successfully!\nBut then Error: crash"
+        assert runner._detect_success(exit_code=0, output=output) is False

From 44912675dfa5be4636fc0cbea782fbbc00b06b98 Mon Sep 17 00:00:00 2001
From: Test User
Date: Wed, 11 Feb 2026 22:48:51 -0700
Subject: [PATCH 2/3] fix: ensure debug log path prints before exit on BLOCKED/FAILED

Move typer.Exit(1) after debug log output so users running with --debug
still see the log file location when tasks fail or are blocked.
---
 codeframe/cli/app.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/codeframe/cli/app.py b/codeframe/cli/app.py
index b8cbb890..f7cc62f4 100644
--- a/codeframe/cli/app.py
+++ b/codeframe/cli/app.py
@@ -2063,14 +2063,12 @@ def work_start(
             if state.blocker:
                 console.print(f"  Question: {state.blocker.question}")
             console.print("  Use 'codeframe blocker list' to see blockers")
-            raise typer.Exit(1)
         elif state.status == AgentStatus.FAILED:
             console.print("[red]Task execution failed[/red]")
             if state.step_results:
                 last_result = state.step_results[-1]
                 if last_result.error:
                     console.print(f"  Error: {last_result.error[:200]}")
-            raise typer.Exit(1)

         # Show debug log location if debugging was enabled
         if debug:
@@ -2080,6 +2078,9 @@
                 latest_log = max(debug_logs, key=lambda p: p.stat().st_mtime)
                 console.print(f"\n[dim]Debug log written to: {latest_log}[/dim]")

+        if state.status in (AgentStatus.BLOCKED, AgentStatus.FAILED):
+            raise typer.Exit(1)
+
     except ValueError as e:
         console.print(f"[red]Error:[/red] {e}")
         raise typer.Exit(1)

From f4f027cc2692f699fc72b0d9bff1846628f32754 Mon Sep 17 00:00:00 2001
From: Test User
Date: Wed, 11 Feb 2026 22:56:01 -0700
Subject: [PATCH 3/3] fix: add missing pytestmark v2 marker to test_detect_success
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses CodeRabbit review feedback — test file should include the
module-level v2 marker for consistency with project conventions.
---
 tests/e2e/cli/test_detect_success.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/e2e/cli/test_detect_success.py b/tests/e2e/cli/test_detect_success.py
index 43be49f7..8bebf690 100644
--- a/tests/e2e/cli/test_detect_success.py
+++ b/tests/e2e/cli/test_detect_success.py
@@ -11,6 +11,8 @@

 from tests.e2e.cli.golden_path_runner import GoldenPathRunner

+pytestmark = pytest.mark.v2
+

 @pytest.fixture
 def runner(tmp_path):
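A minimal sketch of what the new exit-code contract gives automation: after
this series, callers can branch on the return code instead of scraping
output. This assumes a `codeframe` executable on PATH; the task id
"abc12345" below is a placeholder, not a real id from these patches.

    import subprocess

    # Hypothetical invocation: substitute a real short task id for "abc12345".
    result = subprocess.run(
        ["codeframe", "work", "start", "abc12345", "--execute"],
        capture_output=True,
        text=True,
    )

    if result.returncode == 0:
        print("task completed")
    else:
        # Exit code 1 now covers both BLOCKED and FAILED agent states.
        print("task blocked or failed")
        print(result.stdout)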