diff --git a/pdd/sync_orchestration.py b/pdd/sync_orchestration.py index d2a19d7e..d26a1a84 100644 --- a/pdd/sync_orchestration.py +++ b/pdd/sync_orchestration.py @@ -1749,7 +1749,10 @@ def __init__(self, rc, out, err): success = pdd_files['test'].exists() else: success = bool(result[0]) - cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 + if operation in ('test', 'test_extend') and len(result) >= 4: + cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 current_cost_ref[0] += cost else: success = result is not None @@ -1774,7 +1777,7 @@ def __init__(self, rc, out, err): # cmd_test_main returns 4-tuple: (content, cost, model, agentic_success) # Other commands return 3-tuple: (content, cost, model) # Use explicit indexing for test operation to handle 4-tuple correctly - if operation == 'test' and len(result) >= 4: + if operation in ('test', 'test_extend') and len(result) >= 4: actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0 model_name = result[2] if isinstance(result[2], str) else 'unknown' else: diff --git a/tests/test_e2e_issue_508_budget_test_cost.py b/tests/test_e2e_issue_508_budget_test_cost.py new file mode 100644 index 00000000..2650fbd7 --- /dev/null +++ b/tests/test_e2e_issue_508_budget_test_cost.py @@ -0,0 +1,111 @@ +"""Tests for issue #508: Budget tracker drops test/test_extend costs due to wrong tuple index. + +The bug: sync_orchestration.py line 1752 uses `result[-2]` to extract cost from operation +results. For 4-tuples (returned by cmd_test_main), `result[-2]` is the model name string, +not the cost float, so isinstance(..., (int, float)) fails and cost defaults to $0.00. + +Secondary bug: line 1777 only checks `operation == 'test'`, missing `test_extend`. +""" + +import pytest + + +class TestBudgetCostExtraction: + """Test the cost extraction logic at sync_orchestration.py:1752.""" + + def _extract_cost(self, result, operation): + """Replicates the cost extraction logic from line 1752 (should match current source).""" + if operation in ('test', 'test_extend') and len(result) >= 4: + cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 + return cost + + def test_4_tuple_test_cost_extraction(self): + """4-tuple from cmd_test_main has cost at index 1. + + cmd_test_main returns: (content, cost, model, agentic_success) + The fix ensures result[1] is used for test/test_extend operations. + """ + result = ("test content", 0.0007821, "gpt-4o-mini", True) + + cost = self._extract_cost(result, operation='test') + + assert cost == pytest.approx(0.0007821), ( + f"Cost should be {result[1]} but got {cost}. " + f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost." + ) + + def test_4_tuple_test_extend_cost_extraction(self): + """Same fix applies for test_extend operation which also calls cmd_test_main.""" + result = ("test content", 0.0012345, "claude-sonnet-4-5", False) + + cost = self._extract_cost(result, operation='test_extend') + + assert cost == pytest.approx(0.0012345), ( + f"test_extend cost should be {result[1]} but got {cost}." + ) + + def test_3_tuple_generate_cost_extraction(self): + """3-tuple operations (generate, etc.) work correctly with result[-2] — regression guard.""" + # 3-tuple: (content, cost, model) + result = ("generated code", 0.0005551, "gpt-4o-mini") + + cost = self._extract_cost(result, operation='generate') + + # For 3-tuples, result[-2] = result[1] = cost float — this works by accident + assert cost == pytest.approx(0.0005551) + + def test_budget_enforcement_with_test_costs(self): + """Budget check underestimates spend when test costs are dropped. + + Simulates a sync loop where generate costs $0.05 and test costs $0.10. + With budget=$0.12, sync should stop after test. But since test cost is + dropped to $0.00, the budget check sees only $0.05 and keeps going. + """ + budget = 0.12 + current_cost = 0.0 + + # Operation 1: generate (3-tuple) — cost extracted correctly + generate_result = ("code", 0.05, "gpt-4o-mini") + cost = self._extract_cost(generate_result, operation='generate') + current_cost += cost + + # Operation 2: test (4-tuple) — cost must be extracted correctly + test_result = ("tests", 0.10, "gpt-4o-mini", True) + cost = self._extract_cost(test_result, operation='test') + current_cost += cost + + # With the bug, current_cost is only 0.05 (test cost dropped) + # It should be 0.15, exceeding the budget of 0.12 + assert current_cost >= budget, ( + f"Total cost should be $0.15 (exceeding budget ${budget}) " + f"but tracker shows ${current_cost:.4f} due to dropped test cost." + ) + + +class TestLoggingSectionTestExtendGap: + """Test the secondary bug: logging at line 1777 misses test_extend.""" + + def _extract_logging_cost(self, result, operation): + """Replicates the logging cost extraction logic from lines 1777-1782.""" + if operation in ('test', 'test_extend') and len(result) >= 4: + actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0 + return actual_cost + + def test_logging_test_extend_cost(self): + """Bug: logging section only checks operation == 'test', not 'test_extend'. + + test_extend also returns a 4-tuple from cmd_test_main, so the same + explicit indexing should apply. + """ + result = ("tests", 0.0012345, "claude-sonnet-4-5", True) + + actual_cost = self._extract_logging_cost(result, operation='test_extend') + + assert actual_cost == pytest.approx(0.0012345), ( + f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. " + f"The logging section only checks operation == 'test', missing 'test_extend'." + ) diff --git a/tests/test_e2e_issue_508_sync_budget_tracking.py b/tests/test_e2e_issue_508_sync_budget_tracking.py new file mode 100644 index 00000000..21cbb221 --- /dev/null +++ b/tests/test_e2e_issue_508_sync_budget_tracking.py @@ -0,0 +1,146 @@ +"""E2E test for issue #508: Budget tracker drops test/test_extend costs. + +Exercises the REAL sync_orchestration function in headless mode (quiet=True), +with mocked operation functions returning 4-tuples, verifying that the actual +cost extraction logic at line 1752 correctly accumulates costs. +""" + +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pdd +from pdd.sync_orchestration import sync_orchestration +from pdd.sync_determine_operation import SyncDecision + + +@pytest.fixture(autouse=True) +def set_pdd_path(monkeypatch): + pdd_package_dir = Path(pdd.__file__).parent + monkeypatch.setenv("PDD_PATH", str(pdd_package_dir)) + + +@pytest.fixture +def sync_workspace(tmp_path): + """Create minimal workspace files.""" + prompt = tmp_path / "prompts" / "budget_test_python.prompt" + prompt.parent.mkdir(parents=True, exist_ok=True) + prompt.write_text("Test prompt") + + code = tmp_path / "src" / "budget_test.py" + code.parent.mkdir(parents=True, exist_ok=True) + code.write_text("def hello(): pass\n") + + example = tmp_path / "examples" / "budget_test_example.py" + example.parent.mkdir(parents=True, exist_ok=True) + example.write_text("from budget_test import hello\nhello()\n") + + test = tmp_path / "tests" / "test_budget_test.py" + test.parent.mkdir(parents=True, exist_ok=True) + test.write_text("def test_hello(): pass\n") + + return { + 'prompt': prompt, + 'code': code, + 'example': example, + 'test': test, + } + + +def _make_decision(operation, reason='auto'): + return SyncDecision(operation=operation, reason=reason) + + +class TestE2ESyncBudgetTracking: + """E2E: sync_orchestration must accumulate costs from 4-tuple test results.""" + + def _run_sync(self, sync_workspace, tmp_path, monkeypatch, decisions, op_results, budget=10.0): + """Run sync_orchestration with mocked operations returning specified results.""" + monkeypatch.chdir(tmp_path) + + call_count = [0] + def mock_determine(*args, **kwargs): + if call_count[0] < len(decisions): + d = decisions[call_count[0]] + call_count[0] += 1 + return d + return _make_decision('all_synced', 'Complete') + + patches = { + 'pdd.sync_orchestration.get_pdd_file_paths': MagicMock(return_value=sync_workspace), + 'pdd.sync_orchestration.sync_determine_operation': mock_determine, + 'pdd.sync_orchestration.SyncLock': MagicMock(), + 'pdd.sync_orchestration.log_event': MagicMock(), + 'pdd.sync_orchestration.append_log_entry': MagicMock(), + 'pdd.sync_orchestration._save_fingerprint_atomic': MagicMock(), + 'pdd.sync_orchestration._save_run_report_atomic': MagicMock(), + 'pdd.sync_orchestration.calculate_sha256': MagicMock(return_value='abc123'), + 'pdd.sync_orchestration.maybe_steer_operation': MagicMock(side_effect=lambda op, *a, **kw: (op, False)), + 'pdd.sync_orchestration.create_log_entry': MagicMock(return_value={'details': {}}), + 'pdd.sync_orchestration.update_log_entry': MagicMock(), + 'pdd.sync_orchestration.load_operation_log': MagicMock(return_value=[]), + } + + if 'test' in op_results or 'test_extend' in op_results: + test_result = op_results.get('test') or op_results.get('test_extend') + patches['pdd.sync_orchestration.cmd_test_main'] = MagicMock(return_value=test_result) + if 'generate' in op_results: + patches['pdd.sync_orchestration.code_generator_main'] = MagicMock(return_value=op_results['generate']) + + ctx_managers = [patch(k, v) for k, v in patches.items()] + for cm in ctx_managers: + cm.start() + try: + return sync_orchestration( + basename='budget_test', + language='python', + budget=budget, + max_attempts=1, + strength=0.5, + temperature=0.0, + skip_verify=True, + skip_tests=False, + quiet=True, + force=True, + no_steer=True, + ) + finally: + for cm in ctx_managers: + cm.stop() + + def test_sync_tracks_test_4tuple_cost(self, sync_workspace, tmp_path, monkeypatch): + """Bug: 4-tuple test result cost is dropped because result[-2] gives model name.""" + test_cost = 0.0007821 + result = self._run_sync( + sync_workspace, tmp_path, monkeypatch, + decisions=[_make_decision('test', 'Need tests')], + op_results={'test': ("test content", test_cost, "gpt-4o-mini", True)}, + ) + + total_cost = result.get('total_cost', 0.0) + assert total_cost >= test_cost, ( + f"sync total_cost should include test cost ${test_cost} " + f"but got ${total_cost:.6f}. Bug: result[-2] on 4-tuple gives model name, not cost." + ) + + def test_sync_budget_enforcement_with_test_cost(self, sync_workspace, tmp_path, monkeypatch): + """generate($0.05) + test($0.10) = $0.15 should exceed budget $0.12.""" + result = self._run_sync( + sync_workspace, tmp_path, monkeypatch, + decisions=[ + _make_decision('generate', 'Need code'), + _make_decision('test', 'Need tests'), + _make_decision('test_extend', 'More coverage'), + ], + op_results={ + 'generate': ("code", 0.05, "gpt-4o-mini"), + 'test': ("tests", 0.10, "gpt-4o-mini", True), + }, + budget=0.12, + ) + + total_cost = result.get('total_cost', 0.0) + assert total_cost >= 0.12, ( + f"Total should be >= $0.12 (gen $0.05 + test $0.10) but got ${total_cost:.4f}. " + f"Bug drops test cost so budget check never fires." + )