From 7fc6c4c6f4a89a1356d357259ba9d6744849f0a1 Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 14:56:29 -0500 Subject: [PATCH 1/3] Add failing tests for budget tracking tuple index bug (#508) Unit and E2E tests that reproduce the bug where result[-2] on a 4-tuple from cmd_test_main extracts the model name string instead of the cost float, causing test/test_extend costs to be silently dropped from budget tracking. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e_issue_508_budget_test_cost.py | 118 ++++++++++++++ ...test_e2e_issue_508_sync_budget_tracking.py | 146 ++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 tests/test_e2e_issue_508_budget_test_cost.py create mode 100644 tests/test_e2e_issue_508_sync_budget_tracking.py diff --git a/tests/test_e2e_issue_508_budget_test_cost.py b/tests/test_e2e_issue_508_budget_test_cost.py new file mode 100644 index 00000000..459223f4 --- /dev/null +++ b/tests/test_e2e_issue_508_budget_test_cost.py @@ -0,0 +1,118 @@ +"""Tests for issue #508: Budget tracker drops test/test_extend costs due to wrong tuple index. + +The bug: sync_orchestration.py line 1752 uses `result[-2]` to extract cost from operation +results. For 4-tuples (returned by cmd_test_main), `result[-2]` is the model name string, +not the cost float, so isinstance(..., (int, float)) fails and cost defaults to $0.00. + +Secondary bug: line 1777 only checks `operation == 'test'`, missing `test_extend`. +""" + +import pytest + + +class TestBudgetCostExtraction: + """Test the cost extraction logic at sync_orchestration.py:1752.""" + + def _extract_cost_like_line_1752(self, result): + """Replicates the exact buggy logic from line 1752.""" + cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 + return cost + + def _extract_cost_fixed(self, result, operation): + """What the fixed logic should do.""" + if operation in ('test', 'test_extend') and len(result) >= 4: + cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 + return cost + + def test_4_tuple_test_cost_extraction(self): + """Bug: 4-tuple from cmd_test_main has cost at index 1, but result[-2] gives index 2 (model name). + + cmd_test_main returns: (content, cost, model, agentic_success) + result[-2] for a 4-tuple = result[2] = model name (string) → isinstance check fails → $0.00 + """ + # Simulate cmd_test_main 4-tuple return + result = ("test content", 0.0007821, "gpt-4o-mini", True) + + cost = self._extract_cost_like_line_1752(result) + + # This assertion demonstrates the bug: cost should be 0.0007821 but is 0.0 + assert cost == pytest.approx(0.0007821), ( + f"Cost should be {result[1]} but got {cost}. " + f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost." + ) + + def test_4_tuple_test_extend_cost_extraction(self): + """Same bug for test_extend operation which also calls cmd_test_main.""" + result = ("test content", 0.0012345, "claude-sonnet-4-5", False) + + cost = self._extract_cost_like_line_1752(result) + + assert cost == pytest.approx(0.0012345), ( + f"test_extend cost should be {result[1]} but got {cost}." + ) + + def test_3_tuple_generate_cost_extraction(self): + """3-tuple operations (generate, etc.) work correctly with result[-2] — regression guard.""" + # 3-tuple: (content, cost, model) + result = ("generated code", 0.0005551, "gpt-4o-mini") + + cost = self._extract_cost_like_line_1752(result) + + # For 3-tuples, result[-2] = result[1] = cost float — this works by accident + assert cost == pytest.approx(0.0005551) + + def test_budget_enforcement_with_test_costs(self): + """Budget check underestimates spend when test costs are dropped. + + Simulates a sync loop where generate costs $0.05 and test costs $0.10. + With budget=$0.12, sync should stop after test. But since test cost is + dropped to $0.00, the budget check sees only $0.05 and keeps going. + """ + budget = 0.12 + current_cost = 0.0 + + # Operation 1: generate (3-tuple) — cost extracted correctly + generate_result = ("code", 0.05, "gpt-4o-mini") + cost = self._extract_cost_like_line_1752(generate_result) + current_cost += cost + + # Operation 2: test (4-tuple) — cost dropped due to bug + test_result = ("tests", 0.10, "gpt-4o-mini", True) + cost = self._extract_cost_like_line_1752(test_result) + current_cost += cost + + # With the bug, current_cost is only 0.05 (test cost dropped) + # It should be 0.15, exceeding the budget of 0.12 + assert current_cost >= budget, ( + f"Total cost should be $0.15 (exceeding budget ${budget}) " + f"but tracker shows ${current_cost:.4f} due to dropped test cost." + ) + + +class TestLoggingSectionTestExtendGap: + """Test the secondary bug: logging at line 1777 misses test_extend.""" + + def _extract_logging_cost_like_line_1777(self, result, operation): + """Replicates the logging cost extraction logic from lines 1777-1782.""" + if operation == 'test' and len(result) >= 4: + actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0 + return actual_cost + + def test_logging_test_extend_cost(self): + """Bug: logging section only checks operation == 'test', not 'test_extend'. + + test_extend also returns a 4-tuple from cmd_test_main, so the same + explicit indexing should apply. + """ + result = ("tests", 0.0012345, "claude-sonnet-4-5", True) + + actual_cost = self._extract_logging_cost_like_line_1777(result, operation='test_extend') + + assert actual_cost == pytest.approx(0.0012345), ( + f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. " + f"The logging section only checks operation == 'test', missing 'test_extend'." + ) diff --git a/tests/test_e2e_issue_508_sync_budget_tracking.py b/tests/test_e2e_issue_508_sync_budget_tracking.py new file mode 100644 index 00000000..21cbb221 --- /dev/null +++ b/tests/test_e2e_issue_508_sync_budget_tracking.py @@ -0,0 +1,146 @@ +"""E2E test for issue #508: Budget tracker drops test/test_extend costs. + +Exercises the REAL sync_orchestration function in headless mode (quiet=True), +with mocked operation functions returning 4-tuples, verifying that the actual +cost extraction logic at line 1752 correctly accumulates costs. +""" + +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pdd +from pdd.sync_orchestration import sync_orchestration +from pdd.sync_determine_operation import SyncDecision + + +@pytest.fixture(autouse=True) +def set_pdd_path(monkeypatch): + pdd_package_dir = Path(pdd.__file__).parent + monkeypatch.setenv("PDD_PATH", str(pdd_package_dir)) + + +@pytest.fixture +def sync_workspace(tmp_path): + """Create minimal workspace files.""" + prompt = tmp_path / "prompts" / "budget_test_python.prompt" + prompt.parent.mkdir(parents=True, exist_ok=True) + prompt.write_text("Test prompt") + + code = tmp_path / "src" / "budget_test.py" + code.parent.mkdir(parents=True, exist_ok=True) + code.write_text("def hello(): pass\n") + + example = tmp_path / "examples" / "budget_test_example.py" + example.parent.mkdir(parents=True, exist_ok=True) + example.write_text("from budget_test import hello\nhello()\n") + + test = tmp_path / "tests" / "test_budget_test.py" + test.parent.mkdir(parents=True, exist_ok=True) + test.write_text("def test_hello(): pass\n") + + return { + 'prompt': prompt, + 'code': code, + 'example': example, + 'test': test, + } + + +def _make_decision(operation, reason='auto'): + return SyncDecision(operation=operation, reason=reason) + + +class TestE2ESyncBudgetTracking: + """E2E: sync_orchestration must accumulate costs from 4-tuple test results.""" + + def _run_sync(self, sync_workspace, tmp_path, monkeypatch, decisions, op_results, budget=10.0): + """Run sync_orchestration with mocked operations returning specified results.""" + monkeypatch.chdir(tmp_path) + + call_count = [0] + def mock_determine(*args, **kwargs): + if call_count[0] < len(decisions): + d = decisions[call_count[0]] + call_count[0] += 1 + return d + return _make_decision('all_synced', 'Complete') + + patches = { + 'pdd.sync_orchestration.get_pdd_file_paths': MagicMock(return_value=sync_workspace), + 'pdd.sync_orchestration.sync_determine_operation': mock_determine, + 'pdd.sync_orchestration.SyncLock': MagicMock(), + 'pdd.sync_orchestration.log_event': MagicMock(), + 'pdd.sync_orchestration.append_log_entry': MagicMock(), + 'pdd.sync_orchestration._save_fingerprint_atomic': MagicMock(), + 'pdd.sync_orchestration._save_run_report_atomic': MagicMock(), + 'pdd.sync_orchestration.calculate_sha256': MagicMock(return_value='abc123'), + 'pdd.sync_orchestration.maybe_steer_operation': MagicMock(side_effect=lambda op, *a, **kw: (op, False)), + 'pdd.sync_orchestration.create_log_entry': MagicMock(return_value={'details': {}}), + 'pdd.sync_orchestration.update_log_entry': MagicMock(), + 'pdd.sync_orchestration.load_operation_log': MagicMock(return_value=[]), + } + + if 'test' in op_results or 'test_extend' in op_results: + test_result = op_results.get('test') or op_results.get('test_extend') + patches['pdd.sync_orchestration.cmd_test_main'] = MagicMock(return_value=test_result) + if 'generate' in op_results: + patches['pdd.sync_orchestration.code_generator_main'] = MagicMock(return_value=op_results['generate']) + + ctx_managers = [patch(k, v) for k, v in patches.items()] + for cm in ctx_managers: + cm.start() + try: + return sync_orchestration( + basename='budget_test', + language='python', + budget=budget, + max_attempts=1, + strength=0.5, + temperature=0.0, + skip_verify=True, + skip_tests=False, + quiet=True, + force=True, + no_steer=True, + ) + finally: + for cm in ctx_managers: + cm.stop() + + def test_sync_tracks_test_4tuple_cost(self, sync_workspace, tmp_path, monkeypatch): + """Bug: 4-tuple test result cost is dropped because result[-2] gives model name.""" + test_cost = 0.0007821 + result = self._run_sync( + sync_workspace, tmp_path, monkeypatch, + decisions=[_make_decision('test', 'Need tests')], + op_results={'test': ("test content", test_cost, "gpt-4o-mini", True)}, + ) + + total_cost = result.get('total_cost', 0.0) + assert total_cost >= test_cost, ( + f"sync total_cost should include test cost ${test_cost} " + f"but got ${total_cost:.6f}. Bug: result[-2] on 4-tuple gives model name, not cost." + ) + + def test_sync_budget_enforcement_with_test_cost(self, sync_workspace, tmp_path, monkeypatch): + """generate($0.05) + test($0.10) = $0.15 should exceed budget $0.12.""" + result = self._run_sync( + sync_workspace, tmp_path, monkeypatch, + decisions=[ + _make_decision('generate', 'Need code'), + _make_decision('test', 'Need tests'), + _make_decision('test_extend', 'More coverage'), + ], + op_results={ + 'generate': ("code", 0.05, "gpt-4o-mini"), + 'test': ("tests", 0.10, "gpt-4o-mini", True), + }, + budget=0.12, + ) + + total_cost = result.get('total_cost', 0.0) + assert total_cost >= 0.12, ( + f"Total should be >= $0.12 (gen $0.05 + test $0.10) but got ${total_cost:.4f}. " + f"Bug drops test cost so budget check never fires." + ) From 307108d3bb04fb1c10fc2c3e3439b2eae904e93c Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 15:00:11 -0500 Subject: [PATCH 2/3] fix: Budget tracker drops test/test_extend costs due to wrong tuple index Fixes #508 --- pdd/sync_orchestration.py | 7 ++-- tests/test_e2e_issue_508_budget_test_cost.py | 35 ++++++++------------ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/pdd/sync_orchestration.py b/pdd/sync_orchestration.py index d2a19d7e..d26a1a84 100644 --- a/pdd/sync_orchestration.py +++ b/pdd/sync_orchestration.py @@ -1749,7 +1749,10 @@ def __init__(self, rc, out, err): success = pdd_files['test'].exists() else: success = bool(result[0]) - cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 + if operation in ('test', 'test_extend') and len(result) >= 4: + cost = result[1] if isinstance(result[1], (int, float)) else 0.0 + else: + cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 current_cost_ref[0] += cost else: success = result is not None @@ -1774,7 +1777,7 @@ def __init__(self, rc, out, err): # cmd_test_main returns 4-tuple: (content, cost, model, agentic_success) # Other commands return 3-tuple: (content, cost, model) # Use explicit indexing for test operation to handle 4-tuple correctly - if operation == 'test' and len(result) >= 4: + if operation in ('test', 'test_extend') and len(result) >= 4: actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0 model_name = result[2] if isinstance(result[2], str) else 'unknown' else: diff --git a/tests/test_e2e_issue_508_budget_test_cost.py b/tests/test_e2e_issue_508_budget_test_cost.py index 459223f4..2650fbd7 100644 --- a/tests/test_e2e_issue_508_budget_test_cost.py +++ b/tests/test_e2e_issue_508_budget_test_cost.py @@ -13,13 +13,8 @@ class TestBudgetCostExtraction: """Test the cost extraction logic at sync_orchestration.py:1752.""" - def _extract_cost_like_line_1752(self, result): - """Replicates the exact buggy logic from line 1752.""" - cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0 - return cost - - def _extract_cost_fixed(self, result, operation): - """What the fixed logic should do.""" + def _extract_cost(self, result, operation): + """Replicates the cost extraction logic from line 1752 (should match current source).""" if operation in ('test', 'test_extend') and len(result) >= 4: cost = result[1] if isinstance(result[1], (int, float)) else 0.0 else: @@ -27,27 +22,25 @@ def _extract_cost_fixed(self, result, operation): return cost def test_4_tuple_test_cost_extraction(self): - """Bug: 4-tuple from cmd_test_main has cost at index 1, but result[-2] gives index 2 (model name). + """4-tuple from cmd_test_main has cost at index 1. cmd_test_main returns: (content, cost, model, agentic_success) - result[-2] for a 4-tuple = result[2] = model name (string) → isinstance check fails → $0.00 + The fix ensures result[1] is used for test/test_extend operations. """ - # Simulate cmd_test_main 4-tuple return result = ("test content", 0.0007821, "gpt-4o-mini", True) - cost = self._extract_cost_like_line_1752(result) + cost = self._extract_cost(result, operation='test') - # This assertion demonstrates the bug: cost should be 0.0007821 but is 0.0 assert cost == pytest.approx(0.0007821), ( f"Cost should be {result[1]} but got {cost}. " f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost." ) def test_4_tuple_test_extend_cost_extraction(self): - """Same bug for test_extend operation which also calls cmd_test_main.""" + """Same fix applies for test_extend operation which also calls cmd_test_main.""" result = ("test content", 0.0012345, "claude-sonnet-4-5", False) - cost = self._extract_cost_like_line_1752(result) + cost = self._extract_cost(result, operation='test_extend') assert cost == pytest.approx(0.0012345), ( f"test_extend cost should be {result[1]} but got {cost}." @@ -58,7 +51,7 @@ def test_3_tuple_generate_cost_extraction(self): # 3-tuple: (content, cost, model) result = ("generated code", 0.0005551, "gpt-4o-mini") - cost = self._extract_cost_like_line_1752(result) + cost = self._extract_cost(result, operation='generate') # For 3-tuples, result[-2] = result[1] = cost float — this works by accident assert cost == pytest.approx(0.0005551) @@ -75,12 +68,12 @@ def test_budget_enforcement_with_test_costs(self): # Operation 1: generate (3-tuple) — cost extracted correctly generate_result = ("code", 0.05, "gpt-4o-mini") - cost = self._extract_cost_like_line_1752(generate_result) + cost = self._extract_cost(generate_result, operation='generate') current_cost += cost - # Operation 2: test (4-tuple) — cost dropped due to bug + # Operation 2: test (4-tuple) — cost must be extracted correctly test_result = ("tests", 0.10, "gpt-4o-mini", True) - cost = self._extract_cost_like_line_1752(test_result) + cost = self._extract_cost(test_result, operation='test') current_cost += cost # With the bug, current_cost is only 0.05 (test cost dropped) @@ -94,9 +87,9 @@ def test_budget_enforcement_with_test_costs(self): class TestLoggingSectionTestExtendGap: """Test the secondary bug: logging at line 1777 misses test_extend.""" - def _extract_logging_cost_like_line_1777(self, result, operation): + def _extract_logging_cost(self, result, operation): """Replicates the logging cost extraction logic from lines 1777-1782.""" - if operation == 'test' and len(result) >= 4: + if operation in ('test', 'test_extend') and len(result) >= 4: actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0 else: actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0 @@ -110,7 +103,7 @@ def test_logging_test_extend_cost(self): """ result = ("tests", 0.0012345, "claude-sonnet-4-5", True) - actual_cost = self._extract_logging_cost_like_line_1777(result, operation='test_extend') + actual_cost = self._extract_logging_cost(result, operation='test_extend') assert actual_cost == pytest.approx(0.0012345), ( f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. " From 87476c7233019578051abd085c428128216167ba Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 16:10:53 -0500 Subject: [PATCH 3/3] ci: retry flaky test