From 7fc6c4c6f4a89a1356d357259ba9d6744849f0a1 Mon Sep 17 00:00:00 2001
From: Serhan <serhanasad2013@live.com>
Date: Thu, 12 Feb 2026 14:56:29 -0500
Subject: [PATCH 1/3] Add failing tests for budget tracking tuple index bug
 (#508)

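Add unit and E2E tests that reproduce the bug in which result[-2], applied
to the 4-tuple (content, cost, model, agentic_success) returned by
cmd_test_main, picks up the model-name string instead of the cost float.
The isinstance check then fails and the cost falls back to 0.0, so
test/test_extend costs are silently dropped from budget tracking.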

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_e2e_issue_508_budget_test_cost.py  | 118 ++++++++++++++
 ...test_e2e_issue_508_sync_budget_tracking.py | 146 ++++++++++++++++++
 2 files changed, 264 insertions(+)
 create mode 100644 tests/test_e2e_issue_508_budget_test_cost.py
 create mode 100644 tests/test_e2e_issue_508_sync_budget_tracking.py

diff --git a/tests/test_e2e_issue_508_budget_test_cost.py b/tests/test_e2e_issue_508_budget_test_cost.py
new file mode 100644
index 00000000..459223f4
--- /dev/null
+++ b/tests/test_e2e_issue_508_budget_test_cost.py
@@ -0,0 +1,118 @@
+"""Tests for issue #508: Budget tracker drops test/test_extend costs due to wrong tuple index.
+
+The bug: sync_orchestration.py line 1752 uses `result[-2]` to extract cost from operation
+results. For 4-tuples (returned by cmd_test_main), `result[-2]` is the model name string,
+not the cost float, so isinstance(..., (int, float)) fails and cost defaults to $0.00.
+
+Secondary bug: line 1777 only checks `operation == 'test'`, missing `test_extend`.
+"""
+
+import pytest
+
+
+class TestBudgetCostExtraction:
+    """Test the cost extraction logic at sync_orchestration.py:1752."""
+
+    def _extract_cost_like_line_1752(self, result):
+        """Replicates the exact buggy logic from line 1752."""
+        cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
+        return cost
+
+    def _extract_cost_fixed(self, result, operation):
+        """What the fixed logic should do."""
+        if operation in ('test', 'test_extend') and len(result) >= 4:
+            cost = result[1] if isinstance(result[1], (int, float)) else 0.0
+        else:
+            cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
+        return cost
+
+    def test_4_tuple_test_cost_extraction(self):
+        """Bug: 4-tuple from cmd_test_main has cost at index 1, but result[-2] gives index 2 (model name).
+
+        cmd_test_main returns: (content, cost, model, agentic_success)
+        result[-2] for a 4-tuple = result[2] = model name (string) → isinstance check fails → $0.00
+        """
+        # Simulate cmd_test_main 4-tuple return
+        result = ("test content", 0.0007821, "gpt-4o-mini", True)
+
+        cost = self._extract_cost_like_line_1752(result)
+
+        # This assertion demonstrates the bug: cost should be 0.0007821 but is 0.0
+        assert cost == pytest.approx(0.0007821), (
+            f"Cost should be {result[1]} but got {cost}. "
+            f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost."
+        )
+
+    def test_4_tuple_test_extend_cost_extraction(self):
+        """Same bug for test_extend operation which also calls cmd_test_main."""
+        result = ("test content", 0.0012345, "claude-sonnet-4-5", False)
+
+        cost = self._extract_cost_like_line_1752(result)
+
+        assert cost == pytest.approx(0.0012345), (
+            f"test_extend cost should be {result[1]} but got {cost}."
+        )
+
+    def test_3_tuple_generate_cost_extraction(self):
+        """3-tuple operations (generate, etc.) work correctly with result[-2] — regression guard."""
+        # 3-tuple: (content, cost, model)
+        result = ("generated code", 0.0005551, "gpt-4o-mini")
+
+        cost = self._extract_cost_like_line_1752(result)
+
+        # For 3-tuples, result[-2] = result[1] = cost float — this works by accident
+        assert cost == pytest.approx(0.0005551)
+
+    def test_budget_enforcement_with_test_costs(self):
+        """Budget check underestimates spend when test costs are dropped.
+
+        Simulates a sync loop where generate costs $0.05 and test costs $0.10.
+        With budget=$0.12, sync should stop after test. But since test cost is
+        dropped to $0.00, the budget check sees only $0.05 and keeps going.
+        """
+        budget = 0.12
+        current_cost = 0.0
+
+        # Operation 1: generate (3-tuple) — cost extracted correctly
+        generate_result = ("code", 0.05, "gpt-4o-mini")
+        cost = self._extract_cost_like_line_1752(generate_result)
+        current_cost += cost
+
+        # Operation 2: test (4-tuple) — cost dropped due to bug
+        test_result = ("tests", 0.10, "gpt-4o-mini", True)
+        cost = self._extract_cost_like_line_1752(test_result)
+        current_cost += cost
+
+        # With the bug, current_cost is only 0.05 (test cost dropped)
+        # It should be 0.15, exceeding the budget of 0.12
+        assert current_cost >= budget, (
+            f"Total cost should be $0.15 (exceeding budget ${budget}) "
+            f"but tracker shows ${current_cost:.4f} due to dropped test cost."
+        )
+
+
+class TestLoggingSectionTestExtendGap:
+    """Test the secondary bug: logging at line 1777 misses test_extend."""
+
+    def _extract_logging_cost_like_line_1777(self, result, operation):
+        """Replicates the logging cost extraction logic from lines 1777-1782."""
+        if operation == 'test' and len(result) >= 4:
+            actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0
+        else:
+            actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0
+        return actual_cost
+
+    def test_logging_test_extend_cost(self):
+        """Bug: logging section only checks operation == 'test', not 'test_extend'.
+
+        test_extend also returns a 4-tuple from cmd_test_main, so the same
+        explicit indexing should apply.
+        """
+        result = ("tests", 0.0012345, "claude-sonnet-4-5", True)
+
+        actual_cost = self._extract_logging_cost_like_line_1777(result, operation='test_extend')
+
+        assert actual_cost == pytest.approx(0.0012345), (
+            f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. "
+            f"The logging section only checks operation == 'test', missing 'test_extend'."
+        )
diff --git a/tests/test_e2e_issue_508_sync_budget_tracking.py b/tests/test_e2e_issue_508_sync_budget_tracking.py
new file mode 100644
index 00000000..21cbb221
--- /dev/null
+++ b/tests/test_e2e_issue_508_sync_budget_tracking.py
@@ -0,0 +1,146 @@
+"""E2E test for issue #508: Budget tracker drops test/test_extend costs.
+
+Exercises the REAL sync_orchestration function in headless mode (quiet=True),
+with mocked operation functions returning 4-tuples, verifying that the actual
+cost extraction logic at line 1752 correctly accumulates costs.
+"""
+
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pdd
+from pdd.sync_orchestration import sync_orchestration
+from pdd.sync_determine_operation import SyncDecision
+
+
+@pytest.fixture(autouse=True)
+def set_pdd_path(monkeypatch):
+    pdd_package_dir = Path(pdd.__file__).parent
+    monkeypatch.setenv("PDD_PATH", str(pdd_package_dir))
+
+
+@pytest.fixture
+def sync_workspace(tmp_path):
+    """Create minimal workspace files."""
+    prompt = tmp_path / "prompts" / "budget_test_python.prompt"
+    prompt.parent.mkdir(parents=True, exist_ok=True)
+    prompt.write_text("Test prompt")
+
+    code = tmp_path / "src" / "budget_test.py"
+    code.parent.mkdir(parents=True, exist_ok=True)
+    code.write_text("def hello(): pass\n")
+
+    example = tmp_path / "examples" / "budget_test_example.py"
+    example.parent.mkdir(parents=True, exist_ok=True)
+    example.write_text("from budget_test import hello\nhello()\n")
+
+    test = tmp_path / "tests" / "test_budget_test.py"
+    test.parent.mkdir(parents=True, exist_ok=True)
+    test.write_text("def test_hello(): pass\n")
+
+    return {
+        'prompt': prompt,
+        'code': code,
+        'example': example,
+        'test': test,
+    }
+
+
+def _make_decision(operation, reason='auto'):
+    return SyncDecision(operation=operation, reason=reason)
+
+
+class TestE2ESyncBudgetTracking:
+    """E2E: sync_orchestration must accumulate costs from 4-tuple test results."""
+
+    def _run_sync(self, sync_workspace, tmp_path, monkeypatch, decisions, op_results, budget=10.0):
+        """Run sync_orchestration with mocked operations returning specified results."""
+        monkeypatch.chdir(tmp_path)
+
+        call_count = [0]
+        def mock_determine(*args, **kwargs):
+            if call_count[0] < len(decisions):
+                d = decisions[call_count[0]]
+                call_count[0] += 1
+                return d
+            return _make_decision('all_synced', 'Complete')
+
+        patches = {
+            'pdd.sync_orchestration.get_pdd_file_paths': MagicMock(return_value=sync_workspace),
+            'pdd.sync_orchestration.sync_determine_operation': mock_determine,
+            'pdd.sync_orchestration.SyncLock': MagicMock(),
+            'pdd.sync_orchestration.log_event': MagicMock(),
+            'pdd.sync_orchestration.append_log_entry': MagicMock(),
+            'pdd.sync_orchestration._save_fingerprint_atomic': MagicMock(),
+            'pdd.sync_orchestration._save_run_report_atomic': MagicMock(),
+            'pdd.sync_orchestration.calculate_sha256': MagicMock(return_value='abc123'),
+            'pdd.sync_orchestration.maybe_steer_operation': MagicMock(side_effect=lambda op, *a, **kw: (op, False)),
+            'pdd.sync_orchestration.create_log_entry': MagicMock(return_value={'details': {}}),
+            'pdd.sync_orchestration.update_log_entry': MagicMock(),
+            'pdd.sync_orchestration.load_operation_log': MagicMock(return_value=[]),
+        }
+
+        if 'test' in op_results or 'test_extend' in op_results:
+            test_result = op_results.get('test') or op_results.get('test_extend')
+            patches['pdd.sync_orchestration.cmd_test_main'] = MagicMock(return_value=test_result)
+        if 'generate' in op_results:
+            patches['pdd.sync_orchestration.code_generator_main'] = MagicMock(return_value=op_results['generate'])
+
+        ctx_managers = [patch(k, v) for k, v in patches.items()]
+        for cm in ctx_managers:
+            cm.start()
+        try:
+            return sync_orchestration(
+                basename='budget_test',
+                language='python',
+                budget=budget,
+                max_attempts=1,
+                strength=0.5,
+                temperature=0.0,
+                skip_verify=True,
+                skip_tests=False,
+                quiet=True,
+                force=True,
+                no_steer=True,
+            )
+        finally:
+            for cm in ctx_managers:
+                cm.stop()
+
+    def test_sync_tracks_test_4tuple_cost(self, sync_workspace, tmp_path, monkeypatch):
+        """Bug: 4-tuple test result cost is dropped because result[-2] gives model name."""
+        test_cost = 0.0007821
+        result = self._run_sync(
+            sync_workspace, tmp_path, monkeypatch,
+            decisions=[_make_decision('test', 'Need tests')],
+            op_results={'test': ("test content", test_cost, "gpt-4o-mini", True)},
+        )
+
+        total_cost = result.get('total_cost', 0.0)
+        assert total_cost >= test_cost, (
+            f"sync total_cost should include test cost ${test_cost} "
+            f"but got ${total_cost:.6f}. Bug: result[-2] on 4-tuple gives model name, not cost."
+        )
+
+    def test_sync_budget_enforcement_with_test_cost(self, sync_workspace, tmp_path, monkeypatch):
+        """generate($0.05) + test($0.10) = $0.15 should exceed budget $0.12."""
+        result = self._run_sync(
+            sync_workspace, tmp_path, monkeypatch,
+            decisions=[
+                _make_decision('generate', 'Need code'),
+                _make_decision('test', 'Need tests'),
+                _make_decision('test_extend', 'More coverage'),
+            ],
+            op_results={
+                'generate': ("code", 0.05, "gpt-4o-mini"),
+                'test': ("tests", 0.10, "gpt-4o-mini", True),
+            },
+            budget=0.12,
+        )
+
+        total_cost = result.get('total_cost', 0.0)
+        assert total_cost >= 0.12, (
+            f"Total should be >= $0.12 (gen $0.05 + test $0.10) but got ${total_cost:.4f}. "
+            f"Bug drops test cost so budget check never fires."
+        )

From 307108d3bb04fb1c10fc2c3e3439b2eae904e93c Mon Sep 17 00:00:00 2001
From: Serhan <serhanasad2013@live.com>
Date: Thu, 12 Feb 2026 15:00:11 -0500
Subject: [PATCH 2/3] fix: Budget tracker drops test/test_extend costs due to
 wrong tuple index

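cmd_test_main returns a 4-tuple (content, cost, model, agentic_success),
so result[-2] is the model name rather than the cost. For 'test' and
'test_extend' operations with a result of length >= 4, read the cost from
result[1]; all other operations keep the existing result[-2] lookup.

Also extend the logging branch, which previously special-cased only
operation == 'test', to cover 'test_extend' as well.

Update the issue #508 unit tests so their helper mirrors the fixed
extraction logic instead of the old buggy expression.

Fixes #508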
---
 pdd/sync_orchestration.py                    |  7 ++--
 tests/test_e2e_issue_508_budget_test_cost.py | 35 ++++++++------------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/pdd/sync_orchestration.py b/pdd/sync_orchestration.py
index d2a19d7e..d26a1a84 100644
--- a/pdd/sync_orchestration.py
+++ b/pdd/sync_orchestration.py
@@ -1749,7 +1749,10 @@ def __init__(self, rc, out, err):
                                         success = pdd_files['test'].exists()
                                 else:
                                     success = bool(result[0])
-                                cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
+                                if operation in ('test', 'test_extend') and len(result) >= 4:
+                                    cost = result[1] if isinstance(result[1], (int, float)) else 0.0
+                                else:
+                                    cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
                                 current_cost_ref[0] += cost
                             else:
                                 success = result is not None
@@ -1774,7 +1777,7 @@ def __init__(self, rc, out, err):
                                  # cmd_test_main returns 4-tuple: (content, cost, model, agentic_success)
                                  # Other commands return 3-tuple: (content, cost, model)
                                  # Use explicit indexing for test operation to handle 4-tuple correctly
-                                 if operation == 'test' and len(result) >= 4:
+                                 if operation in ('test', 'test_extend') and len(result) >= 4:
                                      actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0
                                      model_name = result[2] if isinstance(result[2], str) else 'unknown'
                                  else:
diff --git a/tests/test_e2e_issue_508_budget_test_cost.py b/tests/test_e2e_issue_508_budget_test_cost.py
index 459223f4..2650fbd7 100644
--- a/tests/test_e2e_issue_508_budget_test_cost.py
+++ b/tests/test_e2e_issue_508_budget_test_cost.py
@@ -13,13 +13,8 @@
 class TestBudgetCostExtraction:
     """Test the cost extraction logic at sync_orchestration.py:1752."""
 
-    def _extract_cost_like_line_1752(self, result):
-        """Replicates the exact buggy logic from line 1752."""
-        cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
-        return cost
-
-    def _extract_cost_fixed(self, result, operation):
-        """What the fixed logic should do."""
+    def _extract_cost(self, result, operation):
+        """Replicates the cost extraction logic from line 1752 (should match current source)."""
         if operation in ('test', 'test_extend') and len(result) >= 4:
             cost = result[1] if isinstance(result[1], (int, float)) else 0.0
         else:
@@ -27,27 +22,25 @@ def _extract_cost_fixed(self, result, operation):
         return cost
 
     def test_4_tuple_test_cost_extraction(self):
-        """Bug: 4-tuple from cmd_test_main has cost at index 1, but result[-2] gives index 2 (model name).
+        """4-tuple from cmd_test_main has cost at index 1.
 
         cmd_test_main returns: (content, cost, model, agentic_success)
-        result[-2] for a 4-tuple = result[2] = model name (string) → isinstance check fails → $0.00
+        The fix ensures result[1] is used for test/test_extend operations.
         """
-        # Simulate cmd_test_main 4-tuple return
         result = ("test content", 0.0007821, "gpt-4o-mini", True)
 
-        cost = self._extract_cost_like_line_1752(result)
+        cost = self._extract_cost(result, operation='test')
 
-        # This assertion demonstrates the bug: cost should be 0.0007821 but is 0.0
         assert cost == pytest.approx(0.0007821), (
             f"Cost should be {result[1]} but got {cost}. "
             f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost."
         )
 
     def test_4_tuple_test_extend_cost_extraction(self):
-        """Same bug for test_extend operation which also calls cmd_test_main."""
+        """Same fix applies for test_extend operation which also calls cmd_test_main."""
         result = ("test content", 0.0012345, "claude-sonnet-4-5", False)
 
-        cost = self._extract_cost_like_line_1752(result)
+        cost = self._extract_cost(result, operation='test_extend')
 
         assert cost == pytest.approx(0.0012345), (
             f"test_extend cost should be {result[1]} but got {cost}."
@@ -58,7 +51,7 @@ def test_3_tuple_generate_cost_extraction(self):
         # 3-tuple: (content, cost, model)
         result = ("generated code", 0.0005551, "gpt-4o-mini")
 
-        cost = self._extract_cost_like_line_1752(result)
+        cost = self._extract_cost(result, operation='generate')
 
         # For 3-tuples, result[-2] = result[1] = cost float — this works by accident
         assert cost == pytest.approx(0.0005551)
@@ -75,12 +68,12 @@ def test_budget_enforcement_with_test_costs(self):
 
         # Operation 1: generate (3-tuple) — cost extracted correctly
         generate_result = ("code", 0.05, "gpt-4o-mini")
-        cost = self._extract_cost_like_line_1752(generate_result)
+        cost = self._extract_cost(generate_result, operation='generate')
         current_cost += cost
 
-        # Operation 2: test (4-tuple) — cost dropped due to bug
+        # Operation 2: test (4-tuple) — cost must be extracted correctly
         test_result = ("tests", 0.10, "gpt-4o-mini", True)
-        cost = self._extract_cost_like_line_1752(test_result)
+        cost = self._extract_cost(test_result, operation='test')
         current_cost += cost
 
         # With the bug, current_cost is only 0.05 (test cost dropped)
@@ -94,9 +87,9 @@ def test_budget_enforcement_with_test_costs(self):
 class TestLoggingSectionTestExtendGap:
     """Test the secondary bug: logging at line 1777 misses test_extend."""
 
-    def _extract_logging_cost_like_line_1777(self, result, operation):
+    def _extract_logging_cost(self, result, operation):
         """Replicates the logging cost extraction logic from lines 1777-1782."""
-        if operation == 'test' and len(result) >= 4:
+        if operation in ('test', 'test_extend') and len(result) >= 4:
             actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0
         else:
             actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0
@@ -110,7 +103,7 @@ def test_logging_test_extend_cost(self):
         """
         result = ("tests", 0.0012345, "claude-sonnet-4-5", True)
 
-        actual_cost = self._extract_logging_cost_like_line_1777(result, operation='test_extend')
+        actual_cost = self._extract_logging_cost(result, operation='test_extend')
 
         assert actual_cost == pytest.approx(0.0012345), (
             f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. "

From 87476c7233019578051abd085c428128216167ba Mon Sep 17 00:00:00 2001
From: Serhan <serhanasad2013@live.com>
Date: Thu, 12 Feb 2026 16:10:53 -0500
Subject: [PATCH 3/3] ci: retry flaky test