Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions pdd/sync_orchestration.py
Original file line number Diff line number Diff line change
Expand Up @@ -1749,7 +1749,10 @@ def __init__(self, rc, out, err):
success = pdd_files['test'].exists()
else:
success = bool(result[0])
cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
if operation in ('test', 'test_extend') and len(result) >= 4:
cost = result[1] if isinstance(result[1], (int, float)) else 0.0
else:
cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
current_cost_ref[0] += cost
else:
success = result is not None
Expand All @@ -1774,7 +1777,7 @@ def __init__(self, rc, out, err):
# cmd_test_main returns 4-tuple: (content, cost, model, agentic_success)
# Other commands return 3-tuple: (content, cost, model)
# Use explicit indexing for test operation to handle 4-tuple correctly
if operation == 'test' and len(result) >= 4:
if operation in ('test', 'test_extend') and len(result) >= 4:
actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0
model_name = result[2] if isinstance(result[2], str) else 'unknown'
else:
Expand Down
111 changes: 111 additions & 0 deletions tests/test_e2e_issue_508_budget_test_cost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Tests for issue #508: Budget tracker drops test/test_extend costs due to wrong tuple index.

The bug: sync_orchestration.py line 1752 uses `result[-2]` to extract cost from operation
results. For 4-tuples (returned by cmd_test_main), `result[-2]` is the model name string,
not the cost float, so isinstance(..., (int, float)) fails and cost defaults to $0.00.

Secondary bug: line 1777 only checks `operation == 'test'`, missing `test_extend`.
"""

import pytest


class TestBudgetCostExtraction:
"""Test the cost extraction logic at sync_orchestration.py:1752."""

def _extract_cost(self, result, operation):
"""Replicates the cost extraction logic from line 1752 (should match current source)."""
if operation in ('test', 'test_extend') and len(result) >= 4:
cost = result[1] if isinstance(result[1], (int, float)) else 0.0
else:
cost = result[-2] if len(result) >= 2 and isinstance(result[-2], (int, float)) else 0.0
return cost

def test_4_tuple_test_cost_extraction(self):
"""4-tuple from cmd_test_main has cost at index 1.

cmd_test_main returns: (content, cost, model, agentic_success)
The fix ensures result[1] is used for test/test_extend operations.
"""
result = ("test content", 0.0007821, "gpt-4o-mini", True)

cost = self._extract_cost(result, operation='test')

assert cost == pytest.approx(0.0007821), (
f"Cost should be {result[1]} but got {cost}. "
f"result[-2] = {result[-2]!r} (type={type(result[-2]).__name__}) is the model name, not cost."
)

def test_4_tuple_test_extend_cost_extraction(self):
"""Same fix applies for test_extend operation which also calls cmd_test_main."""
result = ("test content", 0.0012345, "claude-sonnet-4-5", False)

cost = self._extract_cost(result, operation='test_extend')

assert cost == pytest.approx(0.0012345), (
f"test_extend cost should be {result[1]} but got {cost}."
)

def test_3_tuple_generate_cost_extraction(self):
"""3-tuple operations (generate, etc.) work correctly with result[-2] — regression guard."""
# 3-tuple: (content, cost, model)
result = ("generated code", 0.0005551, "gpt-4o-mini")

cost = self._extract_cost(result, operation='generate')

# For 3-tuples, result[-2] = result[1] = cost float — this works by accident
assert cost == pytest.approx(0.0005551)

def test_budget_enforcement_with_test_costs(self):
"""Budget check underestimates spend when test costs are dropped.

Simulates a sync loop where generate costs $0.05 and test costs $0.10.
With budget=$0.12, sync should stop after test. But since test cost is
dropped to $0.00, the budget check sees only $0.05 and keeps going.
"""
budget = 0.12
current_cost = 0.0

# Operation 1: generate (3-tuple) — cost extracted correctly
generate_result = ("code", 0.05, "gpt-4o-mini")
cost = self._extract_cost(generate_result, operation='generate')
current_cost += cost

# Operation 2: test (4-tuple) — cost must be extracted correctly
test_result = ("tests", 0.10, "gpt-4o-mini", True)
cost = self._extract_cost(test_result, operation='test')
current_cost += cost

# With the bug, current_cost is only 0.05 (test cost dropped)
# It should be 0.15, exceeding the budget of 0.12
assert current_cost >= budget, (
f"Total cost should be $0.15 (exceeding budget ${budget}) "
f"but tracker shows ${current_cost:.4f} due to dropped test cost."
)


class TestLoggingSectionTestExtendGap:
"""Test the secondary bug: logging at line 1777 misses test_extend."""

def _extract_logging_cost(self, result, operation):
"""Replicates the logging cost extraction logic from lines 1777-1782."""
if operation in ('test', 'test_extend') and len(result) >= 4:
actual_cost = result[1] if isinstance(result[1], (int, float)) else 0.0
else:
actual_cost = result[-2] if isinstance(result[-2], (int, float)) else 0.0
return actual_cost

def test_logging_test_extend_cost(self):
"""Bug: logging section only checks operation == 'test', not 'test_extend'.

test_extend also returns a 4-tuple from cmd_test_main, so the same
explicit indexing should apply.
"""
result = ("tests", 0.0012345, "claude-sonnet-4-5", True)

actual_cost = self._extract_logging_cost(result, operation='test_extend')

assert actual_cost == pytest.approx(0.0012345), (
f"Logging cost for test_extend should be {result[1]} but got {actual_cost}. "
f"The logging section only checks operation == 'test', missing 'test_extend'."
)
146 changes: 146 additions & 0 deletions tests/test_e2e_issue_508_sync_budget_tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""E2E test for issue #508: Budget tracker drops test/test_extend costs.

Exercises the REAL sync_orchestration function in headless mode (quiet=True),
with mocked operation functions returning 4-tuples, verifying that the actual
cost extraction logic at line 1752 correctly accumulates costs.
"""

import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock

import pdd
from pdd.sync_orchestration import sync_orchestration
from pdd.sync_determine_operation import SyncDecision


@pytest.fixture(autouse=True)
def set_pdd_path(monkeypatch):
pdd_package_dir = Path(pdd.__file__).parent
monkeypatch.setenv("PDD_PATH", str(pdd_package_dir))


@pytest.fixture
def sync_workspace(tmp_path):
"""Create minimal workspace files."""
prompt = tmp_path / "prompts" / "budget_test_python.prompt"
prompt.parent.mkdir(parents=True, exist_ok=True)
prompt.write_text("Test prompt")

code = tmp_path / "src" / "budget_test.py"
code.parent.mkdir(parents=True, exist_ok=True)
code.write_text("def hello(): pass\n")

example = tmp_path / "examples" / "budget_test_example.py"
example.parent.mkdir(parents=True, exist_ok=True)
example.write_text("from budget_test import hello\nhello()\n")

test = tmp_path / "tests" / "test_budget_test.py"
test.parent.mkdir(parents=True, exist_ok=True)
test.write_text("def test_hello(): pass\n")

return {
'prompt': prompt,
'code': code,
'example': example,
'test': test,
}


def _make_decision(operation, reason='auto'):
return SyncDecision(operation=operation, reason=reason)


class TestE2ESyncBudgetTracking:
"""E2E: sync_orchestration must accumulate costs from 4-tuple test results."""

def _run_sync(self, sync_workspace, tmp_path, monkeypatch, decisions, op_results, budget=10.0):
"""Run sync_orchestration with mocked operations returning specified results."""
monkeypatch.chdir(tmp_path)

call_count = [0]
def mock_determine(*args, **kwargs):
if call_count[0] < len(decisions):
d = decisions[call_count[0]]
call_count[0] += 1
return d
return _make_decision('all_synced', 'Complete')

patches = {
'pdd.sync_orchestration.get_pdd_file_paths': MagicMock(return_value=sync_workspace),
'pdd.sync_orchestration.sync_determine_operation': mock_determine,
'pdd.sync_orchestration.SyncLock': MagicMock(),
'pdd.sync_orchestration.log_event': MagicMock(),
'pdd.sync_orchestration.append_log_entry': MagicMock(),
'pdd.sync_orchestration._save_fingerprint_atomic': MagicMock(),
'pdd.sync_orchestration._save_run_report_atomic': MagicMock(),
'pdd.sync_orchestration.calculate_sha256': MagicMock(return_value='abc123'),
'pdd.sync_orchestration.maybe_steer_operation': MagicMock(side_effect=lambda op, *a, **kw: (op, False)),
'pdd.sync_orchestration.create_log_entry': MagicMock(return_value={'details': {}}),
'pdd.sync_orchestration.update_log_entry': MagicMock(),
'pdd.sync_orchestration.load_operation_log': MagicMock(return_value=[]),
}

if 'test' in op_results or 'test_extend' in op_results:
test_result = op_results.get('test') or op_results.get('test_extend')
patches['pdd.sync_orchestration.cmd_test_main'] = MagicMock(return_value=test_result)
if 'generate' in op_results:
patches['pdd.sync_orchestration.code_generator_main'] = MagicMock(return_value=op_results['generate'])

ctx_managers = [patch(k, v) for k, v in patches.items()]
for cm in ctx_managers:
cm.start()
try:
return sync_orchestration(
basename='budget_test',
language='python',
budget=budget,
max_attempts=1,
strength=0.5,
temperature=0.0,
skip_verify=True,
skip_tests=False,
quiet=True,
force=True,
no_steer=True,
)
finally:
for cm in ctx_managers:
cm.stop()

def test_sync_tracks_test_4tuple_cost(self, sync_workspace, tmp_path, monkeypatch):
"""Bug: 4-tuple test result cost is dropped because result[-2] gives model name."""
test_cost = 0.0007821
result = self._run_sync(
sync_workspace, tmp_path, monkeypatch,
decisions=[_make_decision('test', 'Need tests')],
op_results={'test': ("test content", test_cost, "gpt-4o-mini", True)},
)

total_cost = result.get('total_cost', 0.0)
assert total_cost >= test_cost, (
f"sync total_cost should include test cost ${test_cost} "
f"but got ${total_cost:.6f}. Bug: result[-2] on 4-tuple gives model name, not cost."
)

def test_sync_budget_enforcement_with_test_cost(self, sync_workspace, tmp_path, monkeypatch):
"""generate($0.05) + test($0.10) = $0.15 should exceed budget $0.12."""
result = self._run_sync(
sync_workspace, tmp_path, monkeypatch,
decisions=[
_make_decision('generate', 'Need code'),
_make_decision('test', 'Need tests'),
_make_decision('test_extend', 'More coverage'),
],
op_results={
'generate': ("code", 0.05, "gpt-4o-mini"),
'test': ("tests", 0.10, "gpt-4o-mini", True),
},
budget=0.12,
)

total_cost = result.get('total_cost', 0.0)
assert total_cost >= 0.12, (
f"Total should be >= $0.12 (gen $0.05 + test $0.10) but got ${total_cost:.4f}. "
f"Bug drops test cost so budget check never fires."
)
Loading