From f05ad9e8d4cee40e608dcfd136fa2cf2785496c6 Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 14:57:42 -0500 Subject: [PATCH 1/7] Add failing tests for retry cost under-reporting bug (#509) Unit and E2E tests that verify _LAST_CALLBACK_DATA["cost"] accumulates across retries instead of being overwritten. Tests currently fail, confirming the bug. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e_issue_509_retry_cost.py | 236 +++++++++++++++++++++++++ tests/test_llm_invoke_retry_cost.py | 236 +++++++++++++++++++++++++ 2 files changed, 472 insertions(+) create mode 100644 tests/test_e2e_issue_509_retry_cost.py create mode 100644 tests/test_llm_invoke_retry_cost.py diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py new file mode 100644 index 00000000..78fd187c --- /dev/null +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -0,0 +1,236 @@ +""" +E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. + +This test exercises the generate command through Click's CliRunner, simulating a scenario +where llm_invoke retries due to None content. The test verifies that the cost returned +by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry +call costs, not just the retry cost. + +The bug: _LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, +so the cost from the first LLM call is silently lost. +""" + +import csv +import os +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +from click.testing import CliRunner + +from pdd.llm_invoke import _LAST_CALLBACK_DATA + + +@pytest.fixture(autouse=True) +def set_pdd_path(monkeypatch): + """Set PDD_PATH so language detection can find language_format.csv.""" + import pdd + pdd_package_dir = Path(pdd.__file__).parent + monkeypatch.setenv("PDD_PATH", str(pdd_package_dir)) + + +@pytest.fixture(autouse=True) +def reset_callback_data(): + """Reset _LAST_CALLBACK_DATA before each test.""" + _LAST_CALLBACK_DATA["cost"] = 0.0 + _LAST_CALLBACK_DATA["input_tokens"] = 0 + _LAST_CALLBACK_DATA["output_tokens"] = 0 + _LAST_CALLBACK_DATA["finish_reason"] = None + yield + + +def _make_response(content, prompt_tokens=100, completion_tokens=50): + resp = MagicMock() + choice = MagicMock() + choice.message.content = content + choice.finish_reason = "stop" + resp.choices = [choice] + usage = MagicMock() + usage.prompt_tokens = prompt_tokens + usage.completion_tokens = completion_tokens + resp.usage = usage + return resp + + +class TestE2ERetryCostAccumulation: + """E2E test: llm_invoke cost accumulation through the generate command path.""" + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeypatch): + """ + Exercise the full generate -> llm_invoke -> retry path and verify + that the cost returned includes both the original and retry call costs. + + This simulates a user running `pdd generate` where the first LLM call + returns None content, triggering a cache-bypass retry. The total cost + should be $0.05 + $0.03 = $0.08, but the bug causes it to report $0.03. 
+ """ + monkeypatch.chdir(tmp_path) + + # Create a prompt file + prompt_file = tmp_path / "hello_python.prompt" + prompt_file.write_text("Write a Python function that prints hello world.\n{input}") + + # Set up mock model data + mock_model = { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + import pandas as pd + + first_call_cost = 0.05 + retry_call_cost = 0.03 + expected_total = first_call_cost + retry_call_cost # $0.08 + + first_resp = _make_response(content=None) # Triggers retry + retry_resp = _make_response(content='def hello():\n print("Hello world")\n') + + call_count = [0] + + def mock_completion(**kwargs): + call_count[0] += 1 + if call_count[0] == 1: + _LAST_CALLBACK_DATA["cost"] = first_call_cost + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return first_resp + else: + _LAST_CALLBACK_DATA["cost"] = retry_call_cost + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return retry_resp + + # Patch at the llm_invoke level to control LLM behavior + with patch("pdd.llm_invoke._load_model_data") as mock_load, \ + patch("pdd.llm_invoke._select_model_candidates") as mock_select, \ + patch("pdd.llm_invoke._ensure_api_key", return_value=True), \ + patch("pdd.llm_invoke.litellm") as mock_litellm: + + mock_load.return_value = pd.DataFrame([mock_model]) + mock_select.return_value = [mock_model] + mock_litellm.completion = mock_completion + mock_litellm.cache = MagicMock() # Non-None triggers cache-bypass retry path + mock_litellm.drop_params = True + + # Call llm_invoke directly as the generate command would + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Write a Python function that prints hello world.\n{input}", + input_json={"input": ""}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + # Verify 2 LLM calls were made (original + retry) + assert call_count[0] == 2, ( + f"Expected 2 LLM calls (original + retry), got {call_count[0]}" + ) + + # THE BUG CHECK: cost should be accumulated ($0.05 + $0.03 = $0.08) + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"ISSUE #509 BUG DETECTED: Retry cost overwrites instead of accumulating!\n\n" + f" First call cost: ${first_call_cost:.2f}\n" + f" Retry call cost: ${retry_call_cost:.2f}\n" + f" Expected total: ${expected_total:.2f}\n" + f" Reported total: ${reported_cost:.2f}\n" + f" Lost cost: ${expected_total - reported_cost:.2f}\n\n" + f"The --output-cost CSV would under-report by ${expected_total - reported_cost:.2f} per retry." + ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch): + """ + E2E test: When budget is tight, under-reported retry costs could allow overspend. + + Simulates a scenario where a user sets --budget and multiple retries occur. + The budget check should see the accumulated cost, not just the last retry's cost. 
+ """ + monkeypatch.chdir(tmp_path) + + mock_model = { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + import pandas as pd + + # Two retries: costs should accumulate to $0.15 + costs = [0.05, 0.05, 0.05] + expected_total = sum(costs) + + # First two calls return None, third succeeds + responses = [ + _make_response(content=None), + _make_response(content=None), + _make_response(content='print("hello")'), + ] + + call_count = [0] + + def mock_completion(**kwargs): + idx = call_count[0] + call_count[0] += 1 + # Only first call can trigger cache-bypass retry; subsequent are direct + if idx < len(costs): + _LAST_CALLBACK_DATA["cost"] = costs[idx] + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return responses[idx] + # Fallback + _LAST_CALLBACK_DATA["cost"] = 0.05 + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return responses[-1] + + with patch("pdd.llm_invoke._load_model_data") as mock_load, \ + patch("pdd.llm_invoke._select_model_candidates") as mock_select, \ + patch("pdd.llm_invoke._ensure_api_key", return_value=True), \ + patch("pdd.llm_invoke.litellm") as mock_litellm: + + mock_load.return_value = pd.DataFrame([mock_model]) + mock_select.return_value = [mock_model] + mock_litellm.completion = mock_completion + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + + # The None-content retry path does one retry, so 2 calls made. + # Expected accumulated cost: $0.05 + $0.05 = $0.10 + # Bug reports only the retry cost: $0.05 + assert reported_cost == pytest.approx(costs[0] + costs[1]), ( + f"ISSUE #509 BUG: Budget enforcement sees only ${reported_cost:.2f}, " + f"but actual spend was ${costs[0] + costs[1]:.2f} (2 calls). " + f"This allows overspend when --budget is set." + ) diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py new file mode 100644 index 00000000..0f222310 --- /dev/null +++ b/tests/test_llm_invoke_retry_cost.py @@ -0,0 +1,236 @@ +"""Tests for issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. + +When llm_invoke() retries due to None content, malformed JSON, or invalid Python syntax, +the success callback overwrites _LAST_CALLBACK_DATA["cost"] with only the retry's cost, +silently losing the original call's cost. 
+""" + +import pytest +import os +import pandas as pd +from unittest.mock import patch, MagicMock +from pdd.llm_invoke import _LAST_CALLBACK_DATA + + +def _make_mock_model(): + return { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + +def _make_response(content, prompt_tokens=100, completion_tokens=50): + resp = MagicMock() + choice = MagicMock() + choice.message.content = content + choice.finish_reason = "stop" + resp.choices = [choice] + usage = MagicMock() + usage.prompt_tokens = prompt_tokens + usage.completion_tokens = completion_tokens + resp.usage = usage + return resp + + +def _make_completion_side_effect(responses_and_costs): + """Create a side_effect function that sets _LAST_CALLBACK_DATA on each call. + + responses_and_costs: list of (response, cost) tuples + """ + call_count = [0] + + def side_effect(**kwargs): + idx = call_count[0] + call_count[0] += 1 + resp, cost = responses_and_costs[idx] + # Simulate the callback overwriting _LAST_CALLBACK_DATA + _LAST_CALLBACK_DATA["cost"] = cost + _LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens + _LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens + return resp + + return side_effect + + +@pytest.fixture(autouse=True) +def reset_callback_data(): + """Reset _LAST_CALLBACK_DATA before each test.""" + _LAST_CALLBACK_DATA["cost"] = 0.0 + _LAST_CALLBACK_DATA["input_tokens"] = 0 + _LAST_CALLBACK_DATA["output_tokens"] = 0 + _LAST_CALLBACK_DATA["finish_reason"] = None + yield + + +class TestRetryCostAccumulation: + """Tests that retry costs are accumulated, not overwritten.""" + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_none_content_retry_accumulates_cost( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When first call returns None content and retry succeeds, + the reported cost should include BOTH calls' costs.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() # Non-None so retry path triggers + mock_litellm.drop_params = True + + first_call_cost = 0.05 + retry_call_cost = 0.03 + expected_total = first_call_cost + retry_call_cost + + first_resp = _make_response(content=None) # Triggers None-content retry + retry_resp = _make_response(content="Hello world") + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, first_call_cost), + (retry_resp, retry_call_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), " + f"but got {reported_cost}. Retry cost overwrote instead of accumulating." 
+ ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_malformed_json_retry_accumulates_cost( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When first call returns malformed JSON and retry succeeds, + the reported cost should include BOTH calls' costs.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + first_call_cost = 0.04 + retry_call_cost = 0.04 + expected_total = first_call_cost + retry_call_cost + + # Malformed JSON: excessive trailing newlines + malformed = '{"key": "value' + '\n' * 500 + first_resp = _make_response(content=malformed) + retry_resp = _make_response(content='{"key": "value"}') + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, first_call_cost), + (retry_resp, retry_call_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), " + f"but got {reported_cost}. Retry cost overwrote instead of accumulating." + ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_no_retry_cost_unchanged( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When no retry occurs, cost should be reported normally (regression guard).""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = None + mock_litellm.drop_params = True + + single_cost = 0.05 + resp = _make_response(content="Hello world") + + mock_litellm.completion = _make_completion_side_effect([ + (resp, single_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + assert result["cost"] == pytest.approx(single_cost) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_none_content_retry_accumulates_tokens( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """Token counts should also accumulate across retries.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + first_resp = _make_response(content=None, prompt_tokens=1000, completion_tokens=500) + retry_resp = _make_response(content="Hello", prompt_tokens=1000, completion_tokens=500) + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, 0.05), + (retry_resp, 0.03), + ]) + + from 
pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + # After fix, tokens should be accumulated (2000 input, 1000 output) + # Currently they get overwritten to just the retry's tokens + reported_cost = result["cost"] + assert reported_cost == pytest.approx(0.08), ( + f"Expected accumulated cost of 0.08, got {reported_cost}" + ) From 928ac9861de8902517036ba9ee24abd303f3be57 Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 15:20:17 -0500 Subject: [PATCH 2/7] fix: Cost under-reporting: retry calls overwrite original call cost in _LAST_CALLBACK_DATA Fixes #509 --- pdd/llm_invoke.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index 24c6ba90..b5e3a365 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -863,6 +863,12 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool: if trailing_newline_count >= threshold: return True + # Also check for excessive actual trailing newlines (not just escaped \\n) + # This catches cases where raw newlines cause truncation + actual_newline_count = len(content) - len(content.rstrip('\n')) + if actual_newline_count >= threshold and not stripped.endswith('}'): + return True + # Also check for response that looks truncated mid-string # (ends with characters that suggest we're inside a JSON string value) if not stripped.endswith('}') and not stripped.endswith(']') and not stripped.endswith('"'): @@ -2449,6 +2455,10 @@ def calc_strength(candidate): retry_messages = _format_messages(modified_prompt, input_json, use_batch_mode) # Disable cache for retry litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2457,6 +2467,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache - restore original configured cache (restore to original state, even if None) litellm.cache = configured_cache # Extract result from retry @@ -2489,6 +2503,10 @@ def calc_strength(candidate): # Disable cache for retry original_cache = litellm.cache litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2497,6 +2515,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = 
_LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache litellm.cache = original_cache # Extract result from retry @@ -2727,6 +2749,10 @@ def calc_strength(candidate): # Disable cache for retry original_cache = litellm.cache litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2735,6 +2761,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache litellm.cache = original_cache # Extract and re-parse the retry result From 615f32d82b92561d1efbd7deb2307b7a368e26c4 Mon Sep 17 00:00:00 2001 From: Serhan Date: Fri, 13 Feb 2026 17:56:44 -0500 Subject: [PATCH 3/7] fix: test isolation bug - use module ref for _LAST_CALLBACK_DATA to survive importlib.reload Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e_issue_509_retry_cost.py | 40 +++++++++++++------------- tests/test_llm_invoke_retry_cost.py | 26 ++++++++--------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py index 78fd187c..c8f43e3e 100644 --- a/tests/test_e2e_issue_509_retry_cost.py +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -1,12 +1,12 @@ """ -E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. +E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. This test exercises the generate command through Click's CliRunner, simulating a scenario where llm_invoke retries due to None content. The test verifies that the cost returned by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry call costs, not just the retry cost. -The bug: _LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, +The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, so the cost from the first LLM call is silently lost. 
""" @@ -17,7 +17,7 @@ from unittest.mock import patch, MagicMock from click.testing import CliRunner -from pdd.llm_invoke import _LAST_CALLBACK_DATA +import pdd.llm_invoke as _llm_mod @pytest.fixture(autouse=True) @@ -30,11 +30,11 @@ def set_pdd_path(monkeypatch): @pytest.fixture(autouse=True) def reset_callback_data(): - """Reset _LAST_CALLBACK_DATA before each test.""" - _LAST_CALLBACK_DATA["cost"] = 0.0 - _LAST_CALLBACK_DATA["input_tokens"] = 0 - _LAST_CALLBACK_DATA["output_tokens"] = 0 - _LAST_CALLBACK_DATA["finish_reason"] = None + """Reset _llm_mod._LAST_CALLBACK_DATA before each test.""" + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None yield @@ -100,14 +100,14 @@ def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeyp def mock_completion(**kwargs): call_count[0] += 1 if call_count[0] == 1: - _LAST_CALLBACK_DATA["cost"] = first_call_cost - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = first_call_cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return first_resp else: - _LAST_CALLBACK_DATA["cost"] = retry_call_cost - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = retry_call_cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return retry_resp # Patch at the llm_invoke level to control LLM behavior @@ -194,14 +194,14 @@ def mock_completion(**kwargs): call_count[0] += 1 # Only first call can trigger cache-bypass retry; subsequent are direct if idx < len(costs): - _LAST_CALLBACK_DATA["cost"] = costs[idx] - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = costs[idx] + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return responses[idx] # Fallback - _LAST_CALLBACK_DATA["cost"] = 0.05 - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.05 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return responses[-1] with patch("pdd.llm_invoke._load_model_data") as mock_load, \ diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py index 0f222310..140758a7 100644 --- a/tests/test_llm_invoke_retry_cost.py +++ b/tests/test_llm_invoke_retry_cost.py @@ -1,7 +1,7 @@ -"""Tests for issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. +"""Tests for issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. When llm_invoke() retries due to None content, malformed JSON, or invalid Python syntax, -the success callback overwrites _LAST_CALLBACK_DATA["cost"] with only the retry's cost, +the success callback overwrites _llm_mod._LAST_CALLBACK_DATA["cost"] with only the retry's cost, silently losing the original call's cost. 
""" @@ -9,7 +9,7 @@ import os import pandas as pd from unittest.mock import patch, MagicMock -from pdd.llm_invoke import _LAST_CALLBACK_DATA +import pdd.llm_invoke as _llm_mod def _make_mock_model(): @@ -43,7 +43,7 @@ def _make_response(content, prompt_tokens=100, completion_tokens=50): def _make_completion_side_effect(responses_and_costs): - """Create a side_effect function that sets _LAST_CALLBACK_DATA on each call. + """Create a side_effect function that sets _llm_mod._LAST_CALLBACK_DATA on each call. responses_and_costs: list of (response, cost) tuples """ @@ -53,10 +53,10 @@ def side_effect(**kwargs): idx = call_count[0] call_count[0] += 1 resp, cost = responses_and_costs[idx] - # Simulate the callback overwriting _LAST_CALLBACK_DATA - _LAST_CALLBACK_DATA["cost"] = cost - _LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens - _LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens + # Simulate the callback overwriting _llm_mod._LAST_CALLBACK_DATA + _llm_mod._LAST_CALLBACK_DATA["cost"] = cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens return resp return side_effect @@ -64,11 +64,11 @@ def side_effect(**kwargs): @pytest.fixture(autouse=True) def reset_callback_data(): - """Reset _LAST_CALLBACK_DATA before each test.""" - _LAST_CALLBACK_DATA["cost"] = 0.0 - _LAST_CALLBACK_DATA["input_tokens"] = 0 - _LAST_CALLBACK_DATA["output_tokens"] = 0 - _LAST_CALLBACK_DATA["finish_reason"] = None + """Reset _llm_mod._LAST_CALLBACK_DATA before each test.""" + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None yield From 9b203e57f8c766d4e57b2e7cdb9526c0919c485e Mon Sep 17 00:00:00 2001 From: Serhan Date: Fri, 13 Feb 2026 18:33:06 -0500 Subject: [PATCH 4/7] fix: add run_test_command for JS/TS/TSX in language_format.csv The CSV had empty run_test_command for JavaScript, TypeScript, and TypeScriptReact, causing 5 CI test failures in test_agentic_langtest, test_get_language, and test_get_test_command. 
Co-Authored-By: Claude Opus 4.6 --- pdd/data/language_format.csv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdd/data/language_format.csv b/pdd/data/language_format.csv index 4555496a..e50dff8d 100644 --- a/pdd/data/language_format.csv +++ b/pdd/data/language_format.csv @@ -2,7 +2,7 @@ language,comment,extension,run_command,run_test_command Python,#,.py,python {file},python -m pytest {file} -v Java,//,.java,java {file}, C++,//,.cpp,, -JavaScript,//,.js,node {file}, +JavaScript,//,.js,node {file},node {file} HTML,"",.html,, CSS,"/**/",.css,, Ruby,#,.rb,ruby {file}, @@ -11,8 +11,8 @@ Swift,//,.swift,swift {file},swift test Go,//,.go,go run {file},go test -v {file} Rust,//,.rs,,cargo test Kotlin,//,.kt,kotlin {file}, -TypeScript,//,.ts,npx tsx {file}, -TypeScriptReact,//,.tsx,npx tsx {file}, +TypeScript,//,.ts,npx tsx {file},npx tsx {file} +TypeScriptReact,//,.tsx,npx tsx {file},npx tsx {file} C#,//,.cs,dotnet run {file},dotnet test SQL,--,.sql,, Shell,#,.sh,sh {file}, From e3def9b547d09850bdada0310249245ba314f820 Mon Sep 17 00:00:00 2001 From: Serhan Asad <95069916+Serhan-Asad@users.noreply.github.com> Date: Fri, 13 Feb 2026 18:33:57 -0500 Subject: [PATCH 5/7] Delete pdd/data/language_format.csv --- pdd/data/language_format.csv | 71 ------------------------------------ 1 file changed, 71 deletions(-) delete mode 100644 pdd/data/language_format.csv diff --git a/pdd/data/language_format.csv b/pdd/data/language_format.csv deleted file mode 100644 index e50dff8d..00000000 --- a/pdd/data/language_format.csv +++ /dev/null @@ -1,71 +0,0 @@ -language,comment,extension,run_command,run_test_command -Python,#,.py,python {file},python -m pytest {file} -v -Java,//,.java,java {file}, -C++,//,.cpp,, -JavaScript,//,.js,node {file},node {file} -HTML,"",.html,, -CSS,"/**/",.css,, -Ruby,#,.rb,ruby {file}, -PHP,//,.php,php {file}, -Swift,//,.swift,swift {file},swift test -Go,//,.go,go run {file},go test -v {file} -Rust,//,.rs,,cargo test -Kotlin,//,.kt,kotlin {file}, -TypeScript,//,.ts,npx tsx {file},npx tsx {file} -TypeScriptReact,//,.tsx,npx tsx {file},npx tsx {file} -C#,//,.cs,dotnet run {file},dotnet test -SQL,--,.sql,, -Shell,#,.sh,sh {file}, -Bash,#,.sh,bash {file}, -Fish,#,.fish,fish {file}, -Zsh,#,.zsh,zsh {file}, -PowerShell,#,.ps1,pwsh {file}, -Perl,#,.pl,perl {file}, -R,#,.R,Rscript {file}, -MATLAB,%,.m,, -Lua,--,.lua,lua {file}, -Haskell,--,.hs,runhaskell {file}, -Scala,//,.scala,scala {file}, -Groovy,//,.groovy,groovy {file}, -Dart,//,.dart,dart run {file}, -F#,//,.fs,dotnet fsi {file}, -YAML,#,.yml,, -YAML,#,.yaml,, -JSON,del,.json,, -JSONL,del,.jsonl,, -XML,"",.xml,, -Makefile,#,,, -CSV,del,.csv,, -Markdown,del,.md,, -LaTeX,%,.tex,, -Assembly,;,.asm,, -Fortran,!,.f90,gfortran -o {file}.out {file} && ./{file}.out, -COBOL,*>,.cob,, -Lisp,;,.lisp,sbcl --script {file}, -Prolog,%,.pl,swipl -s {file}, -Erlang,%,.erl,escript {file}, -Clojure,;,.clj,clojure {file}, -Julia,#,.jl,julia {file}, -Elixir,#,.ex,elixir {file}, -Pascal,//,.pas,fpc {file} && ./{file}, -VBScript,"'",.vbs,cscript {file}, -CoffeeScript,#,.coffee,coffee {file}, -Objective-C,//,.m,, -Scheme,;,.scm,scheme --script {file}, -Tcl,#,.tcl,tclsh {file}, -D,//,.d,rdmd {file}, -Ada,--,.ada,, -Nim,#,.nim,nim r {file}, -OCaml,"(**)",.ml,ocaml {file}, -LLM,del,.prompt,, -prompt,del,.prompt,, -TOML,#,.toml,, -Log,del,.log,, -reStructuredText,del,.rst,, -Text,del,.txt,, -INI,;,.ini,, -Verilog,//,.v,, -Systemverilog,//,.sv,, -Prisma,///,.prisma,, -Lean,--,.lean,lean {file}, -Agda,--,.agda,, From 
b972341630e8ed76bb16114c95e977dc6251f667 Mon Sep 17 00:00:00 2001 From: Serhan Date: Sat, 14 Feb 2026 10:39:53 -0500 Subject: [PATCH 6/7] Address Copilot review: fix test assertions, cleanup imports, simplify condition - Add token accumulation assertions to test that claims to verify tokens - Remove unused imports (csv, CliRunner) from e2e test - Fix misleading docstring about CliRunner usage - Trim 3-call setup to 2-call to match actual retry behavior - Simplify redundant endswith('}') check in _is_malformed_json_response Co-Authored-By: Claude Opus 4.6 --- pdd/llm_invoke.py | 2 +- tests/test_e2e_issue_509_retry_cost.py | 18 +++++++----------- tests/test_llm_invoke_retry_cost.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index df97a643..723a42c4 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -868,7 +868,7 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool: # Also check for excessive actual trailing newlines (not just escaped \\n) # This catches cases where raw newlines cause truncation actual_newline_count = len(content) - len(content.rstrip('\n')) - if actual_newline_count >= threshold and not stripped.endswith('}'): + if actual_newline_count >= threshold: return True # Also check for response that looks truncated mid-string diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py index c8f43e3e..3b1a9d91 100644 --- a/tests/test_e2e_issue_509_retry_cost.py +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -1,21 +1,19 @@ """ E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. -This test exercises the generate command through Click's CliRunner, simulating a scenario -where llm_invoke retries due to None content. The test verifies that the cost returned -by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry -call costs, not just the retry cost. +This test exercises the generate command path by invoking llm_invoke directly, simulating +a scenario where llm_invoke retries due to None content. The test verifies that the cost +accumulated in _llm_mod._LAST_CALLBACK_DATA includes BOTH the original and retry call +costs, not just the retry cost. The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, so the cost from the first LLM call is silently lost. 
""" -import csv import os import pytest from pathlib import Path from unittest.mock import patch, MagicMock -from click.testing import CliRunner import pdd.llm_invoke as _llm_mod @@ -176,13 +174,11 @@ def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch): import pandas as pd - # Two retries: costs should accumulate to $0.15 - costs = [0.05, 0.05, 0.05] - expected_total = sum(costs) + # One retry (two calls total): costs should accumulate to $0.10 + costs = [0.05, 0.05] - # First two calls return None, third succeeds + # First call returns None, second succeeds responses = [ - _make_response(content=None), _make_response(content=None), _make_response(content='print("hello")'), ] diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py index 140758a7..975d2a83 100644 --- a/tests/test_llm_invoke_retry_cost.py +++ b/tests/test_llm_invoke_retry_cost.py @@ -234,3 +234,13 @@ def test_none_content_retry_accumulates_tokens( assert reported_cost == pytest.approx(0.08), ( f"Expected accumulated cost of 0.08, got {reported_cost}" ) + + # Verify that input/output token counts are also accumulated across retries. + assert _llm_mod._LAST_CALLBACK_DATA["input_tokens"] == 2000, ( + f"Expected accumulated input tokens of 2000, " + f"got {_llm_mod._LAST_CALLBACK_DATA['input_tokens']}" + ) + assert _llm_mod._LAST_CALLBACK_DATA["output_tokens"] == 1000, ( + f"Expected accumulated output tokens of 1000, " + f"got {_llm_mod._LAST_CALLBACK_DATA['output_tokens']}" + ) From 9fc9ce796e784c1cd9bdf7544935b5c7791b38a5 Mon Sep 17 00:00:00 2001 From: Serhan Date: Sun, 15 Feb 2026 13:35:32 -0500 Subject: [PATCH 7/7] fix: restore litellm.cache in finally blocks to prevent cache leak on retry failure All 3 retry paths (None content, malformed JSON, invalid Python) set litellm.cache = None but only restored it on success. If the retry call raised an exception, cache stayed permanently disabled for the process. 
Co-Authored-By: Claude Opus 4.6 --- pdd/llm_invoke.py | 72 +++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index 723a42c4..4bb51ba9 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -2461,21 +2461,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = configured_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - restore original configured cache (restore to original state, even if None) - litellm.cache = configured_cache # Extract result from retry retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None: @@ -2510,21 +2512,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = original_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - litellm.cache = original_cache # Extract result from retry retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None and not _is_malformed_json_response(retry_raw_result): @@ -2757,21 +2761,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = 
_LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = original_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - litellm.cache = original_cache # Extract and re-parse the retry result retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None: