Merged
pdd/llm_invoke.py (102 changes: 69 additions & 33 deletions)
@@ -865,6 +865,12 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool:
if trailing_newline_count >= threshold:
return True

# Also check for excessive actual trailing newlines (not just escaped \\n)
# This catches cases where raw newlines cause truncation
actual_newline_count = len(content) - len(content.rstrip('\n'))
if actual_newline_count >= threshold:
return True

# Also check for response that looks truncated mid-string
# (ends with characters that suggest we're inside a JSON string value)
if not stripped.endswith('}') and not stripped.endswith(']') and not stripped.endswith('"'):
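A minimal standalone sketch of the raw-newline check added above; the helper name and the sample payload are illustrative, not part of the diff:

def _has_excessive_trailing_newlines(content: str, threshold: int = 100) -> bool:
    # Count raw trailing '\n' characters, as opposed to the escaped "\\n"
    # sequences counted by the existing check.
    actual_newline_count = len(content) - len(content.rstrip('\n'))
    return actual_newline_count >= threshold

# A response padded with hundreds of raw newlines is flagged as malformed:
assert _has_excessive_trailing_newlines('{"code": "print(1)' + '\n' * 150)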
@@ -2451,17 +2457,27 @@ def calc_strength(candidate):
retry_messages = _format_messages(modified_prompt, input_json, use_batch_mode)
# Disable cache for retry
litellm.cache = None
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
# Re-enable cache - restore original configured cache (restore to original state, even if None)
litellm.cache = configured_cache
# Issue #509: Save accumulated cost/tokens before retry overwrites callback data
_accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
_accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
_accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
try:
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
finally:
# Always restore cache, even if retry raises
litellm.cache = configured_cache
# Issue #509: Accumulate cost/tokens from original call + retry
_LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
_LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
_LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
# Extract result from retry
retry_raw_result = retry_response.choices[0].message.content
if retry_raw_result is not None:
@@ -2492,17 +2508,27 @@ def calc_strength(candidate):
# Disable cache for retry
original_cache = litellm.cache
litellm.cache = None
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
# Re-enable cache
litellm.cache = original_cache
# Issue #509: Save accumulated cost/tokens before retry overwrites callback data
_accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
_accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
_accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
try:
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
finally:
# Always restore cache, even if retry raises
litellm.cache = original_cache
# Issue #509: Accumulate cost/tokens from original call + retry
_LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
_LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
_LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
# Extract result from retry
retry_raw_result = retry_response.choices[0].message.content
if retry_raw_result is not None and not _is_malformed_json_response(retry_raw_result):
@@ -2731,17 +2757,27 @@ def calc_strength(candidate):
# Disable cache for retry
original_cache = litellm.cache
litellm.cache = None
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
# Re-enable cache
litellm.cache = original_cache
# Issue #509: Save accumulated cost/tokens before retry overwrites callback data
_accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
_accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
_accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
try:
retry_response = litellm.completion(
model=model_name_litellm,
messages=retry_messages,
temperature=current_temperature,
response_format=response_format,
timeout=LLM_CALL_TIMEOUT,
**time_kwargs,
**retry_provider_kwargs # Issue #185: Pass Vertex AI credentials
)
finally:
# Always restore cache, even if retry raises
litellm.cache = original_cache
# Issue #509: Accumulate cost/tokens from original call + retry
_LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
_LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
_LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
# Extract and re-parse the retry result
retry_raw_result = retry_response.choices[0].message.content
if retry_raw_result is not None:
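All three retry sites above follow the same save/try-finally/accumulate shape. A condensed sketch of that pattern, using a stand-in callback_data dict and a do_retry() placeholder (both hypothetical; the real code operates on _LAST_CALLBACK_DATA and calls litellm.completion):

# Issue #509: snapshot what the original call already recorded, because the
# retry's success callback will overwrite these fields.
saved_cost = callback_data.get("cost", 0.0)
saved_input = callback_data.get("input_tokens", 0)
saved_output = callback_data.get("output_tokens", 0)
litellm.cache = None  # bypass the cache so the retry actually hits the model
try:
    retry_response = do_retry()
finally:
    litellm.cache = configured_cache  # restore even if the retry raises
    # Fold the original call's numbers back in so no cost or tokens are lost.
    callback_data["cost"] = callback_data.get("cost", 0.0) + saved_cost
    callback_data["input_tokens"] = callback_data.get("input_tokens", 0) + saved_input
    callback_data["output_tokens"] = callback_data.get("output_tokens", 0) + saved_output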
tests/test_e2e_issue_509_retry_cost.py (232 changes: 232 additions & 0 deletions)
@@ -0,0 +1,232 @@
"""
E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA.

This test exercises the generate command path by invoking llm_invoke directly, simulating
a scenario where llm_invoke retries due to None content. The test verifies that the cost
accumulated in _llm_mod._LAST_CALLBACK_DATA includes BOTH the original and retry call
costs, not just the retry cost.

The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry,
so the cost from the first LLM call is silently lost.
"""

import os
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock

import pdd.llm_invoke as _llm_mod


@pytest.fixture(autouse=True)
def set_pdd_path(monkeypatch):
"""Set PDD_PATH so language detection can find language_format.csv."""
import pdd
pdd_package_dir = Path(pdd.__file__).parent
monkeypatch.setenv("PDD_PATH", str(pdd_package_dir))


@pytest.fixture(autouse=True)
def reset_callback_data():
"""Reset _llm_mod._LAST_CALLBACK_DATA before each test."""
_llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0
_llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0
_llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0
_llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None
yield


def _make_response(content, prompt_tokens=100, completion_tokens=50):
resp = MagicMock()
choice = MagicMock()
choice.message.content = content
choice.finish_reason = "stop"
resp.choices = [choice]
usage = MagicMock()
usage.prompt_tokens = prompt_tokens
usage.completion_tokens = completion_tokens
resp.usage = usage
return resp


class TestE2ERetryCostAccumulation:
"""E2E test: llm_invoke cost accumulation through the generate command path."""

@patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeypatch):
"""
Exercise the full generate -> llm_invoke -> retry path and verify
that the cost returned includes both the original and retry call costs.

This simulates a user running `pdd generate` where the first LLM call
returns None content, triggering a cache-bypass retry. The total cost
should be $0.05 + $0.03 = $0.08, but the bug causes it to report $0.03.
"""
monkeypatch.chdir(tmp_path)

# Create a prompt file
prompt_file = tmp_path / "hello_python.prompt"
prompt_file.write_text("Write a Python function that prints hello world.\n{input}")

# Set up mock model data
mock_model = {
"provider": "Anthropic",
"model": "claude-test-model",
"input": 3.0,
"output": 15.0,
"coding_arena_elo": 1500,
"structured_output": False,
"base_url": "",
"api_key": "OPENAI_API_KEY",
"max_tokens": "",
"max_completion_tokens": "",
"reasoning_type": "none",
"max_reasoning_tokens": 0,
}

import pandas as pd

first_call_cost = 0.05
retry_call_cost = 0.03
expected_total = first_call_cost + retry_call_cost # $0.08

first_resp = _make_response(content=None) # Triggers retry
retry_resp = _make_response(content='def hello():\n print("Hello world")\n')

call_count = [0]

def mock_completion(**kwargs):
call_count[0] += 1
if call_count[0] == 1:
_llm_mod._LAST_CALLBACK_DATA["cost"] = first_call_cost
_llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
_llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
return first_resp
else:
_llm_mod._LAST_CALLBACK_DATA["cost"] = retry_call_cost
_llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
_llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
return retry_resp

# Patch at the llm_invoke level to control LLM behavior
with patch("pdd.llm_invoke._load_model_data") as mock_load, \
patch("pdd.llm_invoke._select_model_candidates") as mock_select, \
patch("pdd.llm_invoke._ensure_api_key", return_value=True), \
patch("pdd.llm_invoke.litellm") as mock_litellm:

mock_load.return_value = pd.DataFrame([mock_model])
mock_select.return_value = [mock_model]
mock_litellm.completion = mock_completion
mock_litellm.cache = MagicMock() # Non-None triggers cache-bypass retry path
mock_litellm.drop_params = True

# Call llm_invoke directly as the generate command would
from pdd.llm_invoke import llm_invoke
result = llm_invoke(
prompt="Write a Python function that prints hello world.\n{input}",
input_json={"input": ""},
strength=0.5,
temperature=0.0,
use_cloud=False,
)

# Verify 2 LLM calls were made (original + retry)
assert call_count[0] == 2, (
f"Expected 2 LLM calls (original + retry), got {call_count[0]}"
)

# THE BUG CHECK: cost should be accumulated ($0.05 + $0.03 = $0.08)
reported_cost = result["cost"]
assert reported_cost == pytest.approx(expected_total), (
f"ISSUE #509 BUG DETECTED: Retry cost overwrites instead of accumulating!\n\n"
f" First call cost: ${first_call_cost:.2f}\n"
f" Retry call cost: ${retry_call_cost:.2f}\n"
f" Expected total: ${expected_total:.2f}\n"
f" Reported total: ${reported_cost:.2f}\n"
f" Lost cost: ${expected_total - reported_cost:.2f}\n\n"
f"The --output-cost CSV would under-report by ${expected_total - reported_cost:.2f} per retry."
)

@patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch):
"""
E2E test: When budget is tight, under-reported retry costs could allow overspend.

Simulates a scenario where a user sets --budget and multiple retries occur.
The budget check should see the accumulated cost, not just the last retry's cost.
"""
monkeypatch.chdir(tmp_path)

mock_model = {
"provider": "Anthropic",
"model": "claude-test-model",
"input": 3.0,
"output": 15.0,
"coding_arena_elo": 1500,
"structured_output": False,
"base_url": "",
"api_key": "OPENAI_API_KEY",
"max_tokens": "",
"max_completion_tokens": "",
"reasoning_type": "none",
"max_reasoning_tokens": 0,
}

import pandas as pd

# One retry (two calls total): costs should accumulate to $0.10
costs = [0.05, 0.05]

# First call returns None, second succeeds
responses = [
_make_response(content=None),
_make_response(content='print("hello")'),
]

call_count = [0]

def mock_completion(**kwargs):
idx = call_count[0]
call_count[0] += 1
# Only first call can trigger cache-bypass retry; subsequent are direct
if idx < len(costs):
_llm_mod._LAST_CALLBACK_DATA["cost"] = costs[idx]
_llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
_llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
return responses[idx]
# Fallback
_llm_mod._LAST_CALLBACK_DATA["cost"] = 0.05
_llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
_llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
return responses[-1]

with patch("pdd.llm_invoke._load_model_data") as mock_load, \
patch("pdd.llm_invoke._select_model_candidates") as mock_select, \
patch("pdd.llm_invoke._ensure_api_key", return_value=True), \
patch("pdd.llm_invoke.litellm") as mock_litellm:

mock_load.return_value = pd.DataFrame([mock_model])
mock_select.return_value = [mock_model]
mock_litellm.completion = mock_completion
mock_litellm.cache = MagicMock()
mock_litellm.drop_params = True

from pdd.llm_invoke import llm_invoke
result = llm_invoke(
prompt="Test {input}",
input_json={"input": "hello"},
strength=0.5,
temperature=0.0,
use_cloud=False,
)

reported_cost = result["cost"]

# The None-content retry path does one retry, so 2 calls made.
# Expected accumulated cost: $0.05 + $0.05 = $0.10
# Bug reports only the retry cost: $0.05
assert reported_cost == pytest.approx(costs[0] + costs[1]), (
f"ISSUE #509 BUG: Budget enforcement sees only ${reported_cost:.2f}, "
f"but actual spend was ${costs[0] + costs[1]:.2f} (2 calls). "
f"This allows overspend when --budget is set."
)
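To exercise just these tests locally, assuming the repository's usual pytest setup, something like the following should work:

python -m pytest tests/test_e2e_issue_509_retry_cost.py -v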