From f05ad9e8d4cee40e608dcfd136fa2cf2785496c6 Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 14:57:42 -0500 Subject: [PATCH 1/7] Add failing tests for retry cost under-reporting bug (#509) Unit and E2E tests that verify _LAST_CALLBACK_DATA["cost"] accumulates across retries instead of being overwritten. Tests currently fail, confirming the bug. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e_issue_509_retry_cost.py | 236 +++++++++++++++++++++++++ tests/test_llm_invoke_retry_cost.py | 236 +++++++++++++++++++++++++ 2 files changed, 472 insertions(+) create mode 100644 tests/test_e2e_issue_509_retry_cost.py create mode 100644 tests/test_llm_invoke_retry_cost.py diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py new file mode 100644 index 00000000..78fd187c --- /dev/null +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -0,0 +1,236 @@ +""" +E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. + +This test exercises the generate command through Click's CliRunner, simulating a scenario +where llm_invoke retries due to None content. The test verifies that the cost returned +by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry +call costs, not just the retry cost. + +The bug: _LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, +so the cost from the first LLM call is silently lost. +""" + +import csv +import os +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +from click.testing import CliRunner + +from pdd.llm_invoke import _LAST_CALLBACK_DATA + + +@pytest.fixture(autouse=True) +def set_pdd_path(monkeypatch): + """Set PDD_PATH so language detection can find language_format.csv.""" + import pdd + pdd_package_dir = Path(pdd.__file__).parent + monkeypatch.setenv("PDD_PATH", str(pdd_package_dir)) + + +@pytest.fixture(autouse=True) +def reset_callback_data(): + """Reset _LAST_CALLBACK_DATA before each test.""" + _LAST_CALLBACK_DATA["cost"] = 0.0 + _LAST_CALLBACK_DATA["input_tokens"] = 0 + _LAST_CALLBACK_DATA["output_tokens"] = 0 + _LAST_CALLBACK_DATA["finish_reason"] = None + yield + + +def _make_response(content, prompt_tokens=100, completion_tokens=50): + resp = MagicMock() + choice = MagicMock() + choice.message.content = content + choice.finish_reason = "stop" + resp.choices = [choice] + usage = MagicMock() + usage.prompt_tokens = prompt_tokens + usage.completion_tokens = completion_tokens + resp.usage = usage + return resp + + +class TestE2ERetryCostAccumulation: + """E2E test: llm_invoke cost accumulation through the generate command path.""" + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeypatch): + """ + Exercise the full generate -> llm_invoke -> retry path and verify + that the cost returned includes both the original and retry call costs. + + This simulates a user running `pdd generate` where the first LLM call + returns None content, triggering a cache-bypass retry. The total cost + should be $0.05 + $0.03 = $0.08, but the bug causes it to report $0.03. 
+ """ + monkeypatch.chdir(tmp_path) + + # Create a prompt file + prompt_file = tmp_path / "hello_python.prompt" + prompt_file.write_text("Write a Python function that prints hello world.\n{input}") + + # Set up mock model data + mock_model = { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + import pandas as pd + + first_call_cost = 0.05 + retry_call_cost = 0.03 + expected_total = first_call_cost + retry_call_cost # $0.08 + + first_resp = _make_response(content=None) # Triggers retry + retry_resp = _make_response(content='def hello():\n print("Hello world")\n') + + call_count = [0] + + def mock_completion(**kwargs): + call_count[0] += 1 + if call_count[0] == 1: + _LAST_CALLBACK_DATA["cost"] = first_call_cost + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return first_resp + else: + _LAST_CALLBACK_DATA["cost"] = retry_call_cost + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return retry_resp + + # Patch at the llm_invoke level to control LLM behavior + with patch("pdd.llm_invoke._load_model_data") as mock_load, \ + patch("pdd.llm_invoke._select_model_candidates") as mock_select, \ + patch("pdd.llm_invoke._ensure_api_key", return_value=True), \ + patch("pdd.llm_invoke.litellm") as mock_litellm: + + mock_load.return_value = pd.DataFrame([mock_model]) + mock_select.return_value = [mock_model] + mock_litellm.completion = mock_completion + mock_litellm.cache = MagicMock() # Non-None triggers cache-bypass retry path + mock_litellm.drop_params = True + + # Call llm_invoke directly as the generate command would + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Write a Python function that prints hello world.\n{input}", + input_json={"input": ""}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + # Verify 2 LLM calls were made (original + retry) + assert call_count[0] == 2, ( + f"Expected 2 LLM calls (original + retry), got {call_count[0]}" + ) + + # THE BUG CHECK: cost should be accumulated ($0.05 + $0.03 = $0.08) + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"ISSUE #509 BUG DETECTED: Retry cost overwrites instead of accumulating!\n\n" + f" First call cost: ${first_call_cost:.2f}\n" + f" Retry call cost: ${retry_call_cost:.2f}\n" + f" Expected total: ${expected_total:.2f}\n" + f" Reported total: ${reported_cost:.2f}\n" + f" Lost cost: ${expected_total - reported_cost:.2f}\n\n" + f"The --output-cost CSV would under-report by ${expected_total - reported_cost:.2f} per retry." + ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch): + """ + E2E test: When budget is tight, under-reported retry costs could allow overspend. + + Simulates a scenario where a user sets --budget and multiple retries occur. + The budget check should see the accumulated cost, not just the last retry's cost. 
+ """ + monkeypatch.chdir(tmp_path) + + mock_model = { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + import pandas as pd + + # Two retries: costs should accumulate to $0.15 + costs = [0.05, 0.05, 0.05] + expected_total = sum(costs) + + # First two calls return None, third succeeds + responses = [ + _make_response(content=None), + _make_response(content=None), + _make_response(content='print("hello")'), + ] + + call_count = [0] + + def mock_completion(**kwargs): + idx = call_count[0] + call_count[0] += 1 + # Only first call can trigger cache-bypass retry; subsequent are direct + if idx < len(costs): + _LAST_CALLBACK_DATA["cost"] = costs[idx] + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return responses[idx] + # Fallback + _LAST_CALLBACK_DATA["cost"] = 0.05 + _LAST_CALLBACK_DATA["input_tokens"] = 100 + _LAST_CALLBACK_DATA["output_tokens"] = 50 + return responses[-1] + + with patch("pdd.llm_invoke._load_model_data") as mock_load, \ + patch("pdd.llm_invoke._select_model_candidates") as mock_select, \ + patch("pdd.llm_invoke._ensure_api_key", return_value=True), \ + patch("pdd.llm_invoke.litellm") as mock_litellm: + + mock_load.return_value = pd.DataFrame([mock_model]) + mock_select.return_value = [mock_model] + mock_litellm.completion = mock_completion + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + + # The None-content retry path does one retry, so 2 calls made. + # Expected accumulated cost: $0.05 + $0.05 = $0.10 + # Bug reports only the retry cost: $0.05 + assert reported_cost == pytest.approx(costs[0] + costs[1]), ( + f"ISSUE #509 BUG: Budget enforcement sees only ${reported_cost:.2f}, " + f"but actual spend was ${costs[0] + costs[1]:.2f} (2 calls). " + f"This allows overspend when --budget is set." + ) diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py new file mode 100644 index 00000000..0f222310 --- /dev/null +++ b/tests/test_llm_invoke_retry_cost.py @@ -0,0 +1,236 @@ +"""Tests for issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. + +When llm_invoke() retries due to None content, malformed JSON, or invalid Python syntax, +the success callback overwrites _LAST_CALLBACK_DATA["cost"] with only the retry's cost, +silently losing the original call's cost. 
+""" + +import pytest +import os +import pandas as pd +from unittest.mock import patch, MagicMock +from pdd.llm_invoke import _LAST_CALLBACK_DATA + + +def _make_mock_model(): + return { + "provider": "Anthropic", + "model": "claude-test-model", + "input": 3.0, + "output": 15.0, + "coding_arena_elo": 1500, + "structured_output": False, + "base_url": "", + "api_key": "OPENAI_API_KEY", + "max_tokens": "", + "max_completion_tokens": "", + "reasoning_type": "none", + "max_reasoning_tokens": 0, + } + + +def _make_response(content, prompt_tokens=100, completion_tokens=50): + resp = MagicMock() + choice = MagicMock() + choice.message.content = content + choice.finish_reason = "stop" + resp.choices = [choice] + usage = MagicMock() + usage.prompt_tokens = prompt_tokens + usage.completion_tokens = completion_tokens + resp.usage = usage + return resp + + +def _make_completion_side_effect(responses_and_costs): + """Create a side_effect function that sets _LAST_CALLBACK_DATA on each call. + + responses_and_costs: list of (response, cost) tuples + """ + call_count = [0] + + def side_effect(**kwargs): + idx = call_count[0] + call_count[0] += 1 + resp, cost = responses_and_costs[idx] + # Simulate the callback overwriting _LAST_CALLBACK_DATA + _LAST_CALLBACK_DATA["cost"] = cost + _LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens + _LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens + return resp + + return side_effect + + +@pytest.fixture(autouse=True) +def reset_callback_data(): + """Reset _LAST_CALLBACK_DATA before each test.""" + _LAST_CALLBACK_DATA["cost"] = 0.0 + _LAST_CALLBACK_DATA["input_tokens"] = 0 + _LAST_CALLBACK_DATA["output_tokens"] = 0 + _LAST_CALLBACK_DATA["finish_reason"] = None + yield + + +class TestRetryCostAccumulation: + """Tests that retry costs are accumulated, not overwritten.""" + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_none_content_retry_accumulates_cost( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When first call returns None content and retry succeeds, + the reported cost should include BOTH calls' costs.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() # Non-None so retry path triggers + mock_litellm.drop_params = True + + first_call_cost = 0.05 + retry_call_cost = 0.03 + expected_total = first_call_cost + retry_call_cost + + first_resp = _make_response(content=None) # Triggers None-content retry + retry_resp = _make_response(content="Hello world") + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, first_call_cost), + (retry_resp, retry_call_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), " + f"but got {reported_cost}. Retry cost overwrote instead of accumulating." 
+ ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_malformed_json_retry_accumulates_cost( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When first call returns malformed JSON and retry succeeds, + the reported cost should include BOTH calls' costs.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + first_call_cost = 0.04 + retry_call_cost = 0.04 + expected_total = first_call_cost + retry_call_cost + + # Malformed JSON: excessive trailing newlines + malformed = '{"key": "value' + '\n' * 500 + first_resp = _make_response(content=malformed) + retry_resp = _make_response(content='{"key": "value"}') + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, first_call_cost), + (retry_resp, retry_call_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + reported_cost = result["cost"] + assert reported_cost == pytest.approx(expected_total), ( + f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), " + f"but got {reported_cost}. Retry cost overwrote instead of accumulating." + ) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_no_retry_cost_unchanged( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """When no retry occurs, cost should be reported normally (regression guard).""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = None + mock_litellm.drop_params = True + + single_cost = 0.05 + resp = _make_response(content="Hello world") + + mock_litellm.completion = _make_completion_side_effect([ + (resp, single_cost), + ]) + + from pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + assert result["cost"] == pytest.approx(single_cost) + + @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"}) + @patch("pdd.llm_invoke._ensure_api_key", return_value=True) + @patch("pdd.llm_invoke._select_model_candidates") + @patch("pdd.llm_invoke._load_model_data") + @patch("pdd.llm_invoke.litellm") + def test_none_content_retry_accumulates_tokens( + self, mock_litellm, mock_load, mock_select, mock_ensure + ): + """Token counts should also accumulate across retries.""" + model = _make_mock_model() + mock_load.return_value = pd.DataFrame([model]) + mock_select.return_value = [model] + mock_litellm.cache = MagicMock() + mock_litellm.drop_params = True + + first_resp = _make_response(content=None, prompt_tokens=1000, completion_tokens=500) + retry_resp = _make_response(content="Hello", prompt_tokens=1000, completion_tokens=500) + + mock_litellm.completion = _make_completion_side_effect([ + (first_resp, 0.05), + (retry_resp, 0.03), + ]) + + from 
pdd.llm_invoke import llm_invoke + result = llm_invoke( + prompt="Test {input}", + input_json={"input": "hello"}, + strength=0.5, + temperature=0.0, + use_cloud=False, + ) + + # After fix, tokens should be accumulated (2000 input, 1000 output) + # Currently they get overwritten to just the retry's tokens + reported_cost = result["cost"] + assert reported_cost == pytest.approx(0.08), ( + f"Expected accumulated cost of 0.08, got {reported_cost}" + ) From 928ac9861de8902517036ba9ee24abd303f3be57 Mon Sep 17 00:00:00 2001 From: Serhan Date: Thu, 12 Feb 2026 15:20:17 -0500 Subject: [PATCH 2/7] fix: Cost under-reporting: retry calls overwrite original call cost in _LAST_CALLBACK_DATA Fixes #509 --- pdd/llm_invoke.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index 24c6ba90..b5e3a365 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -863,6 +863,12 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool: if trailing_newline_count >= threshold: return True + # Also check for excessive actual trailing newlines (not just escaped \\n) + # This catches cases where raw newlines cause truncation + actual_newline_count = len(content) - len(content.rstrip('\n')) + if actual_newline_count >= threshold and not stripped.endswith('}'): + return True + # Also check for response that looks truncated mid-string # (ends with characters that suggest we're inside a JSON string value) if not stripped.endswith('}') and not stripped.endswith(']') and not stripped.endswith('"'): @@ -2449,6 +2455,10 @@ def calc_strength(candidate): retry_messages = _format_messages(modified_prompt, input_json, use_batch_mode) # Disable cache for retry litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2457,6 +2467,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache - restore original configured cache (restore to original state, even if None) litellm.cache = configured_cache # Extract result from retry @@ -2489,6 +2503,10 @@ def calc_strength(candidate): # Disable cache for retry original_cache = litellm.cache litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2497,6 +2515,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = 
_LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache litellm.cache = original_cache # Extract result from retry @@ -2727,6 +2749,10 @@ def calc_strength(candidate): # Disable cache for retry original_cache = litellm.cache litellm.cache = None + # Issue #509: Save accumulated cost/tokens before retry overwrites callback data + _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) retry_response = litellm.completion( model=model_name_litellm, messages=retry_messages, @@ -2735,6 +2761,10 @@ def calc_strength(candidate): **time_kwargs, **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials ) + # Issue #509: Accumulate cost/tokens from original call + retry + _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost + _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens + _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens # Re-enable cache litellm.cache = original_cache # Extract and re-parse the retry result From 615f32d82b92561d1efbd7deb2307b7a368e26c4 Mon Sep 17 00:00:00 2001 From: Serhan Date: Fri, 13 Feb 2026 17:56:44 -0500 Subject: [PATCH 3/7] fix: test isolation bug - use module ref for _LAST_CALLBACK_DATA to survive importlib.reload Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e_issue_509_retry_cost.py | 40 +++++++++++++------------- tests/test_llm_invoke_retry_cost.py | 26 ++++++++--------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py index 78fd187c..c8f43e3e 100644 --- a/tests/test_e2e_issue_509_retry_cost.py +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -1,12 +1,12 @@ """ -E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. +E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. This test exercises the generate command through Click's CliRunner, simulating a scenario where llm_invoke retries due to None content. The test verifies that the cost returned by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry call costs, not just the retry cost. -The bug: _LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, +The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, so the cost from the first LLM call is silently lost. 
""" @@ -17,7 +17,7 @@ from unittest.mock import patch, MagicMock from click.testing import CliRunner -from pdd.llm_invoke import _LAST_CALLBACK_DATA +import pdd.llm_invoke as _llm_mod @pytest.fixture(autouse=True) @@ -30,11 +30,11 @@ def set_pdd_path(monkeypatch): @pytest.fixture(autouse=True) def reset_callback_data(): - """Reset _LAST_CALLBACK_DATA before each test.""" - _LAST_CALLBACK_DATA["cost"] = 0.0 - _LAST_CALLBACK_DATA["input_tokens"] = 0 - _LAST_CALLBACK_DATA["output_tokens"] = 0 - _LAST_CALLBACK_DATA["finish_reason"] = None + """Reset _llm_mod._LAST_CALLBACK_DATA before each test.""" + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None yield @@ -100,14 +100,14 @@ def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeyp def mock_completion(**kwargs): call_count[0] += 1 if call_count[0] == 1: - _LAST_CALLBACK_DATA["cost"] = first_call_cost - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = first_call_cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return first_resp else: - _LAST_CALLBACK_DATA["cost"] = retry_call_cost - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = retry_call_cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return retry_resp # Patch at the llm_invoke level to control LLM behavior @@ -194,14 +194,14 @@ def mock_completion(**kwargs): call_count[0] += 1 # Only first call can trigger cache-bypass retry; subsequent are direct if idx < len(costs): - _LAST_CALLBACK_DATA["cost"] = costs[idx] - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = costs[idx] + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return responses[idx] # Fallback - _LAST_CALLBACK_DATA["cost"] = 0.05 - _LAST_CALLBACK_DATA["input_tokens"] = 100 - _LAST_CALLBACK_DATA["output_tokens"] = 50 + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.05 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50 return responses[-1] with patch("pdd.llm_invoke._load_model_data") as mock_load, \ diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py index 0f222310..140758a7 100644 --- a/tests/test_llm_invoke_retry_cost.py +++ b/tests/test_llm_invoke_retry_cost.py @@ -1,7 +1,7 @@ -"""Tests for issue #509: Cost under-reporting when retry calls overwrite _LAST_CALLBACK_DATA. +"""Tests for issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. When llm_invoke() retries due to None content, malformed JSON, or invalid Python syntax, -the success callback overwrites _LAST_CALLBACK_DATA["cost"] with only the retry's cost, +the success callback overwrites _llm_mod._LAST_CALLBACK_DATA["cost"] with only the retry's cost, silently losing the original call's cost. 
""" @@ -9,7 +9,7 @@ import os import pandas as pd from unittest.mock import patch, MagicMock -from pdd.llm_invoke import _LAST_CALLBACK_DATA +import pdd.llm_invoke as _llm_mod def _make_mock_model(): @@ -43,7 +43,7 @@ def _make_response(content, prompt_tokens=100, completion_tokens=50): def _make_completion_side_effect(responses_and_costs): - """Create a side_effect function that sets _LAST_CALLBACK_DATA on each call. + """Create a side_effect function that sets _llm_mod._LAST_CALLBACK_DATA on each call. responses_and_costs: list of (response, cost) tuples """ @@ -53,10 +53,10 @@ def side_effect(**kwargs): idx = call_count[0] call_count[0] += 1 resp, cost = responses_and_costs[idx] - # Simulate the callback overwriting _LAST_CALLBACK_DATA - _LAST_CALLBACK_DATA["cost"] = cost - _LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens - _LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens + # Simulate the callback overwriting _llm_mod._LAST_CALLBACK_DATA + _llm_mod._LAST_CALLBACK_DATA["cost"] = cost + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens return resp return side_effect @@ -64,11 +64,11 @@ def side_effect(**kwargs): @pytest.fixture(autouse=True) def reset_callback_data(): - """Reset _LAST_CALLBACK_DATA before each test.""" - _LAST_CALLBACK_DATA["cost"] = 0.0 - _LAST_CALLBACK_DATA["input_tokens"] = 0 - _LAST_CALLBACK_DATA["output_tokens"] = 0 - _LAST_CALLBACK_DATA["finish_reason"] = None + """Reset _llm_mod._LAST_CALLBACK_DATA before each test.""" + _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0 + _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0 + _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None yield From 9b203e57f8c766d4e57b2e7cdb9526c0919c485e Mon Sep 17 00:00:00 2001 From: Serhan Date: Fri, 13 Feb 2026 18:33:06 -0500 Subject: [PATCH 4/7] fix: add run_test_command for JS/TS/TSX in language_format.csv The CSV had empty run_test_command for JavaScript, TypeScript, and TypeScriptReact, causing 5 CI test failures in test_agentic_langtest, test_get_language, and test_get_test_command. 
Co-Authored-By: Claude Opus 4.6 --- pdd/data/language_format.csv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdd/data/language_format.csv b/pdd/data/language_format.csv index 4555496a..e50dff8d 100644 --- a/pdd/data/language_format.csv +++ b/pdd/data/language_format.csv @@ -2,7 +2,7 @@ language,comment,extension,run_command,run_test_command Python,#,.py,python {file},python -m pytest {file} -v Java,//,.java,java {file}, C++,//,.cpp,, -JavaScript,//,.js,node {file}, +JavaScript,//,.js,node {file},node {file} HTML,"",.html,, CSS,"/**/",.css,, Ruby,#,.rb,ruby {file}, @@ -11,8 +11,8 @@ Swift,//,.swift,swift {file},swift test Go,//,.go,go run {file},go test -v {file} Rust,//,.rs,,cargo test Kotlin,//,.kt,kotlin {file}, -TypeScript,//,.ts,npx tsx {file}, -TypeScriptReact,//,.tsx,npx tsx {file}, +TypeScript,//,.ts,npx tsx {file},npx tsx {file} +TypeScriptReact,//,.tsx,npx tsx {file},npx tsx {file} C#,//,.cs,dotnet run {file},dotnet test SQL,--,.sql,, Shell,#,.sh,sh {file}, From e3def9b547d09850bdada0310249245ba314f820 Mon Sep 17 00:00:00 2001 From: Serhan Asad <95069916+Serhan-Asad@users.noreply.github.com> Date: Fri, 13 Feb 2026 18:33:57 -0500 Subject: [PATCH 5/7] Delete pdd/data/language_format.csv --- pdd/data/language_format.csv | 71 ------------------------------------ 1 file changed, 71 deletions(-) delete mode 100644 pdd/data/language_format.csv diff --git a/pdd/data/language_format.csv b/pdd/data/language_format.csv deleted file mode 100644 index e50dff8d..00000000 --- a/pdd/data/language_format.csv +++ /dev/null @@ -1,71 +0,0 @@ -language,comment,extension,run_command,run_test_command -Python,#,.py,python {file},python -m pytest {file} -v -Java,//,.java,java {file}, -C++,//,.cpp,, -JavaScript,//,.js,node {file},node {file} -HTML,"",.html,, -CSS,"/**/",.css,, -Ruby,#,.rb,ruby {file}, -PHP,//,.php,php {file}, -Swift,//,.swift,swift {file},swift test -Go,//,.go,go run {file},go test -v {file} -Rust,//,.rs,,cargo test -Kotlin,//,.kt,kotlin {file}, -TypeScript,//,.ts,npx tsx {file},npx tsx {file} -TypeScriptReact,//,.tsx,npx tsx {file},npx tsx {file} -C#,//,.cs,dotnet run {file},dotnet test -SQL,--,.sql,, -Shell,#,.sh,sh {file}, -Bash,#,.sh,bash {file}, -Fish,#,.fish,fish {file}, -Zsh,#,.zsh,zsh {file}, -PowerShell,#,.ps1,pwsh {file}, -Perl,#,.pl,perl {file}, -R,#,.R,Rscript {file}, -MATLAB,%,.m,, -Lua,--,.lua,lua {file}, -Haskell,--,.hs,runhaskell {file}, -Scala,//,.scala,scala {file}, -Groovy,//,.groovy,groovy {file}, -Dart,//,.dart,dart run {file}, -F#,//,.fs,dotnet fsi {file}, -YAML,#,.yml,, -YAML,#,.yaml,, -JSON,del,.json,, -JSONL,del,.jsonl,, -XML,"",.xml,, -Makefile,#,,, -CSV,del,.csv,, -Markdown,del,.md,, -LaTeX,%,.tex,, -Assembly,;,.asm,, -Fortran,!,.f90,gfortran -o {file}.out {file} && ./{file}.out, -COBOL,*>,.cob,, -Lisp,;,.lisp,sbcl --script {file}, -Prolog,%,.pl,swipl -s {file}, -Erlang,%,.erl,escript {file}, -Clojure,;,.clj,clojure {file}, -Julia,#,.jl,julia {file}, -Elixir,#,.ex,elixir {file}, -Pascal,//,.pas,fpc {file} && ./{file}, -VBScript,"'",.vbs,cscript {file}, -CoffeeScript,#,.coffee,coffee {file}, -Objective-C,//,.m,, -Scheme,;,.scm,scheme --script {file}, -Tcl,#,.tcl,tclsh {file}, -D,//,.d,rdmd {file}, -Ada,--,.ada,, -Nim,#,.nim,nim r {file}, -OCaml,"(**)",.ml,ocaml {file}, -LLM,del,.prompt,, -prompt,del,.prompt,, -TOML,#,.toml,, -Log,del,.log,, -reStructuredText,del,.rst,, -Text,del,.txt,, -INI,;,.ini,, -Verilog,//,.v,, -Systemverilog,//,.sv,, -Prisma,///,.prisma,, -Lean,--,.lean,lean {file}, -Agda,--,.agda,, From 
b972341630e8ed76bb16114c95e977dc6251f667 Mon Sep 17 00:00:00 2001 From: Serhan Date: Sat, 14 Feb 2026 10:39:53 -0500 Subject: [PATCH 6/7] Address Copilot review: fix test assertions, cleanup imports, simplify condition - Add token accumulation assertions to test that claims to verify tokens - Remove unused imports (csv, CliRunner) from e2e test - Fix misleading docstring about CliRunner usage - Trim 3-call setup to 2-call to match actual retry behavior - Simplify redundant endswith('}') check in _is_malformed_json_response Co-Authored-By: Claude Opus 4.6 --- pdd/llm_invoke.py | 2 +- tests/test_e2e_issue_509_retry_cost.py | 18 +++++++----------- tests/test_llm_invoke_retry_cost.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index df97a643..723a42c4 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -868,7 +868,7 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool: # Also check for excessive actual trailing newlines (not just escaped \\n) # This catches cases where raw newlines cause truncation actual_newline_count = len(content) - len(content.rstrip('\n')) - if actual_newline_count >= threshold and not stripped.endswith('}'): + if actual_newline_count >= threshold: return True # Also check for response that looks truncated mid-string diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py index c8f43e3e..3b1a9d91 100644 --- a/tests/test_e2e_issue_509_retry_cost.py +++ b/tests/test_e2e_issue_509_retry_cost.py @@ -1,21 +1,19 @@ """ E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA. -This test exercises the generate command through Click's CliRunner, simulating a scenario -where llm_invoke retries due to None content. The test verifies that the cost returned -by llm_invoke (and ultimately reported by the CLI) includes BOTH the original and retry -call costs, not just the retry cost. +This test exercises the generate command path by invoking llm_invoke directly, simulating +a scenario where llm_invoke retries due to None content. The test verifies that the cost +accumulated in _llm_mod._LAST_CALLBACK_DATA includes BOTH the original and retry call +costs, not just the retry cost. The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry, so the cost from the first LLM call is silently lost. 
""" -import csv import os import pytest from pathlib import Path from unittest.mock import patch, MagicMock -from click.testing import CliRunner import pdd.llm_invoke as _llm_mod @@ -176,13 +174,11 @@ def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch): import pandas as pd - # Two retries: costs should accumulate to $0.15 - costs = [0.05, 0.05, 0.05] - expected_total = sum(costs) + # One retry (two calls total): costs should accumulate to $0.10 + costs = [0.05, 0.05] - # First two calls return None, third succeeds + # First call returns None, second succeeds responses = [ - _make_response(content=None), _make_response(content=None), _make_response(content='print("hello")'), ] diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py index 140758a7..975d2a83 100644 --- a/tests/test_llm_invoke_retry_cost.py +++ b/tests/test_llm_invoke_retry_cost.py @@ -234,3 +234,13 @@ def test_none_content_retry_accumulates_tokens( assert reported_cost == pytest.approx(0.08), ( f"Expected accumulated cost of 0.08, got {reported_cost}" ) + + # Verify that input/output token counts are also accumulated across retries. + assert _llm_mod._LAST_CALLBACK_DATA["input_tokens"] == 2000, ( + f"Expected accumulated input tokens of 2000, " + f"got {_llm_mod._LAST_CALLBACK_DATA['input_tokens']}" + ) + assert _llm_mod._LAST_CALLBACK_DATA["output_tokens"] == 1000, ( + f"Expected accumulated output tokens of 1000, " + f"got {_llm_mod._LAST_CALLBACK_DATA['output_tokens']}" + ) From 9fc9ce796e784c1cd9bdf7544935b5c7791b38a5 Mon Sep 17 00:00:00 2001 From: Serhan Date: Sun, 15 Feb 2026 13:35:32 -0500 Subject: [PATCH 7/7] fix: restore litellm.cache in finally blocks to prevent cache leak on retry failure All 3 retry paths (None content, malformed JSON, invalid Python) set litellm.cache = None but only restored it on success. If the retry call raised an exception, cache stayed permanently disabled for the process. 
Co-Authored-By: Claude Opus 4.6 --- pdd/llm_invoke.py | 72 +++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py index 723a42c4..4bb51ba9 100644 --- a/pdd/llm_invoke.py +++ b/pdd/llm_invoke.py @@ -2461,21 +2461,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = configured_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - restore original configured cache (restore to original state, even if None) - litellm.cache = configured_cache # Extract result from retry retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None: @@ -2510,21 +2512,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = original_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - litellm.cache = original_cache # Extract result from retry retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None and not _is_malformed_json_response(retry_raw_result): @@ -2757,21 +2761,23 @@ def calc_strength(candidate): _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0) _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0) _accumulated_output_tokens = 
_LAST_CALLBACK_DATA.get("output_tokens", 0) - retry_response = litellm.completion( - model=model_name_litellm, - messages=retry_messages, - temperature=current_temperature, - response_format=response_format, - timeout=LLM_CALL_TIMEOUT, - **time_kwargs, - **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials - ) + try: + retry_response = litellm.completion( + model=model_name_litellm, + messages=retry_messages, + temperature=current_temperature, + response_format=response_format, + timeout=LLM_CALL_TIMEOUT, + **time_kwargs, + **retry_provider_kwargs # Issue #185: Pass Vertex AI credentials + ) + finally: + # Always restore cache, even if retry raises + litellm.cache = original_cache # Issue #509: Accumulate cost/tokens from original call + retry _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens - # Re-enable cache - litellm.cache = original_cache # Extract and re-parse the retry result retry_raw_result = retry_response.choices[0].message.content if retry_raw_result is not None: