diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py
index fae07244..4bb51ba9 100644
--- a/pdd/llm_invoke.py
+++ b/pdd/llm_invoke.py
@@ -865,6 +865,12 @@ def _is_malformed_json_response(content: str, threshold: int = 100) -> bool:
     if trailing_newline_count >= threshold:
         return True
 
+    # Also check for excessive actual trailing newlines (not just escaped \\n)
+    # This catches cases where raw newlines cause truncation
+    actual_newline_count = len(content) - len(content.rstrip('\n'))
+    if actual_newline_count >= threshold:
+        return True
+
     # Also check for response that looks truncated mid-string
     # (ends with characters that suggest we're inside a JSON string value)
     if not stripped.endswith('}') and not stripped.endswith(']') and not stripped.endswith('"'):
@@ -2451,17 +2457,27 @@ def calc_strength(candidate):
                 retry_messages = _format_messages(modified_prompt, input_json, use_batch_mode)
                 # Disable cache for retry
                 litellm.cache = None
-                retry_response = litellm.completion(
-                    model=model_name_litellm,
-                    messages=retry_messages,
-                    temperature=current_temperature,
-                    response_format=response_format,
-                    timeout=LLM_CALL_TIMEOUT,
-                    **time_kwargs,
-                    **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
-                )
-                # Re-enable cache - restore original configured cache (restore to original state, even if None)
-                litellm.cache = configured_cache
+                # Issue #509: Save accumulated cost/tokens before retry overwrites callback data
+                _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
+                _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
+                _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
+                try:
+                    retry_response = litellm.completion(
+                        model=model_name_litellm,
+                        messages=retry_messages,
+                        temperature=current_temperature,
+                        response_format=response_format,
+                        timeout=LLM_CALL_TIMEOUT,
+                        **time_kwargs,
+                        **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
+                    )
+                finally:
+                    # Always restore cache, even if retry raises
+                    litellm.cache = configured_cache
+                # Issue #509: Accumulate cost/tokens from original call + retry
+                _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
+                _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
+                _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
                 # Extract result from retry
                 retry_raw_result = retry_response.choices[0].message.content
                 if retry_raw_result is not None:
@@ -2492,17 +2508,27 @@ def calc_strength(candidate):
                 # Disable cache for retry
                 original_cache = litellm.cache
                 litellm.cache = None
-                retry_response = litellm.completion(
-                    model=model_name_litellm,
-                    messages=retry_messages,
-                    temperature=current_temperature,
-                    response_format=response_format,
-                    timeout=LLM_CALL_TIMEOUT,
-                    **time_kwargs,
-                    **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
-                )
-                # Re-enable cache
-                litellm.cache = original_cache
+                # Issue #509: Save accumulated cost/tokens before retry overwrites callback data
+                _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
+                _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
+                _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
+                try:
+                    retry_response = litellm.completion(
+                        model=model_name_litellm,
+                        messages=retry_messages,
+                        temperature=current_temperature,
+                        response_format=response_format,
+                        timeout=LLM_CALL_TIMEOUT,
+                        **time_kwargs,
+                        **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
+                    )
+                finally:
+                    # Always restore cache, even if retry raises
+                    litellm.cache = original_cache
+                # Issue #509: Accumulate cost/tokens from original call + retry
+                _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
+                _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
+                _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
                 # Extract result from retry
                 retry_raw_result = retry_response.choices[0].message.content
                 if retry_raw_result is not None and not _is_malformed_json_response(retry_raw_result):
@@ -2731,17 +2757,27 @@ def calc_strength(candidate):
                 # Disable cache for retry
                 original_cache = litellm.cache
                 litellm.cache = None
-                retry_response = litellm.completion(
-                    model=model_name_litellm,
-                    messages=retry_messages,
-                    temperature=current_temperature,
-                    response_format=response_format,
-                    timeout=LLM_CALL_TIMEOUT,
-                    **time_kwargs,
-                    **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
-                )
-                # Re-enable cache
-                litellm.cache = original_cache
+                # Issue #509: Save accumulated cost/tokens before retry overwrites callback data
+                _accumulated_cost = _LAST_CALLBACK_DATA.get("cost", 0.0)
+                _accumulated_input_tokens = _LAST_CALLBACK_DATA.get("input_tokens", 0)
+                _accumulated_output_tokens = _LAST_CALLBACK_DATA.get("output_tokens", 0)
+                try:
+                    retry_response = litellm.completion(
+                        model=model_name_litellm,
+                        messages=retry_messages,
+                        temperature=current_temperature,
+                        response_format=response_format,
+                        timeout=LLM_CALL_TIMEOUT,
+                        **time_kwargs,
+                        **retry_provider_kwargs  # Issue #185: Pass Vertex AI credentials
+                    )
+                finally:
+                    # Always restore cache, even if retry raises
+                    litellm.cache = original_cache
+                # Issue #509: Accumulate cost/tokens from original call + retry
+                _LAST_CALLBACK_DATA["cost"] = _LAST_CALLBACK_DATA.get("cost", 0.0) + _accumulated_cost
+                _LAST_CALLBACK_DATA["input_tokens"] = _LAST_CALLBACK_DATA.get("input_tokens", 0) + _accumulated_input_tokens
+                _LAST_CALLBACK_DATA["output_tokens"] = _LAST_CALLBACK_DATA.get("output_tokens", 0) + _accumulated_output_tokens
                 # Extract and re-parse the retry result
                 retry_raw_result = retry_response.choices[0].message.content
                 if retry_raw_result is not None:
diff --git a/tests/test_e2e_issue_509_retry_cost.py b/tests/test_e2e_issue_509_retry_cost.py
new file mode 100644
index 00000000..3b1a9d91
--- /dev/null
+++ b/tests/test_e2e_issue_509_retry_cost.py
@@ -0,0 +1,232 @@
+"""
+E2E Test for Issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA.
+
+This test exercises the generate command path by invoking llm_invoke directly, simulating
+a scenario where llm_invoke retries due to None content. The test verifies that the cost
+accumulated in _llm_mod._LAST_CALLBACK_DATA includes BOTH the original and retry call
+costs, not just the retry cost.
+
+The bug: _llm_mod._LAST_CALLBACK_DATA["cost"] gets overwritten (not accumulated) on retry,
+so the cost from the first LLM call is silently lost.
+"""
+
+import os
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pdd.llm_invoke as _llm_mod
+
+
+@pytest.fixture(autouse=True)
+def set_pdd_path(monkeypatch):
+    """Set PDD_PATH so language detection can find language_format.csv."""
+    import pdd
+    pdd_package_dir = Path(pdd.__file__).parent
+    monkeypatch.setenv("PDD_PATH", str(pdd_package_dir))
+
+
+@pytest.fixture(autouse=True)
+def reset_callback_data():
+    """Reset _llm_mod._LAST_CALLBACK_DATA before each test."""
+    _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0
+    _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0
+    _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0
+    _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None
+    yield
+
+
+def _make_response(content, prompt_tokens=100, completion_tokens=50):
+    resp = MagicMock()
+    choice = MagicMock()
+    choice.message.content = content
+    choice.finish_reason = "stop"
+    resp.choices = [choice]
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    usage.completion_tokens = completion_tokens
+    resp.usage = usage
+    return resp
+
+
+class TestE2ERetryCostAccumulation:
+    """E2E test: llm_invoke cost accumulation through the generate command path."""
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    def test_generate_command_reports_accumulated_retry_cost(self, tmp_path, monkeypatch):
+        """
+        Exercise the full generate -> llm_invoke -> retry path and verify
+        that the cost returned includes both the original and retry call costs.
+
+        This simulates a user running `pdd generate` where the first LLM call
+        returns None content, triggering a cache-bypass retry. The total cost
+        should be $0.05 + $0.03 = $0.08, but the bug causes it to report $0.03.
+        """
+        monkeypatch.chdir(tmp_path)
+
+        # Create a prompt file
+        prompt_file = tmp_path / "hello_python.prompt"
+        prompt_file.write_text("Write a Python function that prints hello world.\n{input}")
+
+        # Set up mock model data
+        mock_model = {
+            "provider": "Anthropic",
+            "model": "claude-test-model",
+            "input": 3.0,
+            "output": 15.0,
+            "coding_arena_elo": 1500,
+            "structured_output": False,
+            "base_url": "",
+            "api_key": "OPENAI_API_KEY",
+            "max_tokens": "",
+            "max_completion_tokens": "",
+            "reasoning_type": "none",
+            "max_reasoning_tokens": 0,
+        }
+
+        import pandas as pd
+
+        first_call_cost = 0.05
+        retry_call_cost = 0.03
+        expected_total = first_call_cost + retry_call_cost  # $0.08
+
+        first_resp = _make_response(content=None)  # Triggers retry
+        retry_resp = _make_response(content='def hello():\n    print("Hello world")\n')
+
+        call_count = [0]
+
+        def mock_completion(**kwargs):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                _llm_mod._LAST_CALLBACK_DATA["cost"] = first_call_cost
+                _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
+                _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
+                return first_resp
+            else:
+                _llm_mod._LAST_CALLBACK_DATA["cost"] = retry_call_cost
+                _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
+                _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
+                return retry_resp
+
+        # Patch at the llm_invoke level to control LLM behavior
+        with patch("pdd.llm_invoke._load_model_data") as mock_load, \
+             patch("pdd.llm_invoke._select_model_candidates") as mock_select, \
+             patch("pdd.llm_invoke._ensure_api_key", return_value=True), \
+             patch("pdd.llm_invoke.litellm") as mock_litellm:
+
+            mock_load.return_value = pd.DataFrame([mock_model])
+            mock_select.return_value = [mock_model]
+            mock_litellm.completion = mock_completion
+            mock_litellm.cache = MagicMock()  # Non-None triggers cache-bypass retry path
+            mock_litellm.drop_params = True
+
+            # Call llm_invoke directly as the generate command would
+            from pdd.llm_invoke import llm_invoke
+            result = llm_invoke(
+                prompt="Write a Python function that prints hello world.\n{input}",
+                input_json={"input": ""},
+                strength=0.5,
+                temperature=0.0,
+                use_cloud=False,
+            )
+
+        # Verify 2 LLM calls were made (original + retry)
+        assert call_count[0] == 2, (
+            f"Expected 2 LLM calls (original + retry), got {call_count[0]}"
+        )
+
+        # THE BUG CHECK: cost should be accumulated ($0.05 + $0.03 = $0.08)
+        reported_cost = result["cost"]
+        assert reported_cost == pytest.approx(expected_total), (
+            f"ISSUE #509 BUG DETECTED: Retry cost overwrites instead of accumulating!\n\n"
+            f"  First call cost: ${first_call_cost:.2f}\n"
+            f"  Retry call cost: ${retry_call_cost:.2f}\n"
+            f"  Expected total:  ${expected_total:.2f}\n"
+            f"  Reported total:  ${reported_cost:.2f}\n"
+            f"  Lost cost:       ${expected_total - reported_cost:.2f}\n\n"
+            f"The --output-cost CSV would under-report by ${expected_total - reported_cost:.2f} per retry."
+        )
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    def test_budget_enforcement_sees_full_retry_cost(self, tmp_path, monkeypatch):
+        """
+        E2E test: When budget is tight, under-reported retry costs could allow overspend.
+
+        Simulates a scenario where a user sets --budget and multiple retries occur.
+        The budget check should see the accumulated cost, not just the last retry's cost.
+        """
+        monkeypatch.chdir(tmp_path)
+
+        mock_model = {
+            "provider": "Anthropic",
+            "model": "claude-test-model",
+            "input": 3.0,
+            "output": 15.0,
+            "coding_arena_elo": 1500,
+            "structured_output": False,
+            "base_url": "",
+            "api_key": "OPENAI_API_KEY",
+            "max_tokens": "",
+            "max_completion_tokens": "",
+            "reasoning_type": "none",
+            "max_reasoning_tokens": 0,
+        }
+
+        import pandas as pd
+
+        # One retry (two calls total): costs should accumulate to $0.10
+        costs = [0.05, 0.05]
+
+        # First call returns None, second succeeds
+        responses = [
+            _make_response(content=None),
+            _make_response(content='print("hello")'),
+        ]
+
+        call_count = [0]
+
+        def mock_completion(**kwargs):
+            idx = call_count[0]
+            call_count[0] += 1
+            # Only first call can trigger cache-bypass retry; subsequent are direct
+            if idx < len(costs):
+                _llm_mod._LAST_CALLBACK_DATA["cost"] = costs[idx]
+                _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
+                _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
+                return responses[idx]
+            # Fallback
+            _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.05
+            _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 100
+            _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 50
+            return responses[-1]
+
+        with patch("pdd.llm_invoke._load_model_data") as mock_load, \
+             patch("pdd.llm_invoke._select_model_candidates") as mock_select, \
+             patch("pdd.llm_invoke._ensure_api_key", return_value=True), \
+             patch("pdd.llm_invoke.litellm") as mock_litellm:
+
+            mock_load.return_value = pd.DataFrame([mock_model])
+            mock_select.return_value = [mock_model]
+            mock_litellm.completion = mock_completion
+            mock_litellm.cache = MagicMock()
+            mock_litellm.drop_params = True
+
+            from pdd.llm_invoke import llm_invoke
+            result = llm_invoke(
+                prompt="Test {input}",
+                input_json={"input": "hello"},
+                strength=0.5,
+                temperature=0.0,
+                use_cloud=False,
+            )
+
+        reported_cost = result["cost"]
+
+        # The None-content retry path does one retry, so 2 calls made.
+        # Expected accumulated cost: $0.05 + $0.05 = $0.10
+        # Bug reports only the retry cost: $0.05
+        assert reported_cost == pytest.approx(costs[0] + costs[1]), (
+            f"ISSUE #509 BUG: Budget enforcement sees only ${reported_cost:.2f}, "
+            f"but actual spend was ${costs[0] + costs[1]:.2f} (2 calls). "
+            f"This allows overspend when --budget is set."
+        )
diff --git a/tests/test_llm_invoke_retry_cost.py b/tests/test_llm_invoke_retry_cost.py
new file mode 100644
index 00000000..975d2a83
--- /dev/null
+++ b/tests/test_llm_invoke_retry_cost.py
@@ -0,0 +1,246 @@
+"""Tests for issue #509: Cost under-reporting when retry calls overwrite _llm_mod._LAST_CALLBACK_DATA.
+
+When llm_invoke() retries due to None content, malformed JSON, or invalid Python syntax,
+the success callback overwrites _llm_mod._LAST_CALLBACK_DATA["cost"] with only the retry's cost,
+silently losing the original call's cost.
+"""
+
+import pytest
+import os
+import pandas as pd
+from unittest.mock import patch, MagicMock
+
+import pdd.llm_invoke as _llm_mod
+
+
+def _make_mock_model():
+    return {
+        "provider": "Anthropic",
+        "model": "claude-test-model",
+        "input": 3.0,
+        "output": 15.0,
+        "coding_arena_elo": 1500,
+        "structured_output": False,
+        "base_url": "",
+        "api_key": "OPENAI_API_KEY",
+        "max_tokens": "",
+        "max_completion_tokens": "",
+        "reasoning_type": "none",
+        "max_reasoning_tokens": 0,
+    }
+
+
+def _make_response(content, prompt_tokens=100, completion_tokens=50):
+    resp = MagicMock()
+    choice = MagicMock()
+    choice.message.content = content
+    choice.finish_reason = "stop"
+    resp.choices = [choice]
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    usage.completion_tokens = completion_tokens
+    resp.usage = usage
+    return resp
+
+
+def _make_completion_side_effect(responses_and_costs):
+    """Create a side_effect function that sets _llm_mod._LAST_CALLBACK_DATA on each call.
+
+    responses_and_costs: list of (response, cost) tuples
+    """
+    call_count = [0]
+
+    def side_effect(**kwargs):
+        idx = call_count[0]
+        call_count[0] += 1
+        resp, cost = responses_and_costs[idx]
+        # Simulate the callback overwriting _llm_mod._LAST_CALLBACK_DATA
+        _llm_mod._LAST_CALLBACK_DATA["cost"] = cost
+        _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = resp.usage.prompt_tokens
+        _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = resp.usage.completion_tokens
+        return resp
+
+    return side_effect
+
+
+@pytest.fixture(autouse=True)
+def reset_callback_data():
+    """Reset _llm_mod._LAST_CALLBACK_DATA before each test."""
+    _llm_mod._LAST_CALLBACK_DATA["cost"] = 0.0
+    _llm_mod._LAST_CALLBACK_DATA["input_tokens"] = 0
+    _llm_mod._LAST_CALLBACK_DATA["output_tokens"] = 0
+    _llm_mod._LAST_CALLBACK_DATA["finish_reason"] = None
+    yield
+
+
+class TestRetryCostAccumulation:
+    """Tests that retry costs are accumulated, not overwritten."""
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    @patch("pdd.llm_invoke._ensure_api_key", return_value=True)
+    @patch("pdd.llm_invoke._select_model_candidates")
+    @patch("pdd.llm_invoke._load_model_data")
+    @patch("pdd.llm_invoke.litellm")
+    def test_none_content_retry_accumulates_cost(
+        self, mock_litellm, mock_load, mock_select, mock_ensure
+    ):
+        """When first call returns None content and retry succeeds,
+        the reported cost should include BOTH calls' costs."""
+        model = _make_mock_model()
+        mock_load.return_value = pd.DataFrame([model])
+        mock_select.return_value = [model]
+        mock_litellm.cache = MagicMock()  # Non-None so retry path triggers
+        mock_litellm.drop_params = True
+
+        first_call_cost = 0.05
+        retry_call_cost = 0.03
+        expected_total = first_call_cost + retry_call_cost
+
+        first_resp = _make_response(content=None)  # Triggers None-content retry
+        retry_resp = _make_response(content="Hello world")
+
+        mock_litellm.completion = _make_completion_side_effect([
+            (first_resp, first_call_cost),
+            (retry_resp, retry_call_cost),
+        ])
+
+        from pdd.llm_invoke import llm_invoke
+        result = llm_invoke(
+            prompt="Test {input}",
+            input_json={"input": "hello"},
+            strength=0.5,
+            temperature=0.0,
+            use_cloud=False,
+        )
+
+        reported_cost = result["cost"]
+        assert reported_cost == pytest.approx(expected_total), (
+            f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), "
+            f"but got {reported_cost}. Retry cost overwrote instead of accumulating."
+        )
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    @patch("pdd.llm_invoke._ensure_api_key", return_value=True)
+    @patch("pdd.llm_invoke._select_model_candidates")
+    @patch("pdd.llm_invoke._load_model_data")
+    @patch("pdd.llm_invoke.litellm")
+    def test_malformed_json_retry_accumulates_cost(
+        self, mock_litellm, mock_load, mock_select, mock_ensure
+    ):
+        """When first call returns malformed JSON and retry succeeds,
+        the reported cost should include BOTH calls' costs."""
+        model = _make_mock_model()
+        mock_load.return_value = pd.DataFrame([model])
+        mock_select.return_value = [model]
+        mock_litellm.cache = MagicMock()
+        mock_litellm.drop_params = True
+
+        first_call_cost = 0.04
+        retry_call_cost = 0.04
+        expected_total = first_call_cost + retry_call_cost
+
+        # Malformed JSON: excessive trailing newlines
+        malformed = '{"key": "value' + '\n' * 500
+        first_resp = _make_response(content=malformed)
+        retry_resp = _make_response(content='{"key": "value"}')
+
+        mock_litellm.completion = _make_completion_side_effect([
+            (first_resp, first_call_cost),
+            (retry_resp, retry_call_cost),
+        ])
+
+        from pdd.llm_invoke import llm_invoke
+        result = llm_invoke(
+            prompt="Test {input}",
+            input_json={"input": "hello"},
+            strength=0.5,
+            temperature=0.0,
+            use_cloud=False,
+        )
+
+        reported_cost = result["cost"]
+        assert reported_cost == pytest.approx(expected_total), (
+            f"Cost should be {expected_total} (first {first_call_cost} + retry {retry_call_cost}), "
+            f"but got {reported_cost}. Retry cost overwrote instead of accumulating."
+        )
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    @patch("pdd.llm_invoke._ensure_api_key", return_value=True)
+    @patch("pdd.llm_invoke._select_model_candidates")
+    @patch("pdd.llm_invoke._load_model_data")
+    @patch("pdd.llm_invoke.litellm")
+    def test_no_retry_cost_unchanged(
+        self, mock_litellm, mock_load, mock_select, mock_ensure
+    ):
+        """When no retry occurs, cost should be reported normally (regression guard)."""
+        model = _make_mock_model()
+        mock_load.return_value = pd.DataFrame([model])
+        mock_select.return_value = [model]
+        mock_litellm.cache = None
+        mock_litellm.drop_params = True
+
+        single_cost = 0.05
+        resp = _make_response(content="Hello world")
+
+        mock_litellm.completion = _make_completion_side_effect([
+            (resp, single_cost),
+        ])
+
+        from pdd.llm_invoke import llm_invoke
+        result = llm_invoke(
+            prompt="Test {input}",
+            input_json={"input": "hello"},
+            strength=0.5,
+            temperature=0.0,
+            use_cloud=False,
+        )
+
+        assert result["cost"] == pytest.approx(single_cost)
+
+    @patch.dict(os.environ, {"PDD_FORCE_LOCAL": "1", "OPENAI_API_KEY": "test-key"})
+    @patch("pdd.llm_invoke._ensure_api_key", return_value=True)
+    @patch("pdd.llm_invoke._select_model_candidates")
+    @patch("pdd.llm_invoke._load_model_data")
+    @patch("pdd.llm_invoke.litellm")
+    def test_none_content_retry_accumulates_tokens(
+        self, mock_litellm, mock_load, mock_select, mock_ensure
+    ):
+        """Token counts should also accumulate across retries."""
+        model = _make_mock_model()
+        mock_load.return_value = pd.DataFrame([model])
+        mock_select.return_value = [model]
+        mock_litellm.cache = MagicMock()
+        mock_litellm.drop_params = True
+
+        first_resp = _make_response(content=None, prompt_tokens=1000, completion_tokens=500)
+        retry_resp = _make_response(content="Hello", prompt_tokens=1000, completion_tokens=500)
+
+        mock_litellm.completion = _make_completion_side_effect([
+            (first_resp, 0.05),
+            (retry_resp, 0.03),
+        ])
+
+        from pdd.llm_invoke import llm_invoke
+        result = llm_invoke(
+            prompt="Test {input}",
+            input_json={"input": "hello"},
+            strength=0.5,
+            temperature=0.0,
+            use_cloud=False,
+        )
+
+        # After fix, tokens should be accumulated (2000 input, 1000 output)
+        # Currently they get overwritten to just the retry's tokens
+        reported_cost = result["cost"]
+        assert reported_cost == pytest.approx(0.08), (
+            f"Expected accumulated cost of 0.08, got {reported_cost}"
+        )
+
+        # Verify that input/output token counts are also accumulated across retries.
+        assert _llm_mod._LAST_CALLBACK_DATA["input_tokens"] == 2000, (
+            f"Expected accumulated input tokens of 2000, "
+            f"got {_llm_mod._LAST_CALLBACK_DATA['input_tokens']}"
+        )
+        assert _llm_mod._LAST_CALLBACK_DATA["output_tokens"] == 1000, (
+            f"Expected accumulated output tokens of 1000, "
+            f"got {_llm_mod._LAST_CALLBACK_DATA['output_tokens']}"
+        )