From 7168d1f2e735b4b2bbf2b2582c83833960723fef Mon Sep 17 00:00:00 2001 From: Simon Hellmayr Date: Fri, 20 Feb 2026 13:35:10 +0100 Subject: [PATCH 1/4] fix(anthropic): fix token accounting --- sentry_sdk/integrations/anthropic.py | 17 +- .../integrations/anthropic/test_anthropic.py | 150 ++++++++++++++++++ 2 files changed, 165 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/integrations/anthropic.py b/sentry_sdk/integrations/anthropic.py index b131e3381d..6bf6dedd30 100644 --- a/sentry_sdk/integrations/anthropic.py +++ b/sentry_sdk/integrations/anthropic.py @@ -108,6 +108,11 @@ def _get_token_usage(result: "Messages") -> "tuple[int, int, int, int]": ): cache_write_input_tokens = usage.cache_creation_input_tokens + # Anthropic's input_tokens excludes cached/cache_write tokens. + # Normalize to total input tokens so downstream cost calculations + # (input_tokens - cached) don't produce negative values. + input_tokens += cache_read_input_tokens + cache_write_input_tokens + return ( input_tokens, output_tokens, @@ -466,11 +471,15 @@ def new_iterator() -> "Iterator[MessageStreamEvent]": ) yield event + # Anthropic's input_tokens excludes cached/cache_write tokens. + # Normalize to total input tokens for correct cost calculations. + total_input = usage.input_tokens + (usage.cache_read_input_tokens or 0) + (usage.cache_write_input_tokens or 0) + _set_output_data( span=span, integration=integration, model=model, - input_tokens=usage.input_tokens, + input_tokens=total_input, output_tokens=usage.output_tokens, cache_read_input_tokens=usage.cache_read_input_tokens, cache_write_input_tokens=usage.cache_write_input_tokens, @@ -496,11 +505,15 @@ async def new_iterator_async() -> "AsyncIterator[MessageStreamEvent]": ) yield event + # Anthropic's input_tokens excludes cached/cache_write tokens. + # Normalize to total input tokens for correct cost calculations. + total_input = usage.input_tokens + (usage.cache_read_input_tokens or 0) + (usage.cache_write_input_tokens or 0) + _set_output_data( span=span, integration=integration, model=model, - input_tokens=usage.input_tokens, + input_tokens=total_input, output_tokens=usage.output_tokens, cache_read_input_tokens=usage.cache_read_input_tokens, cache_write_input_tokens=usage.cache_write_input_tokens, diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index 49164adf32..7f119c6912 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -2265,6 +2265,156 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events): assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 +def test_input_tokens_include_cached_nonstreaming(sentry_init, capture_events): + """ + Test that gen_ai.usage.input_tokens includes cached tokens. + + Anthropic's usage.input_tokens excludes cached/cache_write tokens, + but gen_ai.usage.input_tokens should be the TOTAL input tokens + (including cached + cache_write) so that downstream cost calculations + don't produce negative values. + + See: negative gen_ai.cost.input_tokens bug when cache_read > input_tokens. 
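+
+    Worked example with this test's values: Anthropic reports
+    input_tokens=100 alongside cache_read=80 and cache_write=20, so the
+    SDK should report 100 + 80 + 20 = 200 input tokens and
+    200 + 50 = 250 total tokens.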
+ """ + sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) + events = capture_events() + client = Anthropic(api_key="z") + + # Simulate Anthropic response where input_tokens=100 EXCLUDES cached tokens + # cache_read=80 and cache_write=20 are separate + # Total input tokens processed = 100 + 80 + 20 = 200 + client.messages._post = mock.Mock( + return_value=Message( + id="id", + model="claude-3-5-sonnet-20241022", + role="assistant", + content=[TextBlock(type="text", text="Response")], + type="message", + usage=Usage( + input_tokens=100, + output_tokens=50, + cache_read_input_tokens=80, + cache_creation_input_tokens=20, + ), + ) + ) + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) + + (span,) = events[0]["spans"] + + # input_tokens should be total: 100 (non-cached) + 80 (cache_read) + 20 (cache_write) = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + + # total_tokens should include the full input count + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 # 200 + 50 + + # Cache fields should still be reported correctly + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + + +def test_input_tokens_include_cached_streaming(sentry_init, capture_events): + """ + Test that gen_ai.usage.input_tokens includes cached tokens for streaming responses. + + Same bug as non-streaming: Anthropic's input_tokens excludes cached tokens, + leading to negative cost calculations when cache_read > input_tokens. + """ + client = Anthropic(api_key="z") + returned_stream = Stream(cast_to=None, response=None, client=client) + returned_stream._iterator = [ + MessageStartEvent( + type="message_start", + message=Message( + id="id", + model="claude-3-5-sonnet-20241022", + role="assistant", + content=[], + type="message", + usage=Usage( + input_tokens=100, + output_tokens=0, + cache_read_input_tokens=80, + cache_creation_input_tokens=20, + ), + ), + ), + MessageDeltaEvent( + type="message_delta", + delta=Delta(stop_reason="end_turn"), + usage=MessageDeltaUsage(output_tokens=50), + ), + ] + + sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) + events = capture_events() + client.messages._post = mock.Mock(return_value=returned_stream) + + with start_transaction(name="anthropic"): + for _ in client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + stream=True, + ): + pass + + (span,) = events[0]["spans"] + + # input_tokens should be total: 100 + 80 + 20 = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + + # total_tokens should include the full input count + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 # 200 + 50 + + # Cache fields should still be reported correctly + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + + +def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): + """ + Test that input_tokens is unchanged when there are no cached tokens. + Ensures the fix doesn't break the non-caching case. 
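+
+    Expected with this test's values: input_tokens stays 100 and
+    total_tokens = 100 + 50 = 150.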
+ """ + sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) + events = capture_events() + client = Anthropic(api_key="z") + + client.messages._post = mock.Mock( + return_value=Message( + id="id", + model="claude-3-5-sonnet-20241022", + role="assistant", + content=[TextBlock(type="text", text="Response")], + type="message", + usage=Usage( + input_tokens=100, + output_tokens=50, + ), + ) + ) + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) + + (span,) = events[0]["spans"] + + # Without caching, input_tokens should remain as-is + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 150 # 100 + 50 + + def test_cache_tokens_streaming(sentry_init, capture_events): """Test cache tokens are tracked for streaming responses.""" client = Anthropic(api_key="z") From 120da10d4e287f698b4970c1fac9ae13bf29e140 Mon Sep 17 00:00:00 2001 From: Simon Hellmayr Date: Fri, 20 Feb 2026 13:37:42 +0100 Subject: [PATCH 2/4] formatting --- sentry_sdk/integrations/anthropic.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/integrations/anthropic.py b/sentry_sdk/integrations/anthropic.py index 6bf6dedd30..90aec4dd6c 100644 --- a/sentry_sdk/integrations/anthropic.py +++ b/sentry_sdk/integrations/anthropic.py @@ -473,7 +473,11 @@ def new_iterator() -> "Iterator[MessageStreamEvent]": # Anthropic's input_tokens excludes cached/cache_write tokens. # Normalize to total input tokens for correct cost calculations. - total_input = usage.input_tokens + (usage.cache_read_input_tokens or 0) + (usage.cache_write_input_tokens or 0) + total_input = ( + usage.input_tokens + + (usage.cache_read_input_tokens or 0) + + (usage.cache_write_input_tokens or 0) + ) _set_output_data( span=span, @@ -507,7 +511,11 @@ async def new_iterator_async() -> "AsyncIterator[MessageStreamEvent]": # Anthropic's input_tokens excludes cached/cache_write tokens. # Normalize to total input tokens for correct cost calculations. - total_input = usage.input_tokens + (usage.cache_read_input_tokens or 0) + (usage.cache_write_input_tokens or 0) + total_input = ( + usage.input_tokens + + (usage.cache_read_input_tokens or 0) + + (usage.cache_write_input_tokens or 0) + ) _set_output_data( span=span, From 6a5c43c4468a658dae2b8fd89bf2391614d888bf Mon Sep 17 00:00:00 2001 From: Simon Hellmayr Date: Fri, 20 Feb 2026 13:39:28 +0100 Subject: [PATCH 3/4] adapt tests --- .../integrations/anthropic/test_anthropic.py | 148 +++++++++++------- 1 file changed, 93 insertions(+), 55 deletions(-) diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index 7f119c6912..07239ee179 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -2265,36 +2265,34 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events): assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 -def test_input_tokens_include_cached_nonstreaming(sentry_init, capture_events): +def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_events): """ - Test that gen_ai.usage.input_tokens includes cached tokens. + Test that gen_ai.usage.input_tokens includes cache_write tokens (non-streaming). 
- Anthropic's usage.input_tokens excludes cached/cache_write tokens, - but gen_ai.usage.input_tokens should be the TOTAL input tokens - (including cached + cache_write) so that downstream cost calculations - don't produce negative values. + Reproduces a real Anthropic cache-write response. Anthropic's usage.input_tokens + only counts non-cached tokens, but gen_ai.usage.input_tokens should be the TOTAL + so downstream cost calculations don't produce negative values. - See: negative gen_ai.cost.input_tokens bug when cache_read > input_tokens. + Real Anthropic response (from E2E test): + Usage(input_tokens=19, output_tokens=14, + cache_creation_input_tokens=2846, cache_read_input_tokens=0) """ sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) events = capture_events() client = Anthropic(api_key="z") - # Simulate Anthropic response where input_tokens=100 EXCLUDES cached tokens - # cache_read=80 and cache_write=20 are separate - # Total input tokens processed = 100 + 80 + 20 = 200 client.messages._post = mock.Mock( return_value=Message( id="id", - model="claude-3-5-sonnet-20241022", + model="claude-sonnet-4-20250514", role="assistant", - content=[TextBlock(type="text", text="Response")], + content=[TextBlock(type="text", text="3 + 3 equals 6.")], type="message", usage=Usage( - input_tokens=100, - output_tokens=50, - cache_read_input_tokens=80, - cache_creation_input_tokens=20, + input_tokens=19, + output_tokens=14, + cache_read_input_tokens=0, + cache_creation_input_tokens=2846, ), ) ) @@ -2302,29 +2300,72 @@ def test_input_tokens_include_cached_nonstreaming(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create( max_tokens=1024, - messages=[{"role": "user", "content": "Hello"}], - model="claude-3-5-sonnet-20241022", + messages=[{"role": "user", "content": "What is 3+3?"}], + model="claude-sonnet-4-20250514", ) (span,) = events[0]["spans"] - # input_tokens should be total: 100 (non-cached) + 80 (cache_read) + 20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 - # total_tokens should include the full input count - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 # 200 + 50 - # Cache fields should still be reported correctly - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 +def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_events): + """ + Test that gen_ai.usage.input_tokens includes cache_read tokens (non-streaming). + + Reproduces a real Anthropic cache-hit response. This is the scenario that + caused negative gen_ai.cost.input_tokens: input_tokens=19 but cached=2846, + so the backend computed 19 - 2846 = -2827 "regular" tokens. 
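+    With the fix, the SDK reports 19 + 2846 = 2865 input tokens, so the
+    backend's regular-token count is 2865 - 2846 = 19, no longer negative.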
+ + Real Anthropic response (from E2E test): + Usage(input_tokens=19, output_tokens=14, + cache_creation_input_tokens=0, cache_read_input_tokens=2846) + """ + sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) + events = capture_events() + client = Anthropic(api_key="z") + + client.messages._post = mock.Mock( + return_value=Message( + id="id", + model="claude-sonnet-4-20250514", + role="assistant", + content=[TextBlock(type="text", text="5 + 5 = 10.")], + type="message", + usage=Usage( + input_tokens=19, + output_tokens=14, + cache_read_input_tokens=2846, + cache_creation_input_tokens=0, + ), + ) + ) + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = events[0]["spans"] -def test_input_tokens_include_cached_streaming(sentry_init, capture_events): + # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + + +def test_input_tokens_include_cache_read_streaming(sentry_init, capture_events): """ - Test that gen_ai.usage.input_tokens includes cached tokens for streaming responses. + Test that gen_ai.usage.input_tokens includes cache_read tokens (streaming). - Same bug as non-streaming: Anthropic's input_tokens excludes cached tokens, - leading to negative cost calculations when cache_read > input_tokens. + Same cache-hit scenario as non-streaming, using realistic streaming events. 
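+
+    Expected: input_tokens = 19 + 2846 = 2865 and
+    total_tokens = 2865 + 14 = 2879, matching the non-streaming case.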
""" client = Anthropic(api_key="z") returned_stream = Stream(cast_to=None, response=None, client=client) @@ -2333,22 +2374,22 @@ def test_input_tokens_include_cached_streaming(sentry_init, capture_events): type="message_start", message=Message( id="id", - model="claude-3-5-sonnet-20241022", + model="claude-sonnet-4-20250514", role="assistant", content=[], type="message", usage=Usage( - input_tokens=100, + input_tokens=19, output_tokens=0, - cache_read_input_tokens=80, - cache_creation_input_tokens=20, + cache_read_input_tokens=2846, + cache_creation_input_tokens=0, ), ), ), MessageDeltaEvent( type="message_delta", delta=Delta(stop_reason="end_turn"), - usage=MessageDeltaUsage(output_tokens=50), + usage=MessageDeltaUsage(output_tokens=14), ), ] @@ -2359,29 +2400,27 @@ def test_input_tokens_include_cached_streaming(sentry_init, capture_events): with start_transaction(name="anthropic"): for _ in client.messages.create( max_tokens=1024, - messages=[{"role": "user", "content": "Hello"}], - model="claude-3-5-sonnet-20241022", + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", stream=True, ): pass (span,) = events[0]["spans"] - # input_tokens should be total: 100 + 80 + 20 = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - - # total_tokens should include the full input count - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 # 200 + 50 - - # Cache fields should still be reported correctly - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + # input_tokens should be total: 19 + 2846 = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): """ Test that input_tokens is unchanged when there are no cached tokens. - Ensures the fix doesn't break the non-caching case. 
+ + Real Anthropic response (from E2E test, simple call without caching): + Usage(input_tokens=20, output_tokens=12) """ sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) events = capture_events() @@ -2390,13 +2429,13 @@ def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): client.messages._post = mock.Mock( return_value=Message( id="id", - model="claude-3-5-sonnet-20241022", + model="claude-sonnet-4-20250514", role="assistant", - content=[TextBlock(type="text", text="Response")], + content=[TextBlock(type="text", text="2+2 equals 4.")], type="message", usage=Usage( - input_tokens=100, - output_tokens=50, + input_tokens=20, + output_tokens=12, ), ) ) @@ -2404,15 +2443,14 @@ def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create( max_tokens=1024, - messages=[{"role": "user", "content": "Hello"}], - model="claude-3-5-sonnet-20241022", + messages=[{"role": "user", "content": "What is 2+2?"}], + model="claude-sonnet-4-20250514", ) (span,) = events[0]["spans"] - # Without caching, input_tokens should remain as-is - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 150 # 100 + 50 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 def test_cache_tokens_streaming(sentry_init, capture_events): From df065fb64419ff70c539c4e27a083599676de316 Mon Sep 17 00:00:00 2001 From: Simon Hellmayr Date: Fri, 20 Feb 2026 14:45:23 +0100 Subject: [PATCH 4/4] adapt existing tests so they would have failed --- tests/integrations/anthropic/test_anthropic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index 07239ee179..4361ba9629 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -2261,6 +2261,10 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events): ) (span,) = events[0]["spans"] + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 @@ -2495,5 +2499,9 @@ def test_cache_tokens_streaming(sentry_init, capture_events): pass (span,) = events[0]["spans"] + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20