Merged
25 changes: 23 additions & 2 deletions sentry_sdk/integrations/anthropic.py
@@ -108,6 +108,11 @@ def _get_token_usage(result: "Messages") -> "tuple[int, int, int, int]":
):
cache_write_input_tokens = usage.cache_creation_input_tokens

# Anthropic's input_tokens excludes cached/cache_write tokens.
# Normalize to total input tokens so downstream cost calculations
# (input_tokens - cached) don't produce negative values.
input_tokens += cache_read_input_tokens + cache_write_input_tokens
Comment on lines +111 to +114

Contributor:

You're already adding cache read and write afterwards, outside this function; can this be removed?

Member (Author):

No, I don't think we can remove this: there are two code paths below, one for non-streaming and one for streaming. The streaming one does not use this function to retrieve token usage, so it needs to normalize separately.
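
For reference, a minimal sketch of the normalization both paths need. The helper name is hypothetical (not part of this PR), and the field names follow the Anthropic SDK's Usage model, where the cache fields may be missing or None:

# Hypothetical shared helper; both code paths could call this instead of
# duplicating the sum inline.
def total_input_tokens(usage):
    # Anthropic's input_tokens excludes cached and cache-write tokens,
    # so add both back to get the true total.
    return (
        usage.input_tokens
        + (getattr(usage, "cache_read_input_tokens", 0) or 0)
        + (getattr(usage, "cache_creation_input_tokens", 0) or 0)
    )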


return (
input_tokens,
output_tokens,
@@ -466,11 +471,19 @@ def new_iterator() -> "Iterator[MessageStreamEvent]":
)
yield event

# Anthropic's input_tokens excludes cached/cache_write tokens.
# Normalize to total input tokens for correct cost calculations.
total_input = (
usage.input_tokens
+ (usage.cache_read_input_tokens or 0)
+ (usage.cache_write_input_tokens or 0)
)

_set_output_data(
span=span,
integration=integration,
model=model,
-                input_tokens=usage.input_tokens,
+                input_tokens=total_input,
output_tokens=usage.output_tokens,
cache_read_input_tokens=usage.cache_read_input_tokens,
cache_write_input_tokens=usage.cache_write_input_tokens,
@@ -496,11 +509,19 @@ async def new_iterator_async() -> "AsyncIterator[MessageStreamEvent]":
)
yield event

# Anthropic's input_tokens excludes cached/cache_write tokens.
# Normalize to total input tokens for correct cost calculations.
total_input = (
usage.input_tokens
+ (usage.cache_read_input_tokens or 0)
+ (usage.cache_write_input_tokens or 0)
)

_set_output_data(
span=span,
integration=integration,
model=model,
-                input_tokens=usage.input_tokens,
+                input_tokens=total_input,
output_tokens=usage.output_tokens,
cache_read_input_tokens=usage.cache_read_input_tokens,
cache_write_input_tokens=usage.cache_write_input_tokens,
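The motivating failure mode, as a quick arithmetic sketch (assuming, per the comment in the diff above, that the backend derives the "regular" token count from input_tokens minus cached tokens):

# Cache-hit response as reported by Anthropic (same numbers as the tests below):
input_tokens = 19   # Anthropic counts only non-cached tokens here
cache_read = 2846

# Before this change, the downstream subtraction went negative:
assert input_tokens - cache_read == -2827

# After normalization, input_tokens is the total and the subtraction is safe:
total_input = input_tokens + cache_read  # 2865
assert total_input - cache_read == 19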
196 changes: 196 additions & 0 deletions tests/integrations/anthropic/test_anthropic.py
@@ -2261,10 +2261,202 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events):
)

(span,) = events[0]["spans"]
# input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200
assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20


def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_events):
"""
Test that gen_ai.usage.input_tokens includes cache_write tokens (non-streaming).

Reproduces a real Anthropic cache-write response. Anthropic's usage.input_tokens
only counts non-cached tokens, but gen_ai.usage.input_tokens should be the TOTAL
so downstream cost calculations don't produce negative values.

Real Anthropic response (from E2E test):
Usage(input_tokens=19, output_tokens=14,
cache_creation_input_tokens=2846, cache_read_input_tokens=0)
"""
sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
events = capture_events()
client = Anthropic(api_key="z")

client.messages._post = mock.Mock(
return_value=Message(
id="id",
model="claude-sonnet-4-20250514",
role="assistant",
content=[TextBlock(type="text", text="3 + 3 equals 6.")],
type="message",
usage=Usage(
input_tokens=19,
output_tokens=14,
cache_read_input_tokens=0,
cache_creation_input_tokens=2846,
),
)
)

with start_transaction(name="anthropic"):
client.messages.create(
max_tokens=1024,
messages=[{"role": "user", "content": "What is 3+3?"}],
model="claude-sonnet-4-20250514",
)

(span,) = events[0]["spans"]

# input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846


def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_events):
"""
Test that gen_ai.usage.input_tokens includes cache_read tokens (non-streaming).

Reproduces a real Anthropic cache-hit response. This is the scenario that
caused negative gen_ai.cost.input_tokens: input_tokens=19 but cached=2846,
so the backend computed 19 - 2846 = -2827 "regular" tokens.

Real Anthropic response (from E2E test):
Usage(input_tokens=19, output_tokens=14,
cache_creation_input_tokens=0, cache_read_input_tokens=2846)
"""
sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
events = capture_events()
client = Anthropic(api_key="z")

client.messages._post = mock.Mock(
return_value=Message(
id="id",
model="claude-sonnet-4-20250514",
role="assistant",
content=[TextBlock(type="text", text="5 + 5 = 10.")],
type="message",
usage=Usage(
input_tokens=19,
output_tokens=14,
cache_read_input_tokens=2846,
cache_creation_input_tokens=0,
),
)
)

with start_transaction(name="anthropic"):
client.messages.create(
max_tokens=1024,
messages=[{"role": "user", "content": "What is 5+5?"}],
model="claude-sonnet-4-20250514",
)

(span,) = events[0]["spans"]

# input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0


def test_input_tokens_include_cache_read_streaming(sentry_init, capture_events):
"""
Test that gen_ai.usage.input_tokens includes cache_read tokens (streaming).

Same cache-hit scenario as non-streaming, using realistic streaming events.
"""
client = Anthropic(api_key="z")
returned_stream = Stream(cast_to=None, response=None, client=client)
returned_stream._iterator = [
MessageStartEvent(
type="message_start",
message=Message(
id="id",
model="claude-sonnet-4-20250514",
role="assistant",
content=[],
type="message",
usage=Usage(
input_tokens=19,
output_tokens=0,
cache_read_input_tokens=2846,
cache_creation_input_tokens=0,
),
),
),
MessageDeltaEvent(
type="message_delta",
delta=Delta(stop_reason="end_turn"),
usage=MessageDeltaUsage(output_tokens=14),
),
]

sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
events = capture_events()
client.messages._post = mock.Mock(return_value=returned_stream)

with start_transaction(name="anthropic"):
for _ in client.messages.create(
max_tokens=1024,
messages=[{"role": "user", "content": "What is 5+5?"}],
model="claude-sonnet-4-20250514",
stream=True,
):
pass

(span,) = events[0]["spans"]

# input_tokens should be total: 19 + 2846 = 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0


def test_input_tokens_unchanged_without_caching(sentry_init, capture_events):
"""
Test that input_tokens is unchanged when there are no cached tokens.

Real Anthropic response (from E2E test, simple call without caching):
Usage(input_tokens=20, output_tokens=12)
"""
sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
events = capture_events()
client = Anthropic(api_key="z")

client.messages._post = mock.Mock(
return_value=Message(
id="id",
model="claude-sonnet-4-20250514",
role="assistant",
content=[TextBlock(type="text", text="2+2 equals 4.")],
type="message",
usage=Usage(
input_tokens=20,
output_tokens=12,
),
)
)

with start_transaction(name="anthropic"):
client.messages.create(
max_tokens=1024,
messages=[{"role": "user", "content": "What is 2+2?"}],
model="claude-sonnet-4-20250514",
)

(span,) = events[0]["spans"]

assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12


def test_cache_tokens_streaming(sentry_init, capture_events):
"""Test cache tokens are tracked for streaming responses."""
client = Anthropic(api_key="z")
@@ -2307,5 +2499,9 @@ def test_cache_tokens_streaming(sentry_init, capture_events):
pass

(span,) = events[0]["spans"]
# input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200
assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80
assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20