5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+# 7.4.3 - 2026-01-02
+
+Fixes cache creation cost for Langchain with Anthropic
+
# 7.4.2 - 2025-12-22

feat: add `in_app_modules` option to control code variables capturing
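
For context, the cost fix above applies to LangChain runs traced with posthog-python's `CallbackHandler`. A minimal usage sketch, assuming the documented import path; the API key, host, and model name are placeholders rather than values taken from this PR:

```python
# Minimal sketch of tracing a LangChain + Anthropic call with posthog-python.
# The key, host, and model name are placeholders, not values from this PR.
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from posthog import Posthog
from posthog.ai.langchain import CallbackHandler

posthog = Posthog("<ph_project_api_key>", host="https://us.i.posthog.com")
handler = CallbackHandler(posthog)

prompt = ChatPromptTemplate.from_messages([("user", "Summarize: {text}")])
chain = prompt | ChatAnthropic(model="claude-3-5-sonnet-latest")

# On runs like this, the 7.4.3 fix excludes Anthropic cache read/write tokens
# from $ai_input_tokens so that cache creation is costed correctly.
result = chain.invoke({"text": "..."}, config={"callbacks": [handler]})
```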
@@ -13,6 +17,7 @@ When using OpenAI stored prompts, the model is defined in the OpenAI dashboard r
feat: Add automatic retries for feature flag requests

Feature flag API requests now automatically retry on transient failures:

- Network errors (connection refused, DNS failures, timeouts)
- Server errors (500, 502, 503, 504)
- Up to 2 retries with exponential backoff (0.5s, 1s delays)
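
The retry behaviour described above can be pictured with a short sketch. This is not the SDK's actual implementation; `fetch_flags_with_retry` and its parameters are hypothetical names used only to illustrate the 0.5s / 1s backoff schedule:

```python
import time

import requests

RETRIABLE_STATUS = {500, 502, 503, 504}
BACKOFF_DELAYS = [0.5, 1.0]  # sleep after the 1st and 2nd failed attempts


def fetch_flags_with_retry(url, payload, timeout=3.0):
    """One initial attempt plus up to 2 retries, matching the policy described above."""
    last_error = None
    for attempt in range(len(BACKOFF_DELAYS) + 1):
        try:
            response = requests.post(url, json=payload, timeout=timeout)
            if response.status_code not in RETRIABLE_STATUS:
                return response  # success, or a non-retriable status such as 4xx
            last_error = RuntimeError(f"retriable server error: {response.status_code}")
        except requests.RequestException as exc:  # connection refused, DNS failure, timeout
            last_error = exc
        if attempt < len(BACKOFF_DELAYS):
            time.sleep(BACKOFF_DELAYS[attempt])
    raise last_error
```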
20 changes: 11 additions & 9 deletions posthog/ai/langchain/callbacks.py
@@ -773,24 +773,26 @@ def _parse_usage_model(
            for mapped_key, dataclass_key in field_mapping.items()
        },
    )
-    # For Anthropic providers, LangChain reports input_tokens as the sum of input and cache read tokens.
+    # For Anthropic providers, LangChain reports input_tokens as the sum of all input tokens.
    # Our cost calculation expects them to be separate for Anthropic, so we subtract cache tokens.
-    # For other providers (OpenAI, etc.), input_tokens already includes cache tokens as expected.
+    # Both cache_read and cache_write tokens should be subtracted since Anthropic's raw API
+    # reports input_tokens as tokens NOT read from or used to create a cache.
+    # For other providers (OpenAI, etc.), input_tokens already excludes cache tokens as expected.
    # Match logic consistent with plugin-server: exact match on provider OR substring match on model
    is_anthropic = False
    if provider and provider.lower() == "anthropic":
        is_anthropic = True
    elif model and "anthropic" in model.lower():
        is_anthropic = True

-    if (
-        is_anthropic
-        and normalized_usage.input_tokens
-        and normalized_usage.cache_read_tokens
-    ):
-        normalized_usage.input_tokens = max(
-            normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
+    if is_anthropic and normalized_usage.input_tokens:
+        cache_tokens = (normalized_usage.cache_read_tokens or 0) + (
+            normalized_usage.cache_write_tokens or 0
        )
+        if cache_tokens > 0:
+            normalized_usage.input_tokens = max(
+                normalized_usage.input_tokens - cache_tokens, 0
+            )
    return normalized_usage
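
The net effect of the new branch is easiest to see with concrete numbers that mirror the tests added below: 2000 reported input tokens with 800 cache-read and 500 cache-write tokens leave 700 uncached input tokens. The sketch below is a simplified stand-in (`UsageSketch` is not the SDK's `TokenUsage`), kept only to show the arithmetic:

```python
# Simplified stand-in for the SDK's TokenUsage; field names follow the diff,
# but this dataclass and helper are illustrative, not the library's real API.
from dataclasses import dataclass
from typing import Optional


@dataclass
class UsageSketch:
    input_tokens: Optional[int] = None
    cache_read_tokens: Optional[int] = None
    cache_write_tokens: Optional[int] = None


def subtract_anthropic_cache_tokens(usage: UsageSketch) -> UsageSketch:
    """Same arithmetic as the patched branch: drop cache read AND write tokens from input."""
    if usage.input_tokens:
        cache_tokens = (usage.cache_read_tokens or 0) + (usage.cache_write_tokens or 0)
        if cache_tokens > 0:
            usage.input_tokens = max(usage.input_tokens - cache_tokens, 0)
    return usage


usage = subtract_anthropic_cache_tokens(
    UsageSketch(input_tokens=2000, cache_read_tokens=800, cache_write_tokens=500)
)
assert usage.input_tokens == 700  # 2000 - 800 (read) - 500 (write)
```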


97 changes: 94 additions & 3 deletions posthog/test/ai/langchain/test_callbacks.py
@@ -1638,6 +1638,95 @@ def test_anthropic_provider_subtracts_cache_tokens(mock_client):
    assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800


def test_anthropic_provider_subtracts_cache_write_tokens(mock_client):
    """Test that Anthropic provider correctly subtracts cache write tokens from input tokens."""
    from langchain_core.outputs import LLMResult, ChatGeneration
    from langchain_core.messages import AIMessage
    from uuid import uuid4

    cb = CallbackHandler(mock_client)
    run_id = uuid4()

    # Set up with Anthropic provider
    cb._set_llm_metadata(
        serialized={},
        run_id=run_id,
        messages=[{"role": "user", "content": "test"}],
        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
    )

    # Response with cache creation: 1000 input (includes 800 being written to cache)
    response = LLMResult(
        generations=[
            [
                ChatGeneration(
                    message=AIMessage(content="Response"),
                    generation_info={
                        "usage_metadata": {
                            "input_tokens": 1000,
                            "output_tokens": 50,
                            "cache_creation_input_tokens": 800,
                        }
                    },
                )
            ]
        ],
        llm_output={},
    )

    cb._pop_run_and_capture_generation(run_id, None, response)

    generation_args = mock_client.capture.call_args_list[0][1]
    assert generation_args["properties"]["$ai_input_tokens"] == 200 # 1000 - 800
    assert generation_args["properties"]["$ai_cache_creation_input_tokens"] == 800


def test_anthropic_provider_subtracts_both_cache_read_and_write_tokens(mock_client):
    """Test that Anthropic provider correctly subtracts both cache read and write tokens."""
    from langchain_core.outputs import LLMResult, ChatGeneration
    from langchain_core.messages import AIMessage
    from uuid import uuid4

    cb = CallbackHandler(mock_client)
    run_id = uuid4()

    # Set up with Anthropic provider
    cb._set_llm_metadata(
        serialized={},
        run_id=run_id,
        messages=[{"role": "user", "content": "test"}],
        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
    )

    # Response with both cache read and creation
    response = LLMResult(
        generations=[
            [
                ChatGeneration(
                    message=AIMessage(content="Response"),
                    generation_info={
                        "usage_metadata": {
                            "input_tokens": 2000,
                            "output_tokens": 50,
                            "cache_read_input_tokens": 800,
                            "cache_creation_input_tokens": 500,
                        }
                    },
                )
            ]
        ],
        llm_output={},
    )

    cb._pop_run_and_capture_generation(run_id, None, response)

    generation_args = mock_client.capture.call_args_list[0][1]
    # 2000 - 800 (read) - 500 (write) = 700
    assert generation_args["properties"]["$ai_input_tokens"] == 700
    assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800
    assert generation_args["properties"]["$ai_cache_creation_input_tokens"] == 500


def test_openai_cache_read_tokens(mock_client):
    """Test that OpenAI cache read tokens are captured correctly."""
    prompt = ChatPromptTemplate.from_messages(
@@ -2092,10 +2181,12 @@ def test_zero_input_tokens_with_cache_read(mock_client):
    assert generation_props["$ai_cache_read_input_tokens"] == 50


-def test_cache_write_tokens_not_subtracted_from_input(mock_client):
-    """Test that cache_creation_input_tokens (cache write) do NOT affect input_tokens.
+def test_non_anthropic_cache_write_tokens_not_subtracted_from_input(mock_client):
+    """Test that cache_creation_input_tokens do NOT affect input_tokens for non-Anthropic providers.

-    Only cache_read_tokens should be subtracted from input_tokens, not cache_write_tokens.
+    When no provider metadata is set (or for non-Anthropic providers), cache tokens should
+    NOT be subtracted from input_tokens. This is because different providers report tokens
+    differently - only Anthropic's LangChain integration requires subtraction.
    """

    prompt = ChatPromptTemplate.from_messages([("user", "Create cache")])
2 changes: 1 addition & 1 deletion posthog/version.py
@@ -1,4 +1,4 @@
-VERSION = "7.4.2"
+VERSION = "7.4.3"

if __name__ == "__main__":
    print(VERSION, end="") # noqa: T201