
Commit 69293f5

skoob13, Radu-Raicea, and andrewm4894 authored
fix(llma): cache cost calculation in the LangChain callback (#346)
* fix(llma): cache cost calculation in the LangChain callback

* fix: format

* Update posthog/ai/langchain/callbacks.py

  Co-authored-by: Radu Raicea <[email protected]>

* Bump version to 6.7.13

  Master has already released 6.7.12 with other fixes, so this PR will be 6.7.13

---------

Co-authored-by: Radu Raicea <[email protected]>
Co-authored-by: Andrew Maguire <[email protected]>
1 parent 46589f9 commit 69293f5

File tree: 4 files changed (224 additions, 7 deletions)


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+# 6.7.13 - 2025-11-02
+
+- fix(llma): cache cost calculation in the LangChain callback
+
 # 6.7.12 - 2025-11-02

 - fix(django): Restore process_exception method to capture view and downstream middleware exceptions (fixes #329)

posthog/ai/langchain/callbacks.py

Lines changed: 8 additions & 1 deletion
@@ -758,12 +758,19 @@ def _parse_usage_model(
         "cache_read": "cache_read_tokens",
         "reasoning": "reasoning_tokens",
     }
-    return ModelUsage(
+    normalized_usage = ModelUsage(
         **{
             dataclass_key: parsed_usage.get(mapped_key) or 0
             for mapped_key, dataclass_key in field_mapping.items()
         },
     )
+    # In LangChain, input_tokens is the sum of input and cache read tokens.
+    # Our cost calculation expects them to be separate, for Anthropic.
+    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+        normalized_usage.input_tokens = max(
+            normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
+        )
+    return normalized_usage


 def _parse_usage(response: LLMResult) -> ModelUsage:
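For a quick sanity check of the arithmetic this hunk introduces, here is a minimal standalone sketch. The Usage dataclass and normalize function below are simplified stand-ins for illustration only, not the SDK's actual ModelUsage or _parse_usage_model; the numbers match the updated Anthropic test below (1200 reported input tokens, 800 of them cache reads):

from dataclasses import dataclass


@dataclass
class Usage:
    # Simplified stand-in for the SDK's ModelUsage dataclass (illustration only).
    input_tokens: int = 0
    cache_read_tokens: int = 0


def normalize(usage: Usage) -> Usage:
    # LangChain reports input_tokens as regular input + cache reads; split them
    # apart so cache-read tokens are not also priced at the full input rate.
    if usage.input_tokens and usage.cache_read_tokens:
        usage.input_tokens = max(usage.input_tokens - usage.cache_read_tokens, 0)
    return usage


if __name__ == "__main__":
    u = normalize(Usage(input_tokens=1200, cache_read_tokens=800))
    print(u.input_tokens)  # 400, matching the updated assertion in the tests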

posthog/test/ai/langchain/test_callbacks.py

Lines changed: 211 additions & 5 deletions
@@ -1565,9 +1565,9 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
         AIMessage(
             content="Using cached analysis to provide quick response.",
             usage_metadata={
-                "input_tokens": 200,
+                "input_tokens": 1200,
                 "output_tokens": 30,
-                "total_tokens": 1030,
+                "total_tokens": 1230,
                 "cache_read_input_tokens": 800,  # Anthropic cache read
             },
         )
@@ -1584,7 +1584,7 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
     generation_props = generation_args["properties"]

     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 200
+    assert generation_props["$ai_input_tokens"] == 400
     assert generation_props["$ai_output_tokens"] == 30
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
     assert generation_props["$ai_cache_read_input_tokens"] == 800
@@ -1626,7 +1626,7 @@ def test_openai_cache_read_tokens(mock_client):
     generation_props = generation_args["properties"]

     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 150
+    assert generation_props["$ai_input_tokens"] == 50
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1708,7 +1708,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
     generation_props = generation_args["properties"]

     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 500
+    assert generation_props["$ai_input_tokens"] == 200
     assert generation_props["$ai_output_tokens"] == 100
     assert generation_props["$ai_cache_read_input_tokens"] == 300
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1879,6 +1879,212 @@ def test_tool_definition(mock_client):
     assert props["$ai_tools"] == tools


+def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
+    """Test that cache_read_tokens are properly subtracted from input_tokens.
+
+    This tests the logic in callbacks.py lines 757-758:
+    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+        normalized_usage.input_tokens = max(normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0)
+    """
+    prompt = ChatPromptTemplate.from_messages(
+        [("user", "Use the cached prompt for this request")]
+    )
+
+    # Scenario 1: input_tokens includes cache_read_tokens (typical case)
+    # input_tokens=150 includes 100 cache_read tokens, so actual input is 50
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response using cached prompt context.",
+                usage_metadata={
+                    "input_tokens": 150,  # Total includes cache reads
+                    "output_tokens": 40,
+                    "total_tokens": 190,
+                    "cache_read_input_tokens": 100,  # 100 tokens read from cache
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response using cached prompt context."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should be reduced: 150 - 100 = 50
+    assert generation_props["$ai_input_tokens"] == 50
+    assert generation_props["$ai_output_tokens"] == 40
+    assert generation_props["$ai_cache_read_input_tokens"] == 100
+
+
+def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
+    """Test that cache_read_tokens subtraction doesn't result in negative input_tokens.
+
+    This tests the max(..., 0) part of the logic in callbacks.py lines 757-758.
+    """
+    prompt = ChatPromptTemplate.from_messages(
+        [("user", "Edge case with large cache read")]
+    )
+
+    # Edge case: cache_read_tokens >= input_tokens
+    # This could happen in some API responses where accounting differs
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response with edge case token counts.",
+                usage_metadata={
+                    "input_tokens": 80,
+                    "output_tokens": 20,
+                    "total_tokens": 100,
+                    "cache_read_input_tokens": 100,  # More than input_tokens
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response with edge case token counts."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should be 0, not negative: max(80 - 100, 0) = 0
+    assert generation_props["$ai_input_tokens"] == 0
+    assert generation_props["$ai_output_tokens"] == 20
+    assert generation_props["$ai_cache_read_input_tokens"] == 100
+
+
+def test_no_cache_read_tokens_no_subtraction(mock_client):
+    """Test that when there are no cache_read_tokens, input_tokens remain unchanged.
+
+    This tests the conditional check before the subtraction in callbacks.py line 757.
+    """
+    prompt = ChatPromptTemplate.from_messages(
+        [("user", "Normal request without cache")]
+    )
+
+    # No cache usage - input_tokens should remain as-is
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response without cache.",
+                usage_metadata={
+                    "input_tokens": 100,
+                    "output_tokens": 30,
+                    "total_tokens": 130,
+                    # No cache_read_input_tokens
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response without cache."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should remain unchanged at 100
+    assert generation_props["$ai_input_tokens"] == 100
+    assert generation_props["$ai_output_tokens"] == 30
+    assert generation_props["$ai_cache_read_input_tokens"] == 0
+
+
+def test_zero_input_tokens_with_cache_read(mock_client):
+    """Test edge case where input_tokens is 0 but cache_read_tokens exist.
+
+    This tests the falsy check in the conditional (line 757).
+    """
+    prompt = ChatPromptTemplate.from_messages([("user", "Edge case query")])
+
+    # Edge case: input_tokens is 0 (falsy), should skip subtraction
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response.",
+                usage_metadata={
+                    "input_tokens": 0,
+                    "output_tokens": 10,
+                    "total_tokens": 10,
+                    "cache_read_input_tokens": 50,
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should remain 0 (no subtraction because input_tokens is falsy)
+    assert generation_props["$ai_input_tokens"] == 0
+    assert generation_props["$ai_output_tokens"] == 10
+    assert generation_props["$ai_cache_read_input_tokens"] == 50
+
+
+def test_cache_write_tokens_not_subtracted_from_input(mock_client):
+    """Test that cache_creation_input_tokens (cache write) do NOT affect input_tokens.
+
+    Only cache_read_tokens should be subtracted from input_tokens, not cache_write_tokens.
+    """
+    prompt = ChatPromptTemplate.from_messages([("user", "Create cache")])
+
+    # Cache creation without cache read
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Creating cache.",
+                usage_metadata={
+                    "input_tokens": 1000,
+                    "output_tokens": 20,
+                    "total_tokens": 1020,
+                    "cache_creation_input_tokens": 800,  # Cache write, not read
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Creating cache."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should NOT be reduced by cache_creation_input_tokens
+    assert generation_props["$ai_input_tokens"] == 1000
+    assert generation_props["$ai_output_tokens"] == 20
+    assert generation_props["$ai_cache_creation_input_tokens"] == 800
+    assert generation_props["$ai_cache_read_input_tokens"] == 0
+
+
 def test_agent_action_and_finish_imports():
     """
     Regression test for LangChain 1.0+ compatibility (Issue #362).
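To run just the new cache-token tests locally, pytest's standard keyword filter should work from the repo root (assuming the dev dependencies are installed):

pytest posthog/test/ai/langchain/test_callbacks.py -k cache -q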

posthog/version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-VERSION = "6.7.12"
+VERSION = "6.7.13"

 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201
