feat: expose cache metrics #392

Open. Wants to merge 15 commits into base: main. Changes shown from all commits.
2 changes: 1 addition & 1 deletion src/strands/event_loop/streaming.py
@@ -275,7 +275,7 @@ async def process_stream(
}
state["content"] = state["message"]["content"]

usage: Usage = Usage(inputTokens=0, outputTokens=0, totalTokens=0)
usage: Usage = Usage(inputTokens=0, outputTokens=0, totalTokens=0, cacheReadInputTokens=0, cacheWriteInputTokens=0)
metrics: Metrics = Metrics(latencyMs=0)

async for chunk in chunks:
2 changes: 2 additions & 0 deletions src/strands/models/anthropic.py
@@ -331,6 +331,8 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
"inputTokens": usage["input_tokens"],
"outputTokens": usage["output_tokens"],
"totalTokens": usage["input_tokens"] + usage["output_tokens"],
"cacheReadInputTokens": usage.get("cache_read_input_tokens", 0),
"cacheWriteInputTokens": usage.get("cache_creation_input_tokens", 0),
},
"metrics": {
"latencyMs": 0, # TODO
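For reference, Anthropic reports prompt-cache activity through cache_read_input_tokens and cache_creation_input_tokens on its usage payload, and the lines above fold those into the SDK's cacheReadInputTokens and cacheWriteInputTokens keys. A minimal standalone sketch of that mapping with a made-up payload (illustrative, not part of the PR):

def normalize_anthropic_usage(usage: dict) -> dict:
    # Cache fields are absent when prompt caching is not in play, so default them to 0.
    return {
        "inputTokens": usage["input_tokens"],
        "outputTokens": usage["output_tokens"],
        "totalTokens": usage["input_tokens"] + usage["output_tokens"],
        "cacheReadInputTokens": usage.get("cache_read_input_tokens", 0),
        "cacheWriteInputTokens": usage.get("cache_creation_input_tokens", 0),
    }

# Example payload shaped like the unit test further down: 4 tokens read from cache, 5 written.
print(normalize_anthropic_usage(
    {"input_tokens": 1, "output_tokens": 2, "cache_read_input_tokens": 4, "cache_creation_input_tokens": 5}
))
# {'inputTokens': 1, 'outputTokens': 2, 'totalTokens': 3, 'cacheReadInputTokens': 4, 'cacheWriteInputTokens': 5}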
10 changes: 9 additions & 1 deletion src/strands/models/litellm.py
@@ -177,7 +177,15 @@ async def stream(
async for event in response:
_ = event

yield self.format_chunk({"chunk_type": "metadata", "data": event.usage})
usage = event.usage
cache_read = max(
getattr(usage, "cache_read_input_tokens", 0),
getattr(getattr(usage, "prompt_tokens_details", {}), "cached_tokens", 0),
)

usage.prompt_tokens_details.cached_tokens = cache_read

yield self.format_chunk({"chunk_type": "metadata", "data": usage})

logger.debug("finished streaming response from model")

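Because LiteLLM fronts several providers, cached prompt tokens may arrive either as an Anthropic-style cache_read_input_tokens attribute or nested under an OpenAI-style prompt_tokens_details.cached_tokens, and the max() above takes whichever is populated. A hedged sketch of that fallback with stand-in objects (illustrative only, not the PR's code):

from types import SimpleNamespace

def extract_cache_read(usage) -> int:
    # Anthropic-style field, if present.
    anthropic_style = getattr(usage, "cache_read_input_tokens", 0) or 0
    # OpenAI-style nested field, if present.
    details = getattr(usage, "prompt_tokens_details", None)
    openai_style = (getattr(details, "cached_tokens", 0) or 0) if details is not None else 0
    return max(anthropic_style, openai_style)

# Stand-in usage objects for illustration.
assert extract_cache_read(SimpleNamespace(prompt_tokens_details=SimpleNamespace(cached_tokens=128))) == 128
assert extract_cache_read(SimpleNamespace(cache_read_input_tokens=256)) == 256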
3 changes: 3 additions & 0 deletions src/strands/models/llamaapi.py
@@ -308,6 +308,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
inputTokens=usage["inputTokens"],
outputTokens=usage["outputTokens"],
totalTokens=usage["totalTokens"],
# TODO does not seem to support caching as of July 2025
cacheWriteInputTokens=0,
cacheReadInputTokens=0,
)
return {
"metadata": {
3 changes: 3 additions & 0 deletions src/strands/models/mistral.py
@@ -338,6 +338,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
"inputTokens": usage.prompt_tokens,
"outputTokens": usage.completion_tokens,
"totalTokens": usage.total_tokens,
# TODO does not seem to support caching as of July 2025
"cacheWriteInputTokens": 0,
"cacheReadInputTokens": 0,
},
"metrics": {
"latencyMs": event.get("latency_ms", 0),
3 changes: 3 additions & 0 deletions src/strands/models/ollama.py
@@ -268,6 +268,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
"inputTokens": event["data"].eval_count,
"outputTokens": event["data"].prompt_eval_count,
"totalTokens": event["data"].eval_count + event["data"].prompt_eval_count,
# TODO add cache metrics
"cacheWriteInputTokens": 0,
"cacheReadInputTokens": 0,
},
"metrics": {
"latencyMs": event["data"].total_duration / 1e6,
2 changes: 2 additions & 0 deletions src/strands/models/openai.py
@@ -310,6 +310,8 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
"inputTokens": event["data"].prompt_tokens,
"outputTokens": event["data"].completion_tokens,
"totalTokens": event["data"].total_tokens,
"cacheReadInputTokens": event["data"].prompt_tokens_details.cached_tokens,
"cacheWriteInputTokens": 0, # OpenAI does not return cache write information
},
"metrics": {
"latencyMs": 0, # TODO
25 changes: 21 additions & 4 deletions src/strands/telemetry/metrics.py
@@ -168,7 +168,11 @@ class EventLoopMetrics:
tool_metrics: Dict[str, ToolMetrics] = field(default_factory=dict)
cycle_durations: List[float] = field(default_factory=list)
traces: List[Trace] = field(default_factory=list)
accumulated_usage: Usage = field(default_factory=lambda: Usage(inputTokens=0, outputTokens=0, totalTokens=0))
accumulated_usage: Usage = field(
default_factory=lambda: Usage(
inputTokens=0, outputTokens=0, totalTokens=0, cacheReadInputTokens=0, cacheWriteInputTokens=0
)
)
accumulated_metrics: Metrics = field(default_factory=lambda: Metrics(latencyMs=0))

@property
@@ -263,6 +267,8 @@ def update_usage(self, usage: Usage) -> None:
self.accumulated_usage["inputTokens"] += usage["inputTokens"]
self.accumulated_usage["outputTokens"] += usage["outputTokens"]
self.accumulated_usage["totalTokens"] += usage["totalTokens"]
self.accumulated_usage["cacheReadInputTokens"] += usage.get("cacheReadInputTokens", 0)
self.accumulated_usage["cacheWriteInputTokens"] += usage.get("cacheWriteInputTokens", 0)

def update_metrics(self, metrics: Metrics) -> None:
"""Update the accumulated performance metrics with new metrics data.
@@ -320,15 +326,18 @@ def _metrics_summary_to_lines(event_loop_metrics: EventLoopMetrics, allowed_name
An iterable of formatted text lines representing the metrics.
"""
summary = event_loop_metrics.get_summary()
accumulated_usage = summary["accumulated_usage"]
yield "Event Loop Metrics Summary:"
yield (
f"├─ Cycles: total={summary['total_cycles']}, avg_time={summary['average_cycle_time']:.3f}s, "
f"total_time={summary['total_duration']:.3f}s"
)
yield (
f"├─ Tokens: in={summary['accumulated_usage']['inputTokens']}, "
f"out={summary['accumulated_usage']['outputTokens']}, "
f"total={summary['accumulated_usage']['totalTokens']}"
f"├─ Tokens: in={accumulated_usage['inputTokens']}"
f" (cache_write={accumulated_usage.get('cacheWriteInputTokens', 0)}), "
f"out={accumulated_usage['outputTokens']}, "
f"total={accumulated_usage['totalTokens']}"
f" (cache_read={accumulated_usage.get('cacheReadInputTokens', 0)})"
)
yield f"├─ Bedrock Latency: {summary['accumulated_metrics']['latencyMs']}ms"

@@ -421,6 +430,8 @@ class MetricsClient:
event_loop_latency: Histogram
event_loop_input_tokens: Histogram
event_loop_output_tokens: Histogram
event_loop_input_tokens_cache_read: Histogram
event_loop_input_tokens_cache_write: Histogram

tool_call_count: Counter
tool_success_count: Counter
@@ -474,3 +485,9 @@ def create_instruments(self) -> None:
self.event_loop_output_tokens = self.meter.create_histogram(
name=constants.STRANDS_EVENT_LOOP_OUTPUT_TOKENS, unit="token"
)
self.event_loop_input_tokens_cache_read = self.meter.create_histogram(
name=constants.STRANDS_EVENT_LOOP_INPUT_TOKEN_CACHE_READ, unit="token"
)
self.event_loop_input_tokens_cache_write = self.meter.create_histogram(
name=constants.STRANDS_EVENT_LOOP_INPUT_TOKENS_CACHE_WRITE, unit="token"
)
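To see how the accumulator and the new summary line fit together, here is a small hedged sketch that mimics update_usage on a plain dict and prints the same Tokens line; the numbers and the helper are illustrative, not part of the PR:

accumulated = {
    "inputTokens": 0,
    "outputTokens": 0,
    "totalTokens": 0,
    "cacheReadInputTokens": 0,
    "cacheWriteInputTokens": 0,
}

def update_usage(usage: dict) -> None:
    # Mirrors EventLoopMetrics.update_usage: the cache keys are optional, so default to 0.
    accumulated["inputTokens"] += usage["inputTokens"]
    accumulated["outputTokens"] += usage["outputTokens"]
    accumulated["totalTokens"] += usage["totalTokens"]
    accumulated["cacheReadInputTokens"] += usage.get("cacheReadInputTokens", 0)
    accumulated["cacheWriteInputTokens"] += usage.get("cacheWriteInputTokens", 0)

update_usage({"inputTokens": 100, "outputTokens": 20, "totalTokens": 120, "cacheReadInputTokens": 80})
update_usage({"inputTokens": 50, "outputTokens": 10, "totalTokens": 60, "cacheWriteInputTokens": 40})

print(
    f"├─ Tokens: in={accumulated['inputTokens']}"
    f" (cache_write={accumulated.get('cacheWriteInputTokens', 0)}), "
    f"out={accumulated['outputTokens']}, "
    f"total={accumulated['totalTokens']}"
    f" (cache_read={accumulated.get('cacheReadInputTokens', 0)})"
)
# ├─ Tokens: in=150 (cache_write=40), out=30, total=180 (cache_read=80)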
2 changes: 2 additions & 0 deletions src/strands/telemetry/metrics_constants.py
@@ -13,3 +13,5 @@
STRANDS_EVENT_LOOP_CYCLE_DURATION = "strands.event_loop.cycle_duration"
STRANDS_EVENT_LOOP_INPUT_TOKENS = "strands.event_loop.input.tokens"
STRANDS_EVENT_LOOP_OUTPUT_TOKENS = "strands.event_loop.output.tokens"
STRANDS_EVENT_LOOP_INPUT_TOKEN_CACHE_READ = "strands.event_loop.input.tokens.cache.read"
STRANDS_EVENT_LOOP_INPUT_TOKENS_CACHE_WRITE = "strands.event_loop.input.tokens.cache.write"
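For orientation, these constant values become the names of the two OpenTelemetry histograms created in MetricsClient.create_instruments above. A hedged sketch of creating and recording such instruments with the OpenTelemetry API directly; the meter name and the recorded numbers are illustrative, not taken from the PR:

from opentelemetry import metrics

# The meter name is an illustrative assumption; the instrument names are the new constants above.
meter = metrics.get_meter("strands.telemetry")
cache_read_hist = meter.create_histogram(name="strands.event_loop.input.tokens.cache.read", unit="token")
cache_write_hist = meter.create_histogram(name="strands.event_loop.input.tokens.cache.write", unit="token")

# Record one model invocation's cache activity (made-up values).
cache_read_hist.record(80)
cache_write_hist.record(40)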
4 changes: 4 additions & 0 deletions src/strands/telemetry/tracer.py
@@ -259,6 +259,8 @@ def end_model_invoke_span(
attributes: Dict[str, AttributeValue] = {
"gen_ai.usage.prompt_tokens": usage["inputTokens"],
"gen_ai.usage.input_tokens": usage["inputTokens"],
"gen_ai.usage.cache_read_input_tokens": usage.get("cacheReadInputTokens", 0),
"gen_ai.usage.cache_write_input_tokens": usage.get("cacheWriteInputTokens", 0),
"gen_ai.usage.completion_tokens": usage["outputTokens"],
"gen_ai.usage.output_tokens": usage["outputTokens"],
"gen_ai.usage.total_tokens": usage["totalTokens"],
@@ -492,6 +494,8 @@ def end_agent_span(
"gen_ai.usage.input_tokens": accumulated_usage["inputTokens"],
"gen_ai.usage.output_tokens": accumulated_usage["outputTokens"],
"gen_ai.usage.total_tokens": accumulated_usage["totalTokens"],
"gen_ai.usage.cache_read_input_tokens": accumulated_usage.get("cacheReadInputTokens", 0),
"gen_ai.usage.cache_write_input_tokens": accumulated_usage.get("cacheWriteInputTokens", 0),
}
)

6 changes: 5 additions & 1 deletion src/strands/types/event_loop.py
@@ -5,18 +5,22 @@
from typing_extensions import TypedDict


class Usage(TypedDict):
class Usage(TypedDict, total=False):
"""Token usage information for model interactions.

Attributes:
inputTokens: Number of tokens sent in the request to the model.
outputTokens: Number of tokens that the model generated for the request.
totalTokens: Total number of tokens (input + output).
cacheReadInputTokens: Number of tokens read from cache.
cacheWriteInputTokens: Number of tokens written to cache.
"""

inputTokens: int
outputTokens: int
totalTokens: int
cacheReadInputTokens: int
cacheWriteInputTokens: int


class Metrics(TypedDict):
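Note that switching Usage to total=False makes every key optional for type checkers, which is why the rest of the PR reads the cache fields with usage.get(..., 0) rather than indexing. A small standalone sketch, separate from the PR's code:

from typing_extensions import TypedDict

class Usage(TypedDict, total=False):
    inputTokens: int
    outputTokens: int
    totalTokens: int
    cacheReadInputTokens: int
    cacheWriteInputTokens: int

# A provider that reports no cache activity still yields a valid Usage.
usage: Usage = {"inputTokens": 10, "outputTokens": 5, "totalTokens": 15}

# Readers tolerate the missing keys by defaulting to 0.
cache_read = usage.get("cacheReadInputTokens", 0)   # 0
cache_write = usage.get("cacheWriteInputTokens", 0)  # 0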
60 changes: 53 additions & 7 deletions tests/strands/event_loop/test_streaming.py
@@ -250,7 +250,13 @@ def test_handle_message_stop():

def test_extract_usage_metrics():
event = {
"usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0},
"usage": {
"inputTokens": 0,
"outputTokens": 0,
"totalTokens": 0,
"cacheReadInputTokens": 0,
"cacheWriteInputTokens": 0,
},
"metrics": {"latencyMs": 0},
}

@@ -279,7 +285,13 @@ def test_extract_usage_metrics():
},
{
"metadata": {
"usage": {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1},
"usage": {
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
"metrics": {"latencyMs": 1},
}
},
@@ -364,6 +376,8 @@ def test_extract_usage_metrics():
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
},
},
@@ -376,7 +390,13 @@ def test_extract_usage_metrics():
"role": "assistant",
"content": [{"toolUse": {"toolUseId": "123", "name": "test", "input": {"key": "value"}}}],
},
{"inputTokens": 1, "outputTokens": 1, "totalTokens": 1},
{
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
{"latencyMs": 1},
)
},
@@ -398,7 +418,13 @@ def test_extract_usage_metrics():
"role": "assistant",
"content": [],
},
{"inputTokens": 0, "outputTokens": 0, "totalTokens": 0},
{
"inputTokens": 0,
"outputTokens": 0,
"totalTokens": 0,
"cacheReadInputTokens": 0,
"cacheWriteInputTokens": 0,
},
{"latencyMs": 0},
),
},
@@ -426,7 +452,13 @@ def test_extract_usage_metrics():
},
{
"metadata": {
"usage": {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1},
"usage": {
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
"metrics": {"latencyMs": 1},
}
},
@@ -506,6 +538,8 @@ def test_extract_usage_metrics():
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
},
},
@@ -518,7 +552,13 @@ def test_extract_usage_metrics():
"role": "assistant",
"content": [{"text": "REDACTED."}],
},
{"inputTokens": 1, "outputTokens": 1, "totalTokens": 1},
{
"inputTokens": 1,
"outputTokens": 1,
"totalTokens": 1,
"cacheReadInputTokens": 1,
"cacheWriteInputTokens": 1,
},
{"latencyMs": 1},
),
},
@@ -584,7 +624,13 @@ async def test_stream_messages(agenerator, alist):
"stop": (
"end_turn",
{"role": "assistant", "content": [{"text": "test"}]},
{"inputTokens": 0, "outputTokens": 0, "totalTokens": 0},
{
"inputTokens": 0,
"outputTokens": 0,
"totalTokens": 0,
"cacheReadInputTokens": 0,
"cacheWriteInputTokens": 0,
},
{"latencyMs": 0},
)
},
22 changes: 20 additions & 2 deletions tests/strands/models/test_anthropic.py
@@ -597,7 +597,12 @@ def test_format_chunk_message_stop(model):
def test_format_chunk_metadata(model):
event = {
"type": "metadata",
"usage": {"input_tokens": 1, "output_tokens": 2},
"usage": {
"input_tokens": 1,
"output_tokens": 2,
"cache_read_input_tokens": 4,
"cache_creation_input_tokens": 5,
},
}

tru_chunk = model.format_chunk(event)
@@ -607,6 +612,8 @@ def test_format_chunk_metadata(model):
"inputTokens": 1,
"outputTokens": 2,
"totalTokens": 3,
"cacheReadInputTokens": 4,
"cacheWriteInputTokens": 5,
},
"metrics": {
"latencyMs": 0,
@@ -656,7 +663,18 @@ async def test_stream(anthropic_client, model, agenerator, alist):
tru_events = await alist(response)
exp_events = [
{"messageStart": {"role": "assistant"}},
{"metadata": {"usage": {"inputTokens": 1, "outputTokens": 2, "totalTokens": 3}, "metrics": {"latencyMs": 0}}},
{
"metadata": {
"usage": {
"inputTokens": 1,
"outputTokens": 2,
"totalTokens": 3,
"cacheReadInputTokens": 0,
"cacheWriteInputTokens": 0,
},
"metrics": {"latencyMs": 0},
}
},
]

assert tru_events == exp_events