Skip to content

Commit e6bb456

Browse files
ajcasagrandeclaude
and committed
feat(metrics): add overall API usage prompt cache-read percentage
Adds OverallUsagePromptCacheReadPercentMetric, a token-volume-weighted run-level percentage of input tokens served from prompt cache, derived from the existing TotalUsagePromptCacheReadTokensMetric and TotalUsagePromptTokensMetric. The two underlying token totals are already exported per request, so a per-record variant is intentionally omitted — averaging per-request percentages weights small and large requests equally and is misleading. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
1 parent 79de74e commit e6bb456

3 files changed

Lines changed: 106 additions & 1 deletion

File tree

docs/metrics-reference.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ This document provides a comprehensive reference of all metrics available in AIP
6565
- [Total Usage Total Tokens](#total-usage-total-tokens)
6666
- [Total Usage Reasoning Tokens](#total-usage-reasoning-tokens)
6767
- [Total Usage Prompt Cache Read Tokens](#total-usage-prompt-cache-read-tokens)
68+
- [Overall Usage Prompt Cache Read %](#overall-usage-prompt-cache-read-)
6869
- [Total Usage Prompt Cache Write Tokens](#total-usage-prompt-cache-write-tokens)
6970
- [Total Usage Prompt Cache Miss Tokens](#total-usage-prompt-cache-miss-tokens)
7071
- [Total Usage Prompt Audio Tokens](#total-usage-prompt-audio-tokens)
@@ -979,6 +980,24 @@ total_usage_prompt_cache_read_tokens = sum(r.usage_prompt_cache_read_tokens for
979980

980981
---
981982

983+
### Overall Usage Prompt Cache Read %
984+
985+
**Type:** [Derived Metric](#derived-metrics)
986+
987+
Run-aggregate share of input tokens served from prompt cache, weighted by token volume. Computed from the run totals so a request with 10k prompt tokens contributes 100x as much weight as a request with 100 prompt tokens — the resulting number reflects the actual fraction of input tokens the API served from cache across the whole benchmark.
988+
989+
**Formula:**
990+
```python
991+
overall_usage_prompt_cache_read_pct = (
992+
total_usage_prompt_cache_read_tokens / total_usage_prompt_tokens
993+
) * 100
994+
```
995+
996+
**Notes:**
997+
- No value is produced if `total_usage_prompt_tokens` is zero (e.g. all requests errored before reporting usage).
998+
999+
---
1000+
9821001
### Total Usage Prompt Cache Write Tokens
9831002

9841003
**Type:** [Derived Metric](#derived-metrics)

src/aiperf/metrics/types/usage_total_metrics.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
"""
99

1010
from aiperf.common.enums import MetricConsoleGroup
11+
from aiperf.common.enums.metric_enums import GenericMetricUnit, MetricFlags
12+
from aiperf.common.exceptions import NoMetricValue
13+
from aiperf.metrics import BaseDerivedMetric
1114
from aiperf.metrics.derived_sum_metric import DerivedSumMetric
15+
from aiperf.metrics.metric_dicts import MetricResultsDict
1216
from aiperf.metrics.types.usage_cache_metrics import (
1317
UsagePromptCacheMissTokensMetric,
1418
UsagePromptCacheReadTokensMetric,
@@ -311,3 +315,48 @@ class TotalUsagePromptAudioSecondsMetric(
311315
short_header = "Total Usage Prompt Audio Sec"
312316
console_group = MetricConsoleGroup.USAGE
313317
display_order = 2040
318+
319+
320+
class OverallUsagePromptCacheReadPercentMetric(BaseDerivedMetric[float]):
    """Overall (run-aggregate) prompt cache-read percentage across all requests.

    Token-volume-weighted: divides the summed cache-read tokens by the summed
    prompt tokens across the whole benchmark. This differs from the
    per-request `UsagePromptCacheReadPercentMetric` aggregate stats (which
    average per-request percentages, treating small and large requests
    equally) — the overall figure reflects the actual share of input tokens
    the API served from cache.

    Formula:
        Overall Usage Prompt Cache Read % =
            (Total Usage Prompt Cache Read Tokens / Total Usage Prompt Tokens) * 100
    """

    tag = "overall_usage_prompt_cache_read_pct"
    header = "Overall Usage Prompt Cache Read %"
    short_header = "Overall Cache Read %"
    short_header_hide_unit = True
    unit = GenericMetricUnit.PERCENT
    flags = MetricFlags.LARGER_IS_BETTER
    console_group = MetricConsoleGroup.USAGE
    display_order = 2012
    # Derived from the two run-level token totals exported per request.
    required_metrics = {
        TotalUsagePromptCacheReadTokensMetric.tag,
        TotalUsagePromptTokensMetric.tag,
    }

    def _derive_value(self, metric_results: MetricResultsDict) -> float:
        """Return the token-weighted cache-read percentage for the run.

        Raises:
            NoMetricValue: if the run recorded zero prompt tokens (e.g. every
                request errored before reporting usage), since the ratio is
                then undefined.
        """
        cache_read_total = metric_results.get_or_raise(
            TotalUsagePromptCacheReadTokensMetric
        )
        prompt_total = metric_results.get_or_raise(TotalUsagePromptTokensMetric)
        # Guard the division: a zero denominator means no usable usage data.
        if not prompt_total:
            raise NoMetricValue(
                "Total usage prompt tokens is zero, "
                "cannot calculate overall cache-read percentage."
            )
        return 100.0 * cache_read_total / prompt_total

tests/unit/metrics/test_usage_metrics.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from aiperf.common.models import ParsedResponse, ParsedResponseRecord, RequestRecord
99
from aiperf.common.models.record_models import TextResponseData, TokenCounts
1010
from aiperf.common.models.usage_models import Usage
11-
from aiperf.metrics.metric_dicts import MetricRecordDict
11+
from aiperf.metrics.metric_dicts import MetricRecordDict, MetricResultsDict
1212
from aiperf.metrics.types.usage_cache_metrics import (
1313
UsagePromptCacheMissTokensMetric,
1414
UsagePromptCacheReadTokensMetric,
@@ -26,13 +26,15 @@
2626
UsageRejectedPredictionTokensMetric,
2727
)
2828
from aiperf.metrics.types.usage_total_metrics import (
29+
OverallUsagePromptCacheReadPercentMetric,
2930
TotalUsageAcceptedPredictionTokensMetric,
3031
TotalUsageCompletionAudioTokensMetric,
3132
TotalUsagePromptAudioSecondsMetric,
3233
TotalUsagePromptAudioTokensMetric,
3334
TotalUsagePromptCacheMissTokensMetric,
3435
TotalUsagePromptCacheReadTokensMetric,
3536
TotalUsagePromptCacheWriteTokensMetric,
37+
TotalUsagePromptTokensMetric,
3638
TotalUsageReasoningTokensMetric,
3739
TotalUsageRejectedPredictionTokensMetric,
3840
TotalUsageToolUsePromptTokensMetric,
@@ -447,6 +449,41 @@ def test_metadata(self):
447449
)
448450

449451

452+
class TestOverallUsagePromptCacheReadPercentMetric:
    """Tests for OverallUsagePromptCacheReadPercentMetric (run-aggregate cache %)."""

    @staticmethod
    def _results(cache_read: int, prompt: int) -> MetricResultsDict:
        # Build a minimal results dict holding just the two required totals.
        results = MetricResultsDict()
        results[TotalUsagePromptCacheReadTokensMetric.tag] = cache_read
        results[TotalUsagePromptTokensMetric.tag] = prompt
        return results

    def test_basic_overall_percentage(self):
        # 250 cache-read tokens out of 1000 prompt tokens -> 25%.
        metric = OverallUsagePromptCacheReadPercentMetric()
        value = metric.derive_value(self._results(250, 1000))
        assert value == pytest.approx(25.0, rel=1e-9)

    def test_zero_total_prompt_tokens_raises(self):
        # A zero denominator must raise rather than divide by zero.
        metric = OverallUsagePromptCacheReadPercentMetric()
        with pytest.raises(NoMetricValue):
            metric.derive_value(self._results(0, 0))

    def test_metadata(self):
        cls = OverallUsagePromptCacheReadPercentMetric
        assert cls.tag == "overall_usage_prompt_cache_read_pct"
        assert cls.console_group == MetricConsoleGroup.USAGE
        assert cls.has_flags(MetricFlags.LARGER_IS_BETTER)
        assert cls.required_metrics == {
            TotalUsagePromptCacheReadTokensMetric.tag,
            TotalUsagePromptTokensMetric.tag,
        }
486+
450487
class TestUsageToolUsePromptTokensMetric:
451488
"""Tests for UsageToolUsePromptTokensMetric (Gemini-specific)."""
452489

0 commit comments

Comments
 (0)