Skip to content

Commit 5fd967c

Browse files
committed
[TRTLLM-11421][test] Add unit and integration tests for KV cache iteration stats
Unit tests:
- test_stats_serializer.py: 6 tests for kvCacheIterationStats serialization (field presence, multiple window sizes, missing stats, interval gating)
- test_collector.py: 5 tests for Prometheus metric collection (gauge updates, counter increments, multi-window labels, missing stats)
- _test_openai_metrics.py / _test_openai_prometheus.py: field additions for serving endpoint tests

Integration test (test_kv_cache_iteration_stats.py) with 8 pytest scenarios:
1. Cold start — verifies iterMissedBlocks, iterAllocTotalBlocks, iterGenAllocBlocks
2. Partial block reuse — short prompt repeated, verifies iterPartialReusedBlocks
3. Full block reuse — long prompt (3+ blocks) repeated, verifies iterFullReusedBlocks
4. Shared prefix — common prefix with different suffixes, verifies iterReusedBlocks
5. Batch generation — multiple prompts in one call, verifies pool usage
6. Long context — large prompt, verifies block allocation at scale
7. Rapid-fire — 20 requests, verifies accumulated deltas
8. Field completeness — all 18 fields present in every stats entry

Runnable via pytest or standalone. Supports --verbose-stats (pytest) or --verbose/--test N/--list (standalone) for selective execution.

Added to premerge L0 pipeline (l0_h100.yml, l0_b200.yml).

Signed-off-by: Yueh-Ting Chen <yueh.ting.chen@gmail.com>
1 parent 9b004c7 commit 5fd967c

File tree

7 files changed

+917
-0
lines changed

7 files changed

+917
-0
lines changed

tests/integration/defs/kv_cache/test_kv_cache_iteration_stats.py

Lines changed: 470 additions & 0 deletions
Large diffs are not rendered by default.

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ l0_b200:
154154
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_multi_adapter_v2
155155
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_chunked_prefill
156156
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_eviction
157+
# ------------- KV Cache Iteration Stats ---------------
158+
- unittest/executor/test_stats_serializer.py
159+
- unittest/metrics/test_collector.py
160+
- kv_cache/test_kv_cache_iteration_stats.py
157161
# ------------- Visual Gen tests ---------------
158162
- unittest/_torch/visual_gen/test_visual_gen_args.py
159163
- unittest/_torch/visual_gen/test_teacache.py

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,10 @@ l0_h100:
428428
- unittest/trt/attention/test_gpt_attention_no_cache.py
429429
- examples/test_gpt.py::test_gpt_oss_20b_lora_torch[gpt-oss-20b-lora-adapter_NIM_r8-gpt-oss-20b]
430430
- unittest/kv_cache_manager_v2_tests/ # 4 min
431+
# ------------- KV Cache Iteration Stats ---------------
432+
- unittest/executor/test_stats_serializer.py
433+
- unittest/metrics/test_collector.py
434+
- kv_cache/test_kv_cache_iteration_stats.py
431435
- condition:
432436
ranges:
433437
system_gpu_count:
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
"""Tests for _stats_serializer with kvCacheIterationStats injection."""
2+
3+
import json
4+
from types import SimpleNamespace
5+
from unittest.mock import MagicMock
6+
7+
import pytest
8+
9+
from tensorrt_llm.executor.base_worker import BaseWorker
10+
11+
12+
def _make_mock_iteration_stats(kv_cache_stats_json=None):
    """Create a mock IterationStats object with to_json_str().

    Args:
        kv_cache_stats_json: Optional JSON-serializable value stored under
            the ``"kvCacheStats"`` key of the payload. The key is omitted
            entirely when this is ``None``.

    Returns:
        A ``MagicMock`` whose ``to_json_str()`` returns the JSON-encoded
        payload.
    """
    payload = {
        "iter": 1,
        "iterLatencyMS": 10.5,
        "gpuMemUsage": 1024,
        "cpuMemUsage": 0,
        "pinnedMemUsage": 0,
    }
    if kv_cache_stats_json is not None:
        payload["kvCacheStats"] = kv_cache_stats_json

    mock = MagicMock()
    mock.to_json_str.return_value = json.dumps(payload)
    return mock
27+
28+
29+
def _make_mock_kv_iter_stats(
    window_size=16,
    primary_used=10,
    primary_max=20,
    reused=5,
    full_reused=4,
    partial_reused=1,
    missed=3,
    gen_alloc=2,
):
    """Create a mock KvCacheIterationStats nanobind object.

    Args:
        window_size: Key under which the stats object is returned.
        primary_used: Value for ``primary_used_num_blocks``.
        primary_max: Value for ``primary_max_num_blocks``; the free count is
            derived as ``primary_max - primary_used``.
        reused: Value for ``iter_reused_blocks``.
        full_reused: Value for ``iter_full_reused_blocks``.
        partial_reused: Value for ``iter_partial_reused_blocks``.
        missed: Value for ``iter_missed_blocks`` and ``iter_alloc_new_blocks``.
        gen_alloc: Value for ``iter_gen_alloc_blocks``.

    Returns:
        A one-entry dict ``{window_size: stats}`` where ``stats`` is a
        ``SimpleNamespace`` carrying the snake_case stat attributes.
    """
    # Total allocations this iteration; also the hit-rate denominator.
    total_alloc = reused + missed
    # Guard against division by zero when nothing was allocated.
    hit_rate = reused / total_alloc if total_alloc > 0 else 0.0

    stats = SimpleNamespace(
        primary_max_num_blocks=primary_max,
        primary_free_num_blocks=primary_max - primary_used,
        primary_used_num_blocks=primary_used,
        secondary_max_num_blocks=0,
        secondary_free_num_blocks=0,
        secondary_used_num_blocks=0,
        iter_alloc_total_blocks=total_alloc,
        iter_alloc_new_blocks=missed,
        iter_reused_blocks=reused,
        iter_full_reused_blocks=full_reused,
        iter_partial_reused_blocks=partial_reused,
        iter_missed_blocks=missed,
        iter_cache_hit_rate=hit_rate,
        iter_gen_alloc_blocks=gen_alloc,
        iter_onboard_blocks=1,
        iter_onboard_bytes=4096,
        iter_offload_blocks=0,
        iter_offload_bytes=0,
    )
    return {window_size: stats}
61+
62+
63+
class TestStatsSerializer:
    """Tests for ``BaseWorker._stats_serializer`` and kvCacheIterationStats."""

    def test_serializer_without_kv_iter_stats(self):
        """Legacy 2-tuple and 3-tuple with None should both omit the field."""
        iter_stats = _make_mock_iteration_stats()

        # Exercise both input shapes the docstring promises to cover.
        for stats in ((iter_stats, None), (iter_stats, None, None)):
            d = json.loads(BaseWorker._stats_serializer(stats))
            assert "iter" in d
            assert "kvCacheIterationStats" not in d

    def test_serializer_with_kv_iter_stats(self):
        """KvCacheIterationStats should appear when provided."""
        iter_stats = _make_mock_iteration_stats(
            kv_cache_stats_json={"maxNumBlocks": 20, "usedNumBlocks": 10}
        )
        kv_iter = _make_mock_kv_iter_stats(
            window_size=16,
            primary_used=10,
            primary_max=20,
            reused=5,
            full_reused=4,
            partial_reused=1,
            missed=3,
            gen_alloc=2,
        )

        result = BaseWorker._stats_serializer((iter_stats, None, kv_iter))
        d = json.loads(result)

        # Existing kvCacheStats should still be present
        assert "kvCacheStats" in d

        # New kvCacheIterationStats should be present
        assert "kvCacheIterationStats" in d
        iter_kv = d["kvCacheIterationStats"]
        assert "16" in iter_kv  # window size key as string

        ws_stats = iter_kv["16"]
        assert ws_stats["primaryMaxNumBlocks"] == 20
        assert ws_stats["primaryUsedNumBlocks"] == 10
        assert ws_stats["primaryFreeNumBlocks"] == 10
        assert ws_stats["iterReusedBlocks"] == 5
        assert ws_stats["iterFullReusedBlocks"] == 4
        assert ws_stats["iterPartialReusedBlocks"] == 1
        assert ws_stats["iterMissedBlocks"] == 3
        assert ws_stats["iterGenAllocBlocks"] == 2
        assert ws_stats["iterOnboardBlocks"] == 1
        assert ws_stats["iterOnboardBytes"] == 4096
        assert ws_stats["iterOffloadBlocks"] == 0
        assert ws_stats["iterOffloadBytes"] == 0
        # Hit rate is a float ratio (reused / (reused + missed)).
        assert ws_stats["iterCacheHitRate"] == pytest.approx(5 / 8)

    def test_serializer_multiple_window_sizes(self):
        """Multiple window sizes should all appear in output."""
        iter_stats = _make_mock_iteration_stats()
        kv_iter = _make_mock_kv_iter_stats(
            window_size=16,
            primary_used=5,
            primary_max=10,
            reused=2,
            full_reused=2,
            partial_reused=0,
            missed=1,
            gen_alloc=0,
        )
        # Add a second window size; the helper returns a one-entry dict
        # keyed by window size, so it can be merged directly.
        kv_iter.update(
            _make_mock_kv_iter_stats(
                window_size=64,
                primary_used=8,
                primary_max=16,
                reused=3,
                full_reused=1,
                partial_reused=2,
                missed=2,
                gen_alloc=1,
            )
        )

        result = BaseWorker._stats_serializer((iter_stats, None, kv_iter))
        d = json.loads(result)

        iter_kv = d["kvCacheIterationStats"]
        assert "16" in iter_kv
        assert "64" in iter_kv
        assert iter_kv["16"]["primaryMaxNumBlocks"] == 10
        assert iter_kv["64"]["primaryMaxNumBlocks"] == 16

    def test_serializer_with_request_stats(self):
        """Request stats and kv iter stats should coexist."""
        iter_stats = _make_mock_iteration_stats()
        kv_iter = _make_mock_kv_iter_stats()

        req_stat = MagicMock()
        req_stat.to_json_str.return_value = json.dumps({"id": 42})

        result = BaseWorker._stats_serializer((iter_stats, [req_stat], kv_iter))
        d = json.loads(result)

        assert "requestStats" in d
        assert len(d["requestStats"]) == 1
        assert d["requestStats"][0]["id"] == 42
        assert "kvCacheIterationStats" in d

    def test_serializer_none_on_off_interval(self):
        """When kv_iter_stats is None (off-interval), field should be absent."""
        iter_stats = _make_mock_iteration_stats()

        result = BaseWorker._stats_serializer((iter_stats, None, None))
        d = json.loads(result)
        assert "kvCacheIterationStats" not in d

    def test_serializer_legacy_2_tuple(self):
        """Legacy 2-tuple without third element should work."""
        iter_stats = _make_mock_iteration_stats()

        result = BaseWorker._stats_serializer((iter_stats, None))
        d = json.loads(result)
        assert "kvCacheIterationStats" not in d

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,19 @@ def test_metrics(client):
9898
assert "pinnedMemUsage" in response_dict
9999
assert "staticBatchingStats" in response_dict
100100
assert "timestamp" in response_dict
101+
# Per-iteration KV cache stats (keyed by window size)
102+
assert "kvCacheIterationStats" in response_dict
103+
kv_iter = response_dict["kvCacheIterationStats"]
104+
assert len(kv_iter) > 0
105+
# Check fields in the first (and likely only) window size entry
106+
ws_stats = next(iter(kv_iter.values()))
107+
assert "primaryMaxNumBlocks" in ws_stats
108+
assert "primaryUsedNumBlocks" in ws_stats
109+
assert "iterReusedBlocks" in ws_stats
110+
assert "iterFullReusedBlocks" in ws_stats
111+
assert "iterPartialReusedBlocks" in ws_stats
112+
assert "iterMissedBlocks" in ws_stats
113+
assert "iterCacheHitRate" in ws_stats
114+
assert "iterGenAllocBlocks" in ws_stats
115+
assert "iterOnboardBlocks" in ws_stats
116+
assert "iterOnboardBytes" in ws_stats

tests/unittest/llmapi/apps/_test_openai_prometheus.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,8 @@ def test_metrics_endpoint(server: RemoteOpenAIServer):
119119
f"Iteration stats metrics not found after waiting {max_wait_time}s"
120120
assert metric_prefix + "kv_cache_hit_rate" in data
121121
assert metric_prefix + "kv_cache_utilization" in data
122+
123+
# Per-iteration KV cache metrics
124+
assert metric_prefix + "kv_cache_iter_reuse_rate" in data
125+
assert metric_prefix + "kv_cache_missed_blocks_total" in data
126+
assert metric_prefix + "kv_cache_gen_alloc_blocks_total" in data

0 commit comments

Comments
 (0)