Skip to content

Commit cdb8904

Browse files
committed
[TRTLLM-11421][test] Add unit and integration tests for KV cache iteration stats
Unit tests:
- test_stats_serializer.py: 6 tests for kvCacheIterationStats serialization (field presence, multiple window sizes, missing stats, interval gating)
- test_collector.py: 5 tests for Prometheus metric collection (gauge updates, counter increments, multi-window labels, missing stats)
- _test_openai_metrics.py / _test_openai_prometheus.py: field additions for serving endpoint tests

Integration test (test_kv_cache_iteration_stats.py) with 8 pytest scenarios:
1. Cold start — verifies iterMissedBlocks, iterAllocTotalBlocks, iterGenAllocBlocks
2. Partial block reuse — short prompt repeated, verifies iterPartialReusedBlocks
3. Full block reuse — long prompt (3+ blocks) repeated, verifies iterFullReusedBlocks
4. Shared prefix — common prefix with different suffixes, verifies iterReusedBlocks
5. Batch generation — multiple prompts in one call, verifies pool usage
6. Long context — large prompt, verifies block allocation at scale
7. Rapid-fire — 20 requests, verifies accumulated deltas
8. Field completeness — all 18 fields present in every stats entry

Runnable via pytest or standalone. Supports --verbose-stats (pytest) or --verbose/--test N/--list (standalone) for selective execution.

Added to premerge L0 pipeline (l0_h100.yml, l0_b200.yml).

Signed-off-by: Yueh-Ting Chen <yueh.ting.chen@gmail.com>
1 parent 5b43d35 commit cdb8904

File tree

7 files changed

+956
-0
lines changed

7 files changed

+956
-0
lines changed

tests/integration/defs/kv_cache/test_kv_cache_iteration_stats.py

Lines changed: 495 additions & 0 deletions
Large diffs are not rendered by default.

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ l0_b200:
154154
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_multi_adapter_v2
155155
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_chunked_prefill
156156
- kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_eviction
157+
# ------------- KV Cache Iteration Stats ---------------
158+
- unittest/executor/test_stats_serializer.py
159+
- unittest/metrics/test_collector.py
160+
- kv_cache/test_kv_cache_iteration_stats.py
157161
# ------------- Visual Gen tests ---------------
158162
- unittest/_torch/visual_gen/test_visual_gen_args.py
159163
- unittest/_torch/visual_gen/test_teacache.py

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,10 @@ l0_h100:
428428
- unittest/trt/attention/test_gpt_attention_no_cache.py
429429
- examples/test_gpt.py::test_gpt_oss_20b_lora_torch[gpt-oss-20b-lora-adapter_NIM_r8-gpt-oss-20b]
430430
- unittest/kv_cache_manager_v2_tests/ # 4 min
431+
# ------------- KV Cache Iteration Stats ---------------
432+
- unittest/executor/test_stats_serializer.py
433+
- unittest/metrics/test_collector.py
434+
- kv_cache/test_kv_cache_iteration_stats.py
431435
- condition:
432436
ranges:
433437
system_gpu_count:

tests/unittest/executor/test_stats_serializer.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Tests for _stats_serializer with kvCacheIterationStats injection."""
16+
17+
import json
18+
from types import SimpleNamespace
19+
from unittest.mock import MagicMock
20+
21+
import pytest
22+
23+
from tensorrt_llm.executor.base_worker import BaseWorker
24+
25+
26+
def _make_mock_iteration_stats(kv_cache_stats_json=None):
    """Build a stand-in for an IterationStats object.

    The returned ``MagicMock`` answers ``to_json_str()`` with a JSON document
    holding a fixed set of iteration fields. A ``kvCacheStats`` entry is
    added only when ``kv_cache_stats_json`` is not ``None`` (an empty dict is
    therefore still included).
    """
    payload = {
        "iter": 1,
        "iterLatencyMS": 10.5,
        "gpuMemUsage": 1024,
        "cpuMemUsage": 0,
        "pinnedMemUsage": 0,
    }
    if kv_cache_stats_json is not None:
        payload["kvCacheStats"] = kv_cache_stats_json

    # Dotted keyword configures the return value of the child mock directly.
    return MagicMock(**{"to_json_str.return_value": json.dumps(payload)})
41+
42+
43+
def _make_mock_kv_iter_stats(
    window_size=16,
    primary_used=10,
    primary_max=20,
    reused=5,
    full_reused=4,
    partial_reused=1,
    missed=3,
    gen_alloc=2,
):
    """Build a fake per-window KvCacheIterationStats mapping.

    Returns a single-entry dict ``{window_size: stats}`` where ``stats`` is a
    ``SimpleNamespace`` carrying the snake_case attributes set below.
    Secondary-pool and offload counters are fixed at zero, onboard counters
    at 1 block / 4096 bytes; the remaining values derive from the arguments.
    """
    total_alloc = reused + missed
    # Guard against division by zero when nothing was allocated.
    if total_alloc > 0:
        hit_rate = reused / total_alloc
    else:
        hit_rate = 0.0

    fields = {
        "primary_max_num_blocks": primary_max,
        "primary_free_num_blocks": primary_max - primary_used,
        "primary_used_num_blocks": primary_used,
        "secondary_max_num_blocks": 0,
        "secondary_free_num_blocks": 0,
        "secondary_used_num_blocks": 0,
        "iter_alloc_total_blocks": total_alloc,
        "iter_alloc_new_blocks": missed,
        "iter_reused_blocks": reused,
        "iter_full_reused_blocks": full_reused,
        "iter_partial_reused_blocks": partial_reused,
        "iter_missed_blocks": missed,
        "iter_cache_hit_rate": hit_rate,
        "iter_gen_alloc_blocks": gen_alloc,
        "iter_onboard_blocks": 1,
        "iter_onboard_bytes": 4096,
        "iter_offload_blocks": 0,
        "iter_offload_bytes": 0,
    }
    return {window_size: SimpleNamespace(**fields)}
75+
76+
77+
class TestStatsSerializer:
    """Tests for ``BaseWorker._stats_serializer`` and its optional KV iteration stats.

    The serializer is called with a tuple of ``(iteration_stats, request_stats)``
    or ``(iteration_stats, request_stats, kv_iter_stats)``. These tests check
    that ``kvCacheIterationStats`` appears in the JSON output only when the
    third element is provided and not ``None``.
    """

    def test_serializer_without_kv_iter_stats(self):
        """Legacy 2-tuple and 3-tuple with None should produce same output."""
        iter_stats = _make_mock_iteration_stats()

        legacy = json.loads(BaseWorker._stats_serializer((iter_stats, None)))
        with_none = json.loads(BaseWorker._stats_serializer((iter_stats, None, None)))

        assert "iter" in with_none
        assert "kvCacheIterationStats" not in with_none
        # Verify the equivalence the docstring promises, not just field absence.
        assert legacy == with_none

    def test_serializer_with_kv_iter_stats(self):
        """KvCacheIterationStats should appear when provided."""
        iter_stats = _make_mock_iteration_stats(
            kv_cache_stats_json={"maxNumBlocks": 20, "usedNumBlocks": 10}
        )
        kv_iter = _make_mock_kv_iter_stats(
            window_size=16,
            primary_used=10,
            primary_max=20,
            reused=5,
            full_reused=4,
            partial_reused=1,
            missed=3,
            gen_alloc=2,
        )

        result = BaseWorker._stats_serializer((iter_stats, None, kv_iter))
        d = json.loads(result)

        # Existing kvCacheStats should still be present
        assert "kvCacheStats" in d

        # New kvCacheIterationStats should be present
        assert "kvCacheIterationStats" in d
        iter_kv = d["kvCacheIterationStats"]
        assert "16" in iter_kv  # JSON object keys are strings, so the int window size is "16"

        ws_stats = iter_kv["16"]
        assert ws_stats["primaryMaxNumBlocks"] == 20
        assert ws_stats["primaryUsedNumBlocks"] == 10
        assert ws_stats["primaryFreeNumBlocks"] == 10
        assert ws_stats["iterReusedBlocks"] == 5
        assert ws_stats["iterFullReusedBlocks"] == 4
        assert ws_stats["iterPartialReusedBlocks"] == 1
        assert ws_stats["iterMissedBlocks"] == 3
        assert ws_stats["iterGenAllocBlocks"] == 2
        assert ws_stats["iterOnboardBlocks"] == 1
        assert ws_stats["iterOnboardBytes"] == 4096
        assert ws_stats["iterOffloadBlocks"] == 0
        assert ws_stats["iterOffloadBytes"] == 0
        # Hit rate is reused / (reused + missed) in the fixture: 5 / (5 + 3).
        assert ws_stats["iterCacheHitRate"] == pytest.approx(5 / 8)

    def test_serializer_multiple_window_sizes(self):
        """Multiple window sizes should all appear in output."""
        iter_stats = _make_mock_iteration_stats()
        # Each helper call yields a single-entry dict; merge them into one mapping.
        kv_iter = {
            **_make_mock_kv_iter_stats(
                window_size=16,
                primary_used=5,
                primary_max=10,
                reused=2,
                full_reused=2,
                partial_reused=0,
                missed=1,
                gen_alloc=0,
            ),
            **_make_mock_kv_iter_stats(
                window_size=64,
                primary_used=8,
                primary_max=16,
                reused=3,
                full_reused=1,
                partial_reused=2,
                missed=2,
                gen_alloc=1,
            ),
        }

        result = BaseWorker._stats_serializer((iter_stats, None, kv_iter))
        d = json.loads(result)

        iter_kv = d["kvCacheIterationStats"]
        assert "16" in iter_kv
        assert "64" in iter_kv
        assert iter_kv["16"]["primaryMaxNumBlocks"] == 10
        assert iter_kv["64"]["primaryMaxNumBlocks"] == 16

    def test_serializer_with_request_stats(self):
        """Request stats and kv iter stats should coexist."""
        iter_stats = _make_mock_iteration_stats()
        kv_iter = _make_mock_kv_iter_stats()

        req_stat = MagicMock()
        req_stat.to_json_str.return_value = json.dumps({"id": 42})

        result = BaseWorker._stats_serializer((iter_stats, [req_stat], kv_iter))
        d = json.loads(result)

        assert "requestStats" in d
        assert len(d["requestStats"]) == 1
        assert d["requestStats"][0]["id"] == 42
        assert "kvCacheIterationStats" in d

    def test_serializer_none_on_off_interval(self):
        """When kv_iter_stats is None (off-interval), field should be absent."""
        iter_stats = _make_mock_iteration_stats()

        result = BaseWorker._stats_serializer((iter_stats, None, None))
        d = json.loads(result)
        assert "kvCacheIterationStats" not in d

    def test_serializer_legacy_2_tuple(self):
        """Legacy 2-tuple without third element should work."""
        iter_stats = _make_mock_iteration_stats()

        result = BaseWorker._stats_serializer((iter_stats, None))
        d = json.loads(result)
        assert "kvCacheIterationStats" not in d

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,19 @@ def test_metrics(client):
9898
assert "pinnedMemUsage" in response_dict
9999
assert "staticBatchingStats" in response_dict
100100
assert "timestamp" in response_dict
101+
# Per-iteration KV cache stats (keyed by window size)
102+
assert "kvCacheIterationStats" in response_dict
103+
kv_iter = response_dict["kvCacheIterationStats"]
104+
assert len(kv_iter) > 0
105+
# Check fields in the first (and likely only) window size entry
106+
ws_stats = next(iter(kv_iter.values()))
107+
assert "primaryMaxNumBlocks" in ws_stats
108+
assert "primaryUsedNumBlocks" in ws_stats
109+
assert "iterReusedBlocks" in ws_stats
110+
assert "iterFullReusedBlocks" in ws_stats
111+
assert "iterPartialReusedBlocks" in ws_stats
112+
assert "iterMissedBlocks" in ws_stats
113+
assert "iterCacheHitRate" in ws_stats
114+
assert "iterGenAllocBlocks" in ws_stats
115+
assert "iterOnboardBlocks" in ws_stats
116+
assert "iterOnboardBytes" in ws_stats

tests/unittest/llmapi/apps/_test_openai_prometheus.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,8 @@ def test_metrics_endpoint(server: RemoteOpenAIServer):
119119
f"Iteration stats metrics not found after waiting {max_wait_time}s"
120120
assert metric_prefix + "kv_cache_hit_rate" in data
121121
assert metric_prefix + "kv_cache_utilization" in data
122+
123+
# Per-iteration KV cache metrics
124+
assert metric_prefix + "kv_cache_iter_reuse_rate" in data
125+
assert metric_prefix + "kv_cache_missed_blocks_total" in data
126+
assert metric_prefix + "kv_cache_gen_alloc_blocks_total" in data

0 commit comments

Comments
 (0)