Skip to content

Commit 9b004c7

Browse files
committed
[TRTLLM-11421][feat] Add per-iteration KV cache statistics and Prometheus metrics
Add real-time, per-iteration KV cache monitoring across 3 layers: C++ layer (kvCacheManager): - KvCacheIterationStats struct with 18 fields: pool gauges (primary/secondary max/free/used), context-phase deltas (alloc total/new, reused full/partial, missed, hit rate), generation-phase deltas (gen alloc), and transfer deltas (onboard/offload blocks and bytes) - getAndResetIterationStats() on BlockManager with atomic delta reset - KvCacheTransferStats for onboard/offload byte counting Python plumbing: - Nanobind bindings for KvCacheIterationStats - get_iteration_stats() on resource_manager, plumbed through PyExecutor - Interval-gated collection via KvCacheConfig.iteration_stats_interval - Stats serialization in base_worker as kvCacheIterationStats dict Prometheus metrics (collector.py): - 9 new metrics: pool gauges, reuse rate, missed/reused/gen-alloc counters, onboard/offload byte counters, keyed by window_size label Signed-off-by: Yueh-Ting Chen <yueh.ting.chen@gmail.com>
1 parent 68001ce commit 9b004c7

File tree

10 files changed

+434
-9
lines changed

10 files changed

+434
-9
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,39 @@ struct KvCacheStats
172172
std::size_t allocatedBytes{};
173173
};
174174

175+
/// @brief Per-iteration KV cache statistics. All delta counters represent changes since the last
/// read (WindowBlockManager::getAndResetIterationStats() / BaseKVCacheManager::getIterationStats()).
/// Gauges are instantaneous snapshots taken at read time.
struct KvCacheIterationStats
{
    // --- Instantaneous gauges ---
    // Primary (GPU) pool
    SizeType32 primaryMaxNumBlocks{0};  // total blocks in the primary pool
    SizeType32 primaryFreeNumBlocks{0}; // currently free primary blocks
    SizeType32 primaryUsedNumBlocks{0}; // = primaryMaxNumBlocks - primaryFreeNumBlocks
    // Secondary (host) pool
    SizeType32 secondaryMaxNumBlocks{0};
    SizeType32 secondaryFreeNumBlocks{0};
    SizeType32 secondaryUsedNumBlocks{0}; // = secondaryMaxNumBlocks - secondaryFreeNumBlocks

    // --- Per-iteration deltas (reset on each read) ---
    // Context phase: block allocation and reuse
    SizeType32 iterAllocTotalBlocks{0}; // blocks allocated by requests since last read (context phase)
    SizeType32 iterAllocNewBlocks{0};   // freshly allocated (non-reused) blocks
    SizeType32 iterReusedBlocks{0}; // = iterFullReusedBlocks + iterPartialReusedBlocks
    SizeType32 iterFullReusedBlocks{0}; // blocks fully matched in radix tree
    SizeType32 iterPartialReusedBlocks{0}; // blocks partially matched in radix tree
    SizeType32 iterMissedBlocks{0}; // blocks that could not be reused
    float iterCacheHitRate{0.0f};   // iterReusedBlocks / (iterReusedBlocks + iterMissedBlocks); 0 when no activity
    // Generation phase: block allocation
    SizeType32 iterGenAllocBlocks{0};

    // Transfer traffic deltas (primary <-> secondary copies via KVCacheTransferManager)
    SizeType32 iterOnboardBlocks{0};  // blocks onboarded (secondary -> primary)
    std::size_t iterOnboardBytes{0};  // bytes moved by onboard copies
    SizeType32 iterOffloadBlocks{0};  // blocks offloaded (primary -> secondary)
    std::size_t iterOffloadBytes{0};  // bytes moved by offload copies
};
207+
175208
// Basic building block of a paged KV cache - a single
176209
// cache block. This class just holds metadata, no pointers
177210
// since it is reused across all layers.
@@ -707,6 +740,12 @@ class WindowBlockManager
707740
return mMissedBlocks;
708741
}
709742

743+
// Get num free blocks in the secondary (host) memory pool
744+
[[nodiscard]] SizeType32 getNumFreeSecondaryBlocks() const noexcept;
745+
746+
//! \brief Get iteration stats (deltas since last call) for this window. Resets internal delta snapshots.
747+
[[nodiscard]] KvCacheIterationStats getAndResetIterationStats();
748+
710749
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
711750
{
712751
return getNumFreeBlocks() >= numRequired;
@@ -1006,16 +1045,22 @@ class WindowBlockManager
10061045
std::shared_ptr<KVCacheTransferManager> mTransferManager;
10071046

10081047
// Statistics for block allocations/reuse
1009-
// Total number of blocks allocated by all requests
1048+
// Total number of blocks allocated by all requests (context phase)
10101049
SizeType32 mAllocTotalBlocks;
1011-
// Number of new blocks that were allocated
1050+
// Number of new blocks that were allocated (context phase)
10121051
SizeType32 mAllocNewBlocks;
1013-
// Number of blocks that were reused
1052+
// Number of blocks that were fully reused (context phase)
1053+
SizeType32 mFullReusedBlocks;
1054+
// Number of blocks that were partially reused (context phase)
1055+
SizeType32 mPartialReusedBlocks;
1056+
// Number of blocks that were reused (full + partial, context phase)
10141057
SizeType32 mReusedBlocks;
10151058
// Number of unique blocks that were reused
10161059
SizeType32 mReusedUniqueBlocks;
1017-
// Number of blocks that were not reused
1060+
// Number of blocks that were not reused (context phase)
10181061
SizeType32 mMissedBlocks;
1062+
// Number of blocks allocated during generation phase
1063+
SizeType32 mGenAllocBlocks;
10191064
// Only be 1 or 2. If 2: general KV stored. If 1: K == V for any token, so only K is stored to optimize the
10201065
// max_num_tokens(For DeepSeek). Controlled by mCacheType
10211066
SizeType32 mKVFactor;
@@ -1032,6 +1077,15 @@ class WindowBlockManager
10321077
// The kv cache connector manager
10331078
std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
10341079

1080+
// Snapshot of cumulative counters at last iteration stats read (for delta computation)
1081+
SizeType32 mPrevAllocTotalBlocks{0};
1082+
SizeType32 mPrevAllocNewBlocks{0};
1083+
SizeType32 mPrevReusedBlocks{0};
1084+
SizeType32 mPrevFullReusedBlocks{0};
1085+
SizeType32 mPrevPartialReusedBlocks{0};
1086+
SizeType32 mPrevMissedBlocks{0};
1087+
SizeType32 mPrevGenAllocBlocks{0};
1088+
10351089
// Mutex for the cached blocks root
10361090
mutable std::mutex mCachedBlocksRootMutex;
10371091

@@ -1230,6 +1284,19 @@ class BlockManager
12301284
return sumWindows([](auto const& manager) { return manager.getNumMissedBlocks(); });
12311285
}
12321286

1287+
[[nodiscard]] SizeType32 getNumSecondaryBlocks() const
1288+
{
1289+
return sumWindows([](auto const& manager) { return manager.getNumSecondaryBlocks(); });
1290+
}
1291+
1292+
[[nodiscard]] SizeType32 getNumFreeSecondaryBlocks() const
1293+
{
1294+
return sumWindows([](auto const& manager) { return manager.getNumFreeSecondaryBlocks(); });
1295+
}
1296+
1297+
//! \brief Get per-window-size iteration stats. Resets delta snapshots for each window.
1298+
[[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getAndResetIterationStats();
1299+
12331300
[[nodiscard]] SizeType32 getNumLayers() const
12341301
{
12351302
return mNumLayers;
@@ -1536,6 +1603,10 @@ class BaseKVCacheManager
15361603

15371604
[[nodiscard]] virtual KvCacheStats getKvCacheStats() const = 0;
15381605

1606+
//! \brief Get per-iteration stats with delta counters, keyed by window size.
1607+
//! Resets delta snapshots on each call.
1608+
[[nodiscard]] virtual std::map<SizeType32, KvCacheIterationStats> getIterationStats() = 0;
1609+
15391610
[[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions() const = 0;
15401611

15411612
[[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents(
@@ -1878,6 +1949,11 @@ class KVCacheManager : public BaseKVCacheManager
18781949
return kvCacheStats;
18791950
}
18801951

1952+
[[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getIterationStats() override
{
    // Delegates to the block manager. Note: this resets the per-window delta
    // snapshots, so each call returns changes since the previous call.
    return mBlockManager.getAndResetIterationStats();
}
1956+
18811957
[[nodiscard]] OffsetTableDimensions getOffsetTableDimensions() const override
18821958
{
18831959
OffsetTableDimensions dims;

cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ namespace kvc = tensorrt_llm::executor::kv_cache;
2727
namespace tensorrt_llm::batch_manager::kv_cache_manager
2828
{
2929

30+
/// @brief Statistics for block transfers between primary (GPU) and secondary (host) memory.
/// Returned by KVCacheTransferManager::getAndResetTransferStats(). All counters are reset on read,
/// so each field is a delta accumulated since the previous read.
struct KvCacheTransferStats
{
    SizeType32 onboardBlocks{0}; // blocks onboarded (secondary -> primary) since last read
    std::size_t onboardBytes{0}; // bytes moved by those onboard copies
    SizeType32 offloadBlocks{0}; // blocks offloaded (primary -> secondary) since last read
    std::size_t offloadBytes{0}; // bytes moved by those offload copies
};
39+
3040
// The TransferManager accelerates transfers to/from the GPU by overlapping HtoD and DtoH transfers, and tracks ongoing
3141
// transfers in order to avoid race conditions. It is functionally equivalent to the prior approach of putting all
3242
// transfers into the forward pass stream. This is only ever used as a component of a KVCacheManager.
@@ -57,6 +67,9 @@ class KVCacheTransferManager
5767
//! must be called after last call to KVCacheManager::addSequence in every step.
5868
void syncTransfers();
5969

70+
//! \brief Get transfer stats accumulated since last call, and reset the counters.
71+
[[nodiscard]] KvCacheTransferStats getAndResetTransferStats();
72+
6073
private:
6174
//! \brief Get pointer to pool specified by cache block.
6275
static tr::ITensor::SharedPtr computeBlockPointer(
@@ -79,6 +92,12 @@ class KVCacheTransferManager
7992
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
8093
std::string const& directory = "");
8194

95+
//! \brief Compute total bytes actually transferred for a block copy across all pools.
96+
//! \param pools The pool descriptors.
97+
//! \param numTokensToCopy Number of tokens for partial copy (0 means full block).
98+
[[nodiscard]] std::size_t computeBlockTransferBytes(
99+
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy) const;
100+
82101
runtime::BufferManager mBufferManager;
83102
runtime::BufferManager mOnboardManager;
84103
runtime::BufferManager mOffloadManager;
@@ -90,6 +109,12 @@ class KVCacheTransferManager
90109
// Reference to parent loopback agent
91110
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
92111
int mDeviceId;
112+
113+
// Cumulative transfer statistics, reset on each call to getAndResetTransferStats()
114+
SizeType32 mOnboardBlockCount{0};
115+
std::size_t mOnboardByteCount{0};
116+
SizeType32 mOffloadBlockCount{0};
117+
std::size_t mOffloadByteCount{0};
93118
};
94119

95120
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,9 +666,12 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
666666
, mTransferManager{std::make_shared<KVCacheTransferManager>(mBufferManager, mLoopbackAgent)}
667667
, mAllocTotalBlocks{0}
668668
, mAllocNewBlocks{0}
669+
, mFullReusedBlocks{0}
670+
, mPartialReusedBlocks{0}
669671
, mReusedBlocks{0}
670672
, mReusedUniqueBlocks{0}
671673
, mMissedBlocks{0}
674+
, mGenAllocBlocks{0}
672675
, mKVFactor{mCacheType == CacheType::kSELFKONLY ? 1 : 2}
673676
, mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%u]", mWindowSize)}
674677
, mReusedTokens{0.0}
@@ -1324,6 +1327,14 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
13241327
addBlockToAllBeams(matchingBlock, sequence);
13251328
// TODO: only add once for reused blocks
13261329
++mReusedBlocks;
1330+
if (partialMatch)
1331+
{
1332+
++mPartialReusedBlocks;
1333+
}
1334+
else
1335+
{
1336+
++mFullReusedBlocks;
1337+
}
13271338
if (!mReusedBlockIds.count(matchingBlockId))
13281339
{
13291340
mReusedBlockIds.insert(matchingBlockId);
@@ -1510,6 +1521,7 @@ void WindowBlockManager::adjustBlocksIfNeeded(GenerationRequest& sequence)
15101521
{
15111522
// Allocating a new block when the last token is a block boundary
15121523
allocateBlock(sequence, /*shareAmongBeams=*/sequence.getBeamWidth() == 1);
1524+
++mGenAllocBlocks;
15131525
updateLastCacheBlockOffsets(sequence);
15141526
}
15151527
}
@@ -1784,6 +1796,70 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
17841796
return mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel);
17851797
}
17861798

1799+
// Number of free blocks currently tracked by the eviction policy at the secondary (host) level.
[[nodiscard]] SizeType32 WindowBlockManager::getNumFreeSecondaryBlocks() const noexcept
{
    return mEvictionPolicy->getNumFreeBlocks(kSecondaryLevel);
}
1803+
1804+
// Build the per-iteration stats for this window: snapshot the pool gauges, turn the
// cumulative context/generation counters into deltas since the previous call, and
// drain the transfer manager's counters. Delta baselines are advanced as a side effect.
KvCacheIterationStats WindowBlockManager::getAndResetIterationStats()
{
    KvCacheIterationStats result;

    // Instantaneous pool gauges.
    result.primaryMaxNumBlocks = getNumPrimaryBlocks();
    result.primaryFreeNumBlocks = getNumFreeBlocks();
    result.primaryUsedNumBlocks = result.primaryMaxNumBlocks - result.primaryFreeNumBlocks;
    result.secondaryMaxNumBlocks = getNumSecondaryBlocks();
    result.secondaryFreeNumBlocks = getNumFreeSecondaryBlocks();
    result.secondaryUsedNumBlocks = result.secondaryMaxNumBlocks - result.secondaryFreeNumBlocks;

    // Returns (current - baseline) and advances the baseline to the current value.
    auto takeDelta = [](SizeType32& baseline, SizeType32 current)
    {
        auto const delta = current - baseline;
        baseline = current;
        return delta;
    };

    // Context-phase deltas.
    result.iterAllocTotalBlocks = takeDelta(mPrevAllocTotalBlocks, mAllocTotalBlocks);
    result.iterAllocNewBlocks = takeDelta(mPrevAllocNewBlocks, mAllocNewBlocks);
    result.iterReusedBlocks = takeDelta(mPrevReusedBlocks, mReusedBlocks);
    result.iterFullReusedBlocks = takeDelta(mPrevFullReusedBlocks, mFullReusedBlocks);
    result.iterPartialReusedBlocks = takeDelta(mPrevPartialReusedBlocks, mPartialReusedBlocks);
    result.iterMissedBlocks = takeDelta(mPrevMissedBlocks, mMissedBlocks);

    // Hit rate over this iteration's reuse attempts; stays 0 when there was no activity.
    auto const attempts = result.iterReusedBlocks + result.iterMissedBlocks;
    if (attempts != 0)
    {
        result.iterCacheHitRate = static_cast<float>(result.iterReusedBlocks) / static_cast<float>(attempts);
    }

    // Generation-phase delta.
    result.iterGenAllocBlocks = takeDelta(mPrevGenAllocBlocks, mGenAllocBlocks);

    // Transfer traffic, drained from the transfer manager (its counters reset on read).
    if (mTransferManager)
    {
        auto const xfer = mTransferManager->getAndResetTransferStats();
        result.iterOnboardBlocks = xfer.onboardBlocks;
        result.iterOnboardBytes = xfer.onboardBytes;
        result.iterOffloadBlocks = xfer.offloadBlocks;
        result.iterOffloadBytes = xfer.offloadBytes;
    }

    return result;
}
1852+
1853+
std::map<SizeType32, KvCacheIterationStats> BlockManager::getAndResetIterationStats()
1854+
{
1855+
std::map<SizeType32, KvCacheIterationStats> perWindowStats;
1856+
for (auto& [windowSize, manager] : mWindowBlockManagers)
1857+
{
1858+
perWindowStats[windowSize] = manager.getAndResetIterationStats();
1859+
}
1860+
return perWindowStats;
1861+
}
1862+
17871863
std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::chrono::milliseconds> timeout) const
17881864
{
17891865
return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};

cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,10 @@ void KVCacheTransferManager::onboard(BlockPtr const& offloadedBlock, BlockPtr co
273273

274274
copyBlock(offloadedBlock, block, pools, false, numTokensToCopy, mode, directory);
275275

276+
// Update transfer statistics
277+
++mOnboardBlockCount;
278+
mOnboardByteCount += computeBlockTransferBytes(pools, numTokensToCopy);
279+
276280
// Record new pending read from offloadedBlock
277281
mPendingReads[offloadedBlock->getMemoryPoolBlockIndex()] = tr::CudaEvent();
278282
mOnboardManager.getStream().record(mPendingReads[offloadedBlock->getMemoryPoolBlockIndex()]);
@@ -309,6 +313,10 @@ void KVCacheTransferManager::offload(BlockPtr const& block, BlockPtr const& offl
309313

310314
copyBlock(block, offloadBlock, pools, true, numTokensToCopy, mode, directory);
311315

316+
// Update transfer statistics
317+
++mOffloadBlockCount;
318+
mOffloadByteCount += computeBlockTransferBytes(pools, numTokensToCopy);
319+
312320
// Record new pending read from block
313321
mPendingReads[block->getMemoryPoolBlockIndex()] = tr::CudaEvent();
314322
mOffloadManager.getStream().record(mPendingReads[block->getMemoryPoolBlockIndex()]);
@@ -347,4 +355,55 @@ void KVCacheTransferManager::syncTransfers()
347355
mPendingWrites.clear();
348356
}
349357

358+
// Snapshot the accumulated transfer counters, then zero them so the next call
// reports only traffic that happened after this one.
KvCacheTransferStats KVCacheTransferManager::getAndResetTransferStats()
{
    KvCacheTransferStats const snapshot{mOnboardBlockCount, mOnboardByteCount, mOffloadBlockCount, mOffloadByteCount};
    mOnboardBlockCount = 0;
    mOnboardByteCount = 0;
    mOffloadBlockCount = 0;
    mOffloadByteCount = 0;
    return snapshot;
}
371+
372+
std::size_t KVCacheTransferManager::computeBlockTransferBytes(
373+
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy) const
374+
{
375+
std::size_t totalBytes = 0;
376+
for (auto const& pool : pools)
377+
{
378+
if (!pool.primaryPtr)
379+
{
380+
continue;
381+
}
382+
383+
auto const dataType = pool.primaryPtr->getDataType();
384+
auto const bytesPerElement
385+
= pool.primaryPtr->getSizeInBytes() / static_cast<std::size_t>(pool.primaryPtr->getSize());
386+
387+
// Mirror the logic in copyBlock: a partial copy only happens when numTokensToCopy > 0,
388+
// the data type supports it (not kINT4/kFP4), not block scales, and numTokensToCopy < tokensPerBlock.
389+
bool const isPartialCopy = numTokensToCopy > 0 && dataType != nvinfer1::DataType::kINT4
390+
&& dataType != nvinfer1::DataType::kFP4 && !pool.containsBlockScales
391+
&& numTokensToCopy < pool.tokensPerBlock;
392+
393+
if (isPartialCopy)
394+
{
395+
// Partial copy transfers: numLayers * kvFactor * numKvHeads * sizePerHead * numTokensToCopy elements
396+
totalBytes += static_cast<std::size_t>(pool.numLayers) * pool.kvFactor * pool.numKvHeads * pool.sizePerHead
397+
* numTokensToCopy * bytesPerElement;
398+
}
399+
else
400+
{
401+
// Full block copy: numLayers * kvFactor * blockSize elements
402+
// where blockSize = numKvHeads * sizePerHead * tokensPerBlock
403+
totalBytes += static_cast<std::size_t>(pool.numLayers) * pool.kvFactor * pool.blockSize * bytesPerElement;
404+
}
405+
}
406+
return totalBytes;
407+
}
408+
350409
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

0 commit comments

Comments
 (0)