Skip to content

Commit 9b004c7

Browse files
committed
[TRTLLM-11421][feat] Add per-iteration KV cache statistics and Prometheus metrics
Add real-time, per-iteration KV cache monitoring across 3 layers: C++ layer (kvCacheManager): - KvCacheIterationStats struct with 18 fields: pool gauges (primary/secondary max/free/used), context-phase deltas (alloc total/new, reused full/partial, missed, hit rate), generation-phase deltas (gen alloc), and transfer deltas (onboard/offload blocks and bytes) - getAndResetIterationStats() on BlockManager with atomic delta reset - KvCacheTransferStats for onboard/offload byte counting Python plumbing: - Nanobind bindings for KvCacheIterationStats - get_iteration_stats() on resource_manager, plumbed through PyExecutor - Interval-gated collection via KvCacheConfig.iteration_stats_interval - Stats serialization in base_worker as kvCacheIterationStats dict Prometheus metrics (collector.py): - 9 new metrics: pool gauges, reuse rate, missed/reused/gen-alloc counters, onboard/offload byte counters, keyed by window_size label Signed-off-by: Yueh-Ting Chen <yueh.ting.chen@gmail.com>
1 parent 68001ce commit 9b004c7

File tree

10 files changed

+434
-9
lines changed

10 files changed

+434
-9
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,39 @@ struct KvCacheStats
172172
std::size_t allocatedBytes{};
173173
};
174174

175+
/// @brief Per-iteration KV cache statistics. All delta counters represent changes since the last
/// read (WindowBlockManager::getAndResetIterationStats() / BaseKVCacheManager::getIterationStats()).
/// Gauges are instantaneous snapshots taken at read time.
struct KvCacheIterationStats
{
    // --- Instantaneous gauges ---
    // Primary (GPU) pool
    SizeType32 primaryMaxNumBlocks{0};  // total blocks in the primary pool
    SizeType32 primaryFreeNumBlocks{0}; // currently free primary blocks
    SizeType32 primaryUsedNumBlocks{0}; // = primaryMaxNumBlocks - primaryFreeNumBlocks
    // Secondary (host) pool
    SizeType32 secondaryMaxNumBlocks{0};
    SizeType32 secondaryFreeNumBlocks{0};
    SizeType32 secondaryUsedNumBlocks{0}; // = secondaryMaxNumBlocks - secondaryFreeNumBlocks

    // --- Per-iteration deltas (reset on each read) ---
    // Context phase: block allocation and reuse
    SizeType32 iterAllocTotalBlocks{0}; // blocks allocated by requests since last read (context phase)
    SizeType32 iterAllocNewBlocks{0};   // freshly allocated (non-reused) blocks
    SizeType32 iterReusedBlocks{0}; // = iterFullReusedBlocks + iterPartialReusedBlocks
    SizeType32 iterFullReusedBlocks{0}; // blocks fully matched in radix tree
    SizeType32 iterPartialReusedBlocks{0}; // blocks partially matched in radix tree
    SizeType32 iterMissedBlocks{0}; // blocks that could not be reused
    float iterCacheHitRate{0.0f};   // iterReusedBlocks / (iterReusedBlocks + iterMissedBlocks); 0 when no activity
    // Generation phase: block allocation
    SizeType32 iterGenAllocBlocks{0};

    // Transfer traffic deltas (primary <-> secondary copies via KVCacheTransferManager)
    SizeType32 iterOnboardBlocks{0};  // blocks onboarded (secondary -> primary)
    std::size_t iterOnboardBytes{0};  // bytes moved by onboard copies
    SizeType32 iterOffloadBlocks{0};  // blocks offloaded (primary -> secondary)
    std::size_t iterOffloadBytes{0};  // bytes moved by offload copies
};
207+
175208
// Basic building block of a paged KV cache - a single
176209
// cache block. This class just holds metadata, no pointers
177210
// since it is reused across all layers.
@@ -707,6 +740,12 @@ class WindowBlockManager
707740
return mMissedBlocks;
708741
}
709742

743+
// Get num free blocks in the secondary (host) memory pool
744+
[[nodiscard]] SizeType32 getNumFreeSecondaryBlocks() const noexcept;
745+
746+
//! \brief Get iteration stats (deltas since last call) for this window. Resets internal delta snapshots.
747+
[[nodiscard]] KvCacheIterationStats getAndResetIterationStats();
748+
710749
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
711750
{
712751
return getNumFreeBlocks() >= numRequired;
@@ -1006,16 +1045,22 @@ class WindowBlockManager
10061045
std::shared_ptr<KVCacheTransferManager> mTransferManager;
10071046

10081047
// Statistics for block allocations/reuse
1009-
// Total number of blocks allocated by all requests
1048+
// Total number of blocks allocated by all requests (context phase)
10101049
SizeType32 mAllocTotalBlocks;
1011-
// Number of new blocks that were allocated
1050+
// Number of new blocks that were allocated (context phase)
10121051
SizeType32 mAllocNewBlocks;
1013-
// Number of blocks that were reused
1052+
// Number of blocks that were fully reused (context phase)
1053+
SizeType32 mFullReusedBlocks;
1054+
// Number of blocks that were partially reused (context phase)
1055+
SizeType32 mPartialReusedBlocks;
1056+
// Number of blocks that were reused (full + partial, context phase)
10141057
SizeType32 mReusedBlocks;
10151058
// Number of unique blocks that were reused
10161059
SizeType32 mReusedUniqueBlocks;
1017-
// Number of blocks that were not reused
1060+
// Number of blocks that were not reused (context phase)
10181061
SizeType32 mMissedBlocks;
1062+
// Number of blocks allocated during generation phase
1063+
SizeType32 mGenAllocBlocks;
10191064
// Only be 1 or 2. If 2: general KV stored. If 1: K == V for any token, so only K is stored to optimize the
10201065
// max_num_tokens(For DeepSeek). Controlled by mCacheType
10211066
SizeType32 mKVFactor;
@@ -1032,6 +1077,15 @@ class WindowBlockManager
10321077
// The kv cache connector manager
10331078
std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
10341079

1080+
// Snapshot of cumulative counters at last iteration stats read (for delta computation)
1081+
SizeType32 mPrevAllocTotalBlocks{0};
1082+
SizeType32 mPrevAllocNewBlocks{0};
1083+
SizeType32 mPrevReusedBlocks{0};
1084+
SizeType32 mPrevFullReusedBlocks{0};
1085+
SizeType32 mPrevPartialReusedBlocks{0};
1086+
SizeType32 mPrevMissedBlocks{0};
1087+
SizeType32 mPrevGenAllocBlocks{0};
1088+
10351089
// Mutex for the cached blocks root
10361090
mutable std::mutex mCachedBlocksRootMutex;
10371091

@@ -1230,6 +1284,19 @@ class BlockManager
12301284
return sumWindows([](auto const& manager) { return manager.getNumMissedBlocks(); });
12311285
}
12321286

1287+
[[nodiscard]] SizeType32 getNumSecondaryBlocks() const
1288+
{
1289+
return sumWindows([](auto const& manager) { return manager.getNumSecondaryBlocks(); });
1290+
}
1291+
1292+
[[nodiscard]] SizeType32 getNumFreeSecondaryBlocks() const
1293+
{
1294+
return sumWindows([](auto const& manager) { return manager.getNumFreeSecondaryBlocks(); });
1295+
}
1296+
1297+
//! \brief Get per-window-size iteration stats. Resets delta snapshots for each window.
1298+
[[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getAndResetIterationStats();
1299+
12331300
[[nodiscard]] SizeType32 getNumLayers() const
12341301
{
12351302
return mNumLayers;
@@ -1536,6 +1603,10 @@ class BaseKVCacheManager
15361603

15371604
[[nodiscard]] virtual KvCacheStats getKvCacheStats() const = 0;
15381605

1606+
//! \brief Get per-iteration stats with delta counters, keyed by window size.
1607+
//! Resets delta snapshots on each call.
1608+
[[nodiscard]] virtual std::map<SizeType32, KvCacheIterationStats> getIterationStats() = 0;
1609+
15391610
[[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions() const = 0;
15401611

15411612
[[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents(
@@ -1878,6 +1949,11 @@ class KVCacheManager : public BaseKVCacheManager
18781949
return kvCacheStats;
18791950
}
18801951

1952+
[[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getIterationStats() override
{
    // Delegates to the block manager. Note: this resets the per-window delta
    // snapshots, so each call returns changes since the previous call.
    return mBlockManager.getAndResetIterationStats();
}
1956+
18811957
[[nodiscard]] OffsetTableDimensions getOffsetTableDimensions() const override
18821958
{
18831959
OffsetTableDimensions dims;

cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ namespace kvc = tensorrt_llm::executor::kv_cache;
2727
namespace tensorrt_llm::batch_manager::kv_cache_manager
2828
{
2929

30+
/// @brief Statistics for block transfers between primary (GPU) and secondary (host) memory.
/// Returned by KVCacheTransferManager::getAndResetTransferStats(). All counters are reset on read,
/// so each field is a delta accumulated since the previous read.
struct KvCacheTransferStats
{
    SizeType32 onboardBlocks{0}; // blocks onboarded (secondary -> primary) since last read
    std::size_t onboardBytes{0}; // bytes moved by those onboard copies
    SizeType32 offloadBlocks{0}; // blocks offloaded (primary -> secondary) since last read
    std::size_t offloadBytes{0}; // bytes moved by those offload copies
};
39+
3040
// The TransferManager accelerates transfers to/from the GPU by overlapping HtoD and DtoH transfers, and tracks ongoing
3141
// transfers in order to avoid race conditions. It is functionally equivalent to the prior approach of putting all
3242
// transfers into the forward pass stream. This is only ever used as a component of a KVCacheManager.
@@ -57,6 +67,9 @@ class KVCacheTransferManager
5767
//! must be called after last call to KVCacheManager::addSequence in every step.
5868
void syncTransfers();
5969

70+
//! \brief Get transfer stats accumulated since last call, and reset the counters.
71+
[[nodiscard]] KvCacheTransferStats getAndResetTransferStats();
72+
6073
private:
6174
//! \brief Get pointer to pool specified by cache block.
6275
static tr::ITensor::SharedPtr computeBlockPointer(
@@ -79,6 +92,12 @@ class KVCacheTransferManager
7992
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
8093
std::string const& directory = "");
8194

95+
//! \brief Compute total bytes actually transferred for a block copy across all pools.
96+
//! \param pools The pool descriptors.
97+
//! \param numTokensToCopy Number of tokens for partial copy (0 means full block).
98+
[[nodiscard]] std::size_t computeBlockTransferBytes(
99+
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy) const;
100+
82101
runtime::BufferManager mBufferManager;
83102
runtime::BufferManager mOnboardManager;
84103
runtime::BufferManager mOffloadManager;
@@ -90,6 +109,12 @@ class KVCacheTransferManager
90109
// Reference to parent loopback agent
91110
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
92111
int mDeviceId;
112+
113+
// Cumulative transfer statistics, reset on each call to getAndResetTransferStats()
114+
SizeType32 mOnboardBlockCount{0};
115+
std::size_t mOnboardByteCount{0};
116+
SizeType32 mOffloadBlockCount{0};
117+
std::size_t mOffloadByteCount{0};
93118
};
94119

95120
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,9 +666,12 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
666666
, mTransferManager{std::make_shared<KVCacheTransferManager>(mBufferManager, mLoopbackAgent)}
667667
, mAllocTotalBlocks{0}
668668
, mAllocNewBlocks{0}
669+
, mFullReusedBlocks{0}
670+
, mPartialReusedBlocks{0}
669671
, mReusedBlocks{0}
670672
, mReusedUniqueBlocks{0}
671673
, mMissedBlocks{0}
674+
, mGenAllocBlocks{0}
672675
, mKVFactor{mCacheType == CacheType::kSELFKONLY ? 1 : 2}
673676
, mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%u]", mWindowSize)}
674677
, mReusedTokens{0.0}
@@ -1324,6 +1327,14 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
13241327
addBlockToAllBeams(matchingBlock, sequence);
13251328
// TODO: only add once for reused blocks
13261329
++mReusedBlocks;
1330+
if (partialMatch)
1331+
{
1332+
++mPartialReusedBlocks;
1333+
}
1334+
else
1335+
{
1336+
++mFullReusedBlocks;
1337+
}
13271338
if (!mReusedBlockIds.count(matchingBlockId))
13281339
{
13291340
mReusedBlockIds.insert(matchingBlockId);
@@ -1510,6 +1521,7 @@ void WindowBlockManager::adjustBlocksIfNeeded(GenerationRequest& sequence)
15101521
{
15111522
// Allocating a new block when the last token is a block boundary
15121523
allocateBlock(sequence, /*shareAmongBeams=*/sequence.getBeamWidth() == 1);
1524+
++mGenAllocBlocks;
15131525
updateLastCacheBlockOffsets(sequence);
15141526
}
15151527
}
@@ -1784,6 +1796,70 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
17841796
return mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel);
17851797
}
17861798

1799+
// Number of free blocks currently tracked by the eviction policy at the secondary (host) level.
[[nodiscard]] SizeType32 WindowBlockManager::getNumFreeSecondaryBlocks() const noexcept
{
    return mEvictionPolicy->getNumFreeBlocks(kSecondaryLevel);
}
1803+
1804+
// Build the per-iteration stats for this window: snapshot the pool gauges, turn the
// cumulative context/generation counters into deltas since the previous call, and
// drain the transfer manager's counters. Delta baselines are advanced as a side effect.
KvCacheIterationStats WindowBlockManager::getAndResetIterationStats()
{
    KvCacheIterationStats result;

    // Instantaneous pool gauges.
    result.primaryMaxNumBlocks = getNumPrimaryBlocks();
    result.primaryFreeNumBlocks = getNumFreeBlocks();
    result.primaryUsedNumBlocks = result.primaryMaxNumBlocks - result.primaryFreeNumBlocks;
    result.secondaryMaxNumBlocks = getNumSecondaryBlocks();
    result.secondaryFreeNumBlocks = getNumFreeSecondaryBlocks();
    result.secondaryUsedNumBlocks = result.secondaryMaxNumBlocks - result.secondaryFreeNumBlocks;

    // Returns (current - baseline) and advances the baseline to the current value.
    auto takeDelta = [](SizeType32& baseline, SizeType32 current)
    {
        auto const delta = current - baseline;
        baseline = current;
        return delta;
    };

    // Context-phase deltas.
    result.iterAllocTotalBlocks = takeDelta(mPrevAllocTotalBlocks, mAllocTotalBlocks);
    result.iterAllocNewBlocks = takeDelta(mPrevAllocNewBlocks, mAllocNewBlocks);
    result.iterReusedBlocks = takeDelta(mPrevReusedBlocks, mReusedBlocks);
    result.iterFullReusedBlocks = takeDelta(mPrevFullReusedBlocks, mFullReusedBlocks);
    result.iterPartialReusedBlocks = takeDelta(mPrevPartialReusedBlocks, mPartialReusedBlocks);
    result.iterMissedBlocks = takeDelta(mPrevMissedBlocks, mMissedBlocks);

    // Hit rate over this iteration's reuse attempts; stays 0 when there was no activity.
    auto const attempts = result.iterReusedBlocks + result.iterMissedBlocks;
    if (attempts != 0)
    {
        result.iterCacheHitRate = static_cast<float>(result.iterReusedBlocks) / static_cast<float>(attempts);
    }

    // Generation-phase delta.
    result.iterGenAllocBlocks = takeDelta(mPrevGenAllocBlocks, mGenAllocBlocks);

    // Transfer traffic, drained from the transfer manager (its counters reset on read).
    if (mTransferManager)
    {
        auto const xfer = mTransferManager->getAndResetTransferStats();
        result.iterOnboardBlocks = xfer.onboardBlocks;
        result.iterOnboardBytes = xfer.onboardBytes;
        result.iterOffloadBlocks = xfer.offloadBlocks;
        result.iterOffloadBytes = xfer.offloadBytes;
    }

    return result;
}
1852+
1853+
std::map<SizeType32, KvCacheIterationStats> BlockManager::getAndResetIterationStats()
1854+
{
1855+
std::map<SizeType32, KvCacheIterationStats> perWindowStats;
1856+
for (auto& [windowSize, manager] : mWindowBlockManagers)
1857+
{
1858+
perWindowStats[windowSize] = manager.getAndResetIterationStats();
1859+
}
1860+
return perWindowStats;
1861+
}
1862+
17871863
std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::chrono::milliseconds> timeout) const
17881864
{
17891865
return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};

cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,10 @@ void KVCacheTransferManager::onboard(BlockPtr const& offloadedBlock, BlockPtr co
273273

274274
copyBlock(offloadedBlock, block, pools, false, numTokensToCopy, mode, directory);
275275

276+
// Update transfer statistics
277+
++mOnboardBlockCount;
278+
mOnboardByteCount += computeBlockTransferBytes(pools, numTokensToCopy);
279+
276280
// Record new pending read from offloadedBlock
277281
mPendingReads[offloadedBlock->getMemoryPoolBlockIndex()] = tr::CudaEvent();
278282
mOnboardManager.getStream().record(mPendingReads[offloadedBlock->getMemoryPoolBlockIndex()]);
@@ -309,6 +313,10 @@ void KVCacheTransferManager::offload(BlockPtr const& block, BlockPtr const& offl
309313

310314
copyBlock(block, offloadBlock, pools, true, numTokensToCopy, mode, directory);
311315

316+
// Update transfer statistics
317+
++mOffloadBlockCount;
318+
mOffloadByteCount += computeBlockTransferBytes(pools, numTokensToCopy);
319+
312320
// Record new pending read from block
313321
mPendingReads[block->getMemoryPoolBlockIndex()] = tr::CudaEvent();
314322
mOffloadManager.getStream().record(mPendingReads[block->getMemoryPoolBlockIndex()]);
@@ -347,4 +355,55 @@ void KVCacheTransferManager::syncTransfers()
347355
mPendingWrites.clear();
348356
}
349357

358+
// Snapshot the accumulated transfer counters, then zero them so the next call
// reports only traffic that happened after this one.
KvCacheTransferStats KVCacheTransferManager::getAndResetTransferStats()
{
    KvCacheTransferStats const snapshot{mOnboardBlockCount, mOnboardByteCount, mOffloadBlockCount, mOffloadByteCount};
    mOnboardBlockCount = 0;
    mOnboardByteCount = 0;
    mOffloadBlockCount = 0;
    mOffloadByteCount = 0;
    return snapshot;
}
371+
372+
std::size_t KVCacheTransferManager::computeBlockTransferBytes(
373+
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy) const
374+
{
375+
std::size_t totalBytes = 0;
376+
for (auto const& pool : pools)
377+
{
378+
if (!pool.primaryPtr)
379+
{
380+
continue;
381+
}
382+
383+
auto const dataType = pool.primaryPtr->getDataType();
384+
auto const bytesPerElement
385+
= pool.primaryPtr->getSizeInBytes() / static_cast<std::size_t>(pool.primaryPtr->getSize());
386+
387+
// Mirror the logic in copyBlock: a partial copy only happens when numTokensToCopy > 0,
388+
// the data type supports it (not kINT4/kFP4), not block scales, and numTokensToCopy < tokensPerBlock.
389+
bool const isPartialCopy = numTokensToCopy > 0 && dataType != nvinfer1::DataType::kINT4
390+
&& dataType != nvinfer1::DataType::kFP4 && !pool.containsBlockScales
391+
&& numTokensToCopy < pool.tokensPerBlock;
392+
393+
if (isPartialCopy)
394+
{
395+
// Partial copy transfers: numLayers * kvFactor * numKvHeads * sizePerHead * numTokensToCopy elements
396+
totalBytes += static_cast<std::size_t>(pool.numLayers) * pool.kvFactor * pool.numKvHeads * pool.sizePerHead
397+
* numTokensToCopy * bytesPerElement;
398+
}
399+
else
400+
{
401+
// Full block copy: numLayers * kvFactor * blockSize elements
402+
// where blockSize = numKvHeads * sizePerHead * tokensPerBlock
403+
totalBytes += static_cast<std::size_t>(pool.numLayers) * pool.kvFactor * pool.blockSize * bytesPerElement;
404+
}
405+
}
406+
return totalBytes;
407+
}
408+
350409
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

0 commit comments

Comments
 (0)