Skip to content

Commit 8fed1c2

Browse files
committed
clean up unnecessary changes
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
1 parent ab3bc32 commit 8fed1c2

File tree

7 files changed

+11
-105
lines changed

7 files changed

+11
-105
lines changed

cpp/include/tensorrt_llm/batch_manager/blockKey.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -143,23 +143,4 @@ struct BlockKeyHasher
143143
}
144144
};
145145

146-
inline std::ostream& operator<<(std::ostream& out, BlockKey const& key)
147-
{
148-
out << "BlockKey(n=" << key.uniqueTokens.size();
149-
if (!key.uniqueTokens.empty())
150-
{
151-
out << ",tokens=[";
152-
for (size_t i = 0; i < key.uniqueTokens.size(); ++i)
153-
{
154-
if (i > 0)
155-
{
156-
out << ",";
157-
}
158-
out << key.uniqueTokens[i].tokenId;
159-
}
160-
out << "]";
161-
}
162-
out << ")";
163-
return out;
164-
}
165146
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -499,20 +499,6 @@ class KVCacheBlock : public std::enable_shared_from_this<KVCacheBlock>
499499
size_t mHash;
500500
};
501501

502-
//! \brief Stream block id for trie printTree (e.g. Node prints mValue as block ids).
503-
inline std::ostream& operator<<(std::ostream& out, BlockPtr const& block)
504-
{
505-
if (block)
506-
{
507-
out << block->getBlockId();
508-
}
509-
else
510-
{
511-
out << "null";
512-
}
513-
return out;
514-
}
515-
516502
class KVCacheBlockSet
517503
{
518504
public:
@@ -1117,12 +1103,6 @@ class WindowBlockManager
11171103
mCachedBlocksRoot->setAsRoot(mLookupTree->getRoot(), mWindowSize);
11181104
}
11191105

1120-
void printTree() const
1121-
{
1122-
std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
1123-
mLookupTree->printTree();
1124-
}
1125-
11261106
private:
11271107
bool tryAllocatePlaceholderForLinearAttention(GenerationRequest& sequence, bool shareAmongBeams);
11281108

cpp/include/tensorrt_llm/batch_manager/templatedTrie.h

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,8 @@
2020
#include "tensorrt_llm/common/assert.h"
2121
#include <algorithm>
2222
#include <cstddef>
23-
#include <iostream>
2423
#include <memory>
2524
#include <optional>
26-
#include <string>
2725

2826
//
2927
// This file implements a templated trie.
@@ -165,53 +163,6 @@ class Node
165163
{
166164
}
167165

168-
//! \brief Print subtree in Unix `tree` style (├──, └──, │). NodeKey must support operator<<(std::ostream&,
169-
//! NodeKey).
170-
void printTree(int depth = 0, std::string const& prefix = "", std::optional<bool> isLast = std::nullopt) const
171-
{
172-
(void) depth;
173-
bool const isRoot = mPrevNode.expired();
174-
if (isRoot)
175-
{
176-
std::cout << ".\n";
177-
int idx = 0;
178-
int const numChildren = static_cast<int>(mNextNodes.size());
179-
for (auto const& [key, node] : mNextNodes)
180-
{
181-
node->printTree(0, "", idx == numChildren - 1);
182-
++idx;
183-
}
184-
}
185-
else
186-
{
187-
std::cout << prefix << (isLast.value() ? "└── " : "├── ") << mKey;
188-
if (!mValue.empty())
189-
{
190-
std::cout << " [";
191-
bool first = true;
192-
for (auto const& [vkey, val] : mValue)
193-
{
194-
if (!first)
195-
{
196-
std::cout << ", ";
197-
}
198-
std::cout << vkey << ":" << val;
199-
first = false;
200-
}
201-
std::cout << "]";
202-
}
203-
std::cout << "\n";
204-
int idx = 0;
205-
int const numChildren = static_cast<int>(mNextNodes.size());
206-
for (auto const& [key, node] : mNextNodes)
207-
{
208-
std::string newPrefix = prefix + (isLast.value() ? " " : "");
209-
node->printTree(0, newPrefix, idx == numChildren - 1);
210-
++idx;
211-
}
212-
}
213-
}
214-
215166
//! \brief Update the back-pointer to this node's parent.
216167
//! \details Only updates mPrevNode (the back-edge). The caller is responsible for also
217168
//! updating the old and new parent's mNextNodes forward maps: remove this node from the old
@@ -652,11 +603,6 @@ class Trie
652603
return lookupValues(nodeMatches, vkey);
653604
}
654605

655-
void printTree() const
656-
{
657-
mRoot->printTree();
658-
}
659-
660606
private:
661607
NodePtr mRoot;
662608
};

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,6 @@ class Runner : public RunnerBase
308308
int32_t const layer_idx_in_cache_pool = op.useKVCache() && host_kv_cache_pool_mapping.has_value()
309309
? host_kv_cache_pool_mapping.value().index({op.mLayerIdx, 1}).item<int32_t>()
310310
: 0;
311-
// TLLM_LOG_INFO("pool_index: %d, layer_idx_in_cache_pool: %d", pool_index, layer_idx_in_cache_pool);
312311
KVBlockArray::DataType* block_offsets
313312
= static_cast<KVBlockArray::DataType*>(op.useKVCache() && kv_cache_block_offsets.has_value()
314313
? kv_cache_block_offsets.value().index({pool_index, seq_offset}).data_ptr()

tensorrt_llm/_torch/models/modeling_qwen3_next.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ def __init__(
146146
strategy=model_config.allreduce_strategy)
147147

148148
self.aux_stream = aux_stream
149-
self.layer_idx = layer_idx
150149

151150
self.gate = Qwen3NextGate(
152151
hidden_size=self.hidden_dim,
@@ -237,7 +236,7 @@ def _compute_routed_output():
237236
do_finalize=do_finalize,
238237
)
239238

240-
return router_logits, final_hidden_states
239+
return final_hidden_states
241240

242241
def _compute_shared_output():
243242
shared_expert_output = self.shared_expert(
@@ -248,19 +247,17 @@ def _compute_shared_output():
248247
self.shared_expert_gate(hidden_states)) * shared_expert_output
249248
return shared_expert_output
250249

251-
routed_output, shared_expert_output = maybe_execute_in_parallel(
250+
final_hidden_states, shared_expert_output = maybe_execute_in_parallel(
252251
_compute_routed_output,
253252
_compute_shared_output,
254253
self.event_dict[EventType.Main],
255254
self.event_dict[EventType.MoeShared],
256255
self.aux_stream,
257256
)
258257
if not do_finalize:
259-
return routed_output[0]
260-
261-
router_logits, routed_output = routed_output
258+
return final_hidden_states
262259

263-
final_hidden_states = routed_output + shared_expert_output
260+
final_hidden_states = final_hidden_states + shared_expert_output
264261

265262
if not self.enable_attention_dp and self.mapping.tp_size > 1:
266263
final_hidden_states = self.allreduce(
@@ -611,6 +608,7 @@ def forward_decode(
611608
a = kwargs["a"]
612609
b = kwargs["b"]
613610
cache_indices = kwargs["cache_indices"]
611+
614612
mixed_qkv = causal_conv1d_update(
615613
mixed_qkv,
616614
conv_states,
@@ -647,7 +645,6 @@ def forward_decode(
647645
use_qk_l2norm_in_kernel=True,
648646
softplus_beta=1.0,
649647
softplus_threshold=20.0,
650-
layer_idx=self.layer_idx,
651648
)
652649

653650
return core_attn_out
@@ -712,6 +709,7 @@ def forward_extend(
712709
has_initial_state=has_initial_states,
713710
cache_indices=cache_indices,
714711
query_start_loc=query_start_loc).transpose(0, 1)
712+
715713
key_split_dim = self.key_dim // self.attn_tp_size
716714
value_split_dim = self.value_dim // self.attn_tp_size
717715

@@ -752,6 +750,7 @@ def forward_extend(
752750
last_recurrent_state = last_recurrent_state.to(ssm_states.dtype,
753751
copy=False)
754752
ssm_states[cache_indices] = last_recurrent_state
753+
755754
return core_attn_out
756755

757756
def forward(
@@ -1095,6 +1094,7 @@ def forward(
10951094
if spec_metadata is not None and spec_metadata.is_layer_capture(
10961095
self.layer_idx):
10971096
self.fusion_config.POST_MOE_FUSION = False
1097+
10981098
# Self Attention
10991099
hidden_states = self.self_attn(
11001100
position_ids=position_ids,
@@ -1105,6 +1105,7 @@ def forward(
11051105
lora_params=lora_params,
11061106
**kwargs,
11071107
)
1108+
11081109
if self.fusion_config.PRE_MOE_FUSION and self.enable_attention_dp:
11091110
hidden_states, residual = self.allreduce(
11101111
hidden_states,
@@ -1173,6 +1174,7 @@ def forward(
11731174
if self.next_layer_layernorm is not None:
11741175
hidden_states, residual = self.next_layer_layernorm(
11751176
hidden_states, residual)
1177+
11761178
return hidden_states, residual
11771179

11781180

@@ -1243,6 +1245,7 @@ def forward(
12431245
raise ValueError(
12441246
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
12451247
)
1248+
12461249
mamba_metadata = attn_metadata.mamba_metadata
12471250
if mamba_metadata.max_batch_size != attn_metadata.max_num_requests:
12481251
attn_metadata.mamba_metadata = Mamba2Metadata(

tensorrt_llm/_torch/modules/fla/fused_sigmoid_gating_recurrent.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ def fused_sigmoid_gating_delta_rule_update(
172172
scale: Optional[float] = None,
173173
use_qk_l2norm_in_kernel: bool = False,
174174
cu_seqlens: Optional[torch.Tensor] = None,
175-
layer_idx: int = 0,
176175
):
177176
"""
178177
Fused triton implementation of sigmoid gating delta rule update.

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,6 @@ def forward_chunk(
655655
use_dp_padding: Optional[bool] = None,
656656
repeating_info: tuple = (True, True),
657657
) -> torch.Tensor:
658-
self.layer_idx if self.layer_idx is not None else 0
659658
if isinstance(x, Fp4QuantizedTensor):
660659
assert output_dtype is not None
661660
else:
@@ -929,7 +928,6 @@ def forward_impl(
929928
num_chunks = (num_rows + self.moe_max_num_tokens -
930929
1) // self.moe_max_num_tokens
931930

932-
self.layer_idx if self.layer_idx is not None else 0
933931
if num_chunks == 1:
934932
is_first_call = self.repeat_idx == 0
935933
is_last_call = self.repeat_idx == self.repeat_count - 1

0 commit comments

Comments
 (0)