Skip to content

Commit 8fed1c2

Browse files
committed
clean up unnecessary changes
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
1 parent ab3bc32 commit 8fed1c2

File tree

7 files changed

+11
-105
lines changed

7 files changed

+11
-105
lines changed

cpp/include/tensorrt_llm/batch_manager/blockKey.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -143,23 +143,4 @@ struct BlockKeyHasher
143143
}
144144
};
145145

146-
inline std::ostream& operator<<(std::ostream& out, BlockKey const& key)
147-
{
148-
out << "BlockKey(n=" << key.uniqueTokens.size();
149-
if (!key.uniqueTokens.empty())
150-
{
151-
out << ",tokens=[";
152-
for (size_t i = 0; i < key.uniqueTokens.size(); ++i)
153-
{
154-
if (i > 0)
155-
{
156-
out << ",";
157-
}
158-
out << key.uniqueTokens[i].tokenId;
159-
}
160-
out << "]";
161-
}
162-
out << ")";
163-
return out;
164-
}
165146
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -499,20 +499,6 @@ class KVCacheBlock : public std::enable_shared_from_this<KVCacheBlock>
499499
size_t mHash;
500500
};
501501

502-
//! \brief Stream block id for trie printTree (e.g. Node prints mValue as block ids).
503-
inline std::ostream& operator<<(std::ostream& out, BlockPtr const& block)
504-
{
505-
if (block)
506-
{
507-
out << block->getBlockId();
508-
}
509-
else
510-
{
511-
out << "null";
512-
}
513-
return out;
514-
}
515-
516502
class KVCacheBlockSet
517503
{
518504
public:
@@ -1117,12 +1103,6 @@ class WindowBlockManager
11171103
mCachedBlocksRoot->setAsRoot(mLookupTree->getRoot(), mWindowSize);
11181104
}
11191105

1120-
void printTree() const
1121-
{
1122-
std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
1123-
mLookupTree->printTree();
1124-
}
1125-
11261106
private:
11271107
bool tryAllocatePlaceholderForLinearAttention(GenerationRequest& sequence, bool shareAmongBeams);
11281108

cpp/include/tensorrt_llm/batch_manager/templatedTrie.h

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,8 @@
2020
#include "tensorrt_llm/common/assert.h"
2121
#include <algorithm>
2222
#include <cstddef>
23-
#include <iostream>
2423
#include <memory>
2524
#include <optional>
26-
#include <string>
2725

2826
//
2927
// This file implements a templated trie.
@@ -165,53 +163,6 @@ class Node
165163
{
166164
}
167165

168-
//! \brief Print subtree in Unix `tree` style (├──, └──, │). NodeKey must support operator<<(std::ostream&,
169-
//! NodeKey).
170-
void printTree(int depth = 0, std::string const& prefix = "", std::optional<bool> isLast = std::nullopt) const
171-
{
172-
(void) depth;
173-
bool const isRoot = mPrevNode.expired();
174-
if (isRoot)
175-
{
176-
std::cout << ".\n";
177-
int idx = 0;
178-
int const numChildren = static_cast<int>(mNextNodes.size());
179-
for (auto const& [key, node] : mNextNodes)
180-
{
181-
node->printTree(0, "", idx == numChildren - 1);
182-
++idx;
183-
}
184-
}
185-
else
186-
{
187-
std::cout << prefix << (isLast.value() ? "└── " : "├── ") << mKey;
188-
if (!mValue.empty())
189-
{
190-
std::cout << " [";
191-
bool first = true;
192-
for (auto const& [vkey, val] : mValue)
193-
{
194-
if (!first)
195-
{
196-
std::cout << ", ";
197-
}
198-
std::cout << vkey << ":" << val;
199-
first = false;
200-
}
201-
std::cout << "]";
202-
}
203-
std::cout << "\n";
204-
int idx = 0;
205-
int const numChildren = static_cast<int>(mNextNodes.size());
206-
for (auto const& [key, node] : mNextNodes)
207-
{
208-
std::string newPrefix = prefix + (isLast.value() ? " " : "");
209-
node->printTree(0, newPrefix, idx == numChildren - 1);
210-
++idx;
211-
}
212-
}
213-
}
214-
215166
//! \brief Update the back-pointer to this node's parent.
216167
//! \details Only updates mPrevNode (the back-edge). The caller is responsible for also
217168
//! updating the old and new parent's mNextNodes forward maps: remove this node from the old
@@ -652,11 +603,6 @@ class Trie
652603
return lookupValues(nodeMatches, vkey);
653604
}
654605

655-
void printTree() const
656-
{
657-
mRoot->printTree();
658-
}
659-
660606
private:
661607
NodePtr mRoot;
662608
};

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,6 @@ class Runner : public RunnerBase
308308
int32_t const layer_idx_in_cache_pool = op.useKVCache() && host_kv_cache_pool_mapping.has_value()
309309
? host_kv_cache_pool_mapping.value().index({op.mLayerIdx, 1}).item<int32_t>()
310310
: 0;
311-
// TLLM_LOG_INFO("pool_index: %d, layer_idx_in_cache_pool: %d", pool_index, layer_idx_in_cache_pool);
312311
KVBlockArray::DataType* block_offsets
313312
= static_cast<KVBlockArray::DataType*>(op.useKVCache() && kv_cache_block_offsets.has_value()
314313
? kv_cache_block_offsets.value().index({pool_index, seq_offset}).data_ptr()

tensorrt_llm/_torch/models/modeling_qwen3_next.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ def __init__(
146146
strategy=model_config.allreduce_strategy)
147147

148148
self.aux_stream = aux_stream
149-
self.layer_idx = layer_idx
150149

151150
self.gate = Qwen3NextGate(
152151
hidden_size=self.hidden_dim,
@@ -237,7 +236,7 @@ def _compute_routed_output():
237236
do_finalize=do_finalize,
238237
)
239238

240-
return router_logits, final_hidden_states
239+
return final_hidden_states
241240

242241
def _compute_shared_output():
243242
shared_expert_output = self.shared_expert(
@@ -248,19 +247,17 @@ def _compute_shared_output():
248247
self.shared_expert_gate(hidden_states)) * shared_expert_output
249248
return shared_expert_output
250249

251-
routed_output, shared_expert_output = maybe_execute_in_parallel(
250+
final_hidden_states, shared_expert_output = maybe_execute_in_parallel(
252251
_compute_routed_output,
253252
_compute_shared_output,
254253
self.event_dict[EventType.Main],
255254
self.event_dict[EventType.MoeShared],
256255
self.aux_stream,
257256
)
258257
if not do_finalize:
259-
return routed_output[0]
260-
261-
router_logits, routed_output = routed_output
258+
return final_hidden_states
262259

263-
final_hidden_states = routed_output + shared_expert_output
260+
final_hidden_states = final_hidden_states + shared_expert_output
264261

265262
if not self.enable_attention_dp and self.mapping.tp_size > 1:
266263
final_hidden_states = self.allreduce(
@@ -611,6 +608,7 @@ def forward_decode(
611608
a = kwargs["a"]
612609
b = kwargs["b"]
613610
cache_indices = kwargs["cache_indices"]
611+
614612
mixed_qkv = causal_conv1d_update(
615613
mixed_qkv,
616614
conv_states,
@@ -647,7 +645,6 @@ def forward_decode(
647645
use_qk_l2norm_in_kernel=True,
648646
softplus_beta=1.0,
649647
softplus_threshold=20.0,
650-
layer_idx=self.layer_idx,
651648
)
652649

653650
return core_attn_out
@@ -712,6 +709,7 @@ def forward_extend(
712709
has_initial_state=has_initial_states,
713710
cache_indices=cache_indices,
714711
query_start_loc=query_start_loc).transpose(0, 1)
712+
715713
key_split_dim = self.key_dim // self.attn_tp_size
716714
value_split_dim = self.value_dim // self.attn_tp_size
717715

@@ -752,6 +750,7 @@ def forward_extend(
752750
last_recurrent_state = last_recurrent_state.to(ssm_states.dtype,
753751
copy=False)
754752
ssm_states[cache_indices] = last_recurrent_state
753+
755754
return core_attn_out
756755

757756
def forward(
@@ -1095,6 +1094,7 @@ def forward(
10951094
if spec_metadata is not None and spec_metadata.is_layer_capture(
10961095
self.layer_idx):
10971096
self.fusion_config.POST_MOE_FUSION = False
1097+
10981098
# Self Attention
10991099
hidden_states = self.self_attn(
11001100
position_ids=position_ids,
@@ -1105,6 +1105,7 @@ def forward(
11051105
lora_params=lora_params,
11061106
**kwargs,
11071107
)
1108+
11081109
if self.fusion_config.PRE_MOE_FUSION and self.enable_attention_dp:
11091110
hidden_states, residual = self.allreduce(
11101111
hidden_states,
@@ -1173,6 +1174,7 @@ def forward(
11731174
if self.next_layer_layernorm is not None:
11741175
hidden_states, residual = self.next_layer_layernorm(
11751176
hidden_states, residual)
1177+
11761178
return hidden_states, residual
11771179

11781180

@@ -1243,6 +1245,7 @@ def forward(
12431245
raise ValueError(
12441246
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
12451247
)
1248+
12461249
mamba_metadata = attn_metadata.mamba_metadata
12471250
if mamba_metadata.max_batch_size != attn_metadata.max_num_requests:
12481251
attn_metadata.mamba_metadata = Mamba2Metadata(

tensorrt_llm/_torch/modules/fla/fused_sigmoid_gating_recurrent.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ def fused_sigmoid_gating_delta_rule_update(
172172
scale: Optional[float] = None,
173173
use_qk_l2norm_in_kernel: bool = False,
174174
cu_seqlens: Optional[torch.Tensor] = None,
175-
layer_idx: int = 0,
176175
):
177176
"""
178177
Fused triton implementation of sigmoid gating delta rule update.

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,6 @@ def forward_chunk(
655655
use_dp_padding: Optional[bool] = None,
656656
repeating_info: tuple = (True, True),
657657
) -> torch.Tensor:
658-
self.layer_idx if self.layer_idx is not None else 0
659658
if isinstance(x, Fp4QuantizedTensor):
660659
assert output_dtype is not None
661660
else:
@@ -929,7 +928,6 @@ def forward_impl(
929928
num_chunks = (num_rows + self.moe_max_num_tokens -
930929
1) // self.moe_max_num_tokens
931930

932-
self.layer_idx if self.layer_idx is not None else 0
933931
if num_chunks == 1:
934932
is_first_call = self.repeat_idx == 0
935933
is_last_call = self.repeat_idx == self.repeat_count - 1

0 commit comments

Comments
 (0)