Skip to content

Commit 197a0f7

Browse files
authored
[BugFix] fix VL fp8 bug when moe token_num is 0 (#4929)
* [BugFix] fix VL fp8 bug when moe token_num is 0
* fix bug
1 parent 0a6981f commit 197a0f7

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ def apply(
248248
"""
249249
Triton compute Fused MoE.
250250
"""
251+
x = paddle.concat([x, paddle.ones([1, layer.hidden_size], dtype=x.dtype)])
251252
gate_out = gate(x.cast("float32"))
252253
token_num = x.shape[0]
253254
top_k = layer.top_k
@@ -395,6 +396,7 @@ def apply(
395396
if layer.reduce_results and layer.tp_size > 1:
396397
out = tensor_model_parallel_all_reduce(out)
397398

399+
out = out[:-1]
398400
return out
399401

400402

@@ -601,6 +603,7 @@ def apply(
601603
"""
602604
Triton compute Fused MoE.
603605
"""
606+
x = paddle.concat([x, paddle.ones([1, layer.hidden_size], dtype=x.dtype)])
604607
gate_out = gate(x.cast("float32"))
605608
token_num = x.shape[0]
606609
top_k = layer.top_k
@@ -769,6 +772,7 @@ def apply(
769772
if layer.reduce_results and layer.tp_size > 1:
770773
out = tensor_model_parallel_all_reduce(out)
771774

775+
out = out[:-1]
772776
return out
773777

774778

@@ -891,6 +895,7 @@ def apply(
891895
"""
892896
Triton compute Fused MoE.
893897
"""
898+
x = paddle.concat([x, paddle.ones([1, layer.hidden_size], dtype=x.dtype)])
894899
gate_out = gate(x.cast("float32"))
895900
token_num = x.shape[0]
896901
top_k = layer.top_k
@@ -1058,6 +1063,7 @@ def apply(
10581063
if layer.tp_size > 1:
10591064
out = tensor_model_parallel_all_reduce(out)
10601065

1066+
out = out[:-1]
10611067
return out
10621068

10631069

@@ -1315,6 +1321,7 @@ def apply(
13151321
"""
13161322
Triton compute Fused MoE.
13171323
"""
1324+
x = paddle.concat([x, paddle.ones([1, layer.hidden_size], dtype=x.dtype)])
13181325
gate_out = gate(x.cast("float32"))
13191326
token_num = x.shape[0]
13201327
top_k = layer.top_k
@@ -1462,4 +1469,5 @@ def apply(
14621469
if layer.tp_size > 1:
14631470
out = tensor_model_parallel_all_reduce(out)
14641471

1472+
out = out[:-1]
14651473
return out

0 commit comments

Comments (0)