Skip to content

Commit 96a9cdf

Browse files
[mxfp8 moe training] remove unused block_size arg (#4177)
1 parent efbcb0e commit 96a9cdf

File tree

6 files changed

+13
-29
lines changed

6 files changed

+13
-29
lines changed

benchmarks/prototype/moe_training/mxfp8/bench_ep_pipeline.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,6 @@ def mxfp8_pipeline(
226226
mx_permuted,
227227
expert_weights_t,
228228
offs=mx_group_offsets,
229-
block_size=block_size,
230229
wgrad_with_hp=True,
231230
)
232231

benchmarks/prototype/moe_training/mxfp8/roofline_unified.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ def wrapper():
436436
return time_ms
437437

438438

439-
def benchmark_mxfp8_grouped_mm_fwd_bwd(x, w_t, offs, labels, block_size=32):
439+
def benchmark_mxfp8_grouped_mm_fwd_bwd(x, w_t, offs, labels):
440440
"""Benchmark _to_mxfp8_then_scaled_grouped_mm forward + backward"""
441441
x_clone = x.clone().requires_grad_(True)
442442
w_t_clone = w_t.clone().requires_grad_(True)
@@ -447,7 +447,6 @@ def benchmark_mxfp8_grouped_mm_fwd_bwd(x, w_t, offs, labels, block_size=32):
447447
A = x_clone
448448
B_t = w_t_clone
449449
offs_arg = offs
450-
block_size_arg = block_size
451450
out_dtype = torch.bfloat16
452451
kernel_preference = KernelPreference.AUTO
453452
wgrad_with_hp = False
@@ -458,7 +457,6 @@ def wrapper():
458457
A,
459458
B_t,
460459
offs_arg,
461-
block_size_arg,
462460
out_dtype,
463461
kernel_preference,
464462
wgrad_with_hp,

test/prototype/moe_training/ep/test_compile.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ def standard_pipeline(
8686
permuted,
8787
expert_weights_t,
8888
offs=offsets,
89-
block_size=block_size,
9089
wgrad_with_hp=True,
9190
)
9291

@@ -154,7 +153,6 @@ def mxfp8_pipeline(
154153
mx_permuted,
155154
expert_weights_t,
156155
offs=mx_group_offsets,
157-
block_size=block_size,
158156
wgrad_with_hp=True,
159157
)
160158

test/prototype/moe_training/ep/test_integration.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,6 @@ def test_full_pipeline(self):
230230
mx_permuted,
231231
expert_weights.transpose(-2, -1),
232232
offs=mx_group_offsets,
233-
block_size=block_size,
234233
# wgrad_with_hp must be true if inputs are pre-quantized (MXTensor)
235234
wgrad_with_hp=True,
236235
)

test/prototype/moe_training/test_mxfp8_grouped_mm.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,6 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
165165
"torch native dynamic per group pad/unpad functions do not work with torch.compile yet: https://github.com/pytorch/pytorch/issues/176770"
166166
)
167167

168-
block_size = 32
169168
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
170169
w = torch.randn(
171170
num_experts,
@@ -194,7 +193,6 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
194193
x,
195194
w_t,
196195
offs=offs,
197-
block_size=block_size,
198196
kernel_preference=kernel_preference,
199197
wgrad_with_hp=wgrad_with_hp,
200198
scale_calculation_mode=scale_mode,
@@ -262,7 +260,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_matches_dynamic():
262260
x_mx,
263261
w_t,
264262
offs=offs,
265-
block_size=block_size,
266263
out_dtype=torch.bfloat16,
267264
kernel_preference=KernelPreference.EMULATED,
268265
wgrad_with_hp=True,
@@ -272,7 +269,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_matches_dynamic():
272269
x_ref,
273270
w_t_ref,
274271
offs=offs,
275-
block_size=block_size,
276272
out_dtype=torch.bfloat16,
277273
kernel_preference=KernelPreference.EMULATED,
278274
wgrad_with_hp=True,
@@ -334,7 +330,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_forward():
334330
x_mx,
335331
w_t,
336332
offs=offs,
337-
block_size=block_size,
338333
out_dtype=torch.bfloat16,
339334
kernel_preference=KernelPreference.EMULATED,
340335
wgrad_with_hp=True,
@@ -344,7 +339,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_forward():
344339
x,
345340
w_t,
346341
offs=offs,
347-
block_size=block_size,
348342
out_dtype=torch.bfloat16,
349343
kernel_preference=KernelPreference.EMULATED,
350344
wgrad_with_hp=True,
@@ -392,7 +386,6 @@ def test_mxfp8_grouped_gemm_mxtensor_requires_wgrad_with_hp():
392386
x_mx,
393387
w_t,
394388
offs=offs,
395-
block_size=block_size,
396389
out_dtype=torch.bfloat16,
397390
kernel_preference=KernelPreference.EMULATED,
398391
wgrad_with_hp=False,

torchao/prototype/moe_training/mxfp8_grouped_mm.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ def _to_mxfp8_then_scaled_grouped_mm(
8383
A: torch.Tensor,
8484
B_t: torch.Tensor,
8585
offs: Optional[torch.Tensor] = None,
86-
block_size: Optional[int] = None,
8786
out_dtype: Optional[torch.dtype] = torch.bfloat16,
8887
kernel_preference: KernelPreference = KernelPreference.AUTO,
8988
wgrad_with_hp: bool = False,
@@ -103,7 +102,6 @@ def _to_mxfp8_then_scaled_grouped_mm(
103102
which must be 3D, which must be shape (G, K, N)
104103
and in "per group column-major memory" layout (i.e., strides of (N*K, 1, N)).
105104
offs (int32 torch.Tensor): The offsets to use to mark the end index of each group along the dim0 of the A tensor.
106-
block_size (int): Block size for MXFP8 quantization. Must be 32 (the only supported value). This parameter exists for backward compatibility but is ignored.
107105
out_dtype (torch.dtype): Output dtype for the result. Defaults to torch.bfloat16.
108106
kernel_preference (KernelPreference): Kernel preference (AUTO uses CUDA/Triton, EMULATED uses to_mx). Defaults to KernelPreference.AUTO.
109107
wgrad_with_hp (bool): Whether to compute weight gradient in high precision. Defaults to False.
@@ -120,7 +118,6 @@ def _to_mxfp8_then_scaled_grouped_mm(
120118
A,
121119
B_t,
122120
offs,
123-
block_size,
124121
out_dtype,
125122
kernel_preference,
126123
wgrad_with_hp,
@@ -144,7 +141,6 @@ def forward(
144141
input_act: torch.Tensor,
145142
weight_t: torch.Tensor,
146143
group_end_offsets: Optional[torch.Tensor] = None,
147-
block_size: int = 32,
148144
out_dtype: Optional[torch.dtype] = torch.bfloat16,
149145
kernel_preference: KernelPreference = KernelPreference.AUTO,
150146
wgrad_with_hp: bool = False,
@@ -158,15 +154,18 @@ def forward(
158154
input_act: Input activations, shape (M, K) - may be MXTensor or high-precision
159155
weight_t: Expert weights transposed, shape (E, K, N) - always high-precision
160156
group_end_offsets: End index of each token group, shape (E,)
161-
block_size: Block size for MXFP8 quantization (must be 32)
162157
out_dtype: Output dtype (bfloat16 or float32)
163158
kernel_preference: Kernel preference (AUTO uses CUDA/Triton, EMULATED uses to_mx)
164159
wgrad_with_hp: Compute weight gradient in high precision
165160
scale_calculation_mode: Mode for scale calculation (RCEIL, FLOOR, etc.)
161+
pad_token_groups_for_grouped_mm: Whether to pad token groups to the next multiple of 32
166162
167163
Returns:
168164
Output tensor, shape (M, N)
169165
"""
166+
# block_size is always 32 for MXFP8
167+
block_size = 32
168+
170169
assert kernel_preference in (
171170
KernelPreference.AUTO,
172171
KernelPreference.EMULATED,
@@ -182,7 +181,6 @@ def forward(
182181
# Input validation
183182
assert input_act.ndim == 2, "input_act must be 2D"
184183
assert weight_t.ndim == 3, "weight_t must be 3D"
185-
assert block_size == 32, "Only block_size=32 is supported"
186184
assert group_end_offsets is not None, (
187185
"group_end_offsets must be provided for 2d-3d grouped mm"
188186
)
@@ -247,7 +245,6 @@ def forward(
247245
padded_group_start_offsets,
248246
padded_group_end_offsets,
249247
)
250-
ctx.block_size = block_size
251248
ctx.out_dtype = out_dtype
252249
ctx.kernel_preference = kernel_preference
253250
ctx.wgrad_with_hp = wgrad_with_hp
@@ -279,7 +276,8 @@ def backward(ctx, grad_output: torch.Tensor):
279276
padded_group_end_offsets,
280277
) = ctx.saved_tensors
281278

282-
block_size = ctx.block_size
279+
# block_size is always 32 for MXFP8
280+
block_size = 32
283281
out_dtype = ctx.out_dtype
284282
kernel_preference = ctx.kernel_preference
285283
wgrad_with_hp = ctx.wgrad_with_hp
@@ -338,13 +336,12 @@ def backward(ctx, grad_output: torch.Tensor):
338336
return (
339337
grad_input,
340338
grad_weight_t,
341-
None,
342-
None,
343-
None,
344-
None,
345-
None,
346-
None,
347-
None,
339+
None, # group_end_offsets
340+
None, # out_dtype
341+
None, # kernel_preference
342+
None, # wgrad_with_hp
343+
None, # scale_calculation_mode
344+
None, # pad_token_groups_for_grouped_mm
348345
)
349346

350347

0 commit comments

Comments (0)