Skip to content

Commit c38b55c

Browse files
[mxfp8 training] triton_to_mxfp8_dim0 nan handling consistent with torch reference
stack-info: PR: #4201, branch: danielvegamyhre/stack/162
1 parent ce07646 commit c38b55c

File tree

3 files changed

+322
-63
lines changed

3 files changed

+322
-63
lines changed

test/prototype/mx_formats/test_kernels.py

Lines changed: 213 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -472,9 +472,7 @@ def test_triton_mxfp8_dim1_randn(M, K, scaling_mode):
472472
)
473473
@pytest.mark.parametrize("M", (128, 256))
474474
@pytest.mark.parametrize("K", (128, 256))
475-
@pytest.mark.parametrize(
476-
"scaling_mode", (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
477-
)
475+
@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.RCEIL,))
478476
def test_triton_mxfp8_dim0_randn(M, K, scaling_mode):
479477
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
480478
x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(
@@ -625,3 +623,215 @@ def test_cuda_mx_dim0_not_supported():
625623
rowwise=True,
626624
colwise=False,
627625
)
626+
627+
628+
@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
629+
@pytest.mark.skipif(
630+
not is_sm_at_least_100() and not is_MI350(),
631+
reason="mxfp8 requires CUDA capability 10.0 or greater or ROCm gfx950 or greater.",
632+
)
633+
@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.RCEIL,))
634+
def test_triton_mxfp8_dim0_special_values(scaling_mode: ScaleCalculationMode):
635+
# Create tensor with special values - make it compatible with block_size=32
636+
block_size = 32
637+
special_vals = torch.zeros(2, block_size, dtype=torch.bfloat16, device="cuda")
638+
639+
# Fill first few elements of each row with special values
640+
special_vals[0, :4] = torch.tensor(
641+
[float("inf"), -float("inf"), float("nan"), 0.0], dtype=torch.bfloat16
642+
)
643+
special_vals[1, :4] = torch.tensor(
644+
[
645+
torch.finfo(torch.float32).max,
646+
torch.finfo(torch.float32).min,
647+
torch.finfo(torch.float32).tiny,
648+
-torch.finfo(torch.float32).tiny,
649+
],
650+
dtype=torch.bfloat16,
651+
)
652+
653+
x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(
654+
special_vals, block_size=block_size, scaling_mode=scaling_mode
655+
)
656+
x_mx_t, x_s_t = triton_to_mxfp8_dim0(
657+
special_vals,
658+
inner_block_size=block_size,
659+
scaling_mode=scaling_mode.value.lower(),
660+
)
661+
x_mx_t = x_mx_t.to(torch.float32)
662+
x_s_t = x_s_t.to(torch.uint8)
663+
x_mx_ref = x_mx_ref.to(torch.float32)
664+
x_s_ref = x_s_ref.to(torch.uint8)
665+
666+
# Check for NaNs in output (allow NaNs if input had NaNs, but check scales)
667+
input_has_nan = special_vals.isnan().any()
668+
if not input_has_nan:
669+
assert not x_mx_t.isnan().any(), (
670+
"quantized tensor should not contain NaNs when input has no NaNs"
671+
)
672+
assert not x_s_t.isnan().any(), (
673+
"scales should not contain NaNs when input has no NaNs"
674+
)
675+
676+
# Use NaN-aware comparison to handle nan != nan case properly
677+
# Check NaN patterns match
678+
nan_ref = torch.isnan(x_mx_ref)
679+
nan_triton = torch.isnan(x_mx_t)
680+
assert torch.equal(nan_ref, nan_triton), (
681+
"NaN pattern mismatch between reference and triton"
682+
)
683+
684+
# Check finite values
685+
finite_mask = torch.isfinite(x_mx_ref) & torch.isfinite(x_mx_t)
686+
if finite_mask.any():
687+
assert torch.equal(x_mx_ref[finite_mask], x_mx_t[finite_mask]), (
688+
"Finite values mismatch"
689+
)
690+
691+
# Check infinity patterns
692+
inf_ref = torch.isinf(x_mx_ref)
693+
inf_triton = torch.isinf(x_mx_t)
694+
assert torch.equal(inf_ref, inf_triton), (
695+
"Infinity pattern mismatch between reference and triton"
696+
)
697+
if inf_ref.any():
698+
assert torch.equal(x_mx_ref[inf_ref], x_mx_t[inf_ref]), (
699+
"Infinity values mismatch"
700+
)
701+
702+
# Check scales using exact comparison
703+
x_s_ref_uint8 = x_s_ref.to(torch.uint8)
704+
x_s_t_uint8 = x_s_t.to(torch.uint8)
705+
assert torch.equal(x_s_t_uint8, x_s_ref_uint8), (
706+
"Scale values mismatch between reference and triton"
707+
)
708+
709+
710+
@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
711+
@pytest.mark.skipif(
712+
not is_sm_at_least_100() and not is_MI350(),
713+
reason="mxfp8 requires CUDA capability 10.0 or greater or ROCm gfx950 or greater.",
714+
)
715+
@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.RCEIL,))
716+
def test_triton_mxfp8_dim0_overflow_underflow(scaling_mode):
717+
"""Test with values near overflow and underflow thresholds."""
718+
# Values near float8_e4m3fn limits
719+
f8_max = torch.finfo(torch.float8_e4m3fn).max # ~448
720+
f8_min = torch.finfo(torch.float8_e4m3fn).tiny # ~1.95e-06
721+
block_size = 32
722+
723+
overflow_vals = torch.zeros(4, block_size, dtype=torch.bfloat16, device="cuda")
724+
725+
# Fill first few elements of each row with overflow/underflow values
726+
overflow_vals[0, :4] = torch.tensor(
727+
[f8_max * 0.9, f8_max * 1.1, f8_max * 2.0, f8_max * 10.0], dtype=torch.bfloat16
728+
)
729+
overflow_vals[1, :4] = torch.tensor(
730+
[-f8_max * 0.9, -f8_max * 1.1, -f8_max * 2.0, -f8_max * 10.0],
731+
dtype=torch.bfloat16,
732+
)
733+
overflow_vals[2, :4] = torch.tensor(
734+
[f8_min * 0.1, f8_min * 0.5, f8_min * 2.0, f8_min * 10.0], dtype=torch.bfloat16
735+
)
736+
overflow_vals[3, :4] = torch.tensor(
737+
[-f8_min * 0.1, -f8_min * 0.5, -f8_min * 2.0, -f8_min * 10.0],
738+
dtype=torch.bfloat16,
739+
)
740+
741+
x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(
742+
overflow_vals, block_size=block_size, scaling_mode=scaling_mode
743+
)
744+
x_mx_t, x_s_t = triton_to_mxfp8_dim0(
745+
overflow_vals,
746+
inner_block_size=block_size,
747+
scaling_mode=scaling_mode.value.lower(),
748+
)
749+
750+
assert not x_mx_t.isnan().any(), "quantized tensor should not contain NaNs"
751+
assert not x_s_t.isnan().any(), "scales should not contain NaNs"
752+
torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
753+
torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
754+
755+
756+
@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
757+
@pytest.mark.skipif(
758+
not is_sm_at_least_100() and not is_MI350(),
759+
reason="mxfp8 requires CUDA capability 10.0 or greater or ROCm gfx950 or greater.",
760+
)
761+
@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.RCEIL,))
762+
def test_triton_mxfp8_dim0_extreme_range(scaling_mode):
763+
"""Test with tensors containing both very large and very small values."""
764+
# Mix of extreme values in same tensor to test scaling edge cases
765+
block_size = 32
766+
extreme_vals = torch.zeros(4, block_size, dtype=torch.bfloat16, device="cuda")
767+
768+
# Fill first few elements with extreme values
769+
extreme_vals[0, :4] = torch.tensor([1e30, 1e-30, 1e20, 1e-20], dtype=torch.bfloat16)
770+
extreme_vals[1, :4] = torch.tensor(
771+
[-1e30, -1e-30, -1e20, -1e-20], dtype=torch.bfloat16
772+
)
773+
extreme_vals[2, :4] = torch.tensor(
774+
[torch.finfo(torch.float32).max, torch.finfo(torch.float32).tiny, 1.0, -1.0],
775+
dtype=torch.bfloat16,
776+
)
777+
extreme_vals[3, :4] = torch.tensor([0.0, 1e-40, 1e40, -1e40], dtype=torch.bfloat16)
778+
779+
x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(
780+
extreme_vals, block_size=block_size, scaling_mode=scaling_mode
781+
)
782+
x_mx_t, x_s_t = triton_to_mxfp8_dim0(
783+
extreme_vals,
784+
inner_block_size=block_size,
785+
scaling_mode=scaling_mode.value.lower(),
786+
)
787+
788+
assert not x_mx_t.isnan().any(), "quantized tensor should not contain NaNs"
789+
assert not x_s_t.isnan().any(), "scales should not contain NaNs"
790+
torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
791+
torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
792+
793+
794+
@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
795+
@pytest.mark.skipif(
796+
not is_sm_at_least_100() and not is_MI350(),
797+
reason="mxfp8 requires CUDA capability 10.0 or greater or ROCm gfx950 or greater.",
798+
)
799+
@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.RCEIL,))
800+
def test_triton_mxfp8_dim0_denormals_subnormals(scaling_mode):
801+
"""Test with denormal/subnormal values that might cause precision issues."""
802+
# Create values in the denormal range
803+
bf16_tiny = torch.finfo(torch.bfloat16).tiny
804+
f32_tiny = torch.finfo(torch.float32).tiny
805+
block_size = 32
806+
807+
denormal_vals = torch.zeros(4, block_size, dtype=torch.bfloat16, device="cuda")
808+
809+
# Fill first few elements with denormal values
810+
denormal_vals[0, :4] = torch.tensor(
811+
[bf16_tiny, bf16_tiny * 0.5, bf16_tiny * 0.1, bf16_tiny * 2.0],
812+
dtype=torch.bfloat16,
813+
)
814+
denormal_vals[1, :4] = torch.tensor(
815+
[f32_tiny, f32_tiny * 0.5, f32_tiny * 0.1, f32_tiny * 2.0], dtype=torch.bfloat16
816+
)
817+
denormal_vals[2, :4] = torch.tensor(
818+
[-bf16_tiny, -bf16_tiny * 0.5, -bf16_tiny * 0.1, -bf16_tiny * 2.0],
819+
dtype=torch.bfloat16,
820+
)
821+
denormal_vals[3, :4] = torch.tensor(
822+
[1e-40, 1e-38, 1e-36, 1e-34], dtype=torch.bfloat16
823+
) # Very small values
824+
825+
x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(
826+
denormal_vals, block_size=block_size, scaling_mode=scaling_mode
827+
)
828+
x_mx_t, x_s_t = triton_to_mxfp8_dim0(
829+
denormal_vals,
830+
inner_block_size=block_size,
831+
scaling_mode=scaling_mode.value.lower(),
832+
)
833+
834+
assert not x_mx_t.isnan().any(), "quantized tensor should not contain NaNs"
835+
assert not x_s_t.isnan().any(), "scales should not contain NaNs"
836+
torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
837+
torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)

torchao/prototype/mx_formats/kernels.py

Lines changed: 63 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -472,27 +472,29 @@ def triton_mxfp8_dequant_dim0(
472472

473473
@triton.jit
474474
def _triton_calculate_scale_rceil(x, axis, USE_PTX: tl.constexpr):
475+
"""
476+
Calculates and returns reciprocal scale using RCEIL rounding mode
477+
"""
475478
# There is no good support for accessing globals from a jit'ed triton
476479
# function, so we redefine them here. Since this is prototype code which
477480
# we plan to remove after torch.compile catches up, this is fine.
478481
e8m0_exponent_bias = 127
479-
fp32_mbits = 23
480482

481483
# Find the maximum absolute value for each row
482484
max_abs = tl.max(x, axis=axis)
483485

484486
F8E4M3_MAX_RCP: tl.constexpr = 1.0 / 448.0
485487

488+
# Calculate scale input like CUDA: amax * max_norm_rcp
489+
scale_input = max_abs * F8E4M3_MAX_RCP
490+
491+
# Handle special values at scale calculation level (like CUDA float_to_e8m0)
492+
# Ref: https://github.com/NVIDIA/TransformerEngine/blob/b7598aa887eb7d619d64c90692980009669379bf/transformer_engine/common/util/ptx.cuh#L332-L341
493+
is_nan = scale_input != scale_input # NaN check
494+
is_inf = tl.abs(scale_input) == float("inf") # Inf check
495+
486496
if USE_PTX:
487-
# RCEIL scaling mode using PTX instruction supported on sm100.
488-
# The input should be: amax / 448.0
489-
# where 448.0 is the max representable value in FP8 E4M3 format.
490-
scale_input = max_abs.to(tl.float32) * F8E4M3_MAX_RCP
491-
492-
# The PTX instruction outputs a packed uint16 where:
493-
# - high byte = E8M0 of first input (0.0 in our case)
494-
# - low byte = E8M0 of second input (scale_input)
495-
# Casting uint16 to uint8 naturally truncates to the low byte.
497+
# Use PTX instruction for normal values
496498
scale_e8m0_biased = tl.inline_asm_elementwise(
497499
asm="cvt.rp.satfinite.ue8m0x2.f32 $0, 0.0, $1;",
498500
constraints="=h,r",
@@ -502,35 +504,53 @@ def _triton_calculate_scale_rceil(x, axis, USE_PTX: tl.constexpr):
502504
pack=1,
503505
).to(tl.uint8)
504506
else:
505-
# Original rceil implementation described in https://docs.nvidia.com/cuda/cublas/#d-block-quantization
506-
descale = max_abs * F8E4M3_MAX_RCP
507-
508-
# Clamp to exponents that can be represented in e8m0
507+
# Fallback implementation
509508
scale_e8m0_unbiased = tl.clamp(
510-
tl.ceil(tl.log2(descale)),
509+
tl.ceil(tl.log2(scale_input)),
511510
min=-1 * e8m0_exponent_bias,
512511
max=e8m0_exponent_bias,
513512
)
513+
scale_e8m0_biased = (scale_e8m0_unbiased + 127).to(tl.uint8)
514514

515-
# Create the biased e8m0 representation and cast it to 8 bits
516-
# Set NaN values to 0xFF
517-
is_nan = descale != descale
518-
scale_e8m0_biased = tl.where(is_nan, 0xFF, scale_e8m0_unbiased + 127)
519-
scale_e8m0_biased = scale_e8m0_biased.to(tl.uint8)
515+
# Apply special value overrides (like CUDA)
516+
# Ref: https://github.com/NVIDIA/TransformerEngine/blob/b7598aa887eb7d619d64c90692980009669379bf/transformer_engine/common/util/ptx.cuh#L332-L341
517+
scale_e8m0_biased = tl.where(is_nan, 255, scale_e8m0_biased) # 0xFF for NaN
518+
scale_e8m0_biased = tl.where(is_inf, 254, scale_e8m0_biased) # 0xFE for inf
520519

521-
# TODO(future PR): add NaN handling here,
522-
# https://github.com/pytorch/pytorch/pull/100572 will likely be useful to
523-
# get proper NaN propagation working
524-
# Calculate the scale in floating point.
525-
scale_fp = (scale_e8m0_biased.to(tl.int32) << fp32_mbits).to(
526-
tl.float32, bitcast=True
527-
)
520+
# Efficient reciprocal calculation (like CUDA exp2f_rcp)
521+
FP32_MANTISSA_BITS: tl.constexpr = 23
528522

529-
fp32_exp_bias = 127.0
530-
fp32_min_normal = tl.exp2(-fp32_exp_bias + 1)
531-
scale_fp = tl.clamp(scale_fp, min=fp32_min_normal, max=float("inf"))
523+
# Equivalent CUDA per-thread code is more readable, copying here as documentation:
524+
#
525+
# __device__ __forceinline__ float exp2f_rcp(e8m0_t biased_exp) {
526+
# // Handle the special case of NaN.
527+
# if (biased_exp == 255) return __int_as_float(0x7fffffff);
528+
#
529+
# // Handle the special case where the unbiased exponent is 127, so the reciprocal is 2^-127 which needs the first bit of
530+
# // the mantissa to be 1, which can't be obtained by shifting `FP32_MANTISSA_BITS` bits to the left.
531+
# if (biased_exp == 254) return __int_as_float(0x00400000);
532+
#
533+
# // Fast calculation when the unbiased exp is in [-126, 126], and only the exponent part is used to express the reciprocal.
534+
# return __int_as_float((254 - biased_exp) << FP32_MANTISSA_BITS);
535+
# }
536+
descale_fp = tl.where(
537+
scale_e8m0_biased == 255, # NaN case -> return NaN
538+
float("nan"),
539+
tl.where(
540+
scale_e8m0_biased == 254, # Inf case -> return 2^-127
541+
2**-127,
542+
tl.where(
543+
scale_e8m0_biased == 0, # Zero case -> return 1.0 (no scaling)
544+
1.0,
545+
# Normal case: fast bit manipulation (254 - biased_exp) << 23
546+
((254 - scale_e8m0_biased).to(tl.int32) << FP32_MANTISSA_BITS).to(
547+
tl.float32, bitcast=True
548+
),
549+
),
550+
),
551+
)
532552

533-
return scale_fp, scale_e8m0_biased
553+
return descale_fp, scale_e8m0_biased
534554

535555
@triton.jit
536556
def _triton_calculate_scale_floor(
@@ -793,25 +813,23 @@ def to_mxfp8_dim0_kernel(
793813
# Find the maximum absolute value for each row (across columns)
794814
# shape: (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE,)
795815
if SCALING_MODE == "rceil":
796-
scale_fp32_r, scale_e8m0_r = _triton_calculate_scale_rceil(
816+
descale_fp32_r, scale_e8m0_r = _triton_calculate_scale_rceil(
797817
x_block_abs_r,
798818
axis=1,
799819
USE_PTX=not IS_ROCM,
800820
)
801821
else:
802822
tl.static_assert(SCALING_MODE == "floor")
803-
scale_fp32_r, scale_e8m0_r = _triton_calculate_scale_floor(
823+
descale_fp32_r, scale_e8m0_r = _triton_calculate_scale_floor(
804824
x_block_abs_r,
805825
axis=1,
806826
)
807827

808-
# Divide each row by scale
809-
# Broadcasting scale to match x_block's shape
810-
# x_block_r shape:
811-
# (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, SCALE_BLOCK_SIZE)
812-
# scale[:, None] shape:
813-
# (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, 1)
814-
scaled_data_r = x_block_r / scale_fp32_r[:, None]
828+
# Broadcast descale to match x_block's shape
829+
descale_broadcast = descale_fp32_r[:, None]
830+
831+
# Scale the data
832+
scaled_data_r = x_block_r * descale_broadcast
815833

816834
# Reshape back to original tile size
817835
e4m3_data_2d = tl.reshape(scaled_data_r, ROW_TILE_SIZE, COL_TILE_SIZE).to(
@@ -821,8 +839,10 @@ def to_mxfp8_dim0_kernel(
821839
# Store the row-normalized result in row-major format
822840
tl.store(output_ptr + row_major_offsets, e4m3_data_2d, mask=mask)
823841

824-
# Calculate scale offsets to write to
842+
# Store e8m0 scales
825843
scales_per_row = n_cols // SCALE_BLOCK_SIZE
844+
845+
# Calculate scale storage offsets and mask
826846
scale_row_indices = (
827847
pid_row * ROW_TILE_SIZE + tl.arange(0, ROW_TILE_SIZE)[:, None]
828848
)
@@ -831,9 +851,9 @@ def to_mxfp8_dim0_kernel(
831851
+ tl.arange(0, SCALE_BLOCKS_PER_COL_TILE)[None, :]
832852
)
833853
scale_offsets = scale_row_indices * scales_per_row + scale_col_indices
834-
835-
# Store e8m0 scales
836854
scale_mask = (scale_row_indices < n_rows) & (scale_col_indices < scales_per_row)
855+
856+
# Reshape scale values to 2D and store
837857
scale_e8m0_2d = scale_e8m0_r.reshape(ROW_TILE_SIZE, SCALE_BLOCKS_PER_COL_TILE)
838858
tl.store(scale_ptr + scale_offsets, scale_e8m0_2d, mask=scale_mask)
839859

0 commit comments

Comments
 (0)