Skip to content

Commit 02105d4

Browse files
[mxfp8 training] add cutedsl kernel for mxfp8 quantization along dim0 (#4156)
* [mxfp8 moe training] add cutedsl to quantize 2d tensor along dim0 * vectorized stores for scales * 5.1 tb/s * iter along k; 5.8-6.4 tb/s * add bench script * refactor to use shared cute utils * add docstrings, update variable names for clarity * update var names
1 parent d17c61b commit 02105d4

File tree

10 files changed

+1720
-200
lines changed

10 files changed

+1720
-200
lines changed

benchmarks/mx_formats/cast_bench.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def run(
109109
print(f"triton version: {triton.__version__}")
110110
print(f"mode: {mode}")
111111
assert mode in (
112+
"memcpy",
112113
"dim0",
113114
"dim1",
114115
"dim0_dim1",
@@ -125,11 +126,31 @@ def run(
125126
"dim1_mxfp8_triton_rceil",
126127
"dim1_mxfp8_cuda_floor",
127128
"dim1_mxfp8_cuda_rceil",
129+
"dim0_mxfp8_cutedsl_2d_floor",
130+
"dim0_mxfp8_cutedsl_2d_rceil",
128131
)
129132

130133
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
131134

132-
if mode == "dim0":
135+
if mode == "memcpy":
136+
# Baseline memcpy benchmark to establish max achievable bandwidth
137+
y = torch.randn_like(x)
138+
139+
# Warmup
140+
for _ in range(2):
141+
y.copy_(x)
142+
143+
time_us = benchmark_cuda_function_in_microseconds(
144+
lambda src, dst: dst.copy_(src),
145+
x,
146+
y,
147+
)
148+
149+
# bytes_read + bytes_written
150+
bytes_rw = 2 * x.numel() * bytes_per_el_bf16
151+
bps = bytes_rw / (time_us / 1e6)
152+
153+
elif mode == "dim0":
133154
scale_dim0_reference_c = torch.compile(scale_dim0_reference)
134155
y_d0, s_d0 = scale_dim0_reference_c(x, BLOCK_SIZE)
135156

@@ -452,6 +473,54 @@ def run(
452473
bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
453474
bps = (bytes_r + bytes_w) / (time_us / 1e6)
454475

476+
elif mode == "dim0_mxfp8_cutedsl_2d_floor":
477+
from torchao.prototype.moe_training.kernels.mxfp8 import mxfp8_quantize_cuda_2d
478+
479+
y_d0, s_d0 = mxfp8_quantize_cuda_2d(
480+
x, block_size=BLOCK_SIZE, scaling_mode="floor"
481+
)
482+
483+
for _ in range(2):
484+
__ = mxfp8_quantize_cuda_2d(x, block_size=BLOCK_SIZE, scaling_mode="floor")
485+
486+
time_us = benchmark_cuda_function_in_microseconds(
487+
lambda x: mxfp8_quantize_cuda_2d(
488+
x, block_size=BLOCK_SIZE, scaling_mode="floor"
489+
),
490+
x,
491+
)
492+
493+
assert y_d0.dtype == torch.float8_e4m3fn
494+
assert s_d0.dtype == torch.float8_e8m0fnu
495+
496+
bytes_r = x.numel() * bytes_per_el_bf16
497+
bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
498+
bps = (bytes_r + bytes_w) / (time_us / 1e6)
499+
500+
elif mode == "dim0_mxfp8_cutedsl_2d_rceil":
501+
from torchao.prototype.moe_training.kernels.mxfp8 import mxfp8_quantize_cuda_2d
502+
503+
y_d0, s_d0 = mxfp8_quantize_cuda_2d(
504+
x, block_size=BLOCK_SIZE, scaling_mode="rceil"
505+
)
506+
507+
for _ in range(2):
508+
__ = mxfp8_quantize_cuda_2d(x, block_size=BLOCK_SIZE, scaling_mode="rceil")
509+
510+
time_us = benchmark_cuda_function_in_microseconds(
511+
lambda x: mxfp8_quantize_cuda_2d(
512+
x, block_size=BLOCK_SIZE, scaling_mode="rceil"
513+
),
514+
x,
515+
)
516+
517+
assert y_d0.dtype == torch.float8_e4m3fn
518+
assert s_d0.dtype == torch.float8_e8m0fnu
519+
520+
bytes_r = x.numel() * bytes_per_el_bf16
521+
bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
522+
bps = (bytes_r + bytes_w) / (time_us / 1e6)
523+
455524
else:
456525
raise AssertionError(f"unknown mode {mode}")
457526

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
7+
8+
import itertools
9+
from dataclasses import dataclass
10+
from typing import List
11+
12+
import torch
13+
from tabulate import tabulate
14+
from tqdm import tqdm
15+
16+
from benchmarks.utils import benchmark_cuda_function_in_microseconds
17+
from torchao.prototype.moe_training.kernels.mxfp8 import (
18+
mx_block_rearrange_2d_M_groups_cuda,
19+
)
20+
from torchao.prototype.moe_training.kernels.mxfp8.cutedsl_quantize_2d import (
21+
mxfp8_quantize_cutedsl_2d,
22+
)
23+
from torchao.prototype.moe_training.utils import generate_jagged_offs
24+
from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0
25+
26+
device = torch.device("cuda")
27+
28+
# Needed since changing args to function causes recompiles
29+
torch._dynamo.config.cache_size_limit = 1000
30+
31+
32+
@dataclass(frozen=True)
33+
class ExperimentConfig:
34+
input_shape: tuple[int, int]
35+
scaling_mode: str
36+
num_groups: int
37+
38+
39+
@dataclass(frozen=True)
40+
class ExperimentResult:
41+
# time
42+
cutedsl_blocked_us: float
43+
triton_plus_rearrange_us: float
44+
# mem bw
45+
cutedsl_blocked_gbps: float
46+
triton_plus_rearrange_gbps: float
47+
48+
49+
@dataclass(frozen=True)
50+
class Experiment:
51+
config: ExperimentConfig
52+
result: ExperimentResult
53+
54+
55+
def get_configs() -> List[ExperimentConfig]:
56+
input_shapes = [
57+
# DeepSeekV3 671b shapes
58+
(8192, 2048),
59+
(8192, 7168),
60+
(32768, 2048),
61+
(32768, 7168),
62+
(131072, 2048),
63+
(131072, 7168),
64+
]
65+
scaling_modes = ["floor", "rceil"]
66+
num_groups_list = [8]
67+
configs = []
68+
for shape, scaling_mode, num_groups in itertools.product(
69+
input_shapes, scaling_modes, num_groups_list
70+
):
71+
configs.append(
72+
ExperimentConfig(
73+
input_shape=shape,
74+
scaling_mode=scaling_mode,
75+
num_groups=num_groups,
76+
)
77+
)
78+
return configs
79+
80+
81+
def run_experiment(config: ExperimentConfig) -> ExperimentResult:
82+
block_size = 32
83+
input_shape = config.input_shape
84+
scaling_mode = config.scaling_mode
85+
num_groups = config.num_groups
86+
87+
input_tensor = torch.randn(
88+
*input_shape,
89+
dtype=torch.bfloat16,
90+
device=device,
91+
)
92+
93+
M, K = input_shape
94+
95+
# Generate jagged offsets with multiples of 128
96+
# TODO: we use multiple of 128 here to avoid per-group padding requirement in blocked scales layout, which cutedsl doesn't support yet.
97+
group_end_offsets = generate_jagged_offs(
98+
num_groups, M, multiple_of=128, device=device
99+
)
100+
101+
# Benchmark 1: CuTeDSL kernel with blocked scale output
102+
data_cutedsl, scales_cutedsl = mxfp8_quantize_cutedsl_2d(
103+
input_tensor,
104+
block_size=block_size,
105+
scaling_mode=scaling_mode,
106+
blocked_scale_output=True,
107+
)
108+
cutedsl_blocked_time_us = benchmark_cuda_function_in_microseconds(
109+
mxfp8_quantize_cutedsl_2d,
110+
input_tensor,
111+
block_size=block_size,
112+
scaling_mode=scaling_mode,
113+
blocked_scale_output=True,
114+
)
115+
116+
# Benchmark 2: Triton quantization + CUDA scale rearrangement
117+
def triton_plus_rearrange(x, group_offs):
118+
# Quantize along dim0 (rowwise)
119+
data, scales = triton_to_mxfp8_dim0(
120+
x,
121+
inner_block_size=block_size,
122+
scaling_mode=scaling_mode,
123+
)
124+
# Convert scales to blocked layout
125+
scales_blocked = mx_block_rearrange_2d_M_groups_cuda(
126+
scales.view(torch.uint8), group_offs
127+
)
128+
return data, scales_blocked
129+
130+
data_triton, scales_triton = triton_plus_rearrange(input_tensor, group_end_offsets)
131+
triton_plus_rearrange_time_us = benchmark_cuda_function_in_microseconds(
132+
triton_plus_rearrange,
133+
input_tensor,
134+
group_end_offsets,
135+
)
136+
137+
# Memory bandwidth calculations
138+
bytes_per_input_el = torch.finfo(torch.bfloat16).bits / 8
139+
bytes_per_output_el = torch.finfo(torch.float8_e4m3fn).bits / 8
140+
bytes_per_scale_el = torch.finfo(torch.float8_e8m0fnu).bits / 8
141+
142+
read_bytes = input_tensor.numel() * bytes_per_input_el
143+
write_bytes = (
144+
data_cutedsl.numel() * bytes_per_output_el
145+
+ scales_cutedsl.numel() * bytes_per_scale_el
146+
)
147+
148+
cutedsl_blocked_gbps = ((read_bytes + write_bytes) / 1e9) / (
149+
cutedsl_blocked_time_us / 1e6
150+
)
151+
triton_plus_rearrange_gbps = ((read_bytes + write_bytes) / 1e9) / (
152+
triton_plus_rearrange_time_us / 1e6
153+
)
154+
155+
return ExperimentResult(
156+
cutedsl_blocked_us=cutedsl_blocked_time_us,
157+
triton_plus_rearrange_us=triton_plus_rearrange_time_us,
158+
cutedsl_blocked_gbps=cutedsl_blocked_gbps,
159+
triton_plus_rearrange_gbps=triton_plus_rearrange_gbps,
160+
)
161+
162+
163+
def print_results(experiments: List[Experiment]):
164+
headers = [
165+
"input_shape",
166+
"scaling_mode",
167+
"num_groups",
168+
"cutedsl_blocked_us",
169+
"triton+rearrange_us",
170+
"speedup",
171+
"cutedsl_gbps",
172+
"triton+rearrange_gbps",
173+
]
174+
rows = []
175+
for experiment in experiments:
176+
speedup = (
177+
experiment.result.triton_plus_rearrange_us
178+
/ experiment.result.cutedsl_blocked_us
179+
)
180+
rows.append(
181+
[
182+
str(experiment.config.input_shape),
183+
experiment.config.scaling_mode,
184+
experiment.config.num_groups,
185+
f"{experiment.result.cutedsl_blocked_us:.2f}",
186+
f"{experiment.result.triton_plus_rearrange_us:.2f}",
187+
f"{speedup:.2f}x",
188+
f"{experiment.result.cutedsl_blocked_gbps:.1f}",
189+
f"{experiment.result.triton_plus_rearrange_gbps:.1f}",
190+
]
191+
)
192+
print(tabulate(rows, headers=headers))
193+
194+
195+
def main():
196+
torch.random.manual_seed(123)
197+
configs = get_configs()
198+
results = []
199+
for config in tqdm(configs):
200+
result = run_experiment(config)
201+
results.append(Experiment(config=config, result=result))
202+
203+
# Use Tabulate to print results
204+
print_results(results)
205+
206+
207+
if __name__ == "__main__":
208+
main()

test/prototype/moe_training/test_kernels.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def _is_sm_10x() -> bool:
3838
fused_pad_token_groups_cuda,
3939
fused_unpad_token_groups_cuda,
4040
mx_block_rearrange_2d_M_groups_cuda,
41+
mxfp8_quantize_cuda_2d,
4142
mxfp8_quantize_cuda_3d,
4243
torch_pad_token_groups,
4344
torch_to_blocked_2d_K_groups,
@@ -436,6 +437,60 @@ def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
436437
assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
437438

438439

440+
@pytest.mark.skipif(
441+
not _is_sm_10x(),
442+
reason="MXFP8 requires CUDA SM 10.x",
443+
)
444+
@pytest.mark.skipif(
445+
not _mxfp8_cutedsl_kernels_available,
446+
reason="MXFP8 cutedsl kernels not available",
447+
)
448+
@pytest.mark.parametrize("M", (32, 160, 8192))
449+
@pytest.mark.parametrize("K", (32, 96, 1536, 5120, 7168, 8192))
450+
@pytest.mark.parametrize("input_dtype", (torch.bfloat16,))
451+
@pytest.mark.parametrize(
452+
"scaling_mode", (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
453+
)
454+
def test_cuda_mx_dim0_2d_numerics(M, K, input_dtype, scaling_mode):
455+
scaling_mode_str = scaling_mode.value.lower()
456+
block_size = 32
457+
458+
# Use distinct incrementing values from 0 to M*K-1 to make debugging easier.
459+
x = (
460+
torch.arange(0, M * K, dtype=input_dtype, device="cuda")
461+
.reshape(M, K)
462+
.contiguous()
463+
)
464+
465+
# Reference implementation
466+
s_d0_ref, y_d0_ref = to_mx(
467+
x,
468+
elem_dtype=torch.float8_e4m3fn,
469+
block_size=block_size,
470+
scaling_mode=scaling_mode,
471+
)
472+
473+
# CuTeDSL kernel implementation
474+
y_d0, s_d0 = mxfp8_quantize_cuda_2d(
475+
x,
476+
block_size=block_size,
477+
scaling_mode=scaling_mode_str,
478+
)
479+
480+
# Convert blocked scales back to reference format
481+
s_d0 = from_blocked(s_d0, M, K // block_size).to(s_d0_ref.dtype)
482+
483+
# Check scales
484+
torch.testing.assert_close(s_d0, s_d0_ref, rtol=0, atol=0)
485+
486+
# Check quantized values
487+
torch.testing.assert_close(y_d0, y_d0_ref, rtol=0, atol=0)
488+
489+
# Verify row-major layout
490+
assert y_d0.stride() == (K, 1), "quantized tensor should be row-major"
491+
assert y_d0.stride() == y_d0_ref.stride(), "quantized tensor strides do not match"
492+
493+
439494
@pytest.mark.skipif(
440495
not _mxfp8_cuda_kernels_available,
441496
reason="CUDA kernel requires sm_100 and CUDA 12.8+",

torchao/prototype/moe_training/kernels/mxfp8/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
fused_pad_token_groups_cuda, # noqa: F401
44
fused_unpad_token_groups_cuda, # noqa: F401
55
mx_block_rearrange_2d_M_groups_cuda, # noqa: F401
6+
mxfp8_quantize_cuda_2d, # noqa: F401
67
mxfp8_quantize_cuda_3d, # noqa: F401
78
torch_pad_token_groups, # noqa: F401
89
torch_to_blocked_2d_K_groups, # noqa: F401

0 commit comments

Comments
 (0)