pytorch
diff --git a/‎.github/workflows/dashboard_perf_test.yml‎
Lines changed: 0 additions & 3 deletions b/‎.github/workflows/dashboard_perf_test.yml‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎benchmarks/microbenchmarks/utils.py‎
Lines changed: 1 addition & 23 deletions b/‎benchmarks/microbenchmarks/utils.py‎
Lines changed: 1 addition & 23 deletions
diff --git a/‎benchmarks/mx_formats/cast_bench.py‎
Lines changed: 70 additions & 1 deletion b/‎benchmarks/mx_formats/cast_bench.py‎
Lines changed: 70 additions & 1 deletion
diff --git a/‎benchmarks/prototype/blockwise_fp8_training/README.md‎
Lines changed: 80 additions & 0 deletions b/‎benchmarks/prototype/blockwise_fp8_training/README.md‎
Lines changed: 80 additions & 0 deletions
@@ -45,9 +45,6 @@ jobs:
           # llama3 - compile baseline
           ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
 
-          # llama3 - autoquant
-          ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
-
           # skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407
           # # SAM
           # ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main
 
@@ -16,7 +16,6 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
-    GemliteUIntXWeightOnlyConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
     MappingType,
@@ -182,11 +181,7 @@ def string_to_config(
     if "int8wo" in quantization:
         return Int8WeightOnlyConfig()
     if "int8dq" in quantization:
-        if sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
-            from torchao.dtypes import SemiSparseLayout
-
-            return Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout())
-        elif "int8dq_prefill_wo_decode" in quantization:
+        if "int8dq_prefill_wo_decode" in quantization:
             return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True)
         else:
             return Int8DynamicActivationInt8WeightConfig()
@@ -225,23 +220,6 @@ def string_to_config(
         else:
             granularity = PerTensor()
         return Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
-    if "gemlitewo" in quantization:
-        params = quantization.split("-")
-        bit_width = int(params[1]) if len(params) > 1 else 4
-        group_size = (
-            int(params[2])
-            if len(params) > 2 and bit_width == 4
-            else None
-            if bit_width == 8
-            else 64
-        )
-        assert group_size in [
-            32,
-            64,
-            128,
-            256,
-        ], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
-        return GemliteUIntXWeightOnlyConfig(group_size=group_size, bit_width=bit_width)
     return None
 
 
 
@@ -109,6 +109,7 @@ def run(
     print(f"triton version: {triton.__version__}")
     print(f"mode: {mode}")
     assert mode in (
+        "memcpy",
         "dim0",
         "dim1",
         "dim0_dim1",
@@ -125,11 +126,31 @@ def run(
         "dim1_mxfp8_triton_rceil",
         "dim1_mxfp8_cuda_floor",
         "dim1_mxfp8_cuda_rceil",
+        "dim0_mxfp8_cutedsl_2d_floor",
+        "dim0_mxfp8_cutedsl_2d_rceil",
     )
 
     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
 
-    if mode == "dim0":
+    if mode == "memcpy":
+        # Baseline memcpy benchmark to establish max achievable bandwidth
+        y = torch.randn_like(x)
+
+        # Warmup
+        for _ in range(2):
+            y.copy_(x)
+
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda src, dst: dst.copy_(src),
+            x,
+            y,
+        )
+
+        # bytes_read + bytes_written
+        bytes_rw = 2 * x.numel() * bytes_per_el_bf16
+        bps = bytes_rw / (time_us / 1e6)
+
+    elif mode == "dim0":
         scale_dim0_reference_c = torch.compile(scale_dim0_reference)
         y_d0, s_d0 = scale_dim0_reference_c(x, BLOCK_SIZE)
 
@@ -452,6 +473,54 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
+    elif mode == "dim0_mxfp8_cutedsl_2d_floor":
+        from torchao.prototype.moe_training.kernels.mxfp8 import mxfp8_quantize_cuda_2d
+
+        y_d0, s_d0 = mxfp8_quantize_cuda_2d(
+            x, block_size=BLOCK_SIZE, scaling_mode="floor"
+        )
+
+        for _ in range(2):
+            __ = mxfp8_quantize_cuda_2d(x, block_size=BLOCK_SIZE, scaling_mode="floor")
+
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x: mxfp8_quantize_cuda_2d(
+                x, block_size=BLOCK_SIZE, scaling_mode="floor"
+            ),
+            x,
+        )
+
+        assert y_d0.dtype == torch.float8_e4m3fn
+        assert s_d0.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim0_mxfp8_cutedsl_2d_rceil":
+        from torchao.prototype.moe_training.kernels.mxfp8 import mxfp8_quantize_cuda_2d
+
+        y_d0, s_d0 = mxfp8_quantize_cuda_2d(
+            x, block_size=BLOCK_SIZE, scaling_mode="rceil"
+        )
+
+        for _ in range(2):
+            __ = mxfp8_quantize_cuda_2d(x, block_size=BLOCK_SIZE, scaling_mode="rceil")
+
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x: mxfp8_quantize_cuda_2d(
+                x, block_size=BLOCK_SIZE, scaling_mode="rceil"
+            ),
+            x,
+        )
+
+        assert y_d0.dtype == torch.float8_e4m3fn
+        assert s_d0.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
     else:
         raise AssertionError(f"unknown mode {mode}")
 
 
@@ -0,0 +1,80 @@
+# Blockwise FP8 Training Benchmarks
+
+This directory contains benchmarking scripts for the blockwise FP8 quantization
+and GEMM paths under `torchao.prototype.blockwise_fp8_training.kernels`.
+
+## Quantized Kernel Bandwidth Benchmark
+
+Run the kernel-path bandwidth benchmark with:
+
+```bash
+python -m benchmarks.prototype.blockwise_fp8_training.benchmark_quant_kernel_bandwidth
+```
+
+To additionally validate Triton outputs against the Torch reference
+implementations:
+
+```bash
+python -m benchmarks.prototype.blockwise_fp8_training.benchmark_quant_kernel_bandwidth --check-correctness
+```
+
+What it reports:
+
+- `kernel_us`: measured runtime of the public quantization wrapper call
+- `effective_logical_io_gbps`: logical tensor IO bytes divided by measured time
+- `logical_io_vs_achievable_%`: `effective_logical_io_gbps / achievable_bandwidth_gbps`, expressed as a percentage
+
+Notes:
+
+- The benchmark times the public wrapper functions in
+  `torchao.prototype.blockwise_fp8_training.kernels`.
+- `--check-correctness` runs the matching Torch reference path once per valid
+  kernel and shape before reporting results. This adds overhead and is intended
+  for validation, not headline timing runs.
+- The bandwidth number uses the expected tensor IO footprint, not hardware DRAM
+  counters.
+- Peak bandwidth defaults to CUDA device properties. `--use-roofline-utils`
+  switches to the static `roofline_utils` table.
+
+### Methodology
+
+- It times the public wrapper call, matching the style of the other benchmark
+  scripts in this directory.
+- It uses CUDA event timing and the median, via
+  `benchmark_cuda_function_in_microseconds(...)` from
+  [benchmarks/utils.py](../../utils.py#L101).
+- It checks shapes up front and skips unsupported ones instead of silently
+  measuring invalid configurations.
+
+## Current H100 Results
+
+Captured on 2026-03-20 with:
+
+```bash
+python -m benchmarks.prototype.blockwise_fp8_training.benchmark_quant_kernel_bandwidth
+```
+
+Environment:
+
+- GPU: `NVIDIA H100 80GB HBM3`
+- Peak bandwidth reference: `3352.3 GB/s`
+- Peak bandwidth source: `cuda_device_properties`
+- Achievable bandwidth reference: `3084.1 GB/s`
+- Achievable bandwidth uses `92.0%` of peak bandwidth
+- Achievable bandwidth source: `roofline_utils_pct_achievable_mem_bw`
+
+### Per-shape Results
+Tested with shapes 32768x4096 and 131072x4096 to reflect real-world training:
+
+| kernel | shape | kernel_us | effective_logical_io_gbps | logical_io_vs_achievable_% |
+|---|---|---:|---:|---:|
+| act_quant_transposed_lhs | 32768x4096 | 154.46 | 2633.9 | 85.4 |
+| weight_quant_transposed_rhs | 32768x4096 | 150.53 | 2675.2 | 86.7 |
+| act_quant_lhs | 32768x4096 | 150.86 | 2696.8 | 87.4 |
+| act_quant_rhs | 32768x4096 | 148.70 | 2736.0 | 88.7 |
+| weight_quant_rhs | 32768x4096 | 144.99 | 2777.3 | 90.1 |
+| weight_quant_transposed_rhs | 131072x4096 | 581.89 | 2768.1 | 89.8 |
+| act_quant_lhs | 131072x4096 | 586.98 | 2772.5 | 89.9 |
+| act_quant_transposed_lhs | 131072x4096 | 581.47 | 2798.7 | 90.7 |
+| act_quant_rhs | 131072x4096 | 562.56 | 2892.8 | 93.8 |
+| weight_quant_rhs | 131072x4096 | 555.30 | 2900.7 | 94.1 |