Skip to content

Commit 4f0d1c2

Browse files
add hadamard option to low precision attention api
ghstack-source-id: a1c5c5d | Pull-Request: #4194
1 parent 02105d4 commit 4f0d1c2

File tree

16 files changed

+1289
-59
lines changed

16 files changed

+1289
-59
lines changed

benchmarks/prototype/attention/benchmark_sdpa.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,34 +27,34 @@
2727
from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa
2828
from torchao.quantization.utils import compute_error as compute_sqnr
2929

30-
BACKENDS = ["fa2", "fa3", "fa3_fp8"]
30+
BACKENDS = ["fa2", "fa3", "fa3_fp8", "fa3_fp8_hadamard"]
3131

3232
BACKEND_LABELS = {
3333
"fa2": "FA2 BF16",
3434
"fa3": "FA3 BF16",
3535
"fa3_fp8": "FA3 FP8",
36+
"fa3_fp8_hadamard": "FA3 FP8 Hadamard",
3637
}
3738

3839

3940
@contextmanager
4041
def _activate_backend(backend: str):
4142
"""Context manager that activates the appropriate flash attention impl."""
42-
if backend in ("fa3", "fa3_fp8"):
43+
if backend in ("fa3", "fa3_fp8", "fa3_fp8_hadamard"):
4344
activate_flash_attention_impl("FA3")
44-
else:
45-
# fa2 is the default, no activation needed
46-
pass
4745
try:
4846
yield
4947
finally:
50-
if backend in ("fa3", "fa3_fp8"):
48+
if backend in ("fa3", "fa3_fp8", "fa3_fp8_hadamard"):
5149
restore_flash_attention_impl()
5250

5351

5452
def _run_attention(backend: str, q, k, v, is_causal: bool):
5553
"""Run a single attention call for the given backend."""
5654
if backend == "fa3_fp8":
5755
return fp8_fa3_sdpa(q, k, v, is_causal=is_causal)
56+
elif backend == "fa3_fp8_hadamard":
57+
return fp8_fa3_sdpa(q, k, v, is_causal=is_causal, hadamard=True)
5858
else:
5959
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
6060
return F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

benchmarks/prototype/attention/eval_flux_model.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
from torchao.prototype.attention import (
3434
AttentionBackend,
35+
HadamardMode,
3536
apply_low_precision_attention,
3637
)
3738

@@ -43,6 +44,12 @@
4344
"fp8": True,
4445
"fp8_backend": AttentionBackend.FP8_FA3,
4546
},
47+
"fa3_fp8_hadamard": {
48+
"flash_impl": "FA3",
49+
"fp8": True,
50+
"fp8_backend": AttentionBackend.FP8_FA3,
51+
"hadamard": HadamardMode.QKV,
52+
},
4653
}
4754

4855
IMAGE_SIZE = (512, 512) # (width, height) - resize for consistent LPIPS
@@ -72,6 +79,7 @@ def setup_backend(
7279
pipe.transformer = apply_low_precision_attention(
7380
pipe.transformer,
7481
backend=cfg["fp8_backend"],
82+
hadamard=cfg.get("hadamard", HadamardMode.NONE),
7583
)
7684
if compile_flag:
7785
print(f"Compiling transformer with torch.compile ({backend_name})...")

benchmarks/prototype/attention/eval_llama3_model.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
from torchao.prototype.attention import (
3333
AttentionBackend,
34+
HadamardMode,
3435
apply_low_precision_attention,
3536
)
3637
from torchao.prototype.attention.shared_utils.fusion_utils import (
@@ -57,6 +58,13 @@
5758
"fp8_backend": AttentionBackend.FP8_FA3,
5859
"label": "FA3 FP8",
5960
},
61+
"fa3_fp8_hadamard": {
62+
"flash_impl": "FA3",
63+
"fp8": True,
64+
"fp8_backend": AttentionBackend.FP8_FA3,
65+
"hadamard": HadamardMode.QKV,
66+
"label": "FA3 FP8 Hadamard",
67+
},
6068
}
6169

6270
RANDOM_SEED = 42
@@ -116,6 +124,7 @@ def setup_backend(orig_model, backend_name, compile_flag):
116124
model = apply_low_precision_attention(
117125
orig_model,
118126
backend=cfg["fp8_backend"],
127+
hadamard=cfg.get("hadamard", HadamardMode.NONE),
119128
)
120129
if compile_flag:
121130
print(f" Compiling model with torch.compile ({backend_name})...")

torchao/prototype/attention/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212

1313
from torchao.prototype.attention.api import (
1414
AttentionBackend,
15+
HadamardMode,
1516
apply_low_precision_attention,
1617
)
1718

1819
__all__ = [
1920
"AttentionBackend",
21+
"HadamardMode",
2022
"apply_low_precision_attention",
2123
]

torchao/prototype/attention/api.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,13 @@
2525
)
2626

2727

28+
class HadamardMode(str, Enum):
29+
"""Hadamard transform mode for improved FP8 quantization quality."""
30+
31+
NONE = "NONE" # No Hadamard transform
32+
QKV = "QKV" # Apply Hadamard to Q, K, and V
33+
34+
2835
class AttentionBackend(str, Enum):
2936
"""Backend kernel for computing attention."""
3037

@@ -60,6 +67,7 @@ def _check_backend_available(backend: AttentionBackend) -> None:
6067
def apply_low_precision_attention(
6168
model: nn.Module,
6269
backend: Optional[AttentionBackend] = None,
70+
hadamard: HadamardMode = HadamardMode.NONE,
6371
) -> nn.Module:
6472
"""Apply low-precision attention to a model.
6573
@@ -71,6 +79,15 @@ def apply_low_precision_attention(
7179
for eager execution and sets a global pre-grad pass so that
7280
``torch.compile`` will automatically fuse RoPE where detected.
7381
82+
Args:
83+
model: The model to apply low-precision attention to.
84+
backend: Backend to use. If None, auto-detected.
85+
hadamard: Hadamard transform mode. ``HadamardMode.QKV`` applies
86+
the Hadamard transform to Q, K, and V before FP8 quantization,
87+
spreading outliers across the head dimension for better
88+
dynamic range utilization. Requires D to be a power of 2
89+
and <= 256.
90+
7491
Example:
7592
7693
.. literalinclude:: ../../examples/prototype/low_precision_attention.py
@@ -93,6 +110,6 @@ def apply_low_precision_attention(
93110
_check_backend_available(backend)
94111

95112
if backend == AttentionBackend.FP8_FA3:
96-
return setup_fp8_backend(model, "FA3")
113+
return setup_fp8_backend(model, "FA3", hadamard=str(hadamard))
97114

98115
raise ValueError(f"Unknown backend: {backend}")

torchao/prototype/attention/quantization/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@
44
# This source code is licensed under the BSD 3-Clause license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
from torchao.prototype.attention.quantization.triton_hadamard_qkv_quantization import (
8+
triton_fp8_hadamard_sdpa_quantize as _fp8_hadamard_sdpa_quantize,
9+
)
10+
from torchao.prototype.attention.quantization.triton_hadamard_rope_qkv_quantization import (
11+
triton_fp8_hadamard_rope_sdpa_quantize as _fp8_hadamard_rope_sdpa_quantize,
12+
)
13+
from torchao.prototype.attention.quantization.triton_hadamard_utils import (
14+
inverse_hadamard_transform as _inverse_hadamard_transform,
15+
)
716
from torchao.prototype.attention.quantization.triton_qkv_quantization import (
817
triton_fp8_sdpa_quantize as _fp8_sdpa_quantize,
918
)
@@ -14,4 +23,7 @@
1423
__all__ = [
1524
"_fp8_sdpa_quantize",
1625
"_fp8_rope_sdpa_quantize",
26+
"_fp8_hadamard_sdpa_quantize",
27+
"_fp8_hadamard_rope_sdpa_quantize",
28+
"_inverse_hadamard_transform",
1729
]

0 commit comments

Comments (0)