
Commit 0c90043

Your Name authored and rwightman committed
fix: branch Hiera MaskUnitAttention into 4D global path for FlashAttention dispatch
The global attention path in MaskUnitAttention.forward() used a 5D tensor reshape with num_windows=1 as a shortcut. This caused PyTorch SDPA to silently fall back from efficient backends (FlashAttention, Memory-Efficient, CuDNN) to the O(N^2) math backend, as all efficient kernels require 4D contiguous tensors. At high resolutions (e.g. 2048x2048 -> 16384 tokens), the math backend materializes the full N*N attention matrix, causing catastrophic VRAM usage and OOM on consumer GPUs. Changes: - Branch forward() into windowed (5D, unchanged) and global (4D) paths - Global path reshapes directly to [B, N, 3, heads, head_dim] -> 4D QKV - Adjust q_stride pooling dim from amax(dim=3) to amax(dim=2) for global - Add .contiguous() on q, k, v to guarantee FlashAttention compatibility - Split output transpose: transpose(1,3) for windowed, transpose(1,2) for global
1 parent a346c76 commit 0c90043
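The numerical claim behind this change — that the 4D global reshape, the dim=2 pooling, and the transpose(1,2) output path are all equivalent to the old num_windows=1 shortcut — can be sanity-checked in isolation. The sketch below uses illustrative sizes, not values from any actual Hiera config:

```python
import torch

# Illustrative sizes only -- not taken from an actual Hiera config
B, N, heads, head_dim = 2, 16, 4, 8
q_stride = 2
x = torch.randn(B, N, 3 * heads * head_dim)  # stand-in for the self.qkv(x) output

# Old global shortcut: 5D reshape with a dummy num_windows=1 dimension
qkv5 = x.reshape(B, -1, 1, 3, heads, head_dim).permute(3, 0, 4, 2, 1, 5)
q5, k5, v5 = qkv5.unbind(0)  # each [B, heads, 1, N, head_dim]

# New global path: direct 4D reshape
qkv4 = x.reshape(B, N, 3, heads, head_dim).permute(2, 0, 3, 1, 4)
q4, k4, v4 = qkv4.unbind(0)  # each [B, heads, N, head_dim]

# Same values, minus the dummy window dimension
assert torch.equal(q5.squeeze(2), q4) and torch.equal(v5.squeeze(2), v4)

# q_stride pooling: amax(dim=3) in 5D corresponds to amax(dim=2) in 4D
p5 = q5.view(B, heads, 1, q_stride, -1, head_dim).amax(dim=3)
p4 = q4.view(B, heads, q_stride, -1, head_dim).amax(dim=2)
assert torch.equal(p5.squeeze(2), p4)

# Output path: transpose(1, 3) on 5D matches transpose(1, 2) on 4D after flattening
o5 = v5.transpose(1, 3).reshape(B, -1, heads * head_dim)
o4 = v4.transpose(1, 2).reshape(B, -1, heads * head_dim)
assert torch.equal(o5, o4)
```

The same bookkeeping holds with real num_windows > 1 in the windowed branch, which the commit leaves untouched.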

File tree

1 file changed: +30 −7 lines changed

timm/models/hiera.py

Lines changed: 30 additions & 7 deletions
```diff
@@ -299,13 +299,31 @@ def __init__(
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """ Input should be of shape [batch, tokens, channels]. """
         B, N, _ = x.shape
-        num_windows = (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1
-        qkv = self.qkv(x).reshape(B, -1, num_windows, 3, self.heads, self.head_dim).permute(3, 0, 4, 2, 1, 5)
-        q, k, v = qkv.unbind(0)
 
-        if self.q_stride > 1:
-            # Refer to Unroll to see how this performs a maxpool-Nd
-            q = q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim).amax(dim=3)
+        if self.use_mask_unit_attn:
+            # Windowed attention: 5D path [B, heads, num_windows, tokens_per_window, head_dim]
+            num_windows = N // (self.q_stride * self.window_size)
+            qkv = self.qkv(x).reshape(
+                B, -1, num_windows, 3, self.heads, self.head_dim,
+            ).permute(3, 0, 4, 2, 1, 5)
+            q, k, v = qkv.unbind(0)
+
+            if self.q_stride > 1:
+                # Refer to Unroll to see how this performs a maxpool-Nd
+                q = q.view(B, self.heads, num_windows, self.q_stride, -1, self.head_dim).amax(dim=3)
+        else:
+            # Global attention: 4D path [B, heads, N, head_dim]
+            # Avoids the dummy num_windows=1 dimension that prevents FlashAttention dispatch.
+            qkv = self.qkv(x).reshape(B, N, 3, self.heads, self.head_dim).permute(2, 0, 3, 1, 4)
+            q, k, v = qkv.unbind(0)
+
+            if self.q_stride > 1:
+                # dim=2 instead of dim=3 because num_windows dimension is absent
+                q = q.view(B, self.heads, self.q_stride, -1, self.head_dim).amax(dim=2)
+
+        # Enforce contiguous memory layout so SDPA dispatches to FlashAttention
+        # instead of silently falling back to the O(N^2) math backend.
+        q, k, v = q.contiguous(), k.contiguous(), v.contiguous()
 
         if self.fused_attn:
             # Note: the original paper did *not* use SDPA, it's a free boost!
@@ -315,7 +333,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         attn = attn.softmax(dim=-1)
         x = attn @ v
 
-        x = x.transpose(1, 3).reshape(B, -1, self.dim_out)
+        # Output transpose adapts to 5D (windowed) vs 4D (global) layout
+        if self.use_mask_unit_attn:
+            x = x.transpose(1, 3).reshape(B, -1, self.dim_out)
+        else:
+            x = x.transpose(1, 2).reshape(B, -1, self.dim_out)
+
         x = self.proj(x)
         return x
```
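As a sketch of why the added .contiguous() calls matter: reshape + permute + unbind produce strided views rather than dense buffers, and fused SDPA kernels generally require dense 4D inputs. A minimal illustration (shapes arbitrary, runs on CPU; actual backend selection depends on device, dtype, and build):

```python
import torch
import torch.nn.functional as F

# qkv-style tensor [B, N, 3, heads, head_dim], split the way the model does
x = torch.randn(2, 16, 3, 4, 8)
q = x.permute(2, 0, 3, 1, 4).unbind(0)[0]  # strided view, [B, heads, N, head_dim]
assert not q.is_contiguous()

# .contiguous() copies into a dense buffer; values are unchanged
qc = q.contiguous()
assert qc.is_contiguous() and torch.equal(q, qc)

# SDPA accepts the dense 4D layout directly
out = F.scaled_dot_product_attention(qc, qc, qc)
assert out.shape == qc.shape
```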
