Skip to content

Commit 682c845

Browse files
committed
Implement PRR as a pooling module. Alternative to #2678
1 parent 3e8def8 commit 682c845

File tree

4 files changed

+123
-7
lines changed

4 files changed

+123
-7
lines changed

tests/test_layers_pool.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,41 @@ def test_attention_pool2d_class_token(self, pool_cls, base_kwargs, input_shape):
228228
out = pool(x)
229229
assert out.shape == (2, 64)
230230

231+
def test_attention_pool_prr_basic(self):
    """Default (token) pooling of a CLS + 49-patch sequence collapses to (B, dim)."""
    from timm.layers import AttentionPoolPrr
    pool = AttentionPoolPrr(dim=64, num_heads=4).to(torch_device)
    tokens = torch.randn(2, 50, 64, device=torch_device)  # 1 CLS + 49 patches
    pooled = pool(tokens)
    assert pooled.shape == (2, 64)
237+
238+
def test_attention_pool_prr_avg_pool(self):
    """pool_type='avg' works on a patch-only (no CLS) sequence and yields (B, dim)."""
    from timm.layers import AttentionPoolPrr
    pool = AttentionPoolPrr(dim=64, num_heads=4, pool_type='avg').to(torch_device)
    patches = torch.randn(2, 49, 64, device=torch_device)
    pooled = pool(patches)
    assert pooled.shape == (2, 64)
244+
245+
def test_attention_pool_prr_parameter_free(self):
    """Without pre/post norms the PRR pool must introduce zero learnable parameters."""
    from timm.layers import AttentionPoolPrr
    pool = AttentionPoolPrr(dim=64, num_heads=4)
    num_params = sum(p.numel() for p in pool.parameters())
    assert num_params == 0, f"Expected 0 parameters, got {num_params}"
250+
251+
def test_attention_pool_prr_with_norms(self):
    """Enabling pre/post norms adds LayerNorm parameters; forward still pools to (B, dim)."""
    from timm.layers import AttentionPoolPrr
    pool = AttentionPoolPrr(
        dim=64,
        num_heads=4,
        pre_norm=True,
        post_norm=True,
    ).to(torch_device)
    # Should have parameters from the two LayerNorms
    num_params = sum(p.numel() for p in pool.parameters())
    assert num_params > 0
    tokens = torch.randn(2, 49, 64, device=torch_device)
    pooled = pool(tokens)
    assert pooled.shape == (2, 64)
265+
231266
@pytest.mark.parametrize('out_features,embed_dim,expected_out', [
232267
(None, None, 64), # default: out_features = in_features
233268
(None, 128, 64), # default with different embed_dim
@@ -365,6 +400,7 @@ class TestPoolingCommon:
365400
('SimPool1d', {'dim': 64}, (2, 49, 64)),
366401
('SelectAdaptivePool2d', {'pool_type': 'avg', 'flatten': True}, (2, 64, 7, 7)),
367402
('AttentionPoolLatent', {'in_features': 64, 'num_heads': 4}, (2, 49, 64)),
403+
('AttentionPoolPrr', {'dim': 64, 'num_heads': 4}, (2, 49, 64)),
368404
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
369405
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
370406
])
@@ -383,6 +419,7 @@ def test_gradient_flow(self, pool_cls, kwargs, input_shape):
383419
('LsePlus1d', {}, (2, 49, 64)),
384420
('SimPool2d', {'dim': 64}, (2, 64, 7, 7)),
385421
('SimPool1d', {'dim': 64}, (2, 49, 64)),
422+
('AttentionPoolPrr', {'dim': 64, 'num_heads': 4}, (2, 49, 64)),
386423
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
387424
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
388425
])
@@ -401,6 +438,7 @@ def test_torchscript(self, pool_cls, kwargs, input_shape):
401438
('LsePlus1d', {}, (2, 49, 64)),
402439
('SimPool2d', {'dim': 64}, (2, 64, 7, 7)),
403440
('SimPool1d', {'dim': 64}, (2, 49, 64)),
441+
('AttentionPoolPrr', {'dim': 64, 'num_heads': 4}, (2, 49, 64)),
404442
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
405443
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
406444
])

timm/layers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
)
1818
from .attention import Attention, AttentionRope, maybe_add_mask
1919
from .attention2d import MultiQueryAttention2d, Attention2d, MultiQueryAttentionV2
20-
from .attention_pool import AttentionPoolLatent
20+
from .attention_pool import AttentionPoolLatent, AttentionPoolPrr
2121
from .attention_pool2d import AttentionPool2d, RotAttentionPool2d
2222
from .blur_pool import BlurPool2d, create_aa
2323
from .classifier import create_classifier, ClassifierHead, NormMlpClassifierHead, ClNormMlpClassifierHead

timm/layers/attention_pool.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,4 +126,70 @@ def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
126126
x = x[:, 0]
127127
elif self.pool == 'avg':
128128
x = x.mean(1)
129+
return x
130+
131+
132+
class AttentionPoolPrr(nn.Module):
    """ Patch Representation Refinement (PRR) attention pool.

    From "Locality-Attending Vision Transformer" (ICLR 2026).

    A parameter-free multi-head self-attention pass refines every patch
    representation before the final pooling step. There are no Q/K/V
    projections: the (optionally normalized) input itself is reshaped into
    multi-head form and used as query, key, and value simultaneously.
    Optional pre/post LayerNorms are the only source of parameters.
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            pool_type: str = 'token',
            pre_norm: bool = False,
            post_norm: bool = False,
            norm_layer: Optional[Type[nn.Module]] = None,
            device=None,
            dtype=None,
    ):
        """
        Args:
            dim: Input (and output) embedding dimension.
            num_heads: Number of attention heads; must divide ``dim``.
            pool_type: 'token' (take refined first token) or 'avg' (mean over tokens).
            pre_norm: Apply a norm layer to the input before attention.
            post_norm: Apply a norm layer to the refined tokens before pooling.
            norm_layer: Norm layer type; defaults to ``nn.LayerNorm`` when a norm is enabled.
        """
        super().__init__()
        dd = {'device': device, 'dtype': dtype}
        assert pool_type in ('token', 'avg'), f"pool_type must be 'token' or 'avg', got '{pool_type}'"
        assert dim % num_heads == 0, f"dim ({dim}) must be divisible by num_heads ({num_heads})"

        if (pre_norm or post_norm) and norm_layer is None:
            norm_layer = nn.LayerNorm

        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.pool = pool_type
        self.fused_attn = use_fused_attn()
        self.out_features = dim

        self.pre_norm = norm_layer(dim, **dd) if pre_norm else nn.Identity()
        self.post_norm = norm_layer(dim, **dd) if post_norm else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape

        x = self.pre_norm(x)

        # Parameter-free self-attention: the input doubles as q, k and v,
        # split across heads -> (B, num_heads, N, head_dim).
        heads = x.reshape(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        if self.fused_attn:
            refined = F.scaled_dot_product_attention(heads, heads, heads)
        else:
            scores = (heads * self.scale) @ heads.transpose(-2, -1)
            refined = scores.softmax(dim=-1) @ heads
        x = refined.transpose(1, 2).reshape(B, N, C)

        x = self.post_norm(x)

        # Pool the refined tokens down to (B, C).
        if self.pool == 'token':
            x = x[:, 0]
        elif self.pool == 'avg':
            x = x.mean(dim=1)
        return x

timm/models/vision_transformer.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
Attention,
5050
DiffAttention,
5151
AttentionPoolLatent,
52+
AttentionPoolPrr,
5253
PatchEmbed,
5354
Mlp,
5455
SwiGLUPacked,
@@ -692,7 +693,7 @@ def __init__(
692693
patch_size: Union[int, Tuple[int, int]] = 16,
693694
in_chans: int = 3,
694695
num_classes: int = 1000,
695-
global_pool: Literal['', 'avg', 'avgmax', 'max', 'token', 'map'] = 'token',
696+
global_pool: Literal['', 'avg', 'avgmax', 'max', 'token', 'map', 'prr'] = 'token',
696697
embed_dim: int = 768,
697698
depth: int = 12,
698699
num_heads: int = 12,
@@ -764,7 +765,7 @@ def __init__(
764765
"""
765766
super().__init__()
766767
dd = {'device': device, 'dtype': dtype}
767-
assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map')
768+
assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map', 'prr')
768769
assert class_token or global_pool != 'token'
769770
assert pos_embed in ('', 'none', 'learn')
770771
use_fc_norm = global_pool in ('avg', 'avgmax', 'max') if fc_norm is None else fc_norm
@@ -858,6 +859,15 @@ def __init__(
858859
act_layer=act_layer,
859860
**dd,
860861
)
862+
elif global_pool == 'prr':
863+
self.attn_pool = AttentionPoolPrr(
864+
self.embed_dim,
865+
num_heads=num_heads,
866+
pool_type='token' if class_token else 'avg',
867+
norm_layer=norm_layer,
868+
**dd,
869+
)
870+
self.pool_include_prefix = True
861871
else:
862872
self.attn_pool = None
863873
self.fc_norm = norm_layer(embed_dim, **dd) if final_norm and use_fc_norm else nn.Identity()
@@ -961,11 +971,13 @@ def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None)
961971
"""
962972
self.num_classes = num_classes
963973
if global_pool is not None:
964-
assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map')
965-
if global_pool == 'map' and self.attn_pool is None:
974+
assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map', 'prr')
975+
if global_pool in ('map', 'prr') and self.attn_pool is None:
966976
assert False, "Cannot currently add attention pooling in reset_classifier()."
967-
elif global_pool != 'map' and self.attn_pool is not None:
977+
elif global_pool not in ('map', 'prr') and self.attn_pool is not None:
968978
self.attn_pool = None # remove attention pooling
979+
elif global_pool in ('map', 'prr') and self.global_pool != global_pool:
980+
assert False, "Cannot currently change attention pooling type in reset_classifier()."
969981
self.global_pool = global_pool
970982
self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
971983

@@ -1476,7 +1488,7 @@ def _n2p(_w, t=True, idx=None):
14761488
# if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
14771489
# model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
14781490
# model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
1479-
if model.attn_pool is not None:
1491+
if isinstance(model.attn_pool, AttentionPoolLatent):
14801492
block_prefix = f'{prefix}MAPHead_0/'
14811493
mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
14821494
model.attn_pool.latent.copy_(_n2p(w[f'{block_prefix}probe'], t=False))

0 commit comments

Comments (0)