Skip to content

Commit f8c695d

Browse files
committed
Improve 2d and latent attention pool dimension handling. Fix #2682
1 parent a94c10f commit f8c695d

File tree

3 files changed

+158
-12
lines changed

3 files changed

+158
-12
lines changed

tests/test_layers_pool.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,125 @@ def test_rot_attention_pool2d_rope_types(self):
137137
out = pool(x)
138138
assert out.shape == (2, 64)
139139

140+
@pytest.mark.parametrize('pool_cls,base_kwargs,input_shape', [
141+
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
142+
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
143+
])
144+
@pytest.mark.parametrize('out_features,embed_dim,expected_out', [
145+
(None, None, 64), # default: out_features = in_features
146+
(None, 128, 64), # default with different embed_dim
147+
(32, None, 32), # explicit out_features
148+
(32, 128, 32), # explicit out_features with different embed_dim
149+
(0, None, 64), # disabled projection, out = embed_dim = in_features
150+
(0, 128, 128), # disabled projection, out = embed_dim
151+
])
152+
def test_attention_pool2d_out_features(
153+
self, pool_cls, base_kwargs, input_shape, out_features, embed_dim, expected_out,
154+
):
155+
import timm.layers as layers
156+
kwargs = {**base_kwargs, 'out_features': out_features}
157+
if embed_dim is not None:
158+
kwargs['embed_dim'] = embed_dim
159+
pool = getattr(layers, pool_cls)(**kwargs).to(torch_device)
160+
assert pool.out_features == expected_out
161+
if out_features == 0:
162+
assert isinstance(pool.proj, nn.Identity)
163+
else:
164+
assert isinstance(pool.proj, nn.Linear)
165+
x = torch.randn(*input_shape, device=torch_device)
166+
out = pool(x)
167+
assert out.shape == (2, expected_out)
168+
169+
@pytest.mark.parametrize('pool_cls,base_kwargs,input_shape', [
170+
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7, 'embed_dim': 128}, (2, 64, 7, 7)),
171+
('AttentionPool2d', {'in_features': 64, 'feat_size': 7, 'embed_dim': 128}, (2, 64, 7, 7)),
172+
])
173+
@pytest.mark.parametrize('num_classes,expected_out', [
174+
(10, 10),
175+
(0, 128), # reset to 0 => Identity, out_features = embed_dim
176+
(100, 100),
177+
])
178+
def test_attention_pool2d_reset(
179+
self, pool_cls, base_kwargs, input_shape, num_classes, expected_out,
180+
):
181+
import timm.layers as layers
182+
pool = getattr(layers, pool_cls)(**base_kwargs).to(torch_device)
183+
pool.reset(num_classes=num_classes)
184+
assert pool.out_features == expected_out
185+
if num_classes > 0:
186+
assert isinstance(pool.proj, nn.Linear)
187+
assert pool.proj.in_features == 128 # embed_dim, not in_features
188+
assert pool.proj.out_features == num_classes
189+
else:
190+
assert isinstance(pool.proj, nn.Identity)
191+
x = torch.randn(*input_shape, device=torch_device)
192+
out = pool(x)
193+
assert out.shape == (2, expected_out)
194+
195+
@pytest.mark.parametrize('pool_cls,base_kwargs,input_shape', [
196+
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
197+
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
198+
])
199+
def test_attention_pool2d_pre_logits(self, pool_cls, base_kwargs, input_shape):
200+
import timm.layers as layers
201+
pool = getattr(layers, pool_cls)(**base_kwargs, out_features=32).to(torch_device)
202+
x = torch.randn(*input_shape, device=torch_device)
203+
out = pool(x, pre_logits=True)
204+
# pre_logits skips proj, so output dim = embed_dim (= in_features by default)
205+
assert out.shape == (2, 64)
206+
207+
@pytest.mark.parametrize('pool_cls,base_kwargs,input_shape', [
208+
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
209+
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
210+
])
211+
def test_attention_pool2d_qkv_separate(self, pool_cls, base_kwargs, input_shape):
212+
import timm.layers as layers
213+
pool = getattr(layers, pool_cls)(**base_kwargs, qkv_separate=True).to(torch_device)
214+
assert pool.qkv is None
215+
x = torch.randn(*input_shape, device=torch_device)
216+
out = pool(x)
217+
assert out.shape == (2, 64)
218+
219+
@pytest.mark.parametrize('pool_cls,base_kwargs,input_shape', [
220+
('RotAttentionPool2d', {'in_features': 64, 'ref_feat_size': 7}, (2, 64, 7, 7)),
221+
('AttentionPool2d', {'in_features': 64, 'feat_size': 7}, (2, 64, 7, 7)),
222+
])
223+
def test_attention_pool2d_class_token(self, pool_cls, base_kwargs, input_shape):
224+
import timm.layers as layers
225+
pool = getattr(layers, pool_cls)(**base_kwargs, class_token=True).to(torch_device)
226+
assert pool.cls_token is not None
227+
x = torch.randn(*input_shape, device=torch_device)
228+
out = pool(x)
229+
assert out.shape == (2, 64)
230+
231+
@pytest.mark.parametrize('out_features,embed_dim,expected_out', [
232+
(None, None, 64), # default: out_features = in_features
233+
(None, 128, 64), # default with different embed_dim
234+
(32, None, 32), # explicit out_features
235+
(32, 128, 32), # explicit out_features with different embed_dim
236+
(0, None, 64), # disabled projection, out = embed_dim = in_features
237+
(0, 128, 128), # disabled projection, out = embed_dim
238+
])
239+
def test_attention_pool_latent_out_features(self, out_features, embed_dim, expected_out):
240+
from timm.layers import AttentionPoolLatent
241+
kwargs = {'in_features': 64, 'num_heads': 4}
242+
if out_features is not None:
243+
kwargs['out_features'] = out_features
244+
if embed_dim is not None:
245+
kwargs['embed_dim'] = embed_dim
246+
pool = AttentionPoolLatent(**kwargs).to(torch_device)
247+
assert pool.out_features == expected_out
248+
if out_features == 0:
249+
assert isinstance(pool.proj, nn.Identity)
250+
assert pool.mlp is None
251+
else:
252+
assert isinstance(pool.proj, nn.Linear)
253+
assert pool.mlp is not None
254+
in_dim = embed_dim or 64
255+
x = torch.randn(2, 49, in_dim, device=torch_device)
256+
out = pool(x)
257+
assert out.shape == (2, expected_out)
258+
140259

141260
# LSE Pool Tests
142261

timm/layers/attention_pool.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
class AttentionPoolLatent(nn.Module):
1414
""" Attention pooling w/ latent query
15+
16+
Setting out_features=0 disables the output projection, norm, and MLP layers (pre_logits mode).
1517
"""
1618
fused_attn: torch.jit.Final[bool]
1719

@@ -38,7 +40,8 @@ def __init__(
3840
dd = {'device': device, 'dtype': dtype}
3941
super().__init__()
4042
embed_dim = embed_dim or in_features
41-
out_features = out_features or in_features
43+
if out_features is None:
44+
out_features = in_features
4245
assert embed_dim % num_heads == 0
4346
self.num_heads = num_heads
4447
self.head_dim = embed_dim // num_heads
@@ -66,11 +69,20 @@ def __init__(
6669
else:
6770
self.q_norm = nn.Identity()
6871
self.k_norm = nn.Identity()
69-
self.proj = nn.Linear(embed_dim, embed_dim, **dd)
70-
self.proj_drop = nn.Dropout(drop)
7172

72-
self.norm = norm_layer(out_features, **dd) if norm_layer is not None else nn.Identity()
73-
self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio), act_layer=act_layer, **dd)
73+
if out_features > 0:
74+
self.proj = nn.Linear(embed_dim, out_features, **dd)
75+
self.proj_drop = nn.Dropout(drop)
76+
self.norm = norm_layer(out_features, **dd) if norm_layer is not None else nn.Identity()
77+
self.mlp = Mlp(out_features, int(out_features * mlp_ratio), out_features=out_features, act_layer=act_layer, **dd)
78+
else:
79+
self.proj = nn.Identity()
80+
self.proj_drop = nn.Dropout(drop)
81+
self.norm = nn.Identity()
82+
self.mlp = None
83+
out_features = embed_dim
84+
85+
self.out_features = out_features
7486

7587
self.init_weights()
7688

@@ -106,7 +118,8 @@ def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
106118
x = self.proj(x)
107119
x = self.proj_drop(x)
108120

109-
x = x + self.mlp(self.norm(x))
121+
if self.mlp is not None:
122+
x = x + self.mlp(self.norm(x))
110123

111124
# optional pool if latent seq_len > 1 and pooled output is desired
112125
if self.pool == 'token':

timm/layers/attention_pool2d.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ class RotAttentionPool2d(nn.Module):
2828
2929
NOTE: While this impl does not require a fixed feature size, performance at differing resolutions from
3030
train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW
31+
32+
Setting out_features=0 disables the output projection (pre_logits mode).
3133
"""
3234
fused_attn: torch.jit.Final[bool]
3335

@@ -53,7 +55,12 @@ def __init__(
5355
assert pool_type in ('', 'token')
5456
self.embed_dim = embed_dim = embed_dim or in_features
5557
self.in_features = in_features
56-
self.out_features = out_features or in_features
58+
if out_features is None:
59+
self.out_features = in_features
60+
elif out_features > 0:
61+
self.out_features = out_features
62+
else:
63+
self.out_features = embed_dim # out_features=0 disables projection
5764
ref_feat_size = to_2tuple(ref_feat_size)
5865
if num_heads is not None:
5966
assert embed_dim % num_heads == 0
@@ -81,7 +88,7 @@ def __init__(
8188
else:
8289
self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias, **dd)
8390
self.drop = nn.Dropout(drop_rate)
84-
self.proj = nn.Linear(embed_dim, self.out_features, **dd)
91+
self.proj = nn.Linear(embed_dim, self.out_features, **dd) if out_features != 0 else nn.Identity()
8592

8693
self.pos_embed = create_rope_embed(
8794
rope_type=rope_type,
@@ -113,7 +120,7 @@ def reset(self, num_classes: Optional[int] = None, pool_type: Optional[str] = No
113120
assert pool_type in ('', 'token')
114121
self.pool_type = pool_type
115122
if num_classes is not None:
116-
self.proj = nn.Linear(self.in_features, num_classes) if num_classes > 0 else nn.Identity()
123+
self.proj = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
117124
self.out_features = num_classes if num_classes > 0 else self.embed_dim
118125

119126
def _pool(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
@@ -172,6 +179,8 @@ class AttentionPool2d(nn.Module):
172179
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py
173180
174181
NOTE: This requires feature size upon construction and will prevent adaptive sizing of the network.
182+
183+
Setting out_features=0 disables the output projection (pre_logits mode).
175184
"""
176185
fused_attn: torch.jit.Final[bool]
177186

@@ -196,7 +205,12 @@ def __init__(
196205
assert pool_type in ('', 'token')
197206
self.embed_dim = embed_dim = embed_dim or in_features
198207
self.in_features = in_features
199-
self.out_features = out_features or in_features
208+
if out_features is None:
209+
self.out_features = in_features
210+
elif out_features > 0:
211+
self.out_features = out_features
212+
else:
213+
self.out_features = embed_dim # out_features=0 disables projection
200214
if num_heads is not None:
201215
assert embed_dim % num_heads == 0
202216
head_dim = embed_dim // num_heads
@@ -225,7 +239,7 @@ def __init__(
225239
self.q = self.k = self.v = None
226240
self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias, **dd)
227241
self.drop = nn.Dropout(drop_rate)
228-
self.proj = nn.Linear(embed_dim, self.out_features, **dd)
242+
self.proj = nn.Linear(embed_dim, self.out_features, **dd) if out_features != 0 else nn.Identity()
229243
self.pos_embed = nn.Parameter(torch.zeros(self.seq_len + 1, in_features, **dd))
230244

231245
self.init_weights()
@@ -251,7 +265,7 @@ def reset(self, num_classes: Optional[int] = None, pool_type: Optional[str] = No
251265
assert pool_type in ('', 'token')
252266
self.pool_type = pool_type
253267
if num_classes is not None:
254-
self.proj = nn.Linear(self.in_features, num_classes) if num_classes > 0 else nn.Identity()
268+
self.proj = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
255269
self.out_features = num_classes if num_classes > 0 else self.embed_dim
256270

257271
def _pool(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:

0 commit comments

Comments
 (0)