fla-org
diff --git a/‎fla/modules/convolution.py‎
Lines changed: 167 additions & 89 deletions b/‎fla/modules/convolution.py‎
Lines changed: 167 additions & 89 deletions
@@ -3,6 +3,7 @@
 # from https://github.com/HazyResearch/zoology/blob/main/zoology/mixers/convolution.py
 
 import math
+import warnings
 from typing import Optional, Tuple
 
 import torch
@@ -12,15 +13,15 @@
 import triton.language as tl
 from einops import rearrange
 
-from fla.modules.activations import ACT2FN
 from fla.ops.utils import prepare_chunk_indices, prepare_sequence_ids
-from fla.utils import checkpoint, get_multiprocessor_count, input_guard
+from fla.utils import get_multiprocessor_count, input_guard
 
 try:
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+    from causal_conv1d import causal_conv1d_fn
+    from causal_conv1d import causal_conv1d_update as causal_conv1d_update_cuda
 except ImportError:
     causal_conv1d_fn = None
-    causal_conv1d_update = None
+    causal_conv1d_update_cuda = None
 
 
 @triton.heuristics({
@@ -74,8 +75,8 @@ def causal_conv1d_fwd_kernel(
     m_d = o_d < D
 
     if HAS_WEIGHT:
-        # [D, W]
-        b_w = tl.load(weight + o_d[:, None] * W + o_w, mask=m_d[:, None], other=0)
+        # [BD, W]
+        b_w = tl.load(weight + o_d[:, None] * W + o_w, mask=m_d[:, None], other=0).to(tl.float32)
 
     b_y = tl.zeros((BT, BD), dtype=tl.float32)
     for i_w in tl.static_range(-W + 1, 1):
@@ -87,7 +88,7 @@ def causal_conv1d_fwd_kernel(
             b_yi *= tl.sum(b_w * (o_w == (i_w + W - 1)), 1)
         b_y += b_yi
     if HAS_BIAS:
-        b_y += tl.load(bias + o_d, mask=m_d)
+        b_y += tl.load(bias + o_d, mask=m_d).to(tl.float32)
 
     if ACTIVATION == 'swish' or ACTIVATION == 'silu':
         b_y = b_y * tl.sigmoid(b_y)
@@ -159,7 +160,7 @@ def causal_conv1d_bwd_kernel(
     if HAS_WEIGHT:
         p_x = tl.make_block_ptr(x + bos * D, (T, D), (D, 1), (i_t * BT, i_d * BD), (BT, BD), (1, 0))
         b_x = tl.load(p_x, boundary_check=(0, 1))
-        # [D, W]
+        # [BD, W]
         b_w = tl.load(weight + o_d[:, None] * W + o_w, mask=m_d[:, None], other=0)
 
     b_dx = tl.zeros((BT, BD), dtype=tl.float32)
@@ -196,6 +197,65 @@ def causal_conv1d_bwd_kernel(
     tl.store(p_dx, tl.cast(b_dx, dtype=p_dx.dtype.element_ty, fp_downcast_rounding='rtne'), boundary_check=(0, 1))
 
 
+@triton.heuristics({
+    'HAS_WEIGHT': lambda args: args['weight'] is not None,
+    'HAS_BIAS': lambda args: args['bias'] is not None,
+    'HAS_RESIDUAL': lambda args: args['residual'] is not None,
+})
+@triton.jit
+def causal_conv1d_update_kernel(
+    x,
+    cache,
+    residual,
+    y,
+    weight,
+    bias,
+    D: tl.constexpr,
+    W: tl.constexpr,
+    BD: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+):
+    i_d, i_n = tl.program_id(0), tl.program_id(1)
+
+    o_d = i_d * BD + tl.arange(0, BD)
+    o_w = tl.arange(0, W)
+    m_d = o_d < D
+    m_c = o_w < W - 1
+
+    # [BD]
+    b_x = tl.load(x + i_n * D + o_d, mask=m_d, other=0).to(tl.float32)
+
+    # shift the cache by 1 with the last one being discarded
+    p_cache = tl.make_block_ptr(cache + i_n * D*W, (D, W), (W, 1), (i_d * BD, 1), (BD, W), (1, 0))
+    # [BD, W]
+    b_cache = tl.load(p_cache, boundary_check=(0, 1)).to(tl.float32)
+    b_cache = tl.where(m_c[None, :], b_cache, b_x[:, None])
+
+    if HAS_WEIGHT:
+        b_w = tl.load(weight + o_d[:, None] * W + o_w, mask=m_d[:, None], other=0)
+        b_y = tl.sum(b_cache * b_w, 1)
+    else:
+        b_y = tl.sum(b_cache, 1)
+    if HAS_BIAS:
+        b_y += tl.load(bias + o_d, mask=m_d)
+
+    if ACTIVATION == 'swish' or ACTIVATION == 'silu':
+        b_y = b_y * tl.sigmoid(b_y)
+
+    if HAS_RESIDUAL:
+        b_y += tl.load(residual + i_n * D + o_d, mask=m_d, other=0)
+
+    tl.store(y + i_n * D + o_d, tl.cast(b_y, dtype=y.dtype.element_ty, fp_downcast_rounding='rtne'), mask=m_d)
+
+    b_cache = tl.cast(b_cache, dtype=cache.dtype.element_ty, fp_downcast_rounding='rtne')
+    # update the cache in-place
+    p_cache = tl.make_block_ptr(cache + i_n * D*W, (D, W), (W, 1), (i_d * BD, 0), (BD, W), (1, 0))
+    tl.store(p_cache, b_cache, boundary_check=(0, 1))
+
+
 def causal_conv1d_fwd(
     x: torch.Tensor,
     weight: torch.Tensor,
@@ -236,17 +296,18 @@ def grid(meta): return (triton.cdiv(D, meta['BD']), NT, B)
 
 def causal_conv1d_bwd(
     x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    residual: torch.Tensor,
     dy: torch.Tensor,
+    weight: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    residual: Optional[torch.Tensor] = None,
     activation: Optional[str] = None,
     cu_seqlens: Optional[torch.Tensor] = None
 ):
     shape = x.shape
     if x.shape[-1] != weight.shape[0]:
         x = rearrange(x, 'b t ... -> b t (...)')
-    B, T, D, W = *x.shape, weight.shape[1]
+    B, T, D = x.shape
+    W = weight.shape[1] if weight is not None else None
     BT = min(64, triton.next_power_of_2(triton.cdiv(max(16, B*T), get_multiprocessor_count(x.device.index))))
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
@@ -295,6 +356,42 @@ def grid(meta): return (triton.cdiv(D, meta['BD']), NT, B)
     return dx.view(shape), dw, db, dr
 
 
+@input_guard
+def causal_conv1d_update(
+    x: torch.Tensor,
+    cache: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    weight: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    activation: Optional[str] = None
+) -> torch.Tensor:
+    shape = x.shape
+    if weight is not None and x.shape[-1] != weight.shape[0]:
+        x = rearrange(x, 'b t ... -> b t (...)')
+    *_, D = x.shape
+    N = x.numel() // D
+    W = weight.shape[1] if weight is not None else None
+    BD = 16
+
+    y = torch.empty_like(x)
+    # NOTE: autotuning is disabled as cache is updated in-place
+    def grid(meta): return (triton.cdiv(D, meta['BD']), N)
+    causal_conv1d_update_kernel[grid](
+        x=x,
+        cache=cache,
+        residual=residual,
+        y=y,
+        weight=weight,
+        bias=bias,
+        D=D,
+        W=W,
+        BD=BD,
+        ACTIVATION=activation,
+        num_warps=32,
+    )
+    return y.view(shape), cache
+
+
 class CausalConv1dFunction(torch.autograd.Function):
 
     @staticmethod
@@ -319,9 +416,9 @@ def backward(ctx, dy: torch.Tensor):
         x, weight, bias, residual = ctx.saved_tensors
         dx, dw, db, dr = causal_conv1d_bwd(
             x=x,
+            dy=dy,
             weight=weight,
             bias=bias,
-            dy=dy,
             residual=residual,
             activation=ctx.activation,
             cu_seqlens=ctx.cu_seqlens
@@ -338,10 +435,10 @@ def causal_conv1d(
     cu_seqlens: Optional[torch.Tensor] = None
 ):
     """
-    Implementation of Causal Conv1d layer used by Mamba/Mamba2 and DeltaNet.
+    A causal 1D convolution implementation that powers Mamba/Mamba2 and DeltaNet architectures.
 
-    If `residual` is provided, it functions as the Canon operation
-    as mentioned in https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5240330.
+    When a residual connection is provided, this implements the Canon operation
+    described in the paper at https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5240330.
 
     Args:
         x:
@@ -386,39 +483,6 @@ def fft_conv(u, k, dropout_mask, gelu=True, k_rev=None):
         return out.to(dtype=u.dtype)
 
 
-@checkpoint
-def proj_then_conv1d(
-    x: torch.Tensor,
-    proj_weight: torch.Tensor,
-    conv1d_weight: torch.Tensor,
-    conv1d_bias: Optional[torch.Tensor] = None,
-    cache: Optional[torch.Tensor] = None
-) -> torch.Tensor:
-    # We do matmul and transpose BLH -> HBL at the same time
-    x = rearrange(proj_weight @ rearrange(x, "b t d -> d (b t)"), "d (b t) -> b d t", t=x.shape[-2])
-
-    if causal_conv1d_fn is None:
-        raise ImportError("`causal_conv1d_fn` is not available. Please install `causal-conv1d` first.")
-    if cache is None:
-        x = causal_conv1d_fn(
-            x=x,
-            weight=rearrange(conv1d_weight, "d 1 w -> d w"),
-            bias=conv1d_bias,
-            activation="silu",
-        ).transpose(1, 2)
-    else:
-        assert x.shape[-1] == 1, "Only support decoding with 1 token at a time for now"
-        x = x.squeeze(-1)
-        x = causal_conv1d_update(
-            x=x,
-            weight=rearrange(conv1d_weight, "d 1 w -> d w"),
-            bias=conv1d_bias,
-            cache=cache,
-            activation="silu",
-        )
-    return x
-
-
 @triton.jit
 def causal_conv1d_varlen_states_fwd_kernel(
     x,
@@ -478,6 +542,7 @@ def __init__(
         use_fast_conv1d: Optional[bool] = True,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
+        **kwargs,
     ):
         super().__init__(
             in_channels=hidden_size,
@@ -492,11 +557,22 @@ def __init__(
 
         self.hidden_size = hidden_size
         self.activation = None
+
+        if kernel_size % 2 != 0:
+            raise ValueError("Kernel size must be the power of 2")
         if activation is not None:
             assert activation in ['silu', 'swish'], f"Activation `{activation}` not supported yet."
             self.activation = activation
 
         self.use_fast_conv1d = use_fast_conv1d
+        if use_fast_conv1d:
+            if causal_conv1d_fn is None:
+                warnings.warn(
+                    "The `use_fast_conv1d` parameter is set to `True`, but `causal_conv1d_fn` is not available. "
+                    "Switching to the Triton implementation instead. "
+                    "Consider installing `causal_conv1d` to enable the CUDA implementation."
+                )
+                self.use_fast_conv1d = False
 
     def extra_repr(self):
         s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
@@ -568,7 +644,7 @@ def forward(
             else:
                 cache[:, :, -min(W, T):].copy_(rearrange(x[..., -min(W, T):, :], 'n w d -> n d w'))
 
-        if self.use_fast_conv1d and causal_conv1d_fn is None:
+        if not self.use_fast_conv1d:
             y = causal_conv1d(
                 x=x,
                 weight=rearrange(self.weight, "d 1 w -> d w"),
@@ -580,29 +656,27 @@ def forward(
             return y, cache
 
         x = rearrange(x, 'b t d -> b d t')
-        if self.use_fast_conv1d:
-            # Sequence index for each token. Used for varlen.
-            # Suppose a batch consists of two sequences with lengths 3 and 4,
-            # seq_idx=[0, 0, 0, 1, 1, 1, 1] for this batch.
-            # NOTE: No need to provide this arg if `cu_seqlens` is passed.
-            # This arg is just for BC, and will be removed in the future.
-            # [B, T]
-            seq_idx = kwargs.get('seq_idx', None)
-            if cu_seqlens is not None and seq_idx is None:
-                seq_idx = prepare_sequence_ids(cu_seqlens).to(torch.int32).unsqueeze(0)
-            y = causal_conv1d_fn(
-                x=x,
-                weight=rearrange(self.weight, "d 1 w -> d w"),
-                bias=self.bias,
-                activation=self.activation,
-                seq_idx=seq_idx,
-            )
-        else:
-            if cu_seqlens is not None:
-                raise ValueError("`cu_seqlens` is not supported for the naive Pytorch version")
-            y = self._conv_forward(x, self.weight, self.bias)[..., :x.shape[-1]]
-            if self.activation is not None:
-                y = ACT2FN[self.activation](x)
+        # Sequence index for each token. Used for varlen.
+        # Suppose a batch consists of two sequences with lengths 3 and 4,
+        # seq_idx=[0, 0, 0, 1, 1, 1, 1] for this batch.
+        # NOTE: No need to provide this arg if `cu_seqlens` is passed.
+        # This arg is just for BC, and will be removed in the future.
+        # [B, T]
+        seq_idx = kwargs.get('seq_idx', None)
+        if cu_seqlens is not None and seq_idx is None:
+            seq_idx = prepare_sequence_ids(cu_seqlens).to(torch.int32).unsqueeze(0)
+
+        # equivalent to:
+        # y = self._conv_forward(x, self.weight, self.bias)[..., :x.shape[-1]]
+        # if self.activation is not None:
+        #     y = ACT2FN[self.activation](x)
+        y = causal_conv1d_fn(
+            x=x,
+            weight=rearrange(self.weight, "d 1 w -> d w"),
+            bias=self.bias,
+            activation=self.activation,
+            seq_idx=seq_idx,
+        )
         y = rearrange(y, 'b d t -> b t d')
         if residual is not None:
             y = y + residual
@@ -615,26 +689,30 @@ def step(
         cache: torch.Tensor,
         cu_seqlens: Optional[torch.LongTensor] = None
     ):
-        shape = x.shape
-        x = x.squeeze(0) if cu_seqlens is not None else x.squeeze(1)
-        if self.use_fast_conv1d:
-            y = causal_conv1d_update(
+        # NOTE: we follow the fast mode that updates the cache in-place
+        if not self.use_fast_conv1d:
+            return causal_conv1d_update(
                 x=x,
-                conv_state=cache,
-                weight=rearrange(self.weight, "d 1 w -> d w"),
+                cache=cache,
+                residual=residual,
+                weight=self.weight,
                 bias=self.bias,
                 activation=self.activation,
             )
-        else:
-            dtype = x.dtype
-            # we follow the fast mode that updates the cache in-place
-            cache.copy_(cache.roll(shifts=-1, dims=-1))
-            cache[:, :, -1] = x
-            y = torch.sum(cache * rearrange(self.weight, "d 1 w -> d w"), dim=-1)
-            if self.bias is not None:
-                y = y + self.bias
-            if self.activation is not None:
-                y = ACT2FN[self.activation](y).to(dtype=dtype)
+
+        shape = x.shape
+        x = x.squeeze(0) if cu_seqlens is not None else x.squeeze(1)
+        # equivalent to:
+        # cache.copy_(cache.roll(shifts=-1, dims=-1))
+        # cache[:, :, -1] = x
+        # y = torch.sum(cache * rearrange(self.weight, "d 1 w -> d w"), dim=-1)
+        y = causal_conv1d_update_cuda(
+            x=x,
+            conv_state=cache,
+            weight=rearrange(self.weight, "d 1 w -> d w"),
+            bias=self.bias,
+            activation=self.activation,
+        )
         y = y.view(shape)
         if residual is not None:
             y = y + residual