huggingface
diff --git a/‎docs/source/en/api/pipelines/cosmos.md‎
Lines changed: 14 additions & 6 deletions b/‎docs/source/en/api/pipelines/cosmos.md‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎docs/source/en/training/distributed_inference.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/en/training/distributed_inference.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/convert_cosmos_to_diffusers.py‎
Lines changed: 21 additions & 3 deletions b/‎scripts/convert_cosmos_to_diffusers.py‎
Lines changed: 21 additions & 3 deletions
diff --git a/‎src/diffusers/models/attention_dispatch.py‎
Lines changed: 101 additions & 68 deletions b/‎src/diffusers/models/attention_dispatch.py‎
Lines changed: 101 additions & 68 deletions
diff --git a/‎src/diffusers/models/controlnets/controlnet_cosmos.py‎
Lines changed: 6 additions & 1 deletion b/‎src/diffusers/models/controlnets/controlnet_cosmos.py‎
Lines changed: 6 additions & 1 deletion
@@ -46,6 +46,20 @@ output = pipe(
 output.save("output.png")
 ```
 
+## Cosmos2_5_TransferPipeline
+
+[[autodoc]] Cosmos2_5_TransferPipeline
+  - all
+  - __call__
+
+
+## Cosmos2_5_PredictBasePipeline
+
+[[autodoc]] Cosmos2_5_PredictBasePipeline
+  - all
+  - __call__
+
+
 ## CosmosTextToWorldPipeline
 
 [[autodoc]] CosmosTextToWorldPipeline
@@ -70,12 +84,6 @@ output.save("output.png")
   - all
   - __call__
 
-## Cosmos2_5_PredictBasePipeline
-
-[[autodoc]] Cosmos2_5_PredictBasePipeline
-  - all
-  - __call__
-
 ## CosmosPipelineOutput
 
 [[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
 
@@ -111,7 +111,7 @@ if __name__ == "__main__":
 Call `torchrun` to run the inference script and use the `--nproc_per_node` argument to set the number of GPUs to use.
 
 ```bash
-torchrun run_distributed.py --nproc_per_node=2
+torchrun --nproc_per_node=2 run_distributed.py
 ```
 
 ## device_map
 
@@ -94,9 +94,15 @@
     --transformer_type Cosmos-2.5-Transfer-General-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/transfer/2b/general/depth \
+    --output_path converted/transfer/2b/general/depth/pipeline \
     --save_pipeline
 
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Transfer-General-2B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/transfer/2b/general/depth/models
+
 # edge
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Transfer2.5-2B/snapshots/eb5325b77d358944da58a690157dd2b8071bbf85/general/edge/61f5694b-0ad5-4ecd-8ad7-c8545627d125_ema_bf16.pt
 
@@ -120,18 +126,30 @@
     --transformer_type Cosmos-2.5-Transfer-General-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/transfer/2b/general/blur \
+    --output_path converted/transfer/2b/general/blur/pipeline \
     --save_pipeline
 
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Transfer-General-2B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/transfer/2b/general/blur/models
+
 # seg
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Transfer2.5-2B/snapshots/eb5325b77d358944da58a690157dd2b8071bbf85/general/seg/5136ef49-6d8d-42e8-8abf-7dac722a304a_ema_bf16.pt
 
 python scripts/convert_cosmos_to_diffusers.py \
     --transformer_type Cosmos-2.5-Transfer-General-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/transfer/2b/general/seg \
+    --output_path converted/transfer/2b/general/seg/pipeline \
     --save_pipeline
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Transfer-General-2B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/transfer/2b/general/seg/models
 ```
 """
 
 
@@ -329,7 +329,11 @@ class _HubKernelConfig:
 _HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
     # TODO: temporary revision for now. Remove when merged upstream into `main`.
     AttentionBackendName._FLASH_3_HUB: _HubKernelConfig(
-        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", revision="fake-ops-return-probs"
+        repo_id="kernels-community/flash-attn3",
+        function_attr="flash_attn_func",
+        revision="fake-ops-return-probs",
+        wrapped_forward_attr="flash_attn_interface._flash_attn_forward",
+        wrapped_backward_attr="flash_attn_interface._flash_attn_backward",
     ),
     AttentionBackendName._FLASH_3_VARLEN_HUB: _HubKernelConfig(
         repo_id="kernels-community/flash-attn3",
@@ -729,7 +733,7 @@ def _wrapped_flash_attn_3(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     # Hardcoded for now because pytorch does not support tuple/int type hints
     window_size = (-1, -1)
-    out, lse, *_ = flash_attn_3_func(
+    result = flash_attn_3_func(
         q=q,
         k=k,
         v=v,
@@ -746,7 +750,9 @@ def _wrapped_flash_attn_3(
         pack_gqa=pack_gqa,
         deterministic=deterministic,
         sm_margin=sm_margin,
+        return_attn_probs=True,
     )
+    out, lse, *_ = result
     lse = lse.permute(0, 2, 1)
     return out, lse
 
@@ -1290,36 +1296,62 @@ def _flash_attention_3_hub_forward_op(
     if enable_gqa:
         raise ValueError("`enable_gqa` is not yet supported for flash-attn 3 hub kernels.")
 
-    func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB].kernel_fn
-    out = func(
-        q=query,
-        k=key,
-        v=value,
-        softmax_scale=scale,
+    config = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB]
+    wrapped_forward_fn = config.wrapped_forward_fn
+    if wrapped_forward_fn is None:
+        raise RuntimeError(
+            "Flash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_forward` "
+            "for context parallel execution."
+        )
+
+    if scale is None:
+        scale = query.shape[-1] ** (-0.5)
+
+    out, softmax_lse, *_ = wrapped_forward_fn(
+        query,
+        key,
+        value,
+        None,
+        None,  # k_new, v_new
+        None,  # qv
+        None,  # out
+        None,
+        None,
+        None,  # cu_seqlens_q/k/k_new
+        None,
+        None,  # seqused_q/k
+        None,
+        None,  # max_seqlen_q/k
+        None,
+        None,
+        None,  # page_table, kv_batch_idx, leftpad_k
+        None,
+        None,
+        None,  # rotary_cos/sin, seqlens_rotary
+        None,
+        None,
+        None,  # q_descale, k_descale, v_descale
+        scale,
         causal=is_causal,
-        qv=None,
-        q_descale=None,
-        k_descale=None,
-        v_descale=None,
-        window_size=window_size,
+        window_size_left=window_size[0],
+        window_size_right=window_size[1],
+        attention_chunk=0,
         softcap=softcap,
         num_splits=num_splits,
         pack_gqa=pack_gqa,
-        deterministic=deterministic,
         sm_margin=sm_margin,
-        return_attn_probs=return_lse,
     )
 
-    lse = None
-    if return_lse:
-        out, lse = out
-        lse = lse.permute(0, 2, 1).contiguous()
+    lse = softmax_lse.permute(0, 2, 1).contiguous() if return_lse else None
 
     if _save_ctx:
-        ctx.save_for_backward(query, key, value)
+        ctx.save_for_backward(query, key, value, out, softmax_lse)
         ctx.scale = scale
         ctx.is_causal = is_causal
-        ctx._hub_kernel = func
+        ctx.window_size = window_size
+        ctx.softcap = softcap
+        ctx.deterministic = deterministic
+        ctx.sm_margin = sm_margin
 
     return (out, lse) if return_lse else out
 
@@ -1328,55 +1360,50 @@ def _flash_attention_3_hub_backward_op(
     ctx: torch.autograd.function.FunctionCtx,
     grad_out: torch.Tensor,
     *args,
-    window_size: tuple[int, int] = (-1, -1),
-    softcap: float = 0.0,
-    num_splits: int = 1,
-    pack_gqa: bool | None = None,
-    deterministic: bool = False,
-    sm_margin: int = 0,
+    **kwargs,
 ):
-    query, key, value = ctx.saved_tensors
-    kernel_fn = ctx._hub_kernel
-    # NOTE: Unlike the FA2 hub kernel, the FA3 hub kernel does not expose separate wrapped forward/backward
-    # primitives (no `wrapped_forward_attr`/`wrapped_backward_attr` in its `_HubKernelConfig`). We
-    # therefore rerun the forward pass under `torch.enable_grad()` and differentiate through it with
-    # `torch.autograd.grad()`. This is a second forward pass during backward; it can be avoided once
-    # the FA3 hub exposes a dedicated fused backward kernel (analogous to `_wrapped_flash_attn_backward`
-    # in the FA2 hub), at which point this can be refactored to match `_flash_attention_hub_backward_op`.
-    with torch.enable_grad():
-        query_r = query.detach().requires_grad_(True)
-        key_r = key.detach().requires_grad_(True)
-        value_r = value.detach().requires_grad_(True)
-
-        out = kernel_fn(
-            q=query_r,
-            k=key_r,
-            v=value_r,
-            softmax_scale=ctx.scale,
-            causal=ctx.is_causal,
-            qv=None,
-            q_descale=None,
-            k_descale=None,
-            v_descale=None,
-            window_size=window_size,
-            softcap=softcap,
-            num_splits=num_splits,
-            pack_gqa=pack_gqa,
-            deterministic=deterministic,
-            sm_margin=sm_margin,
-            return_attn_probs=False,
-        )
-        if isinstance(out, tuple):
-            out = out[0]
-
-        grad_query, grad_key, grad_value = torch.autograd.grad(
-            out,
-            (query_r, key_r, value_r),
-            grad_out,
-            retain_graph=False,
-            allow_unused=False,
+    config = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB]
+    wrapped_backward_fn = config.wrapped_backward_fn
+    if wrapped_backward_fn is None:
+        raise RuntimeError(
+            "Flash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_backward` "
+            "for context parallel execution."
         )
 
+    query, key, value, out, softmax_lse = ctx.saved_tensors
+    grad_query = torch.empty_like(query)
+    grad_key = torch.empty_like(key)
+    grad_value = torch.empty_like(value)
+
+    wrapped_backward_fn(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        softmax_lse,
+        None,
+        None,  # cu_seqlens_q, cu_seqlens_k
+        None,
+        None,  # seqused_q, seqused_k
+        None,
+        None,  # max_seqlen_q, max_seqlen_k
+        grad_query,
+        grad_key,
+        grad_value,
+        ctx.scale,
+        ctx.is_causal,
+        ctx.window_size[0],
+        ctx.window_size[1],
+        ctx.softcap,
+        ctx.deterministic,
+        ctx.sm_margin,
+    )
+
+    grad_query = grad_query[..., : grad_out.shape[-1]]
+    grad_key = grad_key[..., : grad_out.shape[-1]]
+    grad_value = grad_value[..., : grad_out.shape[-1]]
+
     return grad_query, grad_key, grad_value
 
 
@@ -2676,7 +2703,7 @@ def _flash_varlen_attention_3(
     key_packed = torch.cat(key_valid, dim=0)
     value_packed = torch.cat(value_valid, dim=0)
 
-    out, lse, *_ = flash_attn_3_varlen_func(
+    result = flash_attn_3_varlen_func(
         q=query_packed,
         k=key_packed,
         v=value_packed,
@@ -2686,7 +2713,13 @@ def _flash_varlen_attention_3(
         max_seqlen_k=max_seqlen_k,
         softmax_scale=scale,
         causal=is_causal,
+        return_attn_probs=return_lse,
     )
+    if isinstance(result, tuple):
+        out, lse, *_ = result
+    else:
+        out = result
+        lse = None
     out = out.unflatten(0, (batch_size, -1))
 
     return (out, lse) if return_lse else out
 
@@ -191,7 +191,12 @@ def forward(
                 dim=1,
             )
 
-        control_hidden_states = torch.cat([control_hidden_states, torch.zeros_like(controls_latents[:, :1])], dim=1)
+        if condition_mask is not None:
+            control_hidden_states = torch.cat([control_hidden_states, condition_mask], dim=1)
+        else:
+            control_hidden_states = torch.cat(
+                [control_hidden_states, torch.zeros_like(controls_latents[:, :1])], dim=1
+            )
 
         padding_mask_resized = transforms.functional.resize(
             padding_mask, list(control_hidden_states.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST