Skip to content

Commit c3df9d2

Browse files
committed
support fp8 query
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
1 parent 80bd4c7 commit c3df9d2

14 files changed

+172
-62
lines changed

csrc/flash_attn/flash_api.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ std::vector<at::Tensor> mha_varlen_fwd(
4949
int max_seqlen_q,
5050
int max_seqlen_k,
5151
float p_dropout,
52+
std::optional<const at::Tensor>& q_scale,
5253
std::optional<const at::Tensor>& k_scale,
5354
std::optional<const at::Tensor>& v_scale,
5455
float softmax_scale,
@@ -63,21 +64,17 @@ std::vector<at::Tensor> mha_varlen_fwd(
6364
std::optional<int> num_splits) {
6465
auto q_type = q.scalar_type();
6566
auto k_type = k.scalar_type();
66-
TORCH_CHECK(
67-
q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16,
68-
"VLLM Kernel XPU only supports fp16 and bf16 type");
67+
auto v_type = v.scalar_type();
6968

7069
TORCH_CHECK(
7170
v.scalar_type() == k_type, "key and value must have the same dtype");
72-
bool is_fp8kv = false;
73-
if (k_type == at::ScalarType::Float8_e5m2 ||
74-
k_type == at::ScalarType::Float8_e4m3fn) {
75-
is_fp8kv = true;
76-
} else {
71+
bool is_fp8_q = q_type == at::ScalarType::Float8_e5m2 ||
72+
q_type == at::ScalarType::Float8_e4m3fn;
73+
bool is_fp8kv = k_type == at::ScalarType::Float8_e5m2 ||
74+
k_type == at::ScalarType::Float8_e4m3fn;
75+
if (is_fp8kv == is_fp8_q) {
7776
TORCH_CHECK(
7877
k.scalar_type() == q_type, "query and key must have the same dtype");
79-
TORCH_CHECK(
80-
v.scalar_type() == q_type, "query and value must have the same dtype");
8178
}
8279

8380
CHECK_DEVICE(q);
@@ -128,6 +125,10 @@ std::vector<at::Tensor> mha_varlen_fwd(
128125
} else {
129126
out = torch::empty_like(q);
130127
}
128+
TORCH_CHECK(
129+
out.scalar_type() == at::ScalarType::Half ||
130+
out.scalar_type() == at::ScalarType::BFloat16,
131+
"VLLM Kernel XPU only supports fp16 and bf16 type");
131132

132133
bool is_varlen = true;
133134
bool is_local = (window_size_left != -1) | (window_size_right != -1);
@@ -147,6 +148,7 @@ std::vector<at::Tensor> mha_varlen_fwd(
147148
seqlens_k,
148149
max_seqlen_q,
149150
max_seqlen_k,
151+
q_scale,
150152
k_scale,
151153
v_scale,
152154
softmax_scale,
@@ -227,8 +229,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
227229
"cu_seqlens_q, "
228230
"Tensor cu_seqlens_k, Tensor? seqused_k, Tensor? leftpad_k, Tensor? "
229231
"block_table, Tensor? alibi_slopes, "
230-
"int max_seqlen_q, int max_seqlen_k, float p_dropout, Tensor? k_scale, "
231-
"Tensor? v_scale, "
232+
"int max_seqlen_q, int max_seqlen_k, float p_dropout, Tensor? q_scale, "
233+
"Tensor? k_scale, Tensor? v_scale, "
232234
"float softmax_scale, Tensor? softmax_sink, bool zero_tensors, "
233235
"bool is_causal, int window_size_left, int window_size_right, float "
234236
"softcap, bool return_softmax, "

csrc/xpu/attn/attn_interface.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ void cutlass_chunk_prefill_interface(
1717
const at::Tensor& cu_seqlens_k,
1818
int max_seqlen_q,
1919
int max_seqlen_k,
20+
std::optional<const at::Tensor>& q_scale,
2021
std::optional<const at::Tensor>& k_scale,
2122
std::optional<const at::Tensor>& v_scale,
2223
double sm_scale,
@@ -42,6 +43,7 @@ void cutlass_chunk_prefill_interface(
4243
cu_seqlens_k,
4344
max_seqlen_q,
4445
max_seqlen_k,
46+
q_scale,
4547
k_scale,
4648
v_scale,
4749
sm_scale,

csrc/xpu/attn/attn_interface.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ void cutlass_chunk_prefill_interface(
1010
const at::Tensor& cu_seqlens_k,
1111
int max_seqlen_q,
1212
int max_seqlen_k,
13+
std::optional<const at::Tensor>& q_scale,
1314
std::optional<const at::Tensor>& k_scale,
1415
std::optional<const at::Tensor>& v_scale,
1516
double sm_scale,

csrc/xpu/attn/xe_2/chunk_prefill.hpp

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ struct chunk_prefill_args_t {
3636
int max_keys;
3737
int total_seqlen_q;
3838
int total_seqlen_k;
39+
void* q_scale;
3940
void* k_scale;
4041
void* v_scale;
4142
float sm_scale;
@@ -145,8 +146,9 @@ struct KernelLauncher {
145146
stride_V,
146147
reinterpret_cast<ElementO*>(args.out),
147148
stride_O,
148-
reinterpret_cast<ElementQ*>(args.sm_sink)},
149+
reinterpret_cast<ElementO*>(args.sm_sink)},
149150
{args.sm_scale,
151+
args.q_scale,
150152
args.k_scale,
151153
args.v_scale,
152154
static_cast<int*>(args.block_table),
@@ -232,9 +234,10 @@ template <
232234
struct FMHAConfig {
233235
static constexpr int SGTileQ =
234236
get<0>(shape_div(TileShapeQK{}, shape(SubgroupLayoutQK{})))();
237+
// Note that always use output dtype for MMAOperation
235238
using MMAOperation = cute::conditional_t<
236239
is_void_v<MMAOperation_>,
237-
XE_DPAS_TT<cute::gcd(SGTileQ, 8), float, ElementQ>,
240+
XE_DPAS_TT<cute::gcd(SGTileQ, 8), float, ElementO>,
238241
MMAOperation_>;
239242
using SubgroupLayoutPV = cute::conditional_t<
240243
is_void_v<SubgroupLayoutPV_>,
@@ -287,6 +290,7 @@ struct FMHAConfig {
287290
TensorQ,
288291
TensorK,
289292
TensorV,
293+
TensorO,
290294
GmemTiledCopyQ,
291295
GmemTiledCopyK,
292296
GmemTiledCopyV>;
@@ -320,11 +324,11 @@ struct FMHAConfig {
320324
template <typename chunk_policy, bool Paged, bool Causal, bool Local, bool Sink>
321325
void policy_dispatch_impl(
322326
sycl::queue& queue,
323-
CutlassQKType& cuQKType,
327+
CutlassQKOType& cuQKOType,
324328
const chunk_prefill_args_t& args) {
325329
const int PipelineStages = 2;
326-
if (cuQKType.q_type == CutlassDType::half) {
327-
if (cuQKType.k_type == CutlassDType::half) {
330+
if (cuQKOType.q_type == CutlassDType::half) {
331+
if (cuQKOType.k_type == CutlassDType::half) {
328332
return FMHAConfig<
329333
typename chunk_policy::ShapeQK,
330334
typename chunk_policy::ShapePV,
@@ -340,7 +344,7 @@ void policy_dispatch_impl(
340344
half_t,
341345
half_t,
342346
half_t>::kernel_dispatch(queue, args);
343-
} else if (cuQKType.k_type == CutlassDType::float8_e4m3) {
347+
} else if (cuQKOType.k_type == CutlassDType::float8_e4m3) {
344348
return FMHAConfig<
345349
typename chunk_policy::ShapeQK,
346350
typename chunk_policy::ShapePV,
@@ -356,7 +360,7 @@ void policy_dispatch_impl(
356360
float_e4m3_t,
357361
float_e4m3_t,
358362
half_t>::kernel_dispatch(queue, args);
359-
} else if (cuQKType.k_type == CutlassDType::float8_e5m2) {
363+
} else if (cuQKOType.k_type == CutlassDType::float8_e5m2) {
360364
return FMHAConfig<
361365
typename chunk_policy::ShapeQK,
362366
typename chunk_policy::ShapePV,
@@ -373,8 +377,76 @@ void policy_dispatch_impl(
373377
float_e5m2_t,
374378
half_t>::kernel_dispatch(queue, args);
375379
}
380+
} else if (cuQKOType.q_type == CutlassDType::float8_e4m3) {
381+
if (cuQKOType.o_type == CutlassDType::half) {
382+
return FMHAConfig<
383+
typename chunk_policy::ShapeQK,
384+
typename chunk_policy::ShapePV,
385+
typename chunk_policy::ShapeOut,
386+
typename chunk_policy::SubgroupLayoutQK,
387+
void,
388+
PipelineStages,
389+
Paged,
390+
Causal,
391+
Local,
392+
Sink,
393+
float_e4m3_t,
394+
float_e4m3_t,
395+
float_e4m3_t,
396+
half_t>::kernel_dispatch(queue, args);
397+
} else {
398+
return FMHAConfig<
399+
typename chunk_policy::ShapeQK,
400+
typename chunk_policy::ShapePV,
401+
typename chunk_policy::ShapeOut,
402+
typename chunk_policy::SubgroupLayoutQK,
403+
void,
404+
PipelineStages,
405+
Paged,
406+
Causal,
407+
Local,
408+
Sink,
409+
float_e4m3_t,
410+
float_e4m3_t,
411+
float_e4m3_t,
412+
bfloat16_t>::kernel_dispatch(queue, args);
413+
}
414+
} else if (cuQKOType.q_type == CutlassDType::float8_e5m2) {
415+
if (cuQKOType.o_type == CutlassDType::half) {
416+
return FMHAConfig<
417+
typename chunk_policy::ShapeQK,
418+
typename chunk_policy::ShapePV,
419+
typename chunk_policy::ShapeOut,
420+
typename chunk_policy::SubgroupLayoutQK,
421+
void,
422+
PipelineStages,
423+
Paged,
424+
Causal,
425+
Local,
426+
Sink,
427+
float_e5m2_t,
428+
float_e5m2_t,
429+
float_e5m2_t,
430+
half_t>::kernel_dispatch(queue, args);
431+
} else {
432+
return FMHAConfig<
433+
typename chunk_policy::ShapeQK,
434+
typename chunk_policy::ShapePV,
435+
typename chunk_policy::ShapeOut,
436+
typename chunk_policy::SubgroupLayoutQK,
437+
void,
438+
PipelineStages,
439+
Paged,
440+
Causal,
441+
Local,
442+
Sink,
443+
float_e5m2_t,
444+
float_e5m2_t,
445+
float_e5m2_t,
446+
bfloat16_t>::kernel_dispatch(queue, args);
447+
}
376448
} else {
377-
if (cuQKType.k_type == CutlassDType::bfloat16) {
449+
if (cuQKOType.k_type == CutlassDType::bfloat16) {
378450
return FMHAConfig<
379451
typename chunk_policy::ShapeQK,
380452
typename chunk_policy::ShapePV,
@@ -390,7 +462,7 @@ void policy_dispatch_impl(
390462
bfloat16_t,
391463
bfloat16_t,
392464
bfloat16_t>::kernel_dispatch(queue, args);
393-
} else if (cuQKType.k_type == CutlassDType::float8_e4m3) {
465+
} else if (cuQKOType.k_type == CutlassDType::float8_e4m3) {
394466
return FMHAConfig<
395467
typename chunk_policy::ShapeQK,
396468
typename chunk_policy::ShapePV,
@@ -406,7 +478,7 @@ void policy_dispatch_impl(
406478
float_e4m3_t,
407479
float_e4m3_t,
408480
bfloat16_t>::kernel_dispatch(queue, args);
409-
} else if (cuQKType.k_type == CutlassDType::float8_e5m2) {
481+
} else if (cuQKOType.k_type == CutlassDType::float8_e5m2) {
410482
return FMHAConfig<
411483
typename chunk_policy::ShapeQK,
412484
typename chunk_policy::ShapePV,

csrc/xpu/attn/xe_2/chunk_prefill_extern.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
extern template void \
2727
policy_dispatch_impl<POLICY, PAGED, CAUSAL, LOCAL, SINK>( \
2828
sycl::queue & queue, \
29-
CutlassQKType & cuQKType, \
29+
CutlassQKOType & cuQKOType, \
3030
const chunk_prefill_args_t& args);
3131

3232
// Generate all 16 bool combinations for a given policy using nested macros

csrc/xpu/attn/xe_2/chunk_prefill_kernel_template.cpp.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ using namespace cute;
2121
static_cast<bool>(IMPL_KISLOCAL), \
2222
static_cast<bool>(IMPL_KISSINK)>( \
2323
sycl::queue & queue, \
24-
CutlassQKType& cuQKType, \
24+
CutlassQKOType& cuQKOType, \
2525
const chunk_prefill_args_t& args);
2626

2727
INSTANTIATE_KERNEL()

csrc/xpu/attn/xe_2/chunk_prefill_utils.hpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,24 @@ using namespace cute;
55
template <typename chunk_policy, bool... Bs>
66
void policy_dispatch_func(
77
sycl::queue& queue,
8-
CutlassQKType& cuQKType,
8+
CutlassQKOType& cuQKOType,
99
const chunk_prefill_args_t& args) {
10-
policy_dispatch_impl<chunk_policy, Bs...>(queue, cuQKType, args);
10+
policy_dispatch_impl<chunk_policy, Bs...>(queue, cuQKOType, args);
1111
}
1212

1313
template <typename chunk_policy, bool... Bs, typename... Ts>
1414
void policy_dispatch_func(
1515
sycl::queue& queue,
16-
CutlassQKType& cuQKType,
16+
CutlassQKOType& cuQKOType,
1717
const chunk_prefill_args_t& args,
1818
bool b,
1919
Ts... ts) {
2020
if (b) {
2121
policy_dispatch_func<chunk_policy, Bs..., true>(
22-
queue, cuQKType, args, ts...);
22+
queue, cuQKOType, args, ts...);
2323
} else {
2424
policy_dispatch_func<chunk_policy, Bs..., false>(
25-
queue, cuQKType, args, ts...);
25+
queue, cuQKOType, args, ts...);
2626
}
2727
}
2828

@@ -37,6 +37,7 @@ void cutlass_chunk_prefill_impl(
3737
const at::Tensor& cu_seqlens_k,
3838
int max_seqlen_q,
3939
int max_seqlen_k,
40+
std::optional<const at::Tensor>& q_scale,
4041
std::optional<const at::Tensor>& k_scale,
4142
std::optional<const at::Tensor>& v_scale,
4243
double sm_scale,

csrc/xpu/attn/xe_2/collective/chunk_prefill_epilogue.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class FMHAFwdEpilogue {
7777

7878
// softmax sink, same dtype
7979
static constexpr bool Sink = Sink_;
80-
using ElementSink = typename CollectiveMainloop::TensorQ::element_type;
80+
using ElementSink = typename CollectiveMainloop::TensorO::element_type;
8181

8282
// Split k-reduced tiles between participating subgroups.
8383
// Assumption: the A tile is contiguous.

0 commit comments

Comments (0)