Commit c0c079b

Metal backend: Add v2 entry point with enable_gqa (#19145)
1 parent d7f8718 · commit c0c079b

2 files changed: 51 additions & 0 deletions

backends/apple/metal/metal_backend.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
             "aoti_torch_mps_convolution": None,
             "aoti_torch_mps_mm_out": None,
             "at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
+            "at::_ops::_scaled_dot_product_attention_math_for_mps_v2::call": None,
             "torchao::_linear_fp_act_4bit_weight": None,
             "at::_ops::topk::call": None,
             "metal::gather_qmv": None,

backends/apple/metal/runtime/ops/op_sdpa.mm

Lines changed: 50 additions & 0 deletions
@@ -251,6 +251,21 @@
 
 extern "C" {
 
+// Forward declaration of the implementation shared by both v1 and v2.
+static AOTITorchError sdpa_mps_impl(
+    AOTITensorHandle query,
+    AOTITensorHandle key,
+    AOTITensorHandle value,
+    AOTITensorHandle* attn_mask,
+    double dropout_p,
+    int32_t is_causal,
+    AOTITensorHandle* dropout_mask,
+    double* scale,
+    int32_t enable_gqa,
+    AOTITensorHandle* ret0,
+    AOTITensorHandle* ret1);
+
+// v1: Original signature without enable_gqa (for old .pte files).
 AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
     AOTITensorHandle query,
     AOTITensorHandle key,
@@ -262,6 +277,41 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
     double* scale,
     AOTITensorHandle* ret0,
     AOTITensorHandle* ret1) {
+  return sdpa_mps_impl(
+      query, key, value, attn_mask, dropout_p, is_causal,
+      dropout_mask, scale, /*enable_gqa=*/0, ret0, ret1);
+}
+
+// v2: New signature with enable_gqa (for new .pte files).
+AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps_v2(
+    AOTITensorHandle query,
+    AOTITensorHandle key,
+    AOTITensorHandle value,
+    AOTITensorHandle* attn_mask,
+    double dropout_p,
+    int32_t is_causal,
+    AOTITensorHandle* dropout_mask,
+    double* scale,
+    int32_t enable_gqa,
+    AOTITensorHandle* ret0,
+    AOTITensorHandle* ret1) {
+  return sdpa_mps_impl(
+      query, key, value, attn_mask, dropout_p, is_causal,
+      dropout_mask, scale, enable_gqa, ret0, ret1);
+}
+
+static AOTITorchError sdpa_mps_impl(
+    AOTITensorHandle query,
+    AOTITensorHandle key,
+    AOTITensorHandle value,
+    AOTITensorHandle* attn_mask,
+    double dropout_p,
+    int32_t is_causal,
+    AOTITensorHandle* dropout_mask,
+    double* scale,
+    int32_t enable_gqa,
+    AOTITensorHandle* ret0,
+    AOTITensorHandle* ret1) {
 
   ET_LOG(Debug, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: Starting with Metal kernel implementation");

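The pattern in this commit is a standard way to evolve a C ABI without breaking older consumers: keep the old exported symbol as a thin wrapper that pins the new parameter to its backward-compatible default, add a versioned symbol that accepts the new parameter, and route both through one shared implementation. Below is a minimal, self-contained C++ sketch of that pattern; the Error and TensorHandle aliases and the sdpa_v1/sdpa_v2 names are simplified stand-ins for illustration, not the real AOTI types or exported symbols.

// Minimal sketch of the versioned-entry-point pattern used in this commit.
// "Error" and "TensorHandle" are simplified stand-ins, not real AOTI types.
#include <cstdint>
#include <cstdio>

using Error = int32_t;       // stand-in for AOTITorchError
using TensorHandle = void*;  // stand-in for AOTITensorHandle

// Single shared implementation; it always takes the full v2 parameter list.
static Error sdpa_impl(TensorHandle q, TensorHandle k, TensorHandle v,
                       int32_t is_causal, int32_t enable_gqa) {
  std::printf("sdpa_impl: is_causal=%d enable_gqa=%d\n",
              (int)is_causal, (int)enable_gqa);
  return 0;  // success
}

extern "C" {

// v1 keeps the old ABI alive for previously exported artifacts by
// pinning the new flag to its backward-compatible default.
Error sdpa_v1(TensorHandle q, TensorHandle k, TensorHandle v,
              int32_t is_causal) {
  return sdpa_impl(q, k, v, is_causal, /*enable_gqa=*/0);
}

// v2 exposes the new flag to newly exported artifacts.
Error sdpa_v2(TensorHandle q, TensorHandle k, TensorHandle v,
              int32_t is_causal, int32_t enable_gqa) {
  return sdpa_impl(q, k, v, is_causal, enable_gqa);
}

}  // extern "C"

int main() {
  sdpa_v1(nullptr, nullptr, nullptr, /*is_causal=*/1);  // old callers: gqa off
  sdpa_v2(nullptr, nullptr, nullptr, /*is_causal=*/1,
          /*enable_gqa=*/1);                            // new callers opt in
}

Because both symbols stay exported (and the v2 kernel name is registered as a supported fallback in metal_backend.py), an older .pte that references the v1 name keeps resolving, while newly exported programs can bind the v2 name and pass enable_gqa through.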