Skip to content

Commit 2c707e9

Browse files
Qwen 3.5 MoE: Add Metal build target (#18881)
Adds make qwen3_5_moe-metal which builds the runner linked against the Metal backend instead of CUDA: - CMakeLists.txt: conditional metal_backend vs aoti_cuda_backend linking - CMakePresets.json: add qwen3-5-moe-metal preset (Darwin only) - main.cpp: guard CUDA includes/calls behind EXECUTORCH_BUILD_CUDA, route T=1 prompts to decode method (prefill has min seq_len=2) - Makefile: add qwen3_5_moe-metal target
1 parent 799bf5a commit 2c707e9

4 files changed

Lines changed: 70 additions & 7 deletions

File tree

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -125,6 +125,7 @@ help:
125125
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
126126
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
127127
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
128+
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
128129
@echo " clean - Clean build artifacts"
129130

130131
voxtral-cuda:
@@ -404,6 +405,15 @@ qwen3_5_moe-cuda:
404405
@echo "✓ Build complete!"
405406
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
406407

408+
qwen3_5_moe-metal:
409+
@echo "==> Building and installing ExecuTorch with Metal..."
410+
cmake --workflow --preset llm-release-metal
411+
@echo "==> Building Qwen3.5 MoE runner with Metal..."
412+
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-metal
413+
@echo ""
414+
@echo "✓ Build complete!"
415+
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
416+
407417
clean:
408418
rm -rf cmake-out \
409419
extension/llm/tokenizers/build \

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,20 @@ list(
4242
extension_flat_tensor
4343
)
4444

45-
# CUDA backend (required)
46-
find_package(CUDAToolkit REQUIRED)
47-
list(APPEND link_libraries aoti_cuda_backend)
48-
executorch_target_link_options_shared_lib(aoti_cuda_backend)
45+
# Backend selection
46+
if(EXECUTORCH_BUILD_METAL)
47+
list(APPEND link_libraries metal_backend)
48+
executorch_target_link_options_shared_lib(metal_backend)
49+
elseif(EXECUTORCH_BUILD_CUDA)
50+
find_package(CUDAToolkit REQUIRED)
51+
list(APPEND link_libraries aoti_cuda_backend)
52+
executorch_target_link_options_shared_lib(aoti_cuda_backend)
53+
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
54+
else()
55+
message(
56+
FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
57+
)
58+
endif()
4959

5060
# Tokenizer
5161
list(APPEND link_libraries tokenizers::tokenizers)

examples/models/qwen3_5_moe/CMakePresets.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@
2323
"string": "${hostSystemName}",
2424
"list": ["Linux", "Windows"]
2525
}
26+
},
27+
{
28+
"name": "qwen3-5-moe-metal",
29+
"displayName": "Qwen3.5 MoE runner (Metal)",
30+
"inherits": ["qwen3-5-moe-base"],
31+
"cacheVariables": {
32+
"EXECUTORCH_BUILD_METAL": "ON"
33+
},
34+
"condition": {
35+
"lhs": "${hostSystemName}",
36+
"type": "equals",
37+
"rhs": "Darwin"
38+
}
2639
}
2740
],
2841
"buildPresets": [
@@ -31,6 +44,12 @@
3144
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
3245
"configurePreset": "qwen3-5-moe-cuda",
3346
"targets": ["qwen3_5_moe_runner"]
47+
},
48+
{
49+
"name": "qwen3-5-moe-metal",
50+
"displayName": "Build Qwen3.5 MoE runner (Metal)",
51+
"configurePreset": "qwen3-5-moe-metal",
52+
"targets": ["qwen3_5_moe_runner"]
3453
}
3554
],
3655
"workflowPresets": [
@@ -47,6 +66,20 @@
4766
"name": "qwen3-5-moe-cuda"
4867
}
4968
]
69+
},
70+
{
71+
"name": "qwen3-5-moe-metal",
72+
"displayName": "Configure and build Qwen3.5 MoE runner (Metal)",
73+
"steps": [
74+
{
75+
"type": "configure",
76+
"name": "qwen3-5-moe-metal"
77+
},
78+
{
79+
"type": "build",
80+
"name": "qwen3-5-moe-metal"
81+
}
82+
]
5083
}
5184
]
5285
}

examples/models/qwen3_5_moe/main.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
#include <string>
2121
#include <vector>
2222

23+
#ifdef EXECUTORCH_BUILD_CUDA
2324
#include <cuda_runtime.h>
25+
#endif
2426

2527
DEFINE_string(model_path, "", "Model .pte file path.");
2628
DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
@@ -130,7 +132,13 @@ int main(int argc, char** argv) {
130132
uint64_t cur_token = 0;
131133
auto prefill_start = std::chrono::steady_clock::now();
132134

133-
// Chunked prefill
135+
// Use prefill method for T>=2, decode method for T=1
136+
// (prefill was exported with min seq_len=2)
137+
std::string run_method = prefill_method;
138+
if (dual_method && num_prompt_tokens == 1) {
139+
run_method = "decode";
140+
}
141+
134142
std::vector<int64_t> pos_data(num_prompt_tokens);
135143
for (int64_t i = 0; i < num_prompt_tokens; i++) {
136144
pos_data[i] = i;
@@ -149,7 +157,7 @@ int main(int argc, char** argv) {
149157
prefill_inputs.push_back(tokens_tensor);
150158
prefill_inputs.push_back(pos_tensor);
151159

152-
auto prefill_result = module->execute(prefill_method, prefill_inputs);
160+
auto prefill_result = module->execute(run_method, prefill_inputs);
153161
if (prefill_result.error() != Error::Ok) {
154162
ET_LOG(Error, "Prefill failed");
155163
return 1;
@@ -171,10 +179,12 @@ int main(int argc, char** argv) {
171179
prefill_ms,
172180
num_prompt_tokens * 1000.0 / prefill_ms);
173181

182+
#ifdef EXECUTORCH_BUILD_CUDA
174183
// Synchronize CUDA device to ensure prefill's writes to shared mutable
175184
// buffers (KV cache, conv_state, recurrent_state) are visible to the
176185
// decode method, which may run on a different CUDA stream.
177186
cudaDeviceSynchronize();
187+
#endif
178188

179189
if (!dual_method) {
180190
printf("Single-method mode: skipping decode\n");

0 commit comments

Comments (0)