Skip to content

Commit 2c707e9

Browse files
Qwen 3.5 MoE: Add Metal build target (#18881)
Adds make qwen3_5_moe-metal which builds the runner linked against the Metal backend instead of CUDA: - CMakeLists.txt: conditional metal_backend vs aoti_cuda_backend linking - CMakePresets.json: add qwen3-5-moe-metal preset (Darwin only) - main.cpp: guard CUDA includes/calls behind EXECUTORCH_BUILD_CUDA, route T=1 prompts to decode method (prefill has min seq_len=2) - Makefile: add qwen3_5_moe-metal target
1 parent 799bf5a commit 2c707e9

4 files changed

Lines changed: 70 additions & 7 deletions

File tree

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -125,6 +125,7 @@ help:
125125
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
126126
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
127127
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
128+
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
128129
@echo " clean - Clean build artifacts"
129130

130131
voxtral-cuda:
@@ -404,6 +405,15 @@ qwen3_5_moe-cuda:
404405
@echo "✓ Build complete!"
405406
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
406407

408+
qwen3_5_moe-metal:
409+
@echo "==> Building and installing ExecuTorch with Metal..."
410+
cmake --workflow --preset llm-release-metal
411+
@echo "==> Building Qwen3.5 MoE runner with Metal..."
412+
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-metal
413+
@echo ""
414+
@echo "✓ Build complete!"
415+
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
416+
407417
clean:
408418
rm -rf cmake-out \
409419
extension/llm/tokenizers/build \

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,20 @@ list(
4242
extension_flat_tensor
4343
)
4444

45-
# CUDA backend (required)
46-
find_package(CUDAToolkit REQUIRED)
47-
list(APPEND link_libraries aoti_cuda_backend)
48-
executorch_target_link_options_shared_lib(aoti_cuda_backend)
45+
# Backend selection
46+
if(EXECUTORCH_BUILD_METAL)
47+
list(APPEND link_libraries metal_backend)
48+
executorch_target_link_options_shared_lib(metal_backend)
49+
elseif(EXECUTORCH_BUILD_CUDA)
50+
find_package(CUDAToolkit REQUIRED)
51+
list(APPEND link_libraries aoti_cuda_backend)
52+
executorch_target_link_options_shared_lib(aoti_cuda_backend)
53+
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
54+
else()
55+
message(
56+
FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
57+
)
58+
endif()
4959

5060
# Tokenizer
5161
list(APPEND link_libraries tokenizers::tokenizers)

examples/models/qwen3_5_moe/CMakePresets.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@
2323
"string": "${hostSystemName}",
2424
"list": ["Linux", "Windows"]
2525
}
26+
},
27+
{
28+
"name": "qwen3-5-moe-metal",
29+
"displayName": "Qwen3.5 MoE runner (Metal)",
30+
"inherits": ["qwen3-5-moe-base"],
31+
"cacheVariables": {
32+
"EXECUTORCH_BUILD_METAL": "ON"
33+
},
34+
"condition": {
35+
"lhs": "${hostSystemName}",
36+
"type": "equals",
37+
"rhs": "Darwin"
38+
}
2639
}
2740
],
2841
"buildPresets": [
@@ -31,6 +44,12 @@
3144
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
3245
"configurePreset": "qwen3-5-moe-cuda",
3346
"targets": ["qwen3_5_moe_runner"]
47+
},
48+
{
49+
"name": "qwen3-5-moe-metal",
50+
"displayName": "Build Qwen3.5 MoE runner (Metal)",
51+
"configurePreset": "qwen3-5-moe-metal",
52+
"targets": ["qwen3_5_moe_runner"]
3453
}
3554
],
3655
"workflowPresets": [
@@ -47,6 +66,20 @@
4766
"name": "qwen3-5-moe-cuda"
4867
}
4968
]
69+
},
70+
{
71+
"name": "qwen3-5-moe-metal",
72+
"displayName": "Configure and build Qwen3.5 MoE runner (Metal)",
73+
"steps": [
74+
{
75+
"type": "configure",
76+
"name": "qwen3-5-moe-metal"
77+
},
78+
{
79+
"type": "build",
80+
"name": "qwen3-5-moe-metal"
81+
}
82+
]
5083
}
5184
]
5285
}

examples/models/qwen3_5_moe/main.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
#include <string>
2121
#include <vector>
2222

23+
#ifdef EXECUTORCH_BUILD_CUDA
2324
#include <cuda_runtime.h>
25+
#endif
2426

2527
DEFINE_string(model_path, "", "Model .pte file path.");
2628
DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
@@ -130,7 +132,13 @@ int main(int argc, char** argv) {
130132
uint64_t cur_token = 0;
131133
auto prefill_start = std::chrono::steady_clock::now();
132134

133-
// Chunked prefill
135+
// Use prefill method for T>=2, decode method for T=1
136+
// (prefill was exported with min seq_len=2)
137+
std::string run_method = prefill_method;
138+
if (dual_method && num_prompt_tokens == 1) {
139+
run_method = "decode";
140+
}
141+
134142
std::vector<int64_t> pos_data(num_prompt_tokens);
135143
for (int64_t i = 0; i < num_prompt_tokens; i++) {
136144
pos_data[i] = i;
@@ -149,7 +157,7 @@ int main(int argc, char** argv) {
149157
prefill_inputs.push_back(tokens_tensor);
150158
prefill_inputs.push_back(pos_tensor);
151159

152-
auto prefill_result = module->execute(prefill_method, prefill_inputs);
160+
auto prefill_result = module->execute(run_method, prefill_inputs);
153161
if (prefill_result.error() != Error::Ok) {
154162
ET_LOG(Error, "Prefill failed");
155163
return 1;
@@ -171,10 +179,12 @@ int main(int argc, char** argv) {
171179
prefill_ms,
172180
num_prompt_tokens * 1000.0 / prefill_ms);
173181

182+
#ifdef EXECUTORCH_BUILD_CUDA
174183
// Synchronize CUDA device to ensure prefill's writes to shared mutable
175184
// buffers (KV cache, conv_state, recurrent_state) are visible to the
176185
// decode method, which may run on a different CUDA stream.
177186
cudaDeviceSynchronize();
187+
#endif
178188

179189
if (!dual_method) {
180190
printf("Single-method mode: skipping decode\n");

0 commit comments

Comments (0)