Qwen 3.5 MoE Metal: Use max-sized prefill example for dynamic inputs

manuelcandales · manuelcandales · commit dc7a78530792 · 2026-04-21T14:43:34.000-04:00
With alloc_graph_input=False, ExecuTorch sets the input tensor's numel_bound_ from the size of the serialized example input. A small example (T=2) therefore prevents runtime inputs longer than 2 tokens. Use max_seq_len-1 as the prefill example size so that any prompt length up to max_seq_len-1 is accepted at runtime. Authored with Claude. ghstack-source-id: 601c7ed ghstack-comment-id: 4263712315 Pull-Request: #18956
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -661,10 +661,14 @@ def _export_metal(model, config, args):
     print("Decode export successful!")
 
     # --- Prefill method (T>=2, dynamic shape) ---
+    # Use max-sized example so the serialized numel_bound_ is large enough
+    # for any runtime input (Metal/AOTI pattern: alloc_graph_input=False
+    # means numel_bound_ comes from the export example size).
     print("Exporting prefill method...")
-    prefill_tokens = torch.tensor([[0, 1]], dtype=torch.long)
-    prefill_pos = torch.tensor([0, 1], dtype=torch.long)
-    seq_dim = Dim("seq_len", min=2, max=config.max_seq_len - 1)
+    max_prefill = config.max_seq_len - 1
+    prefill_tokens = torch.zeros((1, max_prefill), dtype=torch.long)
+    prefill_pos = torch.arange(max_prefill, dtype=torch.long)
+    seq_dim = Dim("seq_len", min=2, max=max_prefill)
     prefill_dynamic_shapes = ({1: seq_dim}, {0: seq_dim})
     with torch.no_grad():
         prefill_ep = export(