Skip to content

Commit 954e64d

Browse files
Merge branch 'main' into add-fp8-placeholder-support-for-serialization
2 parents 28f770a + 32a6cec commit 954e64d

109 files changed

Lines changed: 6377 additions & 1091 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/docker/build.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
33
# All rights reserved.
4+
# Copyright 2026 Arm Limited and/or its affiliates.
45
#
56
# This source code is licensed under the BSD-style license found in the
67
# LICENSE file in the root directory of this source tree.
@@ -94,11 +95,6 @@ BUILD_DOCS=1
9495
# Copy requirements-lintrunner.txt from root to here
9596
cp ../../requirements-lintrunner.txt ./
9697

97-
# Copy arm setup script from root to here
98-
# TODO(huydhn): Figure out a way to rebuild the Docker image automatically
99-
# with a new image hash when the content here is updated
100-
cp -r ../../examples/arm/ ./arm
101-
10298
docker build \
10399
--no-cache \
104100
--progress=plain \

.ci/scripts/test_model_e2e.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ EOF
354354
fi
355355
;;
356356
qwen3_5_moe)
357-
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0"
357+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0 --cuda_graph"
358358
;;
359359
voxtral_realtime)
360360
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"

.github/workflows/cuda.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,8 @@ jobs:
145145
# Run CUDA backend Python tests
146146
python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="
147147
148-
# Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache)
149-
python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py -v -o "addopts="
148+
# Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
149+
python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
150150
151151
export-model-cuda-artifact:
152152
name: export-model-cuda-artifact

backends/aoti/slim/c10/core/ScalarType.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ enum class ScalarType : int8_t {
2828
Short = 2, // int16_t
2929
Int = 3, // int32_t
3030
Long = 4, // int64_t
31-
// Half = 5, // float16 - not currently needed
31+
Half = 5, // float16
3232
Float = 6, // float
3333
// Double = 7, // double - not currently needed
3434
// ComplexHalf = 8,
@@ -48,6 +48,7 @@ constexpr ScalarType kChar = ScalarType::Char;
4848
constexpr ScalarType kShort = ScalarType::Short;
4949
constexpr ScalarType kInt = ScalarType::Int;
5050
constexpr ScalarType kLong = ScalarType::Long;
51+
constexpr ScalarType kHalf = ScalarType::Half;
5152
constexpr ScalarType kFloat = ScalarType::Float;
5253
constexpr ScalarType kBool = ScalarType::Bool;
5354
constexpr ScalarType kBFloat16 = ScalarType::BFloat16;
@@ -67,6 +68,8 @@ inline size_t elementSize(ScalarType t) {
6768
return sizeof(int32_t);
6869
case ScalarType::Long:
6970
return sizeof(int64_t);
71+
case ScalarType::Half:
72+
return 2; // sizeof(__half) = 2 bytes
7073
case ScalarType::Float:
7174
return sizeof(float);
7275
case ScalarType::Bool:
@@ -93,6 +96,8 @@ inline const char* toString(ScalarType t) {
9396
return "Int";
9497
case ScalarType::Long:
9598
return "Long";
99+
case ScalarType::Half:
100+
return "Half";
96101
case ScalarType::Float:
97102
return "Float";
98103
case ScalarType::Bool:
@@ -110,7 +115,8 @@ inline const char* toString(ScalarType t) {
110115
/// @param t The scalar type to check.
111116
/// @return true if the scalar type is floating point, false otherwise.
112117
inline bool isFloatingType(ScalarType t) {
113-
return t == ScalarType::Float || t == ScalarType::BFloat16;
118+
return t == ScalarType::Half || t == ScalarType::Float ||
119+
t == ScalarType::BFloat16;
114120
}
115121

116122
/// Checks if the scalar type is an integral type (including bool optionally).
@@ -149,6 +155,7 @@ inline bool isValidScalarType(ScalarType t) {
149155
case ScalarType::Short:
150156
case ScalarType::Int:
151157
case ScalarType::Long:
158+
case ScalarType::Half:
152159
case ScalarType::Float:
153160
case ScalarType::Bool:
154161
case ScalarType::BFloat16:

backends/aoti/slim/c10/core/test/test_scalar_type.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ const std::vector<ScalarTypeTestData> kAllScalarTypes = {
3636
{ScalarType::Short, 2, 2, "Short", false, true, true, false},
3737
{ScalarType::Int, 3, 4, "Int", false, true, true, false},
3838
{ScalarType::Long, 4, 8, "Long", false, true, true, false},
39+
{ScalarType::Half, 5, 2, "Half", true, false, false, false},
3940
{ScalarType::Float, 6, 4, "Float", true, false, false, false},
4041
{ScalarType::Bool, 11, 1, "Bool", false, false, true, true},
4142
{ScalarType::BFloat16, 15, 2, "BFloat16", true, false, false, false},
@@ -128,6 +129,10 @@ TEST_F(ScalarTypeConstantsTest, KLongConstant) {
128129
EXPECT_EQ(kLong, ScalarType::Long);
129130
}
130131

132+
TEST_F(ScalarTypeConstantsTest, KHalfConstant) {
133+
EXPECT_EQ(kHalf, ScalarType::Half);
134+
}
135+
131136
TEST_F(ScalarTypeConstantsTest, KFloatConstant) {
132137
EXPECT_EQ(kFloat, ScalarType::Float);
133138
}
@@ -185,6 +190,10 @@ TEST_F(ElementSizeConsistencyTest, LongMatchesSizeofInt64) {
185190
EXPECT_EQ(elementSize(ScalarType::Long), sizeof(int64_t));
186191
}
187192

193+
TEST_F(ElementSizeConsistencyTest, HalfIs2Bytes) {
194+
EXPECT_EQ(elementSize(ScalarType::Half), 2);
195+
}
196+
188197
TEST_F(ElementSizeConsistencyTest, FloatMatchesSizeofFloat) {
189198
EXPECT_EQ(elementSize(ScalarType::Float), sizeof(float));
190199
}
@@ -196,3 +205,29 @@ TEST_F(ElementSizeConsistencyTest, BoolMatchesSizeofBool) {
196205
TEST_F(ElementSizeConsistencyTest, BFloat16MatchesSizeofBFloat16) {
197206
EXPECT_EQ(elementSize(ScalarType::BFloat16), sizeof(BFloat16));
198207
}
208+
209+
// =============================================================================
210+
// isValidScalarType Tests
211+
// =============================================================================
212+
213+
class IsValidScalarTypeTest : public ::testing::Test {};
214+
215+
TEST_F(IsValidScalarTypeTest, HalfIsValid) {
216+
EXPECT_TRUE(isValidScalarType(ScalarType::Half));
217+
}
218+
219+
TEST_F(IsValidScalarTypeTest, AllSupportedTypesAreValid) {
220+
EXPECT_TRUE(isValidScalarType(ScalarType::Byte));
221+
EXPECT_TRUE(isValidScalarType(ScalarType::Char));
222+
EXPECT_TRUE(isValidScalarType(ScalarType::Short));
223+
EXPECT_TRUE(isValidScalarType(ScalarType::Int));
224+
EXPECT_TRUE(isValidScalarType(ScalarType::Long));
225+
EXPECT_TRUE(isValidScalarType(ScalarType::Half));
226+
EXPECT_TRUE(isValidScalarType(ScalarType::Float));
227+
EXPECT_TRUE(isValidScalarType(ScalarType::Bool));
228+
EXPECT_TRUE(isValidScalarType(ScalarType::BFloat16));
229+
}
230+
231+
TEST_F(IsValidScalarTypeTest, UndefinedIsNotValid) {
232+
EXPECT_FALSE(isValidScalarType(ScalarType::Undefined));
233+
}

backends/apple/metal/runtime/metal_backend.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <c10/util/safe_numerics.h>
910
#include <dlfcn.h>
1011
#include <executorch/runtime/backend/interface.h>
1112
#include <executorch/runtime/core/error.h>
@@ -459,8 +460,10 @@ class ET_EXPERIMENTAL MetalBackend final
459460

460461
ET_LOG(Debug, "MetalBackend n_outputs %zd generated", n_outputs);
461462

463+
size_t n_io_sum = 0;
462464
ET_CHECK_OR_RETURN_ERROR(
463-
n_inputs + n_outputs == args.size(),
465+
!c10::add_overflows(n_inputs, n_outputs, &n_io_sum) &&
466+
n_io_sum == args.size(),
464467
InvalidArgument,
465468
"number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
466469
n_inputs,

backends/arm/_passes/fuse_constant_ops_pass.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
# LICENSE file in the root directory of this source tree.
55

66
import logging
7-
from typing import Set, Type
7+
from collections.abc import Mapping
8+
from typing import Sequence, Set, Type
89

910
import torch._export.utils
1011
import torch.fx
@@ -18,6 +19,7 @@
1819
from executorch.backends.arm._passes.fuse_equal_placeholders_pass import (
1920
FuseEqualPlaceholdersPass,
2021
)
22+
from executorch.backends.arm.tosa.dialect.shape import meta_has_shape_mark
2123
from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
2224
from executorch.backends.transforms.utils import (
2325
create_constant_placeholder,
@@ -53,6 +55,36 @@ def __init__(self, exported_program: ExportedProgram, *args, **kwargs) -> None:
5355
super().__init__(*args, **kwargs)
5456
self.exported_program = exported_program
5557

58+
@staticmethod
59+
def _is_tosa_dialect_op(target) -> bool:
60+
target_str = str(target)
61+
return (
62+
"executorch.exir.dialects.backend._ops.tosa." in target_str
63+
or "<EdgeOpOverload: tosa." in target_str
64+
)
65+
66+
@staticmethod
67+
def _arg_contains_symbolic_shape(arg) -> bool:
68+
if isinstance(arg, torch.fx.Node):
69+
if meta_has_shape_mark(arg.meta):
70+
return True
71+
return FuseConstantArgsPass._arg_contains_symbolic_shape(
72+
arg.meta.get("val")
73+
)
74+
if isinstance(arg, torch.SymInt):
75+
return True
76+
if isinstance(arg, Mapping):
77+
return any(
78+
FuseConstantArgsPass._arg_contains_symbolic_shape(k)
79+
or FuseConstantArgsPass._arg_contains_symbolic_shape(v)
80+
for k, v in arg.items()
81+
)
82+
if isinstance(arg, Sequence) and not isinstance(arg, (str, bytes)):
83+
return any(
84+
FuseConstantArgsPass._arg_contains_symbolic_shape(v) for v in arg
85+
)
86+
return False
87+
5688
def _propagate_special_dtype(self, from_nodes, to_node, data):
5789
"""Propagate special dtype meta if it exists."""
5890
special_dtypes = set()
@@ -83,21 +115,24 @@ def _fuse_nodes(self, node) -> bool:
83115
input_nodes = list(node.all_input_nodes)
84116
qparams = node.meta.get("input_qparams", None)
85117

86-
def resolve_arg(arg):
118+
def resolve_arg(arg, arg_index=None):
119+
qparam = (
120+
qparams.get(arg_index) if qparams and arg_index is not None else None
121+
)
87122
if isinstance(arg, torch.fx.Node) and arg in input_nodes:
88-
idx = input_nodes.index(arg)
89123
t = get_param_tensor(self.exported_program, arg)
90-
# Check if qparams exist for this arg
91-
if qparams and idx in qparams.keys():
92-
t = qparams[idx].dequantize_value(t)
124+
if qparam is not None:
125+
t = qparam.dequantize_value(t)
93126
return t
94127
if isinstance(arg, tuple):
95-
return tuple(resolve_arg(x) for x in arg)
128+
return tuple(resolve_arg(x, arg_index) for x in arg)
96129
if isinstance(arg, list):
97-
return [resolve_arg(x) for x in arg]
130+
return [resolve_arg(x, arg_index) for x in arg]
98131
return arg
99132

100-
new_args = tuple(resolve_arg(a) for a in node.args)
133+
new_args = tuple(
134+
resolve_arg(arg, arg_index) for arg_index, arg in enumerate(node.args)
135+
)
101136
new_kwargs = {k: resolve_arg(v) for k, v in node.kwargs.items()}
102137

103138
data = node.target(*new_args, **new_kwargs)
@@ -139,13 +174,13 @@ def call(self, graph_module):
139174
for node in graph_module.graph.nodes:
140175
if node.op != "call_function":
141176
continue
142-
if node.target in [
143-
exir_ops.backend.tosa.MATMUL.default,
144-
exir_ops.backend.tosa.RESCALE.default,
145-
exir_ops.backend.tosa.RESIZE.default,
146-
exir_ops.backend.tosa.TABLE.default,
147-
exir_ops.backend.tosa.TRANSPOSE.default,
148-
]:
177+
# Don't fuse TOSA dialect ops as they do not have eager forward functions.
178+
# Also don't fuse ops whose explicit args/kwargs include symbolic shape values.
179+
if (
180+
self._is_tosa_dialect_op(node.target)
181+
or self._arg_contains_symbolic_shape(node.args)
182+
or self._arg_contains_symbolic_shape(node.kwargs)
183+
):
149184
continue
150185

151186
input_nodes = node.all_input_nodes
@@ -161,7 +196,6 @@ def call(self, graph_module):
161196
)
162197
if not all(input_nodes_constant):
163198
continue
164-
165199
try:
166200
did_fuse = self._fuse_nodes(node)
167201
if did_fuse:

backends/arm/_passes/insert_table_ops.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,17 @@ def register_buffer(self, buffer_name: str, buffer: torch.Tensor) -> None:
139139
"""Add buffer to self.exported_program.state_dict."""
140140
self.exported_program.state_dict[buffer_name] = buffer
141141

142+
@staticmethod
143+
def _get_8bit_table_domain() -> torch.Tensor:
144+
"""Return the canonical 8-bit TOSA TABLE input domain."""
145+
int8_info = torch.iinfo(torch.int8)
146+
# torch.arange excludes the end value, so use max + 1 to include 127.
147+
return torch.arange(
148+
int8_info.min,
149+
int8_info.max + 1,
150+
dtype=torch.int8,
151+
)
152+
142153
def generate_8bit_table_values(
143154
self,
144155
torch_op: Callable[[torch.Tensor], torch.Tensor],
@@ -157,17 +168,10 @@ def f(x: torch.Tensor) -> torch.Tensor:
157168
x = torch_op(x)
158169
return out_quantargs.quantize_value(x)
159170

160-
return (
161-
f(
162-
torch.linspace(
163-
start=in_quantargs.qmin,
164-
end=in_quantargs.qmax,
165-
steps=256,
166-
dtype=torch.int8,
167-
)
168-
).to(dtype=torch.int8),
169-
0,
171+
effective_codes = self._get_8bit_table_domain().clamp(
172+
in_quantargs.qmin, in_quantargs.qmax
170173
)
174+
return (f(effective_codes).to(dtype=torch.int8), 0)
171175

172176
def generate_16_bit_table_values(
173177
self,

backends/arm/_passes/rewrite_conv_pass.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
get_input_qparams,
2222
get_output_qparams,
2323
)
24+
from executorch.backends.arm._passes.symbolic_value_range import (
25+
evaluate_symbolic_expr_values,
26+
)
2427
from executorch.backends.arm.constants import HWCM_ORDER, NHWC_INVERSE_ORDER
2528
from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
2629
from executorch.backends.arm.tosa.specification import get_context_shape_env
@@ -83,16 +86,22 @@ def _adjust_pad_if_needed(
8386

8487
if isinstance(mod_remainder, torch.SymInt):
8588
shape_env = get_context_shape_env()
86-
value_ranges = shape_env.bound_sympy(mod_remainder.node.expr)
87-
mod_remainder_upper = int(value_ranges.upper)
89+
exact_values = evaluate_symbolic_expr_values(
90+
mod_remainder.node.expr, shape_env
91+
)
92+
if exact_values is not None:
93+
mod_remainder_upper = max(exact_values)
94+
else:
95+
value_ranges = shape_env.bound_sympy(mod_remainder.node.expr)
96+
mod_remainder_upper = int(value_ranges.upper)
8897
if mod_remainder_upper == 0:
8998
mod_remainder = 0
9099
else:
91100
mod_remainder_upper = mod_remainder
92101

93102
if mod_remainder_upper > pad:
94103
raise RuntimeError(
95-
"This case should be handled by the SizeAdjustInputPass, is it enabled?"
104+
"This case should be handled by the SizeAdjustInputPass, is it enabled?\n"
96105
)
97106
return pad - mod_remainder
98107

backends/arm/_passes/size_adjust_input_pass.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# LICENSE file in the root directory of this source tree.
55
from typing import cast, Sequence, Set, Type, TypeAlias
66

7+
import torch
78
import torch.fx
89
from executorch.backends.arm._passes import ArmPass
910
from executorch.backends.arm._passes.arm_pass_utils import (
@@ -12,6 +13,9 @@
1213
)
1314
from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass
1415
from executorch.backends.arm._passes.rewrite_max_pool2d_pass import RewriteMaxPool2dPass
16+
from executorch.backends.arm._passes.symbolic_value_range import (
17+
evaluate_symbolic_expr_values,
18+
)
1519
from executorch.backends.arm.tosa.specification import get_context_shape_env
1620
from executorch.exir.dialects._ops import ops as exir_ops
1721
from executorch.exir.pass_base import ExportPass, PassResult
@@ -49,6 +53,9 @@ def _greater_than(input: SymIntLike, other: int) -> bool | torch.SymBool:
4953
"""Returns whether an int or SymInt is greater than another value."""
5054
if isinstance(input, torch.SymInt):
5155
shape_env = get_context_shape_env()
56+
exact_values = evaluate_symbolic_expr_values(input.node.expr, shape_env)
57+
if exact_values is not None:
58+
return max(exact_values) > other
5259
value_ranges = shape_env.bound_sympy(input.node.expr)
5360
return value_ranges.upper > other
5461
else:

0 commit comments

Comments
 (0)