Skip to content

Commit c0ab358

Browse files
authored
Fixes for LLVM 22, 23 (#8939)
* Migrate away from NoInfsFPMath, which was removed in LLVM 23
* Prevent usage of ninja 1.13.0 due to ninja-build/ninja#2616
* Run compile_module with a large stack
* Don't be so verbose in apps/onnx tests and pin onnx to 1.18.0
1 parent c2a6e34 commit c0ab358

File tree

12 files changed

+86
-71
lines changed

12 files changed

+86
-71
lines changed

apps/onnx/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,12 @@ target_link_libraries(model_cpp PRIVATE Halide::Halide onnx_app::oclib)
121121

122122
add_test(
123123
NAME model_test
124-
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/model_test.py -v
124+
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/model_test.py
125125
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
126126
)
127127
add_test(
128128
NAME halide_as_onnx_backend_test
129-
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/halide_as_onnx_backend_test.py -v
129+
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/halide_as_onnx_backend_test.py
130130
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
131131
)
132132
set_tests_properties(

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,14 @@ dev = [
6868
"setuptools-scm>=8.3.1",
6969
]
7070
apps = [
71-
"onnx>=1.18.0", # for apps/onnx
71+
"onnx==1.18.0", # for apps/onnx
7272
"pytest", # unspecified onnx dependency
7373
]
7474
tools = [
7575
"cmake>=3.28",
76-
"ninja>=1.11",
76+
# 1.13.0 uses LF in .rsp files, breaking MSVC link.exe.
77+
# See: https://github.com/ninja-build/ninja/pull/2616
78+
"ninja>=1.11,!=1.13.0",
7779
"ruff>=0.12",
7880
"tbump>=6.11",
7981
]
@@ -87,6 +89,7 @@ Repository = "https://github.com/halide/Halide.git"
8789

8890
[tool.scikit-build]
8991
cmake.version = ">=3.28"
92+
ninja.version = ">=1.11,!=1.13.0"
9093
wheel.install-dir = "halide"
9194
sdist.include = ["dependencies/"]
9295
sdist.exclude = [".github/", "apps/", "test/", "tutorial/", "dependencies/update-*.sh"]

requirements.txt

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,80 +2,70 @@
22
# uv export --all-groups --no-hashes --no-emit-project -o requirements.txt
33
cli-ui==0.19.0
44
# via tbump
5-
cmake==4.1.0
5+
cmake==4.2.1
66
colorama==0.4.6
77
# via
88
# cli-ui
99
# pytest
1010
docopt==0.6.2
1111
# via tbump
12-
exceptiongroup==1.3.0 ; python_full_version < '3.11'
12+
exceptiongroup==1.3.1 ; python_full_version < '3.11'
1313
# via
1414
# pytest
1515
# scikit-build-core
16-
imageio==2.37.0
16+
imageio==2.37.2
1717
# via halide
18-
importlib-metadata==8.7.0 ; python_full_version < '3.10'
19-
# via setuptools-scm
20-
iniconfig==2.1.0
18+
iniconfig==2.3.0
2119
# via pytest
22-
ninja==1.13.0
23-
numpy==2.0.2 ; python_full_version < '3.10'
24-
# via
25-
# halide
26-
# imageio
27-
# onnx
28-
numpy==2.2.6 ; python_full_version == '3.10.*'
20+
ninja==1.11.1.4
21+
numpy==2.2.6 ; python_full_version < '3.11'
2922
# via
3023
# halide
3124
# imageio
3225
# onnx
33-
numpy==2.3.2 ; python_full_version >= '3.11'
26+
numpy==2.4.2 ; python_full_version >= '3.11'
3427
# via
3528
# halide
3629
# imageio
3730
# onnx
3831
onnx==1.18.0
39-
packaging==25.0
32+
packaging==26.0
4033
# via
4134
# pytest
4235
# scikit-build-core
4336
# setuptools-scm
44-
pathspec==0.12.1
37+
pathspec==1.0.4
4538
# via scikit-build-core
46-
pillow==11.3.0
39+
pillow==12.1.1
4740
# via imageio
4841
pluggy==1.6.0
4942
# via pytest
50-
protobuf==6.31.1
43+
protobuf==6.33.5
5144
# via onnx
52-
pybind11==3.0.0
45+
pybind11==3.0.1
5346
pygments==2.19.2
5447
# via pytest
55-
pytest==8.4.1
56-
ruff==0.12.8
57-
schema==0.7.7
48+
pytest==9.0.2
49+
ruff==0.15.0
50+
schema==0.7.8
5851
# via tbump
59-
scikit-build-core==0.11.5
60-
setuptools==80.9.0
52+
scikit-build-core==0.11.6
53+
setuptools==82.0.0
6154
# via setuptools-scm
62-
setuptools-scm==8.3.1
55+
setuptools-scm==9.2.2
6356
tabulate==0.9.0
6457
# via cli-ui
6558
tbump==6.11.0
66-
tomli==2.2.1 ; python_full_version < '3.11'
59+
tomli==2.4.0 ; python_full_version < '3.11'
6760
# via
6861
# pytest
6962
# scikit-build-core
7063
# setuptools-scm
7164
tomlkit==0.11.8
7265
# via tbump
73-
typing-extensions==4.14.1
66+
typing-extensions==4.15.0
7467
# via
7568
# exceptiongroup
7669
# onnx
77-
# setuptools-scm
7870
unidecode==1.4.0
7971
# via cli-ui
80-
zipp==3.23.0 ; python_full_version < '3.10'
81-
# via importlib-metadata

src/CodeGen_Internal.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,9 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options
615615
#if LLVM_VERSION < 210
616616
options.UnsafeFPMath = !per_instruction_fast_math_flags;
617617
#endif
618+
#if LLVM_VERSION < 230
618619
options.NoInfsFPMath = !per_instruction_fast_math_flags;
620+
#endif
619621
options.NoNaNsFPMath = !per_instruction_fast_math_flags;
620622
options.HonorSignDependentRoundingFPMathOption = !per_instruction_fast_math_flags;
621623
options.NoZerosInBSS = false;

src/CodeGen_LLVM.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,21 @@ void CodeGen_LLVM::init_context() {
283283

284284
// Ensure no Value pointers carry over from previous context.
285285
struct_type_recovery.clear();
286+
287+
if (any_strict_float) {
288+
// Default all operations to strict, and relax any non-strict operations
289+
// when possible. This is better than defaulting to relaxed and making
290+
// some operations strict, because properties like no-nans are
291+
// viral. It's no use having a strict comparison that respects nans if
292+
// the source of the inputs was an op tagged with no-nans.
293+
set_strict_fp_math();
294+
// If the target has the strict_float flag, we act as if we're already
295+
// inside a strict_float intrinsic.
296+
in_strict_float = target.has_feature(Target::StrictFloat);
297+
} else {
298+
// Default all operations to relaxed.
299+
set_fast_fp_math();
300+
}
286301
}
287302

288303
void CodeGen_LLVM::init_module() {
@@ -443,21 +458,6 @@ void CodeGen_LLVM::init_codegen(const std::string &name) {
443458

444459
semaphore_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_semaphore_t");
445460
internal_assert(semaphore_t_type) << "Did not find halide_semaphore_t in initial module";
446-
447-
if (any_strict_float) {
448-
// Default all operations to strict, and relax any non-strict operations
449-
// when possible. This is better than defaulting to relaxed and making
450-
// some operations strict, because properties like no-nans are
451-
// viral. It's no use having a strict comparison that respects nans if
452-
// the source of the inputs was an op tagged with no-nans.
453-
set_strict_fp_math();
454-
// If the target has the strict_float flag, we act as if we're already
455-
// inside a strict_float intrinsic.
456-
in_strict_float = target.has_feature(Target::StrictFloat);
457-
} else {
458-
// Default all operations to relaxed.
459-
set_fast_fp_math();
460-
}
461461
}
462462

463463
void CodeGen_LLVM::set_fast_fp_math() {

src/CodeGen_PTX_Dev.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ namespace {
3838
class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
3939
public:
4040
/** Create a PTX device code generator. */
41-
CodeGen_PTX_Dev(const Target &host);
41+
CodeGen_PTX_Dev(const Target &host, bool any_strict_float);
4242
~CodeGen_PTX_Dev() override;
4343

4444
void add_kernel(Stmt stmt,
@@ -105,8 +105,9 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
105105
bool supports_atomic_add(const Type &t) const override;
106106
};
107107

108-
CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host)
108+
CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host, bool any_strict_float)
109109
: CodeGen_LLVM(host) {
110+
this->any_strict_float = any_strict_float;
110111
context = new llvm::LLVMContext();
111112
}
112113

@@ -615,7 +616,9 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
615616
#if LLVM_VERSION < 210
616617
options.UnsafeFPMath = true;
617618
#endif
619+
#if LLVM_VERSION < 230
618620
options.NoInfsFPMath = true;
621+
#endif
619622
options.NoNaNsFPMath = true;
620623
options.HonorSignDependentRoundingFPMathOption = false;
621624
options.NoZerosInBSS = false;
@@ -819,13 +822,13 @@ bool CodeGen_PTX_Dev::supports_atomic_add(const Type &t) const {
819822

820823
} // namespace
821824

822-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target) {
823-
return std::make_unique<CodeGen_PTX_Dev>(target);
825+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool any_strict_float) {
826+
return std::make_unique<CodeGen_PTX_Dev>(target, any_strict_float);
824827
}
825828

826829
#else // WITH_PTX
827830

828-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target) {
831+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool /*any_strict_float*/) {
829832
user_error << "PTX not enabled for this build of Halide.\n";
830833
return nullptr;
831834
}

src/CodeGen_PTX_Dev.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace Internal {
1515

1616
struct CodeGen_GPU_Dev;
1717

18-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target);
18+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool any_strict_float);
1919

2020
} // namespace Internal
2121
} // namespace Halide

src/JITModule.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "LLVM_Output.h"
2222
#include "LLVM_Runtime_Linker.h"
2323
#include "Pipeline.h"
24+
#include "Util.h"
2425
#include "WasmExecutor.h"
2526

2627
namespace Halide {
@@ -295,9 +296,12 @@ JITModule::JITModule(const Module &m, const LoweredFunc &fn,
295296
llvm::reportAndResetTimings();
296297
}
297298

298-
void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
299-
const std::vector<JITModule> &dependencies,
300-
const std::vector<std::string> &requested_exports) {
299+
namespace {
300+
void compile_module_impl(
301+
IntrusivePtr<JITModuleContents> &jit_module,
302+
std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
303+
const std::vector<JITModule> &dependencies,
304+
const std::vector<std::string> &requested_exports) {
301305

302306
// Ensure that LLVM is initialized
303307
CodeGen_LLVM::initialize_llvm();
@@ -348,10 +352,10 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
348352
if ((target.arch == Target::Arch::X86 && target.bits == 32) ||
349353
(target.arch == Target::Arch::ARM && target.bits == 32) ||
350354
target.os == Target::Windows) {
351-
// Fallback to RTDyld-based linking to workaround errors:
352-
// i386: "JIT session error: Unsupported i386 relocation:4" (R_386_PLT32)
353-
// ARM 32bit: Unsupported target machine architecture in ELF object shared runtime-jitted-objectbuffer
354-
// Windows 64-bit: JIT session error: could not register eh-frame: __register_frame function not found
355+
// Fallback to RTDyld-based linking to workaround errors:
356+
// i386: "JIT session error: Unsupported i386 relocation:4" (R_386_PLT32)
357+
// ARM 32bit: Unsupported target machine architecture in ELF object shared runtime-jitted-objectbuffer
358+
// Windows 64-bit: JIT session error: could not register eh-frame: __register_frame function not found
355359
#if LLVM_VERSION >= 210
356360
linkerBuilder = [&](llvm::orc::ExecutionSession &session) {
357361
return std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, [&](const llvm::MemoryBuffer &) {
@@ -424,6 +428,7 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
424428
debug(1) << "JIT compiling " << module_name
425429
<< " for " << target.to_string() << "\n";
426430

431+
using Symbol = JITModule::Symbol;
427432
std::map<std::string, Symbol> exports;
428433

429434
Symbol entrypoint;
@@ -451,6 +456,18 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
451456
jit_module->argv_entrypoint = argv_entrypoint;
452457
jit_module->name = function_name;
453458
}
459+
} // namespace
460+
461+
void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
462+
const std::vector<JITModule> &dependencies,
463+
const std::vector<std::string> &requested_exports) {
464+
// LLJIT's SimpleCompiler triggers LLVM's AsmPrinter, which can use a large
465+
// amount of stack (observed stack overflows on macOS worker threads with
466+
// 512KB default stacks). Use run_with_large_stack to ensure enough space.
467+
run_with_large_stack([&]() {
468+
compile_module_impl(jit_module, std::move(m), function_name, target, dependencies, requested_exports);
469+
});
470+
}
454471

455472
/*static*/
456473
JITModule JITModule::make_trampolines_module(const Target &target_arg,

src/Lower.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ void lower_impl(const vector<Function> &output_funcs,
493493

494494
if (t.has_gpu_feature()) {
495495
debug(1) << "Offloading GPU loops...\n";
496-
s = inject_gpu_offload(s, t);
496+
s = inject_gpu_offload(s, t, any_strict_float);
497497
debug(2) << "Lowering after splitting off GPU loops:\n"
498498
<< s << "\n\n";
499499
} else {

src/OffloadGPULoops.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ class InjectGpuOffload : public IRMutator {
245245
}
246246

247247
public:
248-
InjectGpuOffload(const Target &target)
248+
InjectGpuOffload(const Target &target, bool any_strict_float)
249249
: target(target) {
250250
Target device_target = target;
251251
// For the GPU target we just want to pass the flags, to avoid the
@@ -254,7 +254,7 @@ class InjectGpuOffload : public IRMutator {
254254
device_target.os = Target::OSUnknown;
255255
device_target.arch = Target::ArchUnknown;
256256
if (target.has_feature(Target::CUDA)) {
257-
cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target);
257+
cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target, any_strict_float);
258258
}
259259
if (target.has_feature(Target::OpenCL)) {
260260
cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(device_target);
@@ -315,8 +315,8 @@ class InjectGpuOffload : public IRMutator {
315315

316316
} // namespace
317317

318-
Stmt inject_gpu_offload(const Stmt &s, const Target &host_target) {
319-
return InjectGpuOffload(host_target).inject(s);
318+
Stmt inject_gpu_offload(const Stmt &s, const Target &host_target, bool any_strict_float) {
319+
return InjectGpuOffload(host_target, any_strict_float).inject(s);
320320
}
321321

322322
} // namespace Internal

0 commit comments

Comments (0)