Skip to content

Commit c0ab358

Browse files
authored
Fixes for LLVM 22, 23 (#8939)
* Migrate away from NoInfsFPMath, which was removed in LLVM 23
* Prevent usage of ninja 1.13.0 due to ninja-build/ninja#2616
* Run compile_module with a large stack
* Don't be so verbose in apps/onnx tests and pin onnx to 1.18.0
1 parent c2a6e34 commit c0ab358

File tree

12 files changed

+86
-71
lines changed

12 files changed

+86
-71
lines changed

apps/onnx/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,12 @@ target_link_libraries(model_cpp PRIVATE Halide::Halide onnx_app::oclib)
121121

122122
add_test(
123123
NAME model_test
124-
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/model_test.py -v
124+
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/model_test.py
125125
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
126126
)
127127
add_test(
128128
NAME halide_as_onnx_backend_test
129-
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/halide_as_onnx_backend_test.py -v
129+
COMMAND ${Python_EXECUTABLE} -m unittest ${CMAKE_CURRENT_SOURCE_DIR}/halide_as_onnx_backend_test.py
130130
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
131131
)
132132
set_tests_properties(

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,14 @@ dev = [
6868
"setuptools-scm>=8.3.1",
6969
]
7070
apps = [
71-
"onnx>=1.18.0", # for apps/onnx
71+
"onnx==1.18.0", # for apps/onnx
7272
"pytest", # unspecified onnx dependency
7373
]
7474
tools = [
7575
"cmake>=3.28",
76-
"ninja>=1.11",
76+
# 1.13.0 uses LF in .rsp files, breaking MSVC link.exe.
77+
# See: https://github.com/ninja-build/ninja/pull/2616
78+
"ninja>=1.11,!=1.13.0",
7779
"ruff>=0.12",
7880
"tbump>=6.11",
7981
]
@@ -87,6 +89,7 @@ Repository = "https://github.com/halide/Halide.git"
8789

8890
[tool.scikit-build]
8991
cmake.version = ">=3.28"
92+
ninja.version = ">=1.11,!=1.13.0"
9093
wheel.install-dir = "halide"
9194
sdist.include = ["dependencies/"]
9295
sdist.exclude = [".github/", "apps/", "test/", "tutorial/", "dependencies/update-*.sh"]

requirements.txt

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,80 +2,70 @@
22
# uv export --all-groups --no-hashes --no-emit-project -o requirements.txt
33
cli-ui==0.19.0
44
# via tbump
5-
cmake==4.1.0
5+
cmake==4.2.1
66
colorama==0.4.6
77
# via
88
# cli-ui
99
# pytest
1010
docopt==0.6.2
1111
# via tbump
12-
exceptiongroup==1.3.0 ; python_full_version < '3.11'
12+
exceptiongroup==1.3.1 ; python_full_version < '3.11'
1313
# via
1414
# pytest
1515
# scikit-build-core
16-
imageio==2.37.0
16+
imageio==2.37.2
1717
# via halide
18-
importlib-metadata==8.7.0 ; python_full_version < '3.10'
19-
# via setuptools-scm
20-
iniconfig==2.1.0
18+
iniconfig==2.3.0
2119
# via pytest
22-
ninja==1.13.0
23-
numpy==2.0.2 ; python_full_version < '3.10'
24-
# via
25-
# halide
26-
# imageio
27-
# onnx
28-
numpy==2.2.6 ; python_full_version == '3.10.*'
20+
ninja==1.11.1.4
21+
numpy==2.2.6 ; python_full_version < '3.11'
2922
# via
3023
# halide
3124
# imageio
3225
# onnx
33-
numpy==2.3.2 ; python_full_version >= '3.11'
26+
numpy==2.4.2 ; python_full_version >= '3.11'
3427
# via
3528
# halide
3629
# imageio
3730
# onnx
3831
onnx==1.18.0
39-
packaging==25.0
32+
packaging==26.0
4033
# via
4134
# pytest
4235
# scikit-build-core
4336
# setuptools-scm
44-
pathspec==0.12.1
37+
pathspec==1.0.4
4538
# via scikit-build-core
46-
pillow==11.3.0
39+
pillow==12.1.1
4740
# via imageio
4841
pluggy==1.6.0
4942
# via pytest
50-
protobuf==6.31.1
43+
protobuf==6.33.5
5144
# via onnx
52-
pybind11==3.0.0
45+
pybind11==3.0.1
5346
pygments==2.19.2
5447
# via pytest
55-
pytest==8.4.1
56-
ruff==0.12.8
57-
schema==0.7.7
48+
pytest==9.0.2
49+
ruff==0.15.0
50+
schema==0.7.8
5851
# via tbump
59-
scikit-build-core==0.11.5
60-
setuptools==80.9.0
52+
scikit-build-core==0.11.6
53+
setuptools==82.0.0
6154
# via setuptools-scm
62-
setuptools-scm==8.3.1
55+
setuptools-scm==9.2.2
6356
tabulate==0.9.0
6457
# via cli-ui
6558
tbump==6.11.0
66-
tomli==2.2.1 ; python_full_version < '3.11'
59+
tomli==2.4.0 ; python_full_version < '3.11'
6760
# via
6861
# pytest
6962
# scikit-build-core
7063
# setuptools-scm
7164
tomlkit==0.11.8
7265
# via tbump
73-
typing-extensions==4.14.1
66+
typing-extensions==4.15.0
7467
# via
7568
# exceptiongroup
7669
# onnx
77-
# setuptools-scm
7870
unidecode==1.4.0
7971
# via cli-ui
80-
zipp==3.23.0 ; python_full_version < '3.10'
81-
# via importlib-metadata

src/CodeGen_Internal.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,9 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options
615615
#if LLVM_VERSION < 210
616616
options.UnsafeFPMath = !per_instruction_fast_math_flags;
617617
#endif
618+
#if LLVM_VERSION < 230
618619
options.NoInfsFPMath = !per_instruction_fast_math_flags;
620+
#endif
619621
options.NoNaNsFPMath = !per_instruction_fast_math_flags;
620622
options.HonorSignDependentRoundingFPMathOption = !per_instruction_fast_math_flags;
621623
options.NoZerosInBSS = false;

src/CodeGen_LLVM.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,21 @@ void CodeGen_LLVM::init_context() {
283283

284284
// Ensure no Value pointers carry over from previous context.
285285
struct_type_recovery.clear();
286+
287+
if (any_strict_float) {
288+
// Default all operations to strict, and relax any non-strict operations
289+
// when possible. This is better than defaulting to relaxed and making
290+
// some operations strict, because properties like no-nans are
291+
// viral. It's no use having a strict comparison that respects nans if
292+
// the source of the inputs was an op tagged with no-nans.
293+
set_strict_fp_math();
294+
// If the target has the strict_float flag, we act as if we're already
295+
// inside a strict_float intrinsic.
296+
in_strict_float = target.has_feature(Target::StrictFloat);
297+
} else {
298+
// Default all operations to relaxed.
299+
set_fast_fp_math();
300+
}
286301
}
287302

288303
void CodeGen_LLVM::init_module() {
@@ -443,21 +458,6 @@ void CodeGen_LLVM::init_codegen(const std::string &name) {
443458

444459
semaphore_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_semaphore_t");
445460
internal_assert(semaphore_t_type) << "Did not find halide_semaphore_t in initial module";
446-
447-
if (any_strict_float) {
448-
// Default all operations to strict, and relax any non-strict operations
449-
// when possible. This is better than defaulting to relaxed and making
450-
// some operations strict, because properties like no-nans are
451-
// viral. It's no use having a strict comparison that respects nans if
452-
// the source of the inputs was an op tagged with no-nans.
453-
set_strict_fp_math();
454-
// If the target has the strict_float flag, we act as if we're already
455-
// inside a strict_float intrinsic.
456-
in_strict_float = target.has_feature(Target::StrictFloat);
457-
} else {
458-
// Default all operations to relaxed.
459-
set_fast_fp_math();
460-
}
461461
}
462462

463463
void CodeGen_LLVM::set_fast_fp_math() {

src/CodeGen_PTX_Dev.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ namespace {
3838
class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
3939
public:
4040
/** Create a PTX device code generator. */
41-
CodeGen_PTX_Dev(const Target &host);
41+
CodeGen_PTX_Dev(const Target &host, bool any_strict_float);
4242
~CodeGen_PTX_Dev() override;
4343

4444
void add_kernel(Stmt stmt,
@@ -105,8 +105,9 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
105105
bool supports_atomic_add(const Type &t) const override;
106106
};
107107

108-
CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host)
108+
CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host, bool any_strict_float)
109109
: CodeGen_LLVM(host) {
110+
this->any_strict_float = any_strict_float;
110111
context = new llvm::LLVMContext();
111112
}
112113

@@ -615,7 +616,9 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
615616
#if LLVM_VERSION < 210
616617
options.UnsafeFPMath = true;
617618
#endif
619+
#if LLVM_VERSION < 230
618620
options.NoInfsFPMath = true;
621+
#endif
619622
options.NoNaNsFPMath = true;
620623
options.HonorSignDependentRoundingFPMathOption = false;
621624
options.NoZerosInBSS = false;
@@ -819,13 +822,13 @@ bool CodeGen_PTX_Dev::supports_atomic_add(const Type &t) const {
819822

820823
} // namespace
821824

822-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target) {
823-
return std::make_unique<CodeGen_PTX_Dev>(target);
825+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool any_strict_float) {
826+
return std::make_unique<CodeGen_PTX_Dev>(target, any_strict_float);
824827
}
825828

826829
#else // WITH_PTX
827830

828-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target) {
831+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool /*any_strict_float*/) {
829832
user_error << "PTX not enabled for this build of Halide.\n";
830833
return nullptr;
831834
}

src/CodeGen_PTX_Dev.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace Internal {
1515

1616
struct CodeGen_GPU_Dev;
1717

18-
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target);
18+
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target, bool any_strict_float);
1919

2020
} // namespace Internal
2121
} // namespace Halide

src/JITModule.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "LLVM_Output.h"
2222
#include "LLVM_Runtime_Linker.h"
2323
#include "Pipeline.h"
24+
#include "Util.h"
2425
#include "WasmExecutor.h"
2526

2627
namespace Halide {
@@ -295,9 +296,12 @@ JITModule::JITModule(const Module &m, const LoweredFunc &fn,
295296
llvm::reportAndResetTimings();
296297
}
297298

298-
void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
299-
const std::vector<JITModule> &dependencies,
300-
const std::vector<std::string> &requested_exports) {
299+
namespace {
300+
void compile_module_impl(
301+
IntrusivePtr<JITModuleContents> &jit_module,
302+
std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
303+
const std::vector<JITModule> &dependencies,
304+
const std::vector<std::string> &requested_exports) {
301305

302306
// Ensure that LLVM is initialized
303307
CodeGen_LLVM::initialize_llvm();
@@ -348,10 +352,10 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
348352
if ((target.arch == Target::Arch::X86 && target.bits == 32) ||
349353
(target.arch == Target::Arch::ARM && target.bits == 32) ||
350354
target.os == Target::Windows) {
351-
// Fallback to RTDyld-based linking to workaround errors:
352-
// i386: "JIT session error: Unsupported i386 relocation:4" (R_386_PLT32)
353-
// ARM 32bit: Unsupported target machine architecture in ELF object shared runtime-jitted-objectbuffer
354-
// Windows 64-bit: JIT session error: could not register eh-frame: __register_frame function not found
355+
// Fallback to RTDyld-based linking to workaround errors:
356+
// i386: "JIT session error: Unsupported i386 relocation:4" (R_386_PLT32)
357+
// ARM 32bit: Unsupported target machine architecture in ELF object shared runtime-jitted-objectbuffer
358+
// Windows 64-bit: JIT session error: could not register eh-frame: __register_frame function not found
355359
#if LLVM_VERSION >= 210
356360
linkerBuilder = [&](llvm::orc::ExecutionSession &session) {
357361
return std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, [&](const llvm::MemoryBuffer &) {
@@ -424,6 +428,7 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
424428
debug(1) << "JIT compiling " << module_name
425429
<< " for " << target.to_string() << "\n";
426430

431+
using Symbol = JITModule::Symbol;
427432
std::map<std::string, Symbol> exports;
428433

429434
Symbol entrypoint;
@@ -451,6 +456,18 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
451456
jit_module->argv_entrypoint = argv_entrypoint;
452457
jit_module->name = function_name;
453458
}
459+
} // namespace
460+
461+
void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &function_name, const Target &target,
462+
const std::vector<JITModule> &dependencies,
463+
const std::vector<std::string> &requested_exports) {
464+
// LLJIT's SimpleCompiler triggers LLVM's AsmPrinter, which can use a large
465+
// amount of stack (observed stack overflows on macOS worker threads with
466+
// 512KB default stacks). Use run_with_large_stack to ensure enough space.
467+
run_with_large_stack([&]() {
468+
compile_module_impl(jit_module, std::move(m), function_name, target, dependencies, requested_exports);
469+
});
470+
}
454471

455472
/*static*/
456473
JITModule JITModule::make_trampolines_module(const Target &target_arg,

src/Lower.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ void lower_impl(const vector<Function> &output_funcs,
493493

494494
if (t.has_gpu_feature()) {
495495
debug(1) << "Offloading GPU loops...\n";
496-
s = inject_gpu_offload(s, t);
496+
s = inject_gpu_offload(s, t, any_strict_float);
497497
debug(2) << "Lowering after splitting off GPU loops:\n"
498498
<< s << "\n\n";
499499
} else {

src/OffloadGPULoops.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ class InjectGpuOffload : public IRMutator {
245245
}
246246

247247
public:
248-
InjectGpuOffload(const Target &target)
248+
InjectGpuOffload(const Target &target, bool any_strict_float)
249249
: target(target) {
250250
Target device_target = target;
251251
// For the GPU target we just want to pass the flags, to avoid the
@@ -254,7 +254,7 @@ class InjectGpuOffload : public IRMutator {
254254
device_target.os = Target::OSUnknown;
255255
device_target.arch = Target::ArchUnknown;
256256
if (target.has_feature(Target::CUDA)) {
257-
cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target);
257+
cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target, any_strict_float);
258258
}
259259
if (target.has_feature(Target::OpenCL)) {
260260
cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(device_target);
@@ -315,8 +315,8 @@ class InjectGpuOffload : public IRMutator {
315315

316316
} // namespace
317317

318-
Stmt inject_gpu_offload(const Stmt &s, const Target &host_target) {
319-
return InjectGpuOffload(host_target).inject(s);
318+
Stmt inject_gpu_offload(const Stmt &s, const Target &host_target, bool any_strict_float) {
319+
return InjectGpuOffload(host_target, any_strict_float).inject(s);
320320
}
321321

322322
} // namespace Internal

0 commit comments

Comments (0)