cuda support and some optimizations

SimonDanisch · SimonDanisch · commit 898eb271ec1f · 2026-02-23T17:16:53.000+01:00
diff --git a/src/camera/camera.jl b/src/camera/camera.jl
@@ -54,7 +54,7 @@ Same as `generate_ray`, but also computes rays for pixels shifted one pixel
 in x & y directions on the film plane.
 Useful for anti-aliasing textures.
 """
-function generate_ray_differential(
+@inline function generate_ray_differential(
         camera::C, sample::CameraSample,
     )::Tuple{RayDifferentials,Float32} where C<:Camera
 
diff --git a/src/film.jl b/src/film.jl
@@ -420,17 +420,19 @@ function fill_aux_buffers!(film::Film, scene, camera; has_infinite_lights::Bool=
 
     backend = KA.get_backend(albedo)
     kernel! = aux_buffer_kernel!(backend)
+    # Pass scene.accel directly instead of the full scene struct to avoid
+    # misaligned address errors on CUDA from passing large nested structs.
     kernel!(
         albedo, normal, depth,
         resolution, crop_bounds,
-        scene, camera, miss_depth;
+        scene.accel, camera, miss_depth;
         ndrange = length(albedo)
     )
     KA.synchronize(backend)
     return film
 end
 
-@kernel inbounds=true function aux_buffer_kernel!(albedo, normal, depth, resolution, crop_bounds, scene, camera, miss_depth::Float32)
+@kernel inbounds=true function aux_buffer_kernel!(albedo, normal, depth, resolution, crop_bounds, accel, camera, miss_depth::Float32)
     idx = @index(Global)
 
     # Convert linear index to 2D pixel coordinates
@@ -444,13 +446,15 @@ end
     py = Float32(row) + crop_bounds.p_min[2] - 1f0
     pixel = Point2f(px + 0.5f0, py + 0.5f0)  # Center of pixel
 
-    # Generate primary ray
+    # Generate primary ray (use generate_ray, not generate_ray_differential —
+    # aux buffers don't need ray differentials, and the simpler call avoids
+    # large return-value stack pressure on CUDA)
     camera_sample = CameraSample(pixel, Point2f(0.5f0), 0f0)
-    ray, ω = generate_ray_differential(camera, camera_sample)
+    ray, ω = generate_ray(camera, camera_sample)
 
      if ω > 0f0
         # Trace primary ray
-        hit, _primitive, si = intersect!(scene, ray)
+        hit, _primitive, si = intersect!(accel, ray)
 
         if hit
             # Store normal (world space)
diff --git a/src/integrators/volpath/volpath.jl b/src/integrators/volpath/volpath.jl
@@ -250,14 +250,6 @@ This replaces rand() calls with correlated low-discrepancy samples.
         py = u_int32(div(pixel_idx_0, rng.width)) + Int32(1)
 
         # Base dimension: 6 (camera) + 7 * depth
-        # Camera uses dims 0-5: wavelength(1), film_x(1), film_y(1), lens_x(1), lens_y(1), filter(1)
-        # pbrt-v4's Get1D/Get2D increment dimension BEFORE computing hash, so:
-        # - Get1D for direct.uc: dim++ → 7, hash uses 7
-        # - Get2D for direct.u: dim+=2 → 9, hash uses 9
-        # - Get1D for indirect.uc: dim++ → 10, hash uses 10
-        # - Get2D for indirect.u: dim+=2 → 12, hash uses 12
-        # - Get1D for indirect.rr: dim++ → 13, hash uses 13
-        # So dimensions used are: 7, 9, 10, 12, 13 (for depth 0)
         base_dim = Int32(6) + Int32(7) * depth
         # Generate 7 samples for this bounce using SobolRNG
         # Direct lighting: light selection (1D) + light position (2D)
@@ -290,12 +282,11 @@ function vp_generate_ray_samples!(
     depth::Int32,
     sobol_rng::SobolRNG
 )
-    ray_queue = current_ray_queue(state)
-    n_rays = length(ray_queue)
-    n_rays == 0 && return
-
     # Access SOA components of pixel_samples
+    ray_queue = current_ray_queue(state)
     pixel_samples = state.pixel_samples
+    n = length(ray_queue)
+    n == 0 && return
 
     kernel! = vp_generate_ray_samples_kernel!(backend)
     kernel!(
@@ -309,7 +300,7 @@ function vp_generate_ray_samples!(
         sample_idx,
         depth,
         sobol_rng;  # Pass whole SobolRNG, Adapt handles conversion
-        ndrange=n_rays
+        ndrange=n
     )
 end
 
diff --git a/src/integrators/workqueue.jl b/src/integrators/workqueue.jl
@@ -172,16 +172,35 @@ end
     end
 end
 
-function Base.foreach(f, queue::WorkQueue, args...; workgroupsize=nothing)
+# Default workgroupsize=256 gives ~14% speedup on CUDA (Ampere) vs auto-selection.
+# Static workgroupsize helps the compiler optimize register allocation and enables
+# more concurrent blocks per SM.
+const DEFAULT_WORKGROUPSIZE = 256
+
+"""
+    foreach(f, queue::WorkQueue, args...; workgroupsize=DEFAULT_WORKGROUPSIZE)
+
+Launch `_workqueue_map_kernel!` with `f`, `queue` and `args...` over
+`length(queue)` work items on the backend of `queue.items`. Returns `nothing`;
+an empty queue launches nothing.
+
+# Keywords
+- `workgroupsize`: static workgroup size baked into the kernel. Pass `nothing`
+  (the default before `DEFAULT_WORKGROUPSIZE` existed) to let the backend choose.
+"""
+function Base.foreach(f, queue::WorkQueue, args...; workgroupsize=DEFAULT_WORKGROUPSIZE)
     n = length(queue)
     n == 0 && return nothing
     backend = KA.get_backend(queue.items)
-    kernel! = _workqueue_map_kernel!(backend)
-    if workgroupsize === nothing
-        kernel!(f, queue, args...; ndrange=n)
-    else
-        kernel!(f, queue, args...; ndrange=n, workgroupsize=workgroupsize)
-    end
+    # Callers written against the old `workgroupsize=nothing` default must keep
+    # working: `nothing` cannot be turned into a static size, so fall back to the
+    # size-less kernel constructor in that case.
+    kernel! = if workgroupsize === nothing
+        _workqueue_map_kernel!(backend)
+    else
+        _workqueue_map_kernel!(backend, workgroupsize)
+    end
+    kernel!(f, queue, args...; ndrange=n)
     return nothing
 end
 
diff --git a/src/sampler/sobol.jl b/src/sampler/sobol.jl
@@ -111,12 +111,9 @@ Arguments:
     v = UInt32(0)
     base_i = dimension * SOBOL_MATRIX_SIZE + Int32(1)  # Julia is 1-indexed
 
-    # Fully unrolled loop using @nexprs for SPIR-V structured control flow compatibility
-    # Each iteration: branchless XOR - always read matrix, mask with bit value
-    # Uses pure bitwise ops to avoid any conditional branches
-    Base.Cartesian.@nexprs 52 bit -> begin
-        bit0 = Int32(bit) - Int32(1)
-        # Extract bit and spread to all 32 bits: 0 if bit clear, 0xffffffff if bit set
+    # Regular for loop — CUDA compiler selects optimal unroll factor.
+    # Branchless XOR: always read matrix, mask with bit value.
+    for bit0 in Int32(0):Int32(SOBOL_MATRIX_SIZE - 1)
         bit_val = UInt32((a >> bit0) & Int64(1))
         mask = (bit_val * UInt32(0xffffffff))  # 0 or 0xffffffff
         @inbounds v ⊻= sobol_matrices[base_i + bit0] & mask
@@ -224,9 +221,9 @@ Uses compile-time unrolled loop with branchless operations for SPIR-V compatibil
     last_digit = pow2_flag
     pow2_adjust = pow2_flag
 
-    # Compile-time unrolled loop (32 iterations covers up to 64-bit Morton codes)
-    Base.Cartesian.@nexprs 32 iter -> begin
-        iter0 = Int32(iter) - Int32(1)
+    # Regular for loop — CUDA compiler selects optimal unroll factor.
+    # 32 iterations covers up to 64-bit Morton codes.
+    for iter0 in Int32(0):Int32(31)
         i = n_base4_digits - Int32(1) - iter0
 
         # Branchless max to ensure digit_shift >= 0
diff --git a/src/spectral/spectral.jl b/src/spectral/spectral.jl
@@ -276,11 +276,7 @@ Check if wavelength i has non-zero PDF (should contribute).
     Atomix.@atomic pixel_L[base_idx+Int32(4)] += contrib[4]
 end
 
-_pointer(x, idx) = pointer(x, idx)
-_pointer(x::Base.Experimental.Const, idx) = pointer(x.a, idx)
-
-Base.@propagate_inbounds function load(array::AbstractArray{Float32}, index::Integer, ::Type{T}) where T
-    ptr = _pointer(array, index)
-    ptr32 = as_pointer(T, ptr)
-    return Base.unsafe_load(ptr32)
+@inline function load(array, index::Integer, ::Type{T}) where T  # build a `T` from consecutive elements of `array` starting at `index`
+    N = sizeof(T) ÷ sizeof(Float32)  # number of Float32-sized slots that fit in one `T`
+    @inbounds T(ntuple(i -> array[index + i - 1], Val(N)))  # bounds checks are skipped: caller must keep index:index+N-1 valid; NOTE(review): assumes Float32 elements — confirm
 end