Skip to content

Commit 898eb27

Browse files
committed
CUDA support and some optimizations
1 parent 387d2a6 commit 898eb27

File tree

6 files changed

+31
-42
lines changed

6 files changed

+31
-42
lines changed

src/camera/camera.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ Same as `generate_ray`, but also computes rays for pixels shifted one pixel
5454
in x & y directions on the film plane.
5555
Useful for anti-aliasing textures.
5656
"""
57-
function generate_ray_differential(
57+
@inline function generate_ray_differential(
5858
camera::C, sample::CameraSample,
5959
)::Tuple{RayDifferentials,Float32} where C<:Camera
6060

src/film.jl

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -420,17 +420,19 @@ function fill_aux_buffers!(film::Film, scene, camera; has_infinite_lights::Bool=
420420

421421
backend = KA.get_backend(albedo)
422422
kernel! = aux_buffer_kernel!(backend)
423+
# Pass scene.accel directly instead of the full scene struct to avoid
424+
# misaligned address errors on CUDA from passing large nested structs.
423425
kernel!(
424426
albedo, normal, depth,
425427
resolution, crop_bounds,
426-
scene, camera, miss_depth;
428+
scene.accel, camera, miss_depth;
427429
ndrange = length(albedo)
428430
)
429431
KA.synchronize(backend)
430432
return film
431433
end
432434

433-
@kernel inbounds=true function aux_buffer_kernel!(albedo, normal, depth, resolution, crop_bounds, scene, camera, miss_depth::Float32)
435+
@kernel inbounds=true function aux_buffer_kernel!(albedo, normal, depth, resolution, crop_bounds, accel, camera, miss_depth::Float32)
434436
idx = @index(Global)
435437

436438
# Convert linear index to 2D pixel coordinates
@@ -444,13 +446,15 @@ end
444446
py = Float32(row) + crop_bounds.p_min[2] - 1f0
445447
pixel = Point2f(px + 0.5f0, py + 0.5f0) # Center of pixel
446448

447-
# Generate primary ray
449+
# Generate primary ray (use generate_ray, not generate_ray_differential —
450+
# aux buffers don't need ray differentials, and the simpler call avoids
451+
# large return-value stack pressure on CUDA)
448452
camera_sample = CameraSample(pixel, Point2f(0.5f0), 0f0)
449-
ray, ω = generate_ray_differential(camera, camera_sample)
453+
ray, ω = generate_ray(camera, camera_sample)
450454

451455
if ω > 0f0
452456
# Trace primary ray
453-
hit, _primitive, si = intersect!(scene, ray)
457+
hit, _primitive, si = intersect!(accel, ray)
454458

455459
if hit
456460
# Store normal (world space)

src/integrators/volpath/volpath.jl

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -250,14 +250,6 @@ This replaces rand() calls with correlated low-discrepancy samples.
250250
py = u_int32(div(pixel_idx_0, rng.width)) + Int32(1)
251251

252252
# Base dimension: 6 (camera) + 7 * depth
253-
# Camera uses dims 0-5: wavelength(1), film_x(1), film_y(1), lens_x(1), lens_y(1), filter(1)
254-
# pbrt-v4's Get1D/Get2D increment dimension BEFORE computing hash, so:
255-
# - Get1D for direct.uc: dim++ → 7, hash uses 7
256-
# - Get2D for direct.u: dim+=2 → 9, hash uses 9
257-
# - Get1D for indirect.uc: dim++ → 10, hash uses 10
258-
# - Get2D for indirect.u: dim+=2 → 12, hash uses 12
259-
# - Get1D for indirect.rr: dim++ → 13, hash uses 13
260-
# So dimensions used are: 7, 9, 10, 12, 13 (for depth 0)
261253
base_dim = Int32(6) + Int32(7) * depth
262254
# Generate 7 samples for this bounce using SobolRNG
263255
# Direct lighting: light selection (1D) + light position (2D)
@@ -290,12 +282,11 @@ function vp_generate_ray_samples!(
290282
depth::Int32,
291283
sobol_rng::SobolRNG
292284
)
293-
ray_queue = current_ray_queue(state)
294-
n_rays = length(ray_queue)
295-
n_rays == 0 && return
296-
297285
# Fetch the current ray queue and the SoA pixel_samples; return early when the queue is empty
286+
ray_queue = current_ray_queue(state)
298287
pixel_samples = state.pixel_samples
288+
n = length(ray_queue)
289+
n == 0 && return
299290

300291
kernel! = vp_generate_ray_samples_kernel!(backend)
301292
kernel!(
@@ -309,7 +300,7 @@ function vp_generate_ray_samples!(
309300
sample_idx,
310301
depth,
311302
sobol_rng; # Pass whole SobolRNG, Adapt handles conversion
312-
ndrange=n_rays
303+
ndrange=n
313304
)
314305
end
315306

src/integrators/workqueue.jl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -172,16 +172,17 @@ end
172172
end
173173
end
174174

175-
function Base.foreach(f, queue::WorkQueue, args...; workgroupsize=nothing)
175+
# Default workgroupsize=256 gives ~14% speedup on CUDA (Ampere) vs auto-selection.
176+
# Static workgroupsize helps the compiler optimize register allocation and enables
177+
# more concurrent blocks per SM.
178+
const DEFAULT_WORKGROUPSIZE = 256
179+
180+
function Base.foreach(f, queue::WorkQueue, args...; workgroupsize=DEFAULT_WORKGROUPSIZE)
176181
n = length(queue)
177182
n == 0 && return nothing
178183
backend = KA.get_backend(queue.items)
179-
kernel! = _workqueue_map_kernel!(backend)
180-
if workgroupsize === nothing
181-
kernel!(f, queue, args...; ndrange=n)
182-
else
183-
kernel!(f, queue, args...; ndrange=n, workgroupsize=workgroupsize)
184-
end
184+
kernel! = _workqueue_map_kernel!(backend, workgroupsize)
185+
kernel!(f, queue, args...; ndrange=n)
185186
return nothing
186187
end
187188

src/sampler/sobol.jl

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,12 +111,9 @@ Arguments:
111111
v = UInt32(0)
112112
base_i = dimension * SOBOL_MATRIX_SIZE + Int32(1) # Julia is 1-indexed
113113

114-
# Fully unrolled loop using @nexprs for SPIR-V structured control flow compatibility
115-
# Each iteration: branchless XOR - always read matrix, mask with bit value
116-
# Uses pure bitwise ops to avoid any conditional branches
117-
Base.Cartesian.@nexprs 52 bit -> begin
118-
bit0 = Int32(bit) - Int32(1)
119-
# Extract bit and spread to all 32 bits: 0 if bit clear, 0xffffffff if bit set
114+
# Regular for loop — CUDA compiler selects optimal unroll factor.
115+
# Branchless XOR: always read matrix, mask with bit value.
116+
for bit0 in Int32(0):Int32(SOBOL_MATRIX_SIZE - 1)
120117
bit_val = UInt32((a >> bit0) & Int64(1))
121118
mask = (bit_val * UInt32(0xffffffff)) # 0 or 0xffffffff
122119
@inbounds v ⊻= sobol_matrices[base_i + bit0] & mask
@@ -224,9 +221,9 @@ Uses compile-time unrolled loop with branchless operations for SPIR-V compatibil
224221
last_digit = pow2_flag
225222
pow2_adjust = pow2_flag
226223

227-
# Compile-time unrolled loop (32 iterations covers up to 64-bit Morton codes)
228-
Base.Cartesian.@nexprs 32 iter -> begin
229-
iter0 = Int32(iter) - Int32(1)
224+
# Regular for loop — CUDA compiler selects optimal unroll factor.
225+
# 32 iterations covers up to 64-bit Morton codes.
226+
for iter0 in Int32(0):Int32(31)
230227
i = n_base4_digits - Int32(1) - iter0
231228

232229
# Branchless max to ensure digit_shift >= 0

src/spectral/spectral.jl

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -276,11 +276,7 @@ Check if wavelength i has non-zero PDF (should contribute).
276276
Atomix.@atomic pixel_L[base_idx+Int32(4)] += contrib[4]
277277
end
278278

279-
_pointer(x, idx) = pointer(x, idx)
280-
_pointer(x::Base.Experimental.Const, idx) = pointer(x.a, idx)
281-
282-
Base.@propagate_inbounds function load(array::AbstractArray{Float32}, index::Integer, ::Type{T}) where T
283-
ptr = _pointer(array, index)
284-
ptr32 = as_pointer(T, ptr)
285-
return Base.unsafe_load(ptr32)
279+
@inline function load(array, index::Integer, ::Type{T}) where T
280+
N = sizeof(T) ÷ sizeof(Float32)
281+
@inbounds T(ntuple(i -> array[index + i - 1], Val(N)))
286282
end

0 commit comments

Comments
 (0)