Skip to content

Commit e24e526

Browse files
authored
Merge branch 'main' into qiskit-testing-coverage
2 parents 9e96f9b + 52bc028 commit e24e526

File tree

15 files changed

+897
-78
lines changed

15 files changed

+897
-78
lines changed

qdp/qdp-core/src/encoding/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,12 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
146146
let num_samples = reader_core.total_rows;
147147

148148
// Allocate output state vector
149-
let total_state_vector = GpuStateVector::new_batch(&engine.device, num_samples, num_qubits)?;
149+
let total_state_vector = GpuStateVector::new_batch(
150+
&engine.device,
151+
num_samples,
152+
num_qubits,
153+
crate::Precision::Float64,
154+
)?;
150155
const PIPELINE_EVENT_SLOTS: usize = 2;
151156
let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;
152157

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 117 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
4040
use cudarc::driver::{DevicePtr, DevicePtrMut};
4141
#[cfg(target_os = "linux")]
4242
use qdp_kernels::{
43-
launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
44-
launch_l2_norm_f32,
43+
launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32,
44+
launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
4545
};
4646
#[cfg(target_os = "linux")]
4747
use std::ffi::c_void;
@@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
206206
// Allocate single large GPU buffer for all states
207207
let batch_state_vector = {
208208
crate::profile_scope!("GPU::AllocBatch");
209-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
209+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
210210
};
211211

212212
// Upload input data to GPU
@@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
386386
let input_batch_d = input_batch_d as *const f64;
387387
let batch_state_vector = {
388388
crate::profile_scope!("GPU::AllocBatch");
389-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
389+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
390390
};
391391
let inv_norms_gpu = {
392392
crate::profile_scope!("GPU::BatchNormKernel");
@@ -579,6 +579,119 @@ impl AmplitudeEncoder {
579579
}
580580

581581
impl AmplitudeEncoder {
582+
/// Encode a batch directly from a GPU float32 pointer.
583+
///
584+
/// # Safety
585+
/// The caller must ensure `input_batch_d` points to valid GPU memory containing
586+
/// at least `num_samples * sample_size` f32 elements on the same device as `device`.
587+
#[cfg(target_os = "linux")]
588+
pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
589+
device: &Arc<CudaDevice>,
590+
input_batch_d: *const f32,
591+
num_samples: usize,
592+
sample_size: usize,
593+
num_qubits: usize,
594+
stream: *mut c_void,
595+
) -> Result<GpuStateVector> {
596+
let state_len = 1 << num_qubits;
597+
if num_samples == 0 {
598+
return Err(MahoutError::InvalidInput(
599+
"Number of samples cannot be zero".into(),
600+
));
601+
}
602+
if sample_size == 0 {
603+
return Err(MahoutError::InvalidInput(
604+
"Sample size cannot be zero".into(),
605+
));
606+
}
607+
if sample_size > state_len {
608+
return Err(MahoutError::InvalidInput(format!(
609+
"Sample size {} exceeds state vector size {} (2^{} qubits)",
610+
sample_size, state_len, num_qubits
611+
)));
612+
}
613+
614+
let batch_state_vector =
615+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?;
616+
617+
let inv_norms_gpu = {
618+
crate::profile_scope!("GPU::BatchNormKernelF32");
619+
use cudarc::driver::DevicePtrMut;
620+
621+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
622+
MahoutError::MemoryAllocation(format!(
623+
"Failed to allocate f32 norm buffer: {:?}",
624+
e
625+
))
626+
})?;
627+
let ret = unsafe {
628+
launch_l2_norm_batch_f32(
629+
input_batch_d,
630+
num_samples,
631+
sample_size,
632+
*buffer.device_ptr_mut() as *mut f32,
633+
stream,
634+
)
635+
};
636+
if ret != 0 {
637+
return Err(MahoutError::KernelLaunch(format!(
638+
"Norm reduction kernel f32 failed with CUDA error code: {} ({})",
639+
ret,
640+
cuda_error_to_string(ret)
641+
)));
642+
}
643+
buffer
644+
};
645+
646+
{
647+
crate::profile_scope!("GPU::NormValidationF32");
648+
let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
649+
MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e))
650+
})?;
651+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
652+
return Err(MahoutError::InvalidInput(
653+
"One or more float32 samples have zero or invalid norm".to_string(),
654+
));
655+
}
656+
}
657+
658+
{
659+
crate::profile_scope!("GPU::BatchKernelLaunchF32");
660+
use cudarc::driver::DevicePtr;
661+
662+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
663+
MahoutError::InvalidInput(
664+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
665+
)
666+
})?;
667+
let ret = unsafe {
668+
launch_amplitude_encode_batch_f32(
669+
input_batch_d,
670+
state_ptr as *mut c_void,
671+
*inv_norms_gpu.device_ptr() as *const f32,
672+
num_samples,
673+
sample_size,
674+
state_len,
675+
stream,
676+
)
677+
};
678+
if ret != 0 {
679+
return Err(MahoutError::KernelLaunch(format!(
680+
"Batch kernel f32 launch failed with CUDA error code: {} ({})",
681+
ret,
682+
cuda_error_to_string(ret)
683+
)));
684+
}
685+
}
686+
687+
{
688+
crate::profile_scope!("GPU::Synchronize");
689+
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
690+
}
691+
692+
Ok(batch_state_vector)
693+
}
694+
582695
/// Compute inverse L2 norm on GPU using the reduction kernel.
583696
///
584697
/// # Arguments

qdp/qdp-core/src/gpu/encodings/angle.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {
168168

169169
let batch_state_vector = {
170170
crate::profile_scope!("GPU::AllocBatch");
171-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
171+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
172172
};
173173

174174
let input_bytes = std::mem::size_of_val(batch_data);
@@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
337337
}
338338
let batch_state_vector = {
339339
crate::profile_scope!("GPU::AllocBatch");
340-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
340+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
341341
};
342342
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
343343
MahoutError::InvalidInput(
@@ -412,7 +412,7 @@ impl AngleEncoder {
412412
) -> Result<GpuStateVector> {
413413
let batch_state_vector = {
414414
crate::profile_scope!("GPU::AllocBatch");
415-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
415+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
416416
};
417417

418418
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {

qdp/qdp-core/src/gpu/encodings/basis.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
169169
// Allocate batch state vector
170170
let batch_state_vector = {
171171
crate::profile_scope!("GPU::AllocBatch");
172-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
172+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
173173
};
174174

175175
// Upload basis indices to GPU
@@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
298298
let basis_indices_d = input_batch_d as *const usize;
299299
let batch_state_vector = {
300300
crate::profile_scope!("GPU::AllocBatch");
301-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
301+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
302302
};
303303
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
304304
MahoutError::InvalidInput(

qdp/qdp-core/src/gpu/encodings/iqp.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {
190190

191191
let batch_state_vector = {
192192
crate::profile_scope!("GPU::AllocBatch");
193-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
193+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
194194
};
195195

196196
let input_bytes = std::mem::size_of_val(batch_data);

qdp/qdp-core/src/gpu/memory.rs

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,14 @@ impl GpuStateVector {
342342
self.size_elements
343343
}
344344

345-
/// Create GPU state vector for a batch of samples
346-
/// Allocates num_samples * 2^qubits complex numbers on GPU
347-
pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits: usize) -> Result<Self> {
345+
/// Create GPU state vector for a batch of samples with the given precision.
346+
/// Allocates `num_samples * 2^qubits` complex numbers on GPU.
347+
pub fn new_batch(
348+
_device: &Arc<CudaDevice>,
349+
num_samples: usize,
350+
qubits: usize,
351+
precision: Precision,
352+
) -> Result<Self> {
348353
let single_state_size: usize = 1usize << qubits;
349354
let total_elements = num_samples.checked_mul(single_state_size).ok_or_else(|| {
350355
MahoutError::MemoryAllocation(format!(
@@ -355,34 +360,51 @@ impl GpuStateVector {
355360

356361
#[cfg(target_os = "linux")]
357362
{
358-
let requested_bytes = total_elements
359-
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
360-
.ok_or_else(|| {
361-
MahoutError::MemoryAllocation(format!(
362-
"Requested GPU allocation size overflow (elements={})",
363-
total_elements
364-
))
365-
})?;
363+
let buffer = match precision {
364+
Precision::Float32 => {
365+
let requested_bytes = total_elements
366+
.checked_mul(std::mem::size_of::<CuComplex>())
367+
.ok_or_else(|| {
368+
MahoutError::MemoryAllocation(format!(
369+
"Requested GPU allocation size overflow (elements={})",
370+
total_elements
371+
))
372+
})?;
366373

367-
// Pre-flight check
368-
ensure_device_memory_available(
369-
requested_bytes,
370-
"batch state vector allocation",
371-
Some(qubits),
372-
)?;
374+
let context = "batch state vector allocation (f32)";
375+
ensure_device_memory_available(requested_bytes, context, Some(qubits))?;
373376

374-
let slice =
375-
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
376-
map_allocation_error(
377-
requested_bytes,
378-
"batch state vector allocation",
379-
Some(qubits),
380-
e,
381-
)
382-
})?;
377+
let slice =
378+
unsafe { _device.alloc::<CuComplex>(total_elements) }.map_err(|e| {
379+
map_allocation_error(requested_bytes, context, Some(qubits), e)
380+
})?;
381+
382+
BufferStorage::F32(GpuBufferRaw { slice })
383+
}
384+
Precision::Float64 => {
385+
let requested_bytes = total_elements
386+
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
387+
.ok_or_else(|| {
388+
MahoutError::MemoryAllocation(format!(
389+
"Requested GPU allocation size overflow (elements={})",
390+
total_elements
391+
))
392+
})?;
393+
394+
let context = "batch state vector allocation";
395+
ensure_device_memory_available(requested_bytes, context, Some(qubits))?;
396+
397+
let slice = unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }
398+
.map_err(|e| {
399+
map_allocation_error(requested_bytes, context, Some(qubits), e)
400+
})?;
401+
402+
BufferStorage::F64(GpuBufferRaw { slice })
403+
}
404+
};
383405

384406
Ok(Self {
385-
buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
407+
buffer: Arc::new(buffer),
386408
num_qubits: qubits,
387409
size_elements: total_elements,
388410
num_samples: Some(num_samples),

qdp/qdp-core/src/lib.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,78 @@ impl QdpEngine {
605605
Ok(state_vector.to_dlpack())
606606
}
607607

608+
    /// Encode a batch from an existing GPU pointer (float32 input, amplitude encoding only).
    ///
    /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
    /// For stream interop use `encode_batch_from_gpu_ptr_f32_with_stream`.
    ///
    /// # Safety
    /// The input pointer must:
    /// - Point to valid GPU memory on the same device as the engine
    /// - Contain at least `num_samples * sample_size` f32 elements
    /// - Remain valid for the duration of this call
    #[cfg(target_os = "linux")]
    pub unsafe fn encode_batch_from_gpu_ptr_f32(
        &self,
        input_batch_d: *const f32,
        num_samples: usize,
        sample_size: usize,
        num_qubits: usize,
    ) -> Result<*mut DLManagedTensor> {
        // Thin delegation: a null stream pointer selects the default CUDA
        // stream in the stream-aware variant.
        unsafe {
            self.encode_batch_from_gpu_ptr_f32_with_stream(
                input_batch_d,
                num_samples,
                sample_size,
                num_qubits,
                std::ptr::null_mut(),
            )
        }
    }
636+
637+
    /// Encode a float32 amplitude batch from an existing GPU pointer on a specified CUDA stream.
    ///
    /// Validates the inputs, runs the f32 amplitude encoder on the given stream,
    /// converts the resulting state vector to the engine's configured precision,
    /// and returns it as a DLPack managed tensor.
    ///
    /// # Errors
    /// Returns `InvalidInput` when `num_samples` or `sample_size` is zero, or when
    /// pointer validation fails; propagates encoder / precision-conversion errors.
    ///
    /// # Safety
    /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the stream pointer
    /// must remain valid for the duration of this call.
    #[cfg(target_os = "linux")]
    pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
        &self,
        input_batch_d: *const f32,
        num_samples: usize,
        sample_size: usize,
        num_qubits: usize,
        stream: *mut c_void,
    ) -> Result<*mut DLManagedTensor> {
        crate::profile_scope!("Mahout::EncodeBatchFromGpuPtrF32");

        // Fail fast on trivially invalid sizes before touching the device.
        // NOTE(review): the encoder repeats these checks; the duplication looks
        // intentional (clear errors at the public API boundary) — confirm.
        if num_samples == 0 {
            return Err(MahoutError::InvalidInput(
                "Number of samples cannot be zero".into(),
            ));
        }
        if sample_size == 0 {
            return Err(MahoutError::InvalidInput(
                "Sample size cannot be zero".into(),
            ));
        }

        // Sanity-check that the pointer is usable CUDA memory for this engine's device.
        validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?;

        // SAFETY: forwarded caller guarantees — valid device pointer covering
        // num_samples * sample_size f32 elements, and a stream valid for the call.
        let batch_state_vector = unsafe {
            gpu::AmplitudeEncoder::encode_batch_from_gpu_ptr_f32_with_stream(
                &self.device,
                input_batch_d,
                num_samples,
                sample_size,
                num_qubits,
                stream,
            )
        }?;
        // The encoder produces a float32 state; convert to the engine's
        // configured precision (no-op when they already match — presumably;
        // verify against to_precision's implementation).
        let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?;
        Ok(batch_state_vector.to_dlpack())
    }
679+
608680
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
609681
///
610682
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.

0 commit comments

Comments
 (0)