Skip to content

Commit 42da30d

Browse files
authored
[QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)
* feat: add float32 GPU pointer encoding and inverse norm calculation with stream support
* refactor: streamline GPU state vector encoding to support precision conversion for both Float32 and Float64
* test: add test file for GPU pointer encoding with Float32 precision
* refactor: improve GPU pointer validation and update documentation for encoding methods
* test: update unsupported encoding test to reflect changes in CUDA tensor encoding methods
* test: add unit test for handling null pointer in GPU pointer encoding for Float32
1 parent 31e8ae6 commit 42da30d

File tree

5 files changed

+512
-66
lines changed

5 files changed

+512
-66
lines changed

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,28 @@ impl AmplitudeEncoder {
510510
device: &Arc<CudaDevice>,
511511
input_ptr: *const f32,
512512
len: usize,
513+
) -> Result<f32> {
514+
unsafe {
515+
Self::calculate_inv_norm_gpu_f32_with_stream(
516+
device,
517+
input_ptr,
518+
len,
519+
std::ptr::null_mut(),
520+
)
521+
}
522+
}
523+
524+
/// Compute inverse L2 norm on GPU for float32 input on a given stream.
525+
///
526+
/// # Safety
527+
/// The caller must ensure `input_ptr` points to valid GPU memory containing
528+
/// at least `len` f32 elements on the same device as `device`.
529+
#[cfg(target_os = "linux")]
530+
pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
531+
device: &Arc<CudaDevice>,
532+
input_ptr: *const f32,
533+
len: usize,
534+
stream: *mut c_void,
513535
) -> Result<f32> {
514536
crate::profile_scope!("GPU::NormSingleF32");
515537

@@ -522,7 +544,7 @@ impl AmplitudeEncoder {
522544
input_ptr,
523545
len,
524546
*norm_buffer.device_ptr_mut() as *mut f32,
525-
std::ptr::null_mut(), // default stream
547+
stream,
526548
)
527549
};
528550

@@ -534,6 +556,8 @@ impl AmplitudeEncoder {
534556
)));
535557
}
536558

559+
sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;
560+
537561
let inv_norm_host = device
538562
.dtoh_sync_copy(&norm_buffer)
539563
.map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;

qdp/qdp-core/src/lib.rs

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,63 @@ pub use pipeline_runner::{
4545
run_throughput_pipeline,
4646
};
4747

48+
use std::ffi::c_void;
4849
use std::sync::Arc;
4950

5051
use crate::dlpack::DLManagedTensor;
5152
use crate::gpu::get_encoder;
5253
use cudarc::driver::CudaDevice;
5354

55+
#[cfg(target_os = "linux")]
56+
fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> {
57+
use crate::gpu::cuda_ffi::{
58+
CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes,
59+
cudaPointerGetAttributes,
60+
};
61+
62+
if ptr.is_null() {
63+
return Err(MahoutError::InvalidInput(
64+
"Input GPU pointer is null".to_string(),
65+
));
66+
}
67+
68+
let mut attrs = CudaPointerAttributes {
69+
memory_type: 0,
70+
device: 0,
71+
device_pointer: std::ptr::null_mut(),
72+
host_pointer: std::ptr::null_mut(),
73+
is_managed: 0,
74+
allocation_flags: 0,
75+
};
76+
77+
let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr) };
78+
if ret != 0 {
79+
return Err(MahoutError::InvalidInput(format!(
80+
"cudaPointerGetAttributes failed for input pointer: {} ({})",
81+
ret,
82+
cuda_error_to_string(ret)
83+
)));
84+
}
85+
86+
if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type != CUDA_MEMORY_TYPE_MANAGED
87+
{
88+
return Err(MahoutError::InvalidInput(format!(
89+
"Input pointer is not CUDA device memory (memory_type={})",
90+
attrs.memory_type
91+
)));
92+
}
93+
94+
let device_ordinal = device.ordinal() as i32;
95+
if attrs.device >= 0 && attrs.device != device_ordinal {
96+
return Err(MahoutError::InvalidInput(format!(
97+
"Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
98+
attrs.device, device_ordinal
99+
)));
100+
}
101+
102+
Ok(())
103+
}
104+
54105
/// Main entry point for Mahout QDP
55106
///
56107
/// Manages GPU context and dispatches encoding tasks.
@@ -418,6 +469,14 @@ impl QdpEngine {
418469
) -> Result<*mut DLManagedTensor> {
419470
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
420471

472+
if input_len == 0 {
473+
return Err(MahoutError::InvalidInput(
474+
"Input data cannot be empty".into(),
475+
));
476+
}
477+
478+
validate_cuda_input_ptr(&self.device, input_d)?;
479+
421480
let state_len = 1usize << num_qubits;
422481
let method = encoding_method.to_ascii_lowercase();
423482

@@ -600,6 +659,130 @@ impl QdpEngine {
600659
}
601660
}
602661

662+
/// Encode from existing GPU pointer (float32 input, amplitude encoding only)
663+
///
664+
/// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
665+
/// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
666+
///
667+
/// # Arguments
668+
/// * `input_d` - Device pointer to input data (f32 array on GPU)
669+
/// * `input_len` - Number of f32 elements in the input
670+
/// * `num_qubits` - Number of qubits for encoding
671+
///
672+
/// # Returns
673+
/// DLPack pointer (state vector in engine precision) for zero-copy PyTorch integration.
674+
/// Internal computation is f32; output is converted to [`Precision`] of the engine.
675+
///
676+
/// # Safety
677+
/// The input pointer must:
678+
/// - Point to valid GPU memory on the same device as the engine
679+
/// - Contain at least `input_len` f32 elements
680+
/// - Remain valid for the duration of this call
681+
#[cfg(target_os = "linux")]
682+
pub unsafe fn encode_from_gpu_ptr_f32(
683+
&self,
684+
input_d: *const f32,
685+
input_len: usize,
686+
num_qubits: usize,
687+
) -> Result<*mut DLManagedTensor> {
688+
unsafe {
689+
self.encode_from_gpu_ptr_f32_with_stream(
690+
input_d,
691+
input_len,
692+
num_qubits,
693+
std::ptr::null_mut(),
694+
)
695+
}
696+
}
697+
698+
/// Encode from existing GPU pointer (float32) on a specified CUDA stream.
699+
///
700+
/// # Returns
701+
/// DLPack pointer (state vector in engine precision). Pass null for `stream` to use the default stream.
702+
///
703+
/// # Safety
704+
/// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
705+
/// must remain valid for the duration of this call.
706+
#[cfg(target_os = "linux")]
707+
pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
708+
&self,
709+
input_d: *const f32,
710+
input_len: usize,
711+
num_qubits: usize,
712+
stream: *mut c_void,
713+
) -> Result<*mut DLManagedTensor> {
714+
crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
715+
716+
if input_len == 0 {
717+
return Err(MahoutError::InvalidInput(
718+
"Input data cannot be empty".into(),
719+
));
720+
}
721+
722+
validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
723+
724+
let state_len = 1usize << num_qubits;
725+
if input_len > state_len {
726+
return Err(MahoutError::InvalidInput(format!(
727+
"Input size {} exceeds state vector size {} (2^{} qubits)",
728+
input_len, state_len, num_qubits
729+
)));
730+
}
731+
732+
let state_vector = {
733+
crate::profile_scope!("GPU::Alloc");
734+
gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float32)?
735+
};
736+
737+
let inv_norm = {
738+
crate::profile_scope!("GPU::NormFromPtr");
739+
unsafe {
740+
gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
741+
&self.device,
742+
input_d,
743+
input_len,
744+
stream,
745+
)?
746+
}
747+
};
748+
749+
let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
750+
MahoutError::InvalidInput(
751+
"State vector precision mismatch (expected float32 buffer)".to_string(),
752+
)
753+
})?;
754+
755+
{
756+
crate::profile_scope!("GPU::KernelLaunch");
757+
let ret = unsafe {
758+
qdp_kernels::launch_amplitude_encode_f32(
759+
input_d,
760+
state_ptr as *mut std::ffi::c_void,
761+
input_len,
762+
state_len,
763+
inv_norm,
764+
stream,
765+
)
766+
};
767+
768+
if ret != 0 {
769+
return Err(MahoutError::KernelLaunch(format!(
770+
"Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
771+
ret,
772+
cuda_error_to_string(ret)
773+
)));
774+
}
775+
}
776+
777+
{
778+
crate::profile_scope!("GPU::Synchronize");
779+
gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
780+
}
781+
782+
let state_vector = state_vector.to_precision(&self.device, self.precision)?;
783+
Ok(state_vector.to_dlpack())
784+
}
785+
603786
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
604787
///
605788
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
@@ -671,6 +854,14 @@ impl QdpEngine {
671854
));
672855
}
673856

857+
if sample_size == 0 {
858+
return Err(MahoutError::InvalidInput(
859+
"Sample size cannot be zero".into(),
860+
));
861+
}
862+
863+
validate_cuda_input_ptr(&self.device, input_batch_d)?;
864+
674865
match method.as_str() {
675866
"amplitude" => {
676867
if sample_size == 0 {

qdp/qdp-core/tests/common/mod.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,14 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17-
/// Creates normalized test data
17+
/// Creates normalized test data (f64)
1818
#[allow(dead_code)] // Used by multiple test modules
1919
pub fn create_test_data(size: usize) -> Vec<f64> {
2020
(0..size).map(|i| (i as f64) / (size as f64)).collect()
2121
}
22+
23+
/// Creates normalized test data (f32)
24+
#[allow(dead_code)]
25+
pub fn create_test_data_f32(size: usize) -> Vec<f32> {
26+
(0..size).map(|i| (i as f32) / (size as f32)).collect()
27+
}

0 commit comments

Comments (0)