Skip to content

Commit e24e526

Browse files
authored
Merge branch 'main' into qiskit-testing-coverage
2 parents 9e96f9b + 52bc028 commit e24e526

File tree

15 files changed

+897
-78
lines changed

15 files changed

+897
-78
lines changed

qdp/qdp-core/src/encoding/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,12 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
146146
let num_samples = reader_core.total_rows;
147147

148148
// Allocate output state vector
149-
let total_state_vector = GpuStateVector::new_batch(&engine.device, num_samples, num_qubits)?;
149+
let total_state_vector = GpuStateVector::new_batch(
150+
&engine.device,
151+
num_samples,
152+
num_qubits,
153+
crate::Precision::Float64,
154+
)?;
150155
const PIPELINE_EVENT_SLOTS: usize = 2;
151156
let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;
152157

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 117 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
4040
use cudarc::driver::{DevicePtr, DevicePtrMut};
4141
#[cfg(target_os = "linux")]
4242
use qdp_kernels::{
43-
launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
44-
launch_l2_norm_f32,
43+
launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32,
44+
launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
4545
};
4646
#[cfg(target_os = "linux")]
4747
use std::ffi::c_void;
@@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
206206
// Allocate single large GPU buffer for all states
207207
let batch_state_vector = {
208208
crate::profile_scope!("GPU::AllocBatch");
209-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
209+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
210210
};
211211

212212
// Upload input data to GPU
@@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
386386
let input_batch_d = input_batch_d as *const f64;
387387
let batch_state_vector = {
388388
crate::profile_scope!("GPU::AllocBatch");
389-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
389+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
390390
};
391391
let inv_norms_gpu = {
392392
crate::profile_scope!("GPU::BatchNormKernel");
@@ -579,6 +579,119 @@ impl AmplitudeEncoder {
579579
}
580580

581581
impl AmplitudeEncoder {
582+
/// Encode a batch directly from a GPU float32 pointer.
583+
///
584+
/// # Safety
585+
/// The caller must ensure `input_batch_d` points to valid GPU memory containing
586+
/// at least `num_samples * sample_size` f32 elements on the same device as `device`.
587+
#[cfg(target_os = "linux")]
588+
pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
589+
device: &Arc<CudaDevice>,
590+
input_batch_d: *const f32,
591+
num_samples: usize,
592+
sample_size: usize,
593+
num_qubits: usize,
594+
stream: *mut c_void,
595+
) -> Result<GpuStateVector> {
596+
let state_len = 1 << num_qubits;
597+
if num_samples == 0 {
598+
return Err(MahoutError::InvalidInput(
599+
"Number of samples cannot be zero".into(),
600+
));
601+
}
602+
if sample_size == 0 {
603+
return Err(MahoutError::InvalidInput(
604+
"Sample size cannot be zero".into(),
605+
));
606+
}
607+
if sample_size > state_len {
608+
return Err(MahoutError::InvalidInput(format!(
609+
"Sample size {} exceeds state vector size {} (2^{} qubits)",
610+
sample_size, state_len, num_qubits
611+
)));
612+
}
613+
614+
let batch_state_vector =
615+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?;
616+
617+
let inv_norms_gpu = {
618+
crate::profile_scope!("GPU::BatchNormKernelF32");
619+
use cudarc::driver::DevicePtrMut;
620+
621+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
622+
MahoutError::MemoryAllocation(format!(
623+
"Failed to allocate f32 norm buffer: {:?}",
624+
e
625+
))
626+
})?;
627+
let ret = unsafe {
628+
launch_l2_norm_batch_f32(
629+
input_batch_d,
630+
num_samples,
631+
sample_size,
632+
*buffer.device_ptr_mut() as *mut f32,
633+
stream,
634+
)
635+
};
636+
if ret != 0 {
637+
return Err(MahoutError::KernelLaunch(format!(
638+
"Norm reduction kernel f32 failed with CUDA error code: {} ({})",
639+
ret,
640+
cuda_error_to_string(ret)
641+
)));
642+
}
643+
buffer
644+
};
645+
646+
{
647+
crate::profile_scope!("GPU::NormValidationF32");
648+
let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
649+
MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e))
650+
})?;
651+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
652+
return Err(MahoutError::InvalidInput(
653+
"One or more float32 samples have zero or invalid norm".to_string(),
654+
));
655+
}
656+
}
657+
658+
{
659+
crate::profile_scope!("GPU::BatchKernelLaunchF32");
660+
use cudarc::driver::DevicePtr;
661+
662+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
663+
MahoutError::InvalidInput(
664+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
665+
)
666+
})?;
667+
let ret = unsafe {
668+
launch_amplitude_encode_batch_f32(
669+
input_batch_d,
670+
state_ptr as *mut c_void,
671+
*inv_norms_gpu.device_ptr() as *const f32,
672+
num_samples,
673+
sample_size,
674+
state_len,
675+
stream,
676+
)
677+
};
678+
if ret != 0 {
679+
return Err(MahoutError::KernelLaunch(format!(
680+
"Batch kernel f32 launch failed with CUDA error code: {} ({})",
681+
ret,
682+
cuda_error_to_string(ret)
683+
)));
684+
}
685+
}
686+
687+
{
688+
crate::profile_scope!("GPU::Synchronize");
689+
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
690+
}
691+
692+
Ok(batch_state_vector)
693+
}
694+
582695
/// Compute inverse L2 norm on GPU using the reduction kernel.
583696
///
584697
/// # Arguments

qdp/qdp-core/src/gpu/encodings/angle.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {
168168

169169
let batch_state_vector = {
170170
crate::profile_scope!("GPU::AllocBatch");
171-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
171+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
172172
};
173173

174174
let input_bytes = std::mem::size_of_val(batch_data);
@@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
337337
}
338338
let batch_state_vector = {
339339
crate::profile_scope!("GPU::AllocBatch");
340-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
340+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
341341
};
342342
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
343343
MahoutError::InvalidInput(
@@ -412,7 +412,7 @@ impl AngleEncoder {
412412
) -> Result<GpuStateVector> {
413413
let batch_state_vector = {
414414
crate::profile_scope!("GPU::AllocBatch");
415-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
415+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
416416
};
417417

418418
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {

qdp/qdp-core/src/gpu/encodings/basis.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
169169
// Allocate batch state vector
170170
let batch_state_vector = {
171171
crate::profile_scope!("GPU::AllocBatch");
172-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
172+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
173173
};
174174

175175
// Upload basis indices to GPU
@@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
298298
let basis_indices_d = input_batch_d as *const usize;
299299
let batch_state_vector = {
300300
crate::profile_scope!("GPU::AllocBatch");
301-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
301+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
302302
};
303303
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
304304
MahoutError::InvalidInput(

qdp/qdp-core/src/gpu/encodings/iqp.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {
190190

191191
let batch_state_vector = {
192192
crate::profile_scope!("GPU::AllocBatch");
193-
GpuStateVector::new_batch(device, num_samples, num_qubits)?
193+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
194194
};
195195

196196
let input_bytes = std::mem::size_of_val(batch_data);

qdp/qdp-core/src/gpu/memory.rs

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,14 @@ impl GpuStateVector {
342342
self.size_elements
343343
}
344344

345-
/// Create GPU state vector for a batch of samples
346-
/// Allocates num_samples * 2^qubits complex numbers on GPU
347-
pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits: usize) -> Result<Self> {
345+
/// Create GPU state vector for a batch of samples with the given precision.
346+
/// Allocates `num_samples * 2^qubits` complex numbers on GPU.
347+
pub fn new_batch(
348+
_device: &Arc<CudaDevice>,
349+
num_samples: usize,
350+
qubits: usize,
351+
precision: Precision,
352+
) -> Result<Self> {
348353
let single_state_size: usize = 1usize << qubits;
349354
let total_elements = num_samples.checked_mul(single_state_size).ok_or_else(|| {
350355
MahoutError::MemoryAllocation(format!(
@@ -355,34 +360,51 @@ impl GpuStateVector {
355360

356361
#[cfg(target_os = "linux")]
357362
{
358-
let requested_bytes = total_elements
359-
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
360-
.ok_or_else(|| {
361-
MahoutError::MemoryAllocation(format!(
362-
"Requested GPU allocation size overflow (elements={})",
363-
total_elements
364-
))
365-
})?;
363+
let buffer = match precision {
364+
Precision::Float32 => {
365+
let requested_bytes = total_elements
366+
.checked_mul(std::mem::size_of::<CuComplex>())
367+
.ok_or_else(|| {
368+
MahoutError::MemoryAllocation(format!(
369+
"Requested GPU allocation size overflow (elements={})",
370+
total_elements
371+
))
372+
})?;
366373

367-
// Pre-flight check
368-
ensure_device_memory_available(
369-
requested_bytes,
370-
"batch state vector allocation",
371-
Some(qubits),
372-
)?;
374+
let context = "batch state vector allocation (f32)";
375+
ensure_device_memory_available(requested_bytes, context, Some(qubits))?;
373376

374-
let slice =
375-
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
376-
map_allocation_error(
377-
requested_bytes,
378-
"batch state vector allocation",
379-
Some(qubits),
380-
e,
381-
)
382-
})?;
377+
let slice =
378+
unsafe { _device.alloc::<CuComplex>(total_elements) }.map_err(|e| {
379+
map_allocation_error(requested_bytes, context, Some(qubits), e)
380+
})?;
381+
382+
BufferStorage::F32(GpuBufferRaw { slice })
383+
}
384+
Precision::Float64 => {
385+
let requested_bytes = total_elements
386+
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
387+
.ok_or_else(|| {
388+
MahoutError::MemoryAllocation(format!(
389+
"Requested GPU allocation size overflow (elements={})",
390+
total_elements
391+
))
392+
})?;
393+
394+
let context = "batch state vector allocation";
395+
ensure_device_memory_available(requested_bytes, context, Some(qubits))?;
396+
397+
let slice = unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }
398+
.map_err(|e| {
399+
map_allocation_error(requested_bytes, context, Some(qubits), e)
400+
})?;
401+
402+
BufferStorage::F64(GpuBufferRaw { slice })
403+
}
404+
};
383405

384406
Ok(Self {
385-
buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
407+
buffer: Arc::new(buffer),
386408
num_qubits: qubits,
387409
size_elements: total_elements,
388410
num_samples: Some(num_samples),

qdp/qdp-core/src/lib.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,78 @@ impl QdpEngine {
605605
Ok(state_vector.to_dlpack())
606606
}
607607

608+
    /// Encode a batch from an existing GPU pointer (float32 input, amplitude encoding only).
    ///
    /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
    /// For stream interop use `encode_batch_from_gpu_ptr_f32_with_stream`.
    ///
    /// # Safety
    /// The input pointer must:
    /// - Point to valid GPU memory on the same device as the engine
    /// - Contain at least `num_samples * sample_size` f32 elements
    /// - Remain valid for the duration of this call
    #[cfg(target_os = "linux")]
    pub unsafe fn encode_batch_from_gpu_ptr_f32(
        &self,
        input_batch_d: *const f32,
        num_samples: usize,
        sample_size: usize,
        num_qubits: usize,
    ) -> Result<*mut DLManagedTensor> {
        // Thin delegation: a null stream pointer selects the default CUDA
        // stream in the stream-aware variant.
        unsafe {
            self.encode_batch_from_gpu_ptr_f32_with_stream(
                input_batch_d,
                num_samples,
                sample_size,
                num_qubits,
                std::ptr::null_mut(),
            )
        }
    }
636+
637+
    /// Encode a float32 amplitude batch from an existing GPU pointer on a specified CUDA stream.
    ///
    /// Validates the inputs, runs the f32 amplitude encoder on the given stream,
    /// converts the resulting state vector to the engine's configured precision,
    /// and returns it as a DLPack managed tensor.
    ///
    /// # Errors
    /// Returns `InvalidInput` when `num_samples` or `sample_size` is zero, or when
    /// pointer validation fails; propagates encoder / precision-conversion errors.
    ///
    /// # Safety
    /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the stream pointer
    /// must remain valid for the duration of this call.
    #[cfg(target_os = "linux")]
    pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
        &self,
        input_batch_d: *const f32,
        num_samples: usize,
        sample_size: usize,
        num_qubits: usize,
        stream: *mut c_void,
    ) -> Result<*mut DLManagedTensor> {
        crate::profile_scope!("Mahout::EncodeBatchFromGpuPtrF32");

        // Fail fast on trivially invalid sizes before touching the device.
        // NOTE(review): the encoder repeats these checks; the duplication looks
        // intentional (clear errors at the public API boundary) — confirm.
        if num_samples == 0 {
            return Err(MahoutError::InvalidInput(
                "Number of samples cannot be zero".into(),
            ));
        }
        if sample_size == 0 {
            return Err(MahoutError::InvalidInput(
                "Sample size cannot be zero".into(),
            ));
        }

        // Sanity-check that the pointer is usable CUDA memory for this engine's device.
        validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?;

        // SAFETY: forwarded caller guarantees — valid device pointer covering
        // num_samples * sample_size f32 elements, and a stream valid for the call.
        let batch_state_vector = unsafe {
            gpu::AmplitudeEncoder::encode_batch_from_gpu_ptr_f32_with_stream(
                &self.device,
                input_batch_d,
                num_samples,
                sample_size,
                num_qubits,
                stream,
            )
        }?;
        // The encoder produces a float32 state; convert to the engine's
        // configured precision (no-op when they already match — presumably;
        // verify against to_precision's implementation).
        let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?;
        Ok(batch_state_vector.to_dlpack())
    }
679+
608680
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
609681
///
610682
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.

0 commit comments

Comments
 (0)