Skip to content

Commit 42da30d

Browse files
authored
[QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)
* feat: add float32 GPU pointer encoding and inverse norm calculation with stream support
* refactor: streamline GPU state vector encoding to support precision conversion for both Float32 and Float64
* test: add test file for GPU pointer encoding with Float32 precision
* refactor: improve GPU pointer validation and update documentation for encoding methods
* test: update unsupported encoding test to reflect changes in CUDA tensor encoding methods
* test: add unit test for handling null pointer in GPU pointer encoding for Float32
1 parent 31e8ae6 commit 42da30d

File tree

5 files changed

+512
-66
lines changed

5 files changed

+512
-66
lines changed

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,28 @@ impl AmplitudeEncoder {
510510
device: &Arc<CudaDevice>,
511511
input_ptr: *const f32,
512512
len: usize,
513+
) -> Result<f32> {
514+
unsafe {
515+
Self::calculate_inv_norm_gpu_f32_with_stream(
516+
device,
517+
input_ptr,
518+
len,
519+
std::ptr::null_mut(),
520+
)
521+
}
522+
}
523+
524+
/// Compute inverse L2 norm on GPU for float32 input on a given stream.
525+
///
526+
/// # Safety
527+
/// The caller must ensure `input_ptr` points to valid GPU memory containing
528+
/// at least `len` f32 elements on the same device as `device`.
529+
#[cfg(target_os = "linux")]
530+
pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
531+
device: &Arc<CudaDevice>,
532+
input_ptr: *const f32,
533+
len: usize,
534+
stream: *mut c_void,
513535
) -> Result<f32> {
514536
crate::profile_scope!("GPU::NormSingleF32");
515537

@@ -522,7 +544,7 @@ impl AmplitudeEncoder {
522544
input_ptr,
523545
len,
524546
*norm_buffer.device_ptr_mut() as *mut f32,
525-
std::ptr::null_mut(), // default stream
547+
stream,
526548
)
527549
};
528550

@@ -534,6 +556,8 @@ impl AmplitudeEncoder {
534556
)));
535557
}
536558

559+
sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;
560+
537561
let inv_norm_host = device
538562
.dtoh_sync_copy(&norm_buffer)
539563
.map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;

qdp/qdp-core/src/lib.rs

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,63 @@ pub use pipeline_runner::{
4545
run_throughput_pipeline,
4646
};
4747

48+
use std::ffi::c_void;
4849
use std::sync::Arc;
4950

5051
use crate::dlpack::DLManagedTensor;
5152
use crate::gpu::get_encoder;
5253
use cudarc::driver::CudaDevice;
5354

55+
#[cfg(target_os = "linux")]
56+
fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> {
57+
use crate::gpu::cuda_ffi::{
58+
CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes,
59+
cudaPointerGetAttributes,
60+
};
61+
62+
if ptr.is_null() {
63+
return Err(MahoutError::InvalidInput(
64+
"Input GPU pointer is null".to_string(),
65+
));
66+
}
67+
68+
let mut attrs = CudaPointerAttributes {
69+
memory_type: 0,
70+
device: 0,
71+
device_pointer: std::ptr::null_mut(),
72+
host_pointer: std::ptr::null_mut(),
73+
is_managed: 0,
74+
allocation_flags: 0,
75+
};
76+
77+
let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr) };
78+
if ret != 0 {
79+
return Err(MahoutError::InvalidInput(format!(
80+
"cudaPointerGetAttributes failed for input pointer: {} ({})",
81+
ret,
82+
cuda_error_to_string(ret)
83+
)));
84+
}
85+
86+
if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type != CUDA_MEMORY_TYPE_MANAGED
87+
{
88+
return Err(MahoutError::InvalidInput(format!(
89+
"Input pointer is not CUDA device memory (memory_type={})",
90+
attrs.memory_type
91+
)));
92+
}
93+
94+
let device_ordinal = device.ordinal() as i32;
95+
if attrs.device >= 0 && attrs.device != device_ordinal {
96+
return Err(MahoutError::InvalidInput(format!(
97+
"Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
98+
attrs.device, device_ordinal
99+
)));
100+
}
101+
102+
Ok(())
103+
}
104+
54105
/// Main entry point for Mahout QDP
55106
///
56107
/// Manages GPU context and dispatches encoding tasks.
@@ -418,6 +469,14 @@ impl QdpEngine {
418469
) -> Result<*mut DLManagedTensor> {
419470
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
420471

472+
if input_len == 0 {
473+
return Err(MahoutError::InvalidInput(
474+
"Input data cannot be empty".into(),
475+
));
476+
}
477+
478+
validate_cuda_input_ptr(&self.device, input_d)?;
479+
421480
let state_len = 1usize << num_qubits;
422481
let method = encoding_method.to_ascii_lowercase();
423482

@@ -600,6 +659,130 @@ impl QdpEngine {
600659
}
601660
}
602661

662+
/// Encode from existing GPU pointer (float32 input, amplitude encoding only)
663+
///
664+
/// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
665+
/// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
666+
///
667+
/// # Arguments
668+
/// * `input_d` - Device pointer to input data (f32 array on GPU)
669+
/// * `input_len` - Number of f32 elements in the input
670+
/// * `num_qubits` - Number of qubits for encoding
671+
///
672+
/// # Returns
673+
/// DLPack pointer (state vector in engine precision) for zero-copy PyTorch integration.
674+
/// Internal computation is f32; output is converted to [`Precision`] of the engine.
675+
///
676+
/// # Safety
677+
/// The input pointer must:
678+
/// - Point to valid GPU memory on the same device as the engine
679+
/// - Contain at least `input_len` f32 elements
680+
/// - Remain valid for the duration of this call
681+
#[cfg(target_os = "linux")]
682+
pub unsafe fn encode_from_gpu_ptr_f32(
683+
&self,
684+
input_d: *const f32,
685+
input_len: usize,
686+
num_qubits: usize,
687+
) -> Result<*mut DLManagedTensor> {
688+
unsafe {
689+
self.encode_from_gpu_ptr_f32_with_stream(
690+
input_d,
691+
input_len,
692+
num_qubits,
693+
std::ptr::null_mut(),
694+
)
695+
}
696+
}
697+
698+
/// Encode from existing GPU pointer (float32) on a specified CUDA stream.
699+
///
700+
/// # Returns
701+
/// DLPack pointer (state vector in engine precision). Pass null for `stream` to use the default stream.
702+
///
703+
/// # Safety
704+
/// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
705+
/// must remain valid for the duration of this call.
706+
#[cfg(target_os = "linux")]
707+
pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
708+
&self,
709+
input_d: *const f32,
710+
input_len: usize,
711+
num_qubits: usize,
712+
stream: *mut c_void,
713+
) -> Result<*mut DLManagedTensor> {
714+
crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
715+
716+
if input_len == 0 {
717+
return Err(MahoutError::InvalidInput(
718+
"Input data cannot be empty".into(),
719+
));
720+
}
721+
722+
validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
723+
724+
let state_len = 1usize << num_qubits;
725+
if input_len > state_len {
726+
return Err(MahoutError::InvalidInput(format!(
727+
"Input size {} exceeds state vector size {} (2^{} qubits)",
728+
input_len, state_len, num_qubits
729+
)));
730+
}
731+
732+
let state_vector = {
733+
crate::profile_scope!("GPU::Alloc");
734+
gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float32)?
735+
};
736+
737+
let inv_norm = {
738+
crate::profile_scope!("GPU::NormFromPtr");
739+
unsafe {
740+
gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
741+
&self.device,
742+
input_d,
743+
input_len,
744+
stream,
745+
)?
746+
}
747+
};
748+
749+
let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
750+
MahoutError::InvalidInput(
751+
"State vector precision mismatch (expected float32 buffer)".to_string(),
752+
)
753+
})?;
754+
755+
{
756+
crate::profile_scope!("GPU::KernelLaunch");
757+
let ret = unsafe {
758+
qdp_kernels::launch_amplitude_encode_f32(
759+
input_d,
760+
state_ptr as *mut std::ffi::c_void,
761+
input_len,
762+
state_len,
763+
inv_norm,
764+
stream,
765+
)
766+
};
767+
768+
if ret != 0 {
769+
return Err(MahoutError::KernelLaunch(format!(
770+
"Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
771+
ret,
772+
cuda_error_to_string(ret)
773+
)));
774+
}
775+
}
776+
777+
{
778+
crate::profile_scope!("GPU::Synchronize");
779+
gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
780+
}
781+
782+
let state_vector = state_vector.to_precision(&self.device, self.precision)?;
783+
Ok(state_vector.to_dlpack())
784+
}
785+
603786
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
604787
///
605788
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
@@ -671,6 +854,14 @@ impl QdpEngine {
671854
));
672855
}
673856

857+
if sample_size == 0 {
858+
return Err(MahoutError::InvalidInput(
859+
"Sample size cannot be zero".into(),
860+
));
861+
}
862+
863+
validate_cuda_input_ptr(&self.device, input_batch_d)?;
864+
674865
match method.as_str() {
675866
"amplitude" => {
676867
if sample_size == 0 {

qdp/qdp-core/tests/common/mod.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,14 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17-
/// Creates normalized test data
17+
/// Creates normalized test data (f64)
1818
#[allow(dead_code)] // Used by multiple test modules
1919
pub fn create_test_data(size: usize) -> Vec<f64> {
2020
(0..size).map(|i| (i as f64) / (size as f64)).collect()
2121
}
22+
23+
/// Creates normalized test data (f32)
24+
#[allow(dead_code)]
25+
pub fn create_test_data_f32(size: usize) -> Vec<f32> {
26+
(0..size).map(|i| (i as f32) / (size as f32)).collect()
27+
}

0 commit comments

Comments (0)