@@ -45,12 +45,63 @@ pub use pipeline_runner::{
4545 run_throughput_pipeline,
4646} ;
4747
48+ use std:: ffi:: c_void;
4849use std:: sync:: Arc ;
4950
5051use crate :: dlpack:: DLManagedTensor ;
5152use crate :: gpu:: get_encoder;
5253use cudarc:: driver:: CudaDevice ;
5354
55+ #[ cfg( target_os = "linux" ) ]
56+ fn validate_cuda_input_ptr ( device : & CudaDevice , ptr : * const c_void ) -> Result < ( ) > {
57+ use crate :: gpu:: cuda_ffi:: {
58+ CUDA_MEMORY_TYPE_DEVICE , CUDA_MEMORY_TYPE_MANAGED , CudaPointerAttributes ,
59+ cudaPointerGetAttributes,
60+ } ;
61+
62+ if ptr. is_null ( ) {
63+ return Err ( MahoutError :: InvalidInput (
64+ "Input GPU pointer is null" . to_string ( ) ,
65+ ) ) ;
66+ }
67+
68+ let mut attrs = CudaPointerAttributes {
69+ memory_type : 0 ,
70+ device : 0 ,
71+ device_pointer : std:: ptr:: null_mut ( ) ,
72+ host_pointer : std:: ptr:: null_mut ( ) ,
73+ is_managed : 0 ,
74+ allocation_flags : 0 ,
75+ } ;
76+
77+ let ret = unsafe { cudaPointerGetAttributes ( & mut attrs as * mut _ , ptr) } ;
78+ if ret != 0 {
79+ return Err ( MahoutError :: InvalidInput ( format ! (
80+ "cudaPointerGetAttributes failed for input pointer: {} ({})" ,
81+ ret,
82+ cuda_error_to_string( ret)
83+ ) ) ) ;
84+ }
85+
86+ if attrs. memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs. memory_type != CUDA_MEMORY_TYPE_MANAGED
87+ {
88+ return Err ( MahoutError :: InvalidInput ( format ! (
89+ "Input pointer is not CUDA device memory (memory_type={})" ,
90+ attrs. memory_type
91+ ) ) ) ;
92+ }
93+
94+ let device_ordinal = device. ordinal ( ) as i32 ;
95+ if attrs. device >= 0 && attrs. device != device_ordinal {
96+ return Err ( MahoutError :: InvalidInput ( format ! (
97+ "Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}" ,
98+ attrs. device, device_ordinal
99+ ) ) ) ;
100+ }
101+
102+ Ok ( ( ) )
103+ }
104+
54105/// Main entry point for Mahout QDP
55106///
56107/// Manages GPU context and dispatches encoding tasks.
@@ -418,6 +469,14 @@ impl QdpEngine {
418469 ) -> Result < * mut DLManagedTensor > {
419470 crate :: profile_scope!( "Mahout::EncodeFromGpuPtr" ) ;
420471
472+ if input_len == 0 {
473+ return Err ( MahoutError :: InvalidInput (
474+ "Input data cannot be empty" . into ( ) ,
475+ ) ) ;
476+ }
477+
478+ validate_cuda_input_ptr ( & self . device , input_d) ?;
479+
421480 let state_len = 1usize << num_qubits;
422481 let method = encoding_method. to_ascii_lowercase ( ) ;
423482
@@ -600,6 +659,130 @@ impl QdpEngine {
600659 }
601660 }
602661
662+ /// Encode from existing GPU pointer (float32 input, amplitude encoding only)
663+ ///
664+ /// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
665+ /// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
666+ ///
667+ /// # Arguments
668+ /// * `input_d` - Device pointer to input data (f32 array on GPU)
669+ /// * `input_len` - Number of f32 elements in the input
670+ /// * `num_qubits` - Number of qubits for encoding
671+ ///
672+ /// # Returns
673+ /// DLPack pointer (state vector in engine precision) for zero-copy PyTorch integration.
674+ /// Internal computation is f32; output is converted to [`Precision`] of the engine.
675+ ///
676+ /// # Safety
677+ /// The input pointer must:
678+ /// - Point to valid GPU memory on the same device as the engine
679+ /// - Contain at least `input_len` f32 elements
680+ /// - Remain valid for the duration of this call
681+ #[ cfg( target_os = "linux" ) ]
682+ pub unsafe fn encode_from_gpu_ptr_f32 (
683+ & self ,
684+ input_d : * const f32 ,
685+ input_len : usize ,
686+ num_qubits : usize ,
687+ ) -> Result < * mut DLManagedTensor > {
688+ unsafe {
689+ self . encode_from_gpu_ptr_f32_with_stream (
690+ input_d,
691+ input_len,
692+ num_qubits,
693+ std:: ptr:: null_mut ( ) ,
694+ )
695+ }
696+ }
697+
698+ /// Encode from existing GPU pointer (float32) on a specified CUDA stream.
699+ ///
700+ /// # Returns
701+ /// DLPack pointer (state vector in engine precision). Pass null for `stream` to use the default stream.
702+ ///
703+ /// # Safety
704+ /// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
705+ /// must remain valid for the duration of this call.
706+ #[ cfg( target_os = "linux" ) ]
707+ pub unsafe fn encode_from_gpu_ptr_f32_with_stream (
708+ & self ,
709+ input_d : * const f32 ,
710+ input_len : usize ,
711+ num_qubits : usize ,
712+ stream : * mut c_void ,
713+ ) -> Result < * mut DLManagedTensor > {
714+ crate :: profile_scope!( "Mahout::EncodeFromGpuPtrF32" ) ;
715+
716+ if input_len == 0 {
717+ return Err ( MahoutError :: InvalidInput (
718+ "Input data cannot be empty" . into ( ) ,
719+ ) ) ;
720+ }
721+
722+ validate_cuda_input_ptr ( & self . device , input_d as * const c_void ) ?;
723+
724+ let state_len = 1usize << num_qubits;
725+ if input_len > state_len {
726+ return Err ( MahoutError :: InvalidInput ( format ! (
727+ "Input size {} exceeds state vector size {} (2^{} qubits)" ,
728+ input_len, state_len, num_qubits
729+ ) ) ) ;
730+ }
731+
732+ let state_vector = {
733+ crate :: profile_scope!( "GPU::Alloc" ) ;
734+ gpu:: GpuStateVector :: new ( & self . device , num_qubits, Precision :: Float32 ) ?
735+ } ;
736+
737+ let inv_norm = {
738+ crate :: profile_scope!( "GPU::NormFromPtr" ) ;
739+ unsafe {
740+ gpu:: AmplitudeEncoder :: calculate_inv_norm_gpu_f32_with_stream (
741+ & self . device ,
742+ input_d,
743+ input_len,
744+ stream,
745+ ) ?
746+ }
747+ } ;
748+
749+ let state_ptr = state_vector. ptr_f32 ( ) . ok_or_else ( || {
750+ MahoutError :: InvalidInput (
751+ "State vector precision mismatch (expected float32 buffer)" . to_string ( ) ,
752+ )
753+ } ) ?;
754+
755+ {
756+ crate :: profile_scope!( "GPU::KernelLaunch" ) ;
757+ let ret = unsafe {
758+ qdp_kernels:: launch_amplitude_encode_f32 (
759+ input_d,
760+ state_ptr as * mut std:: ffi:: c_void ,
761+ input_len,
762+ state_len,
763+ inv_norm,
764+ stream,
765+ )
766+ } ;
767+
768+ if ret != 0 {
769+ return Err ( MahoutError :: KernelLaunch ( format ! (
770+ "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})" ,
771+ ret,
772+ cuda_error_to_string( ret)
773+ ) ) ) ;
774+ }
775+ }
776+
777+ {
778+ crate :: profile_scope!( "GPU::Synchronize" ) ;
779+ gpu:: cuda_sync:: sync_cuda_stream ( stream, "CUDA stream synchronize failed" ) ?;
780+ }
781+
782+ let state_vector = state_vector. to_precision ( & self . device , self . precision ) ?;
783+ Ok ( state_vector. to_dlpack ( ) )
784+ }
785+
603786 /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
604787 ///
605788 /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
@@ -671,6 +854,14 @@ impl QdpEngine {
671854 ) ) ;
672855 }
673856
857+ if sample_size == 0 {
858+ return Err ( MahoutError :: InvalidInput (
859+ "Sample size cannot be zero" . into ( ) ,
860+ ) ) ;
861+ }
862+
863+ validate_cuda_input_ptr ( & self . device , input_batch_d) ?;
864+
674865 match method. as_str ( ) {
675866 "amplitude" => {
676867 if sample_size == 0 {
0 commit comments