Skip to content

Commit ac8b5ba

Browse files
committed
feat: add direct encoding method for float32 tensors
1 parent 4438a9e commit ac8b5ba

File tree

3 files changed

+197
-81
lines changed

3 files changed

+197
-81
lines changed

qdp/qdp-python/src/lib.rs

Lines changed: 148 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,16 @@ fn validate_cuda_tensor_for_encoding(
359359
let dtype_str: String = dtype.str()?.extract()?;
360360
let dtype_str_lower = dtype_str.to_ascii_lowercase();
361361
match method.as_str() {
362-
"amplitude" | "angle" => {
362+
"amplitude" => {
363+
if !(dtype_str_lower.contains("float64") || dtype_str_lower.contains("float32")) {
364+
return Err(PyRuntimeError::new_err(format!(
365+
"CUDA tensor must have dtype float64 or float32 for amplitude encoding, got {}. \
366+
Use tensor.to(torch.float64) or tensor.to(torch.float32)",
367+
dtype_str
368+
)));
369+
}
370+
}
371+
"angle" => {
363372
if !dtype_str_lower.contains("float64") {
364373
return Err(PyRuntimeError::new_err(format!(
365374
"CUDA tensor must have dtype float64 for {} encoding, got {}. \
@@ -715,76 +724,7 @@ impl QdpEngine {
715724
if is_pytorch_tensor(data)? {
716725
// Check if it's a CUDA tensor - use zero-copy GPU encoding
717726
if is_cuda_tensor(data)? {
718-
// Validate CUDA tensor for direct GPU encoding
719-
validate_cuda_tensor_for_encoding(
720-
data,
721-
self.engine.device().ordinal(),
722-
encoding_method,
723-
)?;
724-
725-
// Extract GPU pointer directly from PyTorch tensor
726-
let tensor_info = extract_cuda_tensor_info(data)?;
727-
let stream_ptr = get_torch_cuda_stream_ptr(data)?;
728-
729-
let ndim: usize = data.call_method0("dim")?.extract()?;
730-
731-
match ndim {
732-
1 => {
733-
// 1D CUDA tensor: single sample encoding
734-
let input_len = tensor_info.shape[0] as usize;
735-
// SAFETY: tensor_info.data_ptr was obtained via PyTorch's data_ptr() from a
736-
// valid CUDA tensor. The tensor remains alive during this call
737-
// (held by Python's GIL), and we validated dtype/contiguity/device above.
738-
let ptr = unsafe {
739-
self.engine
740-
.encode_from_gpu_ptr_with_stream(
741-
tensor_info.data_ptr as *const std::ffi::c_void,
742-
input_len,
743-
num_qubits,
744-
encoding_method,
745-
stream_ptr,
746-
)
747-
.map_err(|e| {
748-
PyRuntimeError::new_err(format!("Encoding failed: {}", e))
749-
})?
750-
};
751-
return Ok(QuantumTensor {
752-
ptr,
753-
consumed: false,
754-
});
755-
}
756-
2 => {
757-
// 2D CUDA tensor: batch encoding
758-
let num_samples = tensor_info.shape[0] as usize;
759-
let sample_size = tensor_info.shape[1] as usize;
760-
// SAFETY: Same as above - pointer from validated PyTorch CUDA tensor
761-
let ptr = unsafe {
762-
self.engine
763-
.encode_batch_from_gpu_ptr_with_stream(
764-
tensor_info.data_ptr as *const std::ffi::c_void,
765-
num_samples,
766-
sample_size,
767-
num_qubits,
768-
encoding_method,
769-
stream_ptr,
770-
)
771-
.map_err(|e| {
772-
PyRuntimeError::new_err(format!("Encoding failed: {}", e))
773-
})?
774-
};
775-
return Ok(QuantumTensor {
776-
ptr,
777-
consumed: false,
778-
});
779-
}
780-
_ => {
781-
return Err(PyRuntimeError::new_err(format!(
782-
"Unsupported CUDA tensor shape: {}D. Expected 1D tensor for single \
783-
sample encoding or 2D tensor (batch_size, features) for batch encoding.",
784-
ndim
785-
)));
786-
}
787-
}
727+
return self._encode_from_cuda_tensor(data, num_qubits, encoding_method);
788728
}
789729
// CPU PyTorch tensor path
790730
return self.encode_from_pytorch(data, num_qubits, encoding_method);
@@ -1213,6 +1153,143 @@ impl QdpEngine {
12131153
.run_dual_stream_encode(&data_slice, num_qubits, encoding_method)
12141154
.map_err(|e| PyRuntimeError::new_err(format!("run_dual_stream_encode failed: {}", e)))
12151155
}
1156+
1157+
/// Encodes directly from a PyTorch CUDA tensor. Internal helper.
1158+
///
1159+
/// Dispatches to the core f32 GPU pointer API for 1D float32 amplitude encoding,
1160+
/// or to the float64/basis GPU pointer APIs for other dtypes and batch encoding.
1161+
///
1162+
/// Args:
1163+
/// data: PyTorch CUDA tensor
1164+
/// num_qubits: Number of qubits
1165+
/// encoding_method: Encoding strategy; the float32 fast path applies only to "amplitude",
/// other methods/dtypes fall through to the generic GPU-pointer path
1166+
fn _encode_from_cuda_tensor(
1167+
&self,
1168+
data: &Bound<'_, PyAny>,
1169+
num_qubits: usize,
1170+
encoding_method: &str,
1171+
) -> PyResult<QuantumTensor> {
1172+
// Validate CUDA tensor for direct GPU encoding (shape, contiguity, device, dtype)
1173+
validate_cuda_tensor_for_encoding(data, self.engine.device().ordinal(), encoding_method)?;
1174+
1175+
// Determine dtype for dispatch (float32 vs float64, etc.).
1176+
let dtype = data.getattr("dtype")?;
1177+
let dtype_str: String = dtype.str()?.extract()?;
1178+
let dtype_str_lower = dtype_str.to_ascii_lowercase();
1179+
let is_f32 = dtype_str_lower.contains("float32");
1180+
let method = encoding_method.to_ascii_lowercase();
1181+
1182+
// Current f32 CUDA path only supports amplitude encoding for 1D tensors.
1183+
let ndim: usize = data.call_method0("dim")?.extract()?;
1184+
1185+
if method.as_str() == "amplitude" && is_f32 {
1186+
match ndim {
1187+
1 => {
1188+
// 1D CUDA tensor, float32 amplitude encoding using core f32 GPU pointer API.
1189+
let input_len: usize = data.call_method0("numel")?.extract()?;
1190+
if input_len == 0 {
1191+
return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
1192+
}
1193+
1194+
let stream_ptr = get_torch_cuda_stream_ptr(data)?;
1195+
let data_ptr_u64: u64 = data.call_method0("data_ptr")?.extract()?;
1196+
if data_ptr_u64 == 0 {
1197+
return Err(PyRuntimeError::new_err(
1198+
"PyTorch returned a null data pointer for CUDA tensor",
1199+
));
1200+
}
1201+
let data_ptr = data_ptr_u64 as *const f32;
1202+
1203+
let ptr = unsafe {
1204+
self.engine
1205+
.encode_from_gpu_ptr_f32_with_stream(
1206+
data_ptr, input_len, num_qubits, stream_ptr,
1207+
)
1208+
.map_err(|e| {
1209+
PyRuntimeError::new_err(format!(
1210+
"Encoding failed (float32 amplitude): {}",
1211+
e
1212+
))
1213+
})?
1214+
};
1215+
1216+
Ok(QuantumTensor {
1217+
ptr,
1218+
consumed: false,
1219+
})
1220+
}
1221+
2 => Err(PyRuntimeError::new_err(
1222+
"CUDA float32 batch amplitude encoding is not yet supported. \
1223+
Use float64 (tensor.to(torch.float64)) or encode samples individually.",
1224+
)),
1225+
_ => Err(PyRuntimeError::new_err(format!(
1226+
"Unsupported CUDA tensor shape: {}D. Expected 1D tensor for single \
1227+
sample encoding or 2D tensor (batch_size, features) for batch encoding.",
1228+
ndim
1229+
))),
1230+
}
1231+
} else {
1232+
// Existing float64 (and basis/int64) CUDA path using direct GPU pointer.
1233+
let tensor_info = extract_cuda_tensor_info(data)?;
1234+
let stream_ptr = get_torch_cuda_stream_ptr(data)?;
1235+
1236+
match ndim {
1237+
1 => {
1238+
// 1D CUDA tensor: single sample encoding
1239+
let input_len = tensor_info.shape[0] as usize;
1240+
// SAFETY: tensor_info.data_ptr was obtained via PyTorch's data_ptr() from a
1241+
// valid CUDA tensor. The tensor remains alive during this call
1242+
// (held by Python's GIL), and we validated dtype/contiguity/device above.
1243+
let ptr = unsafe {
1244+
self.engine
1245+
.encode_from_gpu_ptr_with_stream(
1246+
tensor_info.data_ptr as *const std::ffi::c_void,
1247+
input_len,
1248+
num_qubits,
1249+
encoding_method,
1250+
stream_ptr,
1251+
)
1252+
.map_err(|e| {
1253+
PyRuntimeError::new_err(format!("Encoding failed: {}", e))
1254+
})?
1255+
};
1256+
Ok(QuantumTensor {
1257+
ptr,
1258+
consumed: false,
1259+
})
1260+
}
1261+
2 => {
1262+
// 2D CUDA tensor: batch encoding
1263+
let num_samples = tensor_info.shape[0] as usize;
1264+
let sample_size = tensor_info.shape[1] as usize;
1265+
// SAFETY: Same as above - pointer from validated PyTorch CUDA tensor
1266+
let ptr = unsafe {
1267+
self.engine
1268+
.encode_batch_from_gpu_ptr_with_stream(
1269+
tensor_info.data_ptr as *const std::ffi::c_void,
1270+
num_samples,
1271+
sample_size,
1272+
num_qubits,
1273+
encoding_method,
1274+
stream_ptr,
1275+
)
1276+
.map_err(|e| {
1277+
PyRuntimeError::new_err(format!("Encoding failed: {}", e))
1278+
})?
1279+
};
1280+
Ok(QuantumTensor {
1281+
ptr,
1282+
consumed: false,
1283+
})
1284+
}
1285+
_ => Err(PyRuntimeError::new_err(format!(
1286+
"Unsupported CUDA tensor shape: {}D. Expected 1D tensor for single \
1287+
sample encoding or 2D tensor (batch_size, features) for batch encoding.",
1288+
ndim
1289+
))),
1290+
}
1291+
}
1292+
}
12161293
}
12171294

12181295
/// Runs the full throughput pipeline in Rust with GIL released. Returns (duration_sec, vectors_per_sec, latency_ms_per_vector).

qdp/qdp-python/tests/test_dlpack_validation.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,30 @@ def _engine():
3838

3939

4040
@pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")
41-
def test_dtype_validation_float32_rejected():
42-
"""DLPack tensor must be float64; float32 CUDA tensor should fail with clear message."""
41+
def test_cuda_float32_amplitude_supported():
42+
"""1D float32 CUDA tensor should be supported for amplitude encoding via GPU pointer f32 path."""
4343
engine = _engine()
4444
# 1D float32 CUDA tensor (contiguous)
4545
t = torch.randn(4, dtype=torch.float32, device="cuda")
46-
with pytest.raises(RuntimeError) as exc_info:
46+
result = engine.encode(t, num_qubits=2, encoding_method="amplitude")
47+
assert result is not None
48+
49+
# Verify DLPack round-trip works and tensor is on CUDA
50+
qt = torch.from_dlpack(result)
51+
assert qt.is_cuda
52+
# With default engine precision=float32, complex64 is expected
53+
assert qt.dtype in (torch.complex64, torch.complex128)
54+
55+
56+
@pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")
57+
def test_cuda_float32_amplitude_2d_unsupported():
58+
"""2D float32 CUDA tensor with amplitude encoding should raise a clear error."""
59+
engine = _engine()
60+
t = torch.randn(2, 4, dtype=torch.float32, device="cuda")
61+
with pytest.raises(
62+
RuntimeError, match="float32 batch amplitude encoding is not yet supported"
63+
):
4764
engine.encode(t, num_qubits=2, encoding_method="amplitude")
48-
msg = str(exc_info.value).lower()
49-
assert "float64" in msg
50-
assert "code=" in msg or "bits=" in msg or "lanes=" in msg
5165

5266

5367
@pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")

testing/qdp/test_bindings.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def test_encode_cuda_tensor(data_shape, expected_shape, expected_batch_size):
315315
@requires_qdp
316316
@pytest.mark.gpu
317317
def test_encode_cuda_tensor_wrong_dtype():
318-
"""Test error when CUDA tensor has wrong dtype (non-float64)."""
318+
"""Test error when CUDA tensor has wrong dtype for amplitude (e.g. float16)."""
319319
pytest.importorskip("torch")
320320
from _qdp import QdpEngine
321321

@@ -324,9 +324,9 @@ def test_encode_cuda_tensor_wrong_dtype():
324324

325325
engine = QdpEngine(0)
326326

327-
# Create CUDA tensor with float32 dtype (wrong)
328-
data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, device="cuda:0")
329-
with pytest.raises(RuntimeError, match="CUDA tensor must have dtype float64"):
327+
# Amplitude encoding accepts float64 or float32 only; float16 is invalid
328+
data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float16, device="cuda:0")
329+
with pytest.raises(RuntimeError, match="float64 or float32"):
330330
engine.encode(data, 2, "amplitude")
331331

332332

@@ -537,6 +537,31 @@ def test_encode_cuda_tensor_output_dtype(precision, expected_dtype):
537537
)
538538

539539

540+
@requires_qdp
541+
@pytest.mark.gpu
542+
@pytest.mark.parametrize(
543+
"precision,expected_dtype",
544+
[
545+
("float32", torch.complex64),
546+
("float64", torch.complex128),
547+
],
548+
)
549+
def test_encode_cuda_tensor_float32_input_output_dtype(precision, expected_dtype):
550+
"""Test that 1D float32 CUDA amplitude encoding respects engine precision (f32 path)."""
551+
pytest.importorskip("torch")
552+
from _qdp import QdpEngine
553+
554+
if not torch.cuda.is_available():
555+
pytest.skip("GPU required for QdpEngine")
556+
557+
engine = QdpEngine(0, precision=precision)
558+
data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, device="cuda:0")
559+
result = torch.from_dlpack(engine.encode(data, 2, "amplitude"))
560+
assert result.dtype == expected_dtype, (
561+
f"Expected {expected_dtype}, got {result.dtype}"
562+
)
563+
564+
540565
@requires_qdp
541566
@pytest.mark.gpu
542567
def test_basis_encode_basic():

0 commit comments

Comments
 (0)