Skip to content

Commit 5b4f800

Browse files
committed
fix and update
1 parent 03fac63 commit 5b4f800

File tree

7 files changed

+367
-85
lines changed

7 files changed

+367
-85
lines changed

qdp/qdp-python/pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,12 @@ benchmark = [
4949

5050
[tool.uv.sources]
5151
qumat = { path = "../..", editable = true }
52+
torch = { index = "pytorch" }
5253

54+
# CUDA 12.6 wheels to match driver (libnvJitLink 12_6); cu122 pulls libs that need 12_8 and fail.
5355
[[tool.uv.index]]
5456
name = "pytorch"
55-
url = "https://download.pytorch.org/whl/cu122"
57+
url = "https://download.pytorch.org/whl/cu126"
5658
explicit = true
5759

5860
# Invalidate uv cache when Rust or Cargo changes so extension is rebuilt (run_throughput_pipeline_py etc.).

qdp/qdp-python/qumat_qdp/loader.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@
3030
from __future__ import annotations
3131

3232
from functools import lru_cache
33-
from typing import TYPE_CHECKING, Any, Iterator, Optional
33+
from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
3434

3535
import numpy as np
3636

3737
if TYPE_CHECKING:
3838
import _qdp # noqa: F401 -- for type checkers only
3939

40-
# Optional torch for as_torch()/as_numpy(); import at use site to avoid hard dependency.
40+
# Optional torch for as_torch(); as_numpy() uses QuantumTensor.to_numpy() (no torch needed).
4141
try:
4242
import torch as _torch
4343
except ImportError:
@@ -144,7 +144,7 @@ def as_torch(self, device: str = "cuda") -> QuantumDataLoader:
144144
return self
145145

146146
def as_numpy(self) -> QuantumDataLoader:
    """Select NumPy output for iteration and return this loader for chaining.

    Batches are yielded as NumPy float64 arrays on the CPU. The conversion
    uses QuantumTensor.to_numpy(), so PyTorch is not required.
    """
    numpy_format = ("numpy",)
    self._output_format = numpy_format
    return self
150150

@@ -367,7 +367,8 @@ def _wrap_iterator(self, raw_iter: Iterator[object]) -> Iterator[Any]:
367367
yield t.cpu() if device == "cpu" else t
368368
elif kind == "numpy":
369369
for qt in raw_iter:
370-
yield _torch.from_dlpack(qt).cpu().numpy()
370+
# Rust QuantumTensor has to_numpy(); raw_iter is Iterator[object]
371+
yield cast(Any, qt).to_numpy()
371372
else:
372373
yield from raw_iter
373374

qdp/qdp-python/src/engine.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,7 @@ impl QdpEngine {
675675
encoding_method,
676676
0,
677677
None,
678+
qdp_core::reader::NullHandling::FillZero,
678679
);
679680
let engine = self.engine.clone();
680681
let iter = py

qdp/qdp-python/src/tensor.rs

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,18 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17+
use numpy::{PyArray2, ndarray::Array2};
1718
use pyo3::exceptions::PyRuntimeError;
1819
use pyo3::ffi;
1920
use pyo3::prelude::*;
2021
use qdp_core::dlpack::DLManagedTensor;
22+
use std::ffi::c_void;
23+
24+
// CUDA Runtime API — already linked transitively by qdp-core.
25+
unsafe extern "C" {
26+
fn cudaMemcpy(dst: *mut c_void, src: *const c_void, count: usize, kind: i32) -> i32;
27+
}
28+
const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2;
2129

2230
/// Quantum tensor wrapper implementing DLPack protocol
2331
///
@@ -98,6 +106,100 @@ impl QuantumTensor {
98106
}
99107
}
100108

109+
/// Copy encoded quantum state from GPU to a NumPy array (CPU, float64).
110+
///
111+
/// Performs a synchronous cudaMemcpy D2H without requiring PyTorch.
112+
/// Complex128 output (imaginary parts are always 0.0 per the CUDA kernel)
113+
/// is reduced to float64 by discarding the zero imaginary components.
114+
///
115+
/// Returns:
116+
/// numpy.ndarray of shape (batch_size, state_len), dtype float64.
117+
///
118+
/// Raises:
119+
/// RuntimeError: If the tensor has already been consumed, the pointer is
120+
/// invalid, the dtype is unsupported, or the CUDA copy fails.
121+
#[allow(clippy::wrong_self_convention)] // mut required: sets self.consumed and calls DLPack deleter
122+
fn to_numpy<'py>(&mut self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
123+
if self.consumed {
124+
return Err(PyRuntimeError::new_err(
125+
"DLPack tensor already consumed (can only be used once)",
126+
));
127+
}
128+
if self.ptr.is_null() {
129+
return Err(PyRuntimeError::new_err("Invalid DLPack tensor pointer"));
130+
}
131+
132+
let (rows, cols, host_data) = unsafe {
133+
let dl_tensor = &(*self.ptr).dl_tensor;
134+
135+
// Shape — require 1-D or 2-D.
136+
let ndim = dl_tensor.ndim as usize;
137+
if ndim == 0 || ndim > 2 || dl_tensor.shape.is_null() {
138+
return Err(PyRuntimeError::new_err(
139+
"to_numpy() requires a 1-D or 2-D tensor",
140+
));
141+
}
142+
let shape = std::slice::from_raw_parts(dl_tensor.shape, ndim);
143+
let (rows, cols) = if ndim == 1 {
144+
(1usize, shape[0] as usize)
145+
} else {
146+
(shape[0] as usize, shape[1] as usize)
147+
};
148+
149+
// Dtype: complex128 (DL_COMPLEX=5, bits=128) or float64 (DL_FLOAT=2, bits=64).
150+
let dtype = &dl_tensor.dtype;
151+
let (is_complex, elem_bytes) = match (dtype.code, dtype.bits) {
152+
(5, 128) => (true, 16usize),
153+
(2, 64) => (false, 8usize),
154+
_ => {
155+
return Err(PyRuntimeError::new_err(format!(
156+
"to_numpy() unsupported dtype: code={}, bits={}",
157+
dtype.code, dtype.bits
158+
)));
159+
}
160+
};
161+
162+
let n_elems = rows * cols;
163+
// For complex128 each element is two consecutive f64 values.
164+
let host_f64_count = if is_complex { n_elems * 2 } else { n_elems };
165+
let mut host_buf = vec![0.0f64; host_f64_count];
166+
167+
let data_ptr = (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize);
168+
169+
let ret = cudaMemcpy(
170+
host_buf.as_mut_ptr() as *mut c_void,
171+
data_ptr as *const c_void,
172+
n_elems * elem_bytes,
173+
CUDA_MEMCPY_DEVICE_TO_HOST,
174+
);
175+
if ret != 0 {
176+
return Err(PyRuntimeError::new_err(format!(
177+
"cudaMemcpy D2H failed with error code {}",
178+
ret
179+
)));
180+
}
181+
182+
// Consumed: GPU memory is ours to free now.
183+
self.consumed = true;
184+
if let Some(deleter) = (*self.ptr).deleter {
185+
deleter(self.ptr);
186+
}
187+
188+
// complex128 → float64: discard imaginary parts (always 0.0).
189+
let host_data: Vec<f64> = if is_complex {
190+
host_buf.into_iter().step_by(2).collect()
191+
} else {
192+
host_buf
193+
};
194+
195+
(rows, cols, host_data)
196+
};
197+
198+
let arr = Array2::from_shape_vec((rows, cols), host_data)
199+
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
200+
Ok(PyArray2::from_owned_array(py, arr))
201+
}
202+
101203
/// Returns DLPack device information
102204
///
103205
/// Returns:
@@ -122,8 +224,8 @@ impl QuantumTensor {
122224

123225
impl Drop for QuantumTensor {
124226
fn drop(&mut self) {
125-
// Only free if not consumed by __dlpack__
126-
// If consumed, PyTorch/consumer will call the deleter
227+
// Only free if not consumed; __dlpack__ leaves freeing to PyTorch,
228+
// to_numpy() calls the deleter itself after the D2H copy.
127229
if !self.consumed && !self.ptr.is_null() {
128230
unsafe {
129231
// Defensive check: qdp-core always provides a deleter

qdp/qdp-python/tests/test_quantum_data_loader.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616

1717
"""tests for Quantum Data Loader."""
1818

19+
from unittest.mock import patch
20+
21+
import numpy as np
1922
import pytest
2023

2124
try:
@@ -28,6 +31,15 @@ def _loader_available():
2831
return QuantumDataLoader is not None
2932

3033

34+
def _cuda_available():
35+
try:
36+
import torch
37+
38+
return torch.cuda.is_available()
39+
except ImportError:
40+
return False
41+
42+
3143
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
3244
def test_mutual_exclusion_both_sources_raises():
3345
"""Calling both .source_synthetic() and .source_file() then __iter__ raises ValueError."""
@@ -184,3 +196,134 @@ def test_null_handling_default_is_none():
184196
"""By default, _null_handling is None (Rust will use FillZero)."""
185197
loader = QuantumDataLoader(device_id=0)
186198
assert loader._null_handling is None
199+
200+
201+
# --- as_torch() / as_numpy() output format tests ---
202+
203+
204+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
205+
def test_as_torch_raises_at_config_time_when_torch_missing():
206+
"""as_torch() raises RuntimeError immediately (config time) when torch is not installed."""
207+
with patch("qumat_qdp.loader._torch", None):
208+
loader = QuantumDataLoader(device_id=0).qubits(4).batches(2, size=4)
209+
with pytest.raises(RuntimeError) as exc_info:
210+
loader.as_torch()
211+
msg = str(exc_info.value)
212+
assert "PyTorch" in msg or "torch" in msg.lower()
213+
assert "pip install" in msg
214+
215+
216+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
217+
def test_as_numpy_succeeds_at_config_time_without_torch():
218+
"""as_numpy() does not raise at config time even when torch is not installed."""
219+
with patch("qumat_qdp.loader._torch", None):
220+
loader = (
221+
QuantumDataLoader(device_id=0)
222+
.qubits(4)
223+
.batches(2, size=4)
224+
.source_synthetic()
225+
.as_numpy()
226+
)
227+
assert loader._output_format == ("numpy",)
228+
229+
230+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
231+
@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
232+
def test_as_numpy_yields_float64_arrays():
233+
"""as_numpy() yields numpy float64 arrays with correct shape; no torch required."""
234+
num_qubits = 4
235+
batch_size = 8
236+
state_len = 2**num_qubits # 16
237+
238+
batches = []
239+
with patch("qumat_qdp.loader._torch", None):
240+
loader = (
241+
QuantumDataLoader(device_id=0)
242+
.qubits(num_qubits)
243+
.batches(3, size=batch_size)
244+
.source_synthetic()
245+
.as_numpy()
246+
)
247+
for batch in loader:
248+
batches.append(batch)
249+
250+
assert len(batches) == 3
251+
for batch in batches:
252+
assert isinstance(batch, np.ndarray), f"expected ndarray, got {type(batch)}"
253+
assert batch.dtype == np.float64, f"expected float64, got {batch.dtype}"
254+
assert batch.ndim == 2
255+
assert batch.shape == (batch_size, state_len), f"unexpected shape {batch.shape}"
256+
257+
258+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
259+
@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
260+
def test_as_numpy_amplitudes_are_unit_norm():
261+
"""Each row from as_numpy() should be a unit-norm state vector (amplitude encoding)."""
262+
num_qubits = 4
263+
batch_size = 16
264+
265+
loader = (
266+
QuantumDataLoader(device_id=0)
267+
.qubits(num_qubits)
268+
.batches(2, size=batch_size)
269+
.source_synthetic()
270+
.as_numpy()
271+
)
272+
for batch in loader:
273+
arr = np.asarray(batch, dtype=np.float64)
274+
norms = np.linalg.norm(arr, axis=1)
275+
np.testing.assert_allclose(norms, 1.0, atol=1e-5)
276+
277+
278+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
279+
@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
280+
def test_as_torch_yields_cuda_tensors():
281+
"""as_torch(device='cuda') yields torch tensors on CUDA."""
282+
try:
283+
import torch
284+
except ImportError:
285+
pytest.skip("torch not installed")
286+
287+
num_qubits = 4
288+
batch_size = 8
289+
state_len = 2**num_qubits
290+
291+
loader = (
292+
QuantumDataLoader(device_id=0)
293+
.qubits(num_qubits)
294+
.batches(2, size=batch_size)
295+
.source_synthetic()
296+
.as_torch(device="cuda")
297+
)
298+
for batch in loader:
299+
assert isinstance(batch, torch.Tensor)
300+
assert batch.is_cuda
301+
assert batch.shape == (batch_size, state_len)
302+
303+
304+
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
305+
@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required")
306+
def test_as_numpy_from_source_array():
307+
"""as_numpy() works with source_array(), yielding correct shapes and dtype."""
308+
num_qubits = 3
309+
state_len = 2**num_qubits # 8
310+
n_samples = 12
311+
batch_size = 4
312+
313+
rng = np.random.default_rng(42)
314+
X = rng.standard_normal((n_samples, state_len))
315+
316+
loader = (
317+
QuantumDataLoader(device_id=0)
318+
.qubits(num_qubits)
319+
.batches(1, size=batch_size)
320+
.encoding("amplitude")
321+
.source_array(X)
322+
.as_numpy()
323+
)
324+
batches = list(loader)
325+
assert len(batches) == n_samples // batch_size
326+
for batch in batches:
327+
assert isinstance(batch, np.ndarray)
328+
assert batch.dtype == np.float64
329+
assert batch.shape[1] == state_len

0 commit comments

Comments
 (0)