Skip to content

Commit 48ffaa4

Browse files
ilan-goldLDeakin
andauthored
perf: remove dtype + fill val handling per chunk (#124)
* (feat): first pass remove dtype + fill val handling per chunk * chore: bump `zarrs` to 0.23.0-beta.1 * chore: bump `zarrs` to 0.23.0-beta.2 * chore: incr to 0.2.2-dev * chore: minimise diff * chore: bump `zarrs` to 0.23.0-beta.3 * feat: upgrade zarr v3 * Revert "chore: incr to 0.2.2-dev" This reverts commit 95f2886. * fix: unsupported data type tests * fix: give a real title to zarr store * fix: don't pass in any metadata * fix: warning * fix: cleanups * chore: small cleanups * chore: use `is_whole_chunk` more * chore: bump `zarrs` to 0.23.0-beta.4 * chore: bump `zarrs` to 0.23.0-beta.5 * chore: bump `zarrs` to 0.23.0-beta.6 * chore: bump `zarrs` to 0.23.0 * fix: use `map_py_err` in `WithSubset::new` * rename: `WithSubset` * run on ci while waiting for rustfmt to install * fix: import * remove unused import * fix: no fill warning * key/shape * fix: pyi * remove old `ValueError` * feat: improve data type / fill value incompatibility error `UserWarning: Array is unsupported by ZarrsCodecPipeline: incompatible fill value metadata: dtype=|V7, fill_value=null` * v2 * Revert "v2" This reverts commit 86b8118. --------- Co-authored-by: Lachlan Deakin <ljdgit@gmail.com>
1 parent 44d5611 commit 48ffaa4

File tree

7 files changed

+146
-288
lines changed

7 files changed

+146
-288
lines changed

python/zarrs/_internal.pyi

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,15 @@ import numpy.typing
88
import zarr.abc.store
99

1010
@typing.final
11-
class Basic:
12-
def __new__(cls, byte_interface: typing.Any, chunk_spec: typing.Any) -> Basic: ...
11+
class ChunkItem:
12+
def __new__(
13+
cls,
14+
key: builtins.str,
15+
chunk_subset: typing.Sequence[slice],
16+
chunk_shape: typing.Sequence[builtins.int],
17+
subset: typing.Sequence[slice],
18+
shape: typing.Sequence[builtins.int],
19+
) -> ChunkItem: ...
1320

1421
@typing.final
1522
class CodecPipelineImpl:
@@ -26,22 +33,12 @@ class CodecPipelineImpl:
2633
) -> CodecPipelineImpl: ...
2734
def retrieve_chunks_and_apply_index(
2835
self,
29-
chunk_descriptions: typing.Sequence[WithSubset],
36+
chunk_descriptions: typing.Sequence[ChunkItem],
3037
value: numpy.typing.NDArray[typing.Any],
3138
) -> None: ...
3239
def store_chunks_with_indices(
3340
self,
34-
chunk_descriptions: typing.Sequence[WithSubset],
41+
chunk_descriptions: typing.Sequence[ChunkItem],
3542
value: numpy.typing.NDArray[typing.Any],
3643
write_empty_chunks: builtins.bool,
3744
) -> None: ...
38-
39-
@typing.final
40-
class WithSubset:
41-
def __new__(
42-
cls,
43-
item: Basic,
44-
chunk_subset: typing.Sequence[slice],
45-
subset: typing.Sequence[slice],
46-
shape: typing.Sequence[builtins.int],
47-
) -> WithSubset: ...

python/zarrs/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from zarr.core.array_spec import ArraySpec
1111
from zarr.core.indexing import SelectorTuple, is_integer
1212

13-
from zarrs._internal import Basic, WithSubset
13+
from zarrs._internal import ChunkItem
1414

1515
if TYPE_CHECKING:
1616
from collections.abc import Iterable
@@ -148,7 +148,7 @@ def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:
148148

149149
@dataclass(frozen=True)
150150
class RustChunkInfo:
151-
chunk_info_with_indices: list[WithSubset]
151+
chunk_info_with_indices: list[ChunkItem]
152152
write_empty_chunks: bool
153153

154154

@@ -160,7 +160,7 @@ def make_chunk_info_for_rust_with_indices(
160160
shape: tuple[int, ...],
161161
) -> RustChunkInfo:
162162
shape = shape if shape else (1,) # constant array
163-
chunk_info_with_indices: list[WithSubset] = []
163+
chunk_info_with_indices: list[ChunkItem] = []
164164
write_empty_chunks: bool = True
165165
for (
166166
byte_getter,
@@ -178,7 +178,6 @@ def make_chunk_info_for_rust_with_indices(
178178
chunk_spec.config,
179179
chunk_spec.prototype,
180180
)
181-
chunk_info = Basic(byte_getter, chunk_spec)
182181
out_selection_as_slices = selector_tuple_to_slice_selection(out_selection)
183182
chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection)
184183
shape_chunk_selection_slices = get_shape_for_selector(
@@ -195,9 +194,10 @@ def make_chunk_info_for_rust_with_indices(
195194
f"{shape_chunk_selection} != {shape_chunk_selection_slices}"
196195
)
197196
chunk_info_with_indices.append(
198-
WithSubset(
199-
chunk_info,
197+
ChunkItem(
198+
key=byte_getter.path,
200199
chunk_subset=chunk_selection_as_slices,
200+
chunk_shape=chunk_spec.shape,
201201
subset=out_selection_as_slices,
202202
shape=shape,
203203
)

src/chunk_item.rs

Lines changed: 29 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1,175 +1,74 @@
11
use std::num::NonZeroU64;
22

33
use pyo3::{
4-
Bound, PyAny, PyErr, PyResult,
5-
exceptions::{PyIndexError, PyRuntimeError, PyValueError},
4+
Bound, PyErr, PyResult,
5+
exceptions::{PyIndexError, PyValueError},
66
pyclass, pymethods,
7-
types::{PyAnyMethods, PyBytes, PyBytesMethods, PyInt, PySlice, PySliceMethods as _},
7+
types::{PySlice, PySliceMethods as _},
88
};
99
use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods};
10-
use zarrs::{
11-
array::{ArraySubset, ChunkShape, DataType, FillValue},
12-
metadata::v3::MetadataV3,
13-
storage::StoreKey,
14-
};
10+
use zarrs::{array::ArraySubset, storage::StoreKey};
1511

1612
use crate::utils::PyErrExt;
1713

18-
pub(crate) trait ChunksItem {
19-
fn key(&self) -> &StoreKey;
20-
fn shape(&self) -> &[NonZeroU64];
21-
fn data_type(&self) -> &DataType;
22-
fn fill_value(&self) -> &FillValue;
23-
}
24-
25-
#[derive(Clone)]
26-
#[gen_stub_pyclass]
27-
#[pyclass]
28-
pub(crate) struct Basic {
29-
key: StoreKey,
30-
shape: ChunkShape,
31-
data_type: DataType,
32-
fill_value: FillValue,
33-
}
34-
35-
fn fill_value_to_bytes(dtype: &str, fill_value: &Bound<'_, PyAny>) -> PyResult<Vec<u8>> {
36-
if dtype == "string" {
37-
// Match zarr-python 2.x.x string fill value behaviour with a 0 fill value
38-
// See https://github.com/zarr-developers/zarr-python/issues/2792#issuecomment-2644362122
39-
if let Ok(fill_value_downcast) = fill_value.cast::<PyInt>() {
40-
let fill_value_usize: usize = fill_value_downcast.extract()?;
41-
if fill_value_usize == 0 {
42-
return Ok(vec![]);
43-
}
44-
Err(PyErr::new::<PyValueError, _>(format!(
45-
"Cannot understand non-zero integer {fill_value_usize} fill value for dtype {dtype}"
46-
)))?;
47-
}
48-
}
49-
50-
if let Ok(fill_value_downcast) = fill_value.cast::<PyBytes>() {
51-
Ok(fill_value_downcast.as_bytes().to_vec())
52-
} else if fill_value.hasattr("tobytes")? {
53-
Ok(fill_value.call_method0("tobytes")?.extract()?)
54-
} else {
55-
Err(PyErr::new::<PyValueError, _>(format!(
56-
"Unsupported fill value {fill_value:?}"
57-
)))
58-
}
59-
}
60-
61-
#[gen_stub_pymethods]
62-
#[pymethods]
63-
impl Basic {
64-
#[new]
65-
fn new(byte_interface: &Bound<'_, PyAny>, chunk_spec: &Bound<'_, PyAny>) -> PyResult<Self> {
66-
let path: String = byte_interface.getattr("path")?.extract()?;
67-
68-
let shape: Vec<NonZeroU64> = chunk_spec.getattr("shape")?.extract()?;
69-
70-
let mut dtype: String = chunk_spec
71-
.getattr("dtype")?
72-
.call_method0("to_native_dtype")?
73-
.call_method0("__str__")?
74-
.extract()?;
75-
if dtype == "object" {
76-
// zarrs doesn't understand `object` which is the output of `np.dtype("|O").__str__()`
77-
// but maps it to "string" internally https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L288
78-
dtype = String::from("string");
79-
}
80-
let data_type = get_data_type_from_dtype(&dtype)?;
81-
let fill_value: Bound<'_, PyAny> = chunk_spec.getattr("fill_value")?;
82-
let fill_value = FillValue::new(fill_value_to_bytes(&dtype, &fill_value)?);
83-
Ok(Self {
84-
key: StoreKey::new(path).map_py_err::<PyValueError>()?,
85-
shape,
86-
data_type,
87-
fill_value,
14+
fn to_nonzero_u64_vec(v: Vec<u64>) -> PyResult<Vec<NonZeroU64>> {
15+
v.into_iter()
16+
.map(|dim| {
17+
NonZeroU64::new(dim).ok_or_else(|| {
18+
PyErr::new::<PyValueError, _>(
19+
"subset dimensions must be greater than zero".to_string(),
20+
)
21+
})
8822
})
89-
}
23+
.collect::<PyResult<Vec<NonZeroU64>>>()
9024
}
9125

9226
#[derive(Clone)]
9327
#[gen_stub_pyclass]
9428
#[pyclass]
95-
pub(crate) struct WithSubset {
96-
pub item: Basic,
29+
pub(crate) struct ChunkItem {
30+
pub key: StoreKey,
9731
pub chunk_subset: ArraySubset,
9832
pub subset: ArraySubset,
33+
pub shape: Vec<NonZeroU64>,
34+
pub num_elements: u64,
9935
}
10036

10137
#[gen_stub_pymethods]
10238
#[pymethods]
103-
impl WithSubset {
39+
impl ChunkItem {
10440
#[new]
10541
#[allow(clippy::needless_pass_by_value)]
10642
fn new(
107-
item: Basic,
43+
key: String,
10844
chunk_subset: Vec<Bound<'_, PySlice>>,
45+
chunk_shape: Vec<u64>,
10946
subset: Vec<Bound<'_, PySlice>>,
11047
shape: Vec<u64>,
11148
) -> PyResult<Self> {
112-
let chunk_subset = selection_to_array_subset(&chunk_subset, &item.shape)?;
113-
let shape: Vec<NonZeroU64> = shape
114-
.into_iter()
115-
.map(|dim| {
116-
NonZeroU64::new(dim)
117-
.ok_or("subset dimensions must be greater than zero")
118-
.map_py_err::<PyValueError>()
119-
})
120-
.collect::<PyResult<Vec<NonZeroU64>>>()?;
121-
let subset = selection_to_array_subset(&subset, &shape)?;
49+
let num_elements = chunk_shape.iter().product();
50+
let shape_nonzero_u64 = to_nonzero_u64_vec(shape)?;
51+
let chunk_shape_nonzero_u64 = to_nonzero_u64_vec(chunk_shape)?;
52+
let chunk_subset = selection_to_array_subset(&chunk_subset, &chunk_shape_nonzero_u64)?;
53+
let subset = selection_to_array_subset(&subset, &shape_nonzero_u64)?;
12254
// Check that subset and chunk_subset have the same number of elements.
12355
// This permits broadcasting of a constant input.
12456
if subset.num_elements() != chunk_subset.num_elements() && subset.num_elements() > 1 {
12557
return Err(PyErr::new::<PyIndexError, _>(format!(
12658
"the size of the chunk subset {chunk_subset} and input/output subset {subset} are incompatible",
12759
)));
12860
}
61+
12962
Ok(Self {
130-
item,
63+
key: StoreKey::new(key).map_py_err::<PyValueError>()?,
13164
chunk_subset,
13265
subset,
66+
shape: chunk_shape_nonzero_u64,
67+
num_elements,
13368
})
13469
}
13570
}
13671

137-
impl ChunksItem for Basic {
138-
fn key(&self) -> &StoreKey {
139-
&self.key
140-
}
141-
fn shape(&self) -> &[NonZeroU64] {
142-
&self.shape
143-
}
144-
fn data_type(&self) -> &DataType {
145-
&self.data_type
146-
}
147-
fn fill_value(&self) -> &FillValue {
148-
&self.fill_value
149-
}
150-
}
151-
152-
impl ChunksItem for WithSubset {
153-
fn key(&self) -> &StoreKey {
154-
&self.item.key
155-
}
156-
fn shape(&self) -> &[NonZeroU64] {
157-
&self.item.shape
158-
}
159-
fn data_type(&self) -> &DataType {
160-
&self.item.data_type
161-
}
162-
fn fill_value(&self) -> &FillValue {
163-
&self.item.fill_value
164-
}
165-
}
166-
167-
fn get_data_type_from_dtype(dtype: &str) -> PyResult<DataType> {
168-
let data_type =
169-
DataType::from_metadata(&MetadataV3::new(dtype)).map_py_err::<PyRuntimeError>()?;
170-
Ok(data_type)
171-
}
172-
17372
fn slice_to_range(slice: &Bound<'_, PySlice>, length: isize) -> PyResult<std::ops::Range<u64>> {
17473
let indices = slice.indices(length)?;
17574
if indices.start < 0 {

src/concurrency.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use zarrs::array::{
44
concurrency::calc_concurrency_outer_inner,
55
};
66

7-
use crate::{CodecPipelineImpl, chunk_item::ChunksItem, utils::PyCodecErrExt as _};
7+
use crate::{CodecPipelineImpl, chunk_item::ChunkItem, utils::PyCodecErrExt as _};
88

99
pub trait ChunkConcurrentLimitAndCodecOptions {
1010
fn get_chunk_concurrent_limit_and_codec_options(
@@ -13,22 +13,19 @@ pub trait ChunkConcurrentLimitAndCodecOptions {
1313
) -> PyResult<Option<(usize, CodecOptions)>>;
1414
}
1515

16-
impl<T> ChunkConcurrentLimitAndCodecOptions for Vec<T>
17-
where
18-
T: ChunksItem,
19-
{
16+
impl ChunkConcurrentLimitAndCodecOptions for Vec<ChunkItem> {
2017
fn get_chunk_concurrent_limit_and_codec_options(
2118
&self,
2219
codec_pipeline_impl: &CodecPipelineImpl,
2320
) -> PyResult<Option<(usize, CodecOptions)>> {
2421
let num_chunks = self.len();
25-
let Some(chunk_descriptions0) = self.first() else {
22+
let Some(item) = self.first() else {
2623
return Ok(None);
2724
};
2825

2926
let codec_concurrency = codec_pipeline_impl
3027
.codec_chain
31-
.recommended_concurrency(chunk_descriptions0.shape(), chunk_descriptions0.data_type())
28+
.recommended_concurrency(&item.shape, &codec_pipeline_impl.data_type)
3229
.map_codec_err()?;
3330

3431
let min_concurrent_chunks =

0 commit comments

Comments
 (0)