Skip to content

Commit 36aa743

Browse files
Added chunking to CLI
Closes #69 Closes #118
1 parent 1d77d1f commit 36aa743

File tree

5 files changed

+45
-6
lines changed

5 files changed

+45
-6
lines changed

tests/test_cli.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2019 Tskit Developers
3+
# Copyright (c) 2019-2026 Tskit Developers
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,7 @@
3434

3535
import tszip
3636
import tszip.cli as cli
37+
from tszip import compat
3738

3839

3940
def get_stdout_for_pytest():
@@ -98,6 +99,7 @@ def test_default_values(self):
9899
self.assertEqual(args.decompress, False)
99100
self.assertEqual(args.list, False)
100101
self.assertEqual(args.stdout, False)
102+
self.assertEqual(args.chunk_size, tszip.DEFAULT_CHUNK_SIZE)
101103
self.assertEqual(args.variants_only, False)
102104
self.assertEqual(args.suffix, ".tsz")
103105

@@ -123,6 +125,14 @@ def test_decompress(self):
123125
args = parser.parse_args([infile, "--decompress"])
124126
self.assertTrue(args.decompress)
125127

128+
def test_chunk_size(self):
129+
parser = cli.tszip_cli_parser()
130+
infile = "tmp.trees.tsz"
131+
args = parser.parse_args([infile, "-C", "1234"])
132+
self.assertEqual(args.chunk_size, 1234)
133+
args = parser.parse_args([infile, "--chunk-size=1234"])
134+
self.assertEqual(args.chunk_size, 1234)
135+
126136

127137
class TestCli(unittest.TestCase):
128138
"""
@@ -248,6 +258,20 @@ def test_variants_only(self):
248258
G2 = self.ts.genotype_matrix()
249259
self.assertTrue(np.array_equal(G1, G2))
250260

261+
def test_chunk_size(self):
262+
self.assertTrue(self.trees_path.exists())
263+
self.run_tszip([str(self.trees_path), "--chunk-size=20"])
264+
self.assertFalse(self.trees_path.exists())
265+
outpath = pathlib.Path(str(self.trees_path) + ".tsz")
266+
self.assertTrue(outpath.exists())
267+
ts = tszip.decompress(outpath)
268+
self.assertEqual(ts.tables, self.ts.tables)
269+
store = compat.create_zip_store(str(outpath), mode="r")
270+
root = compat.create_zarr_group(store=store)
271+
for _, g in root.groups():
272+
for _, a in g.arrays():
273+
assert a.chunks == (20,)
274+
251275
def test_keep(self):
252276
self.assertTrue(self.trees_path.exists())
253277
self.run_tszip([str(self.trees_path), "--keep"])

tests/test_compression.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2021 Tskit Developers
3+
# Copyright (c) 2021-2026 Tskit Developers
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal
@@ -307,7 +307,9 @@ def test_provenance(self):
307307
root = compat.create_zarr_group(store=store)
308308
self.assertEqual(
309309
root.attrs["provenance"],
310-
provenance.get_provenance_dict({"variants_only": variants_only}),
310+
provenance.get_provenance_dict(
311+
{"variants_only": variants_only, "chunk_size": None}
312+
),
311313
)
312314

313315
def write_file(self, attrs, path):
@@ -540,7 +542,7 @@ def test_good_chunks(self, tmpdir, chunk_size):
540542
ts2 = tszip.decompress(path)
541543
assert ts1 == ts2
542544

543-
store = compat.create_zip_store(path, mode="r")
545+
store = compat.create_zip_store(str(path), mode="r")
544546
root = compat.create_zarr_group(store=store)
545547
for _, g in root.groups():
546548
for _, a in g.arrays():

tszip/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# SOFTWARE.
2222
from .compression import compress # NOQA
2323
from .compression import decompress # NOQA
24+
from .compression import DEFAULT_CHUNK_SIZE # NOQA
2425
from .compression import load # NOQA
2526
from .compression import print_summary # NOQA
2627
from .provenance import __version__ # NOQA

tszip/cli.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,14 @@ def tszip_cli_parser():
6464
"-v", "--verbosity", action="count", default=0, help="Increase the verbosity"
6565
)
6666
parser.add_argument("files", nargs="+", help="The files to compress/decompress.")
67+
parser.add_argument(
68+
"-C",
69+
"--chunk-size",
70+
type=int,
71+
default=tszip.DEFAULT_CHUNK_SIZE,
72+
help="Sets the size of array chunks to be compressed to the specified "
73+
f"number of elements. Default={tszip.DEFAULT_CHUNK_SIZE}",
74+
)
6775
parser.add_argument(
6876
"--variants-only",
6977
action="store_true",
@@ -125,7 +133,9 @@ def run_compress(args):
125133
check_output(outfile, args)
126134
if args.stdout:
127135
outfile = get_stdout()
128-
tszip.compress(ts, outfile, variants_only=args.variants_only)
136+
tszip.compress(
137+
ts, outfile, variants_only=args.variants_only, chunk_size=args.chunk_size
138+
)
129139
remove_input(infile, args)
130140

131141

tszip/compression.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
FORMAT_NAME = "tszip"
4949
FORMAT_VERSION = [1, 0]
5050

51+
DEFAULT_CHUNK_SIZE = 2**20
52+
5153

5254
def minimal_dtype(array):
5355
"""
@@ -178,7 +180,7 @@ def compress_zarr(ts, root, variants_only=False, chunk_size=None):
178180
)
179181

180182
if chunk_size is None:
181-
chunk_size = 2**20
183+
chunk_size = DEFAULT_CHUNK_SIZE
182184
if not isinstance(chunk_size, numbers.Integral):
183185
raise TypeError("Chunk size must be an integer")
184186
if chunk_size < 1:

0 commit comments

Comments
 (0)