Skip to content

Commit 9dd1df5

Browse files
committed
add tests for inmemory reader and fix bug
1 parent 03ec186 commit 9dd1df5

File tree

3 files changed

+114
-16
lines changed

3 files changed

+114
-16
lines changed

coolbox/utilities/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from os.path import abspath, dirname, join
33
from collections import deque
44

5-
from .bed import *
65
from .figtools import *
76
from .filetool import *
87
from .fmtconvert import *

coolbox/utilities/reader/tab.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,14 @@ class TabFileReader(abc.ABC):
189189
"""
190190
def __init__(self, path: str, **params):
191191
self.path = path
192-
suffix = osp.splitext(path.rstrip(".bgz"))[1].lower()
192+
# Work out the format suffix of the underlying file, looking through a
# trailing ".bgz" wrapper if there is one.
#
# NOTE: str.rstrip(".bgz") must not be used here: its argument is a *set of
# characters*, not a suffix, so "x.bg.bgz".rstrip(".bgz") collapses to "x"
# and the ".bg" suffix is lost. Slice off exactly the 4-character suffix.
if path.endswith(".bgz"):
    _p = path[:-len(".bgz")]
else:
    _p = path
suffix = osp.splitext(_p)[1].lower()
193197
self.suffix = suffix
194198
self.bed_type = None
195-
if suffix in [".bed", ".bedgraph"]:
199+
if suffix in [".bed", ".bedgraph", ".bg"]:
196200
self.bed_type = guess_bed_type(path)
197201
self.params = params
198202
self.is_2d = False
@@ -211,7 +215,7 @@ def __init__(self, path: str, **params):
211215
suffix = self.suffix
212216
if suffix == ".gtf":
213217
ds = ox.from_gtf(self.path)
214-
elif suffix in [".bed", ".bedgraph"]:
218+
elif suffix in [".bed", ".bedgraph", ".bg"]:
215219
ds = ox.from_bed(self.path)
216220
elif suffix in ['.bw', '.bigwig']:
217221
ds = ox.from_bigwig(self.path)
@@ -224,7 +228,7 @@ def __init__(self, path: str, **params):
224228
def query(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
225229
sub = self.ds.regions(str(gr))
226230
df = sub.pd()
227-
if self.suffix in [".bed", ".bedgraph"]:
231+
if self.suffix in [".bed", ".bedgraph", ".bg"]:
228232
rest = df.pop('rest')
229233
df_rest = rest.str.split('\t', expand=True)
230234
df = pd.concat([df, df_rest], axis=1)
@@ -260,7 +264,7 @@ def query(
260264
itr = tabix_query(self.path, gr, split=True)
261265
rows = list(itr)
262266
df = pd.DataFrame(rows)
263-
if self.suffix in [".bed", ".bedgraph"]:
267+
if self.suffix in [".bed", ".bedgraph", ".bg"]:
264268
columns = FMT2COLUMNS[self.bed_type]
265269
else:
266270
fmt = self.suffix[1:]
@@ -277,7 +281,7 @@ def query(
277281
class TabFileReaderInMemory(TabFileReader):
278282
def __init__(self, path: str, **params):
279283
super().__init__(path, **params)
280-
if self.suffix in [".bed", ".bedgraph"]:
284+
if self.suffix in [".bed", ".bedgraph", ".bg"]:
281285
columns = FMT2COLUMNS[self.bed_type]
282286
else:
283287
fmt = self.suffix[1:]
@@ -312,27 +316,28 @@ def query(
312316
'chr1': 'chrom1',
313317
'start1': 'start1',
314318
'end1': 'end1',
315-
'chrom2': 'chrom2',
319+
'chr2': 'chrom2',
316320
'start2': 'start2',
317321
'end2': 'end2',
318322
}
319323

320324
if second is not None:
321325
sdf = self.df.query(
322-
f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
323-
f"and {field_names['chrom2']} == '{second.chrom}' and {field_names['start2']} >= {second.start} and {field_names['end2']} <= {second.end}"
326+
f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
327+
f"and {field_names['chr2']} == '{second.chrom}' and {field_names['start2']} >= {second.start} and {field_names['end2']} <= {second.end}"
324328
)
325329
return sdf
326330
else:
327331
if open_region:
328-
sdf = self.df.query(
329-
f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
330-
f"and {field_names['chrom2']} == '{gr.chrom}"
332+
q = (
333+
f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
334+
f"and {field_names['chr2']} == '{gr.chrom}'"
331335
)
336+
sdf = self.df.query(q)
332337
else:
333338
sdf = self.df.query(
334-
f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
335-
f"and {field_names['chrom2']} == '{gr.chrom}' and {field_names['start2']} >= {gr.start} and {field_names['end2']} <= {gr.end}"
339+
f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
340+
f"and {field_names['chr2']} == '{gr.chrom}' and {field_names['start2']} >= {gr.start} and {field_names['end2']} <= {gr.end}"
336341
)
337342
return sdf
338343
else:
@@ -367,7 +372,7 @@ def _build_bgz_file(
367372
cat_cmd = "zcat" if input_is_gz else "cat"
368373
if prefix.lower().endswith(".gtf"):
369374
cmd = f'{cat_cmd} {path} | grep -v ^"#" | sort -k1,1 -k4,4n | bgzip > {output_path}'
370-
elif prefix.lower().endswith('.bed') or prefix.lower().endswith('.bedgraph'):
375+
elif prefix.lower().endswith('.bed') or prefix.lower().endswith('.bedgraph') or prefix.lower().endswith('.bg'):
371376
cmd = f'{cat_cmd} {path} | sort -k1,1 -k2,2n | bgzip > {output_path}'
372377
elif prefix.lower().endswith('.bedpe'):
373378
cmd = f'{cat_cmd} {path} | sort -k1,1 -k4,4 -k2,2n -k5,5n | bgzip > {output_path}'

tests/test_tab_reader.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import platform
2+
import pandas as pd
3+
import pytest
4+
5+
from coolbox.utilities.reader.tab import (
6+
TabFileReaderInMemory,
7+
get_indexed_tab_reader,
8+
guess_bed_type,
9+
FMT2COLUMNS,
10+
)
11+
12+
13+
def test_guess_bed_type(data_dir, test_itv):
    """Check that guess_bed_type reports the expected format for each sample file."""
    expectations = [
        (f"{data_dir}/bed_{test_itv}.bed", "bed12"),
        (f"{data_dir}/bed6_{test_itv}.bed", "bed6"),
        (f"{data_dir}/bed9_{test_itv}.bed", "bed9"),
        (f"{data_dir}/bedgraph_{test_itv}.bg", "bedgraph"),
    ]
    for sample_path, expected_type in expectations:
        assert guess_bed_type(sample_path) == expected_type
18+
19+
20+
def test_inmemory_bed6_query(data_dir, test_interval, empty_interval, test_itv):
    """Query a BED6 file through the in-memory reader and validate the rows returned."""
    reader = TabFileReaderInMemory(f"{data_dir}/bed6_{test_itv}.bed")

    hits = reader.query(test_interval)
    assert isinstance(hits, pd.DataFrame)
    assert list(hits.columns) == FMT2COLUMNS["bed6"]
    # Every returned record must lie on the queried chromosome and inside its bounds.
    assert (hits["chrom"] == test_interval.chrom).all()
    assert (hits["start"] >= test_interval.start).all()
    assert (hits["end"] <= test_interval.end).all()

    # Querying the empty interval must give back a frame with zero rows.
    no_hits = reader.query(empty_interval)
    assert no_hits.shape[0] == 0
32+
33+
34+
def test_inmemory_bed12_query(data_dir, test_interval, test_itv):
    """Query a BED12 file through the in-memory reader and expect a non-empty frame."""
    reader = TabFileReaderInMemory(f"{data_dir}/bed_{test_itv}.bed")
    hits = reader.query(test_interval)

    assert isinstance(hits, pd.DataFrame)
    assert list(hits.columns) == FMT2COLUMNS["bed12"]
    assert hits.shape[0] > 0
41+
42+
43+
def test_inmemory_bedgraph_query(data_dir, test_interval, empty_interval, test_itv):
    """Query a ".bg" bedGraph file through the in-memory reader."""
    reader = TabFileReaderInMemory(f"{data_dir}/bedgraph_{test_itv}.bg")

    hits = reader.query(test_interval)
    assert list(hits.columns) == FMT2COLUMNS["bedgraph"]
    assert hits.shape[0] > 0

    # Only checks that the call completes; the result is intentionally not inspected.
    reader.query(empty_interval)
50+
51+
52+
def test_inmemory_gtf_query(data_dir, test_interval, empty_interval, test_itv):
    """Query a GTF file through the in-memory reader, for a populated and an empty interval."""
    reader = TabFileReaderInMemory(f"{data_dir}/gtf_{test_itv}.gtf")

    hits = reader.query(test_interval)
    # The in-memory reader labels GTF columns per FMT2COLUMNS["gtf"]
    # (seqname rather than chrom).
    assert list(hits.columns) == FMT2COLUMNS["gtf"]
    assert hits.shape[0] > 0

    no_hits = reader.query(empty_interval)
    assert no_hits.shape[0] == 0
61+
62+
63+
def test_inmemory_pairs_query(data_dir, test_interval, test_itv):
    """Exercise the three query modes of the in-memory reader on a .pairs file."""
    reader = TabFileReaderInMemory(f"{data_dir}/pairs_{test_itv}.pairs")

    # Mode 1: a single region, no explicit second region.
    same_region = reader.query(test_interval)
    assert list(same_region.columns) == FMT2COLUMNS["pairs"]
    assert same_region.shape[0] > 0

    # Mode 2: an explicit second region is supplied.
    two_regions = reader.query(test_interval, second=test_interval)
    assert two_regions.shape[0] > 0

    # Mode 3: open_region=True must return at least as many rows as mode 1.
    open_hits = reader.query(test_interval, open_region=True)
    assert open_hits.shape[0] >= same_region.shape[0]
76+
77+
78+
def test_inmemory_bedpe_query(data_dir, test_interval, test_itv):
    """Query a BEDPE file through the in-memory reader with one and with two regions."""
    reader = TabFileReaderInMemory(f"{data_dir}/bedpe_{test_itv}.bedpe")

    same_region = reader.query(test_interval)
    assert list(same_region.columns) == FMT2COLUMNS["bedpe"]
    assert same_region.shape[0] > 0

    two_regions = reader.query(test_interval, second=test_interval)
    assert two_regions.shape[0] > 0
86+
87+
88+
@pytest.mark.skipif(platform.system() != "Windows", reason="Specific to Windows fallback behavior")
def test_get_indexed_tab_reader_fallback_to_inmemory_on_windows(data_dir, test_itv):
    """Expect get_indexed_tab_reader to hand back the in-memory reader on Windows."""
    # On Windows, bgzip/tabix are unavailable and ensure_unix raises, so the
    # factory must fall back to the in-memory implementation.
    reader = get_indexed_tab_reader(
        f"{data_dir}/bed6_{test_itv}.bed",
        columns=FMT2COLUMNS["bed6"],
    )
    assert isinstance(reader, TabFileReaderInMemory)
94+

0 commit comments

Comments
 (0)