Add tests for in-memory tab reader; fix 2D query field-name lookups and accept the .bg suffix

Nanguage · Nanguage · commit 9dd1df52557b · 2025-10-27T17:12:58.000-07:00
diff --git a/coolbox/utilities/__init__.py b/coolbox/utilities/__init__.py
@@ -2,7 +2,6 @@
 from os.path import abspath, dirname, join
 from collections import deque
 
-from .bed import *
 from .figtools import *
 from .filetool import *
 from .fmtconvert import *
diff --git a/coolbox/utilities/reader/tab.py b/coolbox/utilities/reader/tab.py
@@ -189,10 +189,14 @@ class TabFileReader(abc.ABC):
     """
     def __init__(self, path: str, **params):
         self.path = path
-        suffix = osp.splitext(path.rstrip(".bgz"))[1].lower()
+        if path.endswith(".bgz"):
+            _p = path[:-len(".bgz")]  # drop the exact suffix; rstrip() strips a character set and would turn "x.bg.bgz" into "x"
+        else:
+            _p = path
+        suffix = osp.splitext(_p)[1].lower()
         self.suffix = suffix
         self.bed_type = None
-        if suffix in [".bed", ".bedgraph"]:
+        if suffix in [".bed", ".bedgraph", ".bg"]:
             self.bed_type = guess_bed_type(path)
         self.params = params
         self.is_2d = False
@@ -211,7 +215,7 @@ def __init__(self, path: str, **params):
         suffix = self.suffix
         if suffix == ".gtf":
             ds = ox.from_gtf(self.path)
-        elif suffix in [".bed", ".bedgraph"]:
+        elif suffix in [".bed", ".bedgraph", ".bg"]:
             ds = ox.from_bed(self.path)
         elif suffix in ['.bw', '.bigwig']:
             ds = ox.from_bigwig(self.path)
@@ -224,7 +228,7 @@ def __init__(self, path: str, **params):
     def query(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
         sub = self.ds.regions(str(gr))
         df = sub.pd()
-        if self.suffix in [".bed", ".bedgraph"]:
+        if self.suffix in [".bed", ".bedgraph", ".bg"]:
             rest = df.pop('rest')
             df_rest = rest.str.split('\t', expand=True)
             df = pd.concat([df, df_rest], axis=1)
@@ -260,7 +264,7 @@ def query(
             itr = tabix_query(self.path, gr, split=True)
         rows = list(itr)
         df = pd.DataFrame(rows)
-        if self.suffix in [".bed", ".bedgraph"]:
+        if self.suffix in [".bed", ".bedgraph", ".bg"]:
             columns = FMT2COLUMNS[self.bed_type]
         else:
             fmt = self.suffix[1:]
@@ -277,7 +281,7 @@ def query(
 class TabFileReaderInMemory(TabFileReader):
     def __init__(self, path: str, **params):
         super().__init__(path, **params)
-        if self.suffix in [".bed", ".bedgraph"]:
+        if self.suffix in [".bed", ".bedgraph", ".bg"]:
             columns = FMT2COLUMNS[self.bed_type]
         else:
             fmt = self.suffix[1:]
@@ -312,27 +316,28 @@ def query(
                     'chr1': 'chrom1',
                     'start1': 'start1',
                     'end1': 'end1',
-                    'chrom2': 'chrom2',
+                    'chr2': 'chrom2',
                     'start2': 'start2',
                     'end2': 'end2',
                 }
 
             if second is not None:
                 sdf = self.df.query(
-                    f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
-                    f"and {field_names['chrom2']} == '{second.chrom}' and {field_names['start2']} >= {second.start} and {field_names['end2']} <= {second.end}"
+                    f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
+                    f"and {field_names['chr2']} == '{second.chrom}' and {field_names['start2']} >= {second.start} and {field_names['end2']} <= {second.end}"
                 )
                 return sdf
             else:
                 if open_region:
-                    sdf = self.df.query(
-                        f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
-                        f"and {field_names['chrom2']} == '{gr.chrom}"
+                    q = (
+                        f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
+                        f"and {field_names['chr2']} == '{gr.chrom}'"
                     )
+                    sdf = self.df.query(q)
                 else:
                     sdf = self.df.query(
-                        f"{field_names['chrom1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
-                        f"and {field_names['chrom2']} == '{gr.chrom}' and {field_names['start2']} >= {gr.start} and {field_names['end2']} <= {gr.end}"
+                        f"{field_names['chr1']} == '{gr.chrom}' and {field_names['start1']} >= {gr.start} and {field_names['end1']} <= {gr.end} "
+                        f"and {field_names['chr2']} == '{gr.chrom}' and {field_names['start2']} >= {gr.start} and {field_names['end2']} <= {gr.end}"
                     )
                 return sdf
         else:
@@ -367,7 +372,7 @@ def _build_bgz_file(
     cat_cmd = "zcat" if input_is_gz else "cat"
     if prefix.lower().endswith(".gtf"):
         cmd = f'{cat_cmd} {path} | grep -v ^"#" | sort -k1,1 -k4,4n | bgzip > {output_path}'
-    elif prefix.lower().endswith('.bed') or prefix.lower().endswith('.bedgraph'):
+    elif prefix.lower().endswith('.bed') or prefix.lower().endswith('.bedgraph') or prefix.lower().endswith('.bg'):
         cmd = f'{cat_cmd} {path} | sort -k1,1 -k2,2n | bgzip > {output_path}'
     elif prefix.lower().endswith('.bedpe'):
         cmd = f'{cat_cmd} {path} | sort -k1,1 -k4,4 -k2,2n -k5,5n | bgzip > {output_path}'
diff --git a/tests/test_tab_reader.py b/tests/test_tab_reader.py
@@ -0,0 +1,94 @@
+import platform
+import pandas as pd
+import pytest
+
+from coolbox.utilities.reader.tab import (
+    TabFileReaderInMemory,
+    get_indexed_tab_reader,
+    guess_bed_type,
+    FMT2COLUMNS,
+)
+
+
+def test_guess_bed_type(data_dir, test_itv):  # guess_bed_type must return the expected format name for each fixture file
+    assert guess_bed_type(f"{data_dir}/bed_{test_itv}.bed") == "bed12"  # the un-numbered bed fixture is expected to be bed12
+    assert guess_bed_type(f"{data_dir}/bed6_{test_itv}.bed") == "bed6"
+    assert guess_bed_type(f"{data_dir}/bed9_{test_itv}.bed") == "bed9"
+    assert guess_bed_type(f"{data_dir}/bedgraph_{test_itv}.bg") == "bedgraph"  # fixture uses the short .bg suffix
+
+
+def test_inmemory_bed6_query(data_dir, test_interval, empty_interval, test_itv):  # bed6 query: column names and interval containment
+    path = f"{data_dir}/bed6_{test_itv}.bed"
+    rdr = TabFileReaderInMemory(path)
+    df = rdr.query(test_interval)
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == FMT2COLUMNS["bed6"]
+    assert (df["chrom"] == test_interval.chrom).all()  # every returned row must lie fully inside the queried interval
+    assert (df["start"] >= test_interval.start).all()
+    assert (df["end"] <= test_interval.end).all()  # NOTE(review): these .all() checks pass vacuously on an empty result; no row-count assert here
+    # querying the empty_interval fixture must return a frame with zero rows
+    df_empty = rdr.query(empty_interval)
+    assert df_empty.shape[0] == 0
+
+
+def test_inmemory_bed12_query(data_dir, test_interval, test_itv):  # bed12 query: expect bed12 column names and at least one row
+    path = f"{data_dir}/bed_{test_itv}.bed"
+    rdr = TabFileReaderInMemory(path)
+    df = rdr.query(test_interval)
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == FMT2COLUMNS["bed12"]
+    assert df.shape[0] > 0
+
+
+def test_inmemory_bedgraph_query(data_dir, test_interval, empty_interval, test_itv):  # bedgraph query through a .bg path
+    path = f"{data_dir}/bedgraph_{test_itv}.bg"
+    rdr = TabFileReaderInMemory(path)
+    df = rdr.query(test_interval)
+    assert list(df.columns) == FMT2COLUMNS["bedgraph"]
+    assert df.shape[0] > 0
+    rdr.query(empty_interval)  # only checks that this does not raise; the row count is not asserted
+
+
+def test_inmemory_gtf_query(data_dir, test_interval, empty_interval, test_itv):  # gtf query: column names, non-empty and empty results
+    path = f"{data_dir}/gtf_{test_itv}.gtf"
+    rdr = TabFileReaderInMemory(path)
+    df = rdr.query(test_interval)
+    # Columns must match FMT2COLUMNS["gtf"]; presumably GTF names its chromosome column seqname rather than chrom — TODO confirm
+    assert list(df.columns) == FMT2COLUMNS["gtf"]
+    assert df.shape[0] > 0
+    df_empty = rdr.query(empty_interval)
+    assert df_empty.shape[0] == 0
+
+
+def test_inmemory_pairs_query(data_dir, test_interval, test_itv):  # pairs query in its three call forms
+    path = f"{data_dir}/pairs_{test_itv}.pairs"
+    rdr = TabFileReaderInMemory(path)
+    # Single-region call: no `second` and no `open_region` argument
+    df_same = rdr.query(test_interval)
+    assert list(df_same.columns) == FMT2COLUMNS["pairs"]
+    assert df_same.shape[0] > 0
+    # Explicit second region, here the same interval passed as `second`
+    df_2d = rdr.query(test_interval, second=test_interval)
+    assert df_2d.shape[0] > 0
+    # open_region=True must return at least as many rows as the single-region call
+    df_open = rdr.query(test_interval, open_region=True)
+    assert df_open.shape[0] >= df_same.shape[0]
+
+
+def test_inmemory_bedpe_query(data_dir, test_interval, test_itv):  # bedpe query with and without an explicit second region
+    path = f"{data_dir}/bedpe_{test_itv}.bedpe"
+    rdr = TabFileReaderInMemory(path)
+    df_same = rdr.query(test_interval)  # single-region call
+    assert list(df_same.columns) == FMT2COLUMNS["bedpe"]
+    assert df_same.shape[0] > 0
+    df_2d = rdr.query(test_interval, second=test_interval)  # same interval passed as the second region
+    assert df_2d.shape[0] > 0
+
+
+@pytest.mark.skipif(platform.system() != "Windows", reason="Specific to Windows fallback behavior")
+def test_get_indexed_tab_reader_fallback_to_inmemory_on_windows(data_dir, test_itv):
+    # On Windows, bgzip/tabix are unavailable and ensure_unix raises, so it must fall back
+    path = f"{data_dir}/bed6_{test_itv}.bed"
+    rdr = get_indexed_tab_reader(path, columns=FMT2COLUMNS["bed6"])
+    assert isinstance(rdr, TabFileReaderInMemory)  # only the type of the returned reader is checked
+