Skip to content

Commit 5bad720

Browse files
committed
use new reader to load data in tracks
1 parent 1543436 commit 5bad720

File tree

15 files changed

+107
-244
lines changed

15 files changed

+107
-244
lines changed

coolbox/core/track/arcs/base.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import typing as T
2+
13
import pandas as pd
24

35
from coolbox.utilities import GenomeRange
6+
from coolbox.utilities.reader.tab import get_indexed_tab_reader
47
from coolbox.core.track.base import Track
58
from .plot import PlotContacts
69

@@ -83,18 +86,24 @@ def __init__(self, **kwargs):
8386
properties = ArcsBase.DEFAULT_PROPERTIES.copy()
8487
properties.update(kwargs)
8588
super().__init__(properties)
89+
self.reader = get_indexed_tab_reader(self.file)
8690

87-
def fetch_plot_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
91+
def fetch_data(
92+
self,
93+
gr: GenomeRange,
94+
gr2: T.Optional[GenomeRange] = None,
95+
**kwargs) -> pd.DataFrame:
8896
"""
89-
9097
Returns
9198
-------
9299
intervals : pandas.core.frame.DataFrame
93100
Can be two types:
94101
1: with columns: ['pos1', 'pos2', 'score'] 'score' is optional
95102
2: with columns: ['start1', 'end1', 'start2', 'end2', 'score'] 'score' is optional
96103
"""
97-
return self.fetch_data(gr, **kwargs)
104+
open_region = self.properties.get("open_region") in ["yes", True]
105+
df = self.reader.query_var_chr(gr, second=gr2, open_region=open_region, **kwargs)
106+
return df
98107

99108
def plot(self, ax, gr: GenomeRange, **kwargs):
100109
"""

coolbox/core/track/arcs/bedpe.py

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
import pandas as pd
22

33
from .base import ArcsBase
4-
from .fetch import FetchParix
5-
from coolbox.utilities.bed import process_bedpe
64
from coolbox.utilities.genome import GenomeRange
75

86

9-
class BEDPE(ArcsBase, FetchParix):
7+
class BEDPE(ArcsBase):
108
"""
119
Arcs track from .bedpe file.
1210
@@ -32,26 +30,6 @@ def __init__(self, file, **kwargs):
3230
**kwargs
3331
})
3432
super().__init__(**properties)
35-
self.bgz_file = process_bedpe(file)
36-
37-
def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
38-
# filter peaks manually for hicpeaks style in fetch_plot_data
39-
df = self.fetch_intervals(self.bgz_file, gr, kwargs.get('gr2'))
40-
# TODO the returned df has no named columns, may cause error
41-
if len(df) == 0:
42-
return df
43-
44-
columns = list(df.columns)
45-
for i, col in enumerate(self.FIELDS):
46-
if i >= len(columns):
47-
break
48-
columns[i] = col
49-
df.columns = columns
50-
for col in ['start1', 'end1', 'start2', 'end2']:
51-
df[col] = df[col].astype(int)
52-
if 'score' in df.columns:
53-
df['score'] = df['score'].astype(float)
54-
return df
5533

5634
def fetch_plot_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
5735
df = self.fetch_data(gr, **kwargs)

coolbox/core/track/arcs/fetch.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

coolbox/core/track/arcs/pairs.py

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
1-
import pandas as pd
2-
31
from .base import ArcsBase
4-
from .fetch import FetchParix
5-
from coolbox.utilities.bed import process_pairs
62
from coolbox.utilities.genome import GenomeRange
73

84

9-
class Pairs(ArcsBase, FetchParix):
5+
class Pairs(ArcsBase):
106
"""
117
Arcs track from .pairs file.
128
@@ -27,22 +23,4 @@ def __init__(self, file, **kwargs):
2723
'file': file,
2824
**kwargs
2925
})
30-
super().__init__(**properties)
31-
self.bgz_file = process_pairs(file)
32-
33-
def fetch_data(self, gr: GenomeRange, **kwargs):
34-
# filter peaks manually in peaks style
35-
df = self.fetch_intervals(self.bgz_file, gr, kwargs.get('gr2'))
36-
# TODO the returned df has no named columns, may cause error
37-
if len(df) == 0:
38-
return df
39-
40-
columns = list(df.columns)
41-
for i, col in enumerate(self.FIELDS):
42-
if i >= len(columns):
43-
break
44-
columns[i] = col
45-
df.columns = columns
46-
for col in ['pos1', 'pos2']:
47-
df[col] = df[col].astype(int)
48-
return df
26+
super().__init__(**properties)

coolbox/core/track/bam.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from coolbox.utilities import (
66
get_logger, GenomeRange, split_genome_range
77
)
8-
from coolbox.utilities.bam import process_bam, query_bam
8+
from coolbox.utilities.reader.tab import get_indexed_tab_reader
99
from .base import Track
1010

1111
log = get_logger(__name__)
@@ -39,7 +39,7 @@ def __init__(self, file, **kwargs):
3939
})
4040
properties.update(kwargs)
4141
super().__init__(properties)
42-
self.indexed_bam = process_bam(file)
42+
self.reader = get_indexed_tab_reader(file)
4343

4444
def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
4545
"""
@@ -53,7 +53,7 @@ def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
5353
columns = ["qname", "flag", "rname", "pos", "mapq", "cigar",
5454
"rnext", "pnext", "tlen", "seq", "qual", "options"]
5555
"""
56-
return self.fetch_intervals(gr)
56+
return self.reader.query_var_chr(gr)
5757

5858
def plot(self, ax, gr: GenomeRange, **kwargs):
5959
self.plot_align(ax, gr)
@@ -85,22 +85,3 @@ def plot_align(self, ax, gr: GenomeRange):
8585
with_ruler=False,
8686
draw_line=False
8787
)
88-
89-
def fetch_intervals(self, genome_range: GenomeRange):
90-
chrom, start, end = split_genome_range(genome_range)
91-
rows = [
92-
row_items
93-
for row_items in query_bam(
94-
self.indexed_bam, chrom, start, end, split=True
95-
)
96-
]
97-
98-
# https://samtools.github.io/hts-specs/SAMv1.pdf
99-
fields = ["qname", "flag", "rname", "pos", "mapq", "cigar",
100-
"rnext", "pnext", "tlen", "seq", "qual", "options"]
101-
df = pd.DataFrame(rows, columns=fields)
102-
if df.shape[0] > 0:
103-
df['flag'] = df['flag'].astype(int)
104-
df['pos'] = df['pos'].astype(int)
105-
df['mapq'] = df['mapq'].astype(int)
106-
return df

coolbox/core/track/bed/base.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import matplotlib
55

66
from coolbox.utilities import get_logger
7-
from coolbox.utilities.bed import build_bed_index
7+
from coolbox.utilities.reader.tab import get_indexed_tab_reader
88
from coolbox.utilities.genome import GenomeRange
99
from coolbox.core.track.base import Track
1010

@@ -52,7 +52,7 @@ def __init__(self, file, **kwargs):
5252
**kwargs
5353
})
5454
super().__init__(properties)
55-
self.bgz_file = build_bed_index(file)
55+
self.reader = get_indexed_tab_reader(file)
5656

5757
def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
5858
"""
@@ -135,18 +135,21 @@ def get_rgb_and_edge_color(self, bed):
135135
@staticmethod
136136
def infer_bed_type(df: pd.DataFrame) -> Union[str, None]:
137137
# bed_type of dataframe are store in dataframe's __dict__ in FetchBed.fetch_intervals
138-
if 'bed_type' in df.__dict__:
139-
bed_type = df.bed_type
140-
else:
141-
bed_types = {
142-
12: 'bed12',
143-
9: 'bed9',
144-
6: 'bed6',
145-
3: 'bed3'
146-
}
147-
num_col = len(df.columns)
148-
bed_type = bed_types[num_col] if num_col in bed_types else 'bed3'
149-
if bed_type == 'bed3' and num_col < 3:
150-
raise ValueError(f"Invalid dataframe for bed3 with columns: {df.columns}")
138+
bed_types = {
139+
12: 'bed12',
140+
9: 'bed9',
141+
6: 'bed6',
142+
3: 'bed3'
143+
}
144+
num_col = len(df.columns)
145+
bed_type = bed_types[num_col] if num_col in bed_types else 'bed3'
146+
if bed_type == 'bed3' and num_col < 3:
147+
raise ValueError(f"Invalid dataframe for bed3 with columns: {df.columns}")
151148
return bed_type
152149

150+
def fetch_intervals(self, gr: GenomeRange) -> pd.DataFrame:
151+
"""
152+
Fetch intervals within input chromosome range.
153+
"""
154+
df = self.reader.query_var_chr(gr)
155+
return df

coolbox/core/track/bed/bed.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
1-
from coolbox.core.track.bed.fetch import FetchBed
21
from coolbox.utilities import (
32
get_logger
43
)
54
from coolbox.utilities.genome import GenomeRange
65
from .base import BedBase
76
from .plot import PlotGenes
87

8+
import pandas as pd
9+
10+
911
log = get_logger(__name__)
1012

1113

12-
class BED(BedBase, PlotGenes, FetchBed):
14+
class BED(BedBase, PlotGenes):
1315
"""
1416
Bed Track for plotting 1d intervals data from .bed file.
1517
The input bed file can be bed3/bed6/bed9/bed12

coolbox/core/track/bed/fetch.py

Lines changed: 0 additions & 61 deletions
This file was deleted.

coolbox/core/track/hist/bam.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .base import HistBase, HistData, GenomeRange
2-
from coolbox.utilities.bam import process_bam, coverage_by_samtools
2+
from coolbox.utilities.reader.tab import process_bam, coverage_by_samtools
33

44

55
class BAMCov(HistBase):
Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
import pandas as pd
22

3-
from coolbox.utilities import (
4-
change_chrom_names,
5-
GenomeRange, get_logger,
6-
)
7-
from coolbox.utilities.bed import tabix_query, build_bedgraph_bgz
3+
from coolbox.utilities import GenomeRange, get_logger
4+
from coolbox.utilities.reader.tab import get_indexed_tab_reader
85
from .base import HistBase
96

107
log = get_logger(__name__)
@@ -32,7 +29,7 @@ def __init__(self, file, **kwargs):
3229
**kwargs
3330
})
3431
super().__init__(**properties)
35-
self.bgz_file = build_bedgraph_bgz(file)
32+
self.reader = get_indexed_tab_reader(file)
3633

3734
def fetch_plot_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
3835
itv_df = self.fetch_data(gr, **kwargs)
@@ -41,18 +38,4 @@ def fetch_plot_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
4138
return itv_df
4239

4340
def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
44-
rows = self.load(gr)
45-
if len(rows) == 0:
46-
gr.chrom = change_chrom_names(gr.chrom)
47-
rows = self.load(gr)
48-
49-
return pd.DataFrame(rows, columns=['chromsome', 'start', 'end', 'score'])
50-
51-
def load(self, genome_range):
52-
gr = genome_range
53-
return [
54-
[it[0], int(it[1]), int(it[2]), float(it[3])]
55-
for it in tabix_query(
56-
self.bgz_file, gr.chrom, gr.start, gr.end, split=True
57-
)
58-
]
41+
return self.reader.query_var_chr(gr)

0 commit comments

Comments
 (0)