Skip to content

Commit 26cfc76

Browse files
committed
Add options to apply SNP filter on allc files using filter-allc; add multiprocess option in filter-allc
1 parent e44f823 commit 26cfc76

File tree

2 files changed

+144
-24
lines changed

2 files changed

+144
-24
lines changed

methylpy/parser.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -246,11 +246,14 @@ def parse_args():
246246
no_reindex=args.no_reindex)
247247
elif args.command == "filter-allc":
248248
from methylpy.utilities import filter_allc_file
249-
filter_allc_file(allc_file=args.allc_file,
250-
output_file=args.output_file,
249+
filter_allc_file(allc_files=args.allc_files,
250+
output_files=args.output_files,
251+
num_procs=args.num_procs,
251252
mc_type=args.mc_type,
252253
chroms=args.chroms,
253254
compress_output=args.compress_output,
255+
max_mismatch=args.max_mismatch,
256+
max_mismatch_frac=args.max_mismatch_frac,
254257
min_cov=args.min_cov)
255258

256259
elif args.command == "allc-to-bigwig":
@@ -1480,22 +1483,29 @@ def add_filter_allc_subparser(subparsers):
14801483
help="Filter allc file")
14811484

14821485
filter_allc_req = filter_allc.add_argument_group("required inputs")
1483-
filter_allc_req.add_argument("--allc-file",
1484-
type=str,
1485-
required=True,
1486-
help="Allc file to filter.")
1486+
filter_allc_req.add_argument("--allc-files",
1487+
type=str,
1488+
required=True,
1489+
nargs="+",
1490+
help="allc files to filter.")
1491+
1492+
filter_allc_req.add_argument("--output-files",
1493+
type=str,
1494+
required=True,
1495+
nargs="+",
1496+
help="Name of output files. Each output file matches each allc file.")
1497+
1498+
filter_allc_req.add_argument("--num-procs",
1499+
type=int,
1500+
default=1,
1501+
help="Number of processors you wish to use to parallelize this function")
14871502

1488-
filter_allc_req.add_argument("--output-file",
1489-
type=str,
1490-
required=True,
1491-
help="Name of output file.")
1492-
14931503
filter_allc_opt = filter_allc.add_argument_group("optional inputs")
14941504

14951505
filter_allc_opt.add_argument("--mc-type",
14961506
type=str,
14971507
nargs="+",
1498-
default=["CGN"],
1508+
default=None,
14991509
help="List of space separated cytosine nucleotide contexts for "
15001510
+ "sites to be included in output file. These classifications "
15011511
+ "may use the wildcards H (indicating anything but a G) and "
@@ -1507,6 +1517,28 @@ def add_filter_allc_subparser(subparsers):
15071517
help="Minimum number of reads that must cover a site for it to be "
15081518
+ "included in output file.")
15091519

1520+
filter_allc_opt.add_argument("--max-mismatch",
1521+
type=int,
1522+
nargs="+",
1523+
default=None,
1524+
help="Maximum numbers of mismatch basecalls allowed in each nucleotide in "
1525+
+"the sequence context of a site for it to be included in output file. If "
1526+
+"the sequence context has three nucleotides, an example of this option is \"0 1 2\". "
1527+
+"It requires no mismatch basecall at the first nucleotide, at most one mismatch "
1528+
+"basecall at the second nucleotide, and at most two at the third nucleotide for a site "
1529+
+"to be reported.")
1530+
1531+
filter_allc_opt.add_argument("--max-mismatch-frac",
1532+
type=float,
1533+
nargs="+",
1534+
default=None,
1535+
help="Maximum fraction of mismatch basecalls out of unambiguous basecalls allowed "
1536+
+"in each nucleotide in the sequence context of a site for it to be included "
1537+
+" in output file. If the sequence context has three nucleotides, an example "
1538+
+"of this option is \"0 0 0.1\". It requires no mismatch basecall at the first "
1539+
+"and second nucleotide, and at most 10% mismatches out of unambiguous basecalls "
1540+
+"at the third nucleotide for a site to be reported.")
1541+
15101542
filter_allc_opt.add_argument("--compress-output",
15111543
type=str2bool,
15121544
default=True,

methylpy/utilities.py

Lines changed: 100 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -191,15 +191,68 @@ def convert_allc_to_bigwig(input_allc_file,
191191
subprocess.check_call(shlex.split("rm "+output_file+".wig "+output_file+".chrom_size"))
192192

193193

194-
def filter_allc_file(allc_file,
195-
output_file,
196-
mc_type="CGN",
194+
def filter_allc_file(allc_files,
195+
output_files,
196+
num_procs=1,
197+
mc_type=None,
197198
chroms = None,
198199
compress_output=False,
199200
min_cov=0,
201+
max_mismatch=None,
202+
max_mismatch_frac=None,
200203
buffer_line_number=100000):
201204

202-
mc_class = expand_nucleotide_code(mc_type)
205+
# User input checks
206+
if not isinstance(allc_files, list):
207+
exit("allc files must be a list of string(s)")
208+
if not isinstance(output_files, list):
209+
exit("output files must be a list of string(s)")
210+
if len(allc_files) != len(output_files):
211+
exit("Number of allc files does not match number of output files")
212+
213+
if num_procs > 1:
214+
pool = multiprocessing.Pool(min(num_procs,len(allc_files)))
215+
print_checkpoint("Filtering allc files using "
216+
+str(min(num_procs,len(allc_files)))
217+
+" node(s).")
218+
for allc_file,output_file in zip(allc_files,output_files):
219+
pool.apply_async(filter_allc_file_worker,
220+
(),
221+
{"allc_file":allc_file,
222+
"output_file":output_file,
223+
"mc_type":mc_type,
224+
"chroms":chroms,
225+
"compress_output":compress_output,
226+
"min_cov":min_cov,
227+
"max_mismatch":max_mismatch,
228+
"max_mismatch_frac":max_mismatch_frac
229+
})
230+
pool.close()
231+
pool.join()
232+
else:
233+
print_checkpoint("Filtering allc files using single node.")
234+
for allc_file,output_file in zip(allc_files,output_files):
235+
filter_allc_file_worker(allc_file=allc_file,
236+
output_file=output_file,
237+
mc_type=mc_type,
238+
chroms=chroms,
239+
compress_output=compress_output,
240+
min_cov=min_cov,
241+
max_mismatch=max_mismatch,
242+
max_mismatch_frac=max_mismatch_frac)
243+
244+
def filter_allc_file_worker(allc_file,
245+
output_file,
246+
mc_type=None,
247+
chroms = None,
248+
compress_output=False,
249+
min_cov=0,
250+
max_mismatch=None,
251+
max_mismatch_frac=None,
252+
buffer_line_number=100000):
253+
254+
if mc_type is not None:
255+
mc_class = expand_nucleotide_code(mc_type)
203256
# input & output
204257
f = open_allc_file(allc_file)
205258
if compress_output:
@@ -214,14 +267,48 @@ def filter_allc_file(allc_file,
214267
for line in f:
215268
fields = line.split("\t")
216269
if (chroms is None or fields[0] in chroms) \
217-
and fields[3] in mc_class \
270+
and (mc_type is None or fields[3] in mc_class) \
218271
and int(fields[5]) >= min_cov:
219-
line_counts += 1
220-
out += line
272+
pass
273+
else:
274+
continue
275+
276+
if max_mismatch is not None or max_mismatch_frac is not None:
277+
try:
278+
matches = map(int,fields[7].split(","))
279+
mismatches = map(int,fields[8].split(","))
280+
except:
281+
print_error("allc file "+allc_file+"does not contain SNP information used "
282+
+"for applying mismatch-based filtering!\n")
283+
pass_snp_filter = True
284+
if max_mismatch is not None:
285+
try:
286+
for index,cutoff in enumerate(max_mismatch):
287+
if mismatches[index] > cutoff:
288+
pass_snp_filter = False
289+
break
290+
except:
291+
pass
292+
if max_mismatch_frac is not None:
293+
try:
294+
for index,cutoff in enumerate(max_mismatch_frac):
295+
cov = mismatches[index] + matches[index]
296+
if cov > 0 and float(mismatches[index])/float(cov) > cutoff:
297+
pass_snp_filter = False
298+
break
299+
except:
300+
pass
301+
if not pass_snp_filter:
302+
continue
303+
304+
line_counts += 1
305+
out += line
306+
# print out once reach buffer limit
221307
if line_counts > buffer_line_number:
222308
output_fhandler.write(out)
223309
line_counts = 0
224310
out = ""
311+
# clear buffer
225312
if line_counts > 0:
226313
output_fhandler.write(out)
227314
out = ""
@@ -382,16 +469,17 @@ def merge_allc_files_worker(allc_files,
382469
fhandles = []
383470
chrom_pointer = []
384471
chroms = set([])
385-
try:
472+
#try:
473+
if True:
386474
for index,allc_file in enumerate(allc_files):
387475
fhandles.append(open_allc_file(allc_file))
388476
chrom_pointer.append(read_allc_index(allc_file))
389477
for chrom in chrom_pointer[index].keys():
390478
chroms.add(chrom)
391-
except:
392-
for f in fhandles:
393-
f.close()
394-
exit() # exit due to failure of openning all allc files at once
479+
# except:
480+
# for f in fhandles:
481+
# f.close()
482+
# exit() # exit due to failure of openning all allc files at once
395483
if query_chroms is not None:
396484
if isinstance(query_chroms,list):
397485
chroms = query_chroms

0 commit comments

Comments
 (0)