-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_read_dist.py
60 lines (37 loc) · 1.25 KB
/
gen_read_dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
'''Computes a fragment length distibution and outputs it in the same format
as salmon's flendist.txt.
Will only used non-multimapped reads
Usage:
python flendist.py BAMFILE
distribution comes on stdout. Note that it will be mixed with the log unless
you redirect one of log (using -L) or stdout (using -S).
'''
from cgatcore import experiment as E
from collections import defaultdict
import pysam
parser = E.OptionParser(usage = globals()["__doc__"])
options, args = E.start(parser)
bamfile = pysam.AlignmentFile(args[0])
lengthdist = [0.0]*1001
total = 0
all_total = 0
E.info("Starting Run")
for read in bamfile.fetch(until_eof=True):
all_total += 1
if all_total %1000000 == 0:
E.debug("Done %s reads; %s used" % (all_total, total))
if read.is_unmapped:
continue
if read.has_tag("NH") and read.get_tag("NH") != 1:
continue
if read.is_read2:
continue
flen = read.template_length
total += 1
if abs(flen) <= 1000:
lengthdist[abs(flen)] += 1.0
bamfile.close()
lengthdist = [f/total for f in lengthdist]
options.stdout.write("\t".join(map(str,lengthdist)))
E.info("Used %s reads out of %s to built distribution. Total weight = %s " % (total, all_total, sum(lengthdist)))
E.stop()