-
Notifications
You must be signed in to change notification settings - Fork 0
/
umi_hist.py
82 lines (55 loc) · 1.66 KB
/
umi_hist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
'''
umi_stats.py - Count frequency of UMIs
====================================================
:Author: Ian Sudbery
:Release: $Id$
:Date: |today|
:Tags: Python
Purpose
-------
Takes a BAM file produced from mapping a FASTQ created by
extract_umi.py and produces a histogram of their usage.
Options
-------
Will read bamfile off of the stdin or from -I, but
BAM file must be indexed and so cannot be manipulated on
stdin.
Usage
-----
python umi_stats.py -I [BAMFILE]
Command line options
--------------------
'''
import sys
import collections
import CGAT.Experiment as E
import pysam
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
out_hist = collections.defaultdict(int)
if options.stdin == sys.stdin:
in_bam = pysam.Samfile("-", "rb")
else:
fn = options.stdin.name
options.stdin.close()
in_bam = pysam.Samfile(fn, "rb")
for read in in_bam.fetch():
barcode = read.qname.split("_")[-1]
out_hist[barcode] += 1
header = "\t".join(["UMI", "Count"])
outlines = ["\t".join(map(str, x)) for x in out_hist.iteritems()]
outlines = header + "\n" + "\n".join(outlines) + "\n"
options.stdout.write(outlines)
# write footer and output benchmark information.
E.Stop()
if __name__ == "__main__":
sys.exit(main(sys.argv))