forked from saketkc/gencode_regions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_lincRNA.py
executable file
·29 lines (25 loc) · 1.05 KB
/
extract_lincRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
'''
Extract lincRNA coordinates from GTF
'''
import sys
import GTF
import numpy as np
import pandas as pd
def _write_lincrna_bed(gc, feature, type_column, out_path):
    '''
    Write the lincRNA rows of one feature level to a tab-separated file.

    Rows of ``gc`` are kept when ``gc['feature'] == feature`` and
    ``gc[type_column] == 'lincRNA'``. The columns seqname, start, end,
    gene_id, gene_name and strand are written (in that order, no header,
    no index) to ``out_path`` after sorting by seqname, start, end.
    '''
    columns = ['seqname', 'start', 'end', 'gene_id', 'gene_name', 'strand']
    mask = (gc['feature'] == feature) & (gc[type_column] == 'lincRNA')
    # .loc replaces the removed DataFrame.ix indexer; .copy() makes the
    # selection independent of `gc` so the column assignments below do not
    # act on a view (SettingWithCopyWarning).
    selected = gc.loc[mask, columns].copy()
    # Cast coordinates to int so the sort below is numeric.
    # NOTE(review): coordinates are written exactly as parsed, with no
    # conversion between coordinate conventions -- confirm this is what
    # downstream consumers of the .bed files expect.
    selected['start'] = selected['start'].astype(int)
    selected['end'] = selected['end'].astype(int)
    selected = selected.sort_values(by=['seqname', 'start', 'end'])
    selected.to_csv(out_path, sep='\t', header=False, index=False)


def main(GENCODE):
    '''
    Extract lincRNA transcript and gene coordinates from an annotation.

    GENCODE is passed unchanged to ``GTF.dataframe`` (presumably the path
    to a GENCODE GTF file -- confirm against the GTF module).

    Two files are written to the current working directory:
      * lincRNA.bed       -- rows with feature 'transcript' and
                             transcript_type 'lincRNA'
      * lincRNA_genes.bed -- rows with feature 'gene' and
                             gene_type 'lincRNA'
    '''
    gc = GTF.dataframe(GENCODE)
    # Remove every '.<digits>' run from gene_id (e.g. a trailing version
    # suffix) so the IDs are written without it.
    gc['gene_id'] = gc['gene_id'].replace(
        to_replace=r'\.[0-9]+', value='', regex=True)

    _write_lincrna_bed(gc, 'transcript', 'transcript_type', 'lincRNA.bed')
    _write_lincrna_bed(gc, 'gene', 'gene_type', 'lincRNA_genes.bed')
if __name__ == '__main__':
    # Fail with a readable usage message instead of an IndexError traceback
    # when the annotation argument is missing. Extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <gencode.gtf>'.format(sys.argv[0]))
    main(sys.argv[1])