-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline_sylamer.py
117 lines (90 loc) · 2.6 KB
/
pipeline_sylamer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""===========================
pipeline_sylamer.py
===========================
Overview
========
This pipeline runs sylamer on an input fasta file for a list of given miRNA seeds.
It is configured through the files :file:`pipeline.yml` and :file:`conf.py`.
Usage
=====
See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general
information how to use CGAT pipelines.
Configuration
-------------
The pipeline requires a configured :file:`pipeline.yml` file.
CGATReport report requires a :file:`conf.py` and optionally a
:file:`cgatreport.ini` file (see :ref:`PipelineReporting`).
Default configuration files can be generated by executing:
python <srcdir>/pipeline_sylamer.py config
Input files
-----------
- fasta file of 3'UTRs of mRNAs to look for miRNA seeds
- list of miRNA seeds to look for
- ranked list for each mRNA present in the fasta file
Requirements
------------
On top of the default CGAT setup, the pipeline requires the following
software to be installed and in the path:
- GSL library and sylamer
Pipeline output
===============
- :file:`seeds.dir`, holding one file per seed (the input seed list is split before sylamer is run on each seed)
- one sylamer result file (:file:`sylamer_seed<suffix>`) for each miRNA seed present in the input list of seeds
Code
====
"""
import sys
import os
import sqlite3
from cgatcore import pipeline as P
import cgat.GTF as GTF
import cgatcore.iotools as IOTools
from ruffus import *
# Load parameters from the pipeline configuration file
PARAMS = P.get_parameters("pipeline.yml")
# NOTE(review): the output glob below is anchored at the directory of the
# seeds file ({path[0]}), while the split prefix used in the function body is
# relative to the working directory. They presumably only coincide when the
# seeds file lives in the working directory -- confirm.
@follows(mkdir("seeds.dir"))
@subdivide(PARAMS["seeds"],
           formatter(),
           "{path[0]}/seeds.dir/seed.*")
def splitSeeds(infile, outfiles):
    '''Split the seed list into one file per seed to feed sylamer.

    Parameters
    ----------
    infile : str
        Seed list file (``PARAMS["seeds"]``); split line by line.
    outfiles : list
        Supplied by the ``@subdivide`` decorator; not used in the body,
        the output names are generated by ``split`` itself.
    '''
    # Not referenced in this function; presumably picked up by P.run from
    # the local scope as job resource options -- confirm against cgatcore.
    job_memory = "8G"
    job_threads = 4

    # Prefix handed to `split`; the numeric suffix is appended to it.
    outfile = "seeds.dir/seed."

    # The %(name)s placeholders are presumably filled in by P.run from the
    # local variables of this function, so `infile` and `outfile` must keep
    # these exact names.
    statement = """
    split %(infile)s %(outfile)s -l 1 -d --numeric-suffixes=1
    """
    P.run(statement)
@transform(splitSeeds,
           regex("seeds.dir/seed.(.+)"),
           r"sylamer_seed\1")
def runSylamer(infile, outfile):
    '''Run sylamer for the single seed stored in ``infile``.

    Parameters
    ----------
    infile : str
        One of the per-seed files produced by ``splitSeeds``; its first
        line is taken as the seed.
    outfile : str
        Path of the sylamer output; the log is written to
        ``outfile + ".log"``.

    Raises
    ------
    ValueError
        If the first line of ``infile`` is empty or only whitespace.
    '''
    # Not referenced in this function; presumably picked up by P.run from
    # the local scope as job resource options -- confirm against cgatcore.
    job_memory = "8G"
    job_threads = 4

    fasta_file = PARAMS["fasta"]
    ranks = PARAMS["ranks"]
    minim = PARAMS["sylamer_min"]

    # Read the seed through a context manager so the handle is closed, and
    # strip all surrounding whitespace so a stray "\r" or blank does not
    # inflate the word length passed to -k.
    with open(infile, "r") as handle:
        seed = handle.readline().strip()
    if not seed:
        raise ValueError("seed file %s does not contain a seed" % infile)
    length_seed = len(seed)

    log = outfile + ".log"

    # The %(name)s placeholders are presumably filled in by P.run from the
    # local variables of this function, so the names above must stay as is.
    # The log option previously read "%(log)sc", which appended a literal
    # "c" to the log file name; it now points at `log` itself.
    statement = """
    sylamer -fasta %(fasta_file)s
    -universe %(ranks)s
    -k %(length_seed)s
    -m %(minim)s
    -w %(seed)s
    -grow 100
    -oob 400
    -v
    -log=%(log)s
    -o=%(outfile)s
    """
    P.run(statement)
@follows(runSylamer)
def full():
    '''Final target: does no work itself, only depends on runSylamer.'''
# Hand control to the cgatcore pipeline entry point. This call is made
# unconditionally at module level (there is no __main__ guard), so it also
# runs when the module is imported.
P.main()