# pipeline_sylamer.py

"""===========================
pipeline_sylamer.py
===========================

Overview
========

This pipeline runs sylamer on an input fasta file for a given list of miRNA seeds.

Configuration files: :file:`pipeline.yml` and :file:`conf.py`.

Usage
=====

See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general
information how to use CGAT pipelines.

Configuration
-------------

The pipeline requires a configured :file:`pipeline.yml` file.
CGATReport report requires a :file:`conf.py` and optionally a
:file:`cgatreport.ini` file (see :ref:`PipelineReporting`).

Default configuration files can be generated by executing:

   python <srcdir>/pipeline_sylamer.py config

Input files
-----------

- fasta file of 3'UTRs of mRNAs to look for miRNA seeds
- list of miRNA seeds to look for 
- ranked list for each mRNA present in the fasta file

Requirements
------------

On top of the default CGAT setup, the pipeline requires the following
software to be installed and in the path:
    - GSL library and sylamer

Pipeline output
===============

- :file:`seeds.dir`, holding one file per seed, created before running sylamer on each
- one sylamer output (:file:`sylamer_seed<N>`) for each miRNA seed present in the input list of seeds

Code
====

"""


import sys
import os
import sqlite3

from cgatcore import pipeline as P
import cgat.GTF as GTF
import cgatcore.iotools as IOTools
from ruffus import *

# Load the pipeline parameters from pipeline.yml; the tasks below read the
# "seeds", "fasta", "ranks" and "sylamer_min" entries from this mapping.
PARAMS = P.get_parameters("pipeline.yml")

@follows(mkdir("seeds.dir"))
@subdivide(PARAMS["seeds"],
           formatter(),
           "{path[0]}/seeds.dir/seed.*")
def splitSeeds(infile, outfiles):
    '''Split the seeds list into one file per seed to feed sylamer.

    Parameters
    ----------
    infile : str
        The seeds list, taken from the ``seeds`` entry of PARAMS.
    outfiles
        Outputs matched by the decorator glob. Unused in the body: the
        file names are generated by ``split`` from a fixed prefix.
    '''
    # Resource requests for the job; these locals are not referenced in
    # the statement and are presumably read by P.run from this frame, the
    # same way the %(...)s placeholders are filled in.
    job_memory = "8G"
    job_threads = 4

    # Prefix handed to split; it appends a numeric suffix to each chunk.
    # NOTE(review): the decorator glob is anchored at the directory of the
    # seeds file ({path[0]}) while this prefix is relative to the working
    # directory - confirm the seeds file always lives in the working
    # directory, otherwise the glob will not see these files.
    outfile = "seeds.dir/seed."

    # -l 1: one line (one seed) per output file.
    # -d --numeric-suffixes=1: numeric suffixes, counting from 1.
    statement = """
    split %(infile)s %(outfile)s -l 1 -d --numeric-suffixes=1
    """
    P.run(statement)


@transform(splitSeeds,
           regex("seeds.dir/seed.(.+)"),
           r"sylamer_seed\1")
def runSylamer(infile, outfile):
    '''Run sylamer for the single seed stored in ``infile``.

    Parameters
    ----------
    infile : str
        One of the per-seed files produced by splitSeeds; only its first
        line is read.
    outfile : str
        Name passed to sylamer's ``-o`` option; the log file name is
        derived from it.

    Raises
    ------
    ValueError
        If the first line of ``infile`` is missing or blank, since sylamer
        would otherwise be started with an empty word and ``-k 0``.
    '''
    # Resource requests for the job; these locals are not referenced in
    # the statement and are presumably read by P.run from this frame, the
    # same way the %(...)s placeholders are filled in.
    job_memory = "8G"
    job_threads = 4

    fasta_file = PARAMS["fasta"]
    ranks = PARAMS["ranks"]
    minim = PARAMS["sylamer_min"]

    # Read only the first line and close the handle deterministically.
    # Surrounding whitespace (including a stray carriage return) is
    # stripped so that it cannot inflate the length passed to -k.
    with open(infile, "r") as handle:
        seed = handle.readline().strip()
    if not seed:
        raise ValueError("no seed found in first line of %s" % infile)

    length_seed = len(seed)
    log = outfile + ".log"

    # NOTE(review): the literal "c" after %(log)s makes the log file name
    # end in ".logc" - confirm this is intended and not a typo.
    statement = """
    sylamer -fasta %(fasta_file)s 
            -universe %(ranks)s
            -k %(length_seed)s
            -m %(minim)s
            -w %(seed)s
            -grow 100
            -oob 400
            -v
            -log=%(log)sc
            -o=%(outfile)s
    """
    P.run(statement)

@follows(runSylamer)
def full():
    '''Terminal target: does nothing itself, only depends on runSylamer.'''

# Hand control to the cgatcore pipeline entry point.
# NOTE(review): this call is unguarded, so it also runs when the module is
# imported - consider wrapping it in `if __name__ == "__main__":`.
P.main()