unit test fixes

uct-cbio · Sep 20, 2023 · abdc962 · abdc962
1 parent 936abc2
commit abdc962
Show file tree

Hide file tree

Showing 14 changed files with 566 additions and 227 deletions.
diff --git a/MQProteogenomics.md b/MQProteogenomics.md
@@ -1,4 +1,4 @@
-
+
 # MQProteogenomics
 
 MQProteogenomics is an open-source proteogenomics pipeline for multistrain genome annotation, variant and frameshift detection between strains and relative to a reference strain.
@@ -10,7 +10,7 @@ MQProteogenomics is an open-source proteogenomics pipeline for multistrain genom
 Ensure singularity is installed, and make sure at least 2 cores and 4 GB of RAM are available.
 
 ~~~
-cd proteomics-pipelines/singularity/mqproteogenomics
+cd proteomics-pipelines/singularity/mqproteogenomics
 ./create_image.sh
 ~~~
 
@@ -33,3 +33,7 @@ mqproteogenomics.sh mq_proteogeomics_test.yml
 ~~~
 
 Now adjust the config file for our own data.
+
+#### 4. Notes
+MaxQuant protein search database entries are mapped to STOP to STOP genome six frame translated sequences for each strain. Alligned ORFs with a evalue of 0 are considered mapped. ORFs are mapped to Reference proteomes using BLAST, with the top hit per proteome selected with an evalue cutoff of 0.001 used for annotation.
+
diff --git a/bin/bash/mqproteogenomics.sh b/bin/bash/mqproteogenomics.sh
@@ -50,28 +50,40 @@ if [ ! -d $outdir/blast ] ; then
     mkdir $outdir/blast
 fi
 
+################################################ 
+# BLAST or protein group FASTA to SIX FRAME DB #
+################################################
+# speed up by using an nr database for query fasta
+# speed up by using an nr database for target fasta
+if [ ! -d $outdir/blast/groups2orfs ] ; then
+    mkdir $outdir/blast/groups2orfs
+    mq_blast_groups2orfs.py $config $outdir  || ( rm -rf $outdir/blast/groups2orfs ; exit 1 )
+fi
+
+
 if [ ! -d $outdir/blast/orfs2proteins ] ; then
     mkdir $outdir/blast/orfs2proteins
     mq_blast_orfs2refproteome.py $config $outdir  || ( rm -rf $outdir/blast/orfs2proteins ; exit 1 )
 fi
 
+
+
 if [ ! -d $outdir/blast/orfs2genome ] ; then
     mkdir $outdir/blast/orfs2genome
     mq_blast_orfs2refgenome.py $config $outdir  || ( rm -rf $outdir/blast/orfs2genome ; exit 1 )
 fi
 
 # this output is not used for now, peptides are all mapped to the same reference, to facilitate gbrowse on the same reference sequence if it is needed
-if [ ! -d $outdir/blast/peptides2genome ] ; then
-    mkdir $outdir/blast/peptides2genome
-    mq_blast_peptides2refgenome.py $config $outdir  #|| rm -rf $outdir/blast/peptides2genome
-fi
+#if [ ! -d $outdir/blast/peptides2genome ] ; then
+#    mkdir $outdir/blast/peptides2genome
+#    mq_blast_peptides2refgenome.py $config $outdir  #|| rm -rf $outdir/blast/peptides2genome
+#fi
 
 if [ ! -d $outdir/blast/peptides2orfs ] ; then
     mkdir $outdir/blast/peptides2orfs
     mq_blast_peptides2reforfs.py $config $outdir  || ( rm -rf $outdir/blast/peptides2orfs ; exit 1 )
 fi
 
-exit 0
 ##################
 # Operon Mapping #
 ##################
@@ -83,25 +95,36 @@ exit 0
 ##################
 # Combined table #
 ##################
-
-#if [ ! -f $outdir/combined.csv ] ; then
-ls $outdir/*combined.csv > /dev/null 2>&1  || mq_peptide_to_protein.py $config $outdir || ( rm -rf $outdir/*combined.csv ; exit 1 )
-
 if [ ! -f $outdir/config.yml ] ; then
     echo $outdir/config.yml
     mqmetaproteomics.py $config
 fi
+exit 0
+#if [ ! -f $outdir/combined.csv ] ; then
+ls $outdir/*combined.csv > /dev/null 2>&1  || mq_peptide_to_protein.py $config $outdir || ( rm -rf $outdir/*combined.csv ; exit 1 )
+
+#############################
+# Identification Statistics #
+#############################
+if [ ! -d $outdir/stats ] ; then
+    mq_basestats.py $config $outdir || rm -rf $outdir/stats
+fi
+
+if [ ! -d $outdir/tables ] ;  then      
+    mq_export_tables.py $config $outdir || rm -rf $outdir/tables
+fi
+
 
 if [ ! -d $outdir/jbrowse ] ; then
     mkdir $outdir/jbrowse
-    #mq_strains2ref_peptides.py $config $outdir  || ( rm -rf $outdir/jbrowse ; exit 1 )
-    mq_strains2ref.py $config $outdir      || ( rm -rf $outdir/j_ibrowse ; exit 1 )
+    # not needed  #mq_strains2ref_peptides.py $config $outdir  || ( rm -rf $outdir/jbrowse ; exit 1 )
+    mq_strains2ref.py $config $outdir      || ( rm -rf $outdir/jbrowse ; exit 1 )
     mq_peptide_features.py $config $outdir
 fi
 
-mq_jbrowse_upload_script.py $config $outdir # || ( rm -rf $outdir/jbrowse ; exit 1 )
-cd $outdir/jbrowse/local && jbrowse admin-server
-exit 0
+mq_jbrowse_upload_script.py $config $outdir || ( rm -rf $outdir/jbrowse ; exit 1 )
+
+cd $outdir/jbrowse/local && echo $hostname && jbrowse admin-server
 
 #mq_domain_features.py $config $outdir
 #mq_contig_heatmaps.py $config $outdir
@@ -111,16 +134,7 @@ exit 0
 
 
 
-#if [ ! -d $outdir/tables ] ;  then      
-#    mq_export_tables.py $config $outdir || rm -rf $outdir/tables
-#fi
 
-#############################
-# Identification Statistics #
-#############################
-#if [ ! -d $outdir/stats ] ; then
-#    mq_basestats.py $config $outdir || rm -rf $outdir/stats
-#fi
 
 
 ###################### 

diff --git a/bin/python/mq_basestats.py b/bin/python/mq_basestats.py
@@ -11,92 +11,93 @@
 import pickle
 import yaml
 
-config = yaml.load(open(sys.argv[1]).read())
+config = yaml.load(open(sys.argv[1]).read(),  Loader=yaml.Loader)
 
 output = sys.argv[2]
 
 peptides = pd.read_csv(config['mq_txt'] + '/peptides.txt', sep='\t',engine='python')
 peptides = peptides[(peptides['Potential contaminant'].notnull()) & (peptides['Reverse'].notnull())]
-data = pd.read_csv(output +'/combined.csv')
-
-stats = output +'/stats'
-try:
-    os.mkdir(stats)
-except:
-    shutil.rmtree(stats)
-    os.mkdir(stats)
-
-
-def collist(df, col):
-    peps = set()
-    for pepset in df[col].dropna().tolist():
-        for pep in pepset.split('\n'):
-            peps.add(pep)
-    return peps
-
-novel_peptides = collist(data,"Specific novel peptides - all strains")
-
-strain_all_peptides = defaultdict(set)
-
-for col in data.columns:
-    rex = 'All peptides strain '
-    if col.startswith(rex):
-        strain = col.split(rex)[1]
-        peptides = data[col].dropna().tolist()
-        for pepset in peptides:
+
+for reference in config['reference']:
+    data = pd.read_csv(output +'/{}_combined.csv'.format(reference), sep='\t')
+    stats = output +'/stats/{}'.format(reference)
+    try:
+        os.makedirs(stats)
+    except:
+        shutil.rmtree(stats)
+        os.makedirs(stats)
+
+
+    def collist(df, col):
+        peps = set()
+        for pepset in df[col].dropna().tolist():
             for pep in pepset.split('\n'):
-                strain_all_peptides[strain].add(pep)
-        st_peptides = strain_all_peptides[strain]
-        st_novel =  st_peptides & novel_peptides
-
-        w = open( stats +'/all.peptides.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(st_peptides))
-        w.close()
-
-        w = open( stats +'/specific.novel.peptides.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(st_novel))
-        w.close()
-
-
-    rex = 'Specific peptides strain '
-    if col.startswith(rex):
-        strain = col.split(rex)[1]
-        strain_pg = [str(i) for i in list(data[col].dropna().index)] 
-        w = open( stats +'/protein.groups.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(strain_pg))
-        w.close()
-
-        peptides = collist(data, col)
-        w = open( stats +'/specific.peptides.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(peptides))
-        w.close()
+                peps.add(pep)
+        return peps
+
+    novel_peptides = collist(data,"Specific novel peptides - all strains")
+
+    strain_all_peptides = defaultdict(set)
+
+    for col in data.columns:
+        rex = 'All peptides strain '
+        if col.startswith(rex):
+            strain = col.split(rex)[1]
+            peptides = data[col].dropna().tolist()
+            for pepset in peptides:
+                for pep in pepset.split('\n'):
+                    strain_all_peptides[strain].add(pep)
+            st_peptides = strain_all_peptides[strain]
+            st_novel =  st_peptides & novel_peptides
+
+            w = open( stats +'/all.peptides.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(st_peptides))
+            w.close()
+
+            w = open( stats +'/specific.novel.peptides.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(st_novel))
+            w.close()
+
 
-    rex = "Exclusive peptides strain "
-    if col.startswith(rex):
-        strain = col.split(rex)[1]
-        peptides = collist(data, col)
-
-        w = open( stats +'/exclusive.peptides.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(peptides))
-        w.close()
-    rex = "Non-genomic peptides strain " 
-    if col.startswith(rex):
-        strain = col.split(rex)[1]
-        peptides = collist(data, col)
-
-        w = open( stats +'/unmapped.peptides.strain.{}.txt'.format(strain),'w')
-        w.write('\n'.join(peptides))
-        w.close()
-
-ref = data[data["Reference proteins mapped - all strains"].notnull()]
-ref_unmapped = data[~data["Reference proteins mapped - all strains"].notnull()]
-#taxon = data[data['_taxon.best.matches'].notnull()]
-#taxon_unmapped = data[~data['_taxon.best.matches'].notnull()]
-
-print(len(ref))
-print(len(ref_unmapped))
-
-#print(len(taxon))
-#print(len(taxon_unmapped))
-#print(data.columns.tolist())
+        rex = 'Specific peptides strain '
+        if col.startswith(rex):
+            strain = col.split(rex)[1]
+            strain_pg = [str(i) for i in list(data[col].dropna().index)] 
+            w = open( stats +'/protein.groups.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(strain_pg))
+            w.close()
+
+            peptides = collist(data, col)
+            w = open( stats +'/specific.peptides.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(peptides))
+            w.close()
+
+        rex = "Exclusive peptides strain "
+        if col.startswith(rex):
+            strain = col.split(rex)[1]
+            peptides = collist(data, col)
+
+            w = open( stats +'/exclusive.peptides.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(peptides))
+            w.close()
+        rex = "Non-genomic peptides strain " 
+        if col.startswith(rex):
+            strain = col.split(rex)[1]
+            peptides = collist(data, col)
+
+            w = open( stats +'/unmapped.peptides.strain.{}.txt'.format(strain),'w')
+            w.write('\n'.join(peptides))
+            w.close()
+
+    ref = data[data["Reference proteins mapped - all strains"].notnull()]
+    ref_unmapped = data[~data["Reference proteins mapped - all strains"].notnull()]
+    #taxon = data[data['_taxon.best.matches'].notnull()]
+    #taxon_unmapped = data[~data['_taxon.best.matches'].notnull()]
+
+    print(len(ref))
+    print(len(ref_unmapped))
+
+    #print(len(taxon))
+    #print(len(taxon_unmapped))
+    #print(data.columns.tolist())