Skip to content

Commit

Permalink
unit test fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Thys3Potgieter committed Sep 20, 2023
1 parent 936abc2 commit abdc962
Show file tree
Hide file tree
Showing 14 changed files with 566 additions and 227 deletions.
8 changes: 6 additions & 2 deletions MQProteogenomics.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@


# MQProteogenomics

MQProteogenomics is an open-source proteogenomics pipeline for multistrain genome annotation, variant and frameshift detection between strains and relative to a reference strain.
Expand All @@ -10,7 +10,7 @@ MQProteogenomics is an open-source proteogenomics pipeline for multistrain genom
Ensure singularity is installed, and make sure at least 2 cores and 4 GB of RAM are available.

~~~
cd proteomics-pipelines/singularity/mqproteogenomics
cd proteomics-pipelines/singularity/mqproteogenomics
./create_image.sh
~~~

Expand All @@ -33,3 +33,7 @@ mqproteogenomics.sh mq_proteogeomics_test.yml
~~~

Now adjust the config file for our own data.

#### 4. Notes
MaxQuant protein search database entries are mapped to STOP to STOP genome six frame translated sequences for each strain. Alligned ORFs with a evalue of 0 are considered mapped. ORFs are mapped to Reference proteomes using BLAST, with the top hit per proteome selected with an evalue cutoff of 0.001 used for annotation.

60 changes: 37 additions & 23 deletions bin/bash/mqproteogenomics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,28 +50,40 @@ if [ ! -d $outdir/blast ] ; then
mkdir $outdir/blast
fi

################################################
# BLAST or protein group FASTA to SIX FRAME DB #
################################################
# speed up by using an nr database for query fasta
# speed up by using an nr database for target fasta
if [ ! -d $outdir/blast/groups2orfs ] ; then
mkdir $outdir/blast/groups2orfs
mq_blast_groups2orfs.py $config $outdir || ( rm -rf $outdir/blast/groups2orfs ; exit 1 )
fi


if [ ! -d $outdir/blast/orfs2proteins ] ; then
mkdir $outdir/blast/orfs2proteins
mq_blast_orfs2refproteome.py $config $outdir || ( rm -rf $outdir/blast/orfs2proteins ; exit 1 )
fi



if [ ! -d $outdir/blast/orfs2genome ] ; then
mkdir $outdir/blast/orfs2genome
mq_blast_orfs2refgenome.py $config $outdir || ( rm -rf $outdir/blast/orfs2genome ; exit 1 )
fi

# this output is not used for now, peptides are all mapped to the same reference, to facilitate gbrowse on the same reference sequence if it is needed
if [ ! -d $outdir/blast/peptides2genome ] ; then
mkdir $outdir/blast/peptides2genome
mq_blast_peptides2refgenome.py $config $outdir #|| rm -rf $outdir/blast/peptides2genome
fi
#if [ ! -d $outdir/blast/peptides2genome ] ; then
# mkdir $outdir/blast/peptides2genome
# mq_blast_peptides2refgenome.py $config $outdir #|| rm -rf $outdir/blast/peptides2genome
#fi

if [ ! -d $outdir/blast/peptides2orfs ] ; then
mkdir $outdir/blast/peptides2orfs
mq_blast_peptides2reforfs.py $config $outdir || ( rm -rf $outdir/blast/peptides2orfs ; exit 1 )
fi

exit 0
##################
# Operon Mapping #
##################
Expand All @@ -83,25 +95,36 @@ exit 0
##################
# Combined table #
##################

#if [ ! -f $outdir/combined.csv ] ; then
ls $outdir/*combined.csv > /dev/null 2>&1 || mq_peptide_to_protein.py $config $outdir || ( rm -rf $outdir/*combined.csv ; exit 1 )

if [ ! -f $outdir/config.yml ] ; then
echo $outdir/config.yml
mqmetaproteomics.py $config
fi
exit 0
#if [ ! -f $outdir/combined.csv ] ; then
ls $outdir/*combined.csv > /dev/null 2>&1 || mq_peptide_to_protein.py $config $outdir || ( rm -rf $outdir/*combined.csv ; exit 1 )

#############################
# Identification Statistics #
#############################
if [ ! -d $outdir/stats ] ; then
mq_basestats.py $config $outdir || rm -rf $outdir/stats
fi

if [ ! -d $outdir/tables ] ; then
mq_export_tables.py $config $outdir || rm -rf $outdir/tables
fi


if [ ! -d $outdir/jbrowse ] ; then
mkdir $outdir/jbrowse
#mq_strains2ref_peptides.py $config $outdir || ( rm -rf $outdir/jbrowse ; exit 1 )
mq_strains2ref.py $config $outdir || ( rm -rf $outdir/j_ibrowse ; exit 1 )
# not needed #mq_strains2ref_peptides.py $config $outdir || ( rm -rf $outdir/jbrowse ; exit 1 )
mq_strains2ref.py $config $outdir || ( rm -rf $outdir/jbrowse ; exit 1 )
mq_peptide_features.py $config $outdir
fi

mq_jbrowse_upload_script.py $config $outdir # || ( rm -rf $outdir/jbrowse ; exit 1 )
cd $outdir/jbrowse/local && jbrowse admin-server
exit 0
mq_jbrowse_upload_script.py $config $outdir || ( rm -rf $outdir/jbrowse ; exit 1 )

cd $outdir/jbrowse/local && echo $hostname && jbrowse admin-server

#mq_domain_features.py $config $outdir
#mq_contig_heatmaps.py $config $outdir
Expand All @@ -111,16 +134,7 @@ exit 0



#if [ ! -d $outdir/tables ] ; then
# mq_export_tables.py $config $outdir || rm -rf $outdir/tables
#fi

#############################
# Identification Statistics #
#############################
#if [ ! -d $outdir/stats ] ; then
# mq_basestats.py $config $outdir || rm -rf $outdir/stats
#fi


######################
Expand Down
163 changes: 82 additions & 81 deletions bin/python/mq_basestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,92 +11,93 @@
import pickle
import yaml

config = yaml.load(open(sys.argv[1]).read())
config = yaml.load(open(sys.argv[1]).read(), Loader=yaml.Loader)

output = sys.argv[2]

peptides = pd.read_csv(config['mq_txt'] + '/peptides.txt', sep='\t',engine='python')
peptides = peptides[(peptides['Potential contaminant'].notnull()) & (peptides['Reverse'].notnull())]
data = pd.read_csv(output +'/combined.csv')

stats = output +'/stats'
try:
os.mkdir(stats)
except:
shutil.rmtree(stats)
os.mkdir(stats)


def collist(df, col):
peps = set()
for pepset in df[col].dropna().tolist():
for pep in pepset.split('\n'):
peps.add(pep)
return peps

novel_peptides = collist(data,"Specific novel peptides - all strains")

strain_all_peptides = defaultdict(set)

for col in data.columns:
rex = 'All peptides strain '
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = data[col].dropna().tolist()
for pepset in peptides:

for reference in config['reference']:
data = pd.read_csv(output +'/{}_combined.csv'.format(reference), sep='\t')
stats = output +'/stats/{}'.format(reference)
try:
os.makedirs(stats)
except:
shutil.rmtree(stats)
os.makedirs(stats)


def collist(df, col):
peps = set()
for pepset in df[col].dropna().tolist():
for pep in pepset.split('\n'):
strain_all_peptides[strain].add(pep)
st_peptides = strain_all_peptides[strain]
st_novel = st_peptides & novel_peptides

w = open( stats +'/all.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(st_peptides))
w.close()

w = open( stats +'/specific.novel.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(st_novel))
w.close()


rex = 'Specific peptides strain '
if col.startswith(rex):
strain = col.split(rex)[1]
strain_pg = [str(i) for i in list(data[col].dropna().index)]
w = open( stats +'/protein.groups.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(strain_pg))
w.close()

peptides = collist(data, col)
w = open( stats +'/specific.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()
peps.add(pep)
return peps

novel_peptides = collist(data,"Specific novel peptides - all strains")

strain_all_peptides = defaultdict(set)

for col in data.columns:
rex = 'All peptides strain '
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = data[col].dropna().tolist()
for pepset in peptides:
for pep in pepset.split('\n'):
strain_all_peptides[strain].add(pep)
st_peptides = strain_all_peptides[strain]
st_novel = st_peptides & novel_peptides

w = open( stats +'/all.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(st_peptides))
w.close()

w = open( stats +'/specific.novel.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(st_novel))
w.close()


rex = "Exclusive peptides strain "
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = collist(data, col)

w = open( stats +'/exclusive.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()
rex = "Non-genomic peptides strain "
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = collist(data, col)

w = open( stats +'/unmapped.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()

ref = data[data["Reference proteins mapped - all strains"].notnull()]
ref_unmapped = data[~data["Reference proteins mapped - all strains"].notnull()]
#taxon = data[data['_taxon.best.matches'].notnull()]
#taxon_unmapped = data[~data['_taxon.best.matches'].notnull()]

print(len(ref))
print(len(ref_unmapped))

#print(len(taxon))
#print(len(taxon_unmapped))
#print(data.columns.tolist())
rex = 'Specific peptides strain '
if col.startswith(rex):
strain = col.split(rex)[1]
strain_pg = [str(i) for i in list(data[col].dropna().index)]
w = open( stats +'/protein.groups.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(strain_pg))
w.close()

peptides = collist(data, col)
w = open( stats +'/specific.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()

rex = "Exclusive peptides strain "
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = collist(data, col)

w = open( stats +'/exclusive.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()
rex = "Non-genomic peptides strain "
if col.startswith(rex):
strain = col.split(rex)[1]
peptides = collist(data, col)

w = open( stats +'/unmapped.peptides.strain.{}.txt'.format(strain),'w')
w.write('\n'.join(peptides))
w.close()

ref = data[data["Reference proteins mapped - all strains"].notnull()]
ref_unmapped = data[~data["Reference proteins mapped - all strains"].notnull()]
#taxon = data[data['_taxon.best.matches'].notnull()]
#taxon_unmapped = data[~data['_taxon.best.matches'].notnull()]

print(len(ref))
print(len(ref_unmapped))

#print(len(taxon))
#print(len(taxon_unmapped))
#print(data.columns.tolist())

Loading

0 comments on commit abdc962

Please sign in to comment.