Skip to content

Commit

Permalink
unit test fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Thys3Potgieter committed Sep 22, 2023
1 parent ca0328b commit ec1c334
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 9 deletions.
44 changes: 36 additions & 8 deletions bin/python/mq_blast_groups2orfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,39 @@
import pickle
import yaml
import subprocess
from collections import defaultdict

config = yaml.load(open(sys.argv[1]), Loader=yaml.Loader)
output = os.path.abspath( sys.argv[2])

query_fasta = os.path.abspath(config['search_fasta'])
target_fasta = output + '/strains/all_mapped_trans_orfs.fasta'


nr_targets = defaultdict(list)
targets = list(SeqIO.parse(target_fasta, 'fasta'))

for rec in targets:
nr_targets[str(rec.seq)].append(rec.id)

export = []
mapping = {}
count = 1
for seq in nr_targets:
id_ = 'nr_sequence_{}'.format(str(count))
ids = nr_targets[seq]
mapping[id_] = ids
description =';'.join(ids)
seq = Seq(''.join(seq.split('*')))
record = SeqRecord(id = id_, description = description, seq = seq)
export.append(record)
count += 1

new_target_path = output +'/blast/groups2orfs/nr.fasta'
SeqIO.write(export, new_target_path, 'fasta')



txt_path = os.path.abspath(config['mq_txt'])
pg = pd.read_csv(txt_path +'/proteinGroups.txt', sep='\t')
pg_ids = []
Expand All @@ -48,7 +74,7 @@
with open(new_query_path,'w') as w:
SeqIO.write(new_queries, w, 'fasta')

cmd="cp {} {} && cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(target_fasta, newfolder, newfolder, 'all_mapped_trans_orfs.fasta', 'mapped_orfs' )
cmd="cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(newfolder, 'nr.fasta', 'mapped_orfs' )
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
process.wait()
assert process.returncode == 0
Expand Down Expand Up @@ -81,7 +107,8 @@
data = pd.read_csv(out+'.csv')


#data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)]
data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)]


mp = defaultdict(list)

Expand All @@ -90,13 +117,14 @@ def get_mapping(df):
#ids = df['blast_record.query'].split()[0].split(';')
evalue= df['hsp.expect']
for i in ids:
print(i)
i = i.split('|')[1]#.split('.')[0]
#if evalue < 0.0001:
if evalue == 0:
mapped = df['_alignment.entry']
if not mapped in mp[i]:
mp[i].append(mapped)
if evalue < 0.0001:
nr_id = df['alignment.hit_def'].split()[0]
mapped_ids = mapping[nr_id]
for mapped in mapped_ids:

if not mapped in mp[i]:
mp[i].append(mapped.split('|')[1])

data.apply(get_mapping, axis=1)

Expand Down
1 change: 1 addition & 0 deletions lib/mqparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1948,6 +1948,7 @@ def leading_protein_ko(self, df):
ko = rfunc.up2ko(row[1]['Leading Protein'])
df.loc[ ind, 'Leading Protein Kegg Orthology ID'] = ko.ko
df.loc[ ind, 'Leading Protein Kegg Orthology Name'] = ko.name
df.loc[ ind, 'Leading Protein Kegg Pathways'] = ko.pathways
if not ko.ko == '':
_ = ko.ko +' ' + ko.name
else:
Expand Down
13 changes: 12 additions & 1 deletion lib/rfunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,15 +328,20 @@ def __init__(self, up):
res = mg.query(self.up)
self.ids = []
self.names = []
self.pathways=[]
for i in res['hits']:
if 'entrezgene' in i:
symbol = i['entrezgene']
ko = entrez2ko(symbol)
if not ko.ko is None:
self.ids.append(ko.ko)
self.names.append(ko.name)
if not ko.pathways is None:
self.pathways.append(ko.pathways)
self.ko = ';'.join(set(self.ids))
self.name = ';'.join(set(self.names))
self.pathways = ';'.join(set(self.pathways))


class string2ko: # uniprot id
def __init__(self, string):
Expand All @@ -352,7 +357,7 @@ def __init__(self, entrez):
self.entrez = entrez
self.ko = None
self.name = None

self.pathways=None
c = 'library(KEGGREST)';ro.r(c)
c = 'conv <- keggConv("genes", "ncbi-geneid:{}")'.format(self.entrez); ro.r(c)
try:
Expand All @@ -369,6 +374,12 @@ def __init__(self, entrez):
self.name= ';'.join(robjects.r(c))
except:
pass
try:
c = 'names(ko[[1]]$PATHWAY)'
res = robjects.r(c)
self.pathways=';'.join(res)
except:
pass
# keggConv("genes", "uniprot:Q05025"))
# q <- keggGet("cjo:107318960")
# q[[1]]$ORTHOLOGY
Expand Down

0 comments on commit ec1c334

Please sign in to comment.