From ec1c33400171f6b3e8c74f3333ef006ac9e3e6d7 Mon Sep 17 00:00:00 2001 From: Matthys Potgieter Date: Fri, 22 Sep 2023 14:45:05 +0200 Subject: [PATCH] unit test fixes --- bin/python/mq_blast_groups2orfs.py | 44 ++++++++++++++++++++++++------ lib/mqparse.py | 1 + lib/rfunc.py | 13 ++++++++- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/bin/python/mq_blast_groups2orfs.py b/bin/python/mq_blast_groups2orfs.py index 38c9554..95e3612 100755 --- a/bin/python/mq_blast_groups2orfs.py +++ b/bin/python/mq_blast_groups2orfs.py @@ -16,6 +16,7 @@ import pickle import yaml import subprocess +from collections import defaultdict config = yaml.load(open(sys.argv[1]), Loader=yaml.Loader) output = os.path.abspath( sys.argv[2]) @@ -23,6 +24,31 @@ query_fasta = os.path.abspath(config['search_fasta']) target_fasta = output + '/strains/all_mapped_trans_orfs.fasta' + +nr_targets = defaultdict(list) +targets = list(SeqIO.parse(target_fasta, 'fasta')) + +for rec in targets: + nr_targets[str(rec.seq)].append(rec.id) + +export = [] +mapping = {} +count = 1 +for seq in nr_targets: + id_ = 'nr_sequence_{}'.format(str(count)) + ids = nr_targets[seq] + mapping[id_] = ids + description =';'.join(ids) + seq = Seq(''.join(seq.split('*'))) + record = SeqRecord(id = id_, description = description, seq = seq) + export.append(record) + count += 1 + +new_target_path = output +'/blast/groups2orfs/nr.fasta' +SeqIO.write(export, new_target_path, 'fasta') + + + txt_path = os.path.abspath(config['mq_txt']) pg = pd.read_csv(txt_path +'/proteinGroups.txt', sep='\t') pg_ids = [] @@ -48,7 +74,7 @@ with open(new_query_path,'w') as w: SeqIO.write(new_queries, w, 'fasta') -cmd="cp {} {} && cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(target_fasta, newfolder, newfolder, 'all_mapped_trans_orfs.fasta', 'mapped_orfs' ) +cmd="cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(newfolder, 'nr.fasta', 'mapped_orfs' ) process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) process.wait() assert process.returncode == 0 @@ -81,7 +107,8 @@ data = pd.read_csv(out+'.csv') -#data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)] +data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)] + mp = defaultdict(list) @@ -90,13 +117,14 @@ def get_mapping(df): #ids = df['blast_record.query'].split()[0].split(';') evalue= df['hsp.expect'] for i in ids: - print(i) i = i.split('|')[1]#.split('.')[0] - #if evalue < 0.0001: - if evalue == 0: - mapped = df['_alignment.entry'] - if not mapped in mp[i]: - mp[i].append(mapped) + if evalue < 0.0001: + nr_id = df['alignment.hit_def'].split()[0] + mapped_ids = mapping[nr_id] + for mapped in mapped_ids: + + if not mapped in mp[i]: + mp[i].append(mapped.split('|')[1]) data.apply(get_mapping, axis=1) diff --git a/lib/mqparse.py b/lib/mqparse.py index 07f7143..76f1741 100644 --- a/lib/mqparse.py +++ b/lib/mqparse.py @@ -1948,6 +1948,7 @@ def leading_protein_ko(self, df): ko = rfunc.up2ko(row[1]['Leading Protein']) df.loc[ ind, 'Leading Protein Kegg Orthology ID'] = ko.ko df.loc[ ind, 'Leading Protein Kegg Orthology Name'] = ko.name + df.loc[ ind, 'Leading Protein Kegg Pathways'] = ko.pathways if not ko.ko == '': _ = ko.ko +' ' + ko.name else: diff --git a/lib/rfunc.py b/lib/rfunc.py index eda8adc..bd0d0ca 100644 --- a/lib/rfunc.py +++ b/lib/rfunc.py @@ -328,6 +328,7 @@ def __init__(self, up): res = mg.query(self.up) self.ids = [] self.names = [] + self.pathways=[] for i in res['hits']: if 'entrezgene' in i: symbol = i['entrezgene'] @@ -335,8 +336,12 @@ def __init__(self, up): if not ko.ko is None: self.ids.append(ko.ko) self.names.append(ko.name) + if not ko.pathways is None: + self.pathways.append(ko.pathways) self.ko = ';'.join(set(self.ids)) self.name = ';'.join(set(self.names)) + self.pathways = ';'.join(set(self.pathways)) + class string2ko: # uniprot id def __init__(self, string): @@ -352,7 +357,7 @@ def __init__(self, entrez): self.entrez = entrez self.ko = None self.name = None - + self.pathways=None c = 'library(KEGGREST)';ro.r(c) c = 'conv <- keggConv("genes", "ncbi-geneid:{}")'.format(self.entrez); ro.r(c) try: @@ -369,6 +374,12 @@ def __init__(self, entrez): self.name= ';'.join(robjects.r(c)) except: pass + try: + c = 'names(ko[[1]]$PATHWAY)' + res = robjects.r(c) + self.pathways=';'.join(res) + except: + pass # keggConv("genes", "uniprot:Q05025")) # q <- keggGet("cjo:107318960") # q[[1]]$ORTHOLOGY