From ec1c33400171f6b3e8c74f3333ef006ac9e3e6d7 Mon Sep 17 00:00:00 2001
From: Matthys Potgieter <matthys.potgieter@gmail.com>
Date: Fri, 22 Sep 2023 14:45:05 +0200
Subject: [PATCH] unit test fixes

---
 bin/python/mq_blast_groups2orfs.py | 44 ++++++++++++++++++++++++------
 lib/mqparse.py                     |  1 +
 lib/rfunc.py                       | 13 ++++++++-
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/bin/python/mq_blast_groups2orfs.py b/bin/python/mq_blast_groups2orfs.py
index 38c9554..95e3612 100755
--- a/bin/python/mq_blast_groups2orfs.py
+++ b/bin/python/mq_blast_groups2orfs.py
@@ -16,6 +16,7 @@
 import pickle
 import yaml
 import subprocess
+from collections import defaultdict
 
 config = yaml.load(open(sys.argv[1]), Loader=yaml.Loader)
 output = os.path.abspath( sys.argv[2])
@@ -23,6 +24,31 @@
 query_fasta  = os.path.abspath(config['search_fasta'])
 target_fasta = output + '/strains/all_mapped_trans_orfs.fasta'
 
+
+nr_targets = defaultdict(list)
+targets = list(SeqIO.parse(target_fasta, 'fasta'))
+
+for rec in targets:
+    nr_targets[str(rec.seq)].append(rec.id)
+
+export = []
+mapping = {}
+count = 1
+for seq in nr_targets:
+    id_ = 'nr_sequence_{}'.format(str(count)) 
+    ids = nr_targets[seq] 
+    mapping[id_] = ids
+    description =';'.join(ids) 
+    seq = Seq(''.join(seq.split('*'))) 
+    record = SeqRecord(id = id_, description = description, seq = seq)
+    export.append(record)   
+    count += 1
+
+new_target_path = output +'/blast/groups2orfs/nr.fasta'
+SeqIO.write(export, new_target_path, 'fasta') 
+
+
+
 txt_path = os.path.abspath(config['mq_txt'])
 pg = pd.read_csv(txt_path +'/proteinGroups.txt', sep='\t')
 pg_ids = []
@@ -48,7 +74,7 @@
 with open(new_query_path,'w') as w:
     SeqIO.write(new_queries, w, 'fasta')
 
-cmd="cp {} {} && cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(target_fasta, newfolder, newfolder, 'all_mapped_trans_orfs.fasta', 'mapped_orfs' )
+cmd="cd {} && makeblastdb -in {} -dbtype 'prot' -out {}".format(newfolder, 'nr.fasta', 'mapped_orfs' )
 process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
 process.wait()
 assert process.returncode == 0
@@ -81,7 +107,8 @@
 data = pd.read_csv(out+'.csv')
 
 
-#data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)]
+data = data[(data['_alignment_rank']==1) & (data['_hsp_rank']==1)]
+
 
 mp = defaultdict(list)
 
@@ -90,13 +117,14 @@ def get_mapping(df):
     #ids = df['blast_record.query'].split()[0].split(';')
     evalue= df['hsp.expect']
     for i in ids:
-        print(i)
         i = i.split('|')[1]#.split('.')[0]
-        #if evalue < 0.0001:
-        if evalue == 0:
-            mapped = df['_alignment.entry']
-            if not mapped in mp[i]:
-                mp[i].append(mapped)
+        if evalue < 0.0001:
+            nr_id = df['alignment.hit_def'].split()[0]
+            mapped_ids = mapping[nr_id]
+            for mapped in mapped_ids:
+               
+                if not mapped in mp[i]:
+                    mp[i].append(mapped.split('|')[1])
 
 data.apply(get_mapping, axis=1)
 
diff --git a/lib/mqparse.py b/lib/mqparse.py
index 07f7143..76f1741 100644
--- a/lib/mqparse.py
+++ b/lib/mqparse.py
@@ -1948,6 +1948,7 @@ def leading_protein_ko(self, df):
             ko = rfunc.up2ko(row[1]['Leading Protein'])
             df.loc[ ind, 'Leading Protein Kegg Orthology ID'] = ko.ko
             df.loc[ ind, 'Leading Protein Kegg Orthology Name'] = ko.name
+            df.loc[ ind, 'Leading Protein Kegg Pathways'] = ko.pathways
             if not ko.ko == '':
                 _ = ko.ko +' ' + ko.name 
             else:
diff --git a/lib/rfunc.py b/lib/rfunc.py
index eda8adc..bd0d0ca 100644
--- a/lib/rfunc.py
+++ b/lib/rfunc.py
@@ -328,6 +328,7 @@ def __init__(self, up):
         res = mg.query(self.up)
         self.ids = []
         self.names = []
+        self.pathways=[]
         for i in res['hits']:
             if 'entrezgene' in i:
                 symbol = i['entrezgene']
@@ -335,8 +336,12 @@ def __init__(self, up):
                 if not ko.ko is None: 
                     self.ids.append(ko.ko)
                     self.names.append(ko.name)
+                if not ko.pathways is None:
+                    self.pathways.append(ko.pathways)
         self.ko = ';'.join(set(self.ids))
         self.name = ';'.join(set(self.names))
+        self.pathways = ';'.join(set(self.pathways))
+
 
 class string2ko: # uniprot id
     def __init__(self, string):
@@ -352,7 +357,7 @@ def __init__(self, entrez):
         self.entrez = entrez
         self.ko = None
         self.name = None
-
+        self.pathways=None
         c = 'library(KEGGREST)';ro.r(c)
         c = 'conv <- keggConv("genes", "ncbi-geneid:{}")'.format(self.entrez); ro.r(c) 
         try:
@@ -369,6 +374,12 @@ def __init__(self, entrez):
             self.name= ';'.join(robjects.r(c))
         except:
             pass
+        try:
+            c = 'names(ko[[1]]$PATHWAY)'
+            res = robjects.r(c)
+            self.pathways=';'.join(res)
+        except:
+            pass
     # keggConv("genes", "uniprot:Q05025"))
     # q <- keggGet("cjo:107318960")
     # q[[1]]$ORTHOLOGY