create_receptor_structures

source $vini_dir/globals

UNIPROT_ID_LIST="_uniprot_entries.txt"
outdir=$vini_dir/genes/pdb_files  #where the predicted structures are placed

outdir=$vini_dir/genes/pdb_files  #Define AlphaFold parameter


cp $ROSETTA/tools/protein_tools/scripts/clean_pdb.py ./
cp $ROSETTA/tools/protein_tools/scripts/amino_acids.py ./
rm -f *pdb

#http://mmb.irbbarcelona.org/gitlab/DBW/PDBBrowser/raw/d80ad01c7569f5cfaac3a247f6e086d63db815cc/PDBData/cmpd_res.idx
#The database of all RCSB structure names

rm -f $vini_dir/database/KEGG_cancer_pathways/${CANCER_PATHWAY}/${CANCER_PATHWAY}${UNIPROT_ID_LIST}

> $WORKDIR/failed_pubchem_structures

echo "finalizing receptor file, please wait..."
> $WORKDIR/receptors_contracted

noentries=`wc -l < $WORKDIR/receptors_expanded`
echo "number of entries in pathway is" $noentries
entry=1


while read -r line
do     
    KEGG_entry_id=`echo $line | awk '{print $1}'` #get KEGG entry ID (number) and KEGG gene name (hsa:xxxx)
                                                  # note one KEGG entry may have several genes! 
    KEGG_gene_name=`echo $line | awk '{print $2}'`           

    if  [[ ${KEGG_gene_name:0:1} == "C" ]]         #checking if KEGG entry is the compound
    then
        wget -O $WORKDIR/tmp -q  http://rest.kegg.jp/conv/pubchem/${KEGG_gene_name}
        pubchem_id=`head -1 $WORKDIR/tmp | awk '{print $2}'`

        echo -n "KEGG entry" $entry "is a compound with Pubchem ID" $pubchem_id "Checking if in repo..."
        if  [[ -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]] || [[ -e $vini_dir/database/genes/pdb_files/${pubchem_id}.pdb ]]  
        then 
            if [ ! -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]
            then
                cp $vini_dir/genes/database/pdb_files/${pubchem_id}.pdb $vini_dir/genes/pdb_files
            fi
            echo " yes. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "with E flag."
            echo $KEGG_entry_id $KEGG_gene_name $pubchem_id "E" >> $WORKDIR/receptors_contracted
        else
            echo -n "no. Trying to obtain the structure from Pubchem..."
            echo ${pubchem_id}
            sh download_substance_structure ${pubchem_id}
            if  [ -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]
            then
                echo "success. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "list with E flag."
            else
                echo " failed. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "with F flag."
                echo $KEGG_entry_id $KEGG_gene_name $pubchem_id "F" >> $WORKDIR/receptors_contracted
                echo "KEGG ID:" $KEGG_entry_id "   " "Pubchem ID:" $pubchem_id >> $WORKDIR/failed_pubchem_structures
            fi
        fi
    else  #KEGG entry is protein
        wget -O $WORKDIR/tmp -q  http://rest.kegg.jp/conv/uniprot/${KEGG_gene_name}
        uniprot_id=`head -1 $WORKDIR/tmp | awk '{print $2}'`
        uniprot_id=`echo  $uniprot_id | sed 's/\://g'`
        uniprot_id=`echo  $uniprot_id | sed 's/\up//g'`
        if  [ -z "$uniprot_id" ]
        then
            echo "Uniprot ID for $KEGG_gene_name not found via KEGG API."
            echo "Converting KEGG ID $KEGG_gene_name to Uniprot ID via table."
            uniprot_id=`grep "${KEGG_gene_name}" kegg2uniprot | awk '{print $2}'`
            if  [ -z "$uniprot_id" ]
            then
                echo "cannot convert ${KEGG_gene_name} to Uniprot ID. Vini will stop now." >> Vini.crashlog
                echo "convert ${KEGG_gene_name} to $uniprot_id and update kegg2uniprot table." >> Vini.crashlog
                masterpid=`cat masterpid`
                kill -9 $masterpid
            fi
        fi
        echo $uniprot_id >> $vini_dir/database/KEGG_cancer_pathways/${CANCER_PATHWAY}/${CANCER_PATHWAY}${UNIPROT_ID_LIST}
        echo -n "KEGG entry" $entry "is a protein-encoding gene" $uniprot_id
        uline=`grep $uniprot_id $vini_dir/database/uniprot_db` #Check if pdb structure is available
        nowords=`echo $uline | wc -w`
        if  [ $nowords -eq $TWO ]
        then
            echo -n " without structure. Checking if in repo..."
            if  [ -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ]   
            then
                echo "yes. Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "with P flag." 
                echo $KEGG_entry_id $KEGG_gene_name $uniprot_id "P" >> $WORKDIR/receptors_contracted
            else
                echo "no."
                if  [ ! -e $vini_dir/genes/fasta_files/${uniprot_id}.fasta ]
                then
                    wget -O $vini_dir/genes/fasta_files/${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get fasta sequence
                fi
                fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
                sh $vini_dir/predict_with_AlphaFold ${cores} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
                echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
                echo $KEGG_entry_id $KEGG_gene_name ${uniprot_id} "W" >> $WORKDIR/receptors_contracted
            fi
        else
            echo -n " with structure. Checking if in repo..."
            if  [[ -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ]] || [[ -e $vini_dir/database/genes/pdb_files/${uniprot_id}.pdb ]]
            then 
                if  [ ! -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ] 
                then
                    cp $vini_dir/database&genes/pdb_files/${uniprot_id}.pdb $vini_dir/genes/pdb_files
                fi
                echo " yes. Adding" $uniprot_id "to" $WORKDIR/receptors_contracted "with E flag."
                echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "E" >> $WORKDIR/receptors_contracted
            else
                echo "no. Analyzing available RCSB structures, may take a while... "  
                rm -f $WORKDIR/completeness_list
                sh create_completeness_list ${uniprot_id}
                noentries=`wc -l < $WORKDIR/completeness_list`
                if  [ $noentries -ne $NULL ]
                then
                    for (( i=1; i<$((noentries+1)); i++ ))
                    do
                        accession_code=`head -$i $WORKDIR/completeness_list | tail -1    | awk '{print $1}'`
                        echo "Downloading ${accession_code} structure from RCSB and cleaning it from water and ligands..."
                        wget -q -O ${uniprot_id}.pdb.gz  http://www.rcsb.org/pdb/files/${accession_code}.pdb.gz
                        gzip -df ${uniprot_id}.pdb.gz
                        grep COMPND ${uniprot_id}.pdb | grep CHAIN | grep -v MOLECULE > tmp
                        line=`head -1 tmp `
                        chain=`echo $line | awk '{print $4}' `
                        nowords=`echo $line | wc -w`
                        if  [ $nowords -eq $FOUR ]
                        then
                            chain=`echo $chain | tr -d ";"`
                        else
                            chain=`echo $chain | tr -d ","`
                        fi
                        echo "Cleaning $uniprot_id chain $chain" ; echo 
                        python clean_pdb.py ${uniprot_id}.pdb ${chain}

                        if  [ -e ${uniprot_id}_${chain}.pdb ]
                        then
                            mv ${uniprot_id}_${chain}.pdb ${uniprot_id}.pdb.sav
                            rm -f *pdb
                            mv ${uniprot_id}.pdb.sav ${uniprot_id}.pdb
                            echo "Adding missing residues with Chimera, please wait..."
                            chimera --nogui --script "add_missing_residues.py ${uniprot_id}" | tee -a chimera_log.${uniprot_id}
                            grep Error chimera_log.${uniprot_id} > tmp   #checking if the operation succeeded
                            
                            if [ ! -s tmp ]
                            then
                                cp ${uniprot_id}_res.pdb $vini_dir/genes/pdb_files/${uniprot_id}.pdb
                            else
                                cp ${uniprot_id}.pdb $vini_dir/genes/pdb_files/${uniprot_id}.pdb
                            fi
                            echo "success. Adding ${uniprot_id} to $WORKDIR/receptors_contracted list with E flag."
                            echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "E" >> $WORKDIR/receptors_contracted
                            break
                        else
                            echo "failed. Trying to predict the structure."
                            if   [ ! -e $vini_dir/genes/fasta_files/${uniprot_id}.fasta ]
                            then
                                wget -O ${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get & prep sequence
                                mv ${uniprot_id}.fasta $vini_dir/genes/fasta_files/${uniprot_id}.fasta
                            fi
                            fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
                            sh $vini_dir/predict_with_AlphaFold ${cpus} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
                            echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
                            echo $KEGG_entry_id $KEGG_gene_name $uniprot_id "W" >> $WORKDIR/receptors_contracted
                        fi
                    done
                else
                    echo "no RCSB single model found. Trying to predict the structure."
                    wget -O $vini_dir/genes/fasta_files/${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get & prep sequence
                    fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
                    sh $vini_dir/predict_with_AlphaFold ${cpus} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
                    echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
                    echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "W" >> $WORKDIR/receptors_contracted
                fi
            fi
        #fi
        fi
    fi
    let "entry++"
done < $WORKDIR/receptors_expanded

cp $vini_dir/genes/pdb_files/* $vini_dir/database/genes/pdb_files

awk '!seen[$0]++' $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY${UNIPROT_ID_LIST} > $WORKDIR/tmp #remove duplicate lines
shuf $WORKDIR/tmp -o tmp2 ; mv tmp2 $WORKDIR/tmp
grep "\S" $WORKDIR/tmp > $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY$UNIPROT_ID_LIST #remove empty lines
cp $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY${UNIPROT_ID_LIST} $vini_dir/genes/Uniprot_ID_list
rm -f $WORKDIR/tmp chimera_log.* *pdb *fasta #cleanup