-
Notifications
You must be signed in to change notification settings - Fork 4
/
create_receptor_structures
executable file
·189 lines (172 loc) · 10.7 KB
/
create_receptor_structures
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
source $vini_dir/globals
UNIPROT_ID_LIST="_uniprot_entries.txt"
outdir=$vini_dir/genes/pdb_files #where the predicted structures are placed
outdir=$vini_dir/genes/pdb_files #Define AlphaFold parameter
cp $ROSETTA/tools/protein_tools/scripts/clean_pdb.py ./
cp $ROSETTA/tools/protein_tools/scripts/amino_acids.py ./
rm -f *pdb
#http://mmb.irbbarcelona.org/gitlab/DBW/PDBBrowser/raw/d80ad01c7569f5cfaac3a247f6e086d63db815cc/PDBData/cmpd_res.idx
#The database of all RCSB structure names
rm -f $vini_dir/database/KEGG_cancer_pathways/${CANCER_PATHWAY}/${CANCER_PATHWAY}${UNIPROT_ID_LIST}
> $WORKDIR/failed_pubchem_structures
echo "finalizing receptor file, please wait..."
> $WORKDIR/receptors_contracted
noentries=`wc -l < $WORKDIR/receptors_expanded`
echo "number of entries in pathway is" $noentries
entry=1
while read -r line
do
KEGG_entry_id=`echo $line | awk '{print $1}'` #get KEGG entry ID (number) and KEGG gene name (hsa:xxxx)
# note one KEGG entry may have several genes!
KEGG_gene_name=`echo $line | awk '{print $2}'`
if [[ ${KEGG_gene_name:0:1} == "C" ]] #checking if KEGG entry is the compound
then
wget -O $WORKDIR/tmp -q http://rest.kegg.jp/conv/pubchem/${KEGG_gene_name}
pubchem_id=`head -1 $WORKDIR/tmp | awk '{print $2}'`
echo -n "KEGG entry" $entry "is a compound with Pubchem ID" $pubchem_id "Checking if in repo..."
if [[ -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]] || [[ -e $vini_dir/database/genes/pdb_files/${pubchem_id}.pdb ]]
then
if [ ! -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]
then
cp $vini_dir/genes/database/pdb_files/${pubchem_id}.pdb $vini_dir/genes/pdb_files
fi
echo " yes. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "with E flag."
echo $KEGG_entry_id $KEGG_gene_name $pubchem_id "E" >> $WORKDIR/receptors_contracted
else
echo -n "no. Trying to obtain the structure from Pubchem..."
echo ${pubchem_id}
sh download_substance_structure ${pubchem_id}
if [ -e $vini_dir/genes/pdb_files/${pubchem_id}.pdb ]
then
echo "success. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "list with E flag."
else
echo " failed. Adding" $pubchem_id "to" $WORKDIR/receptors_contracted "with F flag."
echo $KEGG_entry_id $KEGG_gene_name $pubchem_id "F" >> $WORKDIR/receptors_contracted
echo "KEGG ID:" $KEGG_entry_id " " "Pubchem ID:" $pubchem_id >> $WORKDIR/failed_pubchem_structures
fi
fi
else #KEGG entry is protein
wget -O $WORKDIR/tmp -q http://rest.kegg.jp/conv/uniprot/${KEGG_gene_name}
uniprot_id=`head -1 $WORKDIR/tmp | awk '{print $2}'`
uniprot_id=`echo $uniprot_id | sed 's/\://g'`
uniprot_id=`echo $uniprot_id | sed 's/\up//g'`
if [ -z "$uniprot_id" ]
then
echo "Uniprot ID for $KEGG_gene_name not found via KEGG API."
echo "Converting KEGG ID $KEGG_gene_name to Uniprot ID via table."
uniprot_id=`grep "${KEGG_gene_name}" kegg2uniprot | awk '{print $2}'`
if [ -z "$uniprot_id" ]
then
echo "cannot convert ${KEGG_gene_name} to Uniprot ID. Vini will stop now." >> Vini.crashlog
echo "convert ${KEGG_gene_name} to $uniprot_id and update kegg2uniprot table." >> Vini.crashlog
masterpid=`cat masterpid`
kill -9 $masterpid
fi
fi
echo $uniprot_id >> $vini_dir/database/KEGG_cancer_pathways/${CANCER_PATHWAY}/${CANCER_PATHWAY}${UNIPROT_ID_LIST}
echo -n "KEGG entry" $entry "is a protein-encoding gene" $uniprot_id
uline=`grep $uniprot_id $vini_dir/database/uniprot_db` #Check if pdb structure is available
nowords=`echo $uline | wc -w`
if [ $nowords -eq $TWO ]
then
echo -n " without structure. Checking if in repo..."
if [ -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ]
then
echo "yes. Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "with P flag."
echo $KEGG_entry_id $KEGG_gene_name $uniprot_id "P" >> $WORKDIR/receptors_contracted
else
echo "no."
if [ ! -e $vini_dir/genes/fasta_files/${uniprot_id}.fasta ]
then
wget -O $vini_dir/genes/fasta_files/${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get fasta sequence
fi
fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
sh $vini_dir/predict_with_AlphaFold ${cores} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
echo $KEGG_entry_id $KEGG_gene_name ${uniprot_id} "W" >> $WORKDIR/receptors_contracted
fi
else
echo -n " with structure. Checking if in repo..."
if [[ -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ]] || [[ -e $vini_dir/database/genes/pdb_files/${uniprot_id}.pdb ]]
then
if [ ! -e $vini_dir/genes/pdb_files/${uniprot_id}.pdb ]
then
cp $vini_dir/database&genes/pdb_files/${uniprot_id}.pdb $vini_dir/genes/pdb_files
fi
echo " yes. Adding" $uniprot_id "to" $WORKDIR/receptors_contracted "with E flag."
echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "E" >> $WORKDIR/receptors_contracted
else
echo "no. Analyzing available RCSB structures, may take a while... "
rm -f $WORKDIR/completeness_list
sh create_completeness_list ${uniprot_id}
noentries=`wc -l < $WORKDIR/completeness_list`
if [ $noentries -ne $NULL ]
then
for (( i=1; i<$((noentries+1)); i++ ))
do
accession_code=`head -$i $WORKDIR/completeness_list | tail -1 | awk '{print $1}'`
echo "Downloading ${accession_code} structure from RCSB and cleaning it from water and ligands..."
wget -q -O ${uniprot_id}.pdb.gz http://www.rcsb.org/pdb/files/${accession_code}.pdb.gz
gzip -df ${uniprot_id}.pdb.gz
grep COMPND ${uniprot_id}.pdb | grep CHAIN | grep -v MOLECULE > tmp
line=`head -1 tmp `
chain=`echo $line | awk '{print $4}' `
nowords=`echo $line | wc -w`
if [ $nowords -eq $FOUR ]
then
chain=`echo $chain | tr -d ";"`
else
chain=`echo $chain | tr -d ","`
fi
echo "Cleaning $uniprot_id chain $chain" ; echo
python clean_pdb.py ${uniprot_id}.pdb ${chain}
if [ -e ${uniprot_id}_${chain}.pdb ]
then
mv ${uniprot_id}_${chain}.pdb ${uniprot_id}.pdb.sav
rm -f *pdb
mv ${uniprot_id}.pdb.sav ${uniprot_id}.pdb
echo "Adding missing residues with Chimera, please wait..."
chimera --nogui --script "add_missing_residues.py ${uniprot_id}" | tee -a chimera_log.${uniprot_id}
grep Error chimera_log.${uniprot_id} > tmp #checking if the operation succeeded
if [ ! -s tmp ]
then
cp ${uniprot_id}_res.pdb $vini_dir/genes/pdb_files/${uniprot_id}.pdb
else
cp ${uniprot_id}.pdb $vini_dir/genes/pdb_files/${uniprot_id}.pdb
fi
echo "success. Adding ${uniprot_id} to $WORKDIR/receptors_contracted list with E flag."
echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "E" >> $WORKDIR/receptors_contracted
break
else
echo "failed. Trying to predict the structure."
if [ ! -e $vini_dir/genes/fasta_files/${uniprot_id}.fasta ]
then
wget -O ${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get & prep sequence
mv ${uniprot_id}.fasta $vini_dir/genes/fasta_files/${uniprot_id}.fasta
fi
fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
sh $vini_dir/predict_with_AlphaFold ${cpus} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
echo $KEGG_entry_id $KEGG_gene_name $uniprot_id "W" >> $WORKDIR/receptors_contracted
fi
done
else
echo "no RCSB single model found. Trying to predict the structure."
wget -O $vini_dir/genes/fasta_files/${uniprot_id}.fasta --no-check-certificate -q https://www.uniprot.org/uniprot/${uniprot_id}.fasta #get & prep sequence
fasta_file=$vini_dir/genes/fasta_files/${uniprot_id}.fasta
sh $vini_dir/predict_with_AlphaFold ${cpus} ${partition} ${AlphaFold_base} ${fasta_file} ${outdir}
echo "Adding" ${uniprot_id} "to" $WORKDIR/receptors_contracted "list with W flag."
echo ${KEGG_entry_id} ${KEGG_gene_name} ${uniprot_id} "W" >> $WORKDIR/receptors_contracted
fi
fi
#fi
fi
fi
let "entry++"
done < $WORKDIR/receptors_expanded
cp $vini_dir/genes/pdb_files/* $vini_dir/database/genes/pdb_files
awk '!seen[$0]++' $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY${UNIPROT_ID_LIST} > $WORKDIR/tmp #remove duplicate lines
shuf $WORKDIR/tmp -o tmp2 ; mv tmp2 $WORKDIR/tmp
grep "\S" $WORKDIR/tmp > $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY$UNIPROT_ID_LIST #remove empty lines
cp $vini_dir/database/KEGG_cancer_pathways/$CANCER_PATHWAY/$CANCER_PATHWAY${UNIPROT_ID_LIST} $vini_dir/genes/Uniprot_ID_list
rm -f $WORKDIR/tmp chimera_log.* *pdb *fasta #cleanup