forked from zhaotao1987/SynNet-Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SynetAdding.sh
248 lines (167 loc) · 7.23 KB
/
SynetAdding.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# !/bin/bash
display_usage() {
echo -e "\nThis Script is for Adding New Genomes into the Synteny Network Database"
echo -e "\nUsage:bash SynetAdd-X.sh k s m p Existing_Path(containing *.dmnd)\n"
echo "k: parameter for Diamond: # of top hits"
echo -e "\n\t-k 0: All hits\n\t-k 25: Diamond Default\n\t-k 6: MCScanX suggested\n"
echo -e "s: parameter for MCSCanX: Minimum # of Anchors for a synteny block \n Higher, stricter"
echo -e "\n\t-s 5: MCSCanX Default\n"
echo -e "m: parameter for MCSCanX: Maximum # of Genes allowed as the GAP between Anchors\n Fewer, stricter"
echo -e "\n\t-m 25: MCScanX Default gaps\n"
echo -e "p: Number of CPUs: Used for Diamond makedb|Diamond blastp|MCSCanX \n"
echo -e "\n\t-p 25: Use 25 CPUs\n"
}
# if less than one arguments supplied, display usage
if [ $# -le 0 ]
then
display_usage
exit 1
fi
# check whether user had supplied -h or --help . If yes display usage
if [[ ( $* == "--help") || $* == "-h" ]]
then
display_usage
exit 0
fi
####################################
#
# First, Pass the user-input variables
#
####################################
hits=$1
anchors=$2
gaps=$3
CPU=$4
Existingdmnd='/path/to/diamond/databases/'
Existingpepbed='/path/to/genome/files/pep/bed/files'
# We define a function here called "scale", to parrallel MCSCanX
function scale {
while [ $(jobs -p | wc -l) -ge "$CPU" ]; do
sleep 1
done
}
#########################
# Step_1: species list
#########################
# Change the items in this list, prepare .pep and .bed for each genome.
# rm Mimp Chaf
# array=(vra van pvu gma cca tpr mtr adu Anan Mimp Chaf lja Lang car pmu Rchi Dryd ppe \
# pbr Mald Tori roc fve Mnot Zjuj Pand mes rco lus Potr ptr Cuma csa cme cla jre Bpen cgr cru Alyr ath Csat bol bra bnp thh tsa Lmey aar spa Bost Alp cgy tha \
#array=(vra van pvu gma cca tpr mtr adu Anan lja Lang car pmu Rchi Dryd ppe \
#pbr Mald Tori roc fve Mnot Zjuj Pand mes rco lus ptr Cuma csa cme cla jre Bpen cgr cru Alyr ath Csat bol bra bnp thh tsa Lmey aar spa Bost Alp cgy tha \
#cpa gra Goba Ghir Dzib tca egr Cmax csi vvi spe sly stu can Caba Cach pax Inil Cuca Oeur sin mgu coc Lsat HanX Toff dca \
#ach Cqui bvu Ahyp Psom Mcor Aqco nnu Trdc Horv bdi ogl Opun osa oru lpe Zmay sbi sit Sevi Sacc oth Ecru \
#aco egu Pdac mac Dcat peq Doff Ashe Aoff Xvis spo zom Lchi atr Such Peam Ckan \
#Xsor Sbir Qrob Pgra Prim Nymp Mole Kalf Fsyl Datg Cfol Cgla Cill Ccat Cnen Begf Aeri Acya)
array=(ppe pmu pbr Mald fve Rchi roc)
before=(ppe pmu pbr Mald fve)
new=(Rchi roc)
foldername=$(date +"SynNetAdd6Now129"%Y%m%d_%H%M)
mkdir "$foldername-SynNet-k$1s$2m$3"
cd "$foldername-SynNet-k$1s$2m$3"
#############################################################
# Step_2: Lets's first generate database for new genomes
#############################################################
#mkdir ../DiamondGenomesDB
for i in "${new[@]}";do
echo make database for species_$i
diamond makedb --algo 0 --in $Existingpepbed/$i.pep -d $Existingdmnd/$i -p $CPU
done
####################################################################
# Step_3: Now lets run Diamond and Generate BLASTP-like results
####################################################################
for i in "${array[@]}";do
for j in "${new[@]}";do
# option: --sensitive
# Sensitive mode is suggested by Diamomd.
# Example: diamond blastp --sensitive -q ../$i.pep -d ../DiamondGenomesDB/$j -o "$i"_"$j" -p $4 --max-hsps 1 -k $1
# Here we use fast mode without --sensitive
echo blast all to new @@ "$i" against "$j"
diamond blastp --algo 0 -q $Existingpepbed/$i.pep -d $Existingdmnd/$j -o "$i"_"$j" -p $CPU --max-hsps 1 -k $hits
echo blast new to all @@ "$j" against "$i"
diamond blastp --algo 0 -q $Existingpepbed/$j.pep -d $Existingdmnd/$i -o "$j"_"$i" -p $CPU --max-hsps 1 -k $hits
done
done
###################################################################
# Step_4.1: Now we combine corresponding files to prepare .blast
# This step is for intraspecies synteny detection!
##################################################################
echo Now we do inter old-new MCScanX, prepare inputs..
for i in "${before[@]}";do
for j in "${new[@]}";do
# if [ "$i" = "$j" ]; then
scale;(
echo merge "$i"_"$j" "$j"_"$i" into "$i"_"$j".blast;\
cat "$i"_"$j" "$j"_"$i" > "$i"_"$j".blast;\
echo merge "$i".bed "$j".bed into "$i"_"$j".gff;\
cat $Existingpepbed/"$i".bed $Existingpepbed/"$j".bed > "$i"_"$j".gff;\
echo Inter-species MCScanX here for species $i
MCScanX -a -b 2 "$i"_"$j" -s $anchors -m $gaps;\
# echo Now we check missed tandem pairs around anchors;\
# cat "$i"_"$j".blast "$i".blast "$j".blast > "$i"_"$j".all.blast;\
# detect_collinear_tandem_arrays -g "$i"_"$j".gff -b "$i"_"$j".all.blast -c "$i"_"$j".collinearity -o "$i"_"$j".tandem.collinear;\
# grep -o -E "," "$i"_"$j".tandem.collinear|wc -l > "$i"_"$j".tandem.collinear.added ;\
)&
# fi
done
done
wait
#######################################################################
# Step_4.2 Another loop
# This step is for inter-species synteny detection
# How to avoid duplicates! A-B is the same as B-A, so keep only one
# use my small program
#######################################################################
m=0
n=0
echo Now we do inter within-news [inter, intra] MCScanX, prepare inputs..
for i in "${new[@]:m}";do
for j in "${new[@]:n}";do
if [ "$i" != "$j" ]; then
echo new genomes: inter
# Do the following in parrallel
scale;(
echo merge "$i"_"$j" "$j"_"$i" into "$i"_"$j".blast;\
cat "$i"_"$j" "$j"_"$i" > "$i"_"$j".blast;\
echo merge "$i".bed "$j".bed into "$i"_"$j".gff;\
cat $Existingpepbed/"$i".bed $Existingpepbed/"$j".bed > "$i"_"$j".gff;\
echo Inter-species mcscan for "$i"_"$j";\
MCScanX -a -b 2 "$i"_"$j" -s $anchors -m $gaps ;\
# echo Now we check missed tandem pairs around anchors;\
# cat "$i"_"$j".blast "$i".blast "$j".blast > "$i"_"$j".all.blast;\
# detect_collinear_tandem_arrays -g "$i"_"$j".gff -b "$i"_"$j".all.blast -c "$i"_"$j".collinearity -o "$i"_"$j".tandem.collinear;\
# grep -o -E "," "$i"_"$j".tandem.collinear|wc -l > "$i"_"$j".tandem.collinear.added ;\
)&
elif [ "$i" = "$j" ]; then
echo new genomes: intra
scale;(
echo "$i"_"$i"is $i.blast
mv "$i"_"$i" $i.blast
echo $i.bed is $i.gff
cat $Existingpepbed/$i.bed > $i.gff
echo Intra-species mcscan for "$i"_"$j";\
MCScanX $i -s $anchors -m $gaps;\
echo duplication modes!
# Optional
#duplicate_gene_classifier $i;\
# Optional
#echo Now we detect collinear tandem arrays.
#detect_collinear_tandem_arrays -g $i.gff -b $i.blast -c $i.collinearity -o $i.tandem.collinear;\
#echo sum the number of added genes!
#grep -o -E "," $i.tandem.collinear|wc -l > $i.tandemAdded
)&
fi
done
let n++
done
wait
#################################
# Step_5: Now sort .collinearity files!
# We are almost there!
# Keep up!
##################################
mkdir AddedBlocks/
mv *.collinearity AddedBlocks/
################################
# step_6: now we have a SynNet
################################