forked from zhaotao1987/SynNet-Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SynNet.sh
133 lines (101 loc) · 4.29 KB
/
SynNet.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# !/bin/bash
# Input files: A pep fasta file and bed file of each genome.
# Example: at.bed, at.pep, al.bed, al.pep...
# The program will first do intra- and inter- genome comparisons using RAPsearch, then trimming the outputs for pairwise synteny
# detection by MCscanX, and finally trimming the synteny blocks(.collinearity files) into a final synteny network (database).
#########################
# Step_1: Species list
#########################
# separated by space
array=(pv gm cc mt ca lj cl cs pt rc md pb pp pm fv at al cb bo br aeth tare cp gr ta ci eg vv st sl cu ug ah bv nn tu hv bd os zm sb si el ma pe zo ar pa sm ph cr)
#############################################################
# Step_2: Lets's first make database of each genome
#############################################################
for i in "${array[@]}";do
echo make database for species_$i
../RAPsearch2/RAPSearch2.23_64bits/bin/prerapsearch -d $i.pep -n $i.pep.db
done
####################################################################
# Step_3: Now lets run RAPSearch and Generate BLASTP-like results
####################################################################
for i in "${array[@]}";do
for j in "${array[@]}";do
echo #### blast "$i" against "$j" ########
../RAPsearch2/RAPSearch2.23_64bits/bin/rapsearch -q $i.pep -d $j.pep.db -o "$i"_"$j" -z 10 -b 0 -v 20 -t a -p f -a t
done
done
####################################################################
# Step_4: Now we need a module to alter the output and make it neat.
####################################################################
for f in *.m8; do sed '/#/d;s/\r//g' $f > $f.homo
#.rd means remove duplicates
awk '!a[$1$2]++' $f.homo > $f.rd
# keep the top 6 or another number of hits!
awk 'a[$1]++<6' $f.rd > $f.fix
done
rm *.rd
###################################################################
# Step_5.1: Now we combine corresponding files to prepare .blast
# This step is for intraspecies synteny detection!
##################################################################
echo ##########Now we do intraspecies MCScanX, prepare inputs..##########
for i in "${array[@]}";do
for j in "${array[@]}";do
# if at=at and so on, prepare file especially
if [ "$i" = "$j" ]; then
echo "$i"_"$i".homo is $i.blast
mv "$i"_"$i".m8.fix $i.blast
echo $i.bed is $i.gff
cat $i.bed > $i.gff
echo Intra-species MCScanX here for species $i
../MCScanX/MCScanX $i
echo duplication modes!
../MCScanX/duplicate_gene_classifier $i
fi
done
done
# for inter species starts here!
# but you have to be careful
# to avoid duplications
#######################################################################
# Step_5.2 Another loop
# This step is for interspecies synteny detection
# How to avoid duplicates! A-B is the same as B-A, so keep only one
# use my small program
#######################################################################
m=0
n=1
for i in "${array[@]:m}";do
for j in "${array[@]:n}";do
# before, the following commands have some typo ; .m8.homo should be .m8.fix
echo "$i"_"$j".m8.fix "$j"_"$i".m8.fix "$i"_"$j".blast
cat "$i"_"$j".m8.fix "$j"_"$i".m8.fix > "$i"_"$j".blast
echo "$i".bed "$j".bed "$i"_"$j".gff
cat "$i".bed "$j".bed > "$i"_"$j".gff
echo Inter-species mcscan for "$i"_"$j"
# commands below
../MCScanX/MCScanX -a -b 2 "$i"_"$j"
done
let n++
done
#################################
# Step_6: Now sort .collinearity files!
# We are almost there!
# Keep up!
##################################
for f in *.collinearity; do sed 's/^/'$f'/' $f >> 1; done
# split columns and kept the block score
# pay attention to ! \"\t\"
awk '{if($0~/.*score/){split($0,a,"=");print a[1]}else{print $0"\t"a[2]}}' 1 > 2
# delete lines containing #
sed '/#/d' 2 > 3
# replace unwanted strings/letters
sed 's/.collinearity//g; s/e_value//g; s/ //g; s/:/b/g' 3 > 4
# Pay attention to the following command, replace with the corrected gene names
#sed 's/ca__\([0-9]\+\)/ca\1ca/g;s/[a-z][a-z]__//g;s/\(p5_sc[0-9]\+.V1.gene[0-9]\+\)/el\1el/g;s/\(evm.TU.supercontig_[0-9]\+.[0-9]\+\)/cp\1cp/g;s/\(scaffold[0-9]\+\.[0-9]\+\)/ar\1ar/g;s/\(Scf[0-9]\+.g[0-9]\+\)/ug\1ug/g' 4|
awk '{print $1"\t"$5"\t"$2"\t"$3}' 4 > Final_Network
# delete middle files
rm 1 2 3 4
################################
# step_7: now we have a SynNet
################################