-
Notifications
You must be signed in to change notification settings - Fork 0
/
5.GeneAlignment.sh
executable file
·157 lines (123 loc) · 7.03 KB
/
5.GeneAlignment.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
#####################################
#### Iterative alignment of protein-coding sequences with MACSE and of non-coding sequences with CLUSTAL-W
#### Uses as input gene sequences from the previous step (4.GetAnnFasta).
####
#### Usage: bash 5.GeneAlignment.sh Prefix Output bin
#### - Prefix = ID of the sample being processed
#### - Output = Path to the output folder of your choice (must be the same as in steps 1,2,3 and 4)
#### - bin = Path for the bin folder
####
#### Dependencies: AmbiguityConsensus.py, GapAlnRemover.py, RefCollMtDNA.formatted.fasta, gapCount.py, ConcatenateFasta2Nexus.py, macse_v1.01b.jar (all in bin folder) and ClustalW2
####
#### The RefCollMtDNA.formatted.fasta file is a file containing all the gene sequences available of the
#### species/genus of interest dowloaded from a database (METAMIGA, GENBANK).
####
#### Francesco Cicconardi (2016)
#####################################
### Getting script arguments
PREFIX=$1
OUTPUT=$2
BIN=$3
### Important parameters
allgeneFas=$OUTPUT/$PREFIX.mtDNA.All.genes.fasta #Output from the step 4
ref_genes=bin/RefCollMtDNA.formatted.fasta #Gene sequences from public database
GAP_PERCENTAGE_CODING=33 #Removing sequences shorter than the 33% of the overall alignment, in coding sequences
GAP_PERCENTAGE_NONCOD=75 #Removing sequences shorter than the 75% of the overall alignment, in non-coding sequences
gene2sort='nad2 cox1 rrnL cox2 atp8 atp6 cox3 nad3 nad5 nad4 nad4l nad6 cob nad1 rrnL rrnS'
OVERWRITE=yes #If 'yes', overwrite previous run
KEEPTMP='no' #If 'no', remove temporary files
### Alignement of coding sequences, all the genes at one time
for gene in $gene2sort
do
echo -e "\n#######################################################################"
echo -e "\n`date` Align gene $gene."
if [[ $OVERWRITE == 'yes' ]]
then
rm $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta
fi
if [ -f $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta ] # If there is this aligned fasta create it's consensus and deletes gaps otherwise it does the alignment.
then
echo -e "Generate consensus"
python bin/AmbiguityConsensus.py -l $gene -i $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta -o $OUTPUT/$PREFIX.mtDNA.All.$gene.consensus.fasta
echo -e "Remove gaps from consensus"
python bin/GapAlnRemover.py -c $OUTPUT/$PREFIX.mtDNA.All.$gene.consensus.fasta -i $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta -t coding -o $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.ungapped.fasta
else
ls $allgeneFas
# Extract single gene from file containint all genes and create file to align. Creating the dataset to align.
grep --no-group-separator -A1 -w $gene $allgeneFas > $OUTPUT/$PREFIX.mtDNA.All.$gene.fasta
#grep --no-group-separator -A1 -w $gene $ref_genes >> $OUTPUT/$PREFIX.mtDNA.All.$gene.fasta #Keeps reference genes in the final alignment
# Count file length
Aln_len=`grep -c '^>' $OUTPUT/$PREFIX.mtDNA.All.$gene.fasta`
# Start iteration as long as the length of the file.
for i in $(seq $Aln_len)
do
echo -e "\n`date` Iteration $i started."
iter=`bc <<< $i+1`
if [ $i == 1 ]
then
ALNMENT=$OUTPUT/$PREFIX.mtDNA.All.$gene
else
ALNMENT=$OUTPUT/$PREFIX.mtDNA.All.$gene.$i.Iteration
fi
N_SEQ=`grep -c '^>' $ALNMENT.fasta`
echo -e "`date` Alignment of $N_SEQ sequences in $ALNMENT.fasta file."
if [[ $gene == rrnL || $gene == rrnS ]]
then
GEPTR=$GAP_PERCENTAGE_NONCOD
sed -ibk 's/;//g' $ALNMENT.fasta
rm $ALNMENT.fastabk
cat $ALNMENT.fasta
clustalw2 -infile=$ALNMENT.fasta -outfile=$ALNMENT.alned.fasta \
-output=FASTA -dnamatrix=IUB \
-gapopen=10 -gapext=0.2 \
-gapdist=10 -iteration=ALIGNMENT \
-numiter=1000 -clustering=UPGMA
else
GEPTR=$GAP_PERCENTAGE_CODING
java -jar bin/macse_v1.01b.jar \
-prog alignSequences \
-gc_def 5 \
-out_NT $ALNMENT.alned.fasta \
-seq $ALNMENT.fasta
fi
cat $ALNMENT.alned.fasta
echo -e "\n`date` Check and exclude short sequences from the alignment."
echo -e "gapCount.py $ALNMENT.alned.fasta $GEPTR tmp | cut -d ' ' -f 1 | sort | cut -f 1| sed 's/_H_/; /g' | cut -d ' ' -f 1 > $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$i.ids"
python bin/gapCount.py $ALNMENT.alned.fasta $GEPTR $OUTPUT/$PREFIX.mtDNA.All.$gene.$iter.Iteration.fasta > $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$i.ids
p_iter=`bc <<< $i-1`
if [ -f $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$p_iter.ids ]
then
FILE1=`wc -l $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$p_iter.ids | cut -d ' ' -f 1`
FILE2=`wc -l $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$i.ids | cut -d ' ' -f 1`
if [[ $FILE1 == $FILE2 || $FILE2 == 0 ]]
then
if [[ $gene == rrnL || $gene == rrnS ]]
then
echo -e "Removing temporary files"
sed 's/_H_/; /g' $ALNMENT.alned.fasta > $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta
else
echo -e "Removing temporary files"
mv $ALNMENT.alned.fasta $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta
mv $OUTPUT/$PREFIX.mtDNA.All.$gene.$i.Iteration_macse_AA.fasta $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.AA.fasta
fi
echo -e "Generate consensus"
python bin/AmbiguityConsensus.py -l $gene -i $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta -o $OUTPUT/$PREFIX.mtDNA.All.$gene.consensus.fasta
echo -e "Remove gaps from consensus"
python bin/GapAlnRemover.py -c $OUTPUT/$PREFIX.mtDNA.All.$gene.consensus.fasta -i $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.fasta -t coding -o $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.NT.ungapped.fasta
if [[ $KEEPTMP='no' ]]
then
rm -f $OUTPUT/$PREFIX.tmp $OUTPUT/$PREFIX.*.ids $OUTPUT/$PREFIX.*.Iteration.* $OUTPUT/$PREFIX.*.alned.fasta $OUTPUT/$PREFIX.*_macse_AA.fasta $OUTPUT/$PREFIX.mtDNA.All.$gene.fasta $OUTPUT/$PREFIX.*.dnd
fi
break
fi
fi
n_long=`wc -l $OUTPUT/$PREFIX.mtDNA.All.$gene.alned.$i.ids | cut -d ' ' -f 1`
n_short=`bc <<< $Aln_len-$n_long`
done
fi
done
cd $OUTPUT
echo -e "FASTA alignment conversion to NEXUS and concatenation"
python $BIN/ConcatenateFasta2Nexus.py -i $OUTPUT -s .ungapped.fasta -o $OUTPUT/$PREFIX.mtDNA.All.concatenated.nex
cd ..