-
Notifications
You must be signed in to change notification settings - Fork 2
/
03.Chimera_removal.sh
148 lines (110 loc) · 4.26 KB
/
03.Chimera_removal.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
## Script for sample-wise chimera removal
## 1. reference-based chimera removal
## 2. de novo chimera search. Sequences identified as putative chimeras will have lower priority in clustering
## Input = FASTQ, Output = FASTQ
cat > chimera_rm.sh <<'EOT'
# #!/bin/bash
# $1 = input file name
# $2 = output prefix
# $3 = log file
# $4 = reference database path
echo -e "Start " "$1" > "$3"
TMPDIR=$(mktemp -d -t "chimera_XXXXXXX")
echo -e "\n### Dereplication\n" >> "$3"
## Dereplicate
vsearch \
--derep_fulllength "$1" \
--output - \
--fasta_width 0 \
--threads 1 \
--sizein --sizeout \
2>> "$3" \
| gzip -2 > "$TMPDIR"/derep.fa.gz
echo -e "\n### Reference-based chimera removal\n" >> "$3"
## Reference-based
vsearch \
--uchime_ref "$TMPDIR"/derep.fa.gz \
--db "$4" \
--fasta_width 0 \
--sizein --sizeout \
--threads 1 \
--chimeras - \
--nonchimeras "$TMPDIR"/NoChimera_Ref.fasta \
2>> "$3" \
| gzip -2 > "$TMPDIR"/Ref_Chimeras.fasta.gz
echo -e "\n### De novo chimera removal\n" >> "$3"
## De-novo
numseqs=$(grep -c "^>" "$TMPDIR"/derep.fa.gz)
if [[ $numseqs -gt 3 ]] ; then
vsearch \
--uchime_denovo "$TMPDIR"/derep.fa.gz \
--chimeras "$TMPDIR"/Chimeras_Denovo.fasta \
--nonchimeras - \
--fasta_width 0 \
--sizein --xsize \
2>> "$3" \
| gzip -2 > "$TMPDIR"/NoChimera_DeNovo.fasta.gz
else
echo "The number of sequences is too small. Do not perform de novo search." >> "$3"
fi
echo -e "\n### Sequence extraction\n" >> "$3"
## Get sequences
if [ -f "$TMPDIR"/NoChimera_DeNovo.fasta.gz ]; then
zcat "$TMPDIR"/NoChimera_DeNovo.fasta.gz | awk '$0 !~ /^>/ {print toupper($0)}' | sort > "$TMPDIR"/tmp_ok.txt
elif [ -f "$TMPDIR"/NoChimera_Ref.fasta ]; then
awk '$0 !~ /^>/ {print toupper($0)}' "$TMPDIR"/NoChimera_Ref.fasta | sort > "$TMPDIR"/tmp_ok.txt
else
touch "$TMPDIR"/tmp_ok.txt
fi
if [ -f "$TMPDIR"/Chimeras_Denovo.fasta ]; then
awk '$0 !~ /^>/ {print toupper($0)}' "$TMPDIR"/Chimeras_Denovo.fasta | sort > "$TMPDIR"/tmp_chim_denovo.txt
else
touch "$TMPDIR"/tmp_chim_denovo.txt
fi
if [ -f "$TMPDIR"/Ref_Chimeras.fasta.gz ]; then
zcat "$TMPDIR"/Ref_Chimeras.fasta.gz | awk '$0 !~ /^>/ {print toupper($0)}' | sort > "$TMPDIR"/chim_ref.txt
else
touch "$TMPDIR"/chim_ref.txt
fi
## Remove de-novo chimeras identified with reference database
comm -13 "$TMPDIR"/chim_ref.txt "$TMPDIR"/tmp_ok.txt > "$TMPDIR"/ok.txt
## Remove seqs identified as reference chimeras
comm -13 "$TMPDIR"/chim_ref.txt "$TMPDIR"/tmp_chim_denovo.txt > "$TMPDIR"/chim_denovo.txt
echo -e "Number of unique non-chimeric sequences: " $(wc -l < "$TMPDIR"/ok.txt) >> "$3"
echo -e "Number of unique reference-based chimeric sequences: " $(wc -l < "$TMPDIR"/chim_ref.txt) >> "$3"
echo -e "Number of unique de novo chimeric sequences: " $(wc -l < "$TMPDIR"/chim_denovo.txt) >> "$3"
#### To improve speed, split patterns into chunks (with 100 lines each), then ripgrep
## Good sequences
if [[ -s "$TMPDIR"/ok.txt ]]; then
split -l 100 "$TMPDIR"/ok.txt "$TMPDIR"/patt.split.
for CHUNK in "$TMPDIR"/patt.split.* ; do
rg -z -B 1 -A 2 -x -f "$CHUNK" "$1" >> "$TMPDIR"/tmp_NoChimera.fq
done
rm "$TMPDIR"/patt.split.*
sed '/^--$/d' "$TMPDIR"/tmp_NoChimera.fq | gzip -6 > "$2"_NoChimera.fq.gz
fi
## De novo chimeras
if [[ -s "$TMPDIR"/chim_denovo.txt ]]; then
split -l 100 "$TMPDIR"/chim_denovo.txt "$TMPDIR"/patt.split.
for CHUNK in "$TMPDIR"/patt.split.* ; do
rg -z -B 1 -A 2 -x -f "$CHUNK" "$1" >> "$TMPDIR"/tmp_ChimDeNov.fq
done
rm "$TMPDIR"/patt.split.*
sed '/^--$/d' "$TMPDIR"/tmp_ChimDeNov.fq | gzip -6 > "$2"_ChimDeNov.fq.gz
fi
## Reference-based chimeras
if [[ -s "$TMPDIR"/chim_ref.txt ]]; then
split -l 100 "$TMPDIR"/chim_ref.txt "$TMPDIR"/patt.split.
for CHUNK in "$TMPDIR"/patt.split.* ; do
rg -z -B 1 -A 2 -x -f "$CHUNK" "$1" >> "$TMPDIR"/tmp_ChimRef.fq
done
rm "$TMPDIR"/patt.split.*
sed '/^--$/d' "$TMPDIR"/tmp_ChimRef.fq | gzip -6 > "$2"_ChimRef.fq.gz
fi
## Clean up
rm -r "$TMPDIR"
echo -e "\nDone " "$1" >> "$3"
EOT
chmod +x chimera_rm.sh
## Example:
# ./chimera_rm.sh Sample.fq.gz Samp Samp.log Reference_chimera_DB.fasta.gz