-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path751_bash_fasta_blast_fullnt.sh
executable file
·193 lines (140 loc) · 6.36 KB
/
751_bash_fasta_blast_fullnt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env bash
#SBATCH --job-name BLAST
#SBATCH --time 06:00:00
#SBATCH --mem 550G # 50 GB for running - 2 * 250 GB for database
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 36 # half a node
#SBATCH --account=uoo03176
#SBATCH [email protected]
#SBATCH --mail-type=ALL
#SBATCH --output 221114_OUeDNA.%j.out
#SBATCH --error 221114_OUeDNA.%j.err
# 14.11.2022 - Paul Czechowski - [email protected]
# ========================================================
# * Blast `fasta` against NCBI's nt database. See
# https://stackoverflow.com/questions/45014279/running-locally-blastn-against-nt-db-thru-python-script.
# * For array handling refer to
# https://stackoverflow.com/questions/23356779/how-can-i-store-find-command-result-as-arrays-in-bash
# https://stackoverflow.com/questions/7442417/how-to-sort-an-array-in-bash)
# * For execution on NESI see
# https://support.nesi.org.nz/hc/en-gb/articles/208619807-BLAST
#
# For jobs which need more than 24 CPU-hours, eg: those that use large
# databases (> 10 GB) or large amounts of query sequence (> 1 GB), or slow
# BLAST searches such as classic blastn (blastn -task blastn).
#
# This script copies the BLAST database into the per-job temporary
# directory $TMPDIR before starting the search. Since compute nodes do not
# have local disks, this database copy is in memory, and so must be
# allowed for in the memory requested by the job. As of mid 2021 that is
# 125 GB for the nt database, 124 GB for refseq_protein.
# * For parssing use MEGAN and `xml` format, '-outfmt 5'
#
# * safety settings
# "-u" makes it an error to reference a non-existent environment variable
# "pipefail" makes things like misspeled-command raise errors.
set -eu
set -o pipefail
# * adjust terminal colour ---
bold=$(tput bold)
normal=$(tput sgr0)
# * set machine-specific environment ---
if [[ "$HOSTNAME" != "Pauls-MacBook-Pro.local" ]] && [[ "$HOSTNAME" != "mbpro.local" ]]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Blasting on remote...\n"
# load Blast Modules on NESI
module purge
module load BLAST/2.13.0-GCC-11.3.0
module load BLASTDB/2022-10
# check available db versions using the command below and alter the date in the previous line accordingly
# cd "$BLASTDB" && cd .. && ls
# copy BLASTDB to RAM
# check files: `ls "/opt/nesi/db/blast/2022-10"/{nt,taxdb}*`
# check memory requirements `du -bch "/opt/nesi/db/blast/2022-10"/{nt,taxdb}* | tail -n 1`
cp "$BLASTDB"/{nt,taxdb}* "$TMPDIR"/
# refresh / export path and other variables
export BLASTDB="$TMPDIR"/nt
export BASEPATH="/nesi/project/uoo03176/OU_eDNA"
export NOBACKUP="/nesi/nobackup/uoo03176/OU_eDNA"
export CPUS="$SLURM_CPUS_PER_TASK"
elif [[ "$HOSTNAME" == "mbpro.local" ]] || [[ "$HOSTNAME" == "macmini-fastpost.local" ]]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Blasting on local...\n"
# export path and other variables
export BLASTDB="/Volumes/HGST1TB/Users/paul/Sequences/References/blastdb/nt"
export BASEPATH="/Users/paul/Documents/OU_eDNA"
export NOBACKUP="/Volumes/HGST1TB/Users/paul/Documents/OU_eDNA"
export CPUS="2"
fi
# * Copy files to scratch ---
if [ -d "$BASEPATH" ]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Syncing files from \"$BASEPATH\" to \"$NOBACKUP\"...\n"
rsync -azi --delete-after --progress "$BASEPATH" "$(dirname "$NOBACKUP")"
fi
# * Decompress negative GI list ---
if [ ! -f "$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt ]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Decompressing neagtive GI List...\n"
gzip -dc \
"$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt.gz >> \
"$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt
fi
# * find all .fasta files incl. their paths and put them into an array
# diagnostic message
printf "\n${bold}$(date):${normal} Finding fasta files...\n"
inpth_seq_unsorted=()
while IFS= read -r -d $'\0'; do
inpth_seq_unsorted+=("$REPLY")
done < <(find "$NOBACKUP/201126_preprocessing/qiime" -type f \( -name "*.fasta.gz" \) -print0)
# * sort array
IFS=$'\n' inpth_seq=($(sort <<<"${inpth_seq_unsorted[*]}"))
unset IFS
# * print sorted input filenames
# printf '%s\n' "${inpth_seq[@]}"
# * loop over array of fasta files, create result file, call Blast
for fasta in "${inpth_seq[@]}";do
# create Blast results file name
filename=$(dirname "$fasta")
src_dir=$(basename "$fasta")
tmp_file="751${src_dir:3}"
tgt_file="${tmp_file%%.*}_blast-noenv.xml"
# test Blast results file name creation
# printf "$fasta\n"
# printf "$NOBACKUP"/201126_preprocessing/blast/"$tgt_file.gz\n"
# exit
if [ ! -f "$NOBACKUP"/201126_preprocessing/blast/"$tgt_file".gz ]; then
# Diagnostic message
printf "\n${bold}$(date):${normal} Querying \"$fasta\" against \"$BLASTDB\"...\n" && \
gzip -dc "$fasta" | \
blastn \
-db "$BLASTDB" \
-task blastn \
-evalue 1e-10 \
-max_hsps 5 \
-outfmt 5 \
-max_target_seqs 5 \
-out "$NOBACKUP"/201126_preprocessing/blast/"$tgt_file" \
-num_threads "$CPUS" \
-qcov_hsp_perc 95 \
-perc_identity 75 \
-negative_gilist "$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt && \
printf "\n${bold}$(date):${normal} Blast finished writing to \"$NOBACKUP/201126_preprocessing/blast/$tgt_file\".\n" || \
{ printf "\n${bold}$(date):${normal} Blastn failed on \"$fasta\". \n" ; exit 1; }
printf "\n${bold}$(date):${normal} Compressing \"$NOBACKUP/201126_preprocessing/blast/$tgt_file\".\n"
gzip "$NOBACKUP"/201126_preprocessing/blast/"$tgt_file"
fi
done
# * Erase decompressed negative GI list ---
if [ -f "$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt ]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Erasing decompressed neagtive GI List...\n"
rm "$NOBACKUP"/201126_preprocessing/blast/221027_gi_list_environmental.txt
fi
# * Copy files from scratch ---
if [ -d "$NOBACKUP" ]; then
# diagnostic message
printf "\n${bold}$(date):${normal} Syncing files from \"$NOBACKUP\" to \"$BASEPATH\"...\n"
rsync -avzui --progress --delete-after --delete-after --progress "$NOBACKUP" "$(dirname "$BASEPATH")"
fi
exit