-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
191 lines (158 loc) · 5.35 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// If the user uses the --help flag, print the help text below
params.help = false
// Function which prints help message text
def helpMessage() {
log.info"""
Blast sequences against a database
Required Arguments:
Multiple files:
--seedfile CSV file with sample_name and fasta_file columns; redundant with --query and --sample_name
One sample/file at a time:
--query Query file in fasta format; redundant with --seedfile
--sample_name Sample name; redundant with --seedfile
Blast arguments:
--db Blast database
--blast_type Which blast would you like to run? (default: ${params.blast_type})
--prefix Output prefix (default: ${params.prefix})
--project Folder to place analysis outputs (default: ${params.project})
Options
--chunksize Number of sequences to be processed per node (default: ${params.chunksize})
""".stripIndent()
}
// Show help message if the user specifies the --help flag at runtime
if (params.help){
// Invoke the function above which prints the help message
helpMessage()
// Exit out and do not run anything else
exit 0
}
// // Show help message if the user specifies a fasta file but not makedb or db
if ((params.db == null) || (params.blast_type == null)){
// Invoke the function above which prints the help message
helpMessage()
// Exit out and do not run anything else
exit 1
}
if ((params.query == null) && (params.seedfile == null)){
// Invoke the function above which prints the help message
helpMessage()
// Exit out and do not run anything else
exit 1
}
// Write a function to read the db parameter and get the full path from databases json file
// and error if database does not exist
def db_map = [
"nt":"/mnt/efs/databases/Blast/nt/db/nt",
"nr":"/mnt/efs/databases/Blast/nr/db/nr",
"ncbi_16s":"/mnt/efs/databases/Blast/16S_ribosomal_RNA/db/16S_ribosomal_RNA",
"cdd":"/mnt/efs/databases/Blast/cdd/db/cdd",
"silva":"/mnt/efs/databases/Blast/Silva/v138.1/silva138",
"silva_nr":"/mnt/efs/databases/Blast/Silva/v138.1/silva138_nr",
"immeDB":"/mnt/efs/databases/Blast/immeDB/db/immeDB",
]
def db_path = null
if (db_map[params.db]){
db_path = db_map[params.db]
log.info"""Using database at location ${db_path}""".stripIndent()
} else {
log.info"""
Cannot find the database specified by --db ${params.db}. Must use one of:
""".stripIndent()
db_map.each { key, value -> log.info "$key"}
exit 0
}
// Base path for all output files
basepath = params.outdir + "/" + params.project
/*
* Executes a BLAST job for each chunk emitted by the 'fasta_ch' channel
*/
process BLAST {
tag { params.sample_name }
cpus 2
memory 8.GB
input:
path 'query.fa'
output:
// file 'blast_result' into hits_ch
path 'blast_result' emit
script:
"""
${params.blast_type} \
-num_threads $task.cpus \
-query query.fa \
-db $db_path \
-dbsize ${params.dbsize} \
-num_alignments ${params.max_aln} \
-outfmt ${params.outfmt} > blast_result
"""
}
/*
* Executes a BLAST job for each row/file emitted by the 'fasta_ch' channel
*/
process BLASTS {
tag { name }
cpus 2
memory 8.GB
publishDir "${basepath}/${name}/${params.db}/${params.prefix}", mode: 'copy', pattern: "*.tsv"
input:
tuple val(name), file(fasta_file)
output:
path("${name}.${params.blast_type}.tsv")
script:
"""
${params.blast_type} \
-num_threads $task.cpus \
-query $fasta_file \
-db $db_path \
-dbsize ${params.dbsize} \
-num_alignments ${params.max_aln} \
${params.additional_params} \
-outfmt ${params.outfmt} > $name.${params.blast_type}.tsv
"""
}
process show_downloadable_databases {
// directives
// a container images is required
container "ncbi/blast:latest"
// compute resources for the Batch Job
cpus 1
memory '512 MB'
script:
"""
update_blastdb.pl --source aws --showall > /mnt/efs/databases/NCBI/db_names.list
cat /mnt/efs/databases/NCBI/db_names.list
update_blastdb.pl -h
"""
}
workflow {
if (params.seedfile == null){
//Creates working dir
workingpath = basepath + "/" + params.sample_name + "/" + params.db + "/" + params.prefix
workingdir = file(workingpath)
if( !workingdir.exists() ) {
if( !workingdir.mkdirs() ) {
exit 1, "Cannot create working directory: $workingpath"
}
}
out = "${workingpath}/${params.sample_name}.${params.blast_type}.tsv"
/*
* Given the query parameter creates a channel emitting the query fasta file(s),
* the file is split in chunks containing as many sequences as defined by the parameter 'chunksize'.
* Finally assign the result channel to the variable 'fasta_ch'
*/
fasta_ch = Channel
.fromPath(params.query)
.ifEmpty { exit 1, "Cannot find matching fasta file" }
.splitFasta(by: params.chunksize, file:true)
BLAST(fasta_ch).collectFile(name:out)
} else {
fasta_ch = Channel
.fromPath(params.seedfile)
.ifEmpty { exit 1, "Cannot find any seed file matching: ${params.seedfile}." }
.splitCsv(header: true, sep: ',')
.map{ row -> tuple(row.sample_name, file(row.fasta_file)) }
BLASTS(fasta_ch)
}
}