/*
===============================================================================
Nextflow pipeline using QIIME 2 to process PacBio HiFi (CCS) data via the DADA2 plugin.
Takes in demultiplexed 16S amplicon sequencing FASTQ files.
===============================================================================
Author: Khi Pin, Chua
Updated: 2023-3-31
*/
nextflow.enable.dsl=2
def helpMessage() {
log.info"""
Usage:
This pipeline takes in the standard sample manifest and metadata file used in
QIIME 2 and produces a QC summary, taxonomy classification results and visualizations.
The samples TSV requires two columns named "sample-id" and "absolute-filepath".
The metadata TSV requires at least two columns named "sample_name" and
"condition" to separate samples into different groups.
nextflow run main.nf --input samples.tsv --metadata metadata.tsv \\
--dada2_cpu 8 --vsearch_cpu 8
By default, sequences are first trimmed with cutadapt. If primers have already been trimmed,
you can skip cutadapt by specifying "--skip_primer_trim".
Other important options:
--front_p Forward primer sequence. Defaults to F27. (default: AGRGTTYGATYMTGGCTCAG)
--adapter_p Reverse primer sequence. Defaults to R1492. (default: AAGTCGTAACAAGGTARCY)
--filterQ Filter input reads above this Q value (default: 20).
--downsample Limit each sample to a maximum of N reads if there are more than N reads (default: 0, i.e. disabled)
--max_ee DADA2 max_EE parameter. Reads with number of expected errors higher than
this value will be discarded (default: 2)
--minQ DADA2 minQ parameter. Reads with any base lower than this score
will be removed (default: 0)
--min_len Minimum length of sequences to keep (default: 1000)
--max_len Maximum length of sequences to keep (default: 1600)
--pooling_method QIIME 2 pooling method for DADA2 denoising; see the QIIME 2
documentation for more details (default: "pseudo", alternative: "independent")
--maxreject max-reject parameter for VSEARCH taxonomy classification method in QIIME 2
(default: 100)
--maxaccept max-accept parameter for VSEARCH taxonomy classification method in QIIME 2
(default: 100)
--min_asv_totalfreq Total frequency of any ASV must be above this threshold
across all samples to be retained. Set this to 0 to disable filtering
(default 5)
--min_asv_sample ASV must exist in at least this many samples to be retained.
Set this to 0 to disable. (default 1)
--vsearch_identity Minimum identity to be considered a hit (default 0.97)
--rarefaction_depth Rarefaction curve "max-depth" parameter. By default the pipeline
automatically selects a cut-off covering more than 80% of the samples
based on their denoised read counts. This cut-off is stored in a file
named "rarefaction_depth_suggested.txt" in the results folder
(default: null)
--dada2_cpu Number of threads for DADA2 denoising (default: 8)
--vsearch_cpu Number of threads for VSEARCH taxonomy classification (default: 8)
--cutadapt_cpu Number of threads for primer removal using cutadapt (default: 16)
--outdir Output directory name (default: "results")
--vsearch_db Location of VSEARCH database (e.g. silva-138-99-seqs.qza can be
downloaded from QIIME database)
--vsearch_tax Location of VSEARCH database taxonomy (e.g. silva-138-99-tax.qza can be
downloaded from QIIME database)
--silva_db Location of Silva 138 database for taxonomy classification
--gtdb_db Location of GTDB r202 for taxonomy classification
--refseq_db Location of RefSeq+RDP database for taxonomy classification
--skip_primer_trim Skip all primer trimming (switches off cutadapt and DADA2 primer
removal) (default: trim with cutadapt)
--skip_nb Skip Naive-Bayes classification (only uses VSEARCH) (default: false)
--colorby Columns in metadata TSV file to use for coloring the MDS plot
in HTML report (default: condition)
--run_picrust2 Run PICRUSt2 pipeline. Note that pathway inference with 16S using PICRUSt2
has not been tested systematically (default: false)
--download_db Download databases needed for taxonomy classification only. Will not
run the pipeline. Databases will be downloaded to a folder "databases"
in the Nextflow pipeline directory.
--publish_dir_mode Output mode based on the Nextflow "publishDir" directive. Specify "copy"
if hard copies are required. (default: symlink)
--version Output version
"""
}
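// Illustrative layout of the two input TSVs described in the help text above.
// Columns are tab-separated; the paths and group names here are hypothetical examples.
//
// samples.tsv:
//   sample-id    absolute-filepath
//   sample1      /path/to/sample1.fastq.gz
//
// metadata.tsv:
//   sample_name  condition
//   sample1      control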
// Show help message
params.help = false
if (params.help) exit 0, helpMessage()
params.version = false
version = "0.7"
if (params.version) exit 0, log.info("$version")
params.download_db = false
params.skip_primer_trim = false
params.skip_nb = false
params.run_picrust2 = false
params.filterQ = 20
params.min_len = 1000
params.max_len = 1600
params.colorby = "condition"
params.skip_phylotree = false
params.minQ = 0
// Downsample to N reads; the default of 0 disables downsampling
params.downsample = 0
// Check input
params.input = false
if (params.input){
n_sample=file(params.input).countLines() - 1
if (n_sample == 1) {
params.min_asv_totalfreq = 0
params.min_asv_sample = 0
println("Only 1 sample. min_asv_sample and min_asv_totalfreq set to 0.")
} else {
params.min_asv_totalfreq = 5
params.min_asv_sample = 1
}
} else {
println("No input file given to --input!")
n_sample=0
params.min_asv_totalfreq = 0
params.min_asv_sample = 0
}
if (params.skip_primer_trim) {
params.front_p = 'none'
params.adapter_p = 'none'
trim_cutadapt = "No"
} else {
trim_cutadapt = "Yes"
// These are the default V1-V9 full-length 16S primers
params.front_p = 'AGRGTTYGATYMTGGCTCAG'
params.adapter_p = 'AAGTCGTAACAAGGTARCY'
}
// Hidden parameters in case primers need to be trimmed with the DADA2 removePrimers function
// StrainID primers
// params.front_p = 'AGRRTTYGATYHTDGYTYAG'
// params.adapter_p = 'AGTACYRHRARGGAANGR'
params.pooling_method = 'pseudo'
params.vsearch_db = "$projectDir/databases/GTDB_ssu_all_r207.qza"
params.vsearch_tax = "$projectDir/databases/GTDB_ssu_all_r207.taxonomy.qza"
params.silva_db = "$projectDir/databases/silva_nr99_v138.1_wSpecies_train_set.fa.gz"
params.refseq_db = "$projectDir/databases/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz"
params.gtdb_db = "$projectDir/databases/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz"
params.maxreject = 100
params.maxaccept = 100
params.rarefaction_depth = null
params.dada2_cpu = 8
params.vsearch_cpu = 8
params.vsearch_identity = 0.97
params.outdir = "results"
params.max_ee = 2
params.rmd_vis_biom_script= "$projectDir/scripts/visualize_biom.Rmd"
// Helper script for biom vis script
params.rmd_helper = "$projectDir/scripts/import_biom.R"
params.cutadapt_cpu = 16
params.primer_fasta = "$projectDir/scripts/16S_primers.fasta"
params.dadaCCS_script = "$projectDir/scripts/run_dada_2023.2.R"
params.dadaAssign_script = "$projectDir/scripts/dada2_assign_tax.R"
params.publish_dir_mode = "symlink"
log_text = """
Parameters set for pb-16S-nf pipeline for PacBio HiFi 16S
=========================================================
Number of samples in samples TSV: $n_sample
Filter input reads above Q: $params.filterQ
Trim primers with cutadapt: $trim_cutadapt
Limit to N reads if exceeding N reads (0 = disabled): $params.downsample
Forward primer: $params.front_p
Reverse primer: $params.adapter_p
Minimum amplicon length filtered in DADA2: $params.min_len
Maximum amplicon length filtered in DADA2: $params.max_len
maxEE parameter for DADA2 filterAndTrim: $params.max_ee
minQ parameter for DADA2 filterAndTrim: $params.minQ
Pooling method for DADA2 denoise process: $params.pooling_method
Minimum number of samples required to keep any ASV: $params.min_asv_sample
Minimum number of reads required to keep any ASV: $params.min_asv_totalfreq
Taxonomy sequence database for VSEARCH: $params.vsearch_db
Taxonomy annotation database for VSEARCH: $params.vsearch_tax
Skip Naive Bayes classification: $params.skip_nb
SILVA database for Naive Bayes classifier: $params.silva_db
GTDB database for Naive Bayes classifier: $params.gtdb_db
RefSeq + RDP database for Naive Bayes classifier: $params.refseq_db
VSEARCH maxreject: $params.maxreject
VSEARCH maxaccept: $params.maxaccept
VSEARCH perc-identity: $params.vsearch_identity
QIIME 2 rarefaction curve sampling depth: $params.rarefaction_depth
Number of threads specified for cutadapt: $params.cutadapt_cpu
Number of threads specified for DADA2: $params.dada2_cpu
Number of threads specified for VSEARCH: $params.vsearch_cpu
Script location for HTML report generation: $params.rmd_vis_biom_script
Container enabled via docker/singularity: $params.enable_container
Version of Nextflow pipeline: $version
"""
log.info(log_text)
// Save parameters into file
process write_log{
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
publishDir "$params.outdir", mode: params.publish_dir_mode
label 'cpu_def'
input:
val(logs)
output:
path "parameters.txt"
script:
"""
echo '$logs' > parameters.txt
"""
}
// QC before cutadapt
process QC_fastq {
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
label 'cpu8'
publishDir "$params.outdir/filtered_input_FASTQ", pattern: '*filterQ*.fastq.gz', mode: params.publish_dir_mode
input:
tuple val(sampleID), path(sampleFASTQ)
output:
path "${sampleID}.seqkit.readstats.tsv", emit: all_seqkit_stats
path "${sampleID}.seqkit.summarystats.tsv", emit: all_seqkit_summary
tuple val(sampleID), path("${sampleID}.filterQ${params.filterQ}.fastq.gz"), emit: filtered_fastq
path("${sampleID}.filterQ${params.filterQ}.fastq.gz"), emit: filtered_fastq_files
script:
if (params.downsample > 0){
"""
seqkit fx2tab -j $task.cpus -q --gc -l -H -n -i $sampleFASTQ |\
csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.readstats.tsv
seqkit stats -T -j $task.cpus -a ${sampleFASTQ} |\
csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.summarystats.tsv
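# Keep reads with average quality >= the filterQ cut-off, then take the first N reads to downsample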
seqkit seq -j $task.cpus --min-qual $params.filterQ $sampleFASTQ |\
seqkit head -n $params.downsample --out-file ${sampleID}.filterQ${params.filterQ}.fastq.gz
"""
} else {
"""
seqkit fx2tab -j $task.cpus -q --gc -l -H -n -i $sampleFASTQ |\
csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.readstats.tsv
seqkit stats -T -j $task.cpus -a ${sampleFASTQ} |\
csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.summarystats.tsv
seqkit seq -j $task.cpus --min-qual $params.filterQ $sampleFASTQ --out-file ${sampleID}.filterQ${params.filterQ}.fastq.gz
"""
}
}
// Trim full length 16S primers with cutadapt
process cutadapt {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/trimmed_primers_FASTQ", pattern: '*.fastq.gz', mode: params.publish_dir_mode
publishDir "$params.outdir/cutadapt_summary", pattern: '*.report', mode: params.publish_dir_mode
cpus params.cutadapt_cpu
input:
tuple val(sampleID), path(sampleFASTQ)
output:
tuple val(sampleID), path("${sampleID}.trimmed.fastq.gz"), emit: cutadapt_fastq
path "*.report", emit: cutadapt_summary
path "cutadapt_summary_${sampleID}.tsv", emit: summary_tocollect
path("${sampleID}.trimmed.fastq.gz"), emit: cutadapt_fastq_files
script:
"""
cutadapt -g "${params.front_p}...${params.adapter_p}" \
${sampleFASTQ} \
-o ${sampleID}.trimmed.fastq.gz \
-j ${task.cpus} --trimmed-only --revcomp -e 0.1 \
--json ${sampleID}.cutadapt.report
input_read=`jq -r '.read_counts | .input' ${sampleID}.cutadapt.report`
demux_read=`jq -r '.read_counts | .output' ${sampleID}.cutadapt.report`
echo -e "sample\tinput_reads\tdemuxed_reads" > cutadapt_summary_${sampleID}.tsv
echo -e "${sampleID}\t\$input_read\t\$demux_read" >> cutadapt_summary_${sampleID}.tsv
"""
}
// QC after cutadapt
process QC_fastq_post_trim {
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
label 'cpu8'
publishDir "$params.outdir/filtered_input_FASTQ", pattern: '*post_trim.tsv', mode: params.publish_dir_mode
input:
tuple val(sampleID), path(sampleFASTQ)
output:
path "${sampleID}.seqkit.readstats.post_trim.tsv", emit: all_seqkit_stats
script:
"""
seqkit fx2tab -j $task.cpus -q --gc -l -H -n -i $sampleFASTQ |\
csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.readstats.post_trim.tsv
"""
}
// Collect QC into single files
process collect_QC {
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
publishDir "$params.outdir/results/reads_QC", mode: params.publish_dir_mode
label 'cpu8'
input:
path "*"
path "*"
path "*"
path "*"
output:
path "all_samples_seqkit.readstats.tsv", emit: all_samples_readstats
path "all_samples_seqkit.readstats.post_trim.tsv", emit: all_samples_readstats_post_trim
path "all_samples_seqkit.summarystats.tsv", emit: all_samples_summarystats
path "seqkit.summarised_stats.group_by_samples.tsv", emit: summarised_sample_readstats
path "seqkit.summarised_stats.group_by_samples.pretty.tsv"
path "all_samples_cutadapt_stats.tsv", emit: cutadapt_summary
script:
"""
csvtk concat -t -C '%' *.seqkit.readstats.tsv > all_samples_seqkit.readstats.tsv
csvtk concat -t -C '%' *.seqkit.readstats.post_trim.tsv > all_samples_seqkit.readstats.post_trim.tsv
csvtk concat -t -C '%' *.seqkit.summarystats.tsv > all_samples_seqkit.summarystats.tsv
csvtk concat -t cutadapt_summary*.tsv > all_samples_cutadapt_stats.tsv
# Summary read_qual for each sample
csvtk summary -t -C '%' -g sample -f length:q1,length:q3,length:median,avg.qual:q1,avg.qual:q3,avg.qual:median all_samples_seqkit.readstats.tsv > seqkit.summarised_stats.group_by_samples.tsv
csvtk pretty -t -C '%' seqkit.summarised_stats.group_by_samples.tsv > seqkit.summarised_stats.group_by_samples.pretty.tsv
"""
}
// Collect QC into single files if skipping cutadapt
process collect_QC_skip_cutadapt {
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
publishDir "$params.outdir/results/reads_QC", mode: params.publish_dir_mode
label 'cpu8'
input:
path "*"
path "*"
output:
path "all_samples_seqkit.readstats.tsv", emit: all_samples_readstats
path "all_samples_seqkit.summarystats.tsv", emit: all_samples_summarystats
path "seqkit.summarised_stats.group_by_samples.tsv", emit: summarised_sample_readstats
path "seqkit.summarised_stats.group_by_samples.pretty.tsv"
script:
"""
csvtk concat -t -C '%' *.seqkit.readstats.tsv > all_samples_seqkit.readstats.tsv
csvtk concat -t -C '%' *.seqkit.summarystats.tsv > all_samples_seqkit.summarystats.tsv
# Summary read_qual for each sample
csvtk summary -t -C '%' -g sample -f length:q1,length:q3,length:median,avg.qual:q1,avg.qual:q3,avg.qual:median all_samples_seqkit.readstats.tsv > seqkit.summarised_stats.group_by_samples.tsv
csvtk pretty -t -C '%' seqkit.summarised_stats.group_by_samples.tsv > seqkit.summarised_stats.group_by_samples.pretty.tsv
"""
}
// Put all trimmed samples into a single file for QIIME2 import
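// If the metadata TSV contains an optional "pool" column, the manifest built below is
// split by pool so each pool is denoised separately. Hypothetical example (tab-separated):
//   sample_name  condition  pool
//   sample1      control    pool1
//   sample2      treated    pool2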
process prepare_qiime2_manifest {
label 'cpu_def'
publishDir "$params.outdir/results/", mode: params.publish_dir_mode
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
input:
val(samplesFASTQ)
path(metadata)
output:
path "samplefile*.txt", emit: sample_trimmed_file
"""
echo -e "sample-id\tabsolute-filepath" > samplefile.txt
echo -e "$samplesFASTQ" | sed -e 's/\\[//g' -e 's/\\]//g' | sed -r 's/\s+//g' | tr ',' '\\n' | split -l 2 - sample_ind_
for i in \$(ls sample_ind_*); do sample=\$(cat \${i} | tr '\\n' '\\t' | cut -f1); fastq=\$(basename \$(cat \${i} | tr '\\n' '\\t' | cut -f2)); echo -e "\${sample}\t\${fastq}"; done >> samplefile.txt
rm -f sample_ind_*
# If pool column exists, split sample files
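# Join the manifest with the pool column from the metadata, then split it into
# one manifest per pool so each pool can be denoised separately and merged later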
poolyes=\$(csvtk headers -t $metadata | grep pool | wc -l)
if [[ \$poolyes -eq 1 ]]
then
csvtk join -t samplefile.txt <(csvtk cut -t -f sample_name,pool $metadata) -f "sample-id;sample_name" | csvtk split -t -f pool
counter=1
for i in \$(ls stdin*.tsv); do csvtk cut -t -f -pool \${i} > samplefile\${counter}.txt; rm -f \${i}; let counter++; done
rm -f samplefile.txt
fi
"""
}
// Put all trimmed samples into a single file for QIIME2 import if skipping cutadapt
process prepare_qiime2_manifest_skip_cutadapt {
label 'cpu_def'
publishDir "$params.outdir/results/", mode: params.publish_dir_mode
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
input:
val(samplesFASTQ)
path(metadata)
output:
path "samplefile*.txt", emit: sample_trimmed_file
"""
echo -e "sample-id\tabsolute-filepath" > samplefile.txt
echo -e "$samplesFASTQ" | sed -e 's/\\[//g' -e 's/\\]//g' | sed -r 's/\s+//g' | tr ',' '\\n' | split -l 2 - sample_ind_
for i in \$(ls sample_ind_*); do sample=\$(cat \${i} | tr '\\n' '\\t' | cut -f1); fastq=\$(basename \$(cat \${i} | tr '\\n' '\\t' | cut -f2)); echo -e "\${sample}\t\${fastq}"; done >> samplefile.txt
rm -f sample_ind_*
# If pool column exists, split sample files
poolyes=\$(csvtk headers -t $metadata | grep pool | wc -l)
if [[ \$poolyes -eq 1 ]]
then
csvtk join -t samplefile.txt <(csvtk cut -t -f sample_name,pool $metadata) -f "sample-id;sample_name" | csvtk split -t -f pool
counter=1
for i in \$(ls stdin*.tsv); do csvtk cut -t -f -pool \${i} > samplefile\${counter}.txt; rm -f \${i}; let counter++; done
rm -f samplefile.txt
fi
"""
}
// Import data into QIIME 2. Note that the `path '*'` input stages all the input
// FASTQ files into the process working directory, so this works on AWS, which has no notion of local directory paths
process import_qiime2 {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/import_qiime", mode: params.publish_dir_mode
label 'cpu_def'
input:
path sample_manifest
path '*'
output:
path 'samples.qza'
script:
"""
# Make sure path is absolute
awk -v wdir="\$(pwd)/" -F\$'\t' '{if (NR>1){print \$1,wdir\$2} else {print \$0}}' OFS=\$'\t' $sample_manifest > sample_list.txt
qiime tools import --type 'SampleData[SequencesWithQuality]' \
--input-path sample_list.txt \
--output-path samples.qza \
--input-format SingleEndFastqManifestPhred33V2
"""
}
// Merge sample manifest file for final vis purpose later on
process merge_sample_manifest {
label 'cpu_def'
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
input:
path sample_manifest
output:
path 'merged_sample_file.txt', emit: merged_samplefile
script:
"""
csvtk concat -t $sample_manifest > merged_sample_file.txt
"""
}
// Summarize demultiplex statistics
process demux_summarize {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/summary_demux", mode: params.publish_dir_mode
label 'cpu_def'
input:
path samples_qza
output:
path "samples.demux.summary.qzv"
path "per-sample-fastq-counts.tsv"
script:
"""
qiime demux summarize --i-data $samples_qza \
--o-visualization samples.demux.summary.qzv
qiime tools export --input-path samples.demux.summary.qzv \
--output-path ./
"""
}
// Denoise into ASVs
process dada2_denoise {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/dada2", mode: params.publish_dir_mode
cpus params.dada2_cpu
input:
path samples_qza
path dada_ccs_script
val(minQ)
output:
path "dada2-ccs_rep*.qza", emit: asv_seq
path "dada2-ccs_table*.qza", emit: asv_freq
path "dada2-ccs_stats*.qza", emit:asv_stats
path "seqtab_nochim*.rds", emit: dada2_rds
path "plot_error_model*.pdf"
script:
"""
# Use custom script that can skip primer trimming
mkdir -p dada2_custom_script
cp $dada_ccs_script dada2_custom_script/run_dada.R
# sed 's/minQ\\ =\\ 0/minQ=$minQ/g' dada2_custom_script/run_dada_ccs_original.R > \
# dada2_custom_script/run_dada_ccs.R
chmod +x dada2_custom_script/run_dada.R
export PATH="./dada2_custom_script:\$PATH"
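# Prepend the custom script directory so the run_dada.R invoked by the q2-dada2
# plugin resolves to our copy instead of the bundled one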
which run_dada.R
qiime dada2 denoise-ccs --i-demultiplexed-seqs $samples_qza \
--o-table dada2-ccs_table.qza \
--o-representative-sequences dada2-ccs_rep.qza \
--o-denoising-stats dada2-ccs_stats.qza \
--p-min-len $params.min_len --p-max-len $params.max_len \
--p-max-ee $params.max_ee \
--p-front 'none' \
--p-adapter 'none' \
--p-n-threads $task.cpus \
--p-pooling-method \'$params.pooling_method\'
"""
}
// Merge the ASV tables and sequences of all chunks (pools), if the run was chunked.
// Take note of the outputs above! Only one set of outputs from the different
// pools will remain in the dada2 folder after dada2_denoise
process mergeASV {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/dada2", mode: params.publish_dir_mode
cpus params.dada2_cpu
input:
path "dada2-ccs_rep*.qza"
path "dada2-ccs_table*.qza"
path "dada2-ccs_stats*.qza"
output:
path "dada2-ccs_rep_merged.qza", emit: asv_seq
path "dada2-ccs_table_merged.qza", emit: asv_freq
path "dada2-ccs_stats_merged.qza", emit: asv_stats
script:
"""
qiime feature-table merge \
--i-tables dada2-ccs_table*.qza \
--o-merged-table dada2-ccs_table_merged.qza
qiime feature-table merge-seqs \
--i-data dada2-ccs_rep*.qza \
--o-merged-data dada2-ccs_rep_merged.qza
for i in \$(ls dada2-ccs_stats*.qza);
do
qiime tools export --input-path \${i} \
--output-path ./\${i%%.qza}_export/
done
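# Concatenate the exported per-pool stats, keeping the two header lines only from the first file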
cat ./*_export/stats.tsv | awk '{if (NR>2 && \$0~/^#|sample-id/) {next} else {print \$0}}' > merged_stats.tsv
qiime tools import --input-path merged_stats.tsv --output-path dada2-ccs_stats_merged --type 'SampleData[DADA2Stats]'
"""
}
// Filter DADA2 ASVs using minimum number of samples and frequency of ASVs
process filter_dada2 {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/dada2", mode: params.publish_dir_mode
cpus params.dada2_cpu
input:
path asv_table
path asv_seq
output:
path "dada2-ccs_rep_filtered.qza", emit: asv_seq
path "dada2-ccs_table_filtered.qza", emit: asv_freq
path "dada2_ASV.fasta", emit:asv_seq_fasta
script:
"""
if [ $params.min_asv_sample -gt 0 ] && [ $params.min_asv_totalfreq -gt 0 ]
then
# Filter ASVs and sequences
qiime feature-table filter-features \
--i-table $asv_table \
--p-min-frequency $params.min_asv_totalfreq \
--p-min-samples $params.min_asv_sample \
--p-filter-empty-samples \
--o-filtered-table dada2-ccs_table_filtered.qza
elif [ $params.min_asv_sample -gt 0 ] && [ $params.min_asv_totalfreq -eq 0 ]
then
qiime feature-table filter-features \
--i-table $asv_table \
--p-min-samples $params.min_asv_sample \
--p-filter-empty-samples \
--o-filtered-table dada2-ccs_table_filtered.qza
elif [ $params.min_asv_sample -eq 0 ] && [ $params.min_asv_totalfreq -gt 0 ]
then
qiime feature-table filter-features \
--i-table $asv_table \
--p-min-frequency $params.min_asv_totalfreq \
--p-filter-empty-samples \
--o-filtered-table dada2-ccs_table_filtered.qza
else
mv $asv_table dada2-ccs_table_filtered.qza
fi
qiime feature-table filter-seqs \
--i-data $asv_seq \
--i-table dada2-ccs_table_filtered.qza \
--o-filtered-data dada2-ccs_rep_filtered.qza
# Export FASTA file for ASVs
qiime tools export --input-path dada2-ccs_rep_filtered.qza \
--output-path .
mv dna-sequences.fasta dada2_ASV.fasta
"""
}
// Assign taxonomy against SILVA, GTDB and RefSeq databases using the DADA2
// assignTaxonomy function (Naive Bayes classifier)
process dada2_assignTax {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", pattern: 'best_tax*', mode: params.publish_dir_mode
publishDir "$params.outdir/nb_tax", mode: params.publish_dir_mode
cpus params.vsearch_cpu
input:
path asv_seq_fasta
path asv_seq
path asv_freq
path silva_db
path gtdb_db
path refseq_db
path assign_script
output:
path "best_tax.qza", emit:best_nb_tax_qza
path "best_taxonomy.tsv", emit: best_nb_tax
path "best_taxonomy_withDB.tsv"
path "best_tax_merged_freq_tax.tsv", emit: best_nb_tax_tsv
path "silva_nb.tsv"
path "gtdb_nb.tsv"
path "refseq_rdp_nb.tsv"
script:
"""
Rscript --vanilla $assign_script $asv_seq_fasta $task.cpus $silva_db $gtdb_db $refseq_db 80
qiime feature-table transpose --i-table $asv_freq \
--o-transposed-feature-table transposed-asv.qza
qiime tools import --type "FeatureData[Taxonomy]" \
--input-format "TSVTaxonomyFormat" \
--input-path best_taxonomy.tsv --output-path best_tax.qza
qiime metadata tabulate --m-input-file $asv_seq \
--m-input-file best_tax.qza \
--m-input-file transposed-asv.qza \
--o-visualization merged_freq_tax.qzv
qiime tools export --input-path merged_freq_tax.qzv \
--output-path merged_freq_tax_tsv
mv merged_freq_tax_tsv/metadata.tsv best_tax_merged_freq_tax.tsv
"""
}
// QC summary for dada2
process dada2_qc {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", mode: params.publish_dir_mode
label 'cpu_def'
input:
path asv_stats
path asv_freq
path metadata
output:
path "dada2_stats.qzv", emit: dada2_stats
path "dada2_table.qzv", emit: dada2_table
path "stats.tsv", emit: dada2_stats_tsv
path "dada2_qc.tsv", emit: dada2_qc_tsv
env(int_rarefaction_d), emit: rarefaction_depth
path "rarefaction_depth_suggested.txt"
env(int_alpha_d), emit:alpha_depth
script:
"""
qiime metadata tabulate --m-input-file $asv_stats \
--o-visualization dada2_stats.qzv
qiime feature-table summarize --i-table $asv_freq \
--o-visualization dada2_table.qzv \
--m-sample-metadata-file $metadata
qiime tools export --input-path dada2_table.qzv \
--output-path dada2_table_summary
qiime tools export --input-path $asv_stats \
--output-path ./
# Get the per-sample read depth covering at least 80% of samples
number=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | wc -l`
eighty=0.8
# Handle fewer than 5 samples separately: the alpha curve depth uses the lowest
# 20% of samples, and with fewer than 5 samples that calculation truncates to 0
if [ \${number} -le 2 ];
then
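# With <=2 samples, use the minimum per-sample depth for both the rarefaction
# depth and the alpha curve depth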
rarefaction_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
tail -n1 | cut -f2 -d,`
int_rarefaction_d=\${rarefaction_d%%.*}
echo \${int_rarefaction_d} > rarefaction_depth_suggested.txt
alpha_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
tail -n1 | cut -f2 -d,`
int_alpha_d=\${alpha_d%%.*}
elif [ \${number} -gt 2 ] && [ \${number} -lt 5 ];
then
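# With 3-4 samples, the rarefaction depth covers roughly 80% of samples; the
# alpha curve depth still uses the minimum per-sample depth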
result=`echo "(\$number * \$eighty)" | bc -l`
rarefaction_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
head -n \${result%%.*} | tail -n1 | cut -f2 -d,`
int_rarefaction_d=\${rarefaction_d%%.*}
echo \${int_rarefaction_d} > rarefaction_depth_suggested.txt
alpha_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
tail -n1 | cut -f2 -d,`
int_alpha_d=\${alpha_d%%.*}
else
result=`echo "(\$number * \$eighty)" | bc -l`
rarefaction_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
head -n \${result%%.*} | tail -n1 | cut -f2 -d,`
int_rarefaction_d=\${rarefaction_d%%.*}
echo \${int_rarefaction_d} > rarefaction_depth_suggested.txt
# Use lowest depth covering just 20% of sample for alpha rarefaction curve
alpha_res=`echo "(\$number * 0.2)" | bc -l`
alpha_d=`tail -n+2 dada2_table_summary/sample-frequency-detail.csv | sort -t, -k2 -nr | \
head -n \${alpha_res%%.*} | tail -n1 | cut -f2 -d,`
int_alpha_d=\${alpha_d%%.*}
fi
qiime tools export --input-path dada2_stats.qzv \
--output-path ./dada2_stats
mv dada2_stats/metadata.tsv dada2_qc.tsv
"""
}
process qiime2_phylogeny_diversity {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results/phylogeny_diversity", mode: params.publish_dir_mode
label 'cpu8'
input:
path metadata
path asv_seq
path asv_freq
val(rarefaction_depth)
output:
path "*.qza"
path "core-metrics-diversity"
path "phylotree_mafft_rooted.nwk", emit:qiime2_tree
path "core-metrics-diversity/bray_curtis_distance_matrix.tsv", emit:bray_mat
path "core-metrics-diversity/weighted_unifrac_distance_matrix.tsv", emit:wunifrac_mat
path "core-metrics-diversity/unweighted_unifrac_distance_matrix.tsv", emit:unifrac_mat
script:
if (n_sample > 1)
"""
qiime phylogeny align-to-tree-mafft-fasttree \
--p-n-threads $task.cpus \
--i-sequences $asv_seq \
--o-alignment mafft_alignment.qza \
--o-masked-alignment mafft_alignment_masked.qza \
--o-tree phylotree_mafft_unrooted.qza \
--o-rooted-tree phylotree_mafft_rooted.qza
qiime tools export --input-path phylotree_mafft_rooted.qza \
--output-path ./
mv tree.nwk phylotree_mafft_rooted.nwk
qiime diversity core-metrics-phylogenetic \
--p-n-jobs-or-threads $task.cpus \
--i-phylogeny phylotree_mafft_rooted.qza \
--i-table $asv_freq \
--m-metadata-file $metadata \
--p-sampling-depth $rarefaction_depth \
--output-dir ./core-metrics-diversity
# Export various matrix for plotting later
qiime tools export --input-path ./core-metrics-diversity/bray_curtis_distance_matrix.qza \
--output-path ./core-metrics-diversity
mv ./core-metrics-diversity/distance-matrix.tsv \
./core-metrics-diversity/bray_curtis_distance_matrix.tsv
qiime tools export --input-path ./core-metrics-diversity/weighted_unifrac_distance_matrix.qza \
--output-path ./core-metrics-diversity
mv ./core-metrics-diversity/distance-matrix.tsv \
./core-metrics-diversity/weighted_unifrac_distance_matrix.tsv
qiime tools export --input-path ./core-metrics-diversity/unweighted_unifrac_distance_matrix.qza \
--output-path ./core-metrics-diversity
mv ./core-metrics-diversity/distance-matrix.tsv \
./core-metrics-diversity/unweighted_unifrac_distance_matrix.tsv
"""
else
"""
qiime phylogeny align-to-tree-mafft-fasttree \
--p-n-threads $task.cpus \
--i-sequences $asv_seq \
--o-alignment mafft_alignment.qza \
--o-masked-alignment mafft_alignment_masked.qza \
--o-tree phylotree_mafft_unrooted.qza \
--o-rooted-tree phylotree_mafft_rooted.qza
qiime tools export --input-path phylotree_mafft_rooted.qza \
--output-path ./
mv tree.nwk phylotree_mafft_rooted.nwk
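# With a single sample, beta diversity is undefined; create empty placeholder
# matrices so downstream processes still receive the expected files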
mkdir ./core-metrics-diversity
touch ./core-metrics-diversity/bray_curtis_distance_matrix.tsv \
./core-metrics-diversity/weighted_unifrac_distance_matrix.tsv \
./core-metrics-diversity/unweighted_unifrac_distance_matrix.tsv
"""
}
// Rarefaction visualization
process dada2_rarefaction {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", mode: params.publish_dir_mode
label 'cpu_def'
input:
path asv_freq
path metadata
val(alpha_d)
output:
path "*"
script:
"""
qiime diversity alpha-rarefaction --i-table $asv_freq \
--m-metadata-file $metadata \
--o-visualization alpha-rarefaction-curves.qzv \
--p-min-depth 10 --p-max-depth $alpha_d
"""
}
// Classify taxonomy and export table using VSEARCH approach
process class_tax {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", mode: params.publish_dir_mode
cpus params.vsearch_cpu
input:
path asv_seq
path asv_freq
path vsearch_db
path vsearch_tax
output:
path "taxonomy.vsearch.qza", emit: tax_vsearch
path "tax_export/taxonomy.tsv", emit: tax_tsv
path "merged_freq_tax.qzv", emit: tax_freq_tab
path "vsearch_merged_freq_tax.tsv", emit: tax_freq_tab_tsv
path "taxonomy.vsearch.blast_results.qza", emit: tax_vsearch_blast
script:
"""
qiime feature-classifier classify-consensus-vsearch --i-query $asv_seq \
--o-classification taxonomy.vsearch.qza \
--i-reference-reads $vsearch_db \
--i-reference-taxonomy $vsearch_tax \
--p-threads $task.cpus \
--p-maxrejects $params.maxreject \
--p-maxaccepts $params.maxaccept \
--p-perc-identity $params.vsearch_identity \
--p-top-hits-only \
--o-search-results taxonomy.vsearch.blast_results.qza
qiime tools export --input-path taxonomy.vsearch.qza --output-path tax_export
qiime feature-table transpose --i-table $asv_freq \
--o-transposed-feature-table transposed-asv.qza
qiime metadata tabulate --m-input-file $asv_seq \
--m-input-file taxonomy.vsearch.qza \
--m-input-file transposed-asv.qza \
--o-visualization merged_freq_tax.qzv
qiime tools export --input-path merged_freq_tax.qzv \
--output-path merged_freq_tax_tsv
mv merged_freq_tax_tsv/metadata.tsv vsearch_merged_freq_tax.tsv
"""
}
// Export results into biom for use with phyloseq
process export_biom {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", mode: params.publish_dir_mode
label 'cpu_def'
input:
path asv_freq
path tax_tsv
path tax_tsv_vsearch
output:
path "feature-table-tax.biom", emit:biom
path "feature-table-tax_vsearch.biom", emit:biom_vsearch
script:
"""
qiime tools export --input-path $asv_freq --output-path asv_freq/
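# Rename the taxonomy TSV headers to the ones expected by 'biom add-metadata'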
sed 's/Feature ID/#OTUID/' $tax_tsv | sed 's/Taxon/taxonomy/' | \
sed 's/Consensus/confidence/' > biom-taxonomy.tsv
biom add-metadata -i asv_freq/feature-table.biom \
-o feature-table-tax.biom \
--observation-metadata-fp biom-taxonomy.tsv \
--sc-separated taxonomy
sed 's/Feature ID/#OTUID/' $tax_tsv_vsearch | sed 's/Taxon/taxonomy/' | \
sed 's/Consensus/confidence/' > biom-taxonomy_vsearch.tsv
biom add-metadata -i asv_freq/feature-table.biom \
-o feature-table-tax_vsearch.biom \
--observation-metadata-fp biom-taxonomy_vsearch.tsv \
--sc-separated taxonomy
"""
}
// Export results into biom for use with phyloseq
process export_biom_skip_nb {
conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null)
container "kpinpb/pb-16s-nf-qiime:v0.7"
publishDir "$params.outdir/results", mode: params.publish_dir_mode
label 'cpu_def'
input:
path asv_freq
path tax_tsv_vsearch
output:
path "feature-table-tax_vsearch.biom", emit:biom_vsearch
script:
"""
qiime tools export --input-path $asv_freq --output-path asv_freq/
sed 's/Feature ID/#OTUID/' $tax_tsv_vsearch | sed 's/Taxon/taxonomy/' | \
sed 's/Consensus/confidence/' > biom-taxonomy_vsearch.tsv
biom add-metadata -i asv_freq/feature-table.biom \
-o feature-table-tax_vsearch.biom \
--observation-metadata-fp biom-taxonomy_vsearch.tsv \
--sc-separated taxonomy
"""
}
process picrust2 {
conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null)
container "kpinpb/pb-16s-nf-tools:latest"
label 'cpu32'
publishDir "$params.outdir/results", mode: params.publish_dir_mode
input:
path dada2_asv
path vsearch_biom
output:
path "picrust2"
script:
"""
picrust2_pipeline.py -s $dada2_asv -i $vsearch_biom \
-o picrust2 \
--in_traits EC,KO \