From 2a41155cc716df15373291bbec1cb7b5370cdf39 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 01:42:02 -0700 Subject: [PATCH 01/14] updated to GTDB r207 for naive bayes classifier --- main.nf | 4 ++-- scripts/dada2_assign_tax.R | 26 +++++++++----------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 6aca67e..386e911 100644 --- a/main.nf +++ b/main.nf @@ -123,7 +123,7 @@ params.vsearch_db = "$projectDir/databases/GTDB_ssu_all_r207.qza" params.vsearch_tax = "$projectDir/databases/GTDB_ssu_all_r207.taxonomy.qza" params.silva_db = "$projectDir/databases/silva_nr99_v138.1_wSpecies_train_set.fa.gz" params.refseq_db = "$projectDir/databases/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz" -params.gtdb_db = "$projectDir/databases/GTDB_bac120_arc122_ssu_r202_fullTaxo.fa.gz" +params.gtdb_db = "$projectDir/databases/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz" params.maxreject = 100 params.maxaccept = 100 params.rarefaction_depth = null @@ -929,7 +929,7 @@ process download_db { echo "Downloading SILVA sequences for VSEARCH..." wget -N --content-disposition 'https://zenodo.org/record/4587955/files/silva_nr99_v138.1_wSpecies_train_set.fa.gz?download=1' echo "Downloading GTDB sequences for VSEARCH..." - wget -N --content-disposition 'https://zenodo.org/record/4735821/files/GTDB_bac120_arc122_ssu_r202_fullTaxo.fa.gz?download=1' + wget -N --content-disposition 'https://zenodo.org/record/6655692/files/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz?download=1' echo "Downloading RefSeq + RDP sequences for VSEARCH..." wget -N --content-disposition 'https://zenodo.org/record/4735821/files/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz?download=1' diff --git a/scripts/dada2_assign_tax.R b/scripts/dada2_assign_tax.R index 1afbcb4..2e805a2 100644 --- a/scripts/dada2_assign_tax.R +++ b/scripts/dada2_assign_tax.R @@ -32,8 +32,8 @@ gtdb_spec <- as.data.frame(gtdb_spec) colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))] <- gsub("tax\\.(.*)", "\\1", colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))]) # Rename species -gtdb_spec[, 'Species'] <- gsub("(.*)\\ (.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) -buf <- data.frame("Assignment" = rep("GTDB r202", nrow(gtdb_spec))) +gtdb_spec[, 'Species'] <- gsub("(.*)_(.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) +buf <- data.frame("Assignment" = rep("GTDB r207", nrow(gtdb_spec))) gtdb_spec <- cbind(gtdb_spec, buf) refseq_spec <- assignTaxonomy(seqs, refFasta = refseq_db, minBoot = minBoot_num, multithread = threads, outputBootstraps = TRUE) @@ -46,37 +46,29 @@ buf <- data.frame("Assignment" = rep("RefSeq + RDP", nrow(refseq_spec))) refseq_spec <- cbind(refseq_spec, buf) # Iteratively join at species level -final_spec <- silva_spec -final_spec[is.na(final_spec[, 'Species']), ] <- gtdb_spec[is.na(final_spec[, 'Species']), ] +final_spec <- gtdb_spec +final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] final_spec[is.na(final_spec[, 'Species']), ] <- refseq_spec[is.na(final_spec[, 'Species']), ] # Iteratively join at genus level -final_spec[is.na(final_spec[, 'Genus']), ] <- gtdb_spec[is.na(final_spec[, 'Genus']), ] +final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] final_spec[is.na(final_spec[, 'Genus']), ] <- refseq_spec[is.na(final_spec[, 'Genus']), ] # uncultured or metagenome -final_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] <- - gtdb_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] 
final_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] <- refseq_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] -final_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] <- - gtdb_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] final_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] <- refseq_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] # uncultured or metagenome at genus level final_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] <- gtdb_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] -final_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] <- - refseq_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] final_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] <- gtdb_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] -final_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] <- - refseq_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] -# If still NA, revert back to Silva -final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] -final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] +# If still NA, revert back to GTDB +final_spec[is.na(final_spec[, 'Species']), ] <- gtdb_spec[is.na(final_spec[, 'Species']), ] +final_spec[is.na(final_spec[, 'Genus']), ] <- gtdb_spec[is.na(final_spec[, 'Genus']), ] rownames(final_spec) <- otu_id # For each row, look for the deepest level assigned and assign confidence value @@ -119,4 +111,4 @@ write.table(to_save, "best_taxonomy.tsv", quote = FALSE, write.table(to_save2, "best_taxonomy_withDB.tsv", quote = FALSE, sep = "\t",row.names = FALSE, - col.names = c("Feature ID", "Taxon", "Confidence", "Assignment Database")) \ No newline at end of file + col.names = c("Feature ID", "Taxon", "Confidence", "Assignment Database")) From 015b95499adcde60ba4a02f35d21f81e89f014e6 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 01:43:26 -0700 Subject: [PATCH 02/14] Updated v0.3 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b6bc0d5..724fac9 100644 --- a/main.nf +++ b/main.nf @@ -81,7 +81,7 @@ def helpMessage() { params.help = false if (params.help) exit 0, helpMessage() params.version = false -version = "0.2" +version = "0.3" if (params.version) exit 0, log.info("$version") params.download_db = false params.skip_primer_trim = false From 27e9431679edcc043862c03c6e5a9a95101b2396 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 02:10:16 -0700 Subject: [PATCH 03/14] Updated documentation on taxonomy classification with GTDB r207 --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3507684..20ea1e4 100644 --- a/README.md +++ b/README.md @@ -247,15 +247,15 @@ primers removal? The "besttax" assignment uses the `assignTaxonomy` Naive-Bayes classifier function from DADA2 to carry out taxonomy assignment. It uses 3 databases to classify the ASVs (requiring a minimum bootstrap of 80 using the minBoot parameter) and the priority of assignment - is Silva 138, followed by GTDB r202, then lastly RefSeq + RDP. This means for example - if an ASV is not assigned at Species level using Silva, it will check if it can be assigned - with GTDB. This ensure we assign as many ASVs as possible. 
+   is GTDB r207, followed by Silva v138, then lastly RefSeq + RDP. This means, for example, that
+   if an ASV is not assigned at Species level using GTDB, the pipeline checks whether it can be
+   assigned with Silva. This ensures we assign as many ASVs as possible.
    This process is done first at Species level, then at Genus level. In addition, if any ASV
-   is assigned as "uncultured" or "metagenome", it will go through the iterative assignment
-   process just like the unclassified ASVs. Note that while this method will assign
-   a high amount of ASVs, there may be issues such as how the taxonomy is annotated
-   in different databases.
+   is assigned as "uncultured" or "metagenome" (Silva has many such entries),
+   it will go through the iterative assignment process just like the unclassified ASVs.
+   Note that while this method will assign a large proportion of the ASVs, the taxonomy
+   annotations may not be fully consistent across the different databases.
    As such, there is also a VSEARCH taxonomy
    classification using GTDB database (r207) only in the file called `results/vsearch_merged_freq_tax.tsv`
    that may provide a more consistent annotation. This uses

From 984a98a87755487e74632a36f90f51965bcaaa8c Mon Sep 17 00:00:00 2001
From: Khi Pin
Date: Wed, 27 Jul 2022 02:26:03 -0700
Subject: [PATCH 04/14] Provide zenodo for db download

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 20ea1e4..c986ae9 100644
--- a/README.md
+++ b/README.md
@@ -285,7 +285,8 @@ primers removal?

 * Can I download the databases for taxonomic classification manually?

   The taxonomy classification step of the pipeline requires a few databases that will be downloaded with the
-  `--download_db` parameters into a "databases" folder. These databases can also be downloaded
+  `--download_db` parameter into a "databases" folder. All the databases are also
+  collected on [Zenodo](https://zenodo.org/record/6912512). These databases can also be downloaded
   manually from the following links if the download script above does not work. The GTDB database for
   VSEARCH will require some processing using the `QIIME 2` package. See `scripts/download_db.sh` for details.
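The FAQ answer above points to the pipeline's Zenodo record for manual downloads. As a convenience, here is a minimal sketch (not part of the patch series) that mirrors the `wget` calls appearing elsewhere in these patches; the `databases` folder name matches the pipeline default, and only the reference files touched by this patch set are shown.

```bash
# Sketch only: manually fetch the reference databases that --download_db would
# otherwise retrieve, using the same URLs that appear in these patches.
mkdir -p databases && cd databases

# Naive Bayes (DADA2 assignTaxonomy) reference FASTAs
wget -N --content-disposition 'https://zenodo.org/record/4587955/files/silva_nr99_v138.1_wSpecies_train_set.fa.gz?download=1'
wget -N --content-disposition 'https://zenodo.org/record/6655692/files/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz?download=1'
wget -N --content-disposition 'https://zenodo.org/record/4735821/files/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz?download=1'

# Pre-built GTDB r207 QIIME 2 artifacts for the VSEARCH classifier (Zenodo record 6912512)
wget -N --content-disposition 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.qza?download=1'
wget -N --content-disposition 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.taxonomy.qza?download=1'
```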
From dae87ced95e0bc1011d4c821cc3a1e2d82ca06f2 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Thu, 28 Jul 2022 01:22:39 -0700 Subject: [PATCH 05/14] Fixed GTDB naming issue for naive bayes classifier --- scripts/dada2_assign_tax.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/dada2_assign_tax.R b/scripts/dada2_assign_tax.R index 2e805a2..d3ae098 100644 --- a/scripts/dada2_assign_tax.R +++ b/scripts/dada2_assign_tax.R @@ -32,7 +32,9 @@ gtdb_spec <- as.data.frame(gtdb_spec) colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))] <- gsub("tax\\.(.*)", "\\1", colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))]) # Rename species -gtdb_spec[, 'Species'] <- gsub("(.*)_(.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) +gtdb_spec[, 'Species'] <- gsub("(.*)\\(.*\\)", "\\1", gtdb_spec[, 'Species']) +# Replace GTDB underscore with space +gtdb_spec[, 'Species'] <- gsub("_", "\\ ", gtdb_spec[, 'Species']) buf <- data.frame("Assignment" = rep("GTDB r207", nrow(gtdb_spec))) gtdb_spec <- cbind(gtdb_spec, buf) refseq_spec <- assignTaxonomy(seqs, refFasta = refseq_db, minBoot = minBoot_num, @@ -49,6 +51,12 @@ refseq_spec <- cbind(refseq_spec, buf) final_spec <- gtdb_spec final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] final_spec[is.na(final_spec[, 'Species']), ] <- refseq_spec[is.na(final_spec[, 'Species']), ] +# For those that's been replaced by either SILVA or refSeq, paste species name +class_nonGTDB <- final_spec[is.na(gtdb_spec[, 'Species']), ] +class_nonGTDB <- class_nonGTDB[!is.na(class_nonGTDB[, 'Species']),] +final_spec[rownames(class_nonGTDB), 'Species'] <- + paste(final_spec[rownames(class_nonGTDB), 'Genus', drop=TRUE], + final_spec[rownames(class_nonGTDB), 'Species', drop=TRUE], sep=" ") # Iteratively join at genus level final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] @@ -97,7 +105,7 @@ for (i in 1:nrow(final_spec)){ tosave_row <- data.frame( "Feature ID" = rownames(row), "Taxon" = paste( - paste(c("d__", "p__", "c__", "o__", "f__", "g__", "s__"), c(row[, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")], paste(row[, "Genus"], row[, "Species"], sep=" ")), sep = ""), + paste(c("d__", "p__", "c__", "o__", "f__", "g__", "s__"), c(row[, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")]), sep=""), collapse = "; "), "Confidence" = conf, "Assignment Database" = row['Assignment'] From a236e0d9be2dad7c93e96d0c924c61f2483046df Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Thu, 28 Jul 2022 01:53:00 -0700 Subject: [PATCH 06/14] Added picrust2 --- container_def/pb-16s-pbtools.yml | 1 + container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml | 1 + env/pb-16s-pbtools.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/container_def/pb-16s-pbtools.yml b/container_def/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- a/container_def/pb-16s-pbtools.yml +++ b/container_def/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 diff --git a/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml b/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- a/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml +++ b/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 diff --git a/env/pb-16s-pbtools.yml b/env/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- 
a/env/pb-16s-pbtools.yml +++ b/env/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 From 8d3075f17845e6d2738578a64f8b268f579ce531 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Thu, 28 Jul 2022 02:37:43 -0700 Subject: [PATCH 07/14] Added latest tag to containers --- main.nf | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 724fac9..f77139e 100644 --- a/main.nf +++ b/main.nf @@ -177,7 +177,7 @@ log.info """ // QC before cutadapt process QC_fastq { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" label 'cpu8' publishDir "$params.outdir/filtered_input_FASTQ", pattern: '*filterQ*.fastq.gz' @@ -204,7 +204,7 @@ process QC_fastq { // Trim full length 16S primers with cutadapt process cutadapt { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/trimmed_primers_FASTQ", pattern: '*.fastq.gz' publishDir "$params.outdir/cutadapt_summary", pattern: '*.report' cpus params.cutadapt_cpu @@ -237,7 +237,7 @@ process cutadapt { // Collect QC into single files process collect_QC { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" publishDir "$params.outdir/results/reads_QC" label 'cpu8' @@ -267,7 +267,7 @@ process collect_QC { // Collect QC into single files if skipping cutadapt process collect_QC_skip_cutadapt { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" publishDir "$params.outdir/results/reads_QC" label 'cpu8' @@ -330,7 +330,7 @@ process prepare_qiime2_manifest_skip_cutadapt { // Import data into QIIME 2 process import_qiime2 { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/import_qiime" label 'cpu_def' @@ -351,7 +351,7 @@ process import_qiime2 { process demux_summarize { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/summary_demux" label 'cpu_def' @@ -375,7 +375,7 @@ process demux_summarize { process dada2_denoise { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/dada2" cpus params.dada2_cpu @@ -416,7 +416,7 @@ process dada2_denoise { process filter_dada2 { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/dada2" cpus params.dada2_cpu @@ -473,7 +473,7 @@ process filter_dada2 { // assignTaxonomy function based on Naive Bayes classifier process dada2_assignTax { conda (params.enable_conda ? 
"$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" cpus params.vsearch_cpu @@ -518,7 +518,7 @@ process dada2_assignTax { // QC summary for dada2 process dada2_qc { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -596,7 +596,7 @@ process dada2_qc { process qiime2_phylogeny_diversity { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results/phylogeny_diversity" label 'cpu8' @@ -674,7 +674,7 @@ process qiime2_phylogeny_diversity { // Rarefaction visualization process dada2_rarefaction { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -698,7 +698,7 @@ process dada2_rarefaction { // Classify taxonomy and export table process class_tax { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" cpus params.vsearch_cpu @@ -746,7 +746,7 @@ process class_tax { // Export results into biom for use with phyloseq process export_biom { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -783,7 +783,7 @@ process export_biom { process barplot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -812,7 +812,7 @@ process barplot { // HTML report process html_rep { conda (params.enable_conda ? "$projectDir/env/pb-16s-vis-conda.yml" : null) - container "kpinpb/pb-16s-vis" + container "kpinpb/pb-16s-vis:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -852,7 +852,7 @@ process html_rep { // HTML report process html_rep_skip_cutadapt { conda (params.enable_conda ? "$projectDir/env/pb-16s-vis-conda.yml" : null) - container "kpinpb/pb-16s-vis" + container "kpinpb/pb-16s-vis:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -892,7 +892,7 @@ process html_rep_skip_cutadapt { // Krona plot process krona_plot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' // Ignore if this fail @@ -923,7 +923,7 @@ process krona_plot { process download_db { conda (params.enable_conda ? 
"$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "/home/kpin/gitlab/pacbio/singularity/qiime2-2022.2-py38-linux-conda.sif" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$projectDir/databases", mode: "copy" label 'cpu_def' From 169ed24ddf03f0defe150f03a52307dc60c6bbf3 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Sun, 31 Jul 2022 23:20:26 -0700 Subject: [PATCH 08/14] Added picrust2 --- main.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/main.nf b/main.nf index f77139e..2265697 100644 --- a/main.nf +++ b/main.nf @@ -781,6 +781,28 @@ process export_biom { """ } +process picrust2 { + conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) + container "kpinpb/pb-16s-nf-tools:latest" + label 'cpu32' + publishDir "$params.outdir/results/picrust2" + + input: + path dada2_asv + path vsearch_biom + + output: + path "picrust2" + + script: + """ + picrust2_pipeline.py -s $dada2_asv -i $vsearch_biom \ + -o picrust2 \ + --in_traits EC,KO \ + --verbose -p $task.cpus + """ +} + process barplot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) container "kpinpb/pb-16s-nf-qiime:latest" @@ -1000,6 +1022,7 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.biom_vsearch) barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, From 37331ed378176121e63128a44a67cf7d3b3620a6 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 02:35:10 +0000 Subject: [PATCH 09/14] Fixed picrust2 output folder and function argument --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 2265697..48afc0c 100644 --- a/main.nf +++ b/main.nf @@ -785,7 +785,7 @@ process picrust2 { conda (params.enable_conda ? 
"$projectDir/env/pb-16s-pbtools.yml" : null) container "kpinpb/pb-16s-nf-tools:latest" label 'cpu32' - publishDir "$params.outdir/results/picrust2" + publishDir "$params.outdir/results" input: path dada2_asv @@ -1022,7 +1022,7 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) - picrust2(filter_dada2.out.asv_seq_fasta, export_biom.biom_vsearch) + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, From 705183afedf9a083c476d51bad1418d544ce058f Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:05:36 +0000 Subject: [PATCH 10/14] Added picrust option --- README.md | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c986ae9..fe3192f 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ nextflow run main.nf --help removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) + --run_picrust2 Run PICRUSt2 pipeline (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. @@ -312,33 +313,35 @@ primers removal? ## References ### QIIME 2 -1. Bolyen, E. et al. Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. Nat Biotechnol 37, 852–857 (2019). +* Bolyen, E. et al. Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. Nat Biotechnol 37, 852–857 (2019). * For individual citations of plugins, you can use the `--citations` command for the relevant plugins. For example, if you want to cite `VSEARCH` plugin, type `qiime feature-classifier classify-consensus-vsearch --citations` after activating the conda environment. You can activate the environment installed by the pipelines by typing `conda activate $HOME/nf_conda/$ENV` (Change `$ENBV` to the name of the environment you want to activate). ### DADA2 -2. Callahan, B. J. et al. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13, 581–583 (2016). +* Callahan, B. J. et al. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13, 581–583 (2016). ### Seqkit -3. Shen, W., Le, S., Li, Y. & Hu, F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE 11, e0163962 (2016). +* Shen, W., Le, S., Li, Y. & Hu, F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE 11, e0163962 (2016). ### Cutadapt -4. Martin, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17, 10–12 (2011). +* Martin, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17, 10–12 (2011). ### GTDB database -5. Parks, D. H. et al. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol 36, 996–1004 (2018). -6. Parks, D. H. et al. 
A complete domain-to-species taxonomy for Bacteria and Archaea. Nat Biotechnol 38, 1079–1086 (2020). +* Parks, D. H. et al. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol 36, 996–1004 (2018). +* Parks, D. H. et al. A complete domain-to-species taxonomy for Bacteria and Archaea. Nat Biotechnol 38, 1079–1086 (2020). ### SILVA database -7. Yilmaz, P. et al. The SILVA and “All-species Living Tree Project (LTP)” taxonomic frameworks. Nucleic Acids Research 42, D643–D648 (2014). -8. Quast, C. et al. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Research 41, D590–D596 (2013). +* Yilmaz, P. et al. The SILVA and “All-species Living Tree Project (LTP)” taxonomic frameworks. Nucleic Acids Research 42, D643–D648 (2014). +* Quast, C. et al. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Research 41, D590–D596 (2013). ### RDP database -9. Cole, J. R. et al. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res 42, D633–D642 (2014). +* Cole, J. R. et al. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res 42, D633–D642 (2014). ### RefSeq database -10. O’Leary, N. A. et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res 44, D733-745 (2016). +* O’Leary, N. A. et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res 44, D733-745 (2016). ### Krona plot -11. Bd, O., Nh, B. & Am, P. Interactive metagenomic visualization in a Web browser. BMC bioinformatics 12, (2011). +* Bd, O., Nh, B. & Am, P. Interactive metagenomic visualization in a Web browser. BMC bioinformatics 12, (2011). * We use the QIIME 2 plugin implementation here: https://github.com/kaanb93/q2-krona ### Phyloseq and Tidyverse (for HTML report visualization) -12. McMurdie, P. J. & Holmes, S. phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data. PLOS ONE 8, e61217 (2013). -13. Wickham, H. et al. Welcome to the Tidyverse. Journal of Open Source Software 4, 1686 (2019). +* McMurdie, P. J. & Holmes, S. phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data. PLOS ONE 8, e61217 (2013). +* Wickham, H. et al. Welcome to the Tidyverse. Journal of Open Source Software 4, 1686 (2019). +### PICRUSt2 +* Douglas, G. M. et al. PICRUSt2 for prediction of metagenome functions. Nat Biotechnol 38, 685–688 (2020). ## DISCLAIMER THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, From 2a59cd9a4827125c6c64acc1cbf64cb3062e6861 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:05:54 +0000 Subject: [PATCH 11/14] Capture DADA2 output to log --- main.nf | 6 +++++- scripts/run_dada_ccs.R | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 48afc0c..17755f7 100644 --- a/main.nf +++ b/main.nf @@ -70,6 +70,7 @@ def helpMessage() { removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) + --run_picrust2 Run PICRUSt2 pipeline (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. 
Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. @@ -85,6 +86,7 @@ version = "0.3" if (params.version) exit 0, log.info("$version") params.download_db = false params.skip_primer_trim = false +params.run_picrust2 = false params.filterQ = 20 params.min_len = 1000 params.max_len = 1600 @@ -1022,7 +1024,9 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) - picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) + if (params.run_picrust2){ + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) + } barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, diff --git a/scripts/run_dada_ccs.R b/scripts/run_dada_ccs.R index 17ba96f..a120301 100755 --- a/scripts/run_dada_ccs.R +++ b/scripts/run_dada_ccs.R @@ -125,6 +125,11 @@ # # +# Added by Khi Pin to save all outputs from script +con <- file("dada2.log") +sink(con, append=TRUE) +sink(con, append=TRUE, type="message") + cat(R.version$version.string, "\n") errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) } args <- commandArgs(TRUE) From 7482775e256d09638321c3cae24755960acc7d16 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:14:41 +0000 Subject: [PATCH 12/14] Output DADA2 error profile plot --- main.nf | 1 + scripts/run_dada_ccs.R | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/main.nf b/main.nf index 17755f7..256e63e 100644 --- a/main.nf +++ b/main.nf @@ -391,6 +391,7 @@ process dada2_denoise { path "dada2-ccs_table.qza", emit: asv_freq path "dada2-ccs_stats.qza", emit:asv_stats path "seqtab_nochim.rds", emit: dada2_rds + path "plot_error_model.pdf" script: """ diff --git a/scripts/run_dada_ccs.R b/scripts/run_dada_ccs.R index a120301..0056226 100755 --- a/scripts/run_dada_ccs.R +++ b/scripts/run_dada_ccs.R @@ -234,6 +234,10 @@ cat("3) Learning Error Rates\n") err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, errorEstimationFunction=dada2:::PacBioErrfun, multithread=multithread, BAND_SIZE=32)) +err_plot <- plotErrors(err) +pdf("plot_error_model.pdf", width=12, height=8, useDingbats=FALSE) +err_plot +dev.off() ### PROCESS ALL SAMPLES ### # Loop over rest in streaming fashion with learned error rates From 3a48e28afe29c4b73d5eaede6ecf4f82cdb10f6a Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:25:03 +0000 Subject: [PATCH 13/14] Caution about PICRUSt2 --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 256e63e..0f4b673 100644 --- a/main.nf +++ b/main.nf @@ -70,7 +70,8 @@ def helpMessage() { removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) - --run_picrust2 Run PICRUSt2 pipeline (default: false) + --run_picrust2 Run PICRUSt2 pipeline. Note that pathway inference with 16S using PICRUSt2 + has not been tested systematically (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. 
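The patches above add PICRUSt2 to the tool environments, wire it into the workflow behind the new `--run_picrust2` flag, and note that pathway inference from 16S has not been tested systematically. As a rough illustration of what that process executes, here is a stand-alone sketch of the same `picrust2_pipeline.py` call; the input file names and thread count are placeholders (the actual process receives the ASV FASTA from `filter_dada2`, the VSEARCH-taxonomy BIOM from `export_biom`, and uses `$task.cpus` threads).

```bash
# Sketch only: the PICRUSt2 step that the new `picrust2` process wraps, run by hand.
# dada2_ASV.fasta and feature_table.biom are placeholder names for the ASV
# representative sequences and the BIOM count table exported by the pipeline.
picrust2_pipeline.py \
    -s dada2_ASV.fasta \
    -i feature_table.biom \
    -o picrust2 \
    --in_traits EC,KO \
    --verbose -p 16
```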
From dab848bb9fb54a19c6b4bb5a2aaf434afae55083 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 08:46:36 +0000 Subject: [PATCH 14/14] Download GTDB r207 from Zenodo instead --- main.nf | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 0f4b673..06cc68d 100644 --- a/main.nf +++ b/main.nf @@ -970,13 +970,15 @@ process download_db { wget -N --content-disposition 'https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza' echo "Downloading GTDB database" - wget -N --content-disposition --no-check-certificate 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_all/ssu_all_r207.tar.gz' - tar xfz ssu_all_r207.tar.gz - mv ssu_all_r207.fna GTDB_ssu_all_r207.fna - grep "^>" GTDB_ssu_all_r207.fna | awk '{print gensub(/^>(.*)/, "\\\\1", "g", \$1),gensub(/^>.*\\ (d__.*)\\ \\[.*\\[.*\\[.*/, "\\\\1", "g", \$0)}' OFS=\$'\\t' > GTDB_ssu_all_r207.taxonomy.tsv - qiime tools import --type 'FeatureData[Sequence]' --input-path GTDB_ssu_all_r207.fna --output-path GTDB_ssu_all_r207.qza - qiime tools import --type 'FeatureData[Taxonomy]' --input-format HeaderlessTSVTaxonomyFormat \ - --input-path GTDB_ssu_all_r207.taxonomy.tsv --output-path GTDB_ssu_all_r207.taxonomy.qza + wget -N --content-disposition --no-check-certificate 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.taxonomy.qza?download=1' + wget -N --content-disposition --no-check-certificate 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.qza?download=1' + # wget -N --content-disposition --no-check-certificate 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_all/ssu_all_r207.tar.gz' + # tar xfz ssu_all_r207.tar.gz + # mv ssu_all_r207.fna GTDB_ssu_all_r207.fna + # grep "^>" GTDB_ssu_all_r207.fna | awk '{print gensub(/^>(.*)/, "\\\\1", "g", \$1),gensub(/^>.*\\ (d__.*)\\ \\[.*\\[.*\\[.*/, "\\\\1", "g", \$0)}' OFS=\$'\\t' > GTDB_ssu_all_r207.taxonomy.tsv + # qiime tools import --type 'FeatureData[Sequence]' --input-path GTDB_ssu_all_r207.fna --output-path GTDB_ssu_all_r207.qza + # qiime tools import --type 'FeatureData[Taxonomy]' --input-format HeaderlessTSVTaxonomyFormat \ + # --input-path GTDB_ssu_all_r207.taxonomy.tsv --output-path GTDB_ssu_all_r207.taxonomy.qza """ }
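The final patch switches the GTDB r207 download to pre-built QIIME 2 artifacts on Zenodo instead of rebuilding them from the GTDB release tarball. As a quick sanity check (not part of the patches), the listing below confirms the reference files named in this patch series are present after running with `--download_db`; the SILVA QIIME 2 artifacts from data.qiime2.org are also fetched but omitted here for brevity.

```bash
# Sketch only: after running the pipeline with --download_db, confirm the
# expected reference files (names taken from the patches above) are present.
cd databases
ls -lh \
    GTDB_ssu_all_r207.qza \
    GTDB_ssu_all_r207.taxonomy.qza \
    GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz \
    silva_nr99_v138.1_wSpecies_train_set.fa.gz \
    RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz
```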