From 2a41155cc716df15373291bbec1cb7b5370cdf39 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 01:42:02 -0700 Subject: [PATCH 01/14] updated to GTDB r207 for naive bayes classifier --- main.nf | 4 ++-- scripts/dada2_assign_tax.R | 26 +++++++++----------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 6aca67e..386e911 100644 --- a/main.nf +++ b/main.nf @@ -123,7 +123,7 @@ params.vsearch_db = "$projectDir/databases/GTDB_ssu_all_r207.qza" params.vsearch_tax = "$projectDir/databases/GTDB_ssu_all_r207.taxonomy.qza" params.silva_db = "$projectDir/databases/silva_nr99_v138.1_wSpecies_train_set.fa.gz" params.refseq_db = "$projectDir/databases/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz" -params.gtdb_db = "$projectDir/databases/GTDB_bac120_arc122_ssu_r202_fullTaxo.fa.gz" +params.gtdb_db = "$projectDir/databases/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz" params.maxreject = 100 params.maxaccept = 100 params.rarefaction_depth = null @@ -929,7 +929,7 @@ process download_db { echo "Downloading SILVA sequences for VSEARCH..." wget -N --content-disposition 'https://zenodo.org/record/4587955/files/silva_nr99_v138.1_wSpecies_train_set.fa.gz?download=1' echo "Downloading GTDB sequences for VSEARCH..." - wget -N --content-disposition 'https://zenodo.org/record/4735821/files/GTDB_bac120_arc122_ssu_r202_fullTaxo.fa.gz?download=1' + wget -N --content-disposition 'https://zenodo.org/record/6655692/files/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz?download=1' echo "Downloading RefSeq + RDP sequences for VSEARCH..." wget -N --content-disposition 'https://zenodo.org/record/4735821/files/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz?download=1' diff --git a/scripts/dada2_assign_tax.R b/scripts/dada2_assign_tax.R index 1afbcb4..2e805a2 100644 --- a/scripts/dada2_assign_tax.R +++ b/scripts/dada2_assign_tax.R @@ -32,8 +32,8 @@ gtdb_spec <- as.data.frame(gtdb_spec) colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))] <- gsub("tax\\.(.*)", "\\1", colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))]) # Rename species -gtdb_spec[, 'Species'] <- gsub("(.*)\\ (.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) -buf <- data.frame("Assignment" = rep("GTDB r202", nrow(gtdb_spec))) +gtdb_spec[, 'Species'] <- gsub("(.*)_(.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) +buf <- data.frame("Assignment" = rep("GTDB r207", nrow(gtdb_spec))) gtdb_spec <- cbind(gtdb_spec, buf) refseq_spec <- assignTaxonomy(seqs, refFasta = refseq_db, minBoot = minBoot_num, multithread = threads, outputBootstraps = TRUE) @@ -46,37 +46,29 @@ buf <- data.frame("Assignment" = rep("RefSeq + RDP", nrow(refseq_spec))) refseq_spec <- cbind(refseq_spec, buf) # Iteratively join at species level -final_spec <- silva_spec -final_spec[is.na(final_spec[, 'Species']), ] <- gtdb_spec[is.na(final_spec[, 'Species']), ] +final_spec <- gtdb_spec +final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] final_spec[is.na(final_spec[, 'Species']), ] <- refseq_spec[is.na(final_spec[, 'Species']), ] # Iteratively join at genus level -final_spec[is.na(final_spec[, 'Genus']), ] <- gtdb_spec[is.na(final_spec[, 'Genus']), ] +final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] final_spec[is.na(final_spec[, 'Genus']), ] <- refseq_spec[is.na(final_spec[, 'Genus']), ] # uncultured or metagenome -final_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] <- - gtdb_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] 
final_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] <- refseq_spec[grepl("uncultured", final_spec[, 'Species'], ignore.case = TRUE)] -final_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] <- - gtdb_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] final_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] <- refseq_spec[grepl("metagenome", final_spec[, 'Species'], ignore.case = TRUE)] # uncultured or metagenome at genus level final_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] <- gtdb_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] -final_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] <- - refseq_spec[grepl("uncultured", final_spec[, 'Genus'], ignore.case = TRUE)] final_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] <- gtdb_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] -final_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] <- - refseq_spec[grepl("metagenome", final_spec[, 'Genus'], ignore.case = TRUE)] -# If still NA, revert back to Silva -final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] -final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] +# If still NA, revert back to GTDB +final_spec[is.na(final_spec[, 'Species']), ] <- gtdb_spec[is.na(final_spec[, 'Species']), ] +final_spec[is.na(final_spec[, 'Genus']), ] <- gtdb_spec[is.na(final_spec[, 'Genus']), ] rownames(final_spec) <- otu_id # For each row, look for the deepest level assigned and assign confidence value @@ -119,4 +111,4 @@ write.table(to_save, "best_taxonomy.tsv", quote = FALSE, write.table(to_save2, "best_taxonomy_withDB.tsv", quote = FALSE, sep = "\t",row.names = FALSE, - col.names = c("Feature ID", "Taxon", "Confidence", "Assignment Database")) \ No newline at end of file + col.names = c("Feature ID", "Taxon", "Confidence", "Assignment Database")) From 015b95499adcde60ba4a02f35d21f81e89f014e6 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 01:43:26 -0700 Subject: [PATCH 02/14] Updated v0.3 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b6bc0d5..724fac9 100644 --- a/main.nf +++ b/main.nf @@ -81,7 +81,7 @@ def helpMessage() { params.help = false if (params.help) exit 0, helpMessage() params.version = false -version = "0.2" +version = "0.3" if (params.version) exit 0, log.info("$version") params.download_db = false params.skip_primer_trim = false From 27e9431679edcc043862c03c6e5a9a95101b2396 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Wed, 27 Jul 2022 02:10:16 -0700 Subject: [PATCH 03/14] Updated documentation on taxonomy classification with GTDB r207 --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3507684..20ea1e4 100644 --- a/README.md +++ b/README.md @@ -247,15 +247,15 @@ primers removal? The "besttax" assignment uses the `assignTaxonomy` Naive-Bayes classifier function from DADA2 to carry out taxonomy assignment. It uses 3 databases to classify the ASVs (requiring a minimum bootstrap of 80 using the minBoot parameter) and the priority of assignment - is Silva 138, followed by GTDB r202, then lastly RefSeq + RDP. This means for example - if an ASV is not assigned at Species level using Silva, it will check if it can be assigned - with GTDB. This ensure we assign as many ASVs as possible. 
+   is GTDB r207, followed by Silva v138, then lastly RefSeq + RDP. This means, for example, that
+   if an ASV is not assigned at Species level using GTDB, the pipeline checks whether it can be
+   assigned with Silva. This ensures we assign as many ASVs as possible.
    This process is done first at Species level, then at Genus level. In addition, if any ASV
-   is assigned as "uncultured" or "metagenome", it will go through the iterative assignment
-   process just like the unclassified ASVs. Note that while this method will assign
-   a high amount of ASVs, there may be issues such as how the taxonomy is annotated
-   in different databases.
+   is assigned as "uncultured" or "metagenome" (Silva has many such entries),
+   it will go through the iterative assignment process just like the unclassified ASVs.
+   Note that while this method will assign a large proportion of the ASVs, the taxonomy
+   annotations may not be fully consistent across the different databases.
    As such, there is also a VSEARCH taxonomy
    classification using GTDB database (r207) only in the file called `results/vsearch_merged_freq_tax.tsv`
    that may provide a more consistent annotation. This uses

From 984a98a87755487e74632a36f90f51965bcaaa8c Mon Sep 17 00:00:00 2001
From: Khi Pin
Date: Wed, 27 Jul 2022 02:26:03 -0700
Subject: [PATCH 04/14] Provide zenodo for db download

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 20ea1e4..c986ae9 100644
--- a/README.md
+++ b/README.md
@@ -285,7 +285,8 @@ primers removal?

 * Can I download the databases for taxonomic classification manually?

   The taxonomy classification step of the pipeline requires a few databases that will be downloaded with the
-  `--download_db` parameters into a "databases" folder. These databases can also be downloaded
+  `--download_db` parameter into a "databases" folder. All the databases are also
+  collected on [Zenodo](https://zenodo.org/record/6912512). These databases can also be downloaded
   manually from the following links if the download script above does not work. The GTDB database for
   VSEARCH will require some processing using the `QIIME 2` package. See `scripts/download_db.sh` for details.
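The FAQ answer above points to the pipeline's Zenodo record for manual downloads. As a convenience, here is a minimal sketch (not part of the patch series) that mirrors the `wget` calls appearing elsewhere in these patches; the `databases` folder name matches the pipeline default, and only the reference files touched by this patch set are shown.

```bash
# Sketch only: manually fetch the reference databases that --download_db would
# otherwise retrieve, using the same URLs that appear in these patches.
mkdir -p databases && cd databases

# Naive Bayes (DADA2 assignTaxonomy) reference FASTAs
wget -N --content-disposition 'https://zenodo.org/record/4587955/files/silva_nr99_v138.1_wSpecies_train_set.fa.gz?download=1'
wget -N --content-disposition 'https://zenodo.org/record/6655692/files/GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz?download=1'
wget -N --content-disposition 'https://zenodo.org/record/4735821/files/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz?download=1'

# Pre-built GTDB r207 QIIME 2 artifacts for the VSEARCH classifier (Zenodo record 6912512)
wget -N --content-disposition 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.qza?download=1'
wget -N --content-disposition 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.taxonomy.qza?download=1'
```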
From dae87ced95e0bc1011d4c821cc3a1e2d82ca06f2 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Thu, 28 Jul 2022 01:22:39 -0700 Subject: [PATCH 05/14] Fixed GTDB naming issue for naive bayes classifier --- scripts/dada2_assign_tax.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/dada2_assign_tax.R b/scripts/dada2_assign_tax.R index 2e805a2..d3ae098 100644 --- a/scripts/dada2_assign_tax.R +++ b/scripts/dada2_assign_tax.R @@ -32,7 +32,9 @@ gtdb_spec <- as.data.frame(gtdb_spec) colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))] <- gsub("tax\\.(.*)", "\\1", colnames(gtdb_spec)[grepl("tax.", colnames(gtdb_spec))]) # Rename species -gtdb_spec[, 'Species'] <- gsub("(.*)_(.*)\\(.*\\)", "\\2", gtdb_spec[, 'Species']) +gtdb_spec[, 'Species'] <- gsub("(.*)\\(.*\\)", "\\1", gtdb_spec[, 'Species']) +# Replace GTDB underscore with space +gtdb_spec[, 'Species'] <- gsub("_", "\\ ", gtdb_spec[, 'Species']) buf <- data.frame("Assignment" = rep("GTDB r207", nrow(gtdb_spec))) gtdb_spec <- cbind(gtdb_spec, buf) refseq_spec <- assignTaxonomy(seqs, refFasta = refseq_db, minBoot = minBoot_num, @@ -49,6 +51,12 @@ refseq_spec <- cbind(refseq_spec, buf) final_spec <- gtdb_spec final_spec[is.na(final_spec[, 'Species']), ] <- silva_spec[is.na(final_spec[, 'Species']), ] final_spec[is.na(final_spec[, 'Species']), ] <- refseq_spec[is.na(final_spec[, 'Species']), ] +# For those that's been replaced by either SILVA or refSeq, paste species name +class_nonGTDB <- final_spec[is.na(gtdb_spec[, 'Species']), ] +class_nonGTDB <- class_nonGTDB[!is.na(class_nonGTDB[, 'Species']),] +final_spec[rownames(class_nonGTDB), 'Species'] <- + paste(final_spec[rownames(class_nonGTDB), 'Genus', drop=TRUE], + final_spec[rownames(class_nonGTDB), 'Species', drop=TRUE], sep=" ") # Iteratively join at genus level final_spec[is.na(final_spec[, 'Genus']), ] <- silva_spec[is.na(final_spec[, 'Genus']), ] @@ -97,7 +105,7 @@ for (i in 1:nrow(final_spec)){ tosave_row <- data.frame( "Feature ID" = rownames(row), "Taxon" = paste( - paste(c("d__", "p__", "c__", "o__", "f__", "g__", "s__"), c(row[, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")], paste(row[, "Genus"], row[, "Species"], sep=" ")), sep = ""), + paste(c("d__", "p__", "c__", "o__", "f__", "g__", "s__"), c(row[, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")]), sep=""), collapse = "; "), "Confidence" = conf, "Assignment Database" = row['Assignment'] From a236e0d9be2dad7c93e96d0c924c61f2483046df Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Thu, 28 Jul 2022 01:53:00 -0700 Subject: [PATCH 06/14] Added picrust2 --- container_def/pb-16s-pbtools.yml | 1 + container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml | 1 + env/pb-16s-pbtools.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/container_def/pb-16s-pbtools.yml b/container_def/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- a/container_def/pb-16s-pbtools.yml +++ b/container_def/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 diff --git a/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml b/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- a/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml +++ b/container_def/pb-16s-pbtools_docker/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 diff --git a/env/pb-16s-pbtools.yml b/env/pb-16s-pbtools.yml index 64c652b..79e885b 100644 --- 
a/env/pb-16s-pbtools.yml +++ b/env/pb-16s-pbtools.yml @@ -8,3 +8,4 @@ dependencies: - csvtk=0.23.0 - kraken2=2.1.2 - bracken=2.6.2 + - picrust2=2.5.0 From 8d3075f17845e6d2738578a64f8b268f579ce531 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Thu, 28 Jul 2022 02:37:43 -0700 Subject: [PATCH 07/14] Added latest tag to containers --- main.nf | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 724fac9..f77139e 100644 --- a/main.nf +++ b/main.nf @@ -177,7 +177,7 @@ log.info """ // QC before cutadapt process QC_fastq { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" label 'cpu8' publishDir "$params.outdir/filtered_input_FASTQ", pattern: '*filterQ*.fastq.gz' @@ -204,7 +204,7 @@ process QC_fastq { // Trim full length 16S primers with cutadapt process cutadapt { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/trimmed_primers_FASTQ", pattern: '*.fastq.gz' publishDir "$params.outdir/cutadapt_summary", pattern: '*.report' cpus params.cutadapt_cpu @@ -237,7 +237,7 @@ process cutadapt { // Collect QC into single files process collect_QC { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" publishDir "$params.outdir/results/reads_QC" label 'cpu8' @@ -267,7 +267,7 @@ process collect_QC { // Collect QC into single files if skipping cutadapt process collect_QC_skip_cutadapt { conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) - container "kpinpb/pb-16s-nf-tools" + container "kpinpb/pb-16s-nf-tools:latest" publishDir "$params.outdir/results/reads_QC" label 'cpu8' @@ -330,7 +330,7 @@ process prepare_qiime2_manifest_skip_cutadapt { // Import data into QIIME 2 process import_qiime2 { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/import_qiime" label 'cpu_def' @@ -351,7 +351,7 @@ process import_qiime2 { process demux_summarize { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/summary_demux" label 'cpu_def' @@ -375,7 +375,7 @@ process demux_summarize { process dada2_denoise { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/dada2" cpus params.dada2_cpu @@ -416,7 +416,7 @@ process dada2_denoise { process filter_dada2 { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/dada2" cpus params.dada2_cpu @@ -473,7 +473,7 @@ process filter_dada2 { // assignTaxonomy function based on Naive Bayes classifier process dada2_assignTax { conda (params.enable_conda ? 
"$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" cpus params.vsearch_cpu @@ -518,7 +518,7 @@ process dada2_assignTax { // QC summary for dada2 process dada2_qc { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -596,7 +596,7 @@ process dada2_qc { process qiime2_phylogeny_diversity { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results/phylogeny_diversity" label 'cpu8' @@ -674,7 +674,7 @@ process qiime2_phylogeny_diversity { // Rarefaction visualization process dada2_rarefaction { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -698,7 +698,7 @@ process dada2_rarefaction { // Classify taxonomy and export table process class_tax { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" cpus params.vsearch_cpu @@ -746,7 +746,7 @@ process class_tax { // Export results into biom for use with phyloseq process export_biom { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -783,7 +783,7 @@ process export_biom { process barplot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -812,7 +812,7 @@ process barplot { // HTML report process html_rep { conda (params.enable_conda ? "$projectDir/env/pb-16s-vis-conda.yml" : null) - container "kpinpb/pb-16s-vis" + container "kpinpb/pb-16s-vis:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -852,7 +852,7 @@ process html_rep { // HTML report process html_rep_skip_cutadapt { conda (params.enable_conda ? "$projectDir/env/pb-16s-vis-conda.yml" : null) - container "kpinpb/pb-16s-vis" + container "kpinpb/pb-16s-vis:latest" publishDir "$params.outdir/results" label 'cpu_def' @@ -892,7 +892,7 @@ process html_rep_skip_cutadapt { // Krona plot process krona_plot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$params.outdir/results" label 'cpu_def' // Ignore if this fail @@ -923,7 +923,7 @@ process krona_plot { process download_db { conda (params.enable_conda ? 
"$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "/home/kpin/gitlab/pacbio/singularity/qiime2-2022.2-py38-linux-conda.sif" + container "kpinpb/pb-16s-nf-qiime:latest" publishDir "$projectDir/databases", mode: "copy" label 'cpu_def' From 169ed24ddf03f0defe150f03a52307dc60c6bbf3 Mon Sep 17 00:00:00 2001 From: Khi Pin Date: Sun, 31 Jul 2022 23:20:26 -0700 Subject: [PATCH 08/14] Added picrust2 --- main.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/main.nf b/main.nf index f77139e..2265697 100644 --- a/main.nf +++ b/main.nf @@ -781,6 +781,28 @@ process export_biom { """ } +process picrust2 { + conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) + container "kpinpb/pb-16s-nf-tools:latest" + label 'cpu32' + publishDir "$params.outdir/results/picrust2" + + input: + path dada2_asv + path vsearch_biom + + output: + path "picrust2" + + script: + """ + picrust2_pipeline.py -s $dada2_asv -i $vsearch_biom \ + -o picrust2 \ + --in_traits EC,KO \ + --verbose -p $task.cpus + """ +} + process barplot { conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) container "kpinpb/pb-16s-nf-qiime:latest" @@ -1000,6 +1022,7 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.biom_vsearch) barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, From 37331ed378176121e63128a44a67cf7d3b3620a6 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 02:35:10 +0000 Subject: [PATCH 09/14] Fixed picrust2 output folder and function argument --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 2265697..48afc0c 100644 --- a/main.nf +++ b/main.nf @@ -785,7 +785,7 @@ process picrust2 { conda (params.enable_conda ? 
"$projectDir/env/pb-16s-pbtools.yml" : null) container "kpinpb/pb-16s-nf-tools:latest" label 'cpu32' - publishDir "$params.outdir/results/picrust2" + publishDir "$params.outdir/results" input: path dada2_asv @@ -1022,7 +1022,7 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) - picrust2(filter_dada2.out.asv_seq_fasta, export_biom.biom_vsearch) + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, From 705183afedf9a083c476d51bad1418d544ce058f Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:05:36 +0000 Subject: [PATCH 10/14] Added picrust option --- README.md | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c986ae9..fe3192f 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ nextflow run main.nf --help removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) + --run_picrust2 Run PICRUSt2 pipeline (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. @@ -312,33 +313,35 @@ primers removal? ## References ### QIIME 2 -1. Bolyen, E. et al. Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. Nat Biotechnol 37, 852–857 (2019). +* Bolyen, E. et al. Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. Nat Biotechnol 37, 852–857 (2019). * For individual citations of plugins, you can use the `--citations` command for the relevant plugins. For example, if you want to cite `VSEARCH` plugin, type `qiime feature-classifier classify-consensus-vsearch --citations` after activating the conda environment. You can activate the environment installed by the pipelines by typing `conda activate $HOME/nf_conda/$ENV` (Change `$ENBV` to the name of the environment you want to activate). ### DADA2 -2. Callahan, B. J. et al. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13, 581–583 (2016). +* Callahan, B. J. et al. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13, 581–583 (2016). ### Seqkit -3. Shen, W., Le, S., Li, Y. & Hu, F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE 11, e0163962 (2016). +* Shen, W., Le, S., Li, Y. & Hu, F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE 11, e0163962 (2016). ### Cutadapt -4. Martin, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17, 10–12 (2011). +* Martin, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17, 10–12 (2011). ### GTDB database -5. Parks, D. H. et al. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol 36, 996–1004 (2018). -6. Parks, D. H. et al. 
A complete domain-to-species taxonomy for Bacteria and Archaea. Nat Biotechnol 38, 1079–1086 (2020). +* Parks, D. H. et al. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol 36, 996–1004 (2018). +* Parks, D. H. et al. A complete domain-to-species taxonomy for Bacteria and Archaea. Nat Biotechnol 38, 1079–1086 (2020). ### SILVA database -7. Yilmaz, P. et al. The SILVA and “All-species Living Tree Project (LTP)” taxonomic frameworks. Nucleic Acids Research 42, D643–D648 (2014). -8. Quast, C. et al. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Research 41, D590–D596 (2013). +* Yilmaz, P. et al. The SILVA and “All-species Living Tree Project (LTP)” taxonomic frameworks. Nucleic Acids Research 42, D643–D648 (2014). +* Quast, C. et al. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Research 41, D590–D596 (2013). ### RDP database -9. Cole, J. R. et al. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res 42, D633–D642 (2014). +* Cole, J. R. et al. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res 42, D633–D642 (2014). ### RefSeq database -10. O’Leary, N. A. et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res 44, D733-745 (2016). +* O’Leary, N. A. et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res 44, D733-745 (2016). ### Krona plot -11. Bd, O., Nh, B. & Am, P. Interactive metagenomic visualization in a Web browser. BMC bioinformatics 12, (2011). +* Bd, O., Nh, B. & Am, P. Interactive metagenomic visualization in a Web browser. BMC bioinformatics 12, (2011). * We use the QIIME 2 plugin implementation here: https://github.com/kaanb93/q2-krona ### Phyloseq and Tidyverse (for HTML report visualization) -12. McMurdie, P. J. & Holmes, S. phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data. PLOS ONE 8, e61217 (2013). -13. Wickham, H. et al. Welcome to the Tidyverse. Journal of Open Source Software 4, 1686 (2019). +* McMurdie, P. J. & Holmes, S. phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data. PLOS ONE 8, e61217 (2013). +* Wickham, H. et al. Welcome to the Tidyverse. Journal of Open Source Software 4, 1686 (2019). +### PICRUSt2 +* Douglas, G. M. et al. PICRUSt2 for prediction of metagenome functions. Nat Biotechnol 38, 685–688 (2020). ## DISCLAIMER THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, From 2a59cd9a4827125c6c64acc1cbf64cb3062e6861 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:05:54 +0000 Subject: [PATCH 11/14] Capture DADA2 output to log --- main.nf | 6 +++++- scripts/run_dada_ccs.R | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 48afc0c..17755f7 100644 --- a/main.nf +++ b/main.nf @@ -70,6 +70,7 @@ def helpMessage() { removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) + --run_picrust2 Run PICRUSt2 pipeline (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. 
Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. @@ -85,6 +86,7 @@ version = "0.3" if (params.version) exit 0, log.info("$version") params.download_db = false params.skip_primer_trim = false +params.run_picrust2 = false params.filterQ = 20 params.min_len = 1000 params.max_len = 1600 @@ -1022,7 +1024,9 @@ workflow pb16S { dada2_assignTax(filter_dada2.out.asv_seq_fasta, filter_dada2.out.asv_seq, filter_dada2.out.asv_freq, params.silva_db, params.gtdb_db, params.refseq_db, params.dadaAssign_script) export_biom(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax, class_tax.out.tax_tsv) - picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) + if (params.run_picrust2){ + picrust2(filter_dada2.out.asv_seq_fasta, export_biom.out.biom_vsearch) + } barplot(filter_dada2.out.asv_freq, dada2_assignTax.out.best_nb_tax_qza, class_tax.out.tax_vsearch, metadata_file) if (params.skip_primer_trim){ html_rep_skip_cutadapt(dada2_assignTax.out.best_nb_tax_tsv, metadata_file, qiime2_manifest, diff --git a/scripts/run_dada_ccs.R b/scripts/run_dada_ccs.R index 17ba96f..a120301 100755 --- a/scripts/run_dada_ccs.R +++ b/scripts/run_dada_ccs.R @@ -125,6 +125,11 @@ # # +# Added by Khi Pin to save all outputs from script +con <- file("dada2.log") +sink(con, append=TRUE) +sink(con, append=TRUE, type="message") + cat(R.version$version.string, "\n") errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) } args <- commandArgs(TRUE) From 7482775e256d09638321c3cae24755960acc7d16 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:14:41 +0000 Subject: [PATCH 12/14] Output DADA2 error profile plot --- main.nf | 1 + scripts/run_dada_ccs.R | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/main.nf b/main.nf index 17755f7..256e63e 100644 --- a/main.nf +++ b/main.nf @@ -391,6 +391,7 @@ process dada2_denoise { path "dada2-ccs_table.qza", emit: asv_freq path "dada2-ccs_stats.qza", emit:asv_stats path "seqtab_nochim.rds", emit: dada2_rds + path "plot_error_model.pdf" script: """ diff --git a/scripts/run_dada_ccs.R b/scripts/run_dada_ccs.R index a120301..0056226 100755 --- a/scripts/run_dada_ccs.R +++ b/scripts/run_dada_ccs.R @@ -234,6 +234,10 @@ cat("3) Learning Error Rates\n") err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, errorEstimationFunction=dada2:::PacBioErrfun, multithread=multithread, BAND_SIZE=32)) +err_plot <- plotErrors(err) +pdf("plot_error_model.pdf", width=12, height=8, useDingbats=FALSE) +err_plot +dev.off() ### PROCESS ALL SAMPLES ### # Loop over rest in streaming fashion with learned error rates From 3a48e28afe29c4b73d5eaede6ecf4f82cdb10f6a Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 06:25:03 +0000 Subject: [PATCH 13/14] Caution about PICRUSt2 --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 256e63e..0f4b673 100644 --- a/main.nf +++ b/main.nf @@ -70,7 +70,8 @@ def helpMessage() { removal) (default: trim with cutadapt) --colorby Columns in metadata TSV file to use for coloring the MDS plot in HTML report (default: condition) - --run_picrust2 Run PICRUSt2 pipeline (default: false) + --run_picrust2 Run PICRUSt2 pipeline. Note that pathway inference with 16S using PICRUSt2 + has not been tested systematically (default: false) --download_db Download databases needed for taxonomy classification only. Will not run the pipeline. Databases will be downloaded to a folder "databases" in the Nextflow pipeline directory. 
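The patches above add PICRUSt2 to the tool environments, wire it into the workflow behind the new `--run_picrust2` flag, and note that pathway inference from 16S has not been tested systematically. As a rough illustration of what that process executes, here is a stand-alone sketch of the same `picrust2_pipeline.py` call; the input file names and thread count are placeholders (the actual process receives the ASV FASTA from `filter_dada2`, the VSEARCH-taxonomy BIOM from `export_biom`, and uses `$task.cpus` threads).

```bash
# Sketch only: the PICRUSt2 step that the new `picrust2` process wraps, run by hand.
# dada2_ASV.fasta and feature_table.biom are placeholder names for the ASV
# representative sequences and the BIOM count table exported by the pipeline.
picrust2_pipeline.py \
    -s dada2_ASV.fasta \
    -i feature_table.biom \
    -o picrust2 \
    --in_traits EC,KO \
    --verbose -p 16
```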
From dab848bb9fb54a19c6b4bb5a2aaf434afae55083 Mon Sep 17 00:00:00 2001 From: proteinosome Date: Tue, 2 Aug 2022 08:46:36 +0000 Subject: [PATCH 14/14] Download GTDB r207 from Zenodo instead --- main.nf | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 0f4b673..06cc68d 100644 --- a/main.nf +++ b/main.nf @@ -970,13 +970,15 @@ process download_db { wget -N --content-disposition 'https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza' echo "Downloading GTDB database" - wget -N --content-disposition --no-check-certificate 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_all/ssu_all_r207.tar.gz' - tar xfz ssu_all_r207.tar.gz - mv ssu_all_r207.fna GTDB_ssu_all_r207.fna - grep "^>" GTDB_ssu_all_r207.fna | awk '{print gensub(/^>(.*)/, "\\\\1", "g", \$1),gensub(/^>.*\\ (d__.*)\\ \\[.*\\[.*\\[.*/, "\\\\1", "g", \$0)}' OFS=\$'\\t' > GTDB_ssu_all_r207.taxonomy.tsv - qiime tools import --type 'FeatureData[Sequence]' --input-path GTDB_ssu_all_r207.fna --output-path GTDB_ssu_all_r207.qza - qiime tools import --type 'FeatureData[Taxonomy]' --input-format HeaderlessTSVTaxonomyFormat \ - --input-path GTDB_ssu_all_r207.taxonomy.tsv --output-path GTDB_ssu_all_r207.taxonomy.qza + wget -N --content-disposition --no-check-certificate 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.taxonomy.qza?download=1' + wget -N --content-disposition --no-check-certificate 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.qza?download=1' + # wget -N --content-disposition --no-check-certificate 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_all/ssu_all_r207.tar.gz' + # tar xfz ssu_all_r207.tar.gz + # mv ssu_all_r207.fna GTDB_ssu_all_r207.fna + # grep "^>" GTDB_ssu_all_r207.fna | awk '{print gensub(/^>(.*)/, "\\\\1", "g", \$1),gensub(/^>.*\\ (d__.*)\\ \\[.*\\[.*\\[.*/, "\\\\1", "g", \$0)}' OFS=\$'\\t' > GTDB_ssu_all_r207.taxonomy.tsv + # qiime tools import --type 'FeatureData[Sequence]' --input-path GTDB_ssu_all_r207.fna --output-path GTDB_ssu_all_r207.qza + # qiime tools import --type 'FeatureData[Taxonomy]' --input-format HeaderlessTSVTaxonomyFormat \ + # --input-path GTDB_ssu_all_r207.taxonomy.tsv --output-path GTDB_ssu_all_r207.taxonomy.qza """ }
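The final patch switches the GTDB r207 download to pre-built QIIME 2 artifacts on Zenodo instead of rebuilding them from the GTDB release tarball. As a quick sanity check (not part of the patches), the listing below confirms the reference files named in this patch series are present after running with `--download_db`; the SILVA QIIME 2 artifacts from data.qiime2.org are also fetched but omitted here for brevity.

```bash
# Sketch only: after running the pipeline with --download_db, confirm the
# expected reference files (names taken from the patches above) are present.
cd databases
ls -lh \
    GTDB_ssu_all_r207.qza \
    GTDB_ssu_all_r207.taxonomy.qza \
    GTDB_bac120_arc53_ssu_r207_fullTaxo.fa.gz \
    silva_nr99_v138.1_wSpecies_train_set.fa.gz \
    RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz
```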