From 037797d90a3b98c491b71f002fab0c83f42ef5f8 Mon Sep 17 00:00:00 2001 From: Khi Pin Chua Date: Tue, 9 May 2023 06:40:15 +0000 Subject: [PATCH] v0.7 --- CHANGELOG.md | 5 + README.md | 1 + env/qiime2-2023.2-py38-linux-conda.yml | 684 +++++++++++++++++++++++++ main.nf | 110 ++-- nextflow.config | 1 + scripts/run_dada_2023.2.R | 527 +++++++++++++++++++ scripts/visualize_biom.Rmd | 2 + 7 files changed, 1284 insertions(+), 46 deletions(-) create mode 100644 env/qiime2-2023.2-py38-linux-conda.yml create mode 100755 scripts/run_dada_2023.2.R diff --git a/CHANGELOG.md b/CHANGELOG.md index b938b44..c0b209a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# v0.7 changelog +* New feature to limit reads to N reads (`--downsample`) in case of samples with extremely high depth. +* Updated Qiime2 to 2023.2 version (This should not affect downstream results) to get rid of +annoying installation issue with old Qiime2 version. + # v0.5 changelog * Allow splitting samples into group for dada2 noise (pool column in metadata TSV, see GitHub documentation [here](https://github.com/PacificBiosciences/pb-16S-nf#pooling)). diff --git a/README.md b/README.md index bef9595..c6ea2a3 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ nextflow run main.nf --help --front_p Forward primer sequence. Default to F27. (default: AGRGTTYGATYMTGGCTCAG) --adapter_p Reverse primer sequence. Default to R1492. (default: AAGTCGTAACAAGGTARCY) --filterQ Filter input reads above this Q value (default: 20). + --downsample Limit reads to a maximum of N reads if there are more than N reads (default: off) --max_ee DADA2 max_EE parameter. Reads with number of expected errors higher than this value will be discarded (default: 2) --minQ DADA2 minQ parameter. Reads with any base lower than this score diff --git a/env/qiime2-2023.2-py38-linux-conda.yml b/env/qiime2-2023.2-py38-linux-conda.yml new file mode 100644 index 0000000..e47d2a4 --- /dev/null +++ b/env/qiime2-2023.2-py38-linux-conda.yml @@ -0,0 +1,684 @@ +channels: + - qiime2/label/r2023.2 + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.8 + - altair=4.2.2 + - anyio=3.6.2 + - argcomplete=2.0.0 + - argon2-cffi=21.3.0 + - argon2-cffi-bindings=21.2.0 + - astor=0.8.1 + - asttokens=2.2.1 + - atpublic=3.0.1 + - attr=2.5.1 + - attrs=22.2.0 + - backcall=0.2.0 + - backports=1.0 + - backports.functools_lru_cache=1.6.4 + - beautifulsoup4=4.11.2 + - bibtexparser=1.4.0 + - binutils_impl_linux-64=2.40 + - bioconductor-ancombc=2.0.1 + - bioconductor-beachmat=2.14.0 + - bioconductor-biobase=2.58.0 + - bioconductor-biocbaseutils=1.0.0 + - bioconductor-biocgenerics=0.44.0 + - bioconductor-biocneighbors=1.16.0 + - bioconductor-biocparallel=1.32.5 + - bioconductor-biocsingular=1.14.0 + - bioconductor-biomformat=1.26.0 + - bioconductor-biostrings=2.66.0 + - bioconductor-dada2=1.26.0 + - bioconductor-data-packages=20230202 + - bioconductor-decipher=2.26.0 + - bioconductor-decontam=1.18.0 + - bioconductor-delayedarray=0.24.0 + - bioconductor-delayedmatrixstats=1.20.0 + - bioconductor-dirichletmultinomial=1.40.0 + - bioconductor-genomeinfodb=1.34.8 + - bioconductor-genomeinfodbdata=1.2.9 + - bioconductor-genomicalignments=1.34.0 + - bioconductor-genomicranges=1.50.0 + - bioconductor-iranges=2.32.0 + - bioconductor-matrixgenerics=1.10.0 + - bioconductor-mia=1.6.0 + - bioconductor-multiassayexperiment=1.24.0 + - bioconductor-multtest=2.54.0 + - bioconductor-phyloseq=1.42.0 + - bioconductor-rhdf5=2.42.0 + - bioconductor-rhdf5filters=1.10.0 + - bioconductor-rhdf5lib=1.20.0 + - bioconductor-rhtslib=2.0.0 + - bioconductor-rsamtools=2.14.0 + - bioconductor-s4vectors=0.36.0 + - bioconductor-scaledmatrix=1.6.0 + - bioconductor-scater=1.26.0 + - bioconductor-scuttle=1.8.0 + - bioconductor-shortread=1.56.0 + - bioconductor-singlecellexperiment=1.20.0 + - bioconductor-sparsematrixstats=1.10.0 + - bioconductor-summarizedexperiment=1.28.0 + - bioconductor-treeio=1.22.0 + - bioconductor-treesummarizedexperiment=2.6.0 + - bioconductor-xvector=0.38.0 + - bioconductor-zlibbioc=1.44.0 + - biom-format=2.1.12 + - blast=2.13.0 + - bleach=6.0.0 + - bokeh=3.0.3 + - bowtie2=2.5.1 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.18.1 + - ca-certificates=2022.12.7 + - cachecontrol=0.12.11 + - cached_property=1.5.2 + - cairo=1.16.0 + - certifi=2022.12.7 + - cffi=1.15.1 + - charset-normalizer=2.1.1 + - click=8.1.3 + - colorama=0.4.6 + - comm=0.1.2 + - contourpy=1.0.7 + - cryptography=39.0.0 + - curl=7.87.0 + - cutadapt=4.2 + - cycler=0.11.0 + - cython=0.29.33 + - dbus=1.13.6 + - deblur=1.1.1 + - debugpy=1.6.6 + - decorator=4.4.2 + - defusedxml=0.7.1 + - dendropy=4.5.2 + - dill=0.3.6 + - dnaio=0.10.0 + - emperor=1.0.3 + - entrez-direct=16.2 + - entrypoints=0.4 + - exceptiongroup=1.1.0 + - executing=1.2.0 + - expat=2.5.0 + - fastcluster=1.2.6 + - fasttree=2.1.11 + - fftw=3.3.10 + - flit-core=3.8.0 + - flufl.lock=7.1 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.14.2 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - fonttools=4.38.0 + - formulaic=0.5.2 + - freetype=2.12.1 + - fribidi=1.0.10 + - future=0.18.3 + - gcc_impl_linux-64=12.2.0 + - gettext=0.21.1 + - gfortran_impl_linux-64=12.2.0 + - giflib=5.2.1 + - glib=2.74.1 + - glib-tools=2.74.1 + - glpk=5.0 + - gmp=6.2.1 + - gneiss=0.4.6 + - graphite2=1.3.13 + - graphlib-backport=1.0.3 + - gsl=2.7 + - gst-plugins-base=1.21.3 + - gstreamer=1.21.3 + - gstreamer-orc=0.4.33 + - gxx_impl_linux-64=12.2.0 + - h5py=2.10.0 + - harfbuzz=6.0.0 + - hdf5=1.10.6 + - hdmedians=0.14.2 + - hmmer=3.1b2 + - htslib=1.16 + - icu=70.1 + - idna=3.4 + - ijson=3.2.0.post0 + - importlib-metadata=4.8.3 + - importlib_metadata=4.8.3 + - importlib_resources=5.12.0 + - iniconfig=2.0.0 + - interface_meta=1.3.0 + - iow=1.0.5 + - ipykernel=6.21.2 + - ipython=8.10.0 + - ipython_genutils=0.2.0 + - ipywidgets=8.0.4 + - iqtree=2.2.0.3 + - isa-l=2.30.0 + - jack=1.9.22 + - jedi=0.18.2 + - jinja2=3.1.2 + - joblib=1.2.0 + - jpeg=9e + - jq=1.6 + - jsonschema=4.17.3 + - jupyter_client=8.0.3 + - jupyter_core=5.2.0 + - jupyter_events=0.6.3 + - jupyter_server=2.3.0 + - jupyter_server_terminals=0.4.4 + - jupyterlab_pygments=0.2.2 + - jupyterlab_widgets=3.0.5 + - kernel-headers_linux-64=2.6.32 + - keyutils=1.6.1 + - kiwisolver=1.4.4 + - krb5=1.20.1 + - lame=3.100 + - lcms2=2.14 + - ld_impl_linux-64=2.40 + - lerc=4.0.0 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcap=2.66 + - libcblas=3.9.0 + - libclang=15.0.7 + - libclang13=15.0.7 + - libcups=2.3.3 + - libcurl=7.87.0 + - libdb=6.2.32 + - libdeflate=1.13 + - libedit=3.1.20191231 + - libev=4.33 + - libevent=2.1.10 + - libffi=3.4.2 + - libflac=1.4.2 + - libgcc-devel_linux-64=12.2.0 + - libgcc-ng=12.2.0 + - libgcrypt=1.10.1 + - libgfortran-ng=12.2.0 + - libgfortran5=12.2.0 + - libglib=2.74.1 + - libgomp=12.2.0 + - libgpg-error=1.46 + - libhwloc=2.9.0 + - libiconv=1.17 + - libidn2=2.3.4 + - liblapack=3.9.0 + - liblapacke=3.9.0 + - libllvm11=11.1.0 + - libllvm15=15.0.7 + - libnghttp2=1.51.0 + - libnsl=2.0.0 + - libogg=1.3.4 + - libopenblas=0.3.21 + - libopus=1.3.1 + - libpng=1.6.39 + - libpq=15.1 + - libsanitizer=12.2.0 + - libsndfile=1.2.0 + - libsodium=1.0.18 + - libsqlite=3.40.0 + - libssh2=1.10.0 + - libstdcxx-devel_linux-64=12.2.0 + - libstdcxx-ng=12.2.0 + - libsystemd0=252 + - libtiff=4.4.0 + - libtool=2.4.7 + - libudev1=253 + - libunistring=0.9.10 + - libuuid=2.32.1 + - libvorbis=1.3.7 + - libwebp-base=1.2.4 + - libxcb=1.13 + - libxkbcommon=1.5.0 + - libxml2=2.10.3 + - libxslt=1.1.37 + - libzlib=1.2.13 + - llvmlite=0.39.1 + - lockfile=0.12.2 + - lxml=4.9.2 + - lz4=4.3.2 + - lz4-c=1.9.4 + - mafft=7.515 + - make=4.3 + - markupsafe=2.1.2 + - matplotlib=3.6.0 + - matplotlib-base=3.6.0 + - matplotlib-inline=0.1.6 + - mistune=2.0.5 + - mpfr=4.1.0 + - mpg123=1.31.2 + - msgpack-python=1.0.4 + - munkres=1.1.4 + - mysql-common=8.0.32 + - mysql-libs=8.0.32 + - natsort=8.3.0 + - nbclassic=0.5.2 + - nbclient=0.7.2 + - nbconvert=7.2.9 + - nbconvert-core=7.2.9 + - nbconvert-pandoc=7.2.9 + - nbformat=5.7.3 + - ncurses=6.3 + - nest-asyncio=1.5.6 + - networkx=3.0 + - nlopt=2.7.1 + - nose=1.3.7 + - notebook=6.5.2 + - notebook-shim=0.2.2 + - nspr=4.35 + - nss=3.88 + - numba=0.56.4 + - numpy=1.23.5 + - oniguruma=6.9.8 + - openjdk=17.0.3 + - openjpeg=2.5.0 + - openssl=1.1.1t + - packaging=23.0 + - pandas=1.5.3 + - pandoc=2.19.2 + - pandocfilters=1.5.0 + - pango=1.50.13 + - parso=0.8.3 + - patsy=0.5.3 + - pbzip2=1.1.13 + - pcre=8.45 + - pcre2=10.40 + - perl=5.32.1 + - perl-archive-tar=2.40 + - perl-carp=1.50 + - perl-common-sense=3.75 + - perl-compress-raw-bzip2=2.201 + - perl-compress-raw-zlib=2.202 + - perl-encode=3.19 + - perl-exporter=5.74 + - perl-exporter-tiny=1.002002 + - perl-extutils-makemaker=7.66 + - perl-io-compress=2.201 + - perl-io-zlib=1.14 + - perl-json=4.10 + - perl-json-xs=2.34 + - perl-list-moreutils=0.430 + - perl-list-moreutils-xs=0.430 + - perl-parent=0.241 + - perl-pathtools=3.75 + - perl-scalar-list-utils=1.63 + - perl-storable=3.15 + - perl-types-serialiser=1.01 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pigz=2.6 + - pillow=9.2.0 + - pip=23.0.1 + - pixman=0.40.0 + - pkgutil-resolve-name=1.3.10 + - platformdirs=3.0.0 + - pluggy=1.0.0 + - ply=3.11 + - prometheus_client=0.16.0 + - prompt-toolkit=3.0.36 + - psutil=5.9.4 + - pthread-stubs=0.4 + - ptyprocess=0.7.0 + - pulseaudio=16.1 + - pure_eval=0.2.2 + - pycparser=2.21 + - pygments=2.14.0 + - pynndescent=0.5.8 + - pyopenssl=23.0.0 + - pyparsing=3.0.9 + - pyqt=5.15.7 + - pyqt5-sip=12.11.0 + - pyrsistent=0.19.3 + - pysocks=1.7.1 + - pytest=7.2.1 + - python=3.8.15 + - python-dateutil=2.8.2 + - python-fastjsonschema=2.16.3 + - python-isal=1.1.0 + - python-json-logger=2.0.7 + - python_abi=3.8 + - pytz=2022.7.1 + - pyyaml=6.0 + - pyzmq=25.0.0 + - q2-alignment=2023.2.0 + - q2-composition=2023.2.0 + - q2-cutadapt=2023.2.0 + - q2-dada2=2023.2.0 + - q2-deblur=2023.2.0 + - q2-demux=2023.2.0 + - q2-diversity=2023.2.0 + - q2-diversity-lib=2023.2.0 + - q2-emperor=2023.2.0 + - q2-feature-classifier=2023.2.0 + - q2-feature-table=2023.2.0 + - q2-fragment-insertion=2023.2.0 + - q2-gneiss=2023.2.0 + - q2-longitudinal=2023.2.0 + - q2-metadata=2023.2.0 + - q2-mystery-stew=2023.2.0 + - q2-phylogeny=2023.2.0 + - q2-quality-control=2023.2.0 + - q2-quality-filter=2023.2.0 + - q2-sample-classifier=2023.2.0 + - q2-taxa=2023.2.0 + - q2-types=2023.2.0 + - q2-vsearch=2023.2.0 + - q2cli=2023.2.0 + - q2galaxy=2023.2.0 + - q2templates=2023.2.0 + - qiime2=2023.2.0 + - qt-main=5.15.6 + - r-acepack=1.4.1 + - r-ade4=1.7_22 + - r-ape=5.7 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.4.1 + - r-base=4.2.2 + - r-base64enc=0.1_3 + - r-beeswarm=0.4.0 + - r-bh=1.81.0_1 + - r-bibtex=0.5.1 + - r-bit=4.0.5 + - r-bit64=4.0.5 + - r-bitops=1.0_7 + - r-blob=1.2.3 + - r-boot=1.3_28.1 + - r-broom=1.0.3 + - r-bslib=0.4.2 + - r-cachem=1.0.7 + - r-cairo=1.6_0 + - r-callr=3.7.3 + - r-cellranger=1.1.0 + - r-checkmate=2.1.0 + - r-class=7.3_21 + - r-cli=3.6.0 + - r-clipr=0.8.0 + - r-cluster=2.1.4 + - r-codetools=0.2_19 + - r-colorspace=2.1_0 + - r-cpp11=0.4.3 + - r-crayon=1.5.2 + - r-curl=4.3.3 + - r-cvxr=1.0_11 + - r-data.table=1.14.8 + - r-dbi=1.1.3 + - r-dbplyr=2.3.1 + - r-deldir=1.0_6 + - r-desctools=0.99.48 + - r-digest=0.6.31 + - r-doparallel=1.0.17 + - r-dorng=1.8.6 + - r-dplyr=1.1.0 + - r-dqrng=0.3.0 + - r-dtplyr=1.3.0 + - r-e1071=1.7_13 + - r-ecosolver=0.5.4 + - r-ellipsis=0.3.2 + - r-emmeans=1.8.4_1 + - r-energy=1.7_11 + - r-estimability=1.4.1 + - r-evaluate=0.20 + - r-exact=3.2 + - r-expm=0.999_7 + - r-fansi=1.0.4 + - r-farver=2.1.1 + - r-fastmap=1.1.1 + - r-fnn=1.1.3.1 + - r-forcats=1.0.0 + - r-foreach=1.5.2 + - r-foreign=0.8_84 + - r-formatr=1.14 + - r-formula=1.2_5 + - r-frictionless=1.0.2 + - r-fs=1.6.1 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-gargle=1.3.0 + - r-gbrd=0.4_11 + - r-generics=0.1.3 + - r-getopt=1.20.3 + - r-ggbeeswarm=0.7.1 + - r-ggplot2=3.4.1 + - r-ggrastr=1.0.1 + - r-ggrepel=0.9.3 + - r-gld=2.6.6 + - r-glue=1.6.2 + - r-gmp=0.7_1 + - r-googledrive=2.0.0 + - r-googlesheets4=1.0.1 + - r-gridextra=2.3 + - r-gsl=2.1_8 + - r-gtable=0.3.1 + - r-haven=2.5.1 + - r-highr=0.10 + - r-hmisc=4.8_0 + - r-hms=1.1.2 + - r-htmltable=2.4.1 + - r-htmltools=0.5.4 + - r-htmlwidgets=1.6.1 + - r-httr=1.4.5 + - r-hwriter=1.3.2.1 + - r-ids=1.0.1 + - r-igraph=1.4.1 + - r-interp=1.1_3 + - r-irlba=2.3.5.1 + - r-isoband=0.2.7 + - r-iterators=1.0.14 + - r-jpeg=0.1_10 + - r-jquerylib=0.1.4 + - r-jsonlite=1.8.4 + - r-knitr=1.42 + - r-labeling=0.4.2 + - r-lambda.r=1.2.4 + - r-lattice=0.20_45 + - r-latticeextra=0.6_30 + - r-lazyeval=0.2.2 + - r-lifecycle=1.0.3 + - r-lme4=1.1_31 + - r-lmertest=3.1_3 + - r-lmom=2.9 + - r-lubridate=1.9.2 + - r-magrittr=2.0.3 + - r-mass=7.3_58.2 + - r-matrix=1.5_3 + - r-matrixstats=0.63.0 + - r-memoise=2.0.1 + - r-mgcv=1.8_41 + - r-mime=0.12 + - r-minqa=1.2.5 + - r-modelr=0.1.10 + - r-munsell=0.5.0 + - r-mvtnorm=1.1_3 + - r-nlme=3.1_162 + - r-nloptr=2.0.3 + - r-nnet=7.3_18 + - r-numderiv=2016.8_1.1 + - r-openssl=2.0.5 + - r-optparse=1.7.3 + - r-osqp=0.6.0.8 + - r-permute=0.9_7 + - r-pheatmap=1.0.12 + - r-pillar=1.8.1 + - r-pixmap=0.4_12 + - r-pkgconfig=2.0.3 + - r-pkgmaker=0.32.8 + - r-plogr=0.2.0 + - r-plyr=1.8.8 + - r-png=0.1_8 + - r-prettyunits=1.1.1 + - r-processx=3.8.0 + - r-progress=1.2.2 + - r-proxy=0.4_27 + - r-ps=1.7.2 + - r-purrr=1.0.1 + - r-r6=2.5.1 + - r-ragg=1.2.4 + - r-rappdirs=0.3.3 + - r-rbibutils=2.2.13 + - r-rcolorbrewer=1.1_3 + - r-rcpp=1.0.10 + - r-rcppannoy=0.0.20 + - r-rcppeigen=0.3.3.9.3 + - r-rcpphnsw=0.4.1 + - r-rcppml=0.3.7 + - r-rcppparallel=5.1.6 + - r-rcppprogress=0.4.2 + - r-rcurl=1.98_1.10 + - r-rdpack=2.4 + - r-readr=2.1.4 + - r-readxl=1.4.2 + - r-registry=0.5_1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.2 + - r-reshape2=1.4.4 + - r-rlang=1.0.6 + - r-rmarkdown=2.20 + - r-rmpfr=0.9_1 + - r-rngtools=1.5.2 + - r-rootsolve=1.8.2.3 + - r-rpart=4.1.19 + - r-rspectra=0.16_1 + - r-rsqlite=2.2.20 + - r-rstudioapi=0.14 + - r-rsvd=1.0.5 + - r-rtsne=0.16 + - r-rvest=1.0.3 + - r-sass=0.4.5 + - r-scales=1.2.1 + - r-scs=3.0_1 + - r-selectr=0.4_2 + - r-sitmo=2.0.2 + - r-snow=0.4_4 + - r-sp=1.6_0 + - r-statmod=1.5.0 + - r-stringi=1.7.12 + - r-stringr=1.5.0 + - r-survival=3.5_3 + - r-sys=3.4.1 + - r-systemfonts=1.0.4 + - r-textshaping=0.3.6 + - r-tibble=3.1.8 + - r-tidyr=1.3.0 + - r-tidyselect=1.2.0 + - r-tidytree=0.4.2 + - r-tidyverse=1.3.2 + - r-timechange=0.2.0 + - r-tinytex=0.44 + - r-tzdb=0.3.0 + - r-utf8=1.2.3 + - r-uuid=1.1_0 + - r-uwot=0.1.14 + - r-vctrs=0.5.2 + - r-vegan=2.6_4 + - r-vipor=0.4.5 + - r-viridis=0.6.2 + - r-viridislite=0.4.1 + - r-vroom=1.6.1 + - r-withr=2.5.0 + - r-xfun=0.37 + - r-xml2=1.3.3 + - r-xtable=1.8_4 + - r-yaml=2.3.7 + - r-yulab.utils=0.0.6 + - raxml=8.2.12 + - readline=8.1.2 + - requests=2.28.2 + - rfc3339-validator=0.1.4 + - rfc3986-validator=0.1.1 + - samtools=1.16.1 + - scikit-bio=0.5.7 + - scikit-learn=0.24.1 + - scipy=1.8.1 + - seaborn=0.12.2 + - seaborn-base=0.12.2 + - sed=4.8 + - send2trash=1.8.0 + - sepp=4.3.10 + - setuptools=67.4.0 + - sip=6.7.7 + - six=1.16.0 + - sniffio=1.3.0 + - sortmerna=2.0 + - soupsieve=2.3.2.post1 + - stack_data=0.6.2 + - statsmodels=0.13.5 + - sysroot_linux-64=2.12 + - tbb=2021.8.0 + - terminado=0.17.1 + - threadpoolctl=3.1.0 + - tinycss2=1.2.1 + - tk=8.6.12 + - tktable=2.10 + - toml=0.10.2 + - tomli=2.0.1 + - toolz=0.12.0 + - tornado=6.2 + - tqdm=4.64.1 + - traitlets=5.9.0 + - typing-extensions=4.4.0 + - typing_extensions=4.4.0 + - tzlocal=2.1 + - umap-learn=0.5.3 + - unicodedata2=15.0.0 + - unifrac=1.1.1 + - unifrac-binaries=1.1.1 + - urllib3=1.26.14 + - vsearch=2.22.1 + - wcwidth=0.2.6 + - webencodings=0.5.1 + - websocket-client=1.5.1 + - wget=1.20.3 + - wheel=0.38.4 + - widgetsnbextension=4.0.5 + - wrapt=1.15.0 + - xcb-util=0.4.0 + - xcb-util-image=0.4.0 + - xcb-util-keysyms=0.4.0 + - xcb-util-renderutil=0.3.9 + - xcb-util-wm=0.4.1 + - xmltodict=0.13.0 + - xopen=1.7.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xyzservices=2023.2.0 + - xz=5.2.6 + - yaml=0.2.5 + - yq=3.1.1 + - zeromq=4.3.4 + - zipp=3.15.0 + - zlib=1.2.13 + - zstandard=0.19.0 + - zstd=1.5.2 + - krona=2.8.1 + - bc=1.07.1 + - git=2.39.1 + - jq=1.6 diff --git a/main.nf b/main.nf index 28d4a2d..68da920 100644 --- a/main.nf +++ b/main.nf @@ -7,7 +7,7 @@ in demultiplexed 16S amplicon sequencing FASTQ file. =============================================================================== Author: Khi Pin, Chua -Updated: 2023-1-25 +Updated: 2023-3-31 */ nextflow.enable.dsl=2 @@ -32,6 +32,7 @@ def helpMessage() { --front_p Forward primer sequence. Default to F27. (default: AGRGTTYGATYMTGGCTCAG) --adapter_p Reverse primer sequence. Default to R1492. (default: AAGTCGTAACAAGGTARCY) --filterQ Filter input reads above this Q value (default: 20). + --downsample Limit reads to a maximum of N reads if there are more than N reads (default: off) --max_ee DADA2 max_EE parameter. Reads with number of expected errors higher than this value will be discarded (default: 2) --minQ DADA2 minQ parameter. Reads with any base lower than this score @@ -86,7 +87,7 @@ def helpMessage() { params.help = false if (params.help) exit 0, helpMessage() params.version = false -version = "0.5" +version = "0.6" if (params.version) exit 0, log.info("$version") params.download_db = false params.skip_primer_trim = false @@ -98,6 +99,8 @@ params.max_len = 1600 params.colorby = "condition" params.skip_phylotree = false params.minQ = 0 +// Downsample to N reads, default to 0 which means disable +params.downsample = 0 // Check input params.input = false if (params.input){ @@ -150,7 +153,7 @@ params.rmd_vis_biom_script= "$projectDir/scripts/visualize_biom.Rmd" params.rmd_helper = "$projectDir/scripts/import_biom.R" params.cutadapt_cpu = 16 params.primer_fasta = "$projectDir/scripts/16S_primers.fasta" -params.dadaCCS_script = "$projectDir/scripts/run_dada_ccs.R" +params.dadaCCS_script = "$projectDir/scripts/run_dada_2023.2.R" params.dadaAssign_script = "$projectDir/scripts/dada2_assign_tax.R" params.publish_dir_mode = "symlink" @@ -160,6 +163,7 @@ log_text = """ Number of samples in samples TSV: $n_sample Filter input reads above Q: $params.filterQ Trim primers with cutadapt: $trim_cutadapt + Limit to N reads if exceeding N reads (0 = disabled): $params.downsample Forward primer: $params.front_p Reverse primer: $params.adapter_p Minimum amplicon length filtered in DADA2: $params.min_len @@ -194,6 +198,7 @@ process write_log{ conda (params.enable_conda ? "$projectDir/env/pb-16s-pbtools.yml" : null) container "kpinpb/pb-16s-nf-tools:latest" publishDir "$params.outdir", mode: params.publish_dir_mode + label 'cpu_def' input: val(logs) @@ -224,6 +229,16 @@ process QC_fastq { path("${sampleID}.filterQ${params.filterQ}.fastq.gz"), emit: filtered_fastq_files script: + if (params.downsample > 0){ + """ + seqkit fx2tab -j $task.cpus -q --gc -l -H -n -i $sampleFASTQ |\ + csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.readstats.tsv + seqkit stats -T -j $task.cpus -a ${sampleFASTQ} |\ + csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.summarystats.tsv + seqkit seq -j $task.cpus --min-qual $params.filterQ $sampleFASTQ |\ + seqkit head -n $params.downsample --out-file ${sampleID}.filterQ${params.filterQ}.fastq.gz + """ + } else { """ seqkit fx2tab -j $task.cpus -q --gc -l -H -n -i $sampleFASTQ |\ csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.readstats.tsv @@ -231,12 +246,13 @@ process QC_fastq { csvtk mutate2 -C '%' -t -n sample -e '"${sampleID}"' > ${sampleID}.seqkit.summarystats.tsv seqkit seq -j $task.cpus --min-qual $params.filterQ $sampleFASTQ --out-file ${sampleID}.filterQ${params.filterQ}.fastq.gz """ + } } // Trim full length 16S primers with cutadapt process cutadapt { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/trimmed_primers_FASTQ", pattern: '*.fastq.gz', mode: params.publish_dir_mode publishDir "$params.outdir/cutadapt_summary", pattern: '*.report', mode: params.publish_dir_mode cpus params.cutadapt_cpu @@ -389,7 +405,7 @@ process prepare_qiime2_manifest_skip_cutadapt { path(metadata) output: - path "samplefile.txt", emit: sample_trimmed_file + path "samplefile*.txt", emit: sample_trimmed_file """ echo -e "sample-id\tabsolute-filepath" > samplefile.txt @@ -412,8 +428,8 @@ process prepare_qiime2_manifest_skip_cutadapt { // Import data into QIIME 2, note the `path *` in input is staging all the input // FASTQ into the process so it'll work on AWS that has no notion of local directory path process import_qiime2 { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/import_qiime", mode: params.publish_dir_mode label 'cpu_def' @@ -455,8 +471,8 @@ process merge_sample_manifest { // Summarize demultiplex statistics process demux_summarize { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/summary_demux", mode: params.publish_dir_mode label 'cpu_def' @@ -480,8 +496,8 @@ process demux_summarize { // Denoise into ASVs process dada2_denoise { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/dada2", mode: params.publish_dir_mode cpus params.dada2_cpu @@ -501,12 +517,12 @@ process dada2_denoise { """ # Use custom script that can skip primer trimming mkdir -p dada2_custom_script - cp $dada_ccs_script dada2_custom_script/run_dada_ccs_original.R - sed 's/minQ\\ =\\ 0/minQ=$minQ/g' dada2_custom_script/run_dada_ccs_original.R > \ - dada2_custom_script/run_dada_ccs.R - chmod +x dada2_custom_script/run_dada_ccs.R + cp $dada_ccs_script dada2_custom_script/run_dada.R + # sed 's/minQ\\ =\\ 0/minQ=$minQ/g' dada2_custom_script/run_dada_ccs_original.R > \ + # dada2_custom_script/run_dada_ccs.R + chmod +x dada2_custom_script/run_dada.R export PATH="./dada2_custom_script:\$PATH" - which run_dada_ccs.R + which run_dada.R qiime dada2 denoise-ccs --i-demultiplexed-seqs $samples_qza \ --o-table dada2-ccs_table.qza \ --o-representative-sequences dada2-ccs_rep.qza \ @@ -525,8 +541,8 @@ process dada2_denoise { // Take note of the output above! There will only be one set of out from // different group in dada2 folder after dada2_denoise process mergeASV { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/dada2", mode: params.publish_dir_mode cpus params.dada2_cpu @@ -564,8 +580,8 @@ process mergeASV { // Filter DADA2 ASVs using minimum number of samples and frequency of ASVs process filter_dada2 { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/dada2", mode: params.publish_dir_mode cpus params.dada2_cpu @@ -621,8 +637,8 @@ process filter_dada2 { // Assign taxonomies to SILVA, GTDB and RefSeq using DADA2 // assignTaxonomy function based on Naive Bayes classifier process dada2_assignTax { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", pattern: 'best_tax*', mode: params.publish_dir_mode publishDir "$params.outdir/nb_tax", mode: params.publish_dir_mode cpus params.vsearch_cpu @@ -670,8 +686,8 @@ process dada2_assignTax { // QC summary for dada2 process dada2_qc { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -748,8 +764,8 @@ process dada2_qc { } process qiime2_phylogeny_diversity { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results/phylogeny_diversity", mode: params.publish_dir_mode label 'cpu8' @@ -826,8 +842,8 @@ process qiime2_phylogeny_diversity { // Rarefaction visualization process dada2_rarefaction { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -850,8 +866,8 @@ process dada2_rarefaction { // Classify taxonomy and export table using VSEARCH approach process class_tax { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode cpus params.vsearch_cpu @@ -866,6 +882,7 @@ process class_tax { path "tax_export/taxonomy.tsv", emit: tax_tsv path "merged_freq_tax.qzv", emit: tax_freq_tab path "vsearch_merged_freq_tax.tsv", emit: tax_freq_tab_tsv + path "taxonomy.vsearch.blast_results.qza", emit: tax_vsearch_blast script: """ @@ -877,7 +894,8 @@ process class_tax { --p-maxrejects $params.maxreject \ --p-maxaccepts $params.maxaccept \ --p-perc-identity $params.vsearch_identity \ - --p-top-hits-only + --p-top-hits-only \ + --o-search-results taxonomy.vsearch.blast_results.qza qiime tools export --input-path taxonomy.vsearch.qza --output-path tax_export @@ -898,8 +916,8 @@ process class_tax { // Export results into biom for use with phyloseq process export_biom { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -936,8 +954,8 @@ process export_biom { // Export results into biom for use with phyloseq process export_biom_skip_nb { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -985,8 +1003,8 @@ process picrust2 { } process barplot { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -1008,8 +1026,8 @@ process barplot { } process barplot_nb { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' @@ -1122,8 +1140,8 @@ process html_rep_skip_cutadapt { // Krona plot process krona_plot { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$params.outdir/results", mode: params.publish_dir_mode label 'cpu_def' // Ignore if this fail @@ -1153,8 +1171,8 @@ process krona_plot { } process download_db { - conda (params.enable_conda ? "$projectDir/env/qiime2-2022.2-py38-linux-conda.yml" : null) - container "kpinpb/pb-16s-nf-qiime:latest" + conda (params.enable_conda ? "$projectDir/env/qiime2-2023.2-py38-linux-conda.yml" : null) + container "kpinpb/pb-16s-nf-qiime:v0.7" publishDir "$projectDir/databases", mode: "copy" label 'cpu_def' @@ -1171,8 +1189,8 @@ process download_db { wget -N --content-disposition 'https://zenodo.org/record/4735821/files/RefSeq_16S_6-11-20_RDPv16_fullTaxo.fa.gz?download=1' echo "Downloading SILVA sequences and taxonomies for Naive Bayes classifier" - wget -N --content-disposition 'https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza' - wget -N --content-disposition 'https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza' + wget -N --content-disposition 'https://data.qiime2.org/2023.2/common/silva-138-99-seqs.qza' + wget -N --content-disposition 'https://data.qiime2.org/2023.2/common/silva-138-99-tax.qza' echo "Downloading GTDB database" wget -N --content-disposition --no-check-certificate 'https://zenodo.org/record/6912512/files/GTDB_ssu_all_r207.taxonomy.qza?download=1' diff --git a/nextflow.config b/nextflow.config index 75a3006..5680b67 100644 --- a/nextflow.config +++ b/nextflow.config @@ -104,4 +104,5 @@ timeline { dag { enabled = true file = "report_$params.outdir/dag.html" + overwrite = true } diff --git a/scripts/run_dada_2023.2.R b/scripts/run_dada_2023.2.R new file mode 100755 index 0000000..2d8be72 --- /dev/null +++ b/scripts/run_dada_2023.2.R @@ -0,0 +1,527 @@ +#!/usr/bin/env Rscript + +################################################### +# This R script takes an input directory of .fastq.gz files +# and outputs a tsv file of the dada2 processed sequence +# table. It is intended for use with the QIIME2 plugin +# for DADA2. +# +# Ex: Rscript run_dada_single.R input_dir output.tsv track.tsv filtered_dir 200 0 2.0 2 Inf pooled 1.0 0 1000000 NULL 32 +#################################################### + +#################################################### +# DESCRIPTION OF ARGUMENTS # +#################################################### +# NOTE: All numeric arguments should be zero or positive. +# NOTE: All numeric arguments save maxEE are expected to be integers. +# NOTE: Currently the filterered_dir must already exist. +# NOTE: ALL ARGUMENTS ARE POSITIONAL! +# +### FILE SYSTEM ARGUMENTS ### +# +# 1) input_directory - File path to directory with the .fastq.gz files to be processed. +# Ex: path/to/dir/with/fastqgzs +# +# 2) output_path - File path to output tsv file. If already exists, will be overwritten. +# Ex: path/to/output_file.tsv +# +# 3) output_track - File path to tracking tsv file. If already exists, will be overwritten. +# Ex: path/to/tracking_stats.tsv +# +# 4) removed_primer_directory - File path to directory in which to write the primer.removed .fastq.gz +# files. These files are intermediate for the full workflow. +# Currently they remain after the script finishes. +# Directory must already exist. +# Ex: path/to/dir/with/fastqgzs/primerremoved +# +# 5) filtered_directory - File path to directory in which to write the filtered .fastq.gz files. +# These files are intermediate for the full workflow. +# Currently they remain after the script finishes. +# Directory must already exist. +# Ex: path/to/dir/with/fastqgzs/filtered +# +### PRIMER REMOVING ARGUMENTS ### +# +# 6) forward_primer - Primer front of Pacbio CCS sequences. +# Ex: 'AGRGTTYGATYMTGGCTCAG' +# +# 7) reverse_primer - Primer adapter of Pacbio CCS sequences. +# Ex: 'RGYTACCTTGTTACGACTT' +# +# 8) max_mismatch - The number of mismatches to tolerate when matching +# reads to primer sequences. +# Ex: 2 +# +# 9) indels - Allow insertions or deletions of bases when matching adapters. +# Ex: FALSE +# +### FILTERING ARGUMENTS ### +# +# 10) truncation_length - The position at which to truncate reads. Reads shorter +# than truncation_length will be discarded. +# Special values: 0 - no truncation or length filtering. +# Ex: 150 +# +# 11) trim_left - The number of nucleotides to remove from the start of +# each read. Should be less than truncation_length for obvious reasons. +# Ex: 0 +# +# 12) max_expected_errors - Reads with expected errors higher than maxEE are discarded. +# Ex: 2.0 +# +# 13) truncation_quality_score - Reads are truncated at the first instance of quality score truncation_quality_score. +# If the read is then shorter than truncation_length , it is discarded. +# Ex: 2 +# +# 14) min_length - Remove reads with length shorter than min_length. min_length is enforced +# after trimming and truncation. +# Default Inf - no maximum. +# +# 15) max_length - Remove reads with length greater than max_length. max_length is enforced on the raw reads. +# Default Inf - no maximum. +# Ex: 300 +# +### SENSITIVITY ARGUMENTS ### +# +# 16) pooling_method- The method used to pool (or not) samples during denoising. +# Valid options are: +# independent: (Default) No pooling, samples are denoised indpendently. +# pseudo: Samples are "pseudo-pooled" for denoising. +# Ex: independent +# +# +### CHIMERA ARGUMENTS ### +# +# 17) chimera_method - The method used to remove chimeras. Valid options are: +# none: No chimera removal is performed. +# pooled: All reads are pooled prior to chimera detection. +# consensus: Chimeras are detected in samples individually, and a consensus decision +# is made for each sequence variant. +# Ex: consensus +# +# 18) min_parental_fold - The minimum abundance of potential "parents" of a sequence being +# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being +# tested. Values should be greater than or equal to 1 (i.e. parents should be more +# abundant than the sequence being tested). +# Ex: 1.0 +# +### SPEED ARGUMENTS ### +# +# 19) num_threads - The number of threads to use. +# Special values: 0 - detect available cores and use all. +# Ex: 1 +# +# 20) learn_min_reads - The minimum number of reads to learn the error model from. +# Special values: 0 - Use all input reads. +# Ex: 1000000 +# +### GLOBAL OPTION ARGUMENTS ### +# +# 21) homopolymer_gap_penalty - The cost of gaps in homopolymer regions (>=3 repeated bases). +# Default is NULL, which causes homopolymer gaps +# to be treated as normal gaps. +# Ex: -1 +# +# 22) band_size - When set, banded Needleman-Wunsch alignments are performed. +# The default value of band_size is 16. Setting BAND_SIZE to a negative +# number turns off banding (i.e. full Needleman-Wunsch). +# Ex: 32 +# + +# error handling ----------------- +options(error = function() { + sink(stderr()) + on.exit(sink(NULL)) + traceback(3) + if (!interactive()) { + q(status = 1) + } +}) + +library("optparse") + +cat(R.version$version.string, "\n") +errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) } +getN <- function(x) sum(getUniques(x)) #Function added from paired read processing + +option_list = list( + make_option(c("--input_directory"), action="store", default='NULL', type='character', + help="File path to directory with the .fastq.gz files to be processed"), + make_option(c("--input_directory_reverse"), action="store", default='NULL', type='character', + help="File path to reverse directory with the .fastq.gz files to be processed. Only used in paired-end processing"), + make_option(c("--output_path"), action="store", default='NULL', type='character', + help="File path to output tsv file. If already exists, will be overwritten"), + make_option(c("--output_track"), action="store", default='NULL', type='character', + help="File path to tracking tsv file. If already exists, will be overwritten"), + make_option(c("--removed_primer_directory"), action="store", default='NULL', type='character', + help="File path to directory in which to write the primer.removed .fastq.gz files"), + make_option(c("--filtered_directory"), action="store", default='NULL', type='character', + help="File path to directory in which to write the filtered .fastq.gz files. These files are intermediate"), + make_option(c("--filtered_directory_reverse"), action="store", default='NULL', type='character', + help="File path to directory in which to write the reverse filtered .fastq.gz files. These files are intermediate. Only used in paired-end processing"), + make_option(c("--forward_primer"), action="store", default='NULL', type='character', + help="Primer front of Pacbio CCS sequences"), + make_option(c("--reverse_primer"), action="store", default='NULL', type='character', + help="Primer adapter of Pacbio CCS sequences"), + make_option(c("--max_mismatch"), action="store", default='NULL', type='character', + help="The number of mismatches to tolerate when matching reads to primer sequences."), + make_option(c("--indels"), action="store", default='NULL', type='character', + help="Allow insertions or deletions of bases when matching adapters"), + make_option(c("--truncation_length"), action="store", default='NULL', type='character', + help="The position at which to truncate reads. Reads shorter then truncation_length will be discarded."), + make_option(c("--truncation_length_reverse"), action="store", default='NULL', type='character', + help="The position at which to truncate reverse reads. Reads shorter then truncation_length will be discarded. Only used in paired-end processing"), + make_option(c("--trim_left"), action="store", default='NULL', type='character', + help="The number of nucleotides to remove from the start of each read. Should be less than truncation_length for obvious reasons"), + make_option(c("--trim_left_reverse"), action="store", default='NULL', type='character', + help="The number of nucleotides to remove from the start of each reverse read. Should be less than truncation_length for obvious reasons. Only used in paired-end processing"), + make_option(c("--max_expected_errors"), action="store", default='NULL', type='character', + help="Reads with expected errors higher than max_expected_errors are discarded"), + make_option(c("--max_expected_errors_reverse"), action="store", default='NULL', type='character', + help="Reverse reads with expected errors higher than max_expected_errors are discarded. Only used in paired-end processing"), + make_option(c("--truncation_quality_score"), action="store", default='NULL', type='character', + help="Reads are truncated at the first instance of quality score truncation_quality_score.If the read is then shorter than truncation_length, it is discarded"), + make_option(c("--min_length"), action="store", default='NULL', type='character', + help="Remove reads with length shorter than min_length. min_length is enforced after trimming and truncation."), + make_option(c("--max_length"), action="store", default='NULL', type='character', + help="Remove reads with length greater than max_length. max_length is enforced on the raw reads."), + make_option(c("--min_overlap"), action="store", default='NULL', type='character', + help="Minimum overlap allowed between reads"), + make_option(c("--pooling_method"), action="store", default='NULL', type='character', + help="The method used to pool (or not) samples during denoising (independent/pseudo)"), + make_option(c("--chimera_method"), action="store", default='NULL', type='character', + help="The method used to remove chimeras (none/pooled/consensus)"), + make_option(c("--min_parental_fold"), action="store", default='NULL', type='character', + help="The minimum abundance of potential parents of a sequence being tested as chimeric, expressed as a fold-change versus the abundance of the sequence being tested. Values should be greater than or equal to 1"), + make_option(c("--allow_one_off"), action="store", default='NULL', type='character', + help="Bimeras that are one-off (one mismatch/indel away from an exact bimera) are also identified if allow_one_off is TRUE."), + make_option(c("--num_threads"), action="store", default='NULL', type='character', + help="The number of threads to use"), + make_option(c("--learn_min_reads"), action="store", default='NULL', type='character', + help="The minimum number of reads to learn the error model from"), + make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character', + help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."), + make_option(c("--band_size"), action="store", default='NULL', type='character', + help="When set, banded Needleman-Wunsch alignments are performed.") +) +opt = parse_args(OptionParser(option_list=option_list)) + +# Added by Khi Pin 2023-4-21 to save all outputs from scripts +con <- file("dada2.log") +sink(con, append=TRUE) +sink(con, append=TRUE, type="message") + + +# Assign each of the arguments, in positional order, to an appropriately named R variable +inp.dir <- opt$input_directory +inp.dirR <- opt$input_directory_reverse #added from paired arguments +out.path <- opt$output_path +out.track <- opt$output_track +primer.removed.dir <- opt$removed_primer_directory #added from CCS arguments +filtered.dir <- opt$filtered_directory +filtered.dirR<- opt$filtered_directory_reverse #added from paired arguments +primerF <- opt$forward_primer #added from CCS arguments +primerR <- opt$reverse_primer #added from CCS arguments +maxMismatch <- if(opt$max_mismatch=='NULL') NULL else as.numeric(opt$max_mismatch) #added from CCS arguments +indels <- if(opt$indels=='NULL') NULL else as.logical(opt$indels) #added from CCS arguments +truncLen <- if(opt$truncation_length=='NULL') NULL else as.integer(opt$truncation_length) +truncLenR <- if(opt$truncation_length_reverse=='NULL') NULL else as.integer(opt$truncation_length_reverse) #added from paired arguments +trimLeft <- if(opt$trim_left=='NULL') NULL else as.integer(opt$trim_left) +trimLeftR<-if(opt$trim_left_reverse=='NULL') NULL else as.integer(opt$trim_left_reverse) #added from paired arguments +maxEE <- if(opt$max_expected_errors=='NULL') NULL else as.numeric(opt$max_expected_errors) +maxEER <- if(opt$max_expected_errors_reverse=='NULL') NULL else as.numeric(opt$max_expected_errors_reverse) #added from paired arguments +truncQ <- if(opt$truncation_quality_score=='NULL') NULL else as.integer(opt$truncation_quality_score) +minLen <- if(opt$min_length=='NULL') NULL else as.numeric(opt$min_length) #added from CCS arguments +maxLen <- if(opt$max_length=='NULL') NULL else as.numeric(opt$max_length) # Allows Inf +minOverlap <- if(opt$min_overlap=='NULL') NULL else as.integer(opt$min_overlap) #added from paired arguments +poolMethod <- opt$pooling_method +chimeraMethod <- opt$chimera_method +minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_parental_fold) +allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off) +nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads) +nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads) +# The following args are not directly exposed to end users in q2-dada2, +# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`. +if (opt$homopolymer_gap_penalty=='NULL'){ + HOMOPOLYMER_GAP_PENALTY<-NULL +}else{ + HOMOPOLYMER_GAP_PENALTY<-as.integer(opt$homopolymer_gap_penalty) + if(HOMOPOLYMER_GAP_PENALTY>0){ + HOMOPOLYMER_GAP_PENALTY<-HOMOPOLYMER_GAP_PENALTY*(-1) #negative numbers cannot be passed using optparse, so we convert it here + } +} +BAND_SIZE <- if(opt$band_size=='NULL') NULL else as.integer(opt$band_size) + +### VALIDATE ARGUMENTS ### +# Input directory is expected to contain .fastq.gz file(s) +# that have not yet been filtered and globally trimmed +# to the same length. +if(!dir.exists(inp.dir)) { + errQuit("Input directory does not exist.") +} else { + unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE) + if(length(unfilts) == 0) { + errQuit("No input files with the expected filename format found.") + } + if(inp.dirR!='NULL'){ + unfiltsR <- list.files(inp.dirR, pattern=".fastq.gz$", full.names=TRUE) + if(length(unfiltsR) == 0) { + errQuit("No input reverse files with the expected filename format found.") + } + if(length(unfilts) != length(unfiltsR)) { + errQuit("Different numbers of forward and reverse .fastq.gz files.") + } + + } + +} + +# Output files are to be filenames (not directories) and are to be +# removed and replaced if already present. +for(fn in c(out.path, out.track)) { + if(dir.exists(fn)) { + errQuit("Output filename ", fn, " is a directory.") + } else if(file.exists(fn)) { + invisible(file.remove(fn)) + } +} + +# Convert nthreads to the logical/numeric expected by dada2 +if(nthreads < 0) { + errQuit("nthreads must be non-negative.") +} else if(nthreads == 0) { + multithread <- TRUE # detect and use all +} else if(nthreads == 1) { + multithread <- FALSE +} else { + multithread <- nthreads +} + +### LOAD LIBRARIES ### +suppressWarnings(library(methods)) +suppressWarnings(library(dada2)) +cat("DADA2:", as.character(packageVersion("dada2")), "/", + "Rcpp:", as.character(packageVersion("Rcpp")), "/", + "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n") + +### Remove Primers ### +if(primer.removed.dir!='NULL'){ #for CCS read analysis + cat("1) Removing Primers\n") + nop <- file.path(primer.removed.dir, basename(unfilts)) + + if(primerF == 'none' && primerR == 'none'){ + nop <- unfilts + } else { + prim <- suppressWarnings(removePrimers(unfilts, nop, primer, dada2:::rc(primerR), + max.mismatch = maxMismatch, allow.indels = indels, + orient = TRUE, verbose = TRUE)) + cat(ifelse(file.exists(nop), ".", "x"), sep="") + nop <- list.files(primer.removed.dir, pattern=".fastq.gz$", full.names=TRUE) + cat("\n") + if(length(nop) == 0) { # All reads were filtered out + errQuit("No reads passed the Removing Primers step (Did you select the right primers?)", status=2) + } + } +} + +### TRIM AND FILTER ### +cat("2) Filtering ") +if(primer.removed.dir!='NULL'){ #for CCS read analysis + filts <- file.path(filtered.dir, basename(nop)) + out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft, + maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE, + multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 0)) +}else{ + filts <- file.path(filtered.dir, basename(unfilts)) + if(inp.dirR!='NULL'){#for paired read analysis + filtsR <- file.path(filtered.dirR, basename(unfiltsR)) + out <- suppressWarnings(filterAndTrim(unfilts, filts, unfiltsR, filtsR, + truncLen=c(truncLen, truncLenR), trimLeft=c(trimLeft, trimLeftR), + maxEE=c(maxEE, maxEER), truncQ=truncQ, rm.phix=TRUE, + multithread=multithread)) + }else{#for sinlge/pyro read analysis + out <- suppressWarnings(filterAndTrim(unfilts, filts, truncLen=truncLen, trimLeft=trimLeft, + maxEE=maxEE, truncQ=truncQ, rm.phix=TRUE, + multithread=multithread, maxLen=maxLen)) + } +} + +cat(ifelse(file.exists(filts), ".", "x"), sep="") +filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE) + +if(inp.dirR!='NULL'){ + filtsR <- list.files(filtered.dirR, pattern=".fastq.gz$", full.names=TRUE) +} +cat("\n") +if(length(filts) == 0) { # All reads were filtered out + errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2) +} + +### LEARN ERROR RATES ### +# Dereplicate enough samples to get nreads.learn total reads +cat("3) Learning Error Rates\n") +if(primer.removed.dir!='NULL'){#for CCS read analysis + err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, + errorEstimationFunction=dada2:::PacBioErrfun, + multithread=multithread, BAND_SIZE=BAND_SIZE)) + err_plot <- plotErrors(err) + pdf("plot_error_model.pdf", width=12, height=8, useDingbats=FALSE) + err_plot + dev.off() + +}else if(inp.dirR!='NULL'){#for paired read analysis + + err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread)) + errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread)) + +}else{#for sinlge/pyro read analysis + err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread, + HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE)) +} + +### PROCESS ALL SAMPLES ### +# Loop over rest in streaming fashion with learned error rates + +if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis + dds <- vector("list", length(filts)) + cat("4) Denoise samples ") + cat("\n") + for(j in seq(length(filts))) { + drp <- derepFastq(filts[[j]]) + dds[[j]] <- dada(drp, err=err, multithread=multithread,HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, + BAND_SIZE=BAND_SIZE, verbose=FALSE) + cat(".") + } + cat("\n") + if(poolMethod == "pseudo") { + cat(" Pseudo-pool step ") + ### TEMPORARY, to be removed once 1.12 makes its way to Q2 + ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately. + ### pseudo_priors code copied from dada2.R + st <- makeSequenceTable(dds) + pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf] + rm(st) + ### \pseudo_priors code copied from dada2.R + ### code copied from previous loop through samples in this script + for(j in seq(length(filts))) { + drp <- derepFastq(filts[[j]]) + dds[[j]] <- dada(drp, err=err, multithread=multithread, + priors=pseudo_priors, HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, + BAND_SIZE=BAND_SIZE, verbose=FALSE) + cat(".") + } + cat("\n") + ### \code copied from previous loop through samples in this script + } + # Make sequence table + seqtab <- makeSequenceTable(dds) +}else{#for paired read analysis + denoisedF <- rep(0, length(filts)) + ddsF <- vector("list", length(filts)) + ddsR <- vector("list", length(filts)) + mergers <- vector("list", length(filts)) + cat("3) Denoise samples ") + + for(j in seq(length(filts))) { + drpF <- derepFastq(filts[[j]]) + ddsF[[j]] <- dada(drpF, err=err, multithread=multithread, verbose=FALSE) + drpR <- derepFastq(filtsR[[j]]) + ddsR[[j]] <- dada(drpR, err=errR, multithread=multithread, verbose=FALSE) + cat(".") + } + cat("\n") + if(poolMethod == "pseudo") { + cat(" Pseudo-pool step ") + ### TEMPORARY, to be removed once 1.12 makes its way to Q2 + ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately. + ### pseudo_priors code copied from dada2.R + stF <- makeSequenceTable(ddsF) + pseudo_priorsF <- colnames(stF)[colSums(stF>0) >= 2 | colSums(stF) >= Inf] + rm(stF) + stR <- makeSequenceTable(ddsR) + pseudo_priorsR <- colnames(stR)[colSums(stR>0) >= 2 | colSums(stR) >= Inf] + rm(stR) + ### \pseudo_priors code copied from dada2.R + ### code copied from previous loop through samples in this script + for(j in seq(length(filts))) { + drpF <- derepFastq(filts[[j]]) + ddsF[[j]] <- dada(drpF, err=err, priors=pseudo_priorsF, + multithread=multithread, verbose=FALSE) + drpR <- derepFastq(filtsR[[j]]) + ddsR[[j]] <- dada(drpR, err=errR, priors=pseudo_priorsR, + multithread=multithread, verbose=FALSE) + cat(".") + } + cat("\n") + ### \code copied from previous loop through samples in this script + } + + ### Now loop through and do merging + for(j in seq(length(filts))) { + drpF <- derepFastq(filts[[j]]) + drpR <- derepFastq(filtsR[[j]]) + mergers[[j]] <- mergePairs(ddsF[[j]], drpF, ddsR[[j]], drpR, minOverlap=minOverlap) + denoisedF[[j]] <- getN(ddsF[[j]]) + cat(".") + } + cat("\n") + # Make sequence table + seqtab <- makeSequenceTable(mergers) + +} + + +### Remove chimeras +cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="") +if(chimeraMethod %in% c("pooled", "consensus")) { + seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread) +} else { # No chimera removal, copy seqtab to seqtab.nochim + seqtab.nochim <- seqtab +} + +### REPORT READ FRACTIONS THROUGH PIPELINE ### +cat("6) Report read numbers through the pipeline\n") +if(inp.dirR =='NULL'){ + # Handle edge cases: Samples lost in filtering; One sample + if(primer.removed.dir!='NULL'){ #for CCS read analysis + if (primerF == 'none' && primerR == 'none'){ + track <- cbind(out[ ,1:2,drop=FALSE], matrix(0, nrow=nrow(out), ncol=2)) + colnames(track) <- c("input", "filtered", "denoised", "non-chimeric") + passed.filtering <- track[,"filtered"] > 0 + track[passed.filtering,"denoised"] <- rowSums(seqtab) + track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim) + write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA, + quote=FALSE) + } else { + track <- cbind(prim,out[ ,2], matrix(0, nrow=nrow(out), ncol=2)) + colnames(track) <- c("input", "primer-removed","filtered", "denoised", "non-chimeric") + passed.filtering <- track[,"filtered"] > 0 + track[passed.filtering,"denoised"] <- rowSums(seqtab) + track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim) + write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA, + quote=FALSE) + } + }else{#for paired end reads + # Handle edge cases: Samples lost in filtering; One sample + track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3)) + colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric") + passed.filtering <- track[,"filtered"] > 0 + track[passed.filtering,"denoised"] <- denoisedF + track[passed.filtering,"merged"] <- rowSums(seqtab) + track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim) + write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA, + quote=FALSE) + } +} + +### WRITE OUTPUT AND QUIT ### +# Formatting as tsv plain-text sequence table table +cat("7) Write output\n") +seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows +col.names <- basename(filts) +col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]]) +write.table(seqtab.nochim, out.path, sep="\t", + row.names=TRUE, col.names=col.names, quote=FALSE) +saveRDS(seqtab.nochim, "seqtab_nochim.rds") ### TESTING + +q(status=0) diff --git a/scripts/visualize_biom.Rmd b/scripts/visualize_biom.Rmd index 5feefdd..4a408f7 100644 --- a/scripts/visualize_biom.Rmd +++ b/scripts/visualize_biom.Rmd @@ -220,6 +220,8 @@ dada2_qc <- dada2_qc %>% * Missing samples (Not enough reads, do not pass QC, etc): `r removed_sample` * Total number of CCS reads before filtering and primers trimming: `r nrow(reads_qc)` * Was primers trimmed prior to DADA2? `r paste0(" ", trim_cutadapt)` +* Note that if downsampling was enabled, the reads will be filtered first, then restricted + to a maximum of N reads. * Total number of reads after quality filtering: `r if (skip_cutadapt != "Yes") {sum(all_read_count$input_reads)} else {sum(as.numeric(dada2_qc$input))}` (`r if (skip_cutadapt != "Yes") {paste0(round(sum(all_read_count$input_reads)/nrow(reads_qc), 4)*100, "%")} else {paste0(round(sum(as.numeric(dada2_qc$input))/nrow(reads_qc), 4)*100, "%")}`) * Total number of reads after primers trimming (DADA2 input): `r if (skip_cutadapt != "Yes") {sum(all_read_count$demuxed_reads)} else {paste0("Skipped cutadapt trimming")}` (`r if (skip_cutadapt != "Yes") {paste0(round(sum(all_read_count$demuxed_reads)/sum(all_read_count$input_reads), 4)*100, "%")} else {paste0("Skipped cutadapt trimming")}`) * Total number of ASVs found: `r total_asv`