From d47b62035b0e8ae0d43c21436de82dbf53ebfec0 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 21 Dec 2022 15:41:03 +0000 Subject: [PATCH 01/71] Bump pipeline version to 1.10dev --- CHANGELOG.md | 4 ++++ nextflow.config | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54242118..6765ac12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unpublished Version / DEV] + +### Enhancements & fixes + ## [[1.9](https://github.com/nf-core/fetchngs/releases/tag/1.9)] - 2022-12-21 ### Enhancements & fixes diff --git a/nextflow.config b/nextflow.config index d6686ae4..1ead02d0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -177,7 +177,7 @@ manifest { description = """Pipeline to fetch metadata and raw FastQ files from public databases""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.9' + version = '1.10dev' doi = 'https://doi.org/10.5281/zenodo.5070524' } From 9a88c5ca48afc5a6d9af4283736a1ad0c455419c Mon Sep 17 00:00:00 2001 From: Rob Syme Date: Mon, 3 Apr 2023 15:44:44 +0100 Subject: [PATCH 02/71] Update module versions Changelog update --- CHANGELOG.md | 1 + modules.json | 4 ++-- modules/nf-core/custom/dumpsoftwareversions/main.nf | 6 +++--- .../dumpsoftwareversions/templates/dumpsoftwareversions.py | 0 .../sratools/prefetch/templates/retry_with_backoff.sh | 2 ++ 5 files changed, 8 insertions(+), 5 deletions(-) mode change 100755 => 100644 modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6765ac12..cadb21a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unpublished Version / DEV] +- Bumped modules version to allow for sratools download of sralite format files. ### Enhancements & fixes diff --git a/modules.json b/modules.json index c8cd187a..bd085eba 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "7101db4432d3268b7fcb5b8f75fa0a022dc5561b", "installed_by": ["modules"] }, "custom/sratoolsncbisettings": { @@ -22,7 +22,7 @@ }, "sratools/prefetch": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "cded1570b7cb0ea128a5c89dd6ec3a62035c1526", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765..800a6099 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py old mode 100755 new mode 100644 diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh index cec0ab43..e08dbb6a 100755 --- a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh +++ b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh @@ -47,6 +47,8 @@ retry_with_backoff !{args2} \ !{args} \ !{id} +[ -f !{id}.sralite ] && { mkdir -p !{id}; mv "!{id}.sralite" !{id}/; } + vdb-validate !{id} cat <<-END_VERSIONS > versions.yml From ef7effc9beb884ba7e1aff6d05f4b8f69c71ecdb Mon Sep 17 00:00:00 2001 From: Rob Syme Date: Mon, 3 Apr 2023 16:46:19 +0100 Subject: [PATCH 03/71] Prettier fix --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cadb21a6..05e123b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unpublished Version / DEV] + - Bumped modules version to allow for sratools download of sralite format files. 
### Enhancements & fixes From 7885fcd294f3ff2134a73599e5e131d1731fbee2 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 25 Apr 2023 11:09:17 +0100 Subject: [PATCH 04/71] Update all nf-core subworkflows --- modules.json | 2 +- .../fastq_download_prefetch_fasterqdump_sratools/meta.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules.json b/modules.json index bd085eba..6a0c61a8 100644 --- a/modules.json +++ b/modules.json @@ -31,7 +31,7 @@ "nf-core": { "fastq_download_prefetch_fasterqdump_sratools": { "branch": "master", - "git_sha": "03711bcb7fa2a7088eb54abb1fca326d30e602c2", + "git_sha": "ca2bf9212707e83717934ec6c5d4cab42b39ca69", "installed_by": ["subworkflows"] } } diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml index c385ca21..ed93a54a 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://github.com/nf-core/modules/tree/master/subworkflows/yaml-schema.json name: fastq_download_prefetch_fasterqdump_sratools description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). 
keywords: From aba1d0bdac2cb906b7d6f54579fcfb73f51cdb63 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 25 Apr 2023 23:04:08 +0100 Subject: [PATCH 05/71] Add fasterq-dump options to --split-files --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index c42f4d60..f5cb1c77 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -77,6 +77,7 @@ if (params.input_type == 'sra') { } withName: SRATOOLS_FASTERQDUMP { + ext.args = '--split-files --include-technical' publishDir = [ path: { "${params.outdir}/fastq" }, mode: params.publish_dir_mode, From fff5da74cd08870fc08388ad086b388366afd6f0 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 25 Apr 2023 23:50:05 +0100 Subject: [PATCH 06/71] Use generic glob for output fastq files from fasterq-dump --- modules/nf-core/sratools/fasterqdump/main.nf | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf index ca5ee763..2336c318 100644 --- a/modules/nf-core/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -12,8 +12,8 @@ process SRATOOLS_FASTERQDUMP { path ncbi_settings output: - tuple val(meta), path(fastq), emit: reads - path "versions.yml" , emit: versions + tuple val(meta), path('*.fastq.gz'), emit: reads + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -22,11 +22,6 @@ process SRATOOLS_FASTERQDUMP { def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - - // WARNING: Paired-end data extracted by fasterq-dump (--split-3 the default) - // always creates *_1.fastq *_2.fastq files but sometimes also - // an additional *.fastq file for unpaired reads which we ignore here. - fastq = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' def outfile = meta.single_end ? 
"${prefix}.fastq" : prefix """ export NCBI_SETTINGS="\$PWD/${ncbi_settings}" From 37f0a785bc2d7d2c40ba4b3173cbc5bcd57305bb Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 25 Apr 2023 23:57:32 +0100 Subject: [PATCH 07/71] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05e123b6..3a443a96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unpublished Version / DEV] +- [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - Bumped modules version to allow for sratools download of sralite format files. ### Enhancements & fixes From 0dcc5f37f0968a9784da03047b645d0bdfc53a6e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 00:29:00 +0100 Subject: [PATCH 08/71] Log contents of reads variable --- workflows/sra.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/sra.nf b/workflows/sra.nf index 1085598b..ba39d233 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -124,6 +124,7 @@ workflow SRA { .map { meta, fastq -> def reads = meta.single_end ? [ fastq ] : fastq + log.info "$meta.id: $reads" def meta_clone = meta.clone() meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' meta_clone.fastq_2 = reads[1] && !meta.single_end ? 
"${params.outdir}/fastq/${reads[1].getName()}" : '' From 2d653037d127f03d8a4228833a8ac3041babe818 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 00:35:43 +0100 Subject: [PATCH 09/71] Add more logging --- workflows/sra.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index ba39d233..2a704395 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -116,6 +116,8 @@ workflow SRA { ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) + + FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads.view() SRA_FASTQ_FTP .out @@ -124,7 +126,7 @@ workflow SRA { .map { meta, fastq -> def reads = meta.single_end ? [ fastq ] : fastq - log.info "$meta.id: $reads" + log.info "$meta.id: $meta.single_end: $reads" def meta_clone = meta.clone() meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' From 1867a80f45c5bd58f28bdb00ed5ffc94ef5a2082 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 01:02:44 +0100 Subject: [PATCH 10/71] Fix logic to factor for reads being a nested list --- workflows/sra.nf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index 2a704395..38915944 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -116,8 +116,6 @@ workflow SRA { ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) - - FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads.view() SRA_FASTQ_FTP .out @@ -125,12 +123,10 @@ workflow SRA { .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads) .map { meta, fastq -> - def reads = meta.single_end ? 
[ fastq ] : fastq - log.info "$meta.id: $meta.single_end: $reads" - def meta_clone = meta.clone() - meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' - meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' - return meta_clone + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' + def fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' + return [ meta + [ fastq_1: fastq_1, fastq_2: fastq_2 ], fastq ] } .set { ch_sra_metadata } } From 399081edcf83af28a137a29344f7c5b4a6f9e78b Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 01:11:41 +0100 Subject: [PATCH 11/71] Remove fastq files from channel --- workflows/sra.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index 38915944..d2ce0c2b 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -126,7 +126,7 @@ workflow SRA { def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] def fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' def fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' - return [ meta + [ fastq_1: fastq_1, fastq_2: fastq_2 ], fastq ] + return [ meta + [ fastq_1: fastq_1, fastq_2: fastq_2 ] ] } .set { ch_sra_metadata } } From 935de8cf64c67b3542b4c2a233ca3c1392119ea1 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 01:35:19 +0100 Subject: [PATCH 12/71] Revert to cloning map because adding entries returns a list --- workflows/sra.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index d2ce0c2b..52479771 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -124,9 +124,10 @@ workflow SRA { .map { meta, fastq -> def reads = fastq instanceof List ? 
fastq.flatten() : [ fastq ] - def fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' - def fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' - return [ meta + [ fastq_1: fastq_1, fastq_2: fastq_2 ] ] + def meta_clone = meta.clone() + meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' + meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : '' + return meta_clone } .set { ch_sra_metadata } } From aa82d896b7a7f70700d23df8bb964f64ba388c3e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 10:54:54 +0100 Subject: [PATCH 13/71] Add full-sized data for 10x to test_full --- conf/test_full.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_full.config b/conf/test_full.config index 2f0303ea..c16fda7f 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test_full.csv' } From b47cab18718eaebe8b592d53a39e87653d12b727 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 26 Apr 2023 12:33:05 +0100 Subject: [PATCH 14/71] Re-install sratools/fasterqdump from nf-core/modules --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 6a0c61a8..f6d1001c 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "sratools/fasterqdump": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "10cb20f6a130d104fef335a8290f3ffce650f28d", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/prefetch": { From 81fa94becb4fd22ba306d477bda4e54948a0c0ac Mon Sep 
17 00:00:00 2001 From: nf-core-bot Date: Fri, 28 Apr 2023 14:21:31 +0000 Subject: [PATCH 15/71] Template update for nf-core/tools version 2.8 --- .editorconfig | 2 +- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 3 +- .github/workflows/awsfulltest.yml | 2 +- .github/workflows/awstest.yml | 2 +- .github/workflows/branch.yml | 2 +- .github/workflows/clean-up.yml | 24 ++++ .github/workflows/linting.yml | 2 +- .pre-commit-config.yaml | 5 + CHANGELOG.md | 2 +- README.md | 74 ++++++---- bin/check_samplesheet.py | 3 - conf/base.config | 2 +- conf/igenomes.config | 8 ++ conf/test_full.config | 2 + docs/usage.md | 130 +++++------------- lib/NfcoreSchema.groovy | 4 +- lib/WorkflowFetchngs.groovy | 12 +- lib/WorkflowMain.groovy | 13 +- main.nf | 1 - modules.json | 4 +- modules/local/samplesheet_check.nf | 2 +- .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 2 + modules/nf-core/multiqc/main.nf | 6 +- modules/nf-core/multiqc/meta.yml | 3 +- nextflow.config | 31 ++++- tower.yml | 5 + 28 files changed, 195 insertions(+), 159 deletions(-) create mode 100644 .github/workflows/clean-up.yml create mode 100644 .pre-commit-config.yaml create mode 100644 tower.yml diff --git a/.editorconfig b/.editorconfig index b78de6e6..b6b31907 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js,cff}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index fea6264d..9a85c826 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -45,6 +45,6 @@ body: * Nextflow version _(eg. 22.10.1)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ - * Container engine: _(e.g. 
Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ * Version of nf-core/fetchngs _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c9f23b88..a8f29dc0 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,7 +15,8 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/fetc - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md) +- [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index b471dde0..523b1033 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v1 # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 6e6a8c52..cdfba2f0 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,7 +12,7 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v1 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 708158fa..3747c012 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/fetchngs' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/fetchngs ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/fetchngs ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 00000000..694e90ec --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 
+1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." + days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 858d622e..888cb4bc 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.7" + python-version: "3.8" architecture: "x64" - name: Install dependencies diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..0c31cdb9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v2.7.1" + hooks: + - id: prettier diff --git a/CHANGELOG.md b/CHANGELOG.md index deef01a9..2792d527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## v1.9dev - [date] +## v1.10dev - [date] Initial release of nf-core/fetchngs, created with the [nf-core](https://nf-co.re/) template. diff --git a/README.md b/README.md index 92478b96..c229c26c 100644 --- a/README.md +++ b/README.md @@ -8,57 +8,71 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/fetchngs) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/fetchngs)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/fetchngs)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction - +**nf-core/fetchngs** is a bioinformatics pipeline that ... -**nf-core/fetchngs** is a bioinformatics best-practice analysis pipeline for Pipeline to fetch metadata and raw FastQ files from public databases. - -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. 
It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - - - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/fetchngs/results). - -## Pipeline summary + + 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) -## Quick Start +## Usage + +> **Note** +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +> with `-profile test` before running the workflow on actual data. + + - Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. 
+Now, you can run the pipeline using: - > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. - > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. - > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. + -4. Start running your own analysis! +```bash +nextflow run nf-core/fetchngs \ + -profile \ + --input samplesheet.csv \ + --outdir +``` - +> **Warning:** +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). 
- ```bash - nextflow run nf-core/fetchngs --input samplesheet.csv --outdir --genome GRCh37 -profile - ``` +For more details, please refer to the [usage documentation](https://nf-co.re/fetchngs/usage) and the [parameter documentation](https://nf-co.re/fetchngs/parameters). -## Documentation +## Pipeline output -The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage](https://nf-co.re/fetchngs/usage), [parameters](https://nf-co.re/fetchngs/parameters) and [output](https://nf-co.re/fetchngs/output). +To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/fetchngs/results) tab on the nf-core website pipeline page. +For more details about the output files and reports, please refer to the +[output documentation](https://nf-co.re/fetchngs/output). ## Credits diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 11b15572..4a758fe0 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -158,9 +158,6 @@ def sniff_format(handle): peek = read_head(handle) handle.seek(0) sniffer = csv.Sniffer() - if not sniffer.has_header(peek): - logger.critical("The given sample sheet does not appear to contain a header.") - sys.exit(1) dialect = sniffer.sniff(peek) return dialect diff --git a/conf/base.config b/conf/base.config index a15391ba..58b4ca34 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,7 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' diff --git a/conf/igenomes.config b/conf/igenomes.config index 7a1b3ac6..3f114377 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -36,6 +36,14 @@ params { macs_gsize = "2.7e9" blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } + 'CHM13' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" + gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" + mito_name = "chrM" + } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" diff --git a/conf/test_full.config b/conf/test_full.config index 460f61d4..4c032754 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,6 +10,8 @@ ---------------------------------------------------------------------------------------- */ +cleanup = true + params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' diff --git a/docs/usage.md b/docs/usage.md index b3c188a3..efd5e798 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -71,6 +71,29 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. + +Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. 
+
+> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+> The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run nf-core/fetchngs -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './samplesheet.csv'
+outdir: './results/'
+genome: 'GRCh37'
+input: 'data'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
 ### Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
@@ -87,6 +110,10 @@ First, go to the [nf-core/fetchngs releases page](https://github.com/nf-core/fet
 
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
 
+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such profiles (such as uploading them as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
+ ## Core Nextflow arguments > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). @@ -95,7 +122,7 @@ This version number will be logged in reports when you run the pipeline, so that Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -119,8 +146,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. ### `-resume` @@ -138,102 +167,19 @@ Specify the path to a specific config file (this is a core Nextflow command). 
Se Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: - -```console -[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' - -Caused by: - Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) - -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - - -Command exit status: - 137 - -Command output: - (empty) - -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb - -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` - -#### For beginners - -A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. 
Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. - -#### Advanced option on process level - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. 
-The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. -Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` - -> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. -> -> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. - -### Updating containers (advanced users) - -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. 
Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. - -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. Create the custom config accordingly: - - - For Docker: +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +### Custom Containers - - For Singularity: +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. 
- - For Conda: +### Custom Tool Arguments - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. +To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 33cd4f6e..9b34804d 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -2,6 +2,7 @@ // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
// +import nextflow.Nextflow import org.everit.json.schema.Schema import org.everit.json.schema.loader.SchemaLoader import org.everit.json.schema.ValidationException @@ -83,6 +84,7 @@ class NfcoreSchema { 'stub-run', 'test', 'w', + 'with-apptainer', 'with-charliecloud', 'with-conda', 'with-dag', @@ -177,7 +179,7 @@ class NfcoreSchema { } if (has_error) { - System.exit(1) + Nextflow.error('Exiting!') } } diff --git a/lib/WorkflowFetchngs.groovy b/lib/WorkflowFetchngs.groovy index 7a1e5ab7..748aaa28 100755 --- a/lib/WorkflowFetchngs.groovy +++ b/lib/WorkflowFetchngs.groovy @@ -2,6 +2,7 @@ // This file holds several functions specific to the workflow/fetchngs.nf in the nf-core/fetchngs pipeline // +import nextflow.Nextflow import groovy.text.SimpleTemplateEngine class WorkflowFetchngs { @@ -14,8 +15,7 @@ class WorkflowFetchngs { if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) + Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
} } @@ -61,17 +61,19 @@ class WorkflowFetchngs { def description_html = engine.createTemplate(methods_text).make(meta) return description_html - }// + } + + // // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + " Currently, the available genome keys are:\n" + " ${params.genomes.keySet().join(", ")}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - System.exit(1) + Nextflow.error(error_string) } } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 30fcedde..f1dcf537 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the nf-core/fetchngs pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -21,7 +23,7 @@ class WorkflowMain { // // Generate help string // - public static String help(workflow, params, log) { + public static String help(workflow, params) { def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" def help_string = '' help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) @@ -34,7 +36,7 @@ class WorkflowMain { // // Generate parameter summary log string // - public static String paramsSummaryLog(workflow, params, log) { + public static String paramsSummaryLog(workflow, params) { def summary_log = '' summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) @@ -49,7 +51,7 @@ class WorkflowMain { public static void 
initialise(workflow, params, log) { // Print help to screen if required if (params.help) { - log.info help(workflow, params, log) + log.info help(workflow, params) System.exit(0) } @@ -61,7 +63,7 @@ class WorkflowMain { } // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + log.info paramsSummaryLog(workflow, params) // Validate workflow parameters via the JSON schema if (params.validate_params) { @@ -81,8 +83,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } // diff --git a/main.nf b/main.nf index 0bded66d..4da08510 100644 --- a/main.nf +++ b/main.nf @@ -4,7 +4,6 @@ nf-core/fetchngs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/fetchngs - Website: https://nf-co.re/fetchngs Slack : https://nfcore.slack.com/channels/fetchngs ---------------------------------------------------------------------------------------- diff --git a/modules.json b/modules.json index a306549a..37262f97 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", "installed_by": ["modules"] }, "fastqc": { @@ -17,7 +17,7 @@ }, "multiqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", "installed_by": ["modules"] } } diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index a0cc023e..bc9f6fbd 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda 
"conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765..800a6099 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a0..c32657de 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 68f66bea..4b604749 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC 
{ label 'process_single' - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b27..f93b5ee5 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: @@ -37,7 +38,7 @@ output: description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: diff --git a/nextflow.config b/nextflow.config index 5a863a50..a7d70e06 100644 --- a/nextflow.config +++ b/nextflow.config @@ -78,7 +78,11 @@ try { profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } conda { conda.enabled = true docker.enabled = false @@ -86,6 +90,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } mamba { conda.enabled = true @@ -95,14 +100,18 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true + docker.registry = 'quay.io' docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false 
charliecloud.enabled = false + apptainer.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -110,31 +119,49 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } gitpod { executor.name = 'local' @@ -193,7 +220,7 @@ manifest { description = """Pipeline to fetch metadata and raw FastQ files from public databases""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.9dev' + version = '1.10dev' doi = '' } diff --git a/tower.yml b/tower.yml new file mode 100644 index 00000000..787aedfe --- /dev/null +++ b/tower.yml @@ -0,0 +1,5 @@ +reports: + multiqc_report.html: + display: "MultiQC HTML report" + samplesheet.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" From d6125e1f0722a65e5e6dce95ff9381713c2f0447 Mon Sep 17 00:00:00 2001 From: Esha Joshi Date: Tue, 2 May 2023 19:51:10 -0400 Subject: [PATCH 16/71] fix: remove default ena meta fields to match api spec v2.0 --- bin/sra_ids_to_runinfo.py | 2 -- 
nextflow.config | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 70627791..5da7c708 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -64,7 +64,6 @@ "secondary_sample_accession", "study_accession", "secondary_study_accession", - "parent_study", "submission_accession", "run_alias", "experiment_alias", @@ -84,7 +83,6 @@ "sample_title", "experiment_title", "study_title", - "description", "sample_description", "fastq_md5", "fastq_bytes", diff --git a/nextflow.config b/nextflow.config index d6686ae4..d3a1f3f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ params { nf_core_pipeline = null nf_core_rnaseq_strandedness = 'auto' ena_metadata_fields = null - sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' + sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description' synapse_config = null force_sratools_download = false skip_fastq_download = false From 24ff80b4e01e75de776fb3d585c5cacf5bbc2bc1 Mon Sep 17 00:00:00 2001 From: Esha Joshi Date: Tue, 2 May 2023 21:01:44 -0400 Subject: [PATCH 17/71] feat: improve error handling for meta field validation --- bin/sra_ids_to_runinfo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 5da7c708..a0f6fcb3 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -57,7 +57,6 @@ # Full list of accepted fields can be obtained here: # https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run ENA_METADATA_FIELDS = ( - "accession", "run_accession", "experiment_accession", "sample_accession", @@ -364,10 +363,12 @@ def validate_fields_parameter(param, 
valid_vals, param_desc): if len(set(user_vals) & set(valid_vals)) == len(user_vals): return user_vals else: + invalid_vals = [x for x in user_vals if x not in valid_vals] logger.error( f"Please provide a valid value for {param_desc}!\n" f"Provided values = {param}\n" - f"Accepted values = {','.join(valid_vals)}" + f"Accepted values = {','.join(valid_vals)}\n" + f"The following values are invalid: {','.join(invalid_vals)}\n" ) sys.exit(1) From 9e454f2ee24215b16505ac4487b26f85898896b1 Mon Sep 17 00:00:00 2001 From: Esha Joshi Date: Tue, 2 May 2023 21:10:05 -0400 Subject: [PATCH 18/71] fix: update default mapping fields in schema and specify API version --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 696914e4..563c5e7b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,13 +32,13 @@ "type": "string", "fa_icon": "fas fa-columns", "description": "Comma-separated list of ENA metadata fields to fetch before downloading data.", - "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run])." + "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. 
Full list of accepted metadata fields can be obtained from the [ENA API v2.0](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run])." }, "sample_mapping_fields": { "type": "string", "fa_icon": "fas fa-globe-americas", "description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.", - "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description" + "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description" }, "nf_core_pipeline": { "type": "string", From b1019a595181db0db77b766eb85f050642be6e9e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 May 2023 22:24:11 +0100 Subject: [PATCH 19/71] Remove Sys.exit calls and quay.io in Docker definition from local modules --- docs/usage.md | 11 +++++------ lib/WorkflowMain.groovy | 10 ++++------ lib/WorkflowSra.groovy | 13 +++++++------ main.nf | 4 ++-- modules/local/multiqc_mappings_config.nf | 2 +- modules/local/sra_fastq_ftp.nf | 2 +- modules/local/sra_ids_to_runinfo.nf | 2 +- modules/local/sra_merge_samplesheet.nf | 2 +- modules/local/sra_runinfo_to_ftp.nf | 2 +- modules/local/synapse_get.nf | 6 +++--- modules/local/synapse_list.nf | 6 +++--- modules/local/synapse_merge_samplesheet.nf | 2 +- modules/local/synapse_show.nf | 6 +++--- nextflow.config | 2 +- workflows/sra.nf | 4 ++-- 15 files changed, 36 insertions(+), 38 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 807abb75..55e47133 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -75,7 +75,7 @@ If FTP connections are blocked on your network use the [`--force_sratools_downlo The typical command for running the pipeline is as follows: ```bash -nextflow run 
nf-core/fetchngs --input ids.csv --outdir -profile docker +nextflow run nf-core/fetchngs --input ./ids.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -94,7 +94,8 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. > ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run nf-core/fetchngs -profile docker -params-file params.yaml @@ -103,10 +104,8 @@ nextflow run nf-core/fetchngs -profile docker -params-file params.yaml with `params.yaml` containing: ```yaml -input: './samplesheet.csv' +input: './ids.csv' outdir: './results/' -genome: 'GRCh37' -input: 'data' <...> ``` @@ -189,7 +188,7 @@ To change the resource requests, please see the [max resources](https://nf-co.re ### Custom Containers -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. 
By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version may be out of date. To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 24d82c4e..13fd6d82 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -94,7 +94,7 @@ class WorkflowMain { } // Check if input ids are from the SRA - public static Boolean isSraId(input, log) { + public static Boolean isSraId(input) { def is_sra = false def total_ids = 0 def no_match_ids = [] @@ -111,15 +111,14 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" - System.exit(1) + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!") } } return is_sra } // Check if input ids are from the Synapse platform - public static Boolean isSynapseId(input, log) { + public static Boolean isSynapseId(input) { def is_synapse = false def total_ids = 0 def no_match_ids = [] @@ -136,8 +135,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" 
- System.exit(1) + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!") } } return is_synapse diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index 90d86f1c..061f7ffb 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -2,17 +2,18 @@ // This file holds several functions specific to the workflow/sra.nf in the nf-core/fetchngs pipeline // +import nextflow.Nextflow + class WorkflowSra { // // Check and validate parameters // - public static void initialise(params, log, valid_params) { + public static void initialise(params, valid_params) { // Check minimal ENA fields are provided to download FastQ files def ena_metadata_fields = params.ena_metadata_fields ? params.ena_metadata_fields.split(',').collect{ it.trim().toLowerCase() } : valid_params['ena_metadata_fields'] if (!ena_metadata_fields.containsAll(valid_params['ena_metadata_fields'])) { - log.error "Invalid option: '${params.ena_metadata_fields}'. Minimally required fields for '--ena_metadata_fields': '${valid_params['ena_metadata_fields'].join(',')}'" - System.exit(1) + Nextflow.error("Invalid option: '${params.ena_metadata_fields}'. 
Minimally required fields for '--ena_metadata_fields': '${valid_params['ena_metadata_fields'].join(',')}'") } } @@ -31,18 +32,18 @@ class WorkflowSra { } // Fail pipeline if input ids are from the GEO - public static void isGeoFail(ids, log) { + public static void isGeoFail(ids) { def pattern = /^(GS[EM])(\d+)$/ for (id in ids) { if (id =~ pattern) { - log.error "===================================================================================\n" + + def error_string = "===================================================================================\n" + " GEO id detected: ${id}\n" + " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + " Please remove any GEO ids from the input samplesheet.\n\n" + " Please see:\n" + " https://github.com/nf-core/fetchngs/pull/102\n" + "===================================================================================" - System.exit(1) + Nextflow.error(error_string) } } } diff --git a/main.nf b/main.nf index 2c4b52f2..e7d67165 100644 --- a/main.nf +++ b/main.nf @@ -39,9 +39,9 @@ Channel // Auto-detect input id type def input_type = '' -if (WorkflowMain.isSraId(ch_input, log)) { +if (WorkflowMain.isSraId(ch_input)) { input_type = 'sra' -} else if (WorkflowMain.isSynapseId(ch_input, log)) { +} else if (WorkflowMain.isSynapseId(ch_input)) { input_type = 'synapse' } else { exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 4f6c95bd..8efe1caa 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -4,7 +4,7 @@ process MULTIQC_MAPPINGS_CONFIG { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path csv diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 464a327e..2b7769ff 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -7,7 +7,7 @@ process SRA_FASTQ_FTP { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" input: tuple val(meta), val(fastq) diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index 49c83554..7d47f5e3 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -6,7 +6,7 @@ process SRA_IDS_TO_RUNINFO { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: val id diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index 4b94a823..66d697aa 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -3,7 +3,7 @@ process SRA_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/library/ubuntu:20.04' }" input: path ('samplesheets/*') diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index 3a060f7b..9c83cf53 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -4,7 +4,7 @@ process SRA_RUNINFO_TO_FTP { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path runinfo diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index f43e4d5e..c8a6d7a4 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -4,10 +4,10 @@ process SYNAPSE_GET { label 'process_low' label 'error_retry' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val meta diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index f42357ab..0c03f8b2 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -3,10 +3,10 @@ process SYNAPSE_LIST { tag "$id" label 'process_low' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val id diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index f46a1fbf..bfe89693 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -4,7 +4,7 @@ process SYNAPSE_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/library/ubuntu:20.04' }" input: path ('samplesheets/*') diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 0bd6cc12..e1f756a5 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -3,10 +3,10 @@ process SYNAPSE_SHOW { tag "$id" label 'process_low' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val id diff --git a/nextflow.config b/nextflow.config index 6731df72..74ae84d6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,7 +73,7 @@ profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false } conda { conda.enabled = true diff --git a/workflows/sra.nf b/workflows/sra.nf index 52479771..e3ac447f 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -11,7 +11,7 @@ def valid_params = [ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters -WorkflowSra.initialise(params, log, valid_params) +WorkflowSra.initialise(params, valid_params) /* ======================================================================================== @@ -55,7 +55,7 @@ workflow SRA { // ids .collect() - .map { WorkflowSra.isGeoFail(it, log) } + .map { WorkflowSra.isGeoFail(it) } // // MODULE: Get SRA run information for public database ids From e7623943f86fa8131da3b3de9c1848ce65249fe7 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 May 2023 22:37:18 +0100 Subject: [PATCH 20/71] Run nf-core modules update to fix quay.io container definitions --- modules.json | 8 ++++---- modules/nf-core/custom/dumpsoftwareversions/main.nf | 2 +- modules/nf-core/custom/sratoolsncbisettings/main.nf | 2 +- modules/nf-core/sratools/prefetch/main.nf | 2 +- .../fastq_download_prefetch_fasterqdump_sratools/meta.yml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules.json b/modules.json index 02ca48e7..dea81e71 100644 --- a/modules.json +++ b/modules.json @@ -7,12 +7,12 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": 
"76cc4938c1f6ea5c7d83fed1eeffc146787f9543", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "custom/sratoolsncbisettings": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/fasterqdump": { @@ -22,7 +22,7 @@ }, "sratools/prefetch": { "branch": "master", - "git_sha": "cded1570b7cb0ea128a5c89dd6ec3a62035c1526", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] } } @@ -31,7 +31,7 @@ "nf-core": { "fastq_download_prefetch_fasterqdump_sratools": { "branch": "master", - "git_sha": "ca2bf9212707e83717934ec6c5d4cab42b39ca69", + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", "installed_by": ["subworkflows"] } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 800a6099..ebc87273 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -5,7 +5,7 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/sratoolsncbisettings/main.nf b/modules/nf-core/custom/sratoolsncbisettings/main.nf index 36be10d3..5deb8892 100644 --- a/modules/nf-core/custom/sratoolsncbisettings/main.nf +++ b/modules/nf-core/custom/sratoolsncbisettings/main.nf @@ -5,7 +5,7 @@ process CUSTOM_SRATOOLSNCBISETTINGS { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" output: path('*.mkfg') , emit: ncbi_settings diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 57e8a3c9..91b8aec2 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -5,7 +5,7 @@ process SRATOOLS_PREFETCH { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" input: tuple val(meta), val(id) diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml index ed93a54a..4599bbd2 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://github.com/nf-core/modules/tree/master/subworkflows/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json name: fastq_download_prefetch_fasterqdump_sratools description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). keywords: From e39a54ad3ab9cebf2b1686afa215e521d4aa397f Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 5 May 2023 10:01:44 +0100 Subject: [PATCH 21/71] Make CHANGELOG more templaty --- CHANGELOG.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 533488b3..d959887c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-12 +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [Esha Joshi](https://github.com/ejseqera) +- [Maxime Garcia](https://github.com/maxulysse) +- [Moritz E. 
Beber](https://github.com/Midnighter) +- [Rob Syme](https://github.com/robsyme) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - Bumped modules version to allow for sratools download of sralite format files. - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) -### Enhancements & fixes +### Software dependencies + +| Dependency | Old version | New version | +| --------------- | ----------- | ----------- | +| `synapseclient` | 2.6.0 | 2.7.1 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. ## [[1.9](https://github.com/nf-core/fetchngs/releases/tag/1.9)] - 2022-12-21 From b9888eafb65469ed99ae3313e7fe8cfbb4d78f3e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 5 May 2023 12:32:30 +0100 Subject: [PATCH 22/71] Use ubuntu image from quay.io nf-core account --- modules/local/sra_merge_samplesheet.nf | 2 +- modules/local/synapse_merge_samplesheet.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index 66d697aa..1c2ee7df 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -3,7 +3,7 @@ process SRA_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'docker.io/library/ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path ('samplesheets/*') diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index bfe89693..4cb2abc3 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -4,7 +4,7 @@ process SYNAPSE_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'docker.io/library/ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path ('samplesheets/*') From c0afa084e3d353bf6a315c14a54ecae983d6e0bc Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 15:24:26 +0100 Subject: [PATCH 23/71] Add #85 to CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d959887c..b5b1a287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes +- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR* ids associated with ArrayExpress - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - Bumped modules version to allow for sratools download of sralite format files. 
From 2aca881c5466590c74f8b36452ac788be10b5757 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 16:12:00 +0100 Subject: [PATCH 24/71] Fix #129 --- CHANGELOG.md | 1 + bin/sra_ids_to_runinfo.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5b1a287..1b3efa85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR* ids associated with ArrayExpress +- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run IDs, but failing with corresponding Biosample IDs - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - Bumped modules version to allow for sratools download of sralite format files. 
diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index a0f6fcb3..6cf16800 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -191,7 +191,6 @@ class DatabaseResolver: _GEO_PREFIXES = {"GSE", "GSM"} _SRA_PREFIXES = { "PRJNA", - "SAMN", "DRA", "DRP", "DRS", @@ -199,7 +198,7 @@ class DatabaseResolver: "PRJDB", "SAMD", } - _ENA_PREFIXES = {"ERR", "SRR", "DRR"} + _ENA_PREFIXES = {"ERR", "SRR", "SAMN", "DRR"} @classmethod def expand_identifier(cls, identifier): From bc4edf654c7df6f2edaf8654870f6617e4f73c55 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 16:18:47 +0100 Subject: [PATCH 25/71] Fix prettier --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b3efa85..c1bf6a64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes -- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR* ids associated with ArrayExpress +- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR\* ids associated with ArrayExpress - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run IDs, but failing with corresponding Biosample IDs - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 From b6b43947bee5f776090081b0ba1291bfc8b4603a Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 20:20:19 +0100 Subject: [PATCH 26/71] Add multi-cloud CI for full-sized tests --- .github/workflows/cloud_tests_full.yml | 81 +++++++++++++++++++++++++ .github/workflows/cloud_tests_small.yml | 76 +++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 .github/workflows/cloud_tests_full.yml 
create mode 100644 .github/workflows/cloud_tests_small.yml diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml new file mode 100644 index 00000000..d0ed0552 --- /dev/null +++ b/.github/workflows/cloud_tests_full.yml @@ -0,0 +1,81 @@ +name: full-sized tests on cloud providers +run-name: Submitting workflow to all cloud providers using full sized data +on: + release: + types: [published] + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-full-tests-on-aws: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' || !github.event.inputs }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" + run_name: "aws_fetchngs_full" + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-full-tests-on-gcp: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' || !github.event.inputs }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}" + run_name: "gcp_fetchngs_full" + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": 
"${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-full-tests-on-azure: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' || !github.event.inputs }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}" + run_name: "azure_fetchngs_full" + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml new file mode 100644 index 00000000..12b8daab --- /dev/null +++ b/.github/workflows/cloud_tests_small.yml @@ -0,0 +1,76 @@ +name: small-sized tests on cloud providers +run-name: Submitting workflow to all cloud providers using small sized data +on: + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-small-tests-on-aws: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" + run_name: "aws_fetchngs_small" + profiles: test + parameters: | + { + 
"outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-test-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-small-tests-on-gcp: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}" + run_name: "gcp_fetchngs_small" + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-test-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-small-tests-on-azure: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}" + run_name: "azure_fetchngs_small" + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-test-${{ github.sha }}" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log From 9e112c1f3a761b2c39ef0151b937895ff2db92e8 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 20:28:41 +0100 Subject: [PATCH 27/71] Remove old AWS CI tests --- .github/workflows/awsfulltest.yml | 31 ------------------------------- .github/workflows/awstest.yml | 29 ----------------------------- 2 files changed, 60 deletions(-) delete mode 100644 .github/workflows/awsfulltest.yml delete 
mode 100644 .github/workflows/awstest.yml diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml deleted file mode 100644 index 773c3a7e..00000000 --- a/.github/workflows/awsfulltest.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: nf-core AWS full size tests -# This workflow is triggered on published releases. -# It can be additionally triggered manually with GitHub actions workflow dispatch button. -# It runs the -profile 'test_full' on AWS batch - -on: - release: - types: [published] - workflow_dispatch: -jobs: - run-tower: - name: Run AWS full tests - if: github.repository == 'nf-core/fetchngs' - runs-on: ubuntu-latest - steps: - - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-${{ github.sha }}" - } - profiles: test_full,aws_tower - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: tower_action_*.log diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml deleted file mode 100644 index cdfba2f0..00000000 --- a/.github/workflows/awstest.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: nf-core AWS test -# This workflow can be triggered manually with the GitHub actions workflow dispatch button. 
-# It runs the -profile 'test' on AWS batch - -on: - workflow_dispatch: -jobs: - run-tower: - name: Run AWS tests - if: github.repository == 'nf-core/fetchngs' - runs-on: ubuntu-latest - steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-test-${{ github.sha }}" - } - profiles: test,aws_tower - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: tower_action_*.log From 7d858a3aed5800a743e13051069fab741987cd91 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 6 May 2023 20:36:41 +0100 Subject: [PATCH 28/71] Update CHANGELOG --- CHANGELOG.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1bf6a64..4bb46ddb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,12 +19,13 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes -- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR\* ids associated with ArrayExpress -- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run IDs, but failing with corresponding Biosample IDs +- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress +- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data -- [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default 
metadata fields for ENA API v2.0 -- Bumped modules version to allow for sratools download of sralite format files. -- Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) +- [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files +- [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) +- [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 +- [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower ### Software dependencies From eaecac87ca7a50b3a74ee16d1ac03f1ecc46be92 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 8 May 2023 02:02:48 -0400 Subject: [PATCH 29/71] feat: add support for jwt as input file --- modules/nf-core/sratools/fasterqdump/main.nf | 3 +++ modules/nf-core/sratools/prefetch/main.nf | 2 ++ nextflow.config | 1 + .../main.nf | 12 ++++++++++-- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf index 2336c318..4ed92814 100644 --- a/modules/nf-core/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -10,6 +10,7 @@ process SRATOOLS_FASTERQDUMP { input: tuple val(meta), path(sra) path ncbi_settings + path certificate output: tuple val(meta), path('*.fastq.gz'), emit: reads @@ -23,6 +24,7 @@ process SRATOOLS_FASTERQDUMP { def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def outfile = meta.single_end ? "${prefix}.fastq" : prefix + def key_file = certificate ? 
"--perm ${certificate}" : '' """ export NCBI_SETTINGS="\$PWD/${ncbi_settings}" @@ -30,6 +32,7 @@ process SRATOOLS_FASTERQDUMP { $args \\ --threads $task.cpus \\ --outfile $outfile \\ + ${key_file} \\ ${sra.name} pigz \\ diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 91b8aec2..5d894b88 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -10,6 +10,7 @@ process SRATOOLS_PREFETCH { input: tuple val(meta), val(id) path ncbi_settings + path certificate output: tuple val(meta), path(id), emit: sra @@ -20,6 +21,7 @@ process SRATOOLS_PREFETCH { shell: args = task.ext.args ?: '' + args += certificate ? " --perm ${certificate}" : '' args2 = task.ext.args2 ?: '5 1 100' // template 'retry_with_backoff.sh' } diff --git a/nextflow.config b/nextflow.config index 74ae84d6..61f89783 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,6 +19,7 @@ params { synapse_config = null force_sratools_download = false skip_fastq_download = false + dbgap_key = null // Boilerplate options outdir = null diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index 1e1d0d7b..22b756ca 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -23,13 +23,21 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { // // Prefetch sequencing reads in SRA format. // - SRATOOLS_PREFETCH ( ch_sra_ids, settings ) + if (!params.dbgap_key) { + SRATOOLS_PREFETCH ( ch_sra_ids, settings, [] ) + } else { + SRATOOLS_PREFETCH ( ch_sra_ids, settings, certificate = file(params.dbgap_key, checkIfExists: true) ) + } ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) // // Convert the SRA format into one or more compressed FASTQ files. 
// - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings ) + if (!params.dbgap_key) { + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, [] ) + } else { + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, certificate = file(params.dbgap_key, checkIfExists: true) ) + } ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) emit: From b71105020bfc4cef617ebb2c609fe52e756ca41e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 8 May 2023 18:08:45 +0000 Subject: [PATCH 30/71] refactor: simplify logic for params.dbgap_key --- .../main.nf | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index 22b756ca..965d8779 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -21,23 +21,26 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) // - // Prefetch sequencing reads in SRA format. + // Prefetch sequencing reads in SRA format and convert into one or more compressed FASTQ files. + // If specified in params, use the provided JWT file for pulling protected SRA runs, else provide + // an empty list. // + if (!params.dbgap_key) { - SRATOOLS_PREFETCH ( ch_sra_ids, settings, [] ) - } else { - SRATOOLS_PREFETCH ( ch_sra_ids, settings, certificate = file(params.dbgap_key, checkIfExists: true) ) - } - ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) + + SRATOOLS_PREFETCH ( ch_sra_ids, settings, [] ) + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, [] ) - // - // Convert the SRA format into one or more compressed FASTQ files. 
- // - if (!params.dbgap_key) { - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, [] ) } else { - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, certificate = file(params.dbgap_key, checkIfExists: true) ) + + certificate = file(params.dbgap_key, checkIfExists: true) // optional input channel for JWT + + SRATOOLS_PREFETCH ( ch_sra_ids, settings, certificate ) + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, certificate ) + } + + ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) emit: From acf5ed960574e41de5f13e9cea1bda73a2d50b8b Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 8 May 2023 14:23:24 -0400 Subject: [PATCH 31/71] chore(nextflow_schema.json): update schema with dbgap_key param --- nextflow_schema.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index 563c5e7b..487de1e6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,13 @@ "fa_icon": "fas fa-fast-forward", "description": "Only download metadata for public data database ids and don't download the FastQ files." }, + "dbgap_key": { + "type": "string", + "default": null, + "fa_icon": "fas fa-address-card", + "help_text": "Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit. Users with granted access to controlled data can download the JWT cart file for the study from the SRA Run Selector upon logging in. 
The JWT file can only be used on cloud platforms and is valid for 1 hour upon creation.", + "format": "file-path" + }, "outdir": { "type": "string", "format": "directory-path", From a56363538ff1c1d901e8d88d5643bd531e8f204d Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 8 May 2023 14:39:03 -0400 Subject: [PATCH 32/71] docs(usage.md): add usage detail on providing jwt file --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 55e47133..0f47198f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. -The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. To download protected dbGAP data using sra-tools, a JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. 
The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). From 87b2bd8ea3d0dd87113a2318f68f991dd22477bf Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 8 May 2023 21:45:19 -0400 Subject: [PATCH 33/71] chore: update changelog --- CHANGELOG.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1bf6a64..066e3d71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,12 +19,14 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes -- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR\* ids associated with ArrayExpress -- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run IDs, but failing with corresponding Biosample IDs +- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress +- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data -- [#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 -- Bumped modules version to allow for sratools download of sralite format files. 
-- Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) +- [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files +- [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) +- [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 +- [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower +- [PR #152](https://github.com/nf-core/fetchngs/pull/152) - Add support for downloading protected dbGAP data using a JWT file ### Software dependencies From 1265a8fb9e55c6139bfb139205fb50b0ccd74356 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Tue, 9 May 2023 23:38:12 -0400 Subject: [PATCH 34/71] fix: update gse resolver to use esearch api endpoint --- bin/sra_ids_to_runinfo.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 6cf16800..4297590f 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -14,7 +14,7 @@ from urllib.error import HTTPError, URLError from urllib.parse import urlencode from urllib.request import urlopen - +import json logger = logging.getLogger() @@ -240,16 +240,13 @@ def _id_to_srx(cls, identifier): @classmethod def _gse_to_srx(cls, identifier): - """Resolve the identifier to SRA experiments.""" + """Resolve the GEO identifier to SRA experiments.""" ids = [] - params = {"id": identifier, "db": "gds", "rettype": "runinfo", "retmode": "text"} - response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}") + params = {"term": identifier, "db": "sra", "retmode": "json"} + response = 
fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") cls._content_check(response, identifier) - gsm_ids = [ - line.split("=")[1].strip() - for line in response.text().splitlines() - if line.split("=")[1].strip().startswith("GSM") - ] + r_json = json.loads(response.text()) + gsm_ids = r_json['esearchresult']['idlist'] for gsm_id in gsm_ids: ids += cls._id_to_srx(gsm_id) return ids From 0bdeac4b44d311f03e095355ef4a57e83e758a8d Mon Sep 17 00:00:00 2001 From: ejseqera Date: Wed, 10 May 2023 01:29:04 -0400 Subject: [PATCH 35/71] feat: add support for GSE/GDS IDs and resolver to GSM then SRA --- bin/sra_ids_to_runinfo.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 4297590f..fc8942d1 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -188,7 +188,8 @@ def is_valid(cls, identifier): class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = {"GSE", "GSM"} + _GEO_GSM_PREFIXES = {"GSM"} + _GEO_GSE_PREFIXES = {"GDS", "GSE"} _SRA_PREFIXES = { "PRJNA", "DRA", @@ -214,7 +215,9 @@ def expand_identifier(cls, identifier): """ prefix = ID_REGEX.match(identifier).group(1) - if prefix in cls._GEO_PREFIXES: + if prefix in cls._GEO_GSM_PREFIXES: + return cls._gsm_to_srx(identifier) + elif prefix in cls._GEO_GSE_PREFIXES: return cls._gse_to_srx(identifier) elif prefix in cls._SRA_PREFIXES: return cls._id_to_srx(identifier) @@ -239,7 +242,7 @@ def _id_to_srx(cls, identifier): return [row["Experiment"] for row in open_table(response, delimiter=",")] @classmethod - def _gse_to_srx(cls, identifier): + def _gsm_to_srx(cls, identifier): """Resolve the GEO identifier to SRA experiments.""" ids = [] params = {"term": identifier, "db": "sra", "retmode": "json"} @@ -250,6 +253,32 @@ def _gse_to_srx(cls, identifier): for gsm_id in gsm_ids: ids += 
cls._id_to_srx(gsm_id) return ids + + @classmethod + def _gds_to_gsm(cls, identifier): + """Resolve the GEO UIDs to GSM IDs to then resolve to SRA IDs.""" + ids = [] + params = {"id": identifier, "db": "gds", "retmode": "json", "retmax": 10} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + r_json = json.loads(response.text()) + + for each in r_json['result'][identifier]['samples'][0:]: + ids += cls._gsm_to_srx(each['accession']) + return ids + + @classmethod + def _gse_to_srx(cls, identifier): + """Resolve the GSE identifier to GEO UIDs.""" + ids = [] + params = {"term": identifier, "db": "gds", "retmode": "json"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + r_json = json.loads(response.text()) + gds_uids = r_json['esearchresult']['idlist'] + for gds_uid in gds_uids: + ids += cls._gds_to_gsm(gds_uid) + return ids @classmethod def _id_to_erx(cls, identifier): From 0c226d2aaf2a539ef428974f78c7585f98b797f6 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Wed, 10 May 2023 01:29:53 -0400 Subject: [PATCH 36/71] fix: remove support for bioproject ID (doesn't work) --- bin/sra_ids_to_runinfo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index fc8942d1..84a6623d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -191,7 +191,6 @@ class DatabaseResolver: _GEO_GSM_PREFIXES = {"GSM"} _GEO_GSE_PREFIXES = {"GDS", "GSE"} _SRA_PREFIXES = { - "PRJNA", "DRA", "DRP", "DRS", From bd540c0b1dd4f5cc2e3bc9a7cef19d92ecb52857 Mon Sep 17 00:00:00 2001 From: sirclockalot Date: Wed, 10 May 2023 13:59:37 +0200 Subject: [PATCH 37/71] Update nextflow_schema.json Corrected the url for ena metadata --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json 
b/nextflow_schema.json index 696914e4..fc0e83ee 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,7 +32,7 @@ "type": "string", "fa_icon": "fas fa-columns", "description": "Comma-separated list of ENA metadata fields to fetch before downloading data.", - "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run])." + "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run)." }, "sample_mapping_fields": { "type": "string", From a17861a2e04541b69ce4eda4e482a82ff7113aa6 Mon Sep 17 00:00:00 2001 From: Robert Syme Date: Wed, 10 May 2023 22:51:42 +0000 Subject: [PATCH 38/71] Set a default docker registry outside of profile scope. The `docker.registry` configuration should always be set, as running on cloud executors will need to pull docker images but will not necessarily use the `docker` profile. 
--- nextflow.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 74ae84d6..3f336ca7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,7 +96,6 @@ profiles { } docker { docker.enabled = true - docker.registry = 'quay.io' docker.userEmulation = true conda.enabled = false singularity.enabled = false @@ -179,6 +178,9 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Set default docker registry (will be unused unless pulling docker images) +docker.registry = 'quay.io' + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true From 3fdb87991fa6172bf444343c6022dd22ed251fbc Mon Sep 17 00:00:00 2001 From: ejseqera Date: Wed, 10 May 2023 19:03:57 -0400 Subject: [PATCH 39/71] chore: update docs to support GEO IDs, remove error handling for GEO --- README.md | 15 ++------------- assets/schema_input.json | 4 ++-- docs/output.md | 6 +++--- docs/usage.md | 18 +++++++++--------- lib/WorkflowMain.groovy | 4 ++-- lib/WorkflowSra.groovy | 19 +------------------ main.nf | 4 ++-- nextflow_schema.json | 2 +- workflows/sra.nf | 7 ------- 9 files changed, 22 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 94e6a5ba..a14f7fb1 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). 
## Usage @@ -56,7 +56,7 @@ For more details, please refer to the [usage documentation](https://nf-co.re/fet Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps: -### SRA / ENA / DDBJ ids +### SRA / ENA / DDBJ / GEO ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API @@ -65,17 +65,6 @@ Via a single file of ids, provided one-per-line (see [example input file](https: - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet -### GEO ids - -Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). - -As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead: - -- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) -- Click `SRA Run Selector` at the bottom of the GEO accession page -- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` - -This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`. 
### Synapse ids diff --git a/assets/schema_input.json b/assets/schema_input.json index 71f0f976..9a800216 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,8 +8,8 @@ "type": "array", "items": { "type": "string", - "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$", - "errorMessage": "Please provide a valid SRA, ENA, DDBJ identifier" + "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", + "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" } } } diff --git a/docs/output.md b/docs/output.md index daaca914..7402976c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: - Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids) + 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) 2. [Synapse ids](#synapse-ids) - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. -### SRA / ENA / DDBJ ids +### SRA / ENA / DDBJ / GEO ids
Output files - `fastq/` - - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ. + - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. - `fastq/md5/` - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. - `samplesheet/` diff --git a/docs/usage.md b/docs/usage.md index 55e47133..c2e32fa0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,15 +8,15 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `Synapse` | -| ------------ | ------------ | ------------ | ----------- | -| SRR11605097 | ERR4007730 | DRR171822 | syn26240435 | -| SRX8171613 | ERX4009132 | DRX162434 | | -| SRS6531847 | ERS4399630 | DRS090921 | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | -| SRP256957 | ERP120836 | DRP004793 | | -| SRA1068758 | ERA2420837 | DRA008156 | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | +| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | +| ------------ | ------------ | ------------ | ----------- | ----------- | +| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | +| SRS6531847 | ERS4399630 | DRS090921 | | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | +| SRP256957 | ERP120836 | DRP004793 | | | +| SRA1068758 | ERA2420837 | DRA008156 | | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | ### SRR / ERR / DRR ids diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 13fd6d82..99858d2a 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -111,7 +111,7 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide 
either SRA / ENA / DDBJ or Synapse ids!") + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!") } } return is_sra @@ -135,7 +135,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!") + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!") } } return is_synapse diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index 061f7ffb..2cd5ab75 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -30,21 +30,4 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } - - // Fail pipeline if input ids are from the GEO - public static void isGeoFail(ids) { - def pattern = /^(GS[EM])(\d+)$/ - for (id in ids) { - if (id =~ pattern) { - def error_string = "===================================================================================\n" + - " GEO id detected: ${id}\n" + - " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + - " Please remove any GEO ids from the input samplesheet.\n\n" + - " Please see:\n" + - " https://github.com/nf-core/fetchngs/pull/102\n" + - "===================================================================================" - Nextflow.error(error_string) - } - } - } -} +} \ No newline at end of file diff --git a/main.nf b/main.nf index e7d67165..6da732be 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ if (WorkflowMain.isSraId(ch_input)) { } else if (WorkflowMain.isSynapseId(ch_input)) { input_type = 'synapse' } else { - exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' 
+ exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / GEO / DDBJ or Synapse ids!' } if (params.input_type == input_type) { @@ -63,7 +63,7 @@ if (params.input_type == input_type) { workflow NFCORE_FETCHNGS { // - // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids + // WORKFLOW: Download FastQ files for SRA / ENA / GEO / DDBJ ids // if (params.input_type == 'sra') { SRA ( ch_ids ) diff --git a/nextflow_schema.json b/nextflow_schema.json index 563c5e7b..3f72c2f1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "pattern": "^\\S+\\.(csv|tsv|txt)$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", - "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." + "description": "File containing SRA/ENA/GEO/DDBJ identifiers one per line to download their associated metadata and FastQ files." }, "input_type": { "type": "string", diff --git a/workflows/sra.nf b/workflows/sra.nf index e3ac447f..ae891aab 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -50,13 +50,6 @@ workflow SRA { main: ch_versions = Channel.empty() - // - // Fail the pipeline if GEO ids detected - // - ids - .collect() - .map { WorkflowSra.isGeoFail(it) } - // // MODULE: Get SRA run information for public database ids // From d19e7f41f6609d03d3814df65930d176a0a97e38 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Wed, 10 May 2023 19:26:08 -0400 Subject: [PATCH 40/71] chore: update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bb46ddb..5fe8e99a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR 
#148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower +- [PR #155](https://github.com/nf-core/fetchngs/pull/155) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) ### Software dependencies From 2db65cc1e6f6fb09877888b341516d75181b79ed Mon Sep 17 00:00:00 2001 From: ejseqera Date: Wed, 10 May 2023 23:24:26 -0400 Subject: [PATCH 41/71] docs: update usage.md with dbGAP download details --- docs/usage.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 0f47198f..231da022 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. -The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. To download protected dbGAP data using sra-tools, a JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. 
All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). @@ -70,6 +70,15 @@ From v1.9 of this pipeline the default `strandedness` in the output samplesheet If FTP connections are blocked on your network use the [`--force_sratools_download`](https://nf-co.re/fetchngs/parameters#force_sratools_download) parameter to force the pipeline to download data using sra-tools instead of the ENA FTP. +### Downloading dbGAP data with JWT + +As of v1.10, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. + + +Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for *all* runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. 
+ +Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. + ## Running the pipeline The typical command for running the pipeline is as follows: From b2c3d5ba69c43005fa285576b7d709904f5d219a Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 11 May 2023 08:25:47 +0100 Subject: [PATCH 42/71] Update nextflow.config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 3f336ca7..fe374cae 100644 --- a/nextflow.config +++ b/nextflow.config @@ -178,7 +178,7 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -// Set default docker registry (will be unused unless pulling docker images) +// Set default docker registry (will not be used unless pulling docker images) docker.registry = 'quay.io' def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') From b80afbb4160b4c727f1337b13be4f33fa401220a Mon Sep 17 00:00:00 2001 From: Maxime U Garcia Date: Thu, 11 May 2023 11:07:39 +0200 Subject: [PATCH 43/71] Update nextflow.config --- nextflow.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index fe374cae..b554f445 100644 --- a/nextflow.config +++ b/nextflow.config @@ -119,7 +119,6 @@ profiles { } podman { podman.enabled = true - podman.registry = 'quay.io' conda.enabled = false docker.enabled = false singularity.enabled = false @@ -178,8 +177,11 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -// Set default docker registry (will not be used unless pulling docker images) +// Set default registry for Docker and Podman independent of -profile +// Will not be used unless Docker / Podman are 
enabled +// Set to your registry if you have a mirror of containers docker.registry = 'quay.io' +podman.registry = 'quay.io' def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { From 37d0a033a7dd7a2175107af36aca5c577a778087 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Thu, 11 May 2023 14:07:26 -0400 Subject: [PATCH 44/71] chore: apply linting for black and prettier --- bin/sra_ids_to_runinfo.py | 12 ++++++------ lib/WorkflowSra.groovy | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 84a6623d..414d32a7 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -248,11 +248,11 @@ def _gsm_to_srx(cls, identifier): response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") cls._content_check(response, identifier) r_json = json.loads(response.text()) - gsm_ids = r_json['esearchresult']['idlist'] + gsm_ids = r_json["esearchresult"]["idlist"] for gsm_id in gsm_ids: ids += cls._id_to_srx(gsm_id) return ids - + @classmethod def _gds_to_gsm(cls, identifier): """Resolve the GEO UIDs to GSM IDs to then resolve to SRA IDs.""" @@ -261,9 +261,9 @@ def _gds_to_gsm(cls, identifier): response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{urlencode(params)}") cls._content_check(response, identifier) r_json = json.loads(response.text()) - - for each in r_json['result'][identifier]['samples'][0:]: - ids += cls._gsm_to_srx(each['accession']) + + for each in r_json["result"][identifier]["samples"][0:]: + ids += cls._gsm_to_srx(each["accession"]) return ids @classmethod @@ -274,7 +274,7 @@ def _gse_to_srx(cls, identifier): response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") cls._content_check(response, identifier) r_json = json.loads(response.text()) - gds_uids = r_json['esearchresult']['idlist'] + gds_uids = 
r_json["esearchresult"]["idlist"] for gds_uid in gds_uids: ids += cls._gds_to_gsm(gds_uid) return ids diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index 2cd5ab75..3c092a6f 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -30,4 +30,4 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } -} \ No newline at end of file +} From 92af4687833ee77874bf3431b7236551a1356d58 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Thu, 11 May 2023 14:13:48 -0400 Subject: [PATCH 45/71] chore: prettier linting --- docs/usage.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 231da022..52eaf63d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. -The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. 
You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). @@ -72,12 +72,11 @@ If FTP connections are blocked on your network use the [`--force_sratools_downlo ### Downloading dbGAP data with JWT -As of v1.10, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. - +As of v1.10, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. -Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for *all* runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. +Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for _all_ runs in an experiment. 
Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. -Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. +Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. ## Running the pipeline From 21f37073e8f0041fb036905ddefc90e57cd5afad Mon Sep 17 00:00:00 2001 From: ejseqera Date: Thu, 11 May 2023 14:16:29 -0400 Subject: [PATCH 46/71] chore: lint docs and usage --- README.md | 5 ++--- docs/usage.md | 18 +++++++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a14f7fb1..4d6c2d35 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). 
## Usage @@ -56,7 +56,7 @@ For more details, please refer to the [usage documentation](https://nf-co.re/fet Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps: -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ / GEO ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API @@ -65,7 +65,6 @@ Via a single file of ids, provided one-per-line (see [example input file](https: - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet - ### Synapse ids 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command. diff --git a/docs/usage.md b/docs/usage.md index c2e32fa0..963d9229 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,15 +8,15 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. 
Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | -| ------------ | ------------ | ------------ | ----------- | ----------- | -| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | -| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | -| SRS6531847 | ERS4399630 | DRS090921 | | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | -| SRP256957 | ERP120836 | DRP004793 | | | -| SRA1068758 | ERA2420837 | DRA008156 | | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | +| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | +| ------------ | ------------ | ------------ | ---------- | ----------- | +| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | +| SRS6531847 | ERS4399630 | DRS090921 | | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | +| SRP256957 | ERP120836 | DRP004793 | | | +| SRA1068758 | ERA2420837 | DRA008156 | | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | ### SRR / ERR / DRR ids From 0cb79229dc5d82bb4601ccfb8de87e5a22df3b7d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 13 May 2023 20:27:31 +0100 Subject: [PATCH 47/71] Remove quay.io prefix from mulled containers --- modules/nf-core/sratools/fasterqdump/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf index 2336c318..57a3dc9c 100644 --- a/modules/nf-core/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -5,7 +5,7 @@ process SRATOOLS_FASTERQDUMP { conda "bioconda::sra-tools=2.11.0 conda-forge::pigz=2.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : - 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) From c5839cafa06732333faa7474397dacba6ba25d9d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 13 May 2023 20:43:22 +0100 Subject: [PATCH 48/71] Add public_aws_ecr.config and use in CI tests --- .github/workflows/cloud_tests_full.yml | 2 +- .github/workflows/cloud_tests_small.yml | 2 +- conf/public_aws_ecr.config | 18 ++++++++++++++++++ nextflow.config | 3 +++ 4 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 conf/public_aws_ecr.config diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml index d0ed0552..e25a97aa 100644 --- a/.github/workflows/cloud_tests_full.yml +++ b/.github/workflows/cloud_tests_full.yml @@ -27,7 +27,7 @@ jobs: compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" run_name: "aws_fetchngs_full" - profiles: test_full + profiles: test_full,public_aws_ecr parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml index 12b8daab..e7636fdc 100644 --- a/.github/workflows/cloud_tests_small.yml +++ b/.github/workflows/cloud_tests_small.yml @@ -25,7 +25,7 @@ jobs: compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" run_name: "aws_fetchngs_small" - profiles: test + profiles: test,public_aws_ecr parameters: | { "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-test-${{ github.sha }}" diff --git a/conf/public_aws_ecr.config 
b/conf/public_aws_ecr.config new file mode 100644 index 00000000..59ac8c01 --- /dev/null +++ b/conf/public_aws_ecr.config @@ -0,0 +1,18 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + AWS ECR Config +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config to set public AWS ECR images wherever possible + This improves speed when running on AWS infrastructure. + Use this as an example template when using your own private registry. +---------------------------------------------------------------------------------------- +*/ + +docker.registry = 'public.ecr.aws' +podman.registry = 'public.ecr.aws' + +process { + withName: '.*:SRATOOLS_FASTERQDUMP' { + container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index b554f445..18e065d0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -158,6 +158,9 @@ profiles { executor.cpus = 16 executor.memory = 60.GB } + public_aws_ecr { + includeConfig 'conf/public_aws_ecr.config' + } test { includeConfig 'conf/test.config' } test_synapse { includeConfig 'conf/test_synapse.config' } test_full { includeConfig 'conf/test_full.config' } From 6a47ccd3c7e8e0e3450e52062fc159f7dbaf8687 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 13 May 2023 20:44:59 +0100 Subject: [PATCH 49/71] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bb46ddb..297d1941 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Special thanks to the following for their contributions to the release: - [Maxime Garcia](https://github.com/maxulysse) - [Moritz E. 
Beber](https://github.com/Midnighter) - [Rob Syme](https://github.com/robsyme) +- [sirclockalot](https://github.com/sirclockalot) Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. @@ -26,6 +27,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower +- [PR #157](https://github.com/nf-core/fetchngs/pull/157) - Add `public_aws_ecr.config` to source mulled containers when using `public.ecr.aws` Docker Biocontainer registry ### Software dependencies From 3e558a68195f4fccbe8c6af89caad3840e2d0211 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 13 May 2023 20:46:09 +0100 Subject: [PATCH 50/71] Run nf-core modules update to fix quay.io container definitions --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index dea81e71..ccf017da 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "sratools/fasterqdump": { "branch": "master", - "git_sha": "10cb20f6a130d104fef335a8290f3ffce650f28d", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/prefetch": { From c9afc5cff2c80bd664e55420beb65e65540c2dc1 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 13 May 2023 20:47:37 +0100 Subject: [PATCH 51/71] Fix ECLint --- conf/public_aws_ecr.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config index 59ac8c01..b9129571 100644 --- 
a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -15,4 +15,4 @@ process { withName: '.*:SRATOOLS_FASTERQDUMP' { container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' } -} \ No newline at end of file +} From 64a4971bea976940950d6cf24572d5ce3e730b19 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 14 May 2023 20:09:43 +0100 Subject: [PATCH 52/71] Update CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 297d1941..d62cfdde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-12 +## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-15 ### Credits From 740b2ce06d6e0b997432eac1c7d63f923436e2cc Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 14 May 2023 20:10:07 +0100 Subject: [PATCH 53/71] Bump pipeline version to 1.10.0 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 18e065d0..c1c66b07 100644 --- a/nextflow.config +++ b/nextflow.config @@ -211,7 +211,7 @@ manifest { description = """Pipeline to fetch metadata and raw FastQ files from public databases""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.10.0dev' + version = '1.10.0' doi = 'https://doi.org/10.5281/zenodo.5070524' } From f510057968bc205cf5f8cccf488440da2a6d9365 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 14 May 2023 20:58:05 +0100 Subject: [PATCH 54/71] Add ubuntu containers to public_aws_ecr.config --- conf/public_aws_ecr.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config 
index b9129571..abbe0fe4 100644 --- a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -15,4 +15,10 @@ process { withName: '.*:SRATOOLS_FASTERQDUMP' { container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' } + withName: '.*:SRA_MERGE_SAMPLESHEET' { + container = 'quay.io/nf-core/ubuntu:20.04' + } + withName: '.*:SYNAPSE_MERGE_SAMPLESHEET' { + container = 'quay.io/nf-core/ubuntu:20.04' + } } From 3eb8afbdf39b6eaf299a866a0a6c3562fa970203 Mon Sep 17 00:00:00 2001 From: Esha Joshi <128735622+ejseqera@users.noreply.github.com> Date: Mon, 15 May 2023 15:06:19 +0100 Subject: [PATCH 55/71] Update nextflow_schema.json Co-authored-by: Harshil Patel --- nextflow_schema.json | 1 - 1 file changed, 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 87d5024c..0abb1e39 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -66,7 +66,6 @@ }, "dbgap_key": { "type": "string", - "default": null, "fa_icon": "fas fa-address-card", "help_text": "Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit. Users with granted access to controlled data can download the JWT cart file for the study from the SRA Run Selector upon logging in. The JWT file can only be used on cloud platforms and is valid for 1 hour upon creation.", "format": "file-path" From fb7b6054c5c2d1a84fd553aa8df9c257fb07462e Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 10:36:59 -0400 Subject: [PATCH 56/71] feat: move dbgap key param as input into parent workflow --- workflows/sra.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index e3ac447f..abf23dc3 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -113,7 +113,8 @@ workflow SRA { // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. 
// FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( - ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } + ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] }, + params.dbgap_key ?: [] ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) From 43400b10746d79b5a502659358187dfbad294337 Mon Sep 17 00:00:00 2001 From: Esha Joshi <128735622+ejseqera@users.noreply.github.com> Date: Mon, 15 May 2023 15:37:19 +0100 Subject: [PATCH 57/71] Update docs/usage.md Co-authored-by: Harshil Patel --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 231da022..c9882467 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. -The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. 
The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). From d74d0561b4dd46c2f6d570a868245412d71c4c32 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 10:44:36 -0400 Subject: [PATCH 58/71] docs: update changelog to reference dbgap issue and not PR --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 066e3d71..73666bbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower -- [PR #152](https://github.com/nf-core/fetchngs/pull/152) - Add support for downloading protected dbGAP data using a JWT file +- [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file ### Software dependencies From b07ca4290d6177e9c67c4fc0b444bd904072f72b Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 11:50:06 -0400 Subject: [PATCH 59/71] refactor: fix changelog to reference geo issue --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/CHANGELOG.md b/CHANGELOG.md index 5fe8e99a..1478af9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,11 +22,11 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data +- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) - [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower -- [PR #155](https://github.com/nf-core/fetchngs/pull/155) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) ### Software dependencies From b3681617524b9cd4a4e6e8d4be792b4c1d02bd2b Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 12:10:22 -0400 Subject: [PATCH 60/71] refactor: move up issue reference for dbgap --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73666bbe..235f5c41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,11 +22,11 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for 
ERR ids associated with ArrayExpress - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data +- [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file - [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower -- [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file ### Software dependencies From 32796feccc16b094411257b83e8fc78f5554f3b0 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 12:30:58 -0400 Subject: [PATCH 61/71] fix: modules update for sratools prefetch and fasterqdump --- CHANGELOG.md | 1 + modules.json | 6 ++-- modules/nf-core/sratools/fasterqdump/main.nf | 2 +- modules/nf-core/sratools/fasterqdump/meta.yml | 6 +++- modules/nf-core/sratools/prefetch/main.nf | 2 +- modules/nf-core/sratools/prefetch/meta.yml | 8 +++-- .../main.nf | 34 +++++++------------ .../meta.yml | 5 +++ 8 files changed, 34 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 235f5c41..4b950fae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 
2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 - [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower +- [nf-core/modules#3417](https://github.com/nf-core/modules/pull/3417) - Update sratools modules to optionally take a JWT file as input ### Software dependencies diff --git a/modules.json b/modules.json index dea81e71..3137b88b 100644 --- a/modules.json +++ b/modules.json @@ -17,12 +17,12 @@ }, "sratools/fasterqdump": { "branch": "master", - "git_sha": "10cb20f6a130d104fef335a8290f3ffce650f28d", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/prefetch": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] } } @@ -31,7 +31,7 @@ "nf-core": { "fastq_download_prefetch_fasterqdump_sratools": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["subworkflows"] } } diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf index 4ed92814..2d9090e2 100644 --- a/modules/nf-core/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -5,7 +5,7 @@ process SRATOOLS_FASTERQDUMP { conda "bioconda::sra-tools=2.11.0 conda-forge::pigz=2.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : - 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) diff --git a/modules/nf-core/sratools/fasterqdump/meta.yml b/modules/nf-core/sratools/fasterqdump/meta.yml index d6fbd444..629bdca5 100644 --- a/modules/nf-core/sratools/fasterqdump/meta.yml +++ b/modules/nf-core/sratools/fasterqdump/meta.yml @@ -27,7 +27,11 @@ input: description: > An NCBI user settings file. pattern: "*.mkfg" - + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" output: - meta: type: map diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 5d894b88..ba7be4bd 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -10,7 +10,7 @@ process SRATOOLS_PREFETCH { input: tuple val(meta), val(id) path ncbi_settings - path certificate + path certificate output: tuple val(meta), path(id), emit: sra diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml index a3a26522..9817b0b2 100644 --- a/modules/nf-core/sratools/prefetch/meta.yml +++ b/modules/nf-core/sratools/prefetch/meta.yml @@ -19,7 +19,7 @@ input: Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - id: - type: val + type: string description: > A string denoting an SRA id. - ncbi_settings: @@ -27,7 +27,11 @@ input: description: > An NCBI user settings file. 
pattern: "*.mkfg" - + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" output: - meta: type: map diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index 965d8779..de31637e 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -7,7 +7,8 @@ include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/ // workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { take: - ch_sra_ids // channel: [ val(meta), val(id) ] + ch_sra_ids // channel: [ val(meta), val(id) ] + ch_dbgap_key // channel: [ path(dbgap_key) ] main: @@ -17,33 +18,22 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { // Detect existing NCBI user settings or create new ones. // CUSTOM_SRATOOLSNCBISETTINGS() - def settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings // value channel: path(settings) + ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) // - // Prefetch sequencing reads in SRA format and convert into one or more compressed FASTQ files. - // If specified in params, use the provided JWT file for pulling protected SRA runs, else provide - // an empty list. + // Prefetch sequencing reads in SRA format. 
// - - if (!params.dbgap_key) { - - SRATOOLS_PREFETCH ( ch_sra_ids, settings, [] ) - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, [] ) - - } else { - - certificate = file(params.dbgap_key, checkIfExists: true) // optional input channel for JWT - - SRATOOLS_PREFETCH ( ch_sra_ids, settings, certificate ) - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings, certificate ) - - } - + SRATOOLS_PREFETCH ( ch_sra_ids, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) + + // + // Convert the SRA format into one or more compressed FASTQ files. + // + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) emit: - reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml index 4599bbd2..6ff9442b 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -22,6 +22,11 @@ input: type: string description: > SRA run identifier. 
+ - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" # TODO Update when we decide on a standard for subworkflow docs output: - meta: From 1d75d1867c35411f0476b46c5c073a072303b617 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 15 May 2023 18:14:44 +0100 Subject: [PATCH 62/71] Apply suggestions from code review --- CHANGELOG.md | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c653e7d1..6eda7411 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,8 +22,8 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids -- [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file +- [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data - [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files - [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 diff --git a/docs/usage.md b/docs/usage.md index 52eaf63d..f0371eb3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,7 +72,7 @@ If FTP connections are blocked on your network use the [`--force_sratools_downlo ### Downloading dbGAP data with JWT -As of v1.10, the SRA 
Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. +As of v1.10.0, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for _all_ runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. From b641a7adb80d1623ce6f58d67ee9fdd364ef2d36 Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 14:56:55 -0400 Subject: [PATCH 63/71] docs: add SRA example for dbgap data download with jwt --- docs/usage.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f0371eb3..e7a83805 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,12 +72,17 @@ If FTP connections are blocked on your network use the [`--force_sratools_downlo ### Downloading dbGAP data with JWT -As of v1.10.0, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. 
+As of v1.10.0, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/) on a supported cloud computing environment (Amazon Web Services or Google Cloud Platform). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for _all_ runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. 
+To test this functionality in your cloud computing environment, you can use the protected dbGAP cloud testing study with accession `SRR1219902`: + +- On the [SRA Run Selector page for `SRR1219902`](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR1219902&o=acc_s%3Aa&s=SRR1219902), select the available run and click on `JWT Cart` to download a key file called `jwt.cart` that can be directly provided to the pipeline with `--dbgap_key jwt.cart` +- Click on `Accession List` to download a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline with `--input SRR_Acc_List.txt` + ## Running the pipeline The typical command for running the pipeline is as follows: From 76ad4b723ed7dd9a225b407c67a2d5695c55ebac Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 15:30:45 -0400 Subject: [PATCH 64/71] docs: update test JWT sra accession ID to experiment ID --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index e7a83805..b3f215f3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -78,10 +78,10 @@ Note that due to the way the pipeline resolves SRA IDs down to the experiment to Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. 
-To test this functionality in your cloud computing environment, you can use the protected dbGAP cloud testing study with accession `SRR1219902`: +To test this functionality in your cloud computing environment, you can use the protected dbGAP cloud testing study with experiment accession `SRX512039`: -- On the [SRA Run Selector page for `SRR1219902`](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR1219902&o=acc_s%3Aa&s=SRR1219902), select the available run and click on `JWT Cart` to download a key file called `jwt.cart` that can be directly provided to the pipeline with `--dbgap_key jwt.cart` -- Click on `Accession List` to download a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline with `--input SRR_Acc_List.txt` +- On the [SRA Run Selector page for `SRX512039`](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRX512039&o=acc_s%3Aa), select the two available runs (`SRR1219865` and `SRR1219902`) and click on `JWT Cart` to download a key file called `cart.jwt` that can be directly provided to the pipeline with `--dbgap_key cart.jwt` +- Click on `Accession List` to download a text file called `SRR_Acc_List.txt` with the SRR IDs that can be directly provided to the pipeline with `--input SRR_Acc_List.txt` ## Running the pipeline From b0d32c2abc84a7cc508c16d69875f3a06b90601a Mon Sep 17 00:00:00 2001 From: Esha Joshi <128735622+ejseqera@users.noreply.github.com> Date: Mon, 15 May 2023 20:34:22 +0100 Subject: [PATCH 65/71] Update workflows/sra.nf Co-authored-by: Harshil Patel --- workflows/sra.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index abf23dc3..82120249 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -114,7 +114,7 @@ workflow SRA { // FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] }, - params.dbgap_key ?: [] + params.dbgap_key ? 
file(params.dbgap_key, checkIfExists: true) : [] ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) From e84521daba5998e5307489e3b2bcc38a1e4fbd3b Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 22:10:52 -0400 Subject: [PATCH 66/71] feat: add backoff strategy and handling for API responses --- bin/sra_ids_to_runinfo.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 414d32a7..0ffafba8 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -15,6 +15,7 @@ from urllib.parse import urlencode from urllib.request import urlopen import json +import time logger = logging.getLogger() @@ -399,13 +400,40 @@ def validate_fields_parameter(param, valid_vals, param_desc): def fetch_url(url): """Return a response object for the given URL and handle errors appropriately.""" + sleep_time = 5 # Hardcode sleep duration in seconds + max_num_attempts = 3 # Hardcode max number of request attempts + attempt = 0 + try: with urlopen(url) as response: return Response(response=response) + except HTTPError as e: - logger.error("The server couldn't fulfill the request.") - logger.error(f"Status: {e.code} {e.reason}") - sys.exit(1) + if e.status == 429: + # If the response is 429, sleep and retry + if "Retry-After" in e.headers: + retry_after = int(e.headers["Retry-After"]) + logging.warning(f"Received 429 response from server. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + else: + logging.warning(f"Received 429 response from server. Retrying in {sleep_time} seconds...") + time.sleep(sleep_time) + sleep_time *= 2 # Increment sleep time + attempt += 1 + return fetch_url(url) # Recursive call to retry request + + elif e.status == 500: + # If the response is 500, sleep and retry max 3 times + if attempt <= max_num_attempts: + logging.warning(f"Received 500 response from server. 
Retrying in {sleep_time} seconds...") + time.sleep(sleep_time) + sleep_time *= 2 + attempt += 1 + return fetch_url(url) + else: + logging.error("Exceeded max request attempts. Exiting.") + sys.exit(1) + except URLError as e: logger.error("We failed to reach a server.") logger.error(f"Reason: {e.reason}") From 945d87029a5acece603e22d6d66145a2675396ee Mon Sep 17 00:00:00 2001 From: ejseqera Date: Mon, 15 May 2023 22:14:39 -0400 Subject: [PATCH 67/71] docs: update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1478af9f..0f936ef0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-12 +## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-16 ### Credits @@ -20,9 +20,9 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress +- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data -- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) - [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files - [PR 
#147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 From 6eeddaea28826db53f02853bfefa9cef62e7d4c8 Mon Sep 17 00:00:00 2001 From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Date: Tue, 16 May 2023 10:30:54 +0000 Subject: [PATCH 68/71] fix: --- conf/public_aws_ecr.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config index abbe0fe4..b6711a54 100644 --- a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -12,6 +12,9 @@ docker.registry = 'public.ecr.aws' podman.registry = 'public.ecr.aws' process { + withName: 'SRA_IDS_TO_RUNINFO|SRA_RUNINFO_TO_FTP' { + container = 'quay.io/biocontainers/python:3.8.3' + } withName: '.*:SRATOOLS_FASTERQDUMP' { container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' } From 751828548ff8d9eed7cb89ce75e321349e810b60 Mon Sep 17 00:00:00 2001 From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Date: Tue, 16 May 2023 10:35:47 +0000 Subject: [PATCH 69/71] Additional container image --- conf/public_aws_ecr.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config index b6711a54..184d8c18 100644 --- a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -12,7 +12,7 @@ docker.registry = 'public.ecr.aws' podman.registry = 'public.ecr.aws' process { - withName: 'SRA_IDS_TO_RUNINFO|SRA_RUNINFO_TO_FTP' { + withName: 'SRA_IDS_TO_RUNINFO|SRA_RUNINFO_TO_FTP|MULTIQC_MAPPINGS_CONFIG' { container = 'quay.io/biocontainers/python:3.8.3' } withName: '.*:SRATOOLS_FASTERQDUMP' { From c1bb2148a268e7947616247a644e651d425e917d Mon Sep 17 00:00:00 2001 From: Adam Talbot 
<12817534+adamrtalbot@users.noreply.github.com> Date: Tue, 16 May 2023 10:50:20 +0000 Subject: [PATCH 70/71] Explicit container images per process --- conf/public_aws_ecr.config | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config index 184d8c18..71180374 100644 --- a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -12,8 +12,14 @@ docker.registry = 'public.ecr.aws' podman.registry = 'public.ecr.aws' process { - withName: 'SRA_IDS_TO_RUNINFO|SRA_RUNINFO_TO_FTP|MULTIQC_MAPPINGS_CONFIG' { - container = 'quay.io/biocontainers/python:3.8.3' + withName: '.*:SRA_IDS_TO_RUNINFO' { + container = 'quay.io/biocontainers/python:3.9--1' + } + withName: '.*:SRA_RUNINFO_TO_FTP' { + container = 'quay.io/biocontainers/python:3.9--1' + } + withName: '.*:MULTIQC_MAPPINGS_CONFIG' { + container = 'quay.io/biocontainers/python:3.9--1' } withName: '.*:SRATOOLS_FASTERQDUMP' { container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' From 255190ef0b634d09cf0bc92b7658986ed9f96031 Mon Sep 17 00:00:00 2001 From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Date: Tue, 16 May 2023 10:53:40 +0000 Subject: [PATCH 71/71] fixup --- conf/public_aws_ecr.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config index 71180374..14b577d7 100644 --- a/conf/public_aws_ecr.config +++ b/conf/public_aws_ecr.config @@ -16,7 +16,7 @@ process { container = 'quay.io/biocontainers/python:3.9--1' } withName: '.*:SRA_RUNINFO_TO_FTP' { - container = 'quay.io/biocontainers/python:3.9--1' + container = 'quay.io/biocontainers/python:3.9--1' } withName: '.*:MULTIQC_MAPPINGS_CONFIG' { container = 'quay.io/biocontainers/python:3.9--1'