feat: Add resource profiles (#151)
* refactor: Change some module resource labels

Based on previous testing

* feat: Add resource profiles and change base reqs

- Also makes recombination run by default

* refactor: Change panaroo run to high

* refactor: Increase time for high

* docs: Add basic resource profiles documentation

* refactor: Increase reqs for some modules

- Decided after Enterococcus testing

* docs: Update resource requirement documentation

* refactor: Run all ann tools in large config

* docs: Bolden sentence

* fix: Change descriptions for profiles
jvfe authored Aug 22, 2023
1 parent 81a7a9b commit 349b428
Showing 20 changed files with 113 additions and 46 deletions.
12 changes: 5 additions & 7 deletions README.md
@@ -104,6 +104,7 @@ See our [roadmap](ROADMAP.md) for a full list of future development targets.
3.1. Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
3.2. If you are using `singularity` then the pipeline will auto-detect this and attempt to download the Singularity images directly as opposed to performing a conversion from Docker images. If you are persistently observing issues downloading Singularity images directly due to timeout or network issues then please use the `--singularity_pull_docker_container` parameter to pull and convert the Docker image instead.
For input datasets larger than 100 samples, [check our resource profiles documentation](https://beiko-lab.github.io/arete/resource-profiles/) for optimal usage.
4. Start running your own analysis (ideally using `-profile docker` or `-profile singularity` for stability)!
@@ -174,9 +175,8 @@ nextflow run beiko-lab/ARETE \
--poppunk_model dbscan \
--run_recombination \
--run_gubbins \
--use_ppanggolin \
-entry annotation \
-profile docker
-profile medium,docker
```

Parameters used:
@@ -187,19 +187,17 @@ Parameters used:
- `--run_gubbins` - Run [Gubbins](https://github.com/nickjcroucher/gubbins) as part of the recombination subworkflow.
- `--use_ppanggolin` - Use [PPanGGOLiN](https://github.com/labgem/PPanGGOLiN) for calculating the pangenome. Tends to perform better on larger input sets.
- `-entry annotation` - Run annotation subworkflow and further steps (See [usage](https://beiko-lab.github.io/arete/usage/)).
- `-profile docker` - Run tools in docker containers.
- `-profile medium,docker` - Run tools in docker containers. For `-profile medium`, check our [resource requirements documentation](https://beiko-lab.github.io/arete/resource_profiles/).

### Annotation to evolutionary dynamics on 10,000 genomes

```bash
nextflow run beiko-lab/ARETE \
--input_sample_table samplesheet.csv \
--poppunk_model dbscan \
--use_ppanggolin \
--run_recombination \
--enable_subsetting \
-entry annotation \
-profile docker
-profile large,docker
```

Parameters used:
@@ -210,7 +208,7 @@ Parameters used:
- `--use_ppanggolin` - Use [PPanGGOLiN](https://github.com/labgem/PPanGGOLiN) for calculating the pangenome. Tends to perform better on larger input sets.
- `--enable_subsetting` - Enable subsetting workflow based on genome similarity (See [subsetting documentation](https://beiko-lab.github.io/arete/subsampling/))
- `-entry annotation` - Run annotation subworkflow and further steps (See [usage](https://beiko-lab.github.io/arete/usage/)).
- `-profile docker` - Run tools in docker containers.
- `-profile large,docker` - Run tools in docker containers. For `-profile large`, check our [resource requirements documentation](https://beiko-lab.github.io/arete/resource_profiles/).

## Credits <a name="credits"></a>

34 changes: 13 additions & 21 deletions conf/base.config
@@ -1,6 +1,6 @@
/*
========================================================================================
nf-core/arete Nextflow base config file
beiko-lab/ARETE Nextflow base config file
========================================================================================
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
@@ -10,7 +10,6 @@

process {

// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
@@ -19,38 +18,31 @@ process {
maxRetries = 3
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 6.GB, 'memory' ) }
time = { check_max( 4.h, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 20.GB * task.attempt, 'memory' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 48.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
cpus = { check_max( 10, 'cpus' ) }
memory = { check_max( 36.GB, 'memory' ) }
time = { check_max( 8.h, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 24 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 168.h, 'time' ) }
cpus = { check_max( 14, 'cpus' ) }
memory = { check_max( 72.GB, 'memory' ) }
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
time = { check_max( 72.h, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 240.GB, 'memory' ) }
memory = { check_max( 100.GB, 'memory' ) }
}
withLabel:error_ignore {
errorStrategy = 'ignore'
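
Every label above routes its request through `check_max()`, which clamps the value against the pipeline-wide `max_cpus`, `max_memory` and `max_time` parameters (their defaults appear further down in `nextflow.config`). As a rough sketch, with purely illustrative cap values, a run on a smaller machine could lower those ceilings from the command line:

```bash
# Illustrative caps only; check_max() clamps every process request to these limits
nextflow run beiko-lab/ARETE \
  --input_sample_table samplesheet.csv \
  --poppunk_model dbscan \
  --max_cpus 16 \
  --max_memory '64.GB' \
  --max_time '48.h' \
  -profile docker
```
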
28 changes: 28 additions & 0 deletions conf/large.config
@@ -0,0 +1,28 @@
params {
config_profile_name = 'Large profile'
config_profile_description = 'Profile for >1000 sample datasets with subsetting enabled.'

use_ppanggolin = true
use_fasttree = true
enable_subsetting = true
skip_profile_creation = true
}

process {
withLabel:process_medium {
cpus = { check_max( 18 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 24 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 72.h, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 249.GB, 'memory' ) }
}
}
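
To see exactly what the `large` profile changes once it is layered on top of `conf/base.config`, Nextflow's `config` command can print the merged settings. A minimal sketch, assuming the pipeline has already been pulled (for example with `nextflow pull beiko-lab/ARETE`):

```bash
# Print the fully resolved configuration for the large profile plus Docker
nextflow config beiko-lab/ARETE -profile large,docker
```
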
23 changes: 23 additions & 0 deletions conf/medium.config
@@ -0,0 +1,23 @@
params {
config_profile_name = 'Medium profile'
config_profile_description = 'Profile for 100-1000 sample datasets'

use_ppanggolin = true
use_fasttree = true
}

process {
withLabel:process_medium {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB, 'memory' ) }
}
}
19 changes: 19 additions & 0 deletions docs/resource_profiles.md
@@ -0,0 +1,19 @@
# ARETE and dataset size

ARETE currently has three distinct profiles that adjust how the pipeline executes: the default profile (which we can call `small`), the `medium` profile and the `large` profile.

These profiles are tailored to the size and diversity of the input dataset, and they change some parameter defaults based on tests we have performed on similar-sized datasets.

If you first want to gauge the potential diversity of your dataset and already have some input assemblies, you can try the [PopPUNK entry](https://beiko-lab.github.io/arete/usage/#poppunk-entry). One of its outputs shows how many clusters, or lineages, your dataset divides into.

The sizes are:

- For the default or `small` profile, we expect datasets with 100 samples/assemblies or fewer.
It runs on the default pipeline parameters, with no changes.

- For the `medium` profile, we expect datasets with >100 and <1000 samples.
It increases the default resource requirements for most processes and also uses [PPanGGOLiN](https://github.com/labgem/PPanGGOLiN) for pangenome construction instead of [Panaroo](https://github.com/gtonkinhill/panaroo/).

- For the `large` profile, we expect datasets with >1000 samples.
It also increases default resource requirements for some processes and uses PPanGGOLiN.
Additionally, **it enables [PopPUNK subsampling](subsampling.md), with default parameters**.
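
As a minimal usage sketch, mirroring the README examples elsewhere in this commit, a resource profile is selected together with the container profile on the command line:

```bash
nextflow run beiko-lab/ARETE \
  --input_sample_table samplesheet.csv \
  --poppunk_model dbscan \
  -entry annotation \
  -profile medium,docker   # or large,docker for >1000 samples
```
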
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -9,6 +9,7 @@ nav:
- Citations: CITATIONS.md
- Roadmap: ROADMAP.md
- Reference:
- Dataset Size: resource_profiles.md
- Parameters: params.md
- Subsampling: subsampling.md
repo_url: https://github.com/beiko-lab/arete
8 changes: 4 additions & 4 deletions modules/local/blast_databases.nf
@@ -1,5 +1,5 @@
process GET_CAZYDB {
label 'process_low'
label 'process_single'
label 'error_retry_delay'

output:
@@ -12,7 +12,7 @@ process GET_CAZYDB {
}

process GET_VFDB{
label 'process_low'
label 'process_single'
label 'error_retry_delay'

output:
@@ -29,7 +29,7 @@ process GET_VFDB{
}

process GET_BACMET{
label 'process_low'
label 'process_single'
label 'error_retry_delay'

output:
@@ -46,7 +46,7 @@ process GET_BACMET{
}

process GET_ICEBERG {
label 'process_low'
label 'process_single'
label 'error_retry_delay'

output:
1 change: 1 addition & 0 deletions modules/local/chunked_fasttree.nf
@@ -1,5 +1,6 @@
process CHUNKED_FASTTREE {
label 'process_high'
label 'process_long'

conda (params.enable_conda ? "bioconda::fasttree=2.1.10" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
1 change: 0 additions & 1 deletion modules/local/create_report.nf
@@ -1,6 +1,5 @@
process CREATE_REPORT {
label 'process_high'
label 'process_high_memory'

conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
2 changes: 1 addition & 1 deletion modules/local/get_db_cache.nf
@@ -1,6 +1,6 @@
//process for acquiring cached databases
process GET_DB_CACHE {
label 'process_medium'
label 'process_single'

input:
path(dbcache)
2 changes: 1 addition & 1 deletion modules/local/get_minikraken.nf
@@ -7,7 +7,7 @@ options = initOptions(params.options)
process KRAKEN2_DB {
//publishDir 'dbcache/', mode:'copy'
tag "minikraken"
label 'process_high'
label 'process_medium'
label 'error_retry_delay'

output:
2 changes: 1 addition & 1 deletion modules/local/graphviz/gml2gv/main.nf
@@ -1,5 +1,5 @@
process GML2GV {
label 'process_low'
label 'process_single'

conda "bioconda::perl-graphviz=2.24"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
2 changes: 1 addition & 1 deletion modules/local/islandpath/main.nf
@@ -1,6 +1,6 @@
process ISLANDPATH {
tag "$meta.id"
label 'process_medium'
label 'process_low'

conda "bioconda::islandpath=1.0.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
2 changes: 1 addition & 1 deletion modules/local/poppunk_samplesheet.nf
@@ -1,5 +1,5 @@
process POPPUNK_MAKE_SAMPLESHEET {
label "process_low"
label "process_single"

input:
path(samplesheets)
1 change: 1 addition & 0 deletions modules/local/ppanggolin/workflow/main.nf
@@ -2,6 +2,7 @@ process PPANGGOLIN_WORKFLOW {
tag "$meta.id"
label 'process_high'
label 'process_high_memory'
label 'process_long'

conda "bioconda::ppanggolin=1.2.105"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
2 changes: 1 addition & 1 deletion modules/local/ska.nf
@@ -1,6 +1,6 @@
process SKA2 {
tag "$cluster"
label 'process_medium'
label 'process_high'

conda "bioconda::gubbins=3.3.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
2 changes: 1 addition & 1 deletion modules/nf-core/panaroo/run/main.nf
@@ -1,6 +1,6 @@
process PANAROO_RUN {
tag "$meta.id"
label 'process_medium'
label 'process_high'

conda (params.enable_conda ? "bioconda::panaroo=1.3.2" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
8 changes: 5 additions & 3 deletions modules/nf-core/panaroo/run/panaroo-run.diff
@@ -1,9 +1,11 @@
Changes in module 'nf-core/panaroo/run'
--- modules/nf-core/panaroo/run/main.nf
+++ modules/nf-core/panaroo/run/main.nf
@@ -2,17 +2,19 @@
@@ -1,18 +1,20 @@
process PANAROO_RUN {
tag "$meta.id"
label 'process_medium'
- label 'process_medium'
+ label 'process_high'

- conda "bioconda::panaroo=1.2.9"
+ conda (params.enable_conda ? "bioconda::panaroo=1.3.2" : null)
@@ -20,7 +22,7 @@ Changes in module 'nf-core/panaroo/run'
tuple val(meta), path("results/*") , emit: results
+ tuple val(meta), path("results/final_graph.gml") , optional: true, emit: graph_gml
tuple val(meta), path("results/core_gene_alignment.aln"), optional: true, emit: aln
+ path("results/aligned_gene_sequences/*aln.fas") , optional: true, emit: accessory_aln
+ path "results/aligned_gene_sequences/*aln.fas" , optional: true, emit: accessory_aln
path "versions.yml" , emit: versions

when:
6 changes: 4 additions & 2 deletions nextflow.config
@@ -47,7 +47,7 @@ params {
skip_kraken = false

// Recombination
run_recombination = false
run_recombination = true
run_verticall = true
run_gubbins = false

@@ -88,7 +88,7 @@ params {
// Defaults only, expecting to be overwritten
max_memory = '125.GB'
max_cpus = 72
max_time = '440.h'
max_time = '168.h'

}

@@ -154,6 +154,8 @@ profiles {
}
test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' }
medium { includeConfig 'conf/medium.config' }
large { includeConfig 'conf/large.config' }
}

// Export these variables to prevent local Python/R libraries from conflicting with those in the container
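
For one-off tweaks beyond what `medium` or `large` provide, an extra config file passed with `-c` is applied on top of the selected profiles. A hedged sketch with a hypothetical override file (`my_overrides.config` and the 96 GB value are illustrative, not recommendations):

```bash
# Hypothetical override: raise memory for process_high-labelled tasks only
cat > my_overrides.config <<'EOF'
process {
    withLabel:process_high {
        memory = '96.GB'
    }
}
EOF

nextflow run beiko-lab/ARETE \
  --input_sample_table samplesheet.csv \
  --poppunk_model dbscan \
  -profile large,docker \
  -c my_overrides.config
```
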
3 changes: 2 additions & 1 deletion nextflow_schema.json
@@ -208,7 +208,8 @@
"run_recombination": {
"type": "boolean",
"description": "Run Recombination",
"fa_icon": "fas fa-tree"
"fa_icon": "fas fa-tree",
"default": true
},
"run_verticall": {
"type": "boolean",
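
Since `run_recombination` now defaults to `true`, runs that do not need the recombination subworkflow have to opt out explicitly. A minimal sketch; depending on your setup the same flag can also be set to `false` through a `-params-file`:

```bash
nextflow run beiko-lab/ARETE \
  --input_sample_table samplesheet.csv \
  --poppunk_model dbscan \
  --run_recombination false \
  -profile docker
```
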
