EBI-Metagenomics · mberacochea · Aug 19, 2024 · Aug 8, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -9,7 +9,6 @@ on:
 
 env:
   NXF_ANSI_LOG: false
-  NFTEST_VER: "0.8.4"
 
 jobs:
   test:
@@ -30,7 +29,9 @@ jobs:
 
       - name: Install nf-test
         uses: nf-core/setup-nf-test@v1
+        with:
+          version: 0.9.0
 
       - name: Run pipeline with test data
         run: |
-          nf-test test
+          nf-test test --tag samplesheet --ci
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,4 +1,5 @@
 repository_type: pipeline
+org_path: ebi-metagenomics
 template:
   prefix: ebi-metagenomics
   skip:

diff --git a/README.md b/README.md
@@ -120,26 +120,89 @@ PRJ2,ERR2,/path/to/reads/ERR2.fq.gz,,single,genomic,megahit,32
 The outputs of the pipeline are organized as follows:
 
 ```
-results/SRP1154
-└── SRP115494
-    └── SRR6180
-        └── SRR6180434
-            ├── assembly
-            │   └── metaspades
-            │       └── 3.15.5
-            │           ├── coverage
-            │           ├── decontamination
-            │           └── qc
-            │               ├── multiqc
-            │               └── quast
-            └── qc
-                ├── fastp
-                └── fastqc
-
+results
+├── pipeline_info
+├── DRP0076
+│   └── DRP007622
+│       ├── DRR2807
+│       │   └── DRR280712
+│       │       ├── assembly
+│       │       │   └── megahit
+│       │       │       └── 1.2.9
+│       │       │           ├── coverage
+│       │       │           ├── decontamination
+│       │       │           └── qc
+│       │       │               ├── multiqc
+│       │       │               └── quast
+│       │       │                   └── DRR280712
+│       │       └── qc
+│       │           ├── fastp
+│       │           └── fastqc
+│       └── multiqc
+└── SRP1154
+    └── SRP115494
+        ├── multiqc
+        ├── SRR5949
+        │   └── SRR5949318
+        │       ├── assembly
+        │       │   └── metaspades
+        │       │       └── 3.15.5
+        │       │           ├── coverage
+        │       │           ├── decontamination
+        │       │           └── qc
+        │       │               ├── multiqc
+        │       │               └── quast
+        │       │                   └── SRR5949318
+        │       └── qc
+        │           ├── fastp
+        │           └── fastqc
+        └── SRR6180
+            └── SRR6180434 --> QC Failed (not assembled)
+                └── qc
+                    ├── fastp
+                    └── fastqc
 ```
 
 The nested structure based on ENA Study and Reads accessions was created to suit the Microbiome Informatics team’s needs. The benefit of this structure is that results from different runs of the same study won’t overwrite any results.
 
+### Top Level Reports
+
+#### MultiQC
+
+The pipeline produces two types of [MultiQC](https://multiqc.info) reports: one per study and one per run. These reports aggregate statistics related to raw reads, read QC, assembly, and assembly QC.
+
+The run-level MultiQC report is generated for runs that passed QC and were assembled. The study-level MultiQC report includes all runs; however, runs without assemblies will not have assembly stats included.
+
+#### QC failed runs
+
+QC failed runs are filtered out to prevent downstream assembly failures.
+
+Runs that fail QC checks are excluded from the assembly process. These runs are listed in the file `qc_failed_runs.csv`, along with the corresponding exclusion message. Assembling such runs may cause the pipeline to fail or produce very poor assemblies.
+
+Example:
+
+```csv
+SRR6180434,filter_ratio_threshold_exceeded
+```
+
+##### Run exclusion messages
+
+| Exclusion Message                 | Description                                                                                                                                                                                                                                                                            |
+| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. |
+| `low_reads_count_threshold`       | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled.                                                                                                                                                               |
+
+#### Assembled Runs
+
+Runs that were successfully assembled are listed in a CSV file named `assembled_runs.csv`. This file contains the run accession, assembler, and assembler version used.
+
+Example:
+
+```csv
+DRR280712,megahit,1.2.9
+SRR5949318,metaspades,3.15.5
+```
+
 ## Tests
 
 There is a very small test data set ready to use:

diff --git a/modules/nf-core/fastp/fastp.diff b/modules/nf-core/fastp/fastp.diff
diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -46,6 +46,10 @@ params {
     library_layout                   = null
     library_strategy                 = null
 
+    // Reads QC filtering options
+    filter_ratio_threshold           = 0.9
+    low_reads_count_threshold        = 1000
+
     // Reference genome
     reference_genome                 = null
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -155,6 +155,27 @@
                 }
             }
         },
+        "reads_qc": {
+            "title": "Reads QC options",
+            "type": "object",
+            "fa_icon": "fab fa-acquisitions-incorporated",
+            "description": "Set the thresholds for the reads QC/filtering steps. Reads that fail QC won't be assembled.",
+            "help_text": "Use these options to define the quality control thresholds for your reads. You can specify the maximum allowed filtering ratio and the minimum acceptable read count. If the filtering ratio exceeds the set limit or the read count falls below the threshold, the reads will be flagged and excluded from further assembly. Information about the runs that failed QC is aggregated in the qc_failed_runs.csv file.",
+            "properties": {
+                "filter_ratio_threshold": {
+                    "type": "number",
+                    "description": "The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled.",
+                    "default": 0.9,
+                    "minimum": 0.0,
+                    "maximum": 1.0
+                },
+                "low_reads_count_threshold": {
+                    "type": "integer",
+                    "description": "The minimum number of reads required after filtering. If below, it flags a low read count and the run is not assembled.",
+                    "default": 1000
+                }
+            }
+        },
         "max_job_request_options": {
             "title": "Max job request options",
             "type": "object",
@@ -278,6 +299,9 @@
         {
             "$ref": "#/defs/input_output_options"
         },
+        {
+            "$ref": "#/defs/reads_qc"
+        },
         {
             "$ref": "#/defs/max_job_request_options"
         },

diff --git a/subworkflows/local/assembly_coverage.nf b/subworkflows/local/assembly_coverage.nf
@@ -6,14 +6,14 @@ include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../modules/nf-core/met
 workflow ASSEMBLY_COVERAGE {
 
     take:
-    reads_assembly   // [ val(meta), path(reads), path(assembly_fasta) ]
+    assembly_reads   // [ val(meta), path(assembly_fasta), path(reads) ]
 
     main:
 
     ch_versions = Channel.empty()
 
-    reads = reads_assembly.map { meta, reads, _ -> [meta, reads]}
-    assembly = reads_assembly.map { meta, _, assembly -> [meta, assembly]}
+    reads = assembly_reads.map { meta, _, reads -> [meta, reads] }
+    assembly = assembly_reads.map { meta, assembly, _ -> [meta, assembly] }
 
     BWAMEM2_INDEX(
         assembly