Merge branch 'dorado_080' into 'dev'

Template update and Dorado v0.8.0 See merge request epi2melabs/workflows/wf-basecalling!91
epi2me-labs · Sep 17, 2024 · 5d5b44d · 5d5b44d
2 parents 0c054d2 + ecebfeb
commit 5d5b44d
Show file tree

Hide file tree

Showing 12 changed files with 117 additions and 46 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
         always_run: true
         pass_filenames: false
         additional_dependencies:
-          - epi2melabs==0.0.56
+          - epi2melabs==0.0.57
       - id: build_models
         name: build_models
         entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,13 +4,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [unreleased]
+## [v1.4.0]
 ### Added
 - IGV configuration file with `--ref --igv` options and either `--output_fmt bam` or `--output_fmt cram`.
 - Support for gzipped reference genomes.
 - `output_fmt` selects the output format for basecalled and aligned files.
 ### Changed
-- Reconciled workflow with wf-template v5.2.3.
+- Updated Dorado to [v0.8.0](https://github.com/nanoporetech/dorado/releases/tag/v0.8.0)
+- Reconciled workflow with wf-template v5.2.6.
 - Do not emit the reference FASTA file.
 - Collapse redundant RG and PG header lines when emitting BAM or CRAM.
 ### Fixed

diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ therefore Nextflow will need to be
 installed before attempting to run the workflow.
 
 The workflow can currently be run using either
-[Docker](https://www.docker.com/products/docker-desktop
+[Docker](https://www.docker.com/products/docker-desktop)
 or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html)
 to provide isolation of the required software.
 Both methods are automated out-of-the-box provided

diff --git a/bin/report.py b/bin/report.py
@@ -3,6 +3,7 @@
 import argparse
 import json
 
+from bokeh.models import Title
 from dominate.tags import p
 import ezcharts as ezc
 from ezcharts.components.ezchart import EZChart
@@ -11,7 +12,7 @@
 from ezcharts.layout.snippets import Grid
 from ezcharts.layout.snippets import Tabs
 import pandas as pd
-import report_utils
+from report_utils import read_length_plot, read_quality_plot
 from ezcharts.util import get_named_logger  # noqa: ABS101
 
 
@@ -42,16 +43,21 @@ def main(args):
                             continue
                         with Grid(columns=2):
                             EZChart(
-                                report_utils.read_quality_plot(data), THEME)
-                            EZChart(report_utils.read_length_plot(data), THEME)
+                                read_quality_plot(data), THEME)
+                            EZChart(read_length_plot(data), THEME)
                 with tabs.add_tab('total'):
                     with Grid(columns=1):  # total read counts per sample
                         df_stats = pd.DataFrame.from_dict(total_reads.items())
                         df_stats.columns = ['Sample_name', 'Number of reads']
                         plt = ezc.barplot(
                             data=df_stats, x='Sample_name', y='Number of reads')
-                        plt.title = {"text": "Number of reads per sample."}
-                        plt.tooltip = {'trigger': 'axis'}
+                        plt._fig.add_layout(
+                            Title(
+                                text="Number of reads per sample.",
+                                text_font_size="1.5em"
+                            ),
+                            'above'
+                        )
                         EZChart(plt, THEME)
 
     # If pairing rates are provided, show them.

diff --git a/bin/report_utils.py b/bin/report_utils.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 """Create tables for the report."""
+from bokeh.models import Title
 from ezcharts.plots.distribution import histplot
 import pandas as pd
 
@@ -21,10 +22,14 @@ def read_quality_plot(seq_summary, min_qual=4, max_qual=30, title='Read quality'
         bins=len(df),
         weights=list(df['counts'])
         )
-    plt.title = dict(text=title)
-    plt.xAxis.name = 'Quality score'
-    plt.xAxis.min, plt.xAxis.max = min_qual, max_qual
-    plt.yAxis.name = 'Number of reads'
+    plt._fig.add_layout(
+        Title(text=title, text_font_size="1.5em"),
+        'above'
+    )
+    plt._fig.xaxis.axis_label = "Quality score"
+    plt._fig.yaxis.axis_label = "Number of reads"
+    plt._fig.x_range.start = min_qual
+    plt._fig.x_range.end = max_qual
     return plt
 
 
@@ -38,7 +43,11 @@ def read_length_plot(seq_summary, title='Read length'):
         data=df['read_length'],
         bins=len(df),
         weights=list(df['counts']))
-    plt.title = dict(text=title)
-    plt.xAxis.name = 'Read length / kb'
-    plt.yAxis.name = 'Number of reads'
+    plt._fig.add_layout(
+        Title(text=title, text_font_size="1.5em"),
+        'above'
+    )
+    plt._fig.x_range.start = 0
+    plt._fig.xaxis.axis_label = "Read length / kb"
+    plt._fig.yaxis.axis_label = "Number of reads"
     return plt
diff --git a/bin/workflow_glue/check_bam_headers_in_dir.py b/bin/workflow_glue/check_bam_headers_in_dir.py
@@ -29,7 +29,13 @@ def main(args):
     for xam_file in target_files:
         # get the `@SQ` and `@HD` lines in the header
         with pysam.AlignmentFile(xam_file, check_sq=False) as f:
-            sq_lines = f.header.get("SQ")
+            # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with
+            # same reference but different SQ.UR as mixed_header (see CW-4842)
+            sq_lines = [{
+                "SN": sq["SN"],
+                "LN": sq["LN"],
+                "M5": sq.get("M5"),
+            } for sq in f.header.get("SQ", [])]
             hd_lines = f.header.get("HD")
         # Check if it is sorted.
         # When there is more than one BAM, merging/sorting

diff --git a/bin/workflow_glue/check_xam_index.py b/bin/workflow_glue/check_xam_index.py
@@ -14,12 +14,12 @@ def validate_xam_index(xam_file):
     Invalid indexes will fail the call with a ValueError:
     ValueError: fetch called on bamfile without index
     """
-    alignments = pysam.AlignmentFile(xam_file, check_sq=False)
-    try:
-        alignments.fetch()
-        has_valid_index = True
-    except ValueError:
-        has_valid_index = False
+    with pysam.AlignmentFile(xam_file, check_sq=False) as alignments:
+        try:
+            alignments.fetch()
+            has_valid_index = True
+        except ValueError:
+            has_valid_index = False
     return has_valid_index
 
 

diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md
@@ -9,7 +9,7 @@ therefore Nextflow will need to be
 installed before attempting to run the workflow.
 
 The workflow can currently be run using either
-[Docker](https://www.docker.com/products/docker-desktop
+[Docker](https://www.docker.com/products/docker-desktop)
 or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html)
 to provide isolation of the required software.
 Both methods are automated out-of-the-box provided

diff --git a/lib/common.nf b/lib/common.nf
@@ -15,6 +15,7 @@ process getParams {
 }
 
 process configure_igv {
+    publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv
     label "wf_common"
     cpus 1
     memory "2 GB"

diff --git a/lib/ingress.nf b/lib/ingress.nf
@@ -197,15 +197,15 @@ def fastq_ingress(Map arguments)
         .map { meta, files, stats ->
             // new `arity: '1..*'` would be nice here
             files = files instanceof List ? files : [files]
-            new_keys = [
+            def new_keys = [
                 "group_key": groupKey(meta["alias"], files.size()),
                 "n_fastq": files.size()]
-            grp_index = (0..<files.size()).collect()
+            def grp_index = (0..<files.size()).collect()
             [meta + new_keys, files, grp_index, stats]
         }
         .transpose(by: [1, 2])  // spread multiple fastq files into separate emissions
         .map { meta, files, grp_i, stats ->
-            new_keys = [
+            def new_keys = [
                 "group_index": "${meta["alias"]}_${grp_i}"]
             [meta + new_keys, files, stats]
         }
@@ -279,17 +279,19 @@ def xam_ingress(Map arguments)
         // sorted, the index will be used.
         meta, paths -> 
         boolean is_array = paths instanceof ArrayList
-        String xai_fn
+        String src_xam
+        String src_xai
         // Using `.uri` or `.Uri()` leads to S3 paths to be prefixed with `s3:///`
         // instead of `s3://`, causing the workflow to not find the index file.
         // `.toUriString()` returns the correct path.
         if (!is_array){
+            src_xam = paths.toUriString()
             def xai = file(paths.toUriString() + ".bai")
             if (xai.exists()){
-                xai_fn = xai.toUriString()
+                src_xai = xai.toUriString()
             }
         }
-        [meta + [xai_fn: xai_fn], paths]
+        [meta + [src_xam: src_xam, src_xai: src_xai], paths]
     }
     | checkBamHeaders
     | map { meta, paths, is_unaligned_env, mixed_headers_env, is_sorted_env ->
@@ -331,9 +333,9 @@ def xam_ingress(Map arguments)
         //  - between 1 and `N_OPEN_FILES_LIMIT` aligned files
         no_files: n_files == 0
         indexed: \
-            n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["xai_fn"]
-        to_index: 
-            n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["xai_fn"]
+            n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["src_xai"]
+        to_index: \
+            n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["src_xai"]
         to_catsort: \
             (n_files == 1) || (n_files > N_OPEN_FILES_LIMIT) || meta["is_unaligned"]
         to_merge: true
@@ -358,20 +360,20 @@ def xam_ingress(Map arguments)
             .map { meta, files, stats -> 
                 // new `arity: '1..*'` would be nice here
                 files = files instanceof List ? files : [files]
-                new_keys = [
+                def new_keys = [
                     "group_key": groupKey(meta["alias"], files.size()),
                     "n_fastq": files.size()]
-                grp_index = (0..<files.size()).collect()
+                def grp_index = (0..<files.size()).collect()
                 [meta + new_keys, files, grp_index, stats]
             }
             .transpose(by: [1, 2])  // spread multiple fastq files into separate emissions
             .map { meta, files, grp_i, stats ->
-                new_keys = [
+                def new_keys = [
                     "group_index": "${meta["alias"]}_${grp_i}"]
                 [meta + new_keys, files, stats]
             }
             .map { meta, path, stats ->
-                [meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, path, stats]
+                [meta.findAll { it.key !in ['is_sorted', 'src_xam', 'src_xai'] }, path, stats]
             }
 
         // add number of reads, run IDs, and basecall models to meta
@@ -388,18 +390,26 @@ def xam_ingress(Map arguments)
     | sortBam
     | groupTuple
     | mergeBams
+    | map{
+        meta, bam, bai ->
+        [meta + [src_xam: null, src_xai: null], bam, bai]
+    }
 
     // now handle samples with too many files for `samtools merge`
     ch_catsorted = ch_result.to_catsort
     | catSortBams
+    | map{
+        meta, bam, bai ->
+        [meta + [src_xam: null, src_xai: null], bam, bai]
+    }
 
     // Validate the index of the input BAM.
     // If the input BAM index is invalid, regenerate it.
     // First separate the BAM from the null input channels.
     ch_to_validate = ch_result.indexed
     | map{
         meta, paths ->
-        bai = paths && meta.xai_fn ? file(meta.xai_fn) : null
+        def bai = paths && meta.src_xai ? file(meta.src_xai) : null
         [meta, paths, bai]
     }
     | branch {
@@ -429,6 +439,10 @@ def xam_ingress(Map arguments)
     ch_indexed = ch_result.to_index
     | mix( ch_validated.invalid_idx )
     | samtools_index
+    | map{
+        meta, bam, bai ->
+        [meta + [src_xai: null], bam, bai]
+    }
 
     // Add extra null for the missing index to input.missing
     // as well as the missing metadata.
@@ -439,7 +453,7 @@ def xam_ingress(Map arguments)
     )
     | map{
         meta, paths ->
-        [meta + [xai_fn: null, is_sorted: false], paths, null]
+        [meta + [src_xam: null, src_xai: null, is_sorted: false], paths, null]
     }
 
     // Combine all possible inputs
@@ -480,7 +494,7 @@ def xam_ingress(Map arguments)
     }
 
     // Remove metadata that are unnecessary downstream:
-    // meta.xai_fn: not needed, as it will be part of the channel as a file
+    // meta.src_xai: not needed, as it will be part of the channel as a file
     // meta.is_sorted: if data are aligned, they will also be sorted/indexed
     //
     // The output meta can contain the following flags:
@@ -498,7 +512,7 @@ def xam_ingress(Map arguments)
         ch_result
             | map{
                 meta, bam, bai, stats ->
-                [meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, [bam, bai], stats]
+                [meta.findAll { it.key !in ['is_sorted'] }, [bam, bai], stats]
             }, 
         "xam"
     )
@@ -508,6 +522,19 @@ def xam_ingress(Map arguments)
     | map{
         it.flatten()
     }
+    // Final check to ensure that src_xam/src_xai is not an s3
+    // path. If so, drop it. We check src_xam also for src_xai
+    // as, the latter is irrelevant if the former is in s3.
+    | map{
+        meta, bam, bai, stats ->
+        def xam = meta.src_xam
+        def xai = meta.src_xai
+        if (meta.src_xam){
+            xam = meta.src_xam.startsWith('s3://') ? null : meta.src_xam
+            xai = meta.src_xam.startsWith('s3://') ? null : meta.src_xai
+        }
+        [ meta + [src_xam: xam, src_xai: xai], bam, bai, stats ]
+    }
 
     return ch_result
 }

diff --git a/nextflow.config b/nextflow.config
@@ -66,10 +66,10 @@ params {
 
     wf {
         basecaller_container = "ontresearch/dorado"
-        container_sha = "sha58b978562389bd0f1842601fb83cdf1eb2920218"
+        container_sha = "shaa2ceb44eb92c08f9a3a53f97077904d7e23e28ec"
         bonito_container = "ontresearch/bonito"
         bonito_sha = "shaea43ca2333f91fa78a823f640ba158e4268f1f98"
-        common_sha = "shad399cf22079b5b153920ac39ee40095a677933f1"
+        common_sha = "shad28e55140f75a68f59bbecc74e880aeab16ab158"
         example_cmd = [
             "--basecaller_cfg '[email protected]'",
             "--dorado_ext 'pod5'",
@@ -88,7 +88,7 @@ manifest {
     description     = 'Helper workflow for basecalling ONT reads.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = '1.3.0'
+    version         = '1.4.0'
 }
 
 epi2melabs {