Skip to content

Commit

Permalink
Merge branch 'dorado_080' into 'dev'
Browse files Browse the repository at this point in the history
Template update and Dorado v0.8.0

See merge request epi2melabs/workflows/wf-basecalling!91
  • Loading branch information
RenzoTale88 committed Sep 17, 2024
2 parents 0c054d2 + ecebfeb commit 5d5b44d
Show file tree
Hide file tree
Showing 12 changed files with 117 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
always_run: true
pass_filenames: false
additional_dependencies:
- epi2melabs==0.0.56
- epi2melabs==0.0.57
- id: build_models
name: build_models
entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py
Expand Down
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]
## [v1.4.0]
### Added
- IGV configuration file with `--ref --igv` options and either `--output_fmt bam` or `--output_fmt cram`.
- Support for gzipped reference genomes.
- `output_fmt` selects the output format for basecalled and aligned files.
### Changed
- Reconciled workflow with wf-template v5.2.3.
- Updated Dorado to [v0.8.0](https://github.com/nanoporetech/dorado/releases/tag/v0.8.0)
- Reconciled workflow with wf-template v5.2.6.
- Do not emit the reference FASTA file.
- Collapse redundant RG and PG header lines when emitting BAM or CRAM.
### Fixed
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ therefore Nextflow will need to be
installed before attempting to run the workflow.

The workflow can currently be run using either
[Docker](https://www.docker.com/products/docker-desktop
[Docker](https://www.docker.com/products/docker-desktop)
or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html)
to provide isolation of the required software.
Both methods are automated out-of-the-box provided
Expand Down
16 changes: 11 additions & 5 deletions bin/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import json

from bokeh.models import Title
from dominate.tags import p
import ezcharts as ezc
from ezcharts.components.ezchart import EZChart
Expand All @@ -11,7 +12,7 @@
from ezcharts.layout.snippets import Grid
from ezcharts.layout.snippets import Tabs
import pandas as pd
import report_utils
from report_utils import read_length_plot, read_quality_plot
from ezcharts.util import get_named_logger # noqa: ABS101


Expand Down Expand Up @@ -42,16 +43,21 @@ def main(args):
continue
with Grid(columns=2):
EZChart(
report_utils.read_quality_plot(data), THEME)
EZChart(report_utils.read_length_plot(data), THEME)
read_quality_plot(data), THEME)
EZChart(read_length_plot(data), THEME)
with tabs.add_tab('total'):
with Grid(columns=1): # total read counts per sample
df_stats = pd.DataFrame.from_dict(total_reads.items())
df_stats.columns = ['Sample_name', 'Number of reads']
plt = ezc.barplot(
data=df_stats, x='Sample_name', y='Number of reads')
plt.title = {"text": "Number of reads per sample."}
plt.tooltip = {'trigger': 'axis'}
plt._fig.add_layout(
Title(
text="Number of reads per sample.",
text_font_size="1.5em"
),
'above'
)
EZChart(plt, THEME)

# If pairing rates are provided, show them.
Expand Down
23 changes: 16 additions & 7 deletions bin/report_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
"""Create tables for the report."""
from bokeh.models import Title
from ezcharts.plots.distribution import histplot
import pandas as pd

Expand All @@ -21,10 +22,14 @@ def read_quality_plot(seq_summary, min_qual=4, max_qual=30, title='Read quality'
bins=len(df),
weights=list(df['counts'])
)
plt.title = dict(text=title)
plt.xAxis.name = 'Quality score'
plt.xAxis.min, plt.xAxis.max = min_qual, max_qual
plt.yAxis.name = 'Number of reads'
plt._fig.add_layout(
Title(text=title, text_font_size="1.5em"),
'above'
)
plt._fig.xaxis.axis_label = "Quality score"
plt._fig.yaxis.axis_label = "Number of reads"
plt._fig.x_range.start = min_qual
plt._fig.x_range.end = max_qual
return plt


Expand All @@ -38,7 +43,11 @@ def read_length_plot(seq_summary, title='Read length'):
data=df['read_length'],
bins=len(df),
weights=list(df['counts']))
plt.title = dict(text=title)
plt.xAxis.name = 'Read length / kb'
plt.yAxis.name = 'Number of reads'
plt._fig.add_layout(
Title(text=title, text_font_size="1.5em"),
'above'
)
plt._fig.x_range.start = 0
plt._fig.xaxis.axis_label = "Read length / kb"
plt._fig.yaxis.axis_label = "Number of reads"
return plt
8 changes: 7 additions & 1 deletion bin/workflow_glue/check_bam_headers_in_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ def main(args):
for xam_file in target_files:
# get the `@SQ` and `@HD` lines in the header
with pysam.AlignmentFile(xam_file, check_sq=False) as f:
sq_lines = f.header.get("SQ")
# compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with
# same reference but different SQ.UR as mixed_header (see CW-4842)
sq_lines = [{
"SN": sq["SN"],
"LN": sq["LN"],
"M5": sq.get("M5"),
} for sq in f.header.get("SQ", [])]
hd_lines = f.header.get("HD")
# Check if it is sorted.
# When there is more than one BAM, merging/sorting
Expand Down
12 changes: 6 additions & 6 deletions bin/workflow_glue/check_xam_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def validate_xam_index(xam_file):
Invalid indexes will fail the call with a ValueError:
ValueError: fetch called on bamfile without index
"""
alignments = pysam.AlignmentFile(xam_file, check_sq=False)
try:
alignments.fetch()
has_valid_index = True
except ValueError:
has_valid_index = False
with pysam.AlignmentFile(xam_file, check_sq=False) as alignments:
try:
alignments.fetch()
has_valid_index = True
except ValueError:
has_valid_index = False
return has_valid_index


Expand Down
2 changes: 1 addition & 1 deletion docs/04_install_and_run.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ therefore Nextflow will need to be
installed before attempting to run the workflow.

The workflow can currently be run using either
[Docker](https://www.docker.com/products/docker-desktop
[Docker](https://www.docker.com/products/docker-desktop)
or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html)
to provide isolation of the required software.
Both methods are automated out-of-the-box provided
Expand Down
1 change: 1 addition & 0 deletions lib/common.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ process getParams {
}

process configure_igv {
publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv
label "wf_common"
cpus 1
memory "2 GB"
Expand Down
61 changes: 44 additions & 17 deletions lib/ingress.nf
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,15 @@ def fastq_ingress(Map arguments)
.map { meta, files, stats ->
// new `arity: '1..*'` would be nice here
files = files instanceof List ? files : [files]
new_keys = [
def new_keys = [
"group_key": groupKey(meta["alias"], files.size()),
"n_fastq": files.size()]
grp_index = (0..<files.size()).collect()
def grp_index = (0..<files.size()).collect()
[meta + new_keys, files, grp_index, stats]
}
.transpose(by: [1, 2]) // spread multiple fastq files into separate emissions
.map { meta, files, grp_i, stats ->
new_keys = [
def new_keys = [
"group_index": "${meta["alias"]}_${grp_i}"]
[meta + new_keys, files, stats]
}
Expand Down Expand Up @@ -279,17 +279,19 @@ def xam_ingress(Map arguments)
// sorted, the index will be used.
meta, paths ->
boolean is_array = paths instanceof ArrayList
String xai_fn
String src_xam
String src_xai
// Using `.uri` or `.Uri()` leads to S3 paths to be prefixed with `s3:///`
// instead of `s3://`, causing the workflow to not find the index file.
// `.toUriString()` returns the correct path.
if (!is_array){
src_xam = paths.toUriString()
def xai = file(paths.toUriString() + ".bai")
if (xai.exists()){
xai_fn = xai.toUriString()
src_xai = xai.toUriString()
}
}
[meta + [xai_fn: xai_fn], paths]
[meta + [src_xam: src_xam, src_xai: src_xai], paths]
}
| checkBamHeaders
| map { meta, paths, is_unaligned_env, mixed_headers_env, is_sorted_env ->
Expand Down Expand Up @@ -331,9 +333,9 @@ def xam_ingress(Map arguments)
// - between 1 and `N_OPEN_FILES_LIMIT` aligned files
no_files: n_files == 0
indexed: \
n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["xai_fn"]
to_index:
n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["xai_fn"]
n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["src_xai"]
to_index: \
n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["src_xai"]
to_catsort: \
(n_files == 1) || (n_files > N_OPEN_FILES_LIMIT) || meta["is_unaligned"]
to_merge: true
Expand All @@ -358,20 +360,20 @@ def xam_ingress(Map arguments)
.map { meta, files, stats ->
// new `arity: '1..*'` would be nice here
files = files instanceof List ? files : [files]
new_keys = [
def new_keys = [
"group_key": groupKey(meta["alias"], files.size()),
"n_fastq": files.size()]
grp_index = (0..<files.size()).collect()
def grp_index = (0..<files.size()).collect()
[meta + new_keys, files, grp_index, stats]
}
.transpose(by: [1, 2]) // spread multiple fastq files into separate emissions
.map { meta, files, grp_i, stats ->
new_keys = [
def new_keys = [
"group_index": "${meta["alias"]}_${grp_i}"]
[meta + new_keys, files, stats]
}
.map { meta, path, stats ->
[meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, path, stats]
[meta.findAll { it.key !in ['is_sorted', 'src_xam', 'src_xai'] }, path, stats]
}

// add number of reads, run IDs, and basecall models to meta
Expand All @@ -388,18 +390,26 @@ def xam_ingress(Map arguments)
| sortBam
| groupTuple
| mergeBams
| map{
meta, bam, bai ->
[meta + [src_xam: null, src_xai: null], bam, bai]
}

// now handle samples with too many files for `samtools merge`
ch_catsorted = ch_result.to_catsort
| catSortBams
| map{
meta, bam, bai ->
[meta + [src_xam: null, src_xai: null], bam, bai]
}

// Validate the index of the input BAM.
// If the input BAM index is invalid, regenerate it.
// First separate the BAM from the null input channels.
ch_to_validate = ch_result.indexed
| map{
meta, paths ->
bai = paths && meta.xai_fn ? file(meta.xai_fn) : null
def bai = paths && meta.src_xai ? file(meta.src_xai) : null
[meta, paths, bai]
}
| branch {
Expand Down Expand Up @@ -429,6 +439,10 @@ def xam_ingress(Map arguments)
ch_indexed = ch_result.to_index
| mix( ch_validated.invalid_idx )
| samtools_index
| map{
meta, bam, bai ->
[meta + [src_xai: null], bam, bai]
}

// Add extra null for the missing index to input.missing
// as well as the missing metadata.
Expand All @@ -439,7 +453,7 @@ def xam_ingress(Map arguments)
)
| map{
meta, paths ->
[meta + [xai_fn: null, is_sorted: false], paths, null]
[meta + [src_xam: null, src_xai: null, is_sorted: false], paths, null]
}

// Combine all possible inputs
Expand Down Expand Up @@ -480,7 +494,7 @@ def xam_ingress(Map arguments)
}

// Remove metadata that are unnecessary downstream:
// meta.xai_fn: not needed, as it will be part of the channel as a file
// meta.src_xai: not needed, as it will be part of the channel as a file
// meta.is_sorted: if data are aligned, they will also be sorted/indexed
//
// The output meta can contain the following flags:
Expand All @@ -498,7 +512,7 @@ def xam_ingress(Map arguments)
ch_result
| map{
meta, bam, bai, stats ->
[meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, [bam, bai], stats]
[meta.findAll { it.key !in ['is_sorted'] }, [bam, bai], stats]
},
"xam"
)
Expand All @@ -508,6 +522,19 @@ def xam_ingress(Map arguments)
| map{
it.flatten()
}
// Final check to ensure that src_xam/src_xai is not an s3
// path. If so, drop it. We check src_xam also for src_xai
// as, the latter is irrelevant if the former is in s3.
| map{
meta, bam, bai, stats ->
def xam = meta.src_xam
def xai = meta.src_xai
if (meta.src_xam){
xam = meta.src_xam.startsWith('s3://') ? null : meta.src_xam
xai = meta.src_xam.startsWith('s3://') ? null : meta.src_xai
}
[ meta + [src_xam: xam, src_xai: xai], bam, bai, stats ]
}

return ch_result
}
Expand Down
6 changes: 3 additions & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ params {

wf {
basecaller_container = "ontresearch/dorado"
container_sha = "sha58b978562389bd0f1842601fb83cdf1eb2920218"
container_sha = "shaa2ceb44eb92c08f9a3a53f97077904d7e23e28ec"
bonito_container = "ontresearch/bonito"
bonito_sha = "shaea43ca2333f91fa78a823f640ba158e4268f1f98"
common_sha = "shad399cf22079b5b153920ac39ee40095a677933f1"
common_sha = "shad28e55140f75a68f59bbecc74e880aeab16ab158"
example_cmd = [
"--basecaller_cfg '[email protected]'",
"--dorado_ext 'pod5'",
Expand All @@ -88,7 +88,7 @@ manifest {
description = 'Helper workflow for basecalling ONT reads.'
mainScript = 'main.nf'
nextflowVersion = '>=23.04.2'
version = '1.3.0'
version = '1.4.0'
}

epi2melabs {
Expand Down
Loading

0 comments on commit 5d5b44d

Please sign in to comment.