From afac990664d3bc66337ac74d807fb8171453c4af Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 18 Jun 2024 15:38:24 +1200 Subject: [PATCH 1/2] Fixed a bug where is_masked was ignored by the pipeline --- CHANGELOG.md | 16 ++++++++ assets/schema_input.json | 3 +- nextflow.config | 2 +- subworkflows/local/prepare_assembly.nf | 56 +++++++++++++------------- workflows/pangene.nf | 5 ++- 5 files changed, 51 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb696d7..5dea5a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 0.3.3 - [18-Jun-2024] + +### `Added` + +### `Fixed` + +1. Fixed a bug where `is_masked` was ignored by the pipeline +2. Fixed a bug in param validation which allowed specification of `braker_hints` without `braker_gff3` + +### `Dependencies` + +1. NextFlow!>=23.04.4 +2. nf-validation=1.1.3 + +### `Deprecated` + ## 0.3.2 - [13-May-2024] ### `Added` diff --git a/assets/schema_input.json b/assets/schema_input.json index d8e0fdd..287b222 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -61,7 +61,8 @@ "maxLength": 0 } ], - "errorMessage": "BRAKER hints GFF/GFF3 file path cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'" + "errorMessage": "BRAKER hints GFF/GFF3 file path cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'", + "dependentRequired": ["braker_gff3"] } }, "required": ["tag", "fasta", "is_masked"] diff --git a/nextflow.config b/nextflow.config index bd4cccc..1b191f6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ manifest { description = """A NextFlow pipeline for pan-genome annotation""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.4' - version = '0.3.2' + version = '0.3.3' doi = '' } diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index f0c9d7f..c3701bb 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -14,6 +14,7 @@ workflow PREPARE_ASSEMBLY { te_library // channel: [ meta, fasta ] repeat_annotator // val(String), 'repeatmodeler' or 'edta' exclude_assemblies // channel: val(assembly_x,assembly_y) + ch_is_masked // channel: [ meta, val(true|false) ] main: ch_versions = Channel.empty() @@ -63,26 +64,35 @@ workflow PREPARE_ASSEMBLY { ch_versions = ch_versions.mix(GUNZIP_TE_LIBRARY.out.versions.first()) // SUBWORKFLOW: FASTA_EDTA_LAI - ch_annotator_inputs = ch_validated_assembly + ch_unmasked_masked_branch = ch_validated_assembly + | combine( exclude_assemblies ) + | map { meta, fasta, ex_assemblies -> + ex_assemblies.tokenize(",").contains( meta.id ) + ? null + : [ meta, fasta ] + } + | join( + ch_is_masked + ) + | branch { meta, fasta, is_masked -> + unmasked: ! is_masked + return [ meta, fasta ] + masked: is_masked + return [ meta, fasta ] + } + + ch_annotator_inputs = ch_unmasked_masked_branch.unmasked | join( ch_gunzip_te_library, remainder: true ) | filter { meta, assembly, teLib -> - teLib == null + teLib == null && ( assembly != null ) } - | map { meta, assembly, teLib -> [meta, assembly] } + | map { meta, assembly, teLib -> [ meta, assembly ] } ch_edta_inputs = repeat_annotator != 'edta' ? Channel.empty() : ch_annotator_inputs - | combine( exclude_assemblies ) - | map { meta, fasta, ex_assemblies -> - def ex_list = ex_assemblies.split(",") - - if ( !( ex_list.contains( meta.id ) ) ) { - [ meta, fasta ] - } - } FASTA_EDTA_LAI( ch_edta_inputs, @@ -96,14 +106,6 @@ workflow PREPARE_ASSEMBLY { ch_repeatmodeler_inputs = repeat_annotator != 'repeatmodeler' ? Channel.empty() : ch_annotator_inputs - | combine( exclude_assemblies ) - | map { meta, fasta, ex_assemblies -> - def ex_list = ex_assemblies.split(",") - - if ( !( ex_list.contains( meta.id ) ) ) { - [ meta, fasta ] - } - } REPEATMODELER_BUILDDATABASE ( ch_repeatmodeler_inputs ) @@ -112,7 +114,7 @@ workflow PREPARE_ASSEMBLY { // MODULE: REPEATMODELER_REPEATMODELER REPEATMODELER_REPEATMODELER ( REPEATMODELER_BUILDDATABASE.out.db ) - ch_assembly_and_te_lib = ch_validated_assembly + ch_assembly_and_te_lib = ch_unmasked_masked_branch.unmasked | join( repeat_annotator == 'edta' ? FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) @@ -123,21 +125,21 @@ workflow PREPARE_ASSEMBLY { // MODULE: REPEATMASKER REPEATMASKER( - ch_assembly_and_te_lib.map { meta, assembly, teLib -> [meta, assembly] }, + ch_assembly_and_te_lib.map { meta, assembly, teLib -> [ meta, assembly ] }, ch_assembly_and_te_lib.map { meta, assembly, teLib -> teLib }, ) + ch_masked_assembly = ch_unmasked_masked_branch.masked + | mix(REPEATMASKER.out.fasta_masked) ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) // MODULE: STAR_GENOMEGENERATE ch_genomegenerate_inputs = ch_validated_assembly | combine( exclude_assemblies ) | map { meta, fasta, ex_assemblies -> - def ex_list = ex_assemblies.split(",") - - if ( !( ex_list.contains( meta.id ) ) ) { - [ meta, fasta ] - } + ex_assemblies.tokenize(",").contains( meta.id ) + ? null + : [ meta, fasta ] } @@ -151,7 +153,7 @@ workflow PREPARE_ASSEMBLY { emit: target_assemby = ch_validated_assembly // channel: [ meta, fasta ] - masked_target_assembly = REPEATMASKER.out.fasta_masked // channel: [ meta, fasta ] + masked_target_assembly = ch_masked_assembly // channel: [ meta, fasta ] target_assemby_index = ch_assembly_index // channel: [ meta, star_index ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/pangene.nf b/workflows/pangene.nf index c0f3c32..b11a014 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -42,7 +42,7 @@ workflow PANGENE { it.join(",") } - ch_masked = ch_input + ch_is_masked = ch_input | map { it -> def tag = it[0] def is_masked = it[2] @@ -144,7 +144,8 @@ workflow PANGENE { ch_target_assembly, ch_te_library, params.repeat_annotator, - ch_braker_ex_asm_str + ch_braker_ex_asm_str, + ch_is_masked ) ch_valid_target_assembly = PREPARE_ASSEMBLY.out.target_assemby From 1f957a2b271517628602700a3b07c166747c2b20 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 18 Jun 2024 15:43:51 +1200 Subject: [PATCH 2/2] Added a stub test to evaluate the case where an assembly is soft masked but has no annotations --- CHANGELOG.md | 2 ++ tests/stub/assemblysheet.csv | 1 + tests/stub/target/red5_v3_chr1.fasta | 0 3 files changed, 3 insertions(+) create mode 100644 tests/stub/target/red5_v3_chr1.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index 5dea5a6..be03a5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +1. Added a stub test to evaluate the case where an assembly is soft masked but has no annotations + ### `Fixed` 1. Fixed a bug where `is_masked` was ignored by the pipeline diff --git a/tests/stub/assemblysheet.csv b/tests/stub/assemblysheet.csv index cfa0cdb..7fdff29 100644 --- a/tests/stub/assemblysheet.csv +++ b/tests/stub/assemblysheet.csv @@ -1,3 +1,4 @@ tag,fasta,is_masked,te_lib,braker_gff3,braker_hints red5_v2p1,tests/stub/target/red5_v2p1_chr1.fasta.gz,no,,tests/stub/braker/red5_v2p1.gff3.gz,tests/stub/braker/red5_v2p1.hints.gff.gz donghong,tests/stub/target/donghong.chr1.fsa.gz,no,tests/stub/te_lib/donghong.TElib.fa.gz,tests/stub/braker/red5_v2p1.gff3.gz,tests/stub/braker/red5_v2p1.hints.gff.gz +red5_v3,tests/stub/target/red5_v3_chr1.fasta,yes diff --git a/tests/stub/target/red5_v3_chr1.fasta b/tests/stub/target/red5_v3_chr1.fasta new file mode 100644 index 0000000..e69de29