Skip to content

Commit

Permalink
Merge pull request #143 from LouisLeNezet/fix_checkchr
Browse files Browse the repository at this point in the history
Fix checkchr
  • Loading branch information
atrigila authored Oct 29, 2024
2 parents e351bb3 + abd697f commit 262290e
Show file tree
Hide file tree
Showing 16 changed files with 120 additions and 172 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
- [#118](https://github.com/nf-core/phaseimpute/pull/118) - Explain how to customize arguments in the pipeline.
- [#111](https://github.com/nf-core/phaseimpute/pull/111) - Add nf-test for all sbwf, wf, modules and functions.
- [#131](https://github.com/nf-core/phaseimpute/pull/131) - Set normalisation as optional. Fix extension detection function. Add support for validation with vcf files. Concatenate vcf only if more than one file. Change `--phased` to `--phase` for consistency.
- [#143](https://github.com/nf-core/phaseimpute/pull/143) - Improve contig warning and error logging. The list of contig names printed in a message is truncated when it contains more than `max_chr_names` entries.

### `Changed`

Expand Down
6 changes: 3 additions & 3 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,17 @@
"nf-core": {
"utils_nextflow_pipeline": {
"branch": "master",
"git_sha": "9d05360da397692321d377b6102d2fb22507c6ef",
"git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
"installed_by": ["subworkflows"]
},
"utils_nfcore_pipeline": {
"branch": "master",
"git_sha": "772684d9d66f37b650c8ba5146ac1ee3ecba2acb",
"git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
"installed_by": ["subworkflows"]
},
"utils_nfschema_plugin": {
"branch": "master",
"git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
"git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
"installed_by": ["subworkflows"]
}
}
Expand Down
3 changes: 1 addition & 2 deletions modules/local/add_columns/environment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
name: gawk
channels:
- conda-forge
- bioconda
dependencies:
- anaconda::gawk=5.3.0
- conda-forge::gawk=5.3.0
6 changes: 2 additions & 4 deletions modules/local/bam_chr_extract/environment.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
name: bam_chr_extract
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::samtools=1.20
- bioconda::htslib=1.20
- bioconda::htslib=1.21
- bioconda::samtools=1.21
3 changes: 1 addition & 2 deletions modules/local/list_to_file/environment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
name: gawk
channels:
- conda-forge
- bioconda
dependencies:
- anaconda::gawk=5.3.0
- conda-forge::gawk=5.3.0
2 changes: 0 additions & 2 deletions modules/local/vcf_chr_extract/environment.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
name: vcf_chr_extract
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::bcftools=1.20
5 changes: 4 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@ params {
phase = false
normalize = true
compute_freq = false
rename_chr = false
remove_samples = null

// ChrCheck parameters
rename_chr = false
max_chr_names = 4

// References
genome = null
igenomes_base = 's3://ngi-igenomes/igenomes/'
Expand Down
6 changes: 6 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@
"description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')",
"pattern": "true|false"
},
"max_chr_names": {
"type": "integer",
"description": "Maximum number of contig names to print before truncating the list (i.e. show only a subset and add '...' at the end).",
"hidden": true,
"default": 4
},
"remove_samples": {
"type": "string",
"description": "Comma-separated list of samples to remove from the reference panel. Useful for benchmarking purposes."
Expand Down
3 changes: 2 additions & 1 deletion subworkflows/local/utils_nfcore_chrcheck_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def diffChr(chr_target, chr_ref, file) {
}
new_diff = diff - new_chr
if (new_diff.size() != 0) {
error "Contig names: ${new_diff} absent from file: ${file} and cannot be solved by adding or removing the `chr` prefix."
chr_names = new_diff.size() > params.max_chr_names ? new_diff[0..params.max_chr_names - 1] + ['...'] : new_diff
error "Contig names: ${chr_names} absent from file: ${file} and cannot be solved by adding or removing the `chr` prefix."
}
diff = to_rename
}
Expand Down
15 changes: 13 additions & 2 deletions subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -258,14 +258,23 @@ workflow PIPELINE_INITIALISATION {
chr_all_mis = chr_ref_mis.concat(chr_chunks_mis, chr_map_mis, chr_panel_mis, chr_posfile_mis)
.unique()
.toList()
.subscribe{ chr -> if (chr.size() > 0) { log.warn "The following contigs are absent from at least one file : ${chr} and therefore won't be used" } }
.subscribe{ chr ->
if (chr.size() > 0) {
chr_names = chr.size() > params.max_chr_names ? chr[0..params.max_chr_names - 1] + ['...'] : chr
log.warn "The following contigs are absent from at least one file : ${chr_names} and therefore won't be used" } }

ch_regions = ch_regions
.combine(chr_all_mis.toList())
.filter { meta, regions, chr_mis ->
!(meta.chr in chr_mis)
}
.map { meta, regions, chr_mis -> [meta, regions] }
.ifEmpty { error "No regions left to process" }

ch_regions
.map { it[1] }
.collect()
.subscribe { log.info "The following contigs will be processed: ${it}" }

// Check that all input files have the correct index
checkFileIndex(ch_input.mix(ch_input_truth, ch_ref_gen, ch_panel))
Expand Down Expand Up @@ -469,7 +478,9 @@ def checkMetaChr(chr_a, chr_b, name){
.map{
a, b ->
if (b != [[]] && !(a - b).isEmpty()) {
log.warn "Chr : ${a - b} is missing from ${name}"
chr_names = (a - b).size() > params.max_chr_names ? (a - b)[0..params.max_chr_names - 1] + ['...'] : (a - b)
verb = (a - b).size() == 1 ? "is" : "are"
log.warn "Chr : ${chr_names} ${verb} missing from ${name}"
return (a-b)
}
return []
Expand Down
4 changes: 2 additions & 2 deletions subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions workflows/chrcheck/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,16 @@ workflow CHRCHECK {
ch_versions = Channel.empty()
// Split the input between VCF and BAM files
ch_input = ch_input.branch{
bam: it[1] =~ 'bam|cram|sam'
bam: it[1] =~ 'bam|cram'
vcf: it[1] =~ 'vcf|bcf'
other: it[1].size() > 0
empty: true
}

ch_input.other.map {
error "File: ${it[1]} is not a VCF, BCFT or BAM, CRAM file."
}

// Check if channel is empty
chr_vcf_disjoint = Channel.empty()
// Extract the contig names from the VCF files
Expand Down Expand Up @@ -51,10 +58,12 @@ workflow CHRCHECK {
ch_vcf_renamed = VCF_CHR_RENAME_BCFTOOLS.out.vcf_renamed
} else {
chr_vcf_disjoint.to_rename.map {
error "Contig names: ${it[3]} in VCF: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
chr_names = it[3].size() > params.max_chr_names ? it[3][0..params.max_chr_names - 1] + ['...'] : it[3]
error "Contig names: ${chr_names} in VCF: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
}
chr_bam_disjoint.to_rename.map {
error "Contig names: ${it[3]} in BAM: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
chr_names = it[3].size() > params.max_chr_names ? it[3][0..params.max_chr_names - 1] + ['...'] : it[3]
error "Contig names: ${chr_names} in BAM: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
}
ch_vcf_renamed = Channel.empty()
ch_bam_renamed = Channel.empty()
Expand Down
77 changes: 45 additions & 32 deletions workflows/chrcheck/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ nextflow_workflow {
input[0] = Channel.fromList([
[
[id: "VCF_chr22"],
file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'],checkIfExist:true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz_tbi'],checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz',checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi',checkIfExist:true),
["22"]
],
[
[id: "BAM_chr22"],
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
["22"]
]
])
Expand All @@ -44,21 +44,28 @@ nextflow_workflow {
then {
assertAll(
{ assert workflow.success },
{ assert snapshot(workflow.out).match() },
{ assert snapshot(workflow.out.output.collect{
if (it[1].endsWith("vcf.gz")) {
path(it[1]).vcf.summary
} else {
bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
}
}).match("headernochr")
{ assert snapshot(workflow.out.output
.collect{
if (it[1].endsWith("vcf.gz")) {
path(it[1]).vcf.summary
} else {
bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
}
},
workflow.out.output.collect{
file(it[1]).getName()
file(it[2]).getName()
},
workflow.out.versions
).match()
}
)
}
}

test("Rename: VCF no chr + fasta chr") {
config "./nextflow_rename.config"
tag "test"
setup {
run("BAM_CHR_RENAME_SAMTOOLS", alias: "PREPROCESS") {
script "../../../subworkflows/local/bam_chr_rename_samtools/main.nf"
Expand All @@ -67,8 +74,8 @@ nextflow_workflow {
input[0] = Channel.fromList([
[
[id: "BAM_22"],
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
"nochr"
]
])
Expand Down Expand Up @@ -97,14 +104,20 @@ nextflow_workflow {
then {
assertAll(
{ assert workflow.success },
{ assert snapshot(workflow.out).match() },
{ assert snapshot(workflow.out.output.collect{
if (it[1].endsWith("vcf.gz")) {
path(it[1]).vcf.summary
} else {
bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
}
}).match("headerwithchr")
{ assert snapshot(
workflow.out.output.collect{
if (it[1].endsWith("vcf.gz")) {
path(it[1]).vcf.summary
} else {
bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
}
},
workflow.out.output.collect{
file(it[1]).getName()
file(it[2]).getName()
},
workflow.out.versions
).match()
}
)
}
Expand All @@ -118,8 +131,8 @@ nextflow_workflow {
input[0] = Channel.fromList([
[
[id: "VCF_chr22"],
file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'],checkIfExist:true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz_tbi'],checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz',checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi',checkIfExist:true),
["22"]
]
])
Expand All @@ -145,8 +158,8 @@ nextflow_workflow {
input[0] = Channel.fromList([
[
[id: "BAM_chr22"],
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
["22"]
]
])
Expand All @@ -169,12 +182,12 @@ nextflow_workflow {
when {
workflow {
"""
lst_chr = ["chr22", "chr34", "GL000207.1"]
lst_chr = ["chr22", "chr34", "GL000207.1", "chr45", "chr46", "chr47", "chr48", "chr49"]
input[0] = Channel.fromList([
[
[id: "VCF_AllNoChr"],
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz",checkIfExist:true),
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi",checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz',checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi',checkIfExist:true),
lst_chr
],
])
Expand All @@ -185,7 +198,7 @@ nextflow_workflow {
then {
assertAll(
{ assert workflow.failed },
{ assert workflow.errorReport.contains("Contig names: [chr34, GL000207.1] absent from file: /nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz and cannot be solved by adding or removing the `chr` prefix.")}
{ assert workflow.errorReport.contains("Contig names: [chr34, GL000207.1, chr45, chr46, ...] absent from file: /nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz and cannot be solved by adding or removing the `chr` prefix.")}
)
}
}
Expand All @@ -198,8 +211,8 @@ nextflow_workflow {
input[0] = Channel.fromList([
[
[id: "BAM_chr22"],
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
["chr1"]
],
])
Expand Down
Loading

0 comments on commit 262290e

Please sign in to comment.