diff --git a/workflow/envs/paired_end_read_merger.yaml b/workflow/envs/paired_end_read_merger.yaml index 2bccdb22..8b655505 100644 --- a/workflow/envs/paired_end_read_merger.yaml +++ b/workflow/envs/paired_end_read_merger.yaml @@ -16,6 +16,7 @@ dependencies: - scipy - matplotlib-base - mafft + - git-lfs - pip - pip: - git+https://github.com/cbg-ethz/smallgenomeutilities@dev diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 5fd4bc73..462e337f 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -295,14 +295,21 @@ rule paired_end_read_merger: if config.viloca["consensus"] else reference_file ), + fname_ref_idx=( + cohortdir("cohort_consensus.fasta") + if config.viloca["consensus"] + else reference_file + )+".fai", output: - fname_sam_merged=f"results/{{sample}}/alignment/REF_aln.merged.sam", + fname_bam_merged="{dataset}/alignment/REF_aln.merged.bam", + fname_sam_merged=temp_with_prefix("{dataset}/alignment/REF_aln.merged.sam"), + fname_sam=temp_with_prefix("{dataset}/alignment/REF_aln.sam"), + fname_sam_nonmerged="{dataset}/alignment/REF_aln.nonmerged.sam", + fname_sam_sort=temp_with_prefix("{dataset}/alignment/REF_aln.sort.sam"), params: - fname_sam=temp(f"results/{{sample}}/alignment/REF_aln.sam"), - fname_sam_nonmerged=f"results/{{sample}}/alignment/REF_aln.nonmerged.sam", - fname_sam_sort=temp(f"results/{{sample}}/alignment/REF_aln.sort.sam"), SAMTOOLS=config.applications["samtools"], PAIRED_END_READ_MERGER=config.applications["paired_end_read_merger"], + sort_tmp=temp_prefix("{dataset}.tmp"), log: outfile="{dataset}/alignment/paired_end_read_merger.out.log", errfile="{dataset}/alignment/paired_end_read_merger.err.log", @@ -311,12 +318,17 @@ rule paired_end_read_merger: shell: """ ## Preparation - fname_reference_idx = "${input.fname_reference_idx}.fai" - {params.SAMTOOLS} view -h -T {input.fname_reference} -t ${fname_reference_idx} {input.fname_bam} > {params.fname_sam} + {params.SAMTOOLS} view -h -T {input.fname_ref} -t {input.fname_ref_idx} {input.fname_bam} -o {output.fname_sam} > {log.outfile} 2> >(tee {log.errfile} >&2) ## sort accrording to QNAME - {params.SAMTOOLS} sort -T tmp -O sam -n {params.fname_sam} > {params.fname_sam_sort} + rm -f '{params.sort_tmp}'.[0-9]*.bam + {params.SAMTOOLS} sort -T "{params.sort_tmp}" -O sam -n {output.fname_sam} -o {output.fname_sam_sort} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) ## run script - {params.PAIRED_END_READ_MERGER} {input.fname_sam_sort} {output.fname_sam_merged} {params.fname_sam_nonmerged} {input.fname_ref} + {params.PAIRED_END_READ_MERGER} {output.fname_sam_sort} {output.fname_sam_merged} {output.fname_sam_nonmerged} {input.fname_ref} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + touch {output.fname_sam_nonmerged} + ## sort + rm -f '{params.sort_tmp}'.[0-9]*.bam + {params.SAMTOOLS} sort -T "{params.sort_tmp}" -o "{output.fname_bam_merged}" "{output.fname_sam_merged}" >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + {params.SAMTOOLS} index "{output.fname_bam_merged}" >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ @@ -328,7 +340,7 @@ rule viloca: else reference_file ), BAM=( - rules.paired_end_read_merger.output.fname_sam_merged, + rules.paired_end_read_merger.output.fname_bam_merged if config.viloca["merge_paired_end_reads"] else alignment_wildcard ), diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index cc25d855..797c9a3b 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1237,6 +1237,12 @@ "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, + "merge_paired_end_reads": { + "type": "boolean", + "default": false, + "description": "Merge paired-end reads in the preprocessing. This is a preprocessing snakemake rule.", + "examples": [false] + }, "shift": { "type": "integer", "default": 3,