diff --git a/workflow/envs/viloca.yaml b/workflow/envs/viloca.yaml new file mode 100644 index 00000000..51aacd4c --- /dev/null +++ b/workflow/envs/viloca.yaml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - libshorah + - python=3.10.4 + - pip + - pip: + - pandas + - git+https://github.com/cbg-ethz/VILOCA@master diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 0389569c..f78a26de 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -286,11 +286,63 @@ rule lofreq: {params.LOFREQ} call {params.EXTRA} --call-indels -f {input.REF} -o {output.SNVs} --verbose {output.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ +rule viloca: + input: + REF=( + cohortdir("cohort_consensus.fasta") + if config.lofreq["consensus"] + else reference_file + ), + BAM=alignment_wildcard, + output: + SNVs="{dataset}/variants/SNVs/snvs.vcf", + CSV="{dataset}/variants/SNVs/snv/cooccurring_mutations.csv", + params: + READ_LEN=read_len, + INSERT_FILE=config.viloca["insert_bedfile"], + MODE=config.viloca["mode"], + SHIFT=config.viloca["shift"], + OUTDIR="{dataset}/variants/SNVs", + EXTRA=config.viloca["extra"], + VILOCA=config.applications["viloca"], + log: + outfile="{dataset}/variants/SNVs/viloca.out.log", + errfile="{dataset}/variants/SNVs/viloca.err.log", + conda: + config.viloca["conda"] + benchmark: + "{dataset}/variants/SNVs/viloca.benchmark" + threads: config.viloca["threads"] + resources: + disk_mb=2000, + mem_mb=config.viloca["mem"], + runtime=config.viloca["time"], + shell: + """ + let "WINDOW_SHIFTS=({params.READ_LEN} * 4/5 + {params.SHIFT}) / {params.SHIFT}" + let "WINDOW_LEN=WINDOW_SHIFTS * {params.SHIFT}" + + # Run VILOCA + echo "Running VILOCA" >> {log.outfile} + if [[ "{params.INSERT_FILE}" == "None" ]]; then + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -w ${{WINDOW_LEN}} -s {params.SHIFT} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + else + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -z {params.INSERT_FILE} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + fi + + # rename viloca output snv/SNVs_0.010000_final.vcf --> snvs.vcf + cp "${params.OUTDIR}/snv/SNVs_0.010000_final.vcf" {output.SNVs} + """ + if config.general["snv_caller"] == "shorah": - ruleorder: snv > lofreq + ruleorder: snv > lofreq > viloca elif config.general["snv_caller"] == "lofreq": - ruleorder: lofreq > snv + ruleorder: lofreq > snv > viloca + +elif config.general["snv_caller"] == "viloca": + + ruleorder: viloca > lofreq > snv diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 648e5ef6..a0bb2c6d 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -38,8 +38,8 @@ "snv_caller": { "type": "string", "default": "shorah", - "enum": ["shorah","lofreq"], - "description": "There are two options available for calling single nucleotide variants, either using [ShoRAH (`shorah`)](https://github.com/cbg-ethz/shorah) [^5] or [LoFreq (`lofreq`)](https://csb5.github.io/lofreq/) [^6]. ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example\n\n[^5]: Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011.\n[^6]: Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012.", + "enum": ["shorah","lofreq", "viloca"], + "description": "There are three options available for calling single nucleotide variants, either using [ShoRAH (`shorah`)](https://github.com/cbg-ethz/shorah) [^5], [LoFreq (`lofreq`)](https://csb5.github.io/lofreq/) [^6] or [VILOCA (`viloca`)](https://github.com/cbg-ethz/viloca) [^7] . ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example\n\n[^5]: Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011.\n[^6]: Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012.", "examples": ["lofreq"] }, "haplotype_reconstruction": { @@ -228,7 +228,7 @@ "local": { "type": "boolean", "default": false, - "description": "This option activates local haplotype reconstruction (only available when using ShoRAH).", + "description": "This option activates local haplotype reconstruction (only available when using ShoRAH or VILOCA).", "examples": [true] }, "global": { @@ -386,6 +386,10 @@ "type": "string", "default": "shorah shotgun" }, + "viloca": { + "type": "string", + "default": "viloca run" + }, "lofreq": { "type": "string", "default": "lofreq" @@ -1185,6 +1189,53 @@ "default": {}, "type": "object" }, + "viloca": { + "properties": { + "mem": { + "type": "integer", + "default": 10000 + }, + "threads": { + "type": "integer" + }, + "time": { + "type": "integer", + "default": 2880 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/viloca.yaml" + }, + "consensus": { + "type": "boolean", + "default": false, + "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", + "examples": [false] + }, + "shift": { + "type": "integer", + "default": 3, + "description": "VILOCA performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows." + }, + "insert_bedfile": { + "type": "string", + "default": "None", + "description": "VILOCA performs local haplotype reconstruction on windows of the read alignment. In a first step the alignment is tiled into local regions. This can be done uniformly then set this value None, otherwise path to an (optional) insert file (primer tiling strategy)" + }, + "mode": { + "type": "string", + "default": "use_quality_scores", + "description": "Mode in which to run VILOCA: shorah, learn_error_params, use_quality_scores. If quality scores are available, we recommend this option" + }, + "extra": { + "type": "string", + "default": "", + "description": "Pass additional options to run `lofreq call`" + } + }, + "default": {}, + "type": "object" + }, "lofreq": { "properties": { "mem": {