diff --git a/VERSION b/VERSION index 1efec57..a918a2a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.5.0-alpha +0.6.0 diff --git a/config/cluster.json b/config/cluster.json index ff1902a..4190fbf 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -170,7 +170,8 @@ "glnexus": { "threads": "24", "mem": "240G", - "time": "4-00:00:00" + "time": "4-00:00:00", + "gres": "lscratch:256" }, "gtype_refinement": { "threads": "2", @@ -189,9 +190,9 @@ "mem": "96G" }, "hmftools_amber": { - "mem": "96G", + "mem": "365G", "time" : "48:00:00", - "threads": "24" + "threads": "56" }, "hmftools_cobalt": { "mem": "96G", @@ -203,6 +204,22 @@ "time" : "48:00:00", "threads": "8" }, + "somatic_purple_maf": { + "threads": "4", + "mem": "32G", + "time": "1-12:00:00", + "gres": "lscratch:32" + }, + "purple_cohort_maf": { + "threads": "2", + "mem": "8G", + "time": "4:00:00" + }, + "purple_cohort_maftools": { + "threads": "2", + "mem": "8G", + "time": "4:00:00" + }, "inbreeding": { "threads": "8", "partition": "norm" @@ -221,6 +238,11 @@ "mem": "96G", "time": "1-00:00:00" }, + "manta_somatic_filter": { + "threads": "4", + "mem": "72G", + "time": "1-00:00:00" + }, "opencravat_germline_merge": { "threads": "4", "mem": "120G", @@ -256,7 +278,7 @@ "opencravat_germline": { "threads": "20", "mem": "48G", - "time": "12:00:00" + "time": "1-12:00:00" }, "opencravat_somatic": { "threads": "32", @@ -270,14 +292,19 @@ }, "octopus_somatic": { "threads": "28", - "mem": "48G", - "time": "1-18:00:00" + "mem": "120G", + "time": "2-12:00:00" }, "octopus_merge": { "threads": "4", "mem": "48G", "time": "12:00:00" }, + "octopus_filter": { + "threads": "4", + "mem": "48G", + "time": "12:00:00" + }, "ped": { "threads": "2", "partition": "norm", @@ -363,6 +390,11 @@ "mem": "64G", "time": "24:00:00" }, + "strelka_format": { + "threads": "16", + "mem": "64G", + "time": "24:00:00" + }, "vcftools": { "threads": "2", "time": "12:00:00" diff --git a/config/containers.json b/config/containers.json index ff0340d..40e63cf 100644 --- a/config/containers.json +++ b/config/containers.json @@ -4,7 +4,7 @@ "genome-seek_trim_map": "docker://skchronicles/genome-seek_trim_map:v0.1.0", "genome-seek_somatic": "docker://skchronicles/genome-seek_somatic:v0.1.0", "genome-seek_qc": "docker://skchronicles/genome-seek_qc:v0.1.0", - "genome-seek_sv": "docker://skchronicles/genome-seek_sv:v0.1.0", + "genome-seek_sv": "docker://skchronicles/genome-seek_sv:v0.2.0", "genome-seek_cnv": "docker://skchronicles/genome-seek_cnv:v0.2.0", "genome-seek_hla": "docker://skchronicles/genome-seek_hla:v0.1.0", "base": "docker://skchronicles/ccbr_wes_base:v0.1.0", diff --git a/config/genome.json b/config/genome.json index 576d004..47d7a14 100644 --- a/config/genome.json +++ b/config/genome.json @@ -56,6 +56,9 @@ ], "GENOME": "/data/OpenOmics/references/genome-seek/Homo_sapiens_assembly38.fasta", "GNOMAD": "/data/OpenOmics/references/genome-seek/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz", + "SLIVARGNOMAD": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.v2.zip", + "SLIVARGNOMAD3": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.genomes.v3.fix.zip", + "SLIVARTOPMED": "/data/OpenOmics/references/genome-seek/slivar/topmed.hg38.dbsnp.151.zip", "GISTIC_REFGENE": "/data/OpenOmics/references/genome-seek/gistic/hg38.UCSC.add_miR.160920.refgene.mat", "HLA_LA_GRAPH": "/data/OpenOmics/references/genome-seek/HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT", "HMFTOOLS_REF_VERSION": "38", @@ -88,6 +91,7 @@ "recal5": ["chr11", "chr12", "chr17", "chr19", "chr22", "chrX"] }, "MANTA_CALLREGIONS": "/data/OpenOmics/references/genome-seek/Manta/Homo_sapiens_assembly38_main_chromosomes.bed.gz", + "MANTA_FILTER_CHROMOSEQ_TRANSLOCATION": "/data/OpenOmics/references/genome-seek/chromoseq/chromoseq_translocations.bedpe", "OC_LIFTOVER": "hg38", "OC_MODULES": "/data/OpenOmics/references/genome-seek/OpenCRAVAT/modules", "OCTOPUS_SOMATIC_FOREST_MODEL": "/data/OpenOmics/references/genome-seek/Octopus/somatic.v0.7.4.forest", diff --git a/config/modules.json b/config/modules.json index 94a57f0..aa35c7e 100644 --- a/config/modules.json +++ b/config/modules.json @@ -5,6 +5,7 @@ "fastp": "fastp/0.23.1", "bowtie": "bowtie/2-2.4.5", "bcftools": "bcftools/1.13", + "bedtools": "bedtools/2.27.1", "canvas": "Canvas/1.40", "circos": "circos/0.69-9", "deepvariant": "deepvariant/1.4.0", @@ -20,6 +21,7 @@ "rlang": "R/4.2.0", "samblaster": "samblaster/0.1.25", "strelka": "strelka/2.9.10", + "svtools": "svtools/0.5.1", "python2": "python/2.7", "python3": "python/3.8", "fastq_screen": "fastq_screen/0.14.1", @@ -28,6 +30,7 @@ "gatk3": "GATK/3.8-1", "gatk4": "GATK/4.2.0.0", "multiqc": "multiqc/1.11", + "minimap2": "minimap2/2.26", "vcftools": "vcftools/0.1.16" } } diff --git a/workflow/Snakefile b/workflow/Snakefile index f272ebb..0f0ccd6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -309,6 +309,10 @@ rule all: join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.vcf.gz"), name=provided(provided(tumors, call_somatic), call_sv), ), + expand( + join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.filtered.vcf"), + name=provided(provided(tumors, call_somatic), call_sv), + ), # Mutect2, call somatic variants, # scattered and merged on chroms # to speed up overall run time, @@ -371,44 +375,19 @@ rule all: caller=provided(tn_somatic_callers, call_somatic), name=provided(tumorWnormal, call_somatic), ), - # Post-process somatic VCF, splits sites - # found in the tumor and normal sample, - # and to merge callsets across callers, + # Post-process somatic VCF, + # and merge callsets across callers, # only runs if provided --call-somatic # @imported from rules/somatic.smk expand( - join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.tumor.vcf.gz"), - caller=provided(somatic_callers, call_somatic), - name=provided(tumors, call_somatic), - ), - # Conditionally add tumor-normal callers, - # MuSE and Strelka - expand( - join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.tumor.vcf.gz"), - caller=provided(tn_somatic_callers, call_somatic), - name=provided(tumorWnormal, call_somatic), - ), - expand( - join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.normal.vcf.gz"), - caller=provided(somatic_callers, call_somatic), - name=provided(tumorWnormal, call_somatic), - ), - # Conditionally add tumor-normal callers, - # MuSE and Strelka - expand( - join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.normal.vcf.gz"), - caller=provided(tn_somatic_callers, call_somatic), - name=provided(tumorWnormal, call_somatic), - ), - expand( - join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.vcf.gz"), + join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.vcf.gz"), name=provided(tumors, call_somatic), ), # vcf2maf, create per-sample MAF file, # only runs if provided --call-somatic # @imported from rules/somatic.smk expand( - join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.maf"), + join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.maf"), name=provided(tumors, call_somatic), ), # vcf2maf, create cohort-level MAF file, @@ -443,16 +422,29 @@ rule all: # @imported from rules/cnv.smk expand( join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv), + name=provided(provided(tumors, call_somatic), call_cnv) ), expand( join(workpath, "hmftools", "cobalt", "{name}", "{name}.cobalt.ratio.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv), + name=provided(provided(tumors, call_somatic), call_cnv) ), expand( - join(workpath, "hmftools", "purple", "{name}", "{caller}", "{name}.purple.cnv.somatic.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv), - caller=provided(purple_callers, call_somatic), + join(workpath, "hmftools", "purple", "{name}", "{name}.purple.cnv.somatic.tsv"), + name=provided(provided(tumors, call_somatic), call_cnv) + ), + expand( + join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"), + name=provided(provided(tumors, call_somatic), call_cnv) + ), + # Maftools, create cohort-level summary plots from purple somatic VCF, + # only runs if provided --call-somatic AND --call-cnv + # @imported from rules/somatic.smk + provided( + provided( + [join(workpath, "hmftools", "cohort_oncoplot.pdf")], + call_somatic + ), + call_cnv ), # OpenCRAVAT, somatic annotation, rank and score variants, # only runs if provided --open-cravat AND --call-somatic diff --git a/workflow/rules/cnv.smk b/workflow/rules/cnv.smk index fda5082..7490635 100644 --- a/workflow/rules/cnv.smk +++ b/workflow/rules/cnv.smk @@ -24,11 +24,12 @@ def get_manta_calls(wildcards): tumor = wildcards.name if call_sv: # Runs in a tumor, normal mode - return join(workpath, "MANTA", "somatic", tumor, "results", "variants", "somaticSV.vcf.gz"), + return join(workpath, "MANTA", "somatic", tumor, "results", "variants", "somaticSV.filtered.vcf"), else: # Runs in tumor-only mode return [] + # Germline Copy Number Variation rule peddy: """ @@ -207,7 +208,6 @@ rule hmftools_amber: tumor = join(workpath, "BAM", "{name}.recal.bam"), normal = get_normal_recal_bam, output: - con = join(workpath, "hmftools", "amber", "{name}", "{name}.amber.contamination.vcf.gz"), baf = join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"), params: rname = 'hmfamber', @@ -240,7 +240,6 @@ rule hmftools_amber: """ - rule hmftools_cobalt: """Data-processing step to determines the read depth ratios for Purple's copy number fitting. HMF Tools is a suite of tools the @@ -307,17 +306,22 @@ rule hmftools_purple: input: amber = join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"), cobalt = join(workpath, "hmftools", "cobalt", "{name}", "{name}.cobalt.ratio.tsv"), - vcf = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), + vcf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.temp.vep.vcf"), sv = get_manta_calls, output: - purity = join(workpath, "hmftools", "purple", "{name}", "{caller}", "{name}.purple.purity.tsv"), - cnv = join(workpath, "hmftools", "purple", "{name}", "{caller}", "{name}.purple.cnv.somatic.tsv"), - driver = join(workpath, "hmftools", "purple", "{name}", "{caller}", "{name}.driver.catalog.somatic.tsv"), + purity = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.purity.tsv"), + cnv = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.cnv.somatic.tsv"), + driver = join(workpath, "hmftools", "purple", "{name}", "{name}.driver.catalog.somatic.tsv"), + vcf = join(workpath, "hmftools", "purple", "{name}", "{name}.merged.filtered.norm.biallelic.vcf"), + purplevcf = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.somatic.vcf.gz"), params: rname = 'hmfpurple', tumor = '{name}', - outdir = join(workpath, "hmftools", "purple", "{name}", "{caller}"), + outdir = join(workpath, "hmftools", "purple", "{name}"), genome = config['references']['GENOME'], + gnomadv2 = config['references']['SLIVARGNOMAD'], + gnomadv3 = config['references']['SLIVARGNOMAD3'], + topmed = config['references']['SLIVARTOPMED'], ref_ver = config['references']['HMFTOOLS_REF_VERSION'], purple_jar = config['references']['HMFTOOLS_PURPLE_JAR'], gc_profile = config['references']['HMFTOOLS_GC_PROFILE'], @@ -333,7 +337,7 @@ rule hmftools_purple: tumor_flag = lambda w: "" if tumor2normal[w.name] else "-tumor_only", # Building optional flag for SV calls sv_option = lambda w: "-structural_vcf {0}".format( - join(workpath, "MANTA", "somatic", w.name, "results", "variants", "somaticSV.vcf.gz"), + join(workpath, "MANTA", "somatic", w.name, "results", "variants", "somaticSV.filtered.vcf"), ) if call_sv else "", threads: int(allocated("threads", "hmftools_purple", cluster)), @@ -341,6 +345,7 @@ rule hmftools_purple: envmodules: config['tools']['rlang'], config['tools']['circos'], + config['tools']['bcftools'], shell: """ # Set output directories # for Amber and Cobalt @@ -348,7 +353,28 @@ rule hmftools_purple: cobalt_outdir="$(dirname "{input.cobalt}")" echo "Amber output directory: $amber_outdir" echo "Cobalt output directory: $cobalt_outdir" - + + # Decompress and filter merged + # somatic variant VCF to remove + # biallelic variants and variants + # with > 0.01 frequency in the + # general population + bcftools view \\ + -O v \\ + --max-alleles 2 \\ + {input.vcf} \\ + | slivar expr \\ + --vcf - --pass-only \\ + -g {params.gnomadv2} \\ + -g {params.topmed} \\ + --info 'INFO.impactful && INFO.gnomad_popmax_af < 0.01 && INFO.topmed_af < 0.05' \\ + | sed 's/gnomad_/gnomadV2_/g' \\ + | slivar expr \\ + --vcf - --pass-only \\ + -g {params.gnomadv3} \\ + --info 'INFO.gnomad_popmax_af < 0.01' \\ + > {output.vcf} + # Run Purple to find CNVs, # purity and ploidy, and # cancer driver events @@ -366,5 +392,119 @@ rule hmftools_purple: -somatic_hotspots {params.somatic_hotspot} \\ -germline_hotspots {params.germline_hotspot} \\ -threads {threads} {params.tumor_flag} \\ - -somatic_vcf {input.vcf} {params.sv_option} + -somatic_vcf {output.vcf} {params.sv_option} + """ + + +rule somatic_purple_maf: + """Data-processing step to convert the merged somatic calls from + each caller into a MAF file. This step takes filtered, norm, tumor + callset from all callers and annotates the variants with VEP/106 + and converts the resulting VCF file into MAF file format. vcf2maf + requires the input vcf file is NOT compressed. + @Input: + Somatic variants found in the tumor sample (scatter-per-sample) + @Output: + Annotated, merged MAF file contaning somatic callsets + """ + input: + vcf = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.somatic.vcf.gz"), + output: + vcf = temp(join(workpath, "hmftools", "purple", "{name}", "{name}.purple.somatic.vcf")), + vep = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.somatic.vep.vcf"), + maf = join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"), + params: + rname = 'purplemaf', + tumor = '{name}', + memory = allocated("mem", "somatic_purple_maf", cluster).lower().rstrip('g'), + vep_data = config['references']['VEP_DATA'], + vep_build = config['references']['VEP_BUILD'], + vep_species = config['references']['VEP_SPECIES'], + ref_version = config['references']['VEP_REF_VERSION'], + genome = config['references']['GENOME'], + # Building optional argument for paired normal + normal_option = lambda w: "--normal-id {0}".format( + tumor2normal[w.name] + ) if tumor2normal[w.name] else "", + threads: + int(allocated("threads", "somatic_purple_maf", cluster)) + container: config['images']['vcf2maf'] + shell: """ + # vcf2maf needs an uncompressed VCF file + zcat {input.vcf} \\ + > {output.vcf} + # Run VEP and convert VCF into MAF file + vcf2maf.pl \\ + --input-vcf {output.vcf} \\ + --output-maf {output.maf} \\ + --vep-path ${{VEP_HOME}} \\ + --vep-data {params.vep_data} \\ + --cache-version {params.ref_version} \\ + --ref-fasta {params.genome} \\ + --vep-forks {threads} \\ + --tumor-id {params.tumor} {params.normal_option} \\ + --ncbi-build {params.vep_build} \\ + --species {params.vep_species} \\ + --retain-info set,PURPLE_CN,SUBCL,HOTSPOT,IMPACT,NEAR_HOTSPOT,PNOISE,PNOISE2,PON,PURPLE_AF,PURPLE_GERMLINE,PURPLE_MACN,PURPLE_VCN,REPORTED,SOMATIC,SomaticEVS + """ + + +rule purple_cohort_maf: + """Data-processing step to merge the per-sample MAF files into a + single MAF file for all samples, a cohort MAF file. + @Input: + Somatic tumor MAF files (gather-per-sample) + @Output: + Merged somatic tumor MAF, cohort-level, with all call sets + """ + input: + mafs = expand(join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"), name=tumors), + output: + maf = join(workpath, "hmftools", "cohort_somatic_variants.maf"), + params: + rname = 'cohortmaf', + memory = allocated("mem", "purple_cohort_maf", cluster).lower().rstrip('g'), + threads: + int(allocated("threads", "purple_cohort_maf", cluster)) + container: config['images']['vcf2maf'] + shell: """ + echo "Combining MAFs..." + head -2 {input.mafs[0]} > {output.maf} + awk 'FNR>2 {{print}}' {input.mafs} >> {output.maf} + """ + + +rule purple_cohort_maftools: + """Data-processing step to run maftools on merged cohort-level + MAF file, produces summarized plots like an oncoplot. + @Input: + Cohort-level somatic MAF file (indirect-gather-due-to-aggregation) + @Output: + TCGA comparsion plot + Top 20 genes by Vaf plot + MAF Summary plot + Oncoplot + """ + input: + maf = join(workpath, "hmftools", "cohort_somatic_variants.maf"), + output: + tcga = join(workpath, "hmftools", "cohort_tcga_comparison.pdf"), + gvaf = join(workpath, "hmftools", "cohort_genes_by_VAF.pdf"), + summary = join(workpath, "hmftools", "cohort_maf_summary.pdf"), + oncoplot = join(workpath, "hmftools", "cohort_oncoplot.pdf"), + params: + rname = 'maftools', + wdir = join(workpath, "hmftools"), + memory = allocated("mem", "purple_cohort_maftools", cluster).lower().rstrip('g'), + script = join("workflow", "scripts", "maftools.R"), + threads: + int(allocated("threads", "purple_cohort_maftools", cluster)), + container: config['images']['genome-seek_somatic'] + envmodules: config['tools']['rlang'] + shell: """ + Rscript {params.script} \\ + {params.wdir} \\ + {input.maf} \\ + {output.summary} \\ + {output.oncoplot} """ diff --git a/workflow/rules/open_cravat.smk b/workflow/rules/open_cravat.smk index cbaa215..b52a5bf 100644 --- a/workflow/rules/open_cravat.smk +++ b/workflow/rules/open_cravat.smk @@ -370,7 +370,7 @@ rule opencravat_somatic: Per sample caller merged somatic SQLite OpenCravat results """ input: - vcfs = expand(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.vcf.gz"), name=tumors), + vcfs = expand(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.vcf.gz"), name=tumors), output: db = join(workpath, "OpenCRAVAT", "somatic", "cravat_somatic.sqlite"), params: diff --git a/workflow/rules/somatic.smk b/workflow/rules/somatic.smk index aac0c82..52d3410 100644 --- a/workflow/rules/somatic.smk +++ b/workflow/rules/somatic.smk @@ -35,28 +35,24 @@ def get_normal_pileup_table(wildcards): # Runs in tumor-only mode return [] -def get_somatic_merge_tumor(wildcards): - """Returns somatic variants found in the tumor sample - for all somatic callers. For tumor-normal samples, extra - somatic callers (i.e. MuSE and Strelka) are run. Tumor-only - samples will only have callsets from Mutect2 and octopus. +def get_somatic_tn_callers(wildcards): + """Returns somatic variants found with tumor-normal variant + callers. For tumor-normal samples, extra somatic callers + (i.e. MuSE and Strelka) are run. Tumor-only return empty + list (rule already has reference in input section). See config['pairs'] for tumor, normal pairs. """ - callset = somatic_callers tumor = wildcards.name normal = tumor2normal[tumor] if normal: - # Callers = Octopus, Mutect2, MuSE, Strelka + # Callers = MuSE, Strelka return [ - join(workpath, caller, "somatic", "{0}.{1}.filtered.norm.tumor.vcf.gz".format(tumor, caller)) \ - for caller in somatic_callers + tn_somatic_callers + join(workpath, caller, "somatic", "{0}.{1}.filtered.norm.vcf".format(tumor, caller)) \ + for caller in tn_somatic_callers ] else: - # Callers = Octopus, Mutect2 - return [ - join(workpath, caller, "somatic", "{0}.{1}.filtered.norm.tumor.vcf.gz".format(tumor, caller)) \ - for caller in somatic_callers - ] + # No paired normal, return nothing + return [] # Data processing rules for calling somatic variants @@ -127,7 +123,13 @@ rule octopus_merge: output: lsl = join(workpath, "octopus", "somatic", "{name}.list"), raw = join(workpath, "octopus", "somatic", "{name}.octopus.raw.vcf"), - vcf = join(workpath, "octopus", "somatic", "{name}.octopus.vcf"), + vcf = join(workpath, "octopus", "somatic", "{name}.octopus.unfiltered.vcf"), + chroms = temp(join(workpath, "octopus", "somatic", "{name}.chroms")), + starts = temp(join(workpath, "octopus", "somatic", "{name}.starts")), + stops = temp(join(workpath, "octopus", "somatic", "{name}.stops")), + bed = temp(join(workpath, "octopus", "somatic", "{name}.list.bed")), + sortbed = temp(join(workpath, "octopus", "somatic", "{name}.sort.bed")), + sortlsl = join(workpath, "octopus", "somatic", "{name}.sort.list"), params: genome = config['references']['GENOME'], rname = "octomerge", @@ -136,11 +138,32 @@ rule octopus_merge: threads: int(allocated("threads", "octopus_merge", cluster)) container: config['images']['genome-seek'] - envmodules: config['tools']['bcftools'] + envmodules: config['tools']['bcftools'], shell: """ # Create list of chunks to merge find {params.octopath} -iname '{params.tumor}.vcf.gz' \\ > {output.lsl} + + # Sort list of chunks to merge + awk -F '/' '{{print $(NF-1)}}' {output.lsl} \\ + | awk -F ':' '{{print $(1)}}' > {output.chroms} + awk -F '/' '{{print $(NF-1)}}' {output.lsl} \\ + | awk -F ':' '{{print $(2)}}' \\ + | awk -F '-' '{{print $(NF-1)}}' > {output.starts} + awk -F '/' '{{print $(NF-1)}}' {output.lsl} \\ + | awk -F ':' '{{print $(2)}}' \\ + | awk -F '-' '{{print $(NF)}}' > {output.stops} + paste {output.chroms} \\ + {output.starts} \\ + {output.stops} \\ + {output.lsl} \\ + > {output.bed} + bedtools sort \\ + -i {output.bed} \\ + -faidx {params.genome}.fai \\ + > {output.sortbed} + cut -f4 {output.sortbed} > {output.sortlsl} + # Merge octopus chunk calls, # contains both germline and # somatic variants @@ -148,7 +171,7 @@ rule octopus_merge: --threads {threads} \\ -d exact \\ -a \\ - -f {output.lsl} \\ + -f {output.sortlsl} \\ -o {output.raw} \\ -O v # Filter Octopus callset for @@ -157,6 +180,58 @@ rule octopus_merge: > {output.vcf} """ +rule octopus_filter: + """ + Data-processing step to merge scattered variant calls from Octopus. Octopus + is scattered across genomic intervals or chunks to reduce its overall runtime. + @Input: + Somatic variants in VCF format (gather-per-sample-per-chunks) + @Output: + Per sample somatic variants in VCF format + """ + input: + vcf = join(workpath, "octopus", "somatic", "{name}.octopus.unfiltered.vcf"), + output: + vcfa = join(workpath, "octopus", "somatic", "{name}.octopus.PASS.vcf"), + vcfb = join(workpath, "octopus", "somatic", "{name}.octopus.SOMATIC.vcf"), + vcfc = join(workpath, "octopus", "somatic", "{name}.octopus.SOMATIC_PASS.vcf"), + vcfsort = join(workpath, "octopus", "somatic", "{name}.octopus.vcf"), + params: + genome = config['references']['GENOME'], + rname = "octofilter", + tumor = "{name}", + octopath = join(workpath, "octopus", "somatic", "chunks"), + # Building optional argument for paired normal + bcftools_filter_i_option = lambda w: 'FMT/FT[1:0]=="PASS"' \ + if tumor2normal[w.name] else 'FMT/FT[0:0]=="PASS"', + threads: + int(allocated("threads", "octopus_filter", cluster)) + container: config['images']['genome-seek'] + envmodules: config['tools']['bcftools'] + shell: """ + bcftools filter \\ + -o {output.vcfa} \\ + -O v \\ + -i 'FILTER=="PASS"' \\ + {input.vcf} + + awk -F '\\t' -v OFS='\\t' \\ + '( $1 ~ /^#/ ) || ( $8 ~ /SOMATIC/ ) {{print}}' \\ + {output.vcfa} \\ + > {output.vcfb} + + bcftools filter \\ + -i 'FMT/FT[0:0]=="PASS"' \\ + {output.vcfb} \\ + | bcftools filter \\ + -i '{params.bcftools_filter_i_option}' \\ + -o {output.vcfc} + + bcftools sort \\ + -o {output.vcfsort} \\ + -O v \\ + {output.vcfc} + """ rule octopus_germline: """ @@ -591,11 +666,12 @@ rule strelka: indels = join(workpath, "strelka", "{name}", "results", "variants", "somatic.indels.vcf.gz"), vcf = temp(join(workpath, "strelka", "{name}", "{name}.tmp.vcf")), header = temp(join(workpath, "strelka", "{name}", "{name}.samples")), - final = join(workpath, "strelka", "somatic", "{name}.strelka.vcf"), + rehead = temp(join(workpath, "strelka", "somatic", "{name}.rehead.vcf")), params: tmpdir = tmpdir, tumor = '{name}', rname = 'strelka', + purple_jar = config['references']['HMFTOOLS_PURPLE_JAR'], outdir = join(workpath, "strelka", "{name}"), workflow = join(workpath, "strelka", "{name}", "runWorkflow.py"), regions = config['references']['MANTA_CALLREGIONS'], @@ -663,9 +739,56 @@ rule strelka: echo -e "TUMOR\\t{params.tumor}{params.normal_header}" \\ > {output.header} bcftools reheader \\ - -o {output.final} \\ + -o {output.rehead} \\ -s {output.header} \\ {output.vcf} + """ + +rule strelka_format: + """Data-processing step to call somatic mutations with Strelka. This tool is + optimized for rapid clinical analysis of germline variation in small cohorts + and somatic variation in tumor/normal sample pairs. More information about + strelka can be found here: https://github.com/Illumina/strelka + @Input: + Realigned, recalibrated BAM file for a normal in a TN pair + @Output: + Per sample VCF of somatic variants + """ + input: + rehead = join(workpath, "strelka", "somatic", "{name}.rehead.vcf"), + output: + final = join(workpath, "strelka", "somatic", "{name}.strelka.vcf"), + params: + tmpdir = tmpdir, + tumor = '{name}', + rname = 'strelka_format', + purple_jar = config['references']['HMFTOOLS_PURPLE_JAR'], + outdir = join(workpath, "strelka", "{name}"), + workflow = join(workpath, "strelka", "{name}", "runWorkflow.py"), + regions = config['references']['MANTA_CALLREGIONS'], + genome = config['references']['GENOME'], + pon = config['references']['PON'], + memory = allocated("mem", "strelka_format", cluster).rstrip('G'), + # Building optional argument for paired normal + normal_option = lambda w: "--normalBam {0}.recal.bam".format( + join(workpath, "BAM", tumor2normal[w.name]) + ) if tumor2normal[w.name] else "", + # Creating optional reheader for paired normal, + # resolves to "\nNORMAL\t${normalName}" + normal_header = lambda w: "\\nNORMAL\\t{0}".format( + tumor2normal[w.name] + ) if tumor2normal[w.name] else "", + threads: + max(int(allocated("threads", "strelka_format", cluster))-1, 1) + container: config['images']['genome-seek_cnv'] + envmodules: + config['tools']['rlang'], + shell: """ + #Adding AD annotation to VCF + java -Xmx{params.memory}g -cp {params.purple_jar} \\ + com.hartwig.hmftools.purple.tools.AnnotateStrelkaWithAllelicDepth \\ + -in {input.rehead} \\ + -out {output.final} """ @@ -726,108 +849,6 @@ rule somatic_selectvar: {output.filt} """ - -rule somatic_split_tumor: - """Data-processing step to post-process vcf file generated by all the - somatic callers. This step takes a re-header, filtered, and normalized - somatic vcf file and filters the callset to only contain sites in the - tumor sample (needed due to mixed sites in the tumor-normal callset). - @Input: - Per sample, per caller somatic variants - @Output: - Per sample, per caller somatic variants only found in the tumor sample - """ - input: - vcf = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), - output: - tmp = temp(join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.tumor.temp.vcf.gz")), - header = temp(join(workpath, "{caller}", "somatic", "{name}.{caller}.tumor.samples")), - tumor = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.tumor.vcf.gz"), - tbi = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.tumor.vcf.gz.tbi"), - params: - rname = 'tumorsplit', - sample = '{name}', - rename = '{caller}:{name}', - memory = allocated("mem", "somatic_split_tumor", cluster).rstrip('G'), - threads: - int(allocated("threads", "somatic_split_tumor", cluster)) - container: config['images']['genome-seek_somatic'] - envmodules: config['tools']['bcftools'] - shell: """ - # Filter call set to tumor sites - bcftools view \\ - -c1 \\ - -Oz \\ - -s '{params.sample}' \\ - -o {output.tmp} \\ - {input.vcf} - # Renaming sample name in VCF - # to contain caller name - echo -e "{params.sample}\\t{params.rename}" \\ - > {output.header} - bcftools reheader \\ - -o {output.tumor} \\ - -s {output.header} \\ - {output.tmp} - # Create an VCF index for intersect - bcftools index \\ - -f \\ - --tbi \\ - {output.tumor} - """ - - -rule somatic_split_normal: - """Data-processing step to post-process vcf file generated by all the - somatic callers. This step takes a re-header, filtered, and normalized - somatic vcf file and filters the callset to only contain sites in the - normal sample (needed due to mixed sites in the tumor-normal callset). - @Input: - Per sample, per caller somatic variants - @Output: - Per sample, per caller somatic variants only found in the normal sample - """ - input: - vcf = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), - output: - tmp = temp(join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.normal.temp.vcf.gz")), - header = temp(join(workpath, "{caller}", "somatic", "{name}.{caller}.normal.samples")), - normal = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.normal.vcf.gz"), - tbi = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.normal.vcf.gz.tbi"), - params: - rname = 'normalsplit', - sample = lambda w: "{0}".format(tumor2normal[w.name]), - rename = lambda w: "{0}:{1}".format(w.caller, tumor2normal[w.name]), - memory = allocated("mem", "somatic_split_normal", cluster).rstrip('G'), - threads: - int(allocated("threads", "somatic_split_normal", cluster)) - container: config['images']['genome-seek_somatic'] - envmodules: config['tools']['bcftools'] - shell: """ - # Filter call set to normal sites - bcftools view \\ - --force-samples \\ - -c1 \\ - -Oz \\ - -s '{params.sample}' \\ - -o {output.tmp} \\ - {input.vcf} - # Renaming sample name in VCF - # to contain caller name - echo -e "{params.sample}\\t{params.rename}" \\ - > {output.header} - bcftools reheader \\ - -o {output.normal} \\ - -s {output.header} \\ - {output.tmp} - # Create an VCF index for intersect - bcftools index \\ - -f \\ - --tbi \\ - {output.normal} - """ - - rule somatic_merge_tumor: """Data-processing step to post-process vcf file generated by all the somatic callers. This step takes filtered tumor sample callsets from @@ -839,51 +860,49 @@ rule somatic_merge_tumor: Variants found in at least 2 callers """ input: - tumors = get_somatic_merge_tumor, + tn_callers = get_somatic_tn_callers, + octopus = join(workpath, "octopus", "somatic", "{name}.octopus.filtered.norm.vcf"), + mutect2 = join(workpath, "mutect2", "somatic", "{name}.mutect2.filtered.norm.vcf"), + # muse = join(workpath, "muse", "somatic", "{name}.muse.filtered.norm.vcf"), + # strelka = join(workpath, "strelka", "somatic", "{name}.strelka.filtered.norm.vcf"), output: - lsl = join(workpath, "merged", "somatic", "{name}.intersect.lsl"), - merged = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.vcf.gz"), - tbi = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.vcf.gz.tbi"), + merged = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.vcf.gz"), params: rname = 'tumormerge', + genome = config['references']['GENOME'], memory = allocated("mem", "somatic_merge_tumor", cluster).rstrip('G'), - isec_dir = join(workpath, "merged", "somatic", "intersect"), + tmpdir = tmpdir, + # Dynamically update the priority list + # based on wether a sample is a tumor-only + # or a tumor-normal + priority_list = lambda w: "mutect2,octopus,muse,strelka" \ + if tumor2normal[w.name] else "mutect2,octopus", + strelka_option = lambda w: "--variant:strelka {0}.strelka.filtered.norm.vcf".format( + join(workpath, "strelka", "somatic", w.name) + ) if tumor2normal[w.name] else "", + muse_option = lambda w: "--variant:muse {0}.muse.filtered.norm.vcf".format( + join(workpath, "muse", "somatic", w.name) + ) if tumor2normal[w.name] else "", threads: int(allocated("threads", "somatic_merge_tumor", cluster)) container: config['images']['genome-seek_somatic'] - envmodules: config['tools']['bcftools'] + envmodules: config['tools']['gatk3'] shell: """ - # Delete previous attempts output - # directory to ensure hard restart - if [ -d "{params.isec_dir}" ]; then - rm -rf "{params.isec_dir}" - fi # Intersect somatic callset to find # variants in at least two callers - bcftools isec \\ - -Oz \\ - -n+2 \\ - -c none \\ - -p {params.isec_dir} \\ - {input.tumors} - # Create list of files to merge - find {params.isec_dir} \\ - -name '*.vcf.gz' \\ - | sort \\ - > {output.lsl} - # Merge variants found in at - # least two callers - bcftools merge \\ - -Oz \\ - -o {output.merged} \\ - -l {output.lsl} - # Create an VCF index for merge - bcftools index \\ - -f \\ - --tbi \\ - {output.merged} - """ + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + java -Xmx{params.memory}g -Djava.io.tmpdir=${{tmp}} \\ + -XX:ParallelGCThreads={threads} -jar $GATK_JAR -T CombineVariants \\ + -R {params.genome} \\ + --filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \\ + --genotypemergeoption PRIORITIZE \\ + --rod_priority_list {params.priority_list} \\ + -o {output.merged} \\ + --variant:octopus {input.octopus} --variant:mutect2 {input.mutect2} {params.strelka_option} {params.muse_option} + """ rule somatic_sample_maf: """Data-processing step to convert the merged somatic calls from @@ -897,11 +916,11 @@ rule somatic_sample_maf: Annotated, merged MAF file contaning somatic callsets """ input: - vcf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.vcf.gz"), + vcf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.vcf.gz"), output: - vcf = temp(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.temp.vcf")), - vep = temp(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.temp.vep.vcf")), - maf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.maf"), + vcf = temp(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.temp.vcf")), + vep = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.temp.vep.vcf"), + maf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.maf"), params: rname = 'samplemaf', tumor = '{name}', @@ -933,7 +952,8 @@ rule somatic_sample_maf: --vep-forks {threads} \\ --tumor-id {params.tumor} {params.normal_option} \\ --ncbi-build {params.vep_build} \\ - --species {params.vep_species} + --species {params.vep_species} \\ + --retain-info set """ @@ -946,7 +966,7 @@ rule somatic_cohort_maf: Merged somatic tumor MAF, cohort-level, with all call sets """ input: - mafs = expand(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.maf"), name=tumors), + mafs = expand(join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.maf"), name=tumors), output: maf = join(workpath, "merged", "somatic", "cohort_somatic_variants.maf"), params: @@ -1023,7 +1043,7 @@ rule somatic_sample_sigprofiler: SigProfiler Sample Portrait Plot (pdf) """ input: - maf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.tumor.maf"), + maf = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.maf"), output: pdf = join(workpath, "sigprofiler", "sample_portrait_{name}.pdf"), params: diff --git a/workflow/rules/sv.smk b/workflow/rules/sv.smk index 1f08375..9bd3c9a 100644 --- a/workflow/rules/sv.smk +++ b/workflow/rules/sv.smk @@ -73,7 +73,6 @@ rule manta_germline: -g {params.memory} """ - # Somatic SV calling rule manta_somatic: """ @@ -102,6 +101,15 @@ rule manta_somatic: normal_option = lambda w: "--normalBam {0}.recal.bam".format( join(workpath, "BAM", tumor2normal[w.name]) ) if tumor2normal[w.name] else "", + # Building command to create a renamed symlink + # called, somaticSV.vcf.gz, when run in tumor- + # only mode. Ensures a consistent output file + # names regardless of whether a sample has a + # paired normal or not. + symlink = lambda w: "ln -sf {0} {1}".format( + join(workpath, "MANTA", "somatic", w.name, "results", "variants", "tumorSV.vcf.gz"), + join(workpath, "MANTA", "somatic", w.name, "results", "variants", "somaticSV.vcf.gz"), + ) if not tumor2normal[w.name] else "", threads: int(allocated("threads", "manta_somatic", cluster)) container: config['images']['genome-seek_sv'] envmodules: config['tools']['manta'] @@ -124,5 +132,77 @@ rule manta_somatic: echo "Starting Manta workflow..." {params.workflow} \\ -j {threads} \\ - -g {params.memory} + -g {params.memory} + {params.symlink} + """ + +# Filter Somatic SV calls +rule manta_somatic_filter: + """ + Data processing step to call somatic structural variants using manta. + Manta is optimized for optimized for analysis of germline variation + in small sets of individuals and somatic variation in tumor/normal + sample pairs. Manta's Github Repo: https://github.com/Illumina/manta + @Input: + Single-sample VCF file with called somatic structural variants (scatter) + @Output: + Filtered single-sample VCF file with called somatic structural variants + """ + input: + vcf = join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.vcf.gz"), + output: + samplevcf = join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.sample.vcf.gz"), + flagged = join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.flagged.vcf.gz"), + filtered = join(workpath, "MANTA", "somatic", "{name}", "results", "variants", "somaticSV.filtered.vcf"), + params: + rname = "manta_somatic_filter", + sample = "{name}", + outdir = join(workpath, "MANTA", "somatic", "{name}"), + filtermanta = join(workpath, "workflow", "scripts", "FilterManta.pl"), + genome = config['references']['GENOME'], + filter_ref = config['references']['MANTA_FILTER_CHROMOSEQ_TRANSLOCATION'], + memory = allocated("mem", "manta_somatic_filter", cluster).rstrip('G'), + threads: int(allocated("threads", "manta_somatic_filter", cluster)) + container: config['images']['genome-seek_sv'] + envmodules: + config['tools']['bcftools'], + config['tools']['samtools'], + config['tools']['bedtools'], + config['tools']['svtools'], + config['tools']['minimap2'], + config['tools']['perl'], + shell: """ + # Flag and filter SVs based on the + # following: read support, contig + # re-mapping, and allele fraction. + # Also removes non-PASS SVs from + # somatic SV callset. + bcftools view \\ + -s {params.sample} \\ + -O z \\ + -o {output.samplevcf} \\ + {input.vcf} + # Script to filter manta results + # from chromoseq pipeline, + # https://github.com/genome/docker-basespace_chromoseq + # Needs paths to its depedencies: + bedtools_path="$(type -P bedtools)" + minimap2_path="$(type -P minimap2)" + svtools_path="$(type -P svtools)" + echo "Paths of all dependencies... ${{bedtools_path}}:${{minimap2_path}}:${{svtools_path}}" + perl {params.filtermanta} \\ + -r {params.genome} \\ + -k {params.filter_ref} \\ + -b "${{bedtools_path}}" \\ + -p "${{minimap2_path}}" \\ + -s "${{svtools_path}}" \\ + -t {params.outdir} \\ + {output.samplevcf} \\ + {output.flagged} + # Remove non-PASS-ing SVs + bcftools view \\ + -O v \\ + -i 'FILTER=="PASS"' \\ + {output.flagged} \\ + > {output.filtered} """ diff --git a/workflow/scripts/FilterManta.pl b/workflow/scripts/FilterManta.pl new file mode 100644 index 0000000..2108e74 --- /dev/null +++ b/workflow/scripts/FilterManta.pl @@ -0,0 +1,312 @@ +#!/usr/bin/perl + +use List::Util qw(min max); +use Getopt::Long; +use strict; + +my $svtools = "/opt/conda/envs/python2/bin/svtools"; +my $bedtools = "/usr/local/bin/bedtools"; +my $minimap = "/usr/local/bin/minimap2"; +my $tee = "/usr/bin/tee"; +my $refseq = "/gscuser/dspencer/refdata/hg38/all_sequences.fa"; +my $tmp = "/tmp"; + +my $slop = 200; +my $masklevel = 0.8; +my $PR = 2; +my $SR = 2; +my $minSVabund = 5.0; +my $delcovratio = 0.8; +my $dupcovratio = 1.3; + +my $minMQ = 0; +my $fracIdent = 0.95; + +my $knownfile = ''; + +my $usage = < + + options: + -r refseq + + [ manta hit filtering ] + -m masklevel + -pr PR reads + -sr SR reads + -a min VAF/SVabund + -l slop length for known translocations + -q min map quality of contig hits + -i min fraction identity for contig hits + + [ file/tool paths ] + -b bedtools path + -s svtools path + -p minimap path + -t tmpdir + +END + +GetOptions("pr=i" => \$PR, + "sr=i" => \$SR, + "a=f" => \$minSVabund, + "b=s" => \$bedtools, + "s=s" => \$svtools, + "p=s" => \$minimap, + "t=s" => \$tmp, + "m=f" => \$masklevel, + "r=s" => \$refseq, + "l=i" => \$slop, + "q=i" => \$minMQ, + "i=f" => \$fracIdent, + "k=s" => \$knownfile); + + +die "$usage\n" if ! -e $knownfile || ! -e $ARGV[0] || !$ARGV[1] || ! -e $svtools || ! -e $bedtools || ! -e $minimap || ! -e $refseq || $ARGV[0] !~ /vcf.gz$/; + +my $out = $ARGV[1]; + +#open manta VCF +open(VCF,"/bin/zcat $ARGV[0] |") || die "Cant open manta file $ARGV[0]"; + +# create a fastq file +open(FQ,">$tmp/input.fq") || die "cant make temporary fastq file"; + +# read in manta records +while(){ + chomp; + next if /^#/; + my @F = split("\t",$_); + + # get the contig tag, if it exists, and print sequence to a temp fastq file + if ($F[7] =~ /CONTIG=([ACGTNactgn]+)/){ + my $contig = $1; + my $quals = '#' x length($contig); + print FQ "\@$F[2]\n$contig\n+\n$quals\n"; + } +} +close VCF; +close FQ; + +my %hits = (); # hash of hits +my %totalhits = (); + +# map fastq file to reference with minimap2 and iterate through records +open(SAM,"minimap2 -N 50 -p 0.5 --mask-level 0.8 -ax sr $refseq $tmp/input.fq | $tee $tmp/minimap.sam |") || die; +open(D,">$tmp/minimap.tsv") || die; +while(){ + chomp; + + next if /^@/; + + my @F = split("\t",$_); + + $totalhits{$F[0]}++; + + my $flag = $F[1]; + + next if $flag & hex("0x800"); #skip if its a supp alignment + + next if !/SA:Z/; # must have a supplementary alignment in the SA tag + + # get supplemental hit + /SA:Z:(\S+);/; + my @l=split(",",$1); + + # check strand of primary hit + my $pstrand = "+"; + $pstrand = "-" if $flag & hex("0x10"); + + /NM:i:(\d+)/; + my $pnm = $1; + + # get the alignment coordinates on the read from the cigar string (e.g., positions X through Y are aligned) + my $cigar = $F[5]; + my $st = 0; # align start + my $alen = 0; # align length + my $qlen = 0; # read length + while($cigar =~ /(\d+)([MSHDI])/g){ # get cigar operation + my $s = $1; + my $o = $2; + $qlen += $s if $s ne 'D'; # add to the query length unless its a del + $st += $s if ($o =~ /S|H/ and $alen == 0); # change start position iff this is the first operation and its a clip + $alen += $s if ($o =~ /[MI]/); # add to the alignment length if its a M or I + } + my $pid = 1 - $pnm / $alen; + my @p = ($st+1,$st+$alen-1); # make array with (start,end) + @p = reverse(map { $qlen - $_ } @p) if ($pstrand eq '-'); # reverse if its a minus strand hit + + map { die "parse cigar failed" if $_ < 0 or $_ > $qlen } @p; + + # do same for supplemental cigar + my $scigar = $l[3]; + my $snm = $l[5]; + my $sstrand = $l[2]; + $st = 0; + $alen = 0; + $qlen = 0; + while($scigar =~ /(\d+)([MSHDI])/g){ + my $s = $1; + my $o = $2; + $qlen += $s if $s ne 'D'; # add to the query length unless its a del + $st += $s if ($o =~ /S|H/ and $alen == 0); # change start position iff this is the first operation and its a clip + $alen += $s if ($o =~ /[MI]/); # add to the alignment length if its a M or I + } + my $sid = 1 - $snm / $alen; + my @s = ($st,$st+$alen-1); + @s = reverse(map { $qlen - $_ } @s) if ($sstrand eq '-'); + + map { die "parse cigar failed" if $_ < 0 or $_ > $qlen } @s; + + # calculate MASKLEVEL (1.0 - MASKLEVEL) == fraction of query that overlaps between primary and supplemental hits + my $ov = 0; + unless ($s[0] > $p[1] or $p[0] > $s[1]){ + $ov = (min($p[1],$s[1]) - max($p[0],$s[0])) / min($p[1]-$p[0],$s[1]-$s[0]); + } + + die "masklevel failed" if $ov < 0 or $ov > 1; + + my $mq = ($F[4] < $l[4] ? $F[4] : $l[4]); + # skip if masklevel threshold exceeded + next if $ov > (1 - $masklevel); + + next unless $mq >= $minMQ && $pid > $fracIdent && $sid > $fracIdent; + + # store hits + push @{$hits{$F[0]}}, [ $F[2],$F[3]-1,$F[3],$l[0],$l[1]-1,$l[1],$F[0],$mq,$pstrand,$sstrand,$F[5] . ";" . $l[3] ]; + print D join("\t",$F[2],$F[3]-1,$F[3],$l[0],$l[1]-1,$l[1],$F[0],$mq,$pstrand,$sstrand,$F[5] . ";" . $l[3]),"\n"; +} +close SAM; +close D; + +die "no minimap hits!" if scalar keys %hits == 0; + +# get hotspot annotations using bedtools. store in hash by mantaID +my %knowntrans = (); +open(BT,"$svtools vcftobedpe -i $ARGV[0] 2> /dev/null | $bedtools pairtopair -is -slop $slop -a stdin -b $knownfile |") || die "cant run bedtools"; +while(){ + chomp; + my @F = split("\t",$_); + + # get orientation of manta hit + my $orientation= 'same'; + $orientation = 'opposite' if ($F[14] =~ /^[ACTGactg]+\]|\[[ACTGactg]+$/); + + # store name if right orientation + if ($F[$#F] eq '.' or ($orientation eq 'same' && $F[$#F] eq $F[$#F-1]) or ($orientation eq 'opposite' && $F[$#F] ne $F[$#F-1])){ + $knowntrans{$F[12]} = $F[$#F-3]; + $knowntrans{$F[15]} = $F[$#F-3]; + } +} +close BT; + +# outfile +open(O,">$out") || die; + +#open manta VCF again +open(VCF,"zcat $ARGV[0] |") || die "Cant open manta vcf file $ARGV[0]"; +while(){ + # add info tag and filters + if (/^##FILTER/){ + print O '##INFO=',"\n"; + print O '##INFO=',"\n"; + do { + print O; + $_ = ; + } while(/^##FILTER/); + print O '##FILTER=',"\n"; + print O '##FILTER=',"\n"; + print O '##FILTER=',"\n"; + next; + } elsif (/^#/){ + print O; + next; + } + + # handle records + chomp; + my @F = split("\t",$_); + + my @fmtk = split(":",$F[8]); + my @fmtv = split(":",$F[9]); + + my %fmt = (); + map { $fmt{$fmtk[$_]} = $fmtv[$_]; $fmt{$fmtk[$_]} = [ split(",",$fmt{$fmtk[$_]}) ] if $fmt{$fmtk[$_]}=~/,/; } 0..$#fmtk; + + $F[7] =~ /SVTYPE=([^;]+)/; + my $svtype = $1; + + # add hotspot tag, if exists + $F[7] .= ";KNOWNSV=" . $knowntrans{$F[2]} if (defined($knowntrans{$F[2]})); + + # add low reads filter if PR and SR reads are low or VAF is too low + my $svabund = 0.0; + my @filter = (); + @filter = split(',',$F[6]) if $F[6] ne 'PASS'; + + if (defined($fmt{PR}) and defined($fmt{SR}) and (max(@{$fmt{PR}}) + max(@{$fmt{SR}})) > 0){ + $svabund = (($fmt{PR}->[1] + $fmt{SR}->[1]) / ($fmt{PR}->[0] + $fmt{PR}->[1] + $fmt{SR}->[0] + $fmt{SR}->[1])) * 100; + } + + if (!defined($fmt{PR}) or !defined($fmt{SR}) or $fmt{PR}->[1] < $PR or $fmt{SR}->[1] < $SR or $svabund < $minSVabund){ + push @filter, "LowReads"; + + } elsif (defined($fmt{DHFFC}) and defined($fmt{DHBFC}) and ($svtype =~ /DEL/ and $fmt{DHFFC} > $delcovratio) or ($svtype =~ /DUP/ and $fmt{DHBFC} < $dupcovratio)){ + push @filter, "FailedCov"; + + # add no contig filter if no contig/imprecise breakends + } elsif ($F[7] !~ /CONTIG=/ or !defined($hits{$F[2]})){ + push @filter, "FailedContig"; + + } else { + + my $orientation= 'same'; + my $chr1 = ''; + my $chr2 = ''; + my $pos1 = ''; + my $pos2 = ''; + + # get positions + if ($F[2] =~ /DEL|INV|DUP|INS/){ + $F[7] =~/END=(\d+)/; + + $chr1 = $F[0]; + $pos1 = $F[1]; + $chr2 = $F[0]; + $pos2 = $1; + + } elsif ($F[2] =~ /BND/) { + $F[4] =~/([^\[\]]+):(\d+)/; + + $chr1 = $F[0]; + $pos1 = $F[1]; + $chr2 = $1; + $pos2 = $2; + + $orientation = 'opposite' if ($F[4] =~ /^[ACTGactg]+\]|\[[ACTGactg]+$/); + } + + # add hotspot tag, if exists + $F[7] .= ";CONTIGHITS=" . $totalhits{$F[2]}; + + # get hits that overlap, allowing for some $slop + my $foundhit = 0; + foreach my $i (@{$hits{$F[2]}}){ + $foundhit = 1 if ((($i->[0] eq $chr1 && $pos1 < $i->[2]+$slop && $pos1 > $i->[1]-$slop && + $i->[3] eq $chr2 && $pos2 < $i->[5]+$slop && $pos2 > $i->[4]-$slop) || + ($i->[0] eq $chr2 && $pos2 < $i->[2]+$slop && $pos2 > $i->[1]-$slop && + $i->[3] eq $chr1 && $pos1 < $i->[5]+$slop && $pos1 > $i->[4]-$slop)) && + (($orientation eq 'same' && $i->[8] eq $i->[9]) || ($orientation eq 'opposite' && $i->[8] ne $i->[9]))); + } + push @filter, "FailedContig" if $foundhit == 0; + + } + $F[6] = join(';',@filter); + $F[6] = "PASS" if scalar @filter == 0; + + print O join("\t",@F),"\n"; +} +close VCF; +close O; + +exit;