Merge pull request #43 from CCBR/feat_hg19

Feat hg19
CCBR · May 7, 2024 · 97ab644 · 97ab644
2 parents 09293c6 + fe3d57f
commit 97ab644
Show file tree

Hide file tree

Showing 8 changed files with 367 additions and 246 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # LOGAN 🔬 [![Docker Pulls](https://img.shields.io/docker/pulls/nciccbr/ccbr_wes_base)](https://hub.docker.com/r/nciccbr/ccbr_wes_base) [![GitHub issues](https://img.shields.io/github/issues/ccbr/LOGAN?color=brightgreen)](https://github.com/ccbr/LOGAN/issues)  [![GitHub license](https://img.shields.io/github/license/ccbr/LOGAN)](https://github.com/ccbr/LOGAN/blob/master/LICENSE) 
 
-> **_LOGAN-whoLe genOme-sequencinG Analysis pipeliNe_**. This is the home of the LOGAN Pipeline. Accurately call germline and somatic variants, CNVs, and SVs and  annotate variants!
+> **_LOGAN-whoLe genOme-sequencinG Analysis pipeliNe_**. Call germline and somatic variants, CNVs, and SVs, and annotate variants!
 
 ## Overview
 Welcome to LOGAN! Before getting started, we highly recommend reading through [LOGAN's documentation](https://ccbr.github.io/LOGAN).
@@ -72,7 +72,10 @@ Adding flags determines SNV (germline and/or somatic), SV, and/or CNV calling mo
 
 `--cnv` - Enables somatic CNV calling using FREEC, Sequenza, and Purple (hg38 only)
 
+#### Optional Arguments
+`--indelrealign` - Enables indel realignment when running alignment steps. May be helpful for certain callers (VarScan, VarDict).
 
+`--callers` - Comma-separated list of callers to run; the default is to use all available callers. Example: `--callers mutect2,octopus,vardict,varscan`
 
 ## Running LOGAN
 Example of Tumor only calling mode 
@@ -87,7 +90,7 @@ logan run --mode local -profile ci_stub --genome hg38 --outdir out --fastq_input
 logan run --mode slurm -profile biowulf,slurm --genome hg38 --outdir out --fastq_input "*R{1,2}.fastq.gz" --vc --sv --cnv
 ```
 
-We currently support the hg38 and mm10 genomes. 
+We currently support the hg38, hg19 (in progress), and mm10 genomes. 
 
 
 

diff --git a/conf/genomes.config b/conf/genomes.config
@@ -7,8 +7,6 @@ params {
             genomedict= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.dict"
             wgsregion = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/resources_broad_hg38_v0_wgs_calling_regions.hg38.interval_list"
             intervals= "${projectDir}/assets/hg38_v0_wgs_calling_regions.hg38.bed"
-            millsindel = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" //Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
-            shapeitindel =  "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" //ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" //file(params.gold_indels2) //
             INDELREF = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" //ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" 
             KNOWNINDELS = "-known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz"
             KNOWNRECAL = '--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz'
@@ -31,10 +29,39 @@ params {
             SEQUENZAGC = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz"
             chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
         }
+
+        'hg19' { // hg19 / GRCh37 resource set; several entries below still reference files under hg38 paths
+            genome = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa"
+            genomefai = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa.fai"
+            bwagenome= "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa" // NOTE(review): hs37d5 here vs hg19.with_extra for 'genome' — confirm the two references are meant to differ
+            genomedict= "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.dict" // NOTE(review): dict is for hs37d5, not hg19.with_extra — confirm
+            intervals= "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg19/hg19_noblacklistsort_vc.bed"
+            INDELREF = "/fdb/GATK_resource_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" // NOTE(review): b37 bundle file — confirm contig naming matches the 'genome' FASTA
+            KNOWNINDELS = "-known /fdb/GATK_resource_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf -known /fdb/GATK_resource_bundle/b37/1000G_phase1.indels.b37.vcf"
+            KNOWNRECAL = '--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz' // NOTE(review): every --known-sites path is an hg38 file — confirm hg19 equivalents
+            dbsnp = "/fdb/GATK_resource_bundle/hg19-2.8/dbsnp_138.hg19.vcf.gz"
+            gnomad = '--germline-resource /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz' // NOTE(review): germline resource is the hg38 gnomAD file — confirm hg19 equivalent
+            pon = "/data/CCBR_Pipeliner/db/PipeDB/lib/GRCh37.noCOSMIC_ClinVar.pon.vcf.gz"   
+            kgp = "/fdb/GATK_resource_bundle/hg19-2.8/dbsnp_138.hg19.vcf.gz" // NOTE(review): identical to the 'dbsnp' path above — confirm this is the intended file
+            KRAKENBACDB = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/kraken/20180907_standard_kraken2"
+            snpeff_genome = "GRCh37.75"
+            snpeff_config = "/usr/local/apps/snpEff/4.3t/snpEff.config"
+            snpeff_bundle = "/usr/local/apps/snpEff/4.3t/snpEff.confi" // NOTE(review): path ends in "snpEff.confi" — looks truncated; verify
+            sites_vcf= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/sites.hg38.vcf.gz" // NOTE(review): hg38 sites file — confirm hg19 equivalent
+            somalier_ancestrydb="/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/1kg-somalier"
+            vepcache = "/fdb/VEP/102/cache"
+            vepspecies = "homo_sapiens"
+            vepbuild = "GRCh37"
+            annotsvgenome = "GRCh37"
+            octopus_sforest= "--somatic-forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/somatic.v0.7.4.forest" // NOTE(review): stored under the hg38 resources directory — confirm it applies to hg19
+            octopus_gforest= "--forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/germline.v0.7.4.forest"
+            SEQUENZAGC = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz" // NOTE(review): hg38 GC file — confirm hg19 equivalent
+            chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
+        }
 
         'mm10' {
-            genome = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/genome.fa" // file(params.genome)
-            genomefai = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/genome.fa.fai" // file(params.genome)
+            genome = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/genome.fa" 
+            genomefai = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/genome.fa.fai" 
             bwagenome= "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa"
             genomedict= "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/genome.dict"
             intervals="/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwamem2index/mm10_wgsregions.bed"

diff --git a/modules/local/splitbed.nf b/modules/local/splitbed.nf
@@ -30,4 +30,10 @@ bedtools subtract -a GRCh38.primary_assembly.genome.bed -b ../hg38.blacklist.bed
 
 gatk BedToIntervalList -I GRCh38.primary_assembly.genome.interval.bed -O \ 
 GRCh38.primary_assembly.genome.interval_list -SD GRCh38.primary_assembly.genome.dict
+
+#hg19
+awk -F '\t' '{printf("%s\t0\t%s\n",$1,$2);}' /data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa.fai >hg19_all.bed
+bedtools subtract -a hg19_all.bed -b hg19-blacklist.v2.bed > hg19_noblacklist.bed
+bedtools sort -i hg19_noblacklist.bed -chrThenSizeD  >hg19_noblacklistsort.bed
+awk '/^chr[0-9,X,Y,M]*\t/ {printf("%s\t%s\t%s\n",$1,$2,$3);}' hg19_noblacklistsort.bed  > hg19_noblacklistsort_vc.bed
 */
diff --git a/modules/local/variant_calling.nf b/modules/local/variant_calling.nf
@@ -472,6 +472,7 @@ process octopus_tn {
     $GERMLINE_FOREST \
     $SOMATIC_FOREST \
     --target-working-memory 64Gb \
+    -B 64Gb \
     -o ${tumorname}_vs_${normalname}_${bed.simpleName}.octopus.vcf.gz
     """
 
@@ -759,7 +760,7 @@ process somaticcombine {
 
     input:
         tuple val(tumorsample), val(normal),
-        val(callers),
+        val(caller),
         path(vcfs), path(vcfindex)
 
     output:
@@ -768,24 +769,27 @@ process somaticcombine {
         path("${tumorsample}_vs_${normal}_combined.vcf.gz.tbi")
 
     script:
-        vcfin1=[callers, vcfs].transpose().collect { a, b -> a + " " + b }
+        vcfin1=[caller, vcfs].transpose().collect { a, b -> a + " " + b }
         vcfin2="-V:" + vcfin1.join(" -V:")
 
+        callerin=caller.join(",")
     """
     /usr/lib/jvm/java-8-openjdk-amd64/bin/java -jar \$GATK_JAR -T CombineVariants  \
         -R $GENOMEREF \
         --genotypemergeoption PRIORITIZE \
-        --rod_priority_list mutect2,strelka,octopus,muse,lofreq,vardict,varscan \
+        --rod_priority_list $callerin \
         --filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \
         -o ${tumorsample}_vs_${normal}_combined.vcf.gz \
         $vcfin2
         
     """
 
     stub:
-    vcfin1=[callers, vcfs].transpose().collect { a, b -> a + " " + b }
+    vcfin1=[caller, vcfs].transpose().collect { a, b -> a + " " + b }
     vcfin2="-V:" + vcfin1.join(" -V:")
 
+    callerin=caller.join(",")
+
     """
     touch ${tumorsample}_vs_${normal}_combined.vcf.gz
     touch ${tumorsample}_vs_${normal}_combined.vcf.gz.tbi

diff --git a/modules/local/variant_calling_tonly.nf b/modules/local/variant_calling_tonly.nf
@@ -339,9 +339,11 @@ process octopus_tonly {
     octopus -R $GENOMEREF -C cancer -I ${tumor} \
     --annotations AC AD DP \
     --target-working-memory 64Gb \
+    -B 64Gb \
     -t ${bed} \
+    --threads ${task.cpus}\
     $SOMATIC_FOREST \
-    -o ${tumorname}_${bed.simpleName}.tonly.octopus.vcf.gz --threads ${task.cpus}
+    -o ${tumorname}_${bed.simpleName}.tonly.octopus.vcf.gz 
     """
 
     stub:
@@ -385,7 +387,7 @@ process somaticcombine_tonly {
 
     input:
         tuple val(tumorsample),
-        val(callers),
+        val(caller),
         path(vcfs), path(vcfindex)
 
     output:
@@ -394,20 +396,27 @@ process somaticcombine_tonly {
         path("${tumorsample}_combined_tonly.vcf.gz.tbi")
 
     script:
-        vcfin1=[callers, vcfs].transpose().collect { a, b -> a + " " + b }
+        vcfin1=[caller, vcfs].transpose().collect { a, b -> a + " " + b }
         vcfin2="-V:" + vcfin1.join(" -V:")
 
+        callerin=caller.join(",").replaceAll("_tonly","")
+
     """
     /usr/lib/jvm/java-8-openjdk-amd64/bin/java -jar \$GATK_JAR -T CombineVariants  \
         -R $GENOMEREF \
         --genotypemergeoption PRIORITIZE \
-        --rod_priority_list mutect2_tonly,octopus_tonly,vardict_tonly,varscan_tonly \
+        --rod_priority_list $callerin \
         --filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \
         -o ${tumorsample}_combined_tonly.vcf.gz \
         $vcfin2
     """
 
     stub:
+
+    vcfin1=[caller, vcfs].transpose().collect { a, b -> a + " " + b }
+    vcfin2="-V:" + vcfin1.join(" -V:")
+
+    callerin=caller.join(",").replaceAll("_tonly","")
     """
     touch ${tumorsample}_combined_tonly.vcf.gz ${tumorsample}_combined_tonly.vcf.gz.tbi
     """

diff --git a/nextflow.config b/nextflow.config
@@ -27,9 +27,10 @@ params {
     script_freecpaired = "${projectDir}/bin/make_freec_genome_paired.pl"
     freec_significance = "${projectDir}/bin/assess_significance.R"
     freec_plot = "${projectDir}/bin/makeGraph.R"
+
     lofreq_convert = "${projectDir}/bin/add_gt_lofreq.sh"
     split_regions = "24" //Number of regions to split by
-
+    
     vep_cache = "/fdb/VEP/102/cache"
 
     //SUB WORKFLOWS to SPLIT
@@ -40,16 +41,15 @@ params {
     qc=null
     bam=null
     indelrealign=null
+
     //Set all Inputs to null
     sample_sheet=null
-
     fastq_file_input=null
     bam_file_input=null
     file_input=null
 
     fastq_input=null
     bam_input=null
-
     BAMINPUT=null
 
     publish_dir_mode = 'symlink'