diff --git a/compute5/NGS_DNA/batchIDList_NO.csv b/compute5/NGS_DNA/batchIDList_NO.csv new file mode 100755 index 00000000..b6b0c280 --- /dev/null +++ b/compute5/NGS_DNA/batchIDList_NO.csv @@ -0,0 +1,2 @@ +batchID +1 diff --git a/compute5/NGS_DNA/batchIDList_chr.csv b/compute5/NGS_DNA/batchIDList_chr.csv new file mode 100755 index 00000000..f8f20faf --- /dev/null +++ b/compute5/NGS_DNA/batchIDList_chr.csv @@ -0,0 +1,24 @@ +batchID +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +X diff --git a/compute5/NGS_DNA/generate_template.sh b/compute5/NGS_DNA/generate_template.sh index 7ad4345a..975724c4 100755 --- a/compute5/NGS_DNA/generate_template.sh +++ b/compute5/NGS_DNA/generate_template.sh @@ -8,7 +8,7 @@ PROJECT=projectXX TMPDIR=tmpXX WORKDIR="/groups/umcg-gaf/${TMPDIR}" RUNID=runXX -## For small batchsize (6) leave BATCH empty, else choose _exome (10 batches) or _wgs (20 batches) +## For small batchsize (6) leave BATCH empty, _chr (per chromosome), _NO (1 batch), _exome (10 batches) or _wgs (20 batches) BATCH="" SAMPLESIZE=$(cat ${WORKDIR}/generatedscripts/${PROJECT}/${PROJECT}.csv | wc -l) diff --git a/compute5/NGS_DNA/parameters.csv b/compute5/NGS_DNA/parameters.csv index bb8d3fce..d4f572be 100755 --- a/compute5/NGS_DNA/parameters.csv +++ b/compute5/NGS_DNA/parameters.csv @@ -9,16 +9,16 @@ checkStage,module list ### Tool versions #### bwaVersion,BWA/0.7.12-goolf-1.7.20 -computeVersion,v15.11.1-Java-1.8.0_45 +computeVersion,v15.12.4-Java-1.8.0_45 cutadaptVersion,1.8.1-goolf-1.7.20-Python-2.7.9 dbNSFPVersion,2.7 dellyVersion,v0.7.1 fastqcVersion,FastQC/0.11.3-Java-1.7.0_80 gatkVersion,GATK/3.4-0-Java-1.7.0_80 -javaVersion,Java/1.7.0_80 +javaVersion,Java/1.8.0_45 picardVersion,picard/1.130-Java-1.7.0_80 rVersion,R/3.2.1-goolf-1.7.20 -sambambaVersion,sambamba_v0.5.9 +sambambaVersion,sambamba/v0.5.9-goolf-1.7.20 samtoolsVersion,SAMtools/1.2-goolf-1.7.20 snpEffVersion,snpEff/4.1g-Java-1.7.0_80 tabixVersion,tabix/0.2.6-goolf-1.7.20 @@ 
-26,6 +26,7 @@ molgenisAnnotatorVersion,CmdLineAnnotator/1.9.0-Java-1.8.0_45 hpoVersion,90 gatkJar,GenomeAnalysisTK.jar picardJar,picard.jar +sambambaTool,sambamba_v0.5.9 ##### GENERAL DIRECTORIES ##### dataDir,${root}/data/ @@ -59,7 +60,7 @@ indexFileDictionary,${indicesDir}/${indexFileIDPhiX}.dict #### Prefixes #### runPrefix,${sequencingStartDate}_${sequencer}_${run}_${flowcell} filePrefix,${runPrefix}_L${lane} -sample,${intermediateDir}/${externalSampleID} +sampleNameID,${intermediateDir}/${externalSampleID} #### INTERVALS #### nameBed,captured @@ -96,7 +97,7 @@ phiXPrefix,150504_WGSIM_9999_ZZZZZZZZXX phiXEnd1Gz,${humanPhiXdir}/${phiXPrefix}/${phiXPrefix}_L9_ZZZZZZ_1.${rawFileExt} phiXEnd2Gz,${humanPhiXdir}/${phiXPrefix}/${phiXPrefix}_L9_ZZZZZZ_2.${rawFileExt} -### Protocols 5, 6, 7, 8 and 9 (SamToBam, SortBam, MergeBam, MarkDuplicates, IndelRealignment) ### +### Protocols 5, 6, 7, 8 and 9 (SamToBam, SortBam, MergeBam, MarkDuplicates) ### samToBamJar,SamFormatConverter sortSamJar,SortSam mergeSamFilesJar,MergeSamFiles @@ -105,11 +106,15 @@ alignedSam,${fileWithIndexId}.sam alignedBam,${fileWithIndexId}.bam alignedSortedBam,${fileWithIndexId}.sorted.bam alignedSortedBamIdx,${fileWithIndexId}.sorted.bai -sampleMergedBam,${sample}.merged.bam -sampleMergedBamIdx,${sample}.merged.bai -dedupBam,${sample}.merged.dedup.bam -dedupBamIdx,${sample}.merged.dedup.bam.bai -dedupMetrics,${sample}.merged.dedup.metrics, +sampleMergedBam,${sampleNameID}.merged.bam +sampleMergedBai,${sampleNameID}.merged.bai +sampleMergedBamIdx,${sampleNameID}.merged.bam.bai +dedupBam,${sampleNameID}.merged.dedup.bam +dedupBamIdx,${sampleNameID}.merged.dedup.bam.bai +dedupMetrics,${sampleNameID}.merged.dedup.metrics, +dedupBamCram,${sampleNameID}.merged.dedup.bam.cram +dedupBamCramIdx,${sampleNameID}.merged.dedup.bam.cram.bai +dedupBamCramBam,${sampleNameID}.merged.dedup.bam.cram.bam KGPhase1IndelsVcf,${indicesDir}/indels/1000G_phase1.indels.b37.vcf KGPhase1IndelsVcfIdx,${KGPhase1IndelsVcf}.idx 
MillsGoldStandardDir,${indicesDir}/Mills_and_1000G_gold_standard/ @@ -117,23 +122,29 @@ MillsGoldStandardIndelsVcf,${MillsGoldStandardDir}/1000G_phase1.indels_Mills_and MillsGoldStandardChr1Intervals,${MillsGoldStandardDir}/1000G_phase1.indels_Mills_and_1000G_gold_standard.indels.b37.human_g1k_v37.chr1.intervals ### Protocols 11, 12, 13 and 14 (CheckSex, Delly, CoveragePerBase, SequonomConcordanceCheck, CollectBamMetrics) ### -whichSex,${sample}.chosenSex.txt -checkSexMeanCoverage,${sample}.checkSex.filter.meancoverage.txt +whichSex,${sampleNameID}.chosenSex.txt +checkSexMeanCoverage,${sampleNameID}.checkSex.filter.meancoverage.txt capturedIntervals_nonAutoChrX,${intermediateDir}/${nameBed}.nonAutosomalChrX.interval_list -familyList,${sample}.familylist.txt -arrayMapFile,${sample}.concordance.map -sampleConcordanceFile,${sample}.concordance.ngsVSarray.txt +familyList,${sampleNameID}.familylist.txt +arrayMapFile,${sampleNameID}.concordance.map +sampleConcordanceFile,${sampleNameID}.concordance.ngsVSarray.txt sequenomReport,${tmpDataDir}/rawdata/array/${project}_Sequenom_Report.txt sequenomInfo,${sequenomDir}/Sequonome_SNPinfo.txt collectMultipleMetricsJar,CollectMultipleMetrics +bamIndexStats,${dedupBam}.bam_index_stats +bamIndexStatsJar,BamIndexStats +gcBiasMetrics,${dedupBam}.gc_bias_metrics gcBiasMetricsJar,CollectGcBiasMetrics +insertSizeMetrics,${dedupBam}.insert_size_metrics hsMetricsJar,CalculateHsMetrics +hsMetrics,${dedupBam}.hs_metrics +hsMetricsNonAutosomalRegionChrX,${dedupBam}.nonAutosomalRegionChrX_hs_metrics recreateInsertSizePdfR,createInsertSizePlot_c5.R -bamIndexStatsJar,BamIndexStats -projectDellyAnnotatorOutputVcf,${intermediateDir}/${project}.delly.snpeff.hpo.vcf +projectDellyAnnotatorOutputVcf,${sampleNameID}.delly.snpeff.hpo.vcf collectBamMetricsPrefix,${intermediateDir}/${externalSampleID}.merged.dedup hpoTerms,${hpoDir}/build.${hpoVersion}/ALL_SOURCES_TYPICAL_FEATURES_diseases_to_genes_to_phenotypes.txt 
-dellyVcf,${intermediateDir}/${project}.delly.vcf +dellyVcf,${sampleNameID}.delly.vcf +dellySnpEffVcf,${sampleNameID}.delly.snpeff.vcf dellyTypeDEL,DEL dellyTypeDUP,DUP dellyTypeINV,INV @@ -144,11 +155,11 @@ dbSNP137Vcf,${dbSNPDir}/dbsnp_137.b37.vcf dbSNP137VcfIdx,${dbSNP137Vcf}.idx dbSNPExSiteAfter129Vcf,${dbSNPDir}/dbsnp_137.b37.excluding_sites_after_129.vcf dbSNPExSiteAfter129VcfIdx,${dbSNPExSiteAfter129Vcf}.idx -sampleBatchVariantCalls,${sample}.batch-${batchID}.variant.calls.g.vcf +sampleBatchVariantCalls,${sampleNameID}.batch-${batchID}.variant.calls.g.vcf sampleBatchVariantCallsIdx,${sampleBatchVariantCalls}.idx -sampleBatchVariantCallsFemale,${sample}.batch-${batchID}.chrX.female.variant.calls.g.vcf +sampleBatchVariantCallsFemale,${sampleNameID}.batch-${batchID}.chrX.female.variant.calls.g.vcf sampleBatchVariantCallsFemaleIdx,${sampleBatchVariantCallsFemale}.idx -sampleBatchVariantCallsMaleNONPAR,${sample}.batch-${batchID}.chrX.male.NONPAR.variant.calls.g.vcf +sampleBatchVariantCallsMaleNONPAR,${sampleNameID}.batch-${batchID}.chrX.male.NONPAR.variant.calls.g.vcf sampleBatchVariantCallsMaleNONPARIdx,${sampleBatchVariantCallsMaleNONPAR}.idx projectBatchCombinedVariantCalls,${projectPrefix}.batch-${batchID}.variant.calls.combined.g.vcf projectBatchGenotypedVariantCalls,${projectPrefix}.batch-${batchID}.variant.calls.genotyped.vcf @@ -163,12 +174,12 @@ snpEffGenesTxt,${projectPrefix}.snpEff.calls.genes.txt snpEffCallsVcf,${projectPrefix}.calls.snpEff.vcf snpEffCallsSortedVcf,${projectPrefix}.calls.snpEff.sorted.vcf dbNSFP,${dbNSFPDir}/${dbNSFPVersion}/dbNSFP${dbNSFPVersion}.txt.gz -dbNSFPSampleVcf,${sample}.snpEff.annotated.snps.dbnsfp.vcf +dbNSFPSampleVcf,${sampleNameID}.snpEff.annotated.snps.dbnsfp.vcf variantAnnotatorOutputVcf,${projectPrefix}.snpEff.annotated.vcf -variantAnnotatorSampleOutputIndelsVcf,${sample}.snpEff.annotated.indels.vcf -variantAnnotatorSampleOutputSnpsVcf,${sample}.snpEff.annotated.snps.vcf 
-variantAnnotatorSampleOutputIndelsFilteredVcf,${sample}.snpEff.annotated.filtered.indels.vcf -variantAnnotatorSampleOutputSnpsFilteredVcf,${sample}.snpEff.annotated.filtered.snps.vcf +variantAnnotatorSampleOutputIndelsVcf,${sampleNameID}.snpEff.annotated.indels.vcf +variantAnnotatorSampleOutputSnpsVcf,${sampleNameID}.snpEff.annotated.snps.vcf +variantAnnotatorSampleOutputIndelsFilteredVcf,${sampleNameID}.snpEff.annotated.filtered.indels.vcf +variantAnnotatorSampleOutputSnpsFilteredVcf,${sampleNameID}.snpEff.annotated.filtered.snps.vcf ### Protocols 24 and 25 (VcfToTable, QCReport) ### variantsFinalProjectVcfTable,${projectPrefix}.final.vcf.table diff --git a/compute5/NGS_DNA/protocols/CollectBamIndexMetrics.sh b/compute5/NGS_DNA/protocols/CollectBamIndexMetrics.sh index 6d4c937e..9dae165a 100755 --- a/compute5/NGS_DNA/protocols/CollectBamIndexMetrics.sh +++ b/compute5/NGS_DNA/protocols/CollectBamIndexMetrics.sh @@ -8,16 +8,17 @@ #string bamIndexStatsJar #string dedupBam #string dedupBamIdx -#string collectBamMetricsPrefix #string tempDir #string capturingKit #string picardJar +#string bamIndexStats +#string project #Load Picard module ${stage} ${picardVersion} -makeTmpDir ${collectBamMetricsPrefix} -tmpCollectBamMetricsPrefix=${MC_tmpFile} +makeTmpDir ${bamIndexStats} +tmpBamIndexStats=${MC_tmpFile} #Run Picard BamIndexStats @@ -25,8 +26,8 @@ java -jar -Xmx4g ${EBROOTPICARD}/${picardJar} ${bamIndexStatsJar} \ INPUT=${dedupBam} \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=${tempDir} \ -> ${tmpCollectBamMetricsPrefix}.bam_index_stats +> ${tmpBamIndexStats} -echo -e "\nBamIndexStats finished succesfull. 
Moving temp files to final.\n\n" -mv ${tmpCollectBamMetricsPrefix}.bam_index_stats ${dedupBam}.bam_index_stats +mv ${tmpBamIndexStats} ${bamIndexStats} +echo "moved ${tmpBamIndexStats} to ${bamIndexStats}" diff --git a/compute5/NGS_DNA/protocols/CollectGCBiasMetrics.sh b/compute5/NGS_DNA/protocols/CollectGCBiasMetrics.sh index 59118e9a..696627f0 100755 --- a/compute5/NGS_DNA/protocols/CollectGCBiasMetrics.sh +++ b/compute5/NGS_DNA/protocols/CollectGCBiasMetrics.sh @@ -16,6 +16,9 @@ #string capturingKit #string seqType #string picardJar +#string insertSizeMetrics +#string gcBiasMetrics +#string project #Load Picard module ${stage} ${picardVersion} @@ -25,30 +28,30 @@ ${stage} ${rVersion} ${stage} ngs-utils ${checkStage} -makeTmpDir ${collectBamMetricsPrefix} -tmpCollectBamMetricsPrefix=${MC_tmpFile} +makeTmpDir ${gcBiasMetrics} +tmpGcBiasMetrics=${MC_tmpFile} #Run Picard GcBiasMetrics java -XX:ParallelGCThreads=4 -jar -Xmx4g ${EBROOTPICARD}/${picardJar} ${gcBiasMetricsJar} \ R=${indexFile} \ I=${dedupBam} \ -O=${tmpCollectBamMetricsPrefix}.gc_bias_metrics \ -CHART=${tmpCollectBamMetricsPrefix}.gc_bias_metrics.pdf \ +O=${tmpGcBiasMetrics} \ +CHART=${tmpGcBiasMetrics}.pdf \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=${tempDir} echo -e "\nGcBiasMetrics finished succesfull. 
Moving temp files to final.\n\n" - mv ${tmpCollectBamMetricsPrefix}.gc_bias_metrics ${dedupBam}.gc_bias_metrics - mv ${tmpCollectBamMetricsPrefix}.gc_bias_metrics.pdf ${dedupBam}.gc_bias_metrics.pdf + mv ${tmpGcBiasMetrics} ${gcBiasMetrics} + mv ${tmpGcBiasMetrics}.pdf ${gcBiasMetrics}.pdf ######IS THIS STILL NEEDED, IMPROVEMENTS/UPDATES TO BE DONE?##### #Create nicer insertsize plots if seqType is PE #if [ "${seqType}" == "PE" ] #then # Overwrite the PDFs that were just created by nicer onces: - ${recreateInsertSizePdfR} \ - --insertSizeMetrics ${dedupBam}.insert_size_metrics \ - --pdf ${dedupBam}.insert_size_histogram.pdf +${recreateInsertSizePdfR} \ +--insertSizeMetrics ${insertSizeMetrics} \ +--pdf ${insertSizeMetrics}.pdf #else # Don't do insert size analysis because seqType != "PE" diff --git a/compute5/NGS_DNA/protocols/CollectHSMetrics.sh b/compute5/NGS_DNA/protocols/CollectHSMetrics.sh index 491aaae6..d6ab45fa 100755 --- a/compute5/NGS_DNA/protocols/CollectHSMetrics.sh +++ b/compute5/NGS_DNA/protocols/CollectHSMetrics.sh @@ -6,43 +6,45 @@ #string checkStage #string picardVersion #string hsMetricsJar +#string hsMetrics #string dedupBam #string dedupBamIdx -#string collectBamMetricsPrefix #string tempDir #string recreateInsertSizePdfR #string capturedIntervals #string capturingKit #string picardJar +#string project #Load Picard module ${stage} ${picardVersion} -makeTmpDir ${collectBamMetricsPrefix} -tmpCollectBamMetricsPrefix=${MC_tmpFile} +makeTmpDir ${hsMetrics} +tmpHsMetrics=${MC_tmpFile} #Run Picard HsMetrics if capturingKit was used if [ "${capturingKit}" != "None" ] then java -jar -Xmx4g ${EBROOTPICARD}/${picardJar} ${hsMetricsJar} \ INPUT=${dedupBam} \ - OUTPUT=${tmpCollectBamMetricsPrefix}.hs_metrics \ + OUTPUT=${tmpHsMetrics} \ BAIT_INTERVALS=${capturedIntervals} \ TARGET_INTERVALS=${capturedIntervals} \ VALIDATION_STRINGENCY=LENIENT \ TMP_DIR=${tempDir} else - echo "## net.sf.picard.metrics.StringHeader" > ${tmpCollectBamMetricsPrefix}.hs_metrics - 
echo "#" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "## net.sf.picard.metrics.StringHeader" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "#" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "## METRICS CLASS net.sf.picard.analysis.directed.HsMetrics" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "BAIT_SET GENOME_SIZE BAIT_TERRITORY TARGET_TERRITORY BAIT_DESIGN_EFFICIENCY TOTAL_READS PF_READS PF_UNIQUE_READS PCT_PF_READS PCT_PF_UQ_READS PF_UQ_READS_ALIGNED PCT_PF_UQ_READS_ALIGNED PF_UQ_BASES_ALIGNED ON_BAIT_BASES NEAR_BAIT_BASES OFF_BAIT_BASES ON_TARGET_BASES PCT_SELECTED_BASES PCT_OFF_BAIT ON_BAIT_VS_SELECTED MEAN_BAIT_COVERAGE MEAN_TARGET_COVERAGE PCT_USABLE_BASES_ON_BAIT PCT_USABLE_BASES_ON_TARGET FOLD_ENRICHMENT ZERO_CVG_TARGETS_PCT FOLD_80_BASE_PENALTY PCT_TARGET_BASES_2X PCT_TARGET_BASES_10X PCT_TARGET_BASES_20X PCT_TARGET_BASES_30X HS_LIBRARY_SIZE HS_PENALTY_10X HS_PENALTY_20X HS_PENALTY_30X AT_DROPOUT GC_DROPOUT SAMPLE LIBRARY READ_GROUP" >> ${tmpCollectBamMetricsPrefix}.hs_metrics - echo "NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA" >> ${tmpCollectBamMetricsPrefix}.hs_metrics + echo "## net.sf.picard.metrics.StringHeader" > ${tmpHsMetrics} + echo "#" >> ${tmpHsMetrics} + echo "## net.sf.picard.metrics.StringHeader" >> ${tmpHsMetrics} + echo "#" >> ${tmpHsMetrics} + echo "" >> ${tmpHsMetrics} + echo "## METRICS CLASS net.sf.picard.analysis.directed.HsMetrics" >> ${tmpHsMetrics} + echo "BAIT_SET GENOME_SIZE BAIT_TERRITORY TARGET_TERRITORY BAIT_DESIGN_EFFICIENCY TOTAL_READS PF_READS PF_UNIQUE_READS PCT_PF_READS PCT_PF_UQ_READS PF_UQ_READS_ALIGNED PCT_PF_UQ_READS_ALIGNED PF_UQ_BASES_ALIGNED ON_BAIT_BASES NEAR_BAIT_BASES OFF_BAIT_BASES ON_TARGET_BASES PCT_SELECTED_BASES PCT_OFF_BAIT ON_BAIT_VS_SELECTED MEAN_BAIT_COVERAGE MEAN_TARGET_COVERAGE PCT_USABLE_BASES_ON_BAIT PCT_USABLE_BASES_ON_TARGET FOLD_ENRICHMENT 
ZERO_CVG_TARGETS_PCT FOLD_80_BASE_PENALTY PCT_TARGET_BASES_2X PCT_TARGET_BASES_10X PCT_TARGET_BASES_20X PCT_TARGET_BASES_30X HS_LIBRARY_SIZE HS_PENALTY_10X HS_PENALTY_20X HS_PENALTY_30X AT_DROPOUT GC_DROPOUT SAMPLE LIBRARY READ_GROUP" >> ${tmpHsMetrics} + echo "NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA" >> ${tmpHsMetrics} fi -echo -e "\nHsMetrics finished succesfull. Moving temp files to final.\n\n" -mv ${tmpCollectBamMetricsPrefix}.hs_metrics ${dedupBam}.hs_metrics + +mv ${tmpHsMetrics} ${hsMetrics} +echo "moved ${tmpHsMetrics} to ${hsMetrics}" diff --git a/compute5/NGS_DNA/protocols/CollectMultipleMetrics.sh b/compute5/NGS_DNA/protocols/CollectMultipleMetrics.sh index 739f2517..276b8b0a 100755 --- a/compute5/NGS_DNA/protocols/CollectMultipleMetrics.sh +++ b/compute5/NGS_DNA/protocols/CollectMultipleMetrics.sh @@ -13,6 +13,7 @@ #string tempDir #string seqType #string picardJar +#string project #Load Picard module ${stage} ${picardVersion} diff --git a/compute5/NGS_DNA/protocols/CopyToResultsDir.sh b/compute5/NGS_DNA/protocols/CopyToResultsDir.sh index 274ec4e3..9c12bbc2 100755 --- a/compute5/NGS_DNA/protocols/CopyToResultsDir.sh +++ b/compute5/NGS_DNA/protocols/CopyToResultsDir.sh @@ -95,6 +95,16 @@ do then cp ${intermediateDir}/${sample}.coveragePerBase.txt ${projectResultsDir}/coverage/ fi + if [ -f ${intermediateDir}/${sample}.coveragePerGene.txt ] + then + cp ${intermediateDir}/${sample}.coveragePerGene.txt ${projectResultsDir}/coverage/ + fi + if [ -f ${intermediateDir}/${sample}.coveragePerTarget.txt ] + then + cp ${intermediateDir}/${sample}.coveragePerTarget.txt ${projectResultsDir}/coverage/ + fi + + done echo "Copied vcf file + coveragePerBase.txt (8/11)" diff --git a/compute5/NGS_DNA/protocols/CountAllFinishedFiles.sh b/compute5/NGS_DNA/protocols/CountAllFinishedFiles.sh index 4193b9ba..f39bedea 100755 --- a/compute5/NGS_DNA/protocols/CountAllFinishedFiles.sh +++ 
b/compute5/NGS_DNA/protocols/CountAllFinishedFiles.sh @@ -1,7 +1,7 @@ #MOLGENIS walltime=00:01:00 mem=1gb #string projectJobsDir #string intermediateDir - +#string project cd $projectJobsDir countShScripts=`ls *.sh | wc -l` diff --git a/compute5/NGS_DNA/protocols/CoveragePerBase.sh b/compute5/NGS_DNA/protocols/CoveragePerBase.sh index 80392ff2..34da2af6 100755 --- a/compute5/NGS_DNA/protocols/CoveragePerBase.sh +++ b/compute5/NGS_DNA/protocols/CoveragePerBase.sh @@ -6,33 +6,52 @@ #string intermediateDir #string dedupBam #string project -#string sample +#string externalSampleID #string indexFile #string capturedIntervalsPerBase #string capturedBed #string GCC_Analysis +#string sampleNameID sleep 5 module load ${gatkVersion} +module load ngs-utils -if [ "${GCC_Analysis}" == "diagnostiek" ] || [ "${GCC_Analysis}" == "diagnostics" ] +if [ "${GCC_Analysis}" == "diagnostiek" ] || [ "${GCC_Analysis}" == "diagnostics" ] || [ "${GCC_Analysis}" == "Diagnostiek" ] || [ "${GCC_Analysis}" == "Diagnostics" ] then if [ -f ${capturedIntervalsPerBase} ] then java -Xmx10g -XX:ParallelGCThreads=4 -jar ${EBROOTGATK}/${gatkJar} \ -R ${indexFile} \ -T DepthOfCoverage \ - -o ${sample}.samtools.coveragePerBase \ + -o ${sampleNameID}.coveragePerBase \ + --omitLocusTable \ -I ${dedupBam} \ -L ${capturedIntervalsPerBase} - sed '1d' ${sample}.samtools.coveragePerBase > ${sample}.samtools.coveragePerBase_withoutHeader + sed '1d' ${sampleNameID}.coveragePerBase > ${sampleNameID}.coveragePerBase_withoutHeader - paste ${capturedIntervalsPerBase} ${sample}.samtools.coveragePerBase_withoutHeader > ${sample}.combined_bedfile_and_samtoolsoutput.txt + paste ${capturedIntervalsPerBase} ${sampleNameID}.coveragePerBase_withoutHeader > ${sampleNameID}.combined_bedfile_and_samtoolsoutput.txt - echo -e "chr\tstart\tstop\tgene\tcoverage" > ${sample}.coveragePerBase.txt + echo -e "chr\tstart\tstop\tgene\tcoverage" > ${sampleNameID}.coveragePerBase.txt + + awk -v OFS='\t' '{print $1,$2,$3,$5,$7}' 
${sampleNameID}.combined_bedfile_and_samtoolsoutput.txt > ${sampleNameID}.coveragePerBase.txt + + java -Xmx10g -XX:ParallelGCThreads=4 -jar ${EBROOTGATK}/${gatkJar} \ + -R ${indexFile} \ + -T DepthOfCoverage \ + -o ${sampleNameID}.coveragePerTarget \ + -I ${dedupBam} \ + --omitDepthOutputAtEachBase \ + -L ${capturedBed} + + awk -v OFS='\t' '{print $1,$3}' ${sampleNameID}.coveragePerTarget.sample_interval_summary > ${sampleNameID}.coveragePerTarget.coveragePerTarget.txt.tmp + paste ${sampleNameID}.coveragePerTarget.coveragePerTarget.txt.tmp ${capturedBed}.genesOnly > ${sampleNameID}.coveragePerTarget.coveragePerTarget.txt + + python ${EBROOTNGSMINUTILS}/calculateCoveragePerGene.py --input ${sampleNameID}.coveragePerBase.txt --output ${sampleNameID}.coveragePerGene.txt.tmp + + sort ${sampleNameID}.coveragePerGene.txt.tmp > ${sampleNameID}.coveragePerGene.txt - awk -v OFS='\t' '{print $1,$2,$3,$5,$7}' ${sample}.combined_bedfile_and_samtoolsoutput.txt >> ${sample}.coveragePerBase.txt else echo "there is no capturedIntervalsPerBase: ${capturedIntervalsPerBase}, please run coverageperbase: (module load ngs-utils --> run coverage_per_base.sh)" fi @@ -40,3 +59,4 @@ else echo "CoveragePerBase skipped" fi + diff --git a/compute5/NGS_DNA/protocols/CramConversion.sh b/compute5/NGS_DNA/protocols/CramConversion.sh new file mode 100644 index 00000000..a8a0615f --- /dev/null +++ b/compute5/NGS_DNA/protocols/CramConversion.sh @@ -0,0 +1,38 @@ +#MOLGENIS walltime=23:59:00 mem=1gb ppn=9 +#string dedupBam +#string indexFile +#string dedupBamCram +#string dedupBamCramBam +#string indexFile +#string project + +module load io_lib +module list + +makeTmpDir ${dedupBamCram} +tmpDedupBamCram=${MC_tmpFile} + +makeTmpDir ${dedupBamCramBam} +tmpDedupBamCramBam=${MC_tmpFile} + +scramble \ +-I bam \ +-O cram \ +-r ${indexFile} \ +-m \ +-t 8 \ +${dedupBam} \ +${tmpDedupBamCram} + +scramble \ +-I cram \ +-O bam \ +-r ${indexFile} \ +-m \ +-t 8 \ +${tmpDedupBamCram} \ +${tmpDedupBamCramBam} + +echo 
"dirname" +mv ${tmpDedupBamCram} ${dedupBamCram} +mv ${tmpDedupBamCramBam} ${dedupBamCramBam} diff --git a/compute5/NGS_DNA/protocols/CreateInhouseProjects.sh b/compute5/NGS_DNA/protocols/CreateInhouseProjects.sh index 8d0edd39..477d1546 100755 --- a/compute5/NGS_DNA/protocols/CreateInhouseProjects.sh +++ b/compute5/NGS_DNA/protocols/CreateInhouseProjects.sh @@ -32,7 +32,7 @@ umask 0007 module list - +module load Molgenis-Compute/${computeVersion} # # Create project dirs. # @@ -123,10 +123,7 @@ fi echo "before run second rocket" echo pwd -NGS_DNA_HOME="/groups/umcg-gaf/tmp04/software/NGS_DNA-3.0.1/" -EBROOTMOLGENISMINCOMPUTE=/apps/software/Molgenis-Compute/v15.04.1-Java-1.7.0_80 - sh ${EBROOTMOLGENISMINCOMPUTE}/molgenis_compute.sh -p ${mainParameters} \ -p ${batchIDList} -p ${projectJobsDir}/${project}.csv -p ${environment_parameters} -rundir ${projectJobsDir} \ --w ${workflowpath} -b slurm -g -weave -runid ${runid} +--header ${EBROOTMOLGENISMINCOMPUTE}/templates/slurm/header.ftl -w ${workflowpath} -b slurm -g -weave -runid ${runid} diff --git a/compute5/NGS_DNA/protocols/Delly.sh b/compute5/NGS_DNA/protocols/Delly.sh index 4fc99666..f938c910 100755 --- a/compute5/NGS_DNA/protocols/Delly.sh +++ b/compute5/NGS_DNA/protocols/Delly.sh @@ -1,49 +1,25 @@ -#MOLGENIS walltime=23:59:00 mem=4gb +#MOLGENIS walltime=63:59:00 mem=10gb #string project #string indexFile #string intermediateDir #string dellyVersion #string dellyType -#list dedupBam +#string dellyInput #string dellyVcf module load delly/${dellyVersion} module list -#Function to check if array contains value -array_contains () { - local array="$1[@]" - local seeking=$2 - local in=1 - for element in "${!array-}"; do - if [[ $element == $seeking ]]; then - in=0 - break - fi - done - return $in -} - -UNIQUEBAMS=() - makeTmpDir ${dellyVcf} tmpDellyVcf=${MC_tmpFile} - -for bamFile in "${dedupBam[@]}" -do - array_contains UNIQUEBAMS "$bamFile" || UNIQUEBAMS+=("$bamFile") # If bamFile does not exist in array add it -done 
- -echo "Size of the UNIQUEBAMS: ${#UNIQUEBAMS[@]}" - ${EBROOTDELLY}/delly \ -n \ -t ${dellyType} \ -x human.hg19.excl.tsv \ -o ${tmpDellyVcf} \ -g ${indexFile} \ -${UNIQUEBAMS[@]} +${dellyInput} mv ${tmpDellyVcf} ${dellyVcf} echo "moved ${tmpDellyVcf} to ${dellyVcf}" diff --git a/compute5/NGS_DNA/protocols/DellyAnnotator.sh b/compute5/NGS_DNA/protocols/DellyAnnotator.sh index dc6587a7..0e16d59a 100755 --- a/compute5/NGS_DNA/protocols/DellyAnnotator.sh +++ b/compute5/NGS_DNA/protocols/DellyAnnotator.sh @@ -11,9 +11,10 @@ #string javaVersion #string molgenisAnnotatorVersion #string projectDellyAnnotatorOutputVcf +#string dellyVcf +#string dellySnpEffVcf -sleep 5 - +sleep 3 makeTmpDir ${projectDellyAnnotatorOutputVcf} tmpProjectDellyAnnotatorOutputVcf=${MC_tmpFile} @@ -23,14 +24,12 @@ ${stage} ${snpEffVersion} ${stage} ${molgenisAnnotatorVersion} ${checkStage} - - #Run Molgenis CmdLineAnnotator with snpEff to add "Gene_Name" to be used for HPO annotation java -Xmx10g -jar -Djava.io.tmpdir=${tempDir} ${EBROOTCMDLINEANNOTATOR}/CmdLineAnnotator-1.9.0.jar \ snpEff \ ${EBROOTSNPEFF}/snpEff.jar \ -${intermediateDir}/${project}.delly.vcf \ -${intermediateDir}/${project}.delly.snpeff.vcf +${dellyVcf} \ +${dellySnpEffVcf} echo "Finished SnpEff annotation" @@ -38,10 +37,10 @@ echo "Finished SnpEff annotation" java -Xmx10g -jar -Djava.io.tmpdir=${tempDir} ${EBROOTCMDLINEANNOTATOR}/CmdLineAnnotator-1.9.0.jar \ hpo \ ${hpoTerms} \ -${intermediateDir}/${project}.delly.snpeff.vcf \ +${dellySnpEffVcf} \ ${tmpProjectDellyAnnotatorOutputVcf} echo "Finished HPO annotation" - mv ${tmpProjectDellyAnnotatorOutputVcf} ${projectDellyAnnotatorOutputVcf} +mv ${tmpProjectDellyAnnotatorOutputVcf} ${projectDellyAnnotatorOutputVcf} diff --git a/compute5/NGS_DNA/protocols/FilterOnIntervalList.sh b/compute5/NGS_DNA/protocols/FilterOnIntervalList.sh index e8767b59..20d61acb 100755 --- a/compute5/NGS_DNA/protocols/FilterOnIntervalList.sh +++ b/compute5/NGS_DNA/protocols/FilterOnIntervalList.sh @@ 
-9,6 +9,7 @@ #string picardJar #string picardVersion #string samtoolsVersion +#string project module load ${samtoolsVersion} module load ${picardVersion} diff --git a/compute5/NGS_DNA/protocols/GenderCalculate.sh b/compute5/NGS_DNA/protocols/GenderCalculate.sh new file mode 100755 index 00000000..8079a0f5 --- /dev/null +++ b/compute5/NGS_DNA/protocols/GenderCalculate.sh @@ -0,0 +1,40 @@ +#MOLGENIS ppn=4 mem=6gb walltime=03:00:00 + +#string dedupBam +#string capturedIntervals +#string capturedIntervals_nonAutoChrX +#string indexFileDictionary +#string sampleNameID +#string intermediateDir +#string whichSex +#string tempDir +#string checkSexMeanCoverage +#string picardJar +#string hsMetricsNonAutosomalRegionChrX +#string project + +module load picard +sleep 5 + +makeTmpDir ${hsMetricsNonAutosomalRegionChrX} +tmpHsMetricsNonAutosomalRegionChrX=${MC_tmpFile} + +#make intervallist +if [ -f ${capturedIntervals_nonAutoChrX} ] +then + rm ${capturedIntervals_nonAutoChrX} +fi + +cp ${indexFileDictionary} ${capturedIntervals_nonAutoChrX} +awk '{if ($0 ~ /^X/){print $0}}' ${capturedIntervals} >> ${capturedIntervals_nonAutoChrX} + +#Calculate coverage chromosome X +java -jar -XX:ParallelGCThreads=2 -Xmx4g ${EBROOTPICARD}/${picardJar} CalculateHsMetrics \ +INPUT=${dedupBam} \ +TARGET_INTERVALS=${capturedIntervals_nonAutoChrX} \ +BAIT_INTERVALS=${capturedIntervals_nonAutoChrX} \ +TMP_DIR=${tempDir} \ +OUTPUT=${tmpHsMetricsNonAutosomalRegionChrX} + +mv ${tmpHsMetricsNonAutosomalRegionChrX} ${hsMetricsNonAutosomalRegionChrX} +echo "mv ${tmpHsMetricsNonAutosomalRegionChrX} ${hsMetricsNonAutosomalRegionChrX}" diff --git a/compute5/NGS_DNA/protocols/CheckSex.sh b/compute5/NGS_DNA/protocols/GenderCheck.sh similarity index 73% rename from compute5/NGS_DNA/protocols/CheckSex.sh rename to compute5/NGS_DNA/protocols/GenderCheck.sh index 872d0545..33dcaf2a 100755 --- a/compute5/NGS_DNA/protocols/CheckSex.sh +++ b/compute5/NGS_DNA/protocols/GenderCheck.sh @@ -4,34 +4,18 @@ #string 
capturedIntervals #string capturedIntervals_nonAutoChrX #string indexFileDictionary -#string sample #string intermediateDir #string whichSex #string tempDir #string checkSexMeanCoverage #string picardJar +#string hsMetricsNonAutosomalRegionChrX +#string project module load picard sleep 5 -#make intervallist -if [ -f ${capturedIntervals_nonAutoChrX} ] -then - rm ${capturedIntervals_nonAutoChrX} -fi - -cp ${indexFileDictionary} ${capturedIntervals_nonAutoChrX} -awk '{if ($0 ~ /^X/){print $0}}' ${capturedIntervals} >> ${capturedIntervals_nonAutoChrX} - -#Calculate coverage chromosome X -java -jar -XX:ParallelGCThreads=2 -Xmx4g ${EBROOTPICARD}/${picardJar} CalculateHsMetrics \ -INPUT=${dedupBam} \ -TARGET_INTERVALS=${capturedIntervals_nonAutoChrX} \ -BAIT_INTERVALS=${capturedIntervals_nonAutoChrX} \ -TMP_DIR=${tempDir} \ -OUTPUT=${dedupBam}.nonAutosomalRegionChrX_hs_metrics - -rm -rf ${sample}.checkSex.filter.meancoverage.txt +rm $checkSexMeanCoverage #select only the mean target coverage of the whole genome file awk '{ @@ -59,19 +43,20 @@ awk '{ }else{ print $22 } -}' ${dedupBam}.nonAutosomalRegionChrX_hs_metrics >> ${checkSexMeanCoverage} +}' ${hsMetricsNonAutosomalRegionChrX} >> ${checkSexMeanCoverage} perl -pi -e 's/\n/\t/' ${checkSexMeanCoverage} -RESULT=`awk '{ +RESULT=$(awk '{ if ( "NA" == $1 || "?" == $2 ){ + print "1) This is probably a whole genome sample, due to time saving there is no coverage calculated" print "Unknown" } else { printf "%.2f \n", $2/$1 } -}' ${checkSexMeanCoverage}` +}' ${checkSexMeanCoverage}) echo "RESULT: $RESULT" awk '{ @@ -85,6 +70,7 @@ awk '{ exit 0 } else if ( "NA" == $1 || "?" 
== $2 ) { + print "2) This is probably a whole genome sample, due to time saving there is no coverage calculated" print "Unknown" } else if ($2/$1 < 0.65 ){ diff --git a/compute5/NGS_DNA/protocols/MakeDedupBamMd5.sh b/compute5/NGS_DNA/protocols/MakeDedupBamMd5.sh index 5ee325e8..14ce3d4a 100755 --- a/compute5/NGS_DNA/protocols/MakeDedupBamMd5.sh +++ b/compute5/NGS_DNA/protocols/MakeDedupBamMd5.sh @@ -1,6 +1,7 @@ #MOLGENIS walltime=23:59:00 mem=4gb #string dedupBam +#string project md5sum ${dedupBam} > ${dedupBam}.md5 diff --git a/compute5/NGS_DNA/protocols/MarkDuplicates.sh b/compute5/NGS_DNA/protocols/MarkDuplicates.sh deleted file mode 100755 index 78f4c315..00000000 --- a/compute5/NGS_DNA/protocols/MarkDuplicates.sh +++ /dev/null @@ -1,46 +0,0 @@ -#MOLGENIS walltime=23:59:00 mem=6gb ppn=6 - -#Parameter mapping -#string stage -#string checkStage -#string picardVersion -#string markDuplicatesJar -#string sampleMergedBam -#string sampleMergedBamIdx -#string tempDir -#string dedupBam -#string dedupBamIdx -#string dedupMetrics -#string tmpDataDir -#string picardJar - -#Load Picard module -${stage} ${picardVersion} -${checkStage} - -makeTmpDir ${dedupBam} -tmpDedupBam=${MC_tmpFile} - -makeTmpDir ${dedupBamIdx} -tmpDedupBamIdx=${MC_tmpFile} - -makeTmpDir ${dedupMetrics} -tmpDedupMetrics=${MC_tmpFile} - -#Run picard, sort BAM file and create index on the fly -java -XX:ParallelGCThreads=4 -jar -Xmx4g ${EBROOTPICARD}/${picardJar} ${markDuplicatesJar} \ -INPUT=${sampleMergedBam} \ -METRICS_FILE=${tmpDedupMetrics} \ -OUTPUT=${tmpDedupBam} \ -REMOVE_DUPLICATES=false \ -CREATE_INDEX=true \ -CREATE_MD5_FILE=true \ -VALIDATION_STRINGENCY=LENIENT \ -MAX_RECORDS_IN_RAM=4000000 \ -TMP_DIR=${tempDir} - -echo -e "\nMarkDuplicates finished succesfull. 
Moving temp files to final.\n\n" -mv ${tmpDedupBam} ${dedupBam} -mv ${tmpDedupBamIdx} ${dedupBamIdx} -mv ${tmpDedupMetrics} ${dedupMetrics} - diff --git a/compute5/NGS_DNA/protocols/MergeIndelsAndSnps.sh b/compute5/NGS_DNA/protocols/MergeIndelsAndSnps.sh index a4a44b24..d7bd918c 100755 --- a/compute5/NGS_DNA/protocols/MergeIndelsAndSnps.sh +++ b/compute5/NGS_DNA/protocols/MergeIndelsAndSnps.sh @@ -8,6 +8,7 @@ #string checkStage #string projectPrefix #string intermediateDir +#string project #Function to check if array contains value array_contains () { diff --git a/compute5/NGS_DNA/protocols/SamToBam.sh b/compute5/NGS_DNA/protocols/SamToBam.sh index 9f0264bf..bcc75a3b 100755 --- a/compute5/NGS_DNA/protocols/SamToBam.sh +++ b/compute5/NGS_DNA/protocols/SamToBam.sh @@ -9,6 +9,7 @@ #string alignedSam #string tempDir #string intermediateDir +#string alignedSam #string alignedBam #string tmpDataDir #string project @@ -17,7 +18,7 @@ ${stage} ${picardVersion} ${checkStage} -makeTmpDir ${alignedSam} +makeTmpDir ${alignedBam} tmpAlignedBam=${MC_tmpFile} #Run picard, convert SAM to BAM diff --git a/compute5/NGS_DNA/protocols/Sambamba.sh b/compute5/NGS_DNA/protocols/Sambamba.sh index ac97bdcb..b634f202 100755 --- a/compute5/NGS_DNA/protocols/Sambamba.sh +++ b/compute5/NGS_DNA/protocols/Sambamba.sh @@ -1,10 +1,8 @@ -#MOLGENIS walltime=23:59:00 mem=6gb ppn=6 +#MOLGENIS walltime=23:59:00 mem=20gb ppn=5 #Parameter mapping #string stage #string checkStage -#string picardVersion -#string markDuplicatesJar #string sampleMergedBam #string sampleMergedBamIdx #string tempDir @@ -14,10 +12,14 @@ #string tmpDataDir #string picardJar #string sambambaVersion +#string sambambaTool +#string dedupMetrics +#string project #Load Picard module -${stage} sambamba +${stage} ${sambambaVersion} ${checkStage} +sleep 5 makeTmpDir ${dedupBam} tmpDedupBam=${MC_tmpFile} @@ -25,15 +27,34 @@ tmpDedupBam=${MC_tmpFile} makeTmpDir ${dedupBamIdx} tmpDedupBamIdx=${MC_tmpFile} +makeTmpDir ${dedupMetrics} 
+tmpDedupMetrics=${MC_tmpFile} + #Run picard, sort BAM file and create index on the fly -${EBROOTSAMBAMBA}/${sambambaVersion} markdup \ +${EBROOTSAMBAMBA}/${sambambaTool} markdup \ --nthreads=4 \ --overflow-list-size 1000000 \ --hash-table-size 1000000 \ --p --tmpdir=${tempDir} \ +-p \ +--tmpdir=${tempDir} \ ${sampleMergedBam} ${tmpDedupBam} + +#make metrics file +${EBROOTSAMBAMBA}/${sambambaTool} \ +flagstat \ +--nthreads=4 \ +${tmpDedupBam} > ${tmpDedupBam}.flagstat + +echo -e "READ_PAIR_DUPLICATES\tPERCENT_DUPLICATION" > ${tmpDedupMetrics} +sed -n '1p;4p' ${tmpDedupBam}.flagstat | awk '{print $1}' | perl -wpe 's|\n|\t|' | awk '{print $2"\t"($2/$1)*100}' >> ${tmpDedupMetrics} + echo -e "\nMarkDuplicates finished succesfull. Moving temp files to final.\n\n" mv ${tmpDedupBam} ${dedupBam} mv ${tmpDedupBamIdx} ${dedupBamIdx} +echo "moved ${tmpDedupBam} ${dedupBam}" +echo "mv ${tmpDedupBamIdx} ${dedupBamIdx}" + +mv ${tmpDedupMetrics} ${dedupMetrics} +echo "mv ${tmpDedupMetrics} ${dedupMetrics}" diff --git a/compute5/NGS_DNA/protocols/SambambaMerge.sh b/compute5/NGS_DNA/protocols/SambambaMerge.sh new file mode 100755 index 00000000..4d2b0ffb --- /dev/null +++ b/compute5/NGS_DNA/protocols/SambambaMerge.sh @@ -0,0 +1,72 @@ +#MOLGENIS walltime=23:59:00 mem=4gb ppn=10 + +#Parameter mapping +#string stage +#string checkStage +#string samtoolsVersion +#string sampleMergedBam +#string sampleMergedBai +#string sampleMergedBamIdx +#string tempDir +#list inputMergeBam +#string tmpDataDir +#string project +#string intermediateDir +#string sambambaVersion +#string sambambaTool + +sleep 5 + +#Function to check if array contains value +array_contains () { + local array="$1[@]" + local seeking=$2 + local in=1 + for element in "${!array-}"; do + if [[ "$element" == "$seeking" ]]; then + in=0 + break + fi + done + return $in +} + +makeTmpDir ${sampleMergedBam} +tmpSampleMergedBam=${MC_tmpFile} + +makeTmpDir ${sampleMergedBamIdx} +tmpSampleMergedBamIdx=${MC_tmpFile} + +${stage} 
${sambambaVersion} +${checkStage} + +#Create string with input BAM files for Picard +#This check needs to be performed because Compute generates duplicate values in array +INPUTS=() +INPUTBAMS=() + +for bamFile in "${inputMergeBam[@]}" +do + array_contains INPUTS "$bamFile" || INPUTS+=("$bamFile") # If bamFile does not exist in array add it + array_contains INPUTBAMS "$bamFile" || INPUTBAMS+=("$bamFile") # If bamFile does not exist in array add it +done + +if [ ${#INPUTS[@]} == 1 ] +then + ln -sf ${INPUTBAMS[0]} ${sampleMergedBam} + echo "nothing to merge because there is only one sample" + +else + ${EBROOTSAMBAMBA}/${sambambaTool} merge \ + ${tmpSampleMergedBam} \ + ${INPUTS[@]} + + mv ${tmpSampleMergedBam} ${sampleMergedBam} + mv ${tmpSampleMergedBamIdx} ${sampleMergedBamIdx} + echo "mv ${tmpSampleMergedBam} ${sampleMergedBam}" + echo "mv ${tmpSampleMergedBamIdx} ${sampleMergedBamIdx}" + +fi + +ln -sf ${sampleMergedBamIdx} ${sampleMergedBai} +echo "ln -sf ${sampleMergedBamIdx} ${sampleMergedBai}" diff --git a/compute5/NGS_DNA/protocols/SambambaSort.sh b/compute5/NGS_DNA/protocols/SambambaSort.sh new file mode 100755 index 00000000..e7cddc68 --- /dev/null +++ b/compute5/NGS_DNA/protocols/SambambaSort.sh @@ -0,0 +1,34 @@ +#MOLGENIS walltime=23:59:00 mem=5gb ppn=10 + +#Parameter mapping +#string stage +#string checkStage +#string sambambaVersion +#string sambambaTool +#string alignedBam +#string alignedSortedBam +#string tmpDataDir +#string project +#string tempDir + +#Load Picard module +${stage} ${sambambaVersion} +${checkStage} + +makeTmpDir ${alignedSortedBam} +tmpAlignedSortedBam=${MC_tmpFile} + +${EBROOTSAMBAMBA}/${sambambaTool} sort \ +--tmpdir=${tempDir} \ +-t 10 \ +-m 4GB \ +-o ${tmpAlignedSortedBam} \ +${alignedBam} + +echo -e "\nSambambaSort finished succesfull. 
Moving temp files to final.\n\n" +mv ${tmpAlignedSortedBam} ${alignedSortedBam} +echo "mv ${tmpAlignedSortedBam} ${alignedSortedBam}" + + + + diff --git a/compute5/NGS_DNA/protocols/SequenomConcordanceCheck.sh b/compute5/NGS_DNA/protocols/SequenomConcordanceCheck.sh index d2b369b2..f62004c5 100755 --- a/compute5/NGS_DNA/protocols/SequenomConcordanceCheck.sh +++ b/compute5/NGS_DNA/protocols/SequenomConcordanceCheck.sh @@ -1,7 +1,7 @@ #MOLGENIS walltime=01:00:00 nodes=1 ppn=4 mem=6gb #string sampleConcordanceFile -#string sample +#string sampleNameID #string externalSampleID #string familyList #string arrayMapFile @@ -14,6 +14,7 @@ #string sequenomInfo #string gatkJar #string gatkVersion +#string project sleep 5 # @@ -75,10 +76,10 @@ else module list plink \ - --lfile ${sample}.concordance \ + --lfile ${sampleNameID}.concordance \ --recode \ --noweb \ - --out ${sample}.concordance \ + --out ${sampleNameID}.concordance \ --keep ${familyList} module unload plink @@ -88,23 +89,23 @@ else ##Create genotype VCF for sample plink108 \ --recode-vcf \ - --ped ${sample}.concordance.ped \ + --ped ${sampleNameID}.concordance.ped \ --map ${arrayMapFile} \ - --out ${sample}.concordance + --out ${sampleNameID}.concordance ##Rename plink.vcf to sample.vcf - mv ${sample}.concordance.vcf ${sample}.genotypeArray.vcf + mv ${sampleNameID}.concordance.vcf ${sampleNameID}.genotypeArray.vcf ##Replace chr23 and 24 with X and Y - perl -pi -e 's/^23/X/' ${sample}.genotypeArray.vcf - perl -pi -e 's/^24/Y/' ${sample}.genotypeArray.vcf + perl -pi -e 's/^23/X/' ${sampleNameID}.genotypeArray.vcf + perl -pi -e 's/^24/Y/' ${sampleNameID}.genotypeArray.vcf ##Remove family ID from sample in header genotype VCF - perl -pi -e 's/1_${externalSampleID}/${externalSampleID}/' ${sample}.genotypeArray.vcf + perl -pi -e 's/1_${externalSampleID}/${externalSampleID}/' ${sampleNameID}.genotypeArray.vcf ##Create binary ped (.bed) and make tab-delimited .fasta file for all genotypes - sed -e 's/chr//' 
${sample}.genotypeArray.vcf | awk '{OFS="\t"; if (!/^#/){print $1,$2-1,$2}}' \ - > ${sample}.genotypeArray.bed + sed -e 's/chr//' ${sampleNameID}.genotypeArray.vcf | awk '{OFS="\t"; if (!/^#/){print $1,$2-1,$2}}' \ + > ${sampleNameID}.genotypeArray.bed if [ "${build}" == "build37" ] then @@ -119,30 +120,30 @@ then ##Create tabular fasta from bed fastaFromBed \ -fi ${indexFile} \ - -bed ${sample}.genotypeArray.bed \ - -fo ${sample}.genotypeArray.fasta -tab + -bed ${sampleNameID}.genotypeArray.bed \ + -fo ${sampleNameID}.genotypeArray.fasta -tab ##Align vcf to reference AND DO NOT FLIP STRANDS!!! (genotype data is already in forward-forward format) If flipping is needed use "-f" command before sample.genotype_array.vcf align-vcf-to-ref.pl -f \ - ${sample}.genotypeArray.vcf \ - ${sample}.genotypeArray.fasta \ - ${sample}.genotypeArray.aligned_to_ref.vcf \ - > ${sample}.genotypeArray.aligned_to_ref.vcf.out + ${sampleNameID}.genotypeArray.vcf \ + ${sampleNameID}.genotypeArray.fasta \ + ${sampleNameID}.genotypeArray.aligned_to_ref.vcf \ + > ${sampleNameID}.genotypeArray.aligned_to_ref.vcf.out ##Some GATK versions sort header alphabetically, which results in wrong individual genotypes. 
So cut header from "original" sample.genotype_array.vcf and replace in sample.genotype_array.aligned_to_ref.lifted_over.out - head -3 ${sample}.genotypeArray.vcf > ${sample}.genotypeArray.header.txt + head -3 ${sampleNameID}.genotypeArray.vcf > ${sampleNameID}.genotypeArray.header.txt - sed '1,3d' ${sample}.genotypeArray.aligned_to_ref.vcf \ - > ${sample}.genotypeArray.headerless.vcf + sed '1,3d' ${sampleNameID}.genotypeArray.aligned_to_ref.vcf \ + > ${sampleNameID}.genotypeArray.headerless.vcf - cat ${sample}.genotypeArray.header.txt \ - ${sample}.genotypeArray.headerless.vcf \ - > ${sample}.genotypeArray.updated.header.vcf + cat ${sampleNameID}.genotypeArray.header.txt \ + ${sampleNameID}.genotypeArray.headerless.vcf \ + > ${sampleNameID}.genotypeArray.updated.header.vcf ##Create interval_list of CHIP SNPs to call SNPs in sequence data on iChip_pos_to_interval_list.pl \ - ${sample}.genotypeArray.updated.header.vcf \ - ${sample}.genotypeArray.updated.header.interval_list + ${sampleNameID}.genotypeArray.updated.header.vcf \ + ${sampleNameID}.genotypeArray.updated.header.interval_list module unload GATK module load 1.2-1-g33967a4 @@ -155,23 +156,23 @@ then -T UnifiedGenotyper \ -R ${indexFile} \ -I ${dedupBam} \ - -o ${sample}.concordance.allSites.vcf \ + -o ${sampleNameID}.concordance.allSites.vcf \ -stand_call_conf 30.0 \ -stand_emit_conf 10.0 \ -out_mode EMIT_ALL_SITES \ - -L ${sample}.genotypeArray.updated.header.interval_list + -L ${sampleNameID}.genotypeArray.updated.header.interval_list ##Change FILTER column from GATK "called SNPs". 
All SNPs having Q20 & DP10 change to "PASS", all other SNPs are "filtered" (not used in concordance check) change_vcf_filter.pl \ - ${sample}.concordance.allSites.vcf \ - ${sample}.concordance.q20.dp10.vcf 10 20 + ${sampleNameID}.concordance.allSites.vcf \ + ${sampleNameID}.concordance.q20.dp10.vcf 10 20 ##Calculate condordance between genotype SNPs and GATK "called SNPs" java -Xmx2g -Djava.io.tmpdir=${tempDir} -jar ${EBROOTGATK}/${gatkJar} \ -T VariantEval \ - -eval:eval,VCF ${sample}.concordance.q20.dp10.vcf \ - -comp:comp_immuno,VCF ${sample}.genotypeArray.updated.header.vcf \ - -o ${sample}.concordance.q20.dp10.eval \ + -eval:eval,VCF ${sampleNameID}.concordance.q20.dp10.vcf \ + -comp:comp_immuno,VCF ${sampleNameID}.genotypeArray.updated.header.vcf \ + -o ${sampleNameID}.concordance.q20.dp10.eval \ -R ${indexFile} \ -D:dbSNP,VCF ${dbSNPExSiteAfter129Vcf} \ -EV GenotypeConcordance @@ -187,7 +188,7 @@ then ##Retrieve name,step,#SNPs,%dbSNP,Ti/Tv known,Ti/Tv Novel,Non-Ref Sensitivity,Non-Ref discrepancy,Overall concordance from sample.q20_dp10_concordance.eval ##Don't forget to add .libPaths("/target/gpfs2/gcc/tools/GATK-1.3-24-gc8b1c92/public/R") to your ~/.Rprofile extract_info_GATK_variantEval_V3.R \ - --in ${sample}.concordance.q20.dp10.eval \ + --in ${sampleNameID}.concordance.q20.dp10.eval \ --step q20_dp10_concordance \ --name ${externalSampleID} \ --comp comp_immuno \ diff --git a/compute5/NGS_DNA/protocols/TrimReads.sh b/compute5/NGS_DNA/protocols/TrimReads.sh index 9912f309..589d3ada 100644 --- a/compute5/NGS_DNA/protocols/TrimReads.sh +++ b/compute5/NGS_DNA/protocols/TrimReads.sh @@ -1,4 +1,4 @@ -#MOLGENIS nodes=1 ppn=1 mem=10gb walltime=05:00:00 +#MOLGENIS nodes=1 ppn=2 mem=20gb walltime=25:00:00 #Parameter mapping #string seqType @@ -10,6 +10,7 @@ #string peEnd2BarcodeTrimmedFqGz #string srBarcodeTrimmedFqGz #string cutadaptVersion +#string project #Load module module load cutadapt/${cutadaptVersion} diff --git 
a/compute5/NGS_DNA/protocols/VariantGVCFCalling.sh b/compute5/NGS_DNA/protocols/VariantGVCFCalling.sh index 1f9f3d8e..bb0650eb 100644 --- a/compute5/NGS_DNA/protocols/VariantGVCFCalling.sh +++ b/compute5/NGS_DNA/protocols/VariantGVCFCalling.sh @@ -20,6 +20,7 @@ #string sampleBatchVariantCallsFemaleIdx #string tmpDataDir #string externalSampleID +#string project #string dedupBam @@ -99,7 +100,7 @@ else -L ${capturedBatchBed} \ --emitRefConfidence GVCF \ -ploidy 2 - elif [ "${sex}" == "Female" ] + elif [[ "${sex}" == "Female" || "${sex}" == "Unknown" ]] then echo "X (female)" #Run GATK HaplotypeCaller in DISCOVERY mode to call SNPs and indels @@ -116,6 +117,9 @@ -L ${capturedBatchBed} \ --emitRefConfidence GVCF \ -ploidy 2 + else + echo "The sex is not a known option (Male, Female, Unknown)" + exit 1 fi elif [[ "${capturedBatchBed}" == *batch-[0-9]*Y.bed ]] then diff --git a/compute5/NGS_DNA/protocols/VcfToTable.sh b/compute5/NGS_DNA/protocols/VcfToTable.sh index ee8ddf17..1ce7c850 100755 --- a/compute5/NGS_DNA/protocols/VcfToTable.sh +++ b/compute5/NGS_DNA/protocols/VcfToTable.sh @@ -7,6 +7,7 @@ #string projectPrefix #string intermediateDir #list externalSampleID +#string project makeTmpDir ${variantsFinalProjectVcfTable} tmpVariantsFinalProjectVcfTable=${MC_tmpFile} diff --git a/compute5/NGS_DNA/workflow.csv b/compute5/NGS_DNA/workflow.csv index d76d26a6..1a1baf31 100755 --- a/compute5/NGS_DNA/workflow.csv +++ b/compute5/NGS_DNA/workflow.csv @@ -4,11 +4,13 @@ s02_CheckIlluminaEncoding,protocols/CheckIlluminaEncoding.sh,s01_SpikePhiX s03_FastQC,protocols/FastQC.sh,s02_CheckIlluminaEncoding s04_BwaAlign,protocols/BwaAlign.sh,s02_CheckIlluminaEncoding s05_SamToBam,protocols/SamToBam.sh,s04_BwaAlign -s06_SortBam,protocols/SortBam.sh,s05_SamToBam -s07_MergeBam,protocols/MergeBam.sh,s06_SortBam;inputMergeBam=alignedSortedBam;inputMergeBamIdx=alignedSortedBamIdx -s08_MarkDuplicates,protocols/Sambamba.sh,s07_MergeBam 
-s10a_Delly_DEL,protocols/Delly.sh,s08_MarkDuplicates;dellyType=dellyTypeDEL -s10b_DellyAnnotator,protocols/DellyAnnotator.sh,s10a_Delly_DEL +s06_SambambaSort,protocols/SambambaSort.sh,s05_SamToBam +s07_SambambaMerge,protocols/SambambaMerge.sh,s06_SambambaSort;inputMergeBam=alignedSortedBam +s08_MarkDuplicates,protocols/Sambamba.sh,s07_SambambaMerge +s09a_Delly_DEL,protocols/Delly.sh,s07_SambambaMerge;dellyType=dellyTypeDEL;dellyInput=sampleMergedBam +s09b_DellyAnnotator,protocols/DellyAnnotator.sh,s09a_Delly_DEL +s10a_GenderCalculate,protocols/GenderCalculate.sh,s08_MarkDuplicates +#s10b_CramConversion,protocols/CramConversion.sh,s08_MarkDuplicates s11_MakeDedupBamMd5,protocols/MakeDedupBamMd5.sh,s08_MarkDuplicates s12_SequenomConcordanceCheck,protocols/SequenomConcordanceCheck.sh,s08_MarkDuplicates s13_CoveragePerBase,protocols/CoveragePerBase.sh,s08_MarkDuplicates @@ -16,8 +18,8 @@ s14a_CollectMultipleMetrics,protocols/CollectMultipleMetrics.sh,s08_MarkDuplicat s14b_CollectHSMetrics,protocols/CollectHSMetrics.sh,s08_MarkDuplicates s14c_CollectGCBiasMetrics,protocols/CollectGCBiasMetrics.sh,s08_MarkDuplicates s14d_CollectBamIndexMetrics,protocols/CollectBamIndexMetrics.sh,s08_MarkDuplicates -s15_CheckSex,protocols/CheckSex.sh,s14a_CollectMultipleMetrics -s16a_VariantCalling,protocols/VariantGVCFCalling.sh,s15_CheckSex +s15_GenderCheck,protocols/GenderCheck.sh,s10a_GenderCalculate;s14b_CollectHSMetrics +s16a_VariantCalling,protocols/VariantGVCFCalling.sh,s15_GenderCheck s16c_GenotypeVariants,protocols/VariantGVCFGenotyping.sh,s16a_VariantCalling s17_MergeBatches,protocols/MergeBatches.sh,s16c_GenotypeVariants s18_SnpEff,protocols/SnpEff.sh,s17_MergeBatches @@ -30,5 +32,5 @@ s23_MergeIndelsAndSnps,protocols/MergeIndelsAndSnps.sh,s21b_IndelFiltration;s22_ s24_VcfToTable,protocols/VcfToTable.sh,s23_MergeIndelsAndSnps s25_InSilicoConcordance,protocols/InSilicoConcordance.sh,s24_VcfToTable 
s26_QCReport,protocols/QCReport.sh,s12_SequenomConcordanceCheck;s25_InSilicoConcordance -s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport +s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport;s09b_DellyAnnotator s28_CopyToResultsDir,protocols/CopyToResultsDir.sh,s27_CountAllFinishedFiles diff --git a/compute5/NGS_DNA/workflow_samplesize_bigger_than_200.csv b/compute5/NGS_DNA/workflow_samplesize_bigger_than_200.csv index 59e32e73..f7aa8f08 100755 --- a/compute5/NGS_DNA/workflow_samplesize_bigger_than_200.csv +++ b/compute5/NGS_DNA/workflow_samplesize_bigger_than_200.csv @@ -6,7 +6,7 @@ s04_BwaAlign,protocols/BwaAlign.sh,s02_CheckIlluminaEncoding s05_SamToBam,protocols/SamToBam.sh,s04_BwaAlign s06_SortBam,protocols/SortBam.sh,s05_SamToBam s07_MergeBam,protocols/MergeBam.sh,s06_SortBam;inputMergeBam=alignedSortedBam;inputMergeBamIdx=alignedSortedBamIdx -s08_MarkDuplicates,protocols/MarkDuplicates.sh,s07_MergeBam +s08_MarkDuplicates,protocols/Sambamba.sh,s07_MergeBam s10a_Delly_DEL,protocols/Delly.sh,s08_MarkDuplicates;dellyType=dellyTypeDEL s10b_DellyAnnotator,protocols/DellyAnnotator.sh,s10a_Delly_DEL s11_MakeDedupBamMd5,protocols/MakeDedupBamMd5.sh,s08_MarkDuplicates @@ -31,5 +31,5 @@ s23_MergeIndelsAndSnps,protocols/MergeIndelsAndSnps.sh,s21b_IndelFiltration;s22_ s24_VcfToTable,protocols/VcfToTable.sh,s23_MergeIndelsAndSnps s25_InSilicoConcordance,protocols/InSilicoConcordance.sh,s24_VcfToTable s26_QCReport,protocols/QCReport.sh,s12_SequenomConcordanceCheck;s25_InSilicoConcordance -s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport +s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport;s10b_DellyAnnotator s28_CopyToResultsDir,protocols/CopyToResultsDir.sh,s27_CountAllFinishedFiles diff --git a/compute5/NGS_DNA/workflow_trimReads.csv b/compute5/NGS_DNA/workflow_trimReads.csv index a635787d..27dd4bae 100755 --- a/compute5/NGS_DNA/workflow_trimReads.csv +++ 
b/compute5/NGS_DNA/workflow_trimReads.csv @@ -7,7 +7,7 @@ s04_BwaAlign,protocols/BwaAlign.sh,s02_CheckIlluminaEncoding s05_SamToBam,protocols/SamToBam.sh,s04_BwaAlign s06_SortBam,protocols/SortBam.sh,s05_SamToBam s07_MergeBam,protocols/MergeBam.sh,s06_SortBam;inputMergeBam=alignedSortedBam;inputMergeBamIdx=alignedSortedBamIdx -s08_MarkDuplicates,protocols/MarkDuplicates.sh,s07_MergeBam +s08_MarkDuplicates,protocols/Sambamba.sh,s07_MergeBam s10a_Delly_DEL,protocols/Delly.sh,s08_MarkDuplicates;dellyType=dellyTypeDEL s10b_DellyAnnotator,protocols/DellyAnnotator.sh,s10a_Delly_DEL s11_MakeDedupBamMd5,protocols/MakeDedupBamMd5.sh,s08_MarkDuplicates @@ -31,5 +31,5 @@ s23_MergeIndelsAndSnps,protocols/MergeIndelsAndSnps.sh,s21b_IndelFiltration;s22_ s24_VcfToTable,protocols/VcfToTable.sh,s23_MergeIndelsAndSnps s25_InSilicoConcordance,protocols/InSilicoConcordance.sh,s24_VcfToTable s26_QCReport,protocols/QCReport.sh,s12_SequenomConcordanceCheck;s25_InSilicoConcordance -s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport +s27_CountAllFinishedFiles,protocols/CountAllFinishedFiles.sh,s26_QCReport;s10b_DellyAnnotator s28_CopyToResultsDir,protocols/CopyToResultsDir.sh,s27_CountAllFinishedFiles