Merge pull request #192 from RoanKanninga/master

fix coverageCalculations issue + check for bedfile
molgenis · Mar 26, 2018 · ea01f2a · ea01f2a
2 parents 8e88bbf + ad5d807
commit ea01f2a
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -16,6 +16,7 @@ The bwa-mem command from Burrows-Wheeler Aligner(BWA) [[2]](#r2) is used to alig
 The GATK [[4]](#r4) HaplotypeCaller estimates the most likely genotypes and allele frequencies in an alignment using a Bayesian likelihood model for every position of the genome regardless of whether a variant was detected at that site or not. This information can later be used in the project based genotyping step.
 A joint analysis of all the samples in the project has been performed. This leads to a posterior probability of a variant allele at a site. SNPs and small Indels are written to a VCF file, along with information such as genotype quality, allele frequency, strand bias and read depth for that SNP/Indel. Based on quality thresholds from the GATK "best practices" [[5]](#r5), the SNPs and indels are filtered and marked as Lowqual or Pass, resulting in a final VCF file.
 
+
 ### References
 <a name="r1"> 1. Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: http://www.bioinformatics.babraham.ac.uk/projects/fastqc </a>
 

diff --git a/protocols/CoverageCalculations.sh b/protocols/CoverageCalculations.sh
@@ -78,7 +78,7 @@ then
 
 		awk -v OFS='\t' '{print $1,$3}' "${sampleNameID}.${perTarget}.coveragePerTarget.sample_interval_summary" | sed '1d' > "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp.tmp"
 		sort -V "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp.tmp" > "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp"
-		perl -pi -e 's|-|\^|' "${perTargetDir}/${perTarget}.genesOnly" > "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp"
+		perl -p -e 's|-|\^|' "${perTargetDir}/${perTarget}.genesOnly" > "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp"
 		paste "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp" "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp" > "${sampleNameID}.${perTarget}.coveragePerTarget_inclGenes.txt"
 		##Paste command produces ^M character
 

diff --git a/protocols/CreateExternSamplesProjects.sh b/protocols/CreateExternSamplesProjects.sh
@@ -33,6 +33,10 @@
 #list lane
 #string ngsUtilsVersion
 
+#string dataDir
+#string coveragePerBaseDir
+#string coveragePerTargetDir
+
 set -e 
 set -u
 
@@ -113,10 +117,29 @@ extract_samples_from_GAF_list.pl --i "${worksheet}" --o "${projectJobsDir}/${pro
 
 batching="_small"
 
-capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv")
+capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv" | sed 's|\\||' ) # sed drops the first backslash from the script output
+captKit=$(echo "${capturingKitProject}" | awk 'BEGIN {FS="/"}{print $2}') # second "/"-separated field of the kit path; must expand the variable, not echo its name
+
+if [ ! -d "${dataDir}/${capturingKitProject}" ] # the kit directory must exist under dataDir
+then
+	echo "Bedfile does not exist! Exiting"
+        exit 1
+fi
+
 if [[ "${capturingKitProject}" == *"Exoom"* || "${capturingKitProject}" == *"All_Exon_v1"* || "${capturingKitProject}" == *"wgs"* || "${capturingKitProject}" == *"WGS"* ]]
 then
-        batching="_chr"
+	batching="_chr"
+        if [ ! -e "${coveragePerTargetDir}/${captKit}/${captKit}" ] # these kits are checked against coveragePerTargetDir
+        then
+		echo "Bedfile in ${coveragePerTargetDir} does not exist! Exiting"
+                exit 1
+        fi
+else
+	if [ ! -e "${coveragePerBaseDir}/${captKit}/${captKit}" ] # all other kits are checked against coveragePerBaseDir
+        then
+		echo "Bedfile in ${coveragePerBaseDir} does not exist! Exiting"
+                exit 1
+        fi
 fi
 
 if [ -f .compute.properties ];

diff --git a/protocols/CreateInhouseProjects.sh b/protocols/CreateInhouseProjects.sh
@@ -31,6 +31,11 @@
 #string ngsversion
 #string ngsUtilsVersion
 
+#string dataDir
+
+#string coveragePerBaseDir
+#string coveragePerTargetDir
+
 #string project
 #string logsDir 
 
@@ -118,12 +123,32 @@ fi
 
 batching="_small"
 
-capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv")
+capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv" | sed 's|\\||') # sed drops the first backslash from the script output
+captKit=$(echo "${capturingKitProject}" | awk 'BEGIN {FS="/"}{print $2}') # second "/"-separated field of the kit path; must expand the variable, not echo its name
+
+if [ ! -d "${dataDir}/${capturingKitProject}" ] # the kit directory must exist under dataDir
+then
+	echo "Bedfile does not exist! Exiting"
+	exit 1
+fi
+
 if [[ "${capturingKitProject}" == *"Exoom"* || "${capturingKitProject}" == *"All_Exon_v1"* || "${capturingKitProject}" == *"wgs"* || "${capturingKitProject}" == *"WGS"* ]]
 then
 	batching="_chr"
+	if [ ! -e "${coveragePerTargetDir}/${captKit}/${captKit}" ] # these kits are checked against coveragePerTargetDir
+	then
+		echo "Bedfile in ${coveragePerTargetDir} does not exist! Exiting"
+		exit 1
+	fi
+else
+	if [ ! -e "${coveragePerBaseDir}/${captKit}/${captKit}" ] # all other kits are checked against coveragePerBaseDir
+        then
+                echo "Bedfile in ${coveragePerBaseDir} does not exist! Exiting"
+                exit 1
+        fi
 fi
 
+
 echo "BATCHIDLIST=${EBROOTNGS_DNA}/batchIDList${batching}.csv"
 
 sh "${EBROOTMOLGENISMINCOMPUTE}/molgenis_compute.sh" \