Skip to content

Commit

Permalink
Merge pull request #468 from Gerbenvandervries/NGS_RNA-3.2.2
Browse files Browse the repository at this point in the history
for release 3.2.2, variant calling per chr, bugfixes qc report per sample…
  • Loading branch information
RoanKanninga committed Mar 30, 2016
2 parents e290508 + f147d08 commit db521ab
Show file tree
Hide file tree
Showing 14 changed files with 248 additions and 116 deletions.
26 changes: 26 additions & 0 deletions compute5/NGS_RNA/chromosomes.homo_sapiens.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
chr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
23 changes: 23 additions & 0 deletions compute5/NGS_RNA/chromosomes.mus_musculus.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
chr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
11 changes: 3 additions & 8 deletions compute5/NGS_RNA/generate_template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,9 @@ GAF="/groups/umcg-gaf/${TMPDIR}"
BUILD="b37" # b37, b38
ENVIRONMENT="calculon" # zinc-finger, calculon
SPECIES="homo_sapiens" # callithrix_jacchus, mus_musculus, homo_sapiens
PIPELINE="lexogen" # hisat, lexogen
PIPELINE="hisat" # hisat, lexogen

SAMPLESIZE=$(cat ${GAF}/generatedscripts/${PROJECT}/${PROJECT}.csv | wc -l)
if [ $SAMPLESIZE -gt 200 ]
then
WORKFLOW=${EBROOTNGS_RNA}/workflow_${PIPELINE}_samplesize_bigger_than_200.csv
else
WORKFLOW=${EBROOTNGS_RNA}/workflow_${PIPELINE}.csv
fi
WORKFLOW=${EBROOTNGS_RNA}/workflow_${PIPELINE}.csv

if [ -f .compute.properties ];
then
Expand Down Expand Up @@ -59,4 +53,5 @@ ngsversion=$(module list | grep -o -P 'NGS_RNA(.+)');\
worksheet=${GAF}/generatedscripts/${PROJECT}/${PROJECT}.csv;\
parameters_build=${GAF}/generatedscripts/${PROJECT}/parameters.${BUILD}.csv;\
parameters_species=${GAF}/generatedscripts/${PROJECT}/parameters.${SPECIES}.csv;\
parameters_chromosomes=${EBROOTNGS_RNA}/chromosomes.${SPECIES}.csv;\
parameters_environment=${GAF}/generatedscripts/${PROJECT}/parameters.${ENVIRONMENT}.csv;"
11 changes: 7 additions & 4 deletions compute5/NGS_RNA/parameters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ ensembleDir,${dataDir}/ftp.ensembl.org/pub/release-${ensembleReleaseVersion}/gtf
geneAnnotationTxt,${ensembleDir}/${annotationFileName}.${ensembleReleaseVersion}.annotation.geneIds.txt.gz
annotationGtf,${ensembleDir}/${annotationFileName}.${ensembleReleaseVersion}.gtf
annotationRefFlat,${ensembleDir}/${annotationFileName}.${ensembleReleaseVersion}.gtf.annotation.refFlat
annotationIntervalList,${ensembleDir}/${annotationFileName}.${ensembleReleaseVersion}.rrna.interval_list
indexFile,${indexSpecies}
dbsnpVcf,${dbSNPDir}${dbSNPFileID}.vcf

Expand Down Expand Up @@ -106,16 +107,18 @@ GatkHaplotypeCallerGvcf,${intermediateDir}${externalSampleID}.GatkHaplotypeCalle
GatkHaplotypeCallerGvcfidx,${intermediateDir}${externalSampleID}.GatkHaplotypeCallerGvcf.g.vcf.idx
GatkMergeGvcf,${intermediateDir}${externalSampleID}.MergeGvcf.g.vcf
GatkMergeGvcfidx,${intermediateDir}.MergeGvcf.g.vcf.idx
projectBatchGenotypedVariantCalls,${projectPrefix}.variant.calls.genotyped.vcf
projectBatchCombinedVariantCalls,${projectPrefix}.variant.calls.combined.g.vcf
projectBatchGenotypedVariantCalls,${projectPrefix}.variant.calls.genotyped.chr${chr}.vcf
projectBatchCombinedVariantCalls,${projectPrefix}.variant.calls.combined.chr${chr}.g.vcf

##### Protocols 2,7 (QCStats, QC_Report) #####
collectMultipleMetricsPrefix,${intermediateDir}${externalSampleID}
flagstatMetrics,${intermediateDir}${externalSampleID}.flagstat
dupStatMetrics,${intermediateDir}${externalSampleID}.mdupmetrics
rnaSeqMetrics,${intermediateDir}${externalSampleID}.collectrnaseqmetrics
alignmentMetrics,${intermediateDir}${externalSampleID}.alignment_summary_metrics
insertsizeMetrics,${intermediateDir}${externalSampleID}.insertsizemetrics
insertsizeMetricspdf,${intermediateDir}${externalSampleID}.insertsizemetrics.pdf
insertsizeMetricspng,${intermediateDir}${externalSampleID}.insertsizemetrics.png
insertsizeMetricspdf,${intermediateDir}${externalSampleID}.insert_size_histogram.pdf
insertsizeMetricspng,${intermediateDir}${externalSampleID}.insert_size_histogram.png
qcMatricsList,${intermediateDir}/${project}_qcMatricsList.txt
gcPlotList,${intermediateDir}/${project}_gcPlotList.txt
recreateinsertsizepdfR,createInsertSizePlot.R
Expand Down
5 changes: 3 additions & 2 deletions compute5/NGS_RNA/parameters.homo_sapiens.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#### GENOME VARIABLES ####
species,homo_sapiens
annotationFileName,Homo_sapiens.${GR}
indexFileID,human_g1k_v37_phiX
indexFileID,human_g1k_v37
indexFolderName,human_g1k_v37
dbSNPFileID,dbsnp_137.${genome}
indicesDir,${dataDir}/1000G/phase1
indicesDir,${dataDir}/ftp.broadinstitute.org/bundle/2.8/${genome}/
dbSNPDir,${dataDir}/dbSNP/
hisatIndex,${dataDir}/ftp.broadinstitute.org/bundle/2.8/${genome}/${hisatVersion}/${indexFolderName}
kallistoIndex,${indicesDir}/${indexFileID}.cdna.all.fa.idx
indexFileFastaIndex,${indexSpecies}.fai
indexSpecies,${indicesDir}/${indexFileID}.fasta
indexChrIntervalList,${indicesDir}/${indexFileID}.chr${chr}.interval_list
indexFileDictionary,${indexSpecies}.dict
56 changes: 37 additions & 19 deletions compute5/NGS_RNA/protocols/CopyToResultsDir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#string pythonVersion
#string gatkVersion
#string ghostscriptVersion
#string kallistoVersion
#string ensembleReleaseVersion

# Change permissions
Expand All @@ -39,6 +38,7 @@ mkdir -p ${projectResultsDir}/expression/perSampleExpression
mkdir -p ${projectResultsDir}/expression/expressionTable
mkdir -p ${projectResultsDir}/images
mkdir -p ${projectResultsDir}/variants
mkdir -p ${projectResultsDir}/qcmetrics

# Copy error, out and finished logs to project jobs directory

Expand All @@ -56,7 +56,7 @@ cp ${projectJobsDir}/${project}.csv ${projectResultsDir}

# Copy BAM plus index plus md5 sum to results directory

if [ -f "${intermediateDir}/*.unique_mapping_reads.sorted.merged.dedup.splitAndTrim.bam" ]
if [ "${intermediateDir}/*.unique_mapping_reads.sorted.merged.dedup.splitAndTrim.bam" ]
then
cp ${intermediateDir}/*.unique_mapping_reads.sorted.merged.dedup.splitAndTrim.bam ${projectResultsDir}/alignment
cp ${intermediateDir}/*.unique_mapping_reads.sorted.merged.dedup.splitAndTrim.bam.md5 ${projectResultsDir}/alignment
Expand All @@ -69,8 +69,25 @@ else
cp ${intermediateDir}/*.unique_mapping_reads.sorted.merged.dedup.bai.md5 ${projectResultsDir}/alignment

fi
cp ${intermediateDir}/*.hisat.final.log ${projectResultsDir}/alignment
cp ${intermediateDir}/*.flagstat ${projectResultsDir}/alignment

# copy qc metrics to qcmetrics folder

cp ${intermediateDir}/*.hisat.log ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.quality_by_cycle_metrics ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.quality_by_cycle.pdf ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.quality_distribution.pdf ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.quality_distribution_metrics ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.base_distribution_by_cycle.pdf ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.base_distribution_by_cycle_metrics ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.alignment_summary_metrics ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.flagstat ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.mdupmetrics ${projectResultsDir}/qcmetrics
cp ${intermediateDir}/*.collectrnaseqmetrics ${projectResultsDir}/qcmetrics

if [ "${intermediateDir}/*.insert_size_metrics" ]
then
cp ${intermediateDir}/*.insert_size_metrics ${projectResultsDir}/qcmetrics
fi

# copy GeneCounts to results directory

Expand All @@ -86,26 +103,25 @@ fi

# Copy variants vcfs to results directory

if [ -f "${intermediateDir}/${project}.variant.calls.genotyped.vcf" ]
if [ "${intermediateDir}/${project}.variant.calls.genotyped.*.vcf" ]
then
cp ${intermediateDir}/${project}.variant.calls.genotyped.vcf ${projectResultsDir}/variants
cp ${intermediateDir}/${project}.variant.calls.genotyped.*.vcf* ${projectResultsDir}/variants
fi
#only available with PE
if [ -f "${intermediateDir}/*.insertsizemetrics.png" ]
if [ "${intermediateDir}/*.insert_size_metrics.png" ]
then
cp ${intermediateDir}/*.insertsizemetrics.png ${projectResultsDir}/images
cp ${intermediateDir}/.insert_size_histogram.pdf ${projectResultsDir}/images
fi


# write README.txt file

cat > ${projectResultsDir}/README.txt <<'endmsg'
Patrick Deelen
Morris A. Swertz
University of Groningen, University Medical Center Groningen, Genomics Coordination Center, Groningen, the Netherlands
University of Groningen, University Medical Center Groningen, Department of Genetics, Groningen, the Netherlands
Please use both affiliations
Description of the different steps used in the RNA analysis pipeline
Expand All @@ -121,18 +137,18 @@ sequenced on an Illumina HiSeq2500 using default parameters (single read 1x50bp
End 2 x 100 bp) in pools of multiple samples.
Gene expression quantification
The trimmed fastQ files where aligned to build ${indexFileID} ensembleResease ${ensembleReleaseVersion} reference genome using hisat
${hisatVersion} [1] allowing for 2 mismatches. Before gene quantification
SAMtools ${samtoolsVersion} [2] was used to sort the aligned reads.
The gene level quantification was performed by HTSeq in Anaconda ${anacondaVersion} [3] using --mode=union
--stranded=no and, Ensembl version ${ensembleReleaseVersion} was used as gene annotation database which is included
in folder expression/.
The trimmed fastQ files where aligned to build ${indexFileID} ensembleResease ${ensembleReleaseVersion}
reference genome using ${hisatVersion} [1] with default settings. Before gene quantification
${samtoolsVersion} [2] was used to sort the aligned reads.
The gene level quantification was performed by HTSeq-count ${htseqVersion} [3] using --mode=union,
Ensembl version ${ensembleReleaseVersion} was used as gene annotation database which is included
in folder expression/.
Calculate QC metrics on raw and aligned data
Quality control (QC) metrics are calculated for the raw sequencing data. This is done using
the tool FastQC ${fastqcVersion} [4]. QC metrics are calculated for the aligned reads using
Picard-tools ${picardVersion} [5] CollectRnaSeqMetrics, MarkDuplicates, CollectInsertSize-
Metrics and SAMtools ${samtoolsVersion} flagstat.
Metrics and {samtoolsVersion} flagstat.
GATK variant calling
#TODO
Expand All @@ -144,10 +160,12 @@ The zipped archive contains the following data and subfolders:
- expression: textfiles with gene level quantification per sample and per project.
- fastqc: FastQC output
- images: QC images
- qcmetrics: Multiple qcMetrics generated with Picard-tools or SAMTools Flagstat.
- variants: Variants calls using GATK. (optional)
- rawdata: raw sequence file in the form of a gzipped fastq file (.fq.gz)
The root of the results directory contains the final QC report, and the samplesheet which
were the basis for this analysis.
The root of the results directory contains the final QC report, README.txt and the samplesheet which
form the basis for this analysis.
Used toolversions:
Expand Down Expand Up @@ -182,7 +200,7 @@ cd ${projectResultsDir}

zip -gr ${projectResultsDir}/${project}.zip fastqc
zip -g ${projectResultsDir}/${project}.zip ${project}.csv
zip -gr ${projectResultsDir}/${project}.zip alignment
zip -gr ${projectResultsDir}/${project}.zip qcmetrics
zip -gr ${projectResultsDir}/${project}.zip expression
zip -gr ${projectResultsDir}/${project}.zip variants
zip -gr ${projectResultsDir}/${project}.zip images
Expand Down
2 changes: 2 additions & 0 deletions compute5/NGS_RNA/protocols/CreateInhouseRnaSeqProjects.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#string parameters_build
#string parameters_species
#string parameters_environment
#string parameters_chromosomes
#string ngsversion

#string worksheet
Expand Down Expand Up @@ -133,6 +134,7 @@ sh ${EBROOTMOLGENISMINCOMPUTE}/molgenis_compute.sh \
-p ${parameters_build} \
-p ${parameters_species} \
-p ${parameters_environment} \
-p ${parameters_chromosomes} \
-p ${projectJobsDir}/${project}.csv -rundir ${projectJobsDir} \
-w ${workflowpath} -b slurm -g -weave -runid ${runid} \
-o "ngsversion=${ngsversion};"
50 changes: 25 additions & 25 deletions compute5/NGS_RNA/protocols/GatkGenotypeGvcf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#string dbsnpVcf
#string gatkVersion
#string indexFile
#list GatkHaplotypeCallerGvcf,GatkHaplotypeCallerGvcfidx
#list externalSampleID
#string intermediateDir
#string projectPrefix
#string projectBatchGenotypedVariantCalls
Expand Down Expand Up @@ -43,29 +43,27 @@ ${checkStage}

echo "## "$(date)" Start $0"

SAMPLESIZE=$(cat ${projectJobsDir}/${project}.csv | wc -l)
numberofbatches=$(($SAMPLESIZE / 200))
INPUTS=()
ALLGVCFs=()

if [ $SAMPLESIZE -gt 200 ]
then
for b in $(seq 0 $numberofbatches)
do
if [ -f ${projectBatchCombinedVariantCalls}.$b ]
then
ALLGVCFs+=(--variant ${projectBatchCombinedVariantCalls}.$b)
fi
done
else
for sbatch in "${GatkHaplotypeCallerGvcf[@]}"
do
if [ -f $sbatch ]
then
array_contains ALLGVCFs "--variant $sbatch" || ALLGVCFs+=("--variant $sbatch")
fi
done
fi
for external in "${externalSampleID[@]}"
do
array_contains INPUTS "$external" || INPUTS+=("$external") # If vcfFile does not exist in array add it
done

SAMPLESIZE=${#INPUTS[@]}
numberofbatches=$(($SAMPLESIZE / 200))

for b in $(seq 0 $numberofbatches)
do
if [ -f ${projectBatchCombinedVariantCalls}.$b ]
then
ALLGVCFs+=("--variant ${projectBatchCombinedVariantCalls}.$b")
fi
done

GvcfSize=${#ALLGVCFs[@]}

if [ ${GvcfSize} -ne 0 ]
then

Expand All @@ -81,14 +79,16 @@ then

mv ${tmpProjectBatchGenotypedVariantCalls} ${projectBatchGenotypedVariantCalls}
echo "moved ${tmpProjectBatchGenotypedVariantCalls} to ${projectBatchGenotypedVariantCalls} "

cd ${intermediateDir}
md5sum $(basename ${projectBatchGenotypedVariantCalls})> $(basename ${projectBatchGenotypedVariantCalls}).md5sum
cd -
echo "succes moving files"


else
echo ""
echo "there is nothing to genotype, skipped"
echo ""
fi





Loading

0 comments on commit db521ab

Please sign in to comment.