#!/bin/bash

#   Config

#   More complete information on how to fill out
#       this Config file can be found at:
#       https://github.com/MorrellLAB/sequence_handling/wiki/Configuration

###################################################
##########     Shared across handlers    ##########
###################################################

#   Where are we storing the output files?
#       Final directory is ${OUT_DIR}/Name_of_Handler
OUT_DIR=

#   Name this project
PROJECT=

#   What email should we use for job notifications?
EMAIL=

#   What encoding is used for quality values?
#       Look at the FastQC files to determine the sequence encoding.
#       Choose from: 'sanger', 'illumina', 'solexa', or 'phred'
#           Illumina 1.8+ and Oxford Nanopore use 'sanger' encoding.
#
QUAL_ENCODING=

#   Sequencing platform technology
#       On what platform were the reads produced?
#       Valid options are:
#           CAPILLARY, LS454, ILLUMINA,
#           SOLID, HELICOS, IONTORRENT,
#           ONT, and PACBIO
SEQ_PLATFORM=

#   What reference genome are we using?
#       Include the full physical file path.
REF_GEN=

#   Is this organism barley?
#       Choose from: "true" or "false"
BARLEY=

#   Are you running the analysis on the Minnesota Supercomputing Institute (MSI)?
#       Choose from: "true" or "false"
MSI=

#   Are you submitting the job with qsub from PBS (Portable Batch System)?
#       Choose from: "true" or "false"
USE_PBS=

#   Do the quality scores need to be adjusted for GATK? Default: false
#       Change to true if you get errors from GATK like:
#       "<Sample> appears to be using the wrong encoding for quality scores: we encountered an extremely high quality score"
FIX_QUALITY_SCORES=false

#   Where should Picard/GATK store temporary files?
#       If you've encountered issues with running out of temp space
#           with picard, you can optionally specify a temp directory
#       Otherwise, leave blank
TMP=

############################################
##########     GBS_Demultiplex    ##########
############################################

#   What are our QSub settings for GBS_Demultiplex?
GD_QSUB="mem=1gb,nodes=1:ppn=4,walltime=6:00:00"

#   NOTE(review): GBS_SAMPLES has no description in this file; presumably the
#       list of GBS samples to demultiplex -- confirm against the wiki.
GBS_SAMPLES=

#   A key file with the following columns:
#       Flowcell Lane Barcode Sample PlateName Row Column
KEY_FILE=

#   Are the barcodes at the beginning or the end of the sequence?
#       Choose from: 'beginning' or 'end'
LINE_ENDING=

#   How many mismatches are allowed?
#       The value provided here is 1
MISMATCH_TOL=1

#   Do we allow partial overlap?
#       The value provided here is 0
PARTIAL=0

#   What extension or naming convention do the samples have?
FILE_TYPE=

############################################
##########   Quality_Assessment   ##########
############################################

#   What are our QSub settings for Quality_Assessment?
#       Below are the recommended settings
QA_QSUB="mem=1gb,nodes=1:ppn=4,walltime=6:00:00"

#   Provide a list of FastQ, SAM, or BAM files to be used
#       Include the full file path.
QA_SAMPLES=

#   What is the size of the genome in basepairs (for whole-genome sequencing)
#   or capture region (for exome capture)?
#   If unavailable, put "NA"
TARGET=

############################################
##########    Adapter_Trimming    ##########
############################################

#   What are our QSub settings for Adapter_Trimming?
#       Below are the recommended settings
AT_QSUB="mem=1gb,nodes=1:ppn=4,walltime=10:00:00"

#   Where is the list of original FastQ files?
#       Include the full file path.
RAW_SAMPLES=

#   What shared suffix do the forward samples have?
#       Example: _1_sequence.txt.gz
FORWARD_NAMING=_R1.fastq.gz

#   What shared suffix do the reverse samples have?
#       Example: _2_sequence.txt.gz
REVERSE_NAMING=_R2.fastq.gz

#   If you have single-end samples, leave
#       FORWARD_NAMING and REVERSE_NAMING
#       filled with values that do NOT
#       match your samples

#	  Where is the adapter file? Include the full physical file path.
ADAPTERS=

#   What is Scythe's prior contamination rate?
#       Scythe's documentation suggests starting at
#       0.05 and then experimenting as needed
PRIOR=0.05

############################################
##########    Quality_Trimming    ##########
############################################

#   What are our QSub settings for Quality_Trimming?
#       Below are the recommended settings
QT_QSUB="mem=1gb,nodes=1:ppn=4,walltime=10:00:00"

#   Where is our list of trimmed samples? Include the full file path.
ADAPTED_LIST=${OUT_DIR}/Adapter_Trimming/${PROJECT}_trimmed_adapters.txt

#   What shared suffix do the trimmed forward samples have?
#       If using the Adapter_Trimming handler,
#       leave the following definition as is
FORWARD_ADAPTED=_Forward_ScytheTrimmed.fastq.gz

#   What shared suffix do the trimmed reverse samples have?
#       If using the Adapter_Trimming handler,
#       leave the following definition as is
REVERSE_ADAPTED=_Reverse_ScytheTrimmed.fastq.gz

#   What shared suffix do the trimmed single samples have?
#       If using the Adapter_Trimming handler,
#       leave the following definition as is
SINGLES_ADAPTED=_Single_ScytheTrimmed.fastq.gz

#   If you have single-end samples, leave
#       FORWARD_ADAPTED and REVERSE_ADAPTED
#       filled with values that do NOT
#       match your samples

#   If you have paired-end samples, leave
#       SINGLES_ADAPTED filled with values that
#       do NOT match your samples

#   What is the threshold for trimming?
#       For standard trimming, use 20
QT_THRESHOLD=

############################################
##########  	Read_Mapping	  ##########
############################################

#   What are our QSub settings for Read_Mapping?
#       Below are the recommended settings
#       For large files, it may be necessary to use mesabi
RM_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Where is our list of trimmed samples?
#       Include the full file path.
TRIMMED_LIST=

#   How are trimmed forward samples named?
#       If using the Adapter_Trimming handler
#       FORWARD_TRIMMED=_Forward_ScytheTrimmed.fastq.gz
#       If using the Quality_Trimming handler
#       FORWARD_TRIMMED=_R1_trimmed.fastq.gz
FORWARD_TRIMMED=_Forward_ScytheTrimmed.fastq.gz

#   How are trimmed reverse samples named?
#       If using the Adapter_Trimming handler
#       REVERSE_TRIMMED=_Reverse_ScytheTrimmed.fastq.gz
#       If using the Quality_Trimming handler
#       REVERSE_TRIMMED=_R2_trimmed.fastq.gz
REVERSE_TRIMMED=_Reverse_ScytheTrimmed.fastq.gz

#   How are trimmed single-end samples named?
#       If using the Adapter_Trimming handler
#       SINGLES_TRIMMED=_Single_ScytheTrimmed.fastq.gz
#       If using the Quality_Trimming handler
#       SINGLES_TRIMMED=_single_trimmed.fastq.gz
SINGLES_TRIMMED=_Single_ScytheTrimmed.fastq.gz

#   BWA mem parameters; below are the defaults for BWA mem
#       Note that you may need to adjust parameters based on species
#       How many threads are we using?
THREADS=8

#       What is our minimum seed length?
SEED=8

#       What is the band width?
WIDTH=100

#       What is our off-diagonal x-dropoff (Z-dropoff)?
DROPOFF=100

#       What is our re-seed value?
RE_SEED=1.0

#       What is our cutoff value?
CUTOFF=10000

#       What is our matching score?
MATCH=1

#       What is our mismatch penalty?
MISMATCH=4

#       What is our gap penalty?
GAP=8

#       What is our gap extension penalty?
EXTENSION=1

#       What is our clipping penalty?
CLIP=5

#       What is our unpaired read penalty?
UNPAIRED=9

#       Do we rescue missing hits? Note, this means that reads may not be matched. Requires paired-end mode
RESCUE=false

#       Do we assume the first input query is interleaved?
INTERLEAVED=false

#       What is our minimum threshold?
RM_THRESHOLD=85

#       Do we output all alignments, and mark as secondary?
SECONDARY=false

#       Do we append FASTA/Q comments to SAM files?
APPEND=false

#       Do we use hard clipping?
HARD=false

#       Do we mark split hits as secondary?
SPLIT=true

#       What is the verbosity level?
#           Choose from:
#               'disabled', 'errors'
#               'warnings', 'all', or 'debug'
VERBOSITY='all'

############################################
##########     SAM_Processing     ##########
############################################

#   What are our QSub settings for SAM_Processing?
#       Below are the recommended settings
SP_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Which queue are we submitting the job to?
SP_QUEUE="small"

#   Specify how you would like your SAM files to be processed
#       Choose from:
#           'picard' (recommended) or 'samtools'
METHOD=

#   Number of threads used for SAM_Processing
#   Using more threads requires more memory
#       This is ignored when USE_PBS=true
SAM_PROCESSING_THREADS=

#   Where is the list of read-mapped samples?
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
MAPPED_LIST=

#   The next variable is only used if processing with Picard
#       If using SAMtools, this variable can be left blank

#   What is the maximum number of file handles that we can use?
#       For UNIX systems, the per-process maximum number of files
#           that can be open may be found with 'ulimit -n'
#       We recommend setting MAX_FILES to be slightly under this value
MAX_FILES=1000

############################################
##########    Coverage_Mapping    ##########
############################################

#   What are our Qsub settings for Coverage_Mapping?
#       Below are the recommended settings
CM_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Where is the list of finished BAM files?
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
BAM_LIST=

#   For exome capture, use the capture regions file (BED format)
#   For whole-genome sequencing, leave this blank
#   Coverage_Mapping should not be used for GBS data
REGIONS_FILE=

############################################
##########    Haplotype_Caller    ##########
############################################

#   What are our Qsub settings for Haplotype_Caller?
#       Below are the recommended settings
#     NOTE: "mem=" is used even if qsub is not used
HC_QSUB="mem=250gb,nodes=1:ppn=24,walltime=24:00:00"

#   Which queue are we submitting the job to?
HC_QUEUE="ram256g"

#   Number of threads used for Haplotype_Caller
#   Using more threads requires more memory
#   This is ignored when USE_PBS=true
HAPLOTYPE_CALLER_THREADS=

#   Where is the list of finished BAM files?
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
FINISHED_BAM_LIST=

#   What is the nucleotide diversity per base pair (Watterson's theta)?
#       For barley: 0.008
#       For soybean: 0.001
THETA=

#   If true, GATK will not trim down the active region from the full region
#       (active + extension) to just the active interval for genotyping
#       Recommended value: false
DO_NOT_TRIM_ACTIVE_REGIONS=false

#   If true, all bases will be considered active regions
#       Recommended value: false
FORCE_ACTIVE=false

#   Optional: Provide a list of custom intervals if you are interested in specific regions AND/OR want to parallelize processing across regions/chromosomes.
#   Default is set to false. Otherwise, fill in full path to intervals file.
#   IMPORTANT: GATK accepts the following file extensions:
#       .intervals, .list, .interval_list, .bed, and .vcf
#       sequence_handling Haplotype_Caller currently handles: .intervals, .list
#   For .intervals/.list files, format one per line as follows: chr1:100-200
#   For example files, see: https://github.com/MorrellLAB/sequence_handling/tree/master/Example_Files
#       For additional info, please see Wiki page.
HC_CUSTOM_INTERVALS=false

#   Optional: If your genome has scaffolds or parts of the reference not covered by
#       the chromosomes above, include the full filepath to a SORTED list of
#       names. One scaffold name per line.
#   Default is set to false. Otherwise, fill in full path to the scaffolds list file.
HC_SCAFFOLDS=false

#   Do we want to parallelize across regions?
#   Set to true if yes.
#   Default is set to false.
#       Parallelizing across regions for Haplotype_Caller means each
#       region/chromosome provided in HC_CUSTOM_INTERVALS list gets run
#       as its own job for every sample.
HC_PARALLELIZE=false

################################################
##########     Genomics_DB_Import     ##########
################################################

#   USAGE NOTE: Genomics DB Import is used in GATK4.

#   What are our Qsub settings for Genomics_DB_Import?
#       Below are the default settings but this depends on the number of samples
#     NOTE: mem has to be in units of gb, and it has to be greater than 4gb
#     NOTE2: "mem=" is used even if qsub is not used
GDBI_QSUB="mem=22gb,nodes=1:ppn=8,walltime=24:00:00"

#   Which queue are we submitting the job to?
GDBI_QUEUE="ram1t"

#   Number of threads used for Genomics_DB_Import
#   Using more threads requires more memory
#       This is ignored when USE_PBS=true
GDBI_THREADS=

#   REQUIRED: Please fill out variables under "Genotype_GVCFs" listed below.
#       Genomics_DB_Import shares variables with Genotype_GVCFs.

############################################
##########     Genotype_GVCFs     ##########
############################################

#   USAGE NOTE: For accurate results, run Genotype_GVCFs on all of your samples at once
#       DO NOT break this job into batches of samples

#   What are our Qsub settings for Genotype_GVCFs?
#       Below are the recommended settings
#     NOTE: "mem=" is used even if qsub is not used
GG_QSUB="mem=22gb,nodes=1:ppn=4,walltime=24:00:00"

#   Which queue are we submitting the job to?
GG_QUEUE="small"

#   Number of threads used for Genotype_GVCFs                                                                                                      
#   Using more threads requires more memory                                                                                                        
#       This is ignored when USE_PBS=true                                                                                                          
GG_THREADS=

#   Where is the list of GVCF files?
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
GVCF_LIST=

#   What is the nucleotide diversity per base pair (Watterson's theta)?
#   Genotype_GVCFs uses the THETA under Haplotype_Caller for this

#   Where is the reference dictionary file? (.dict)
#       Include the full file path.
REF_DICT=

#   How many chromosomes or chromosome parts does the reference have, excluding scaffolds? (integer value)
#       For barley: 15 (7*2 chromosome parts + chrUn)
#       For soybean: 20
NUM_CHR=

#   Optional: Provide a list of custom intervals if you are interested in specific regions AND/OR want to parallelize processing across regions/chromosomes.
#   IMPORTANT: GATK accepts the following file extensions:
#       .intervals, .list, .interval_list, .bed, and .vcf
#       sequence_handling currently handles: .intervals, .list, and .bed
#   For .intervals/.list files, format one per line as follows: chr1:100-200
#   For example files, see: https://github.com/MorrellLAB/sequence_handling/tree/master/Example_Files
#       For additional info, please see Wiki page.
#   Default is set to false. Otherwise, fill in full path to intervals file.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored (uses HC_CUSTOM_INTERVALS).
CUSTOM_INTERVALS=false

#   Optional: If your genome has scaffolds or parts of the reference not covered by
#       the chromosomes above, include the full filepath to a SORTED list of
#       names. One scaffold name per line.
#   Default is set to false. Otherwise, fill in full path to the scaffolds list file.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored (uses HC_SCAFFOLDS).
SCAFFOLDS=false

#   Do we want to parallelize across regions?
#   Set to true if yes.
#   Default is set to false.
#       Parallelizing across regions means each region/chromosome provided in
#       CUSTOM_INTERVALS list gets run as its own job.
#       In the case that we are parallelizing across regions AND we have scaffolds,
#       regions in CUSTOM_INTERVALS will be parallelized but the scaffolds will not be parallelized.
#       Scaffolds will instead be imported into their own gendb workspace.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored.
PARALLELIZE=false

#  When parallelized, separate vcf output files are created for separate intervals in
#    ${OUT_DIR}/Genotype_GVCFs/vcf_split_regions
#  If you set the following option to true, these separate files get combined into a single vcf:
#     ${OUT_DIR}/Genotype_GVCFs/raw_variants.vcf
#  This option is ignored if PARALLELIZE=false
#     NOTE: this could require lots of memory, so if it fails, you need to increase
#           mem= option in GG_QSUB (this option is used even if you are not using qsub)
#   Default is set to false.
GG_COMBINED_VCF=false

#   What is the sample ploidy? (integer value)
#   For highly inbred samples (most barleys): 1
PLOIDY=

############################################
##########    Create_HC_Subset    ##########
############################################

#   What are our Qsub settings for Create_HC_Subset?
#       Below are the recommended settings
CHS_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Which queue are we submitting the job to?
CHS_QUEUE="small"

#   Where is the list of chromosome part VCF files?
#       This step will concatenate VCF files split into parts.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
CHS_VCF_LIST=

#   If exome capture, include the full file path to the capture regions file (BED format)
#   This should be the same file as the REGIONS_FILE in Coverage_Mapping. In this case, put "${REGIONS_FILE}" below.
#   If not exome capture, put "NA"
CAPTURE_REGIONS=

#   What is the depth per sample (DP) cutoff? (integer)
#       If a sample's DP is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 5
CHS_DP_PER_SAMPLE_CUTOFF=5

#   What is the genotyping quality (GQ) cutoff? (integer)
#       If a sample's GQ is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 10th percentile of the raw GQ percentile table
CHS_GQ_CUTOFF=

#   What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site? (integer)
#       Sites with more "bad" samples than this threshold will be filtered out.
#       Recommended value: total number of samples * 0.2 (rounded to the nearest whole number)
CHS_MAX_BAD=

#   What is the maximum number of samples at a site that can be heterozygous? (integer)
#       Recommended value: total number of samples * 0.9 (rounded to the nearest whole number)
CHS_MAX_HET=

#   What is the QUAL score cutoff? (integer)
#       Sites with a QUAL below this cutoff will be excluded.
#       Recommended value: 40
CHS_QUAL_CUTOFF=40

############################################
##########  Variant_Recalibrator  ##########
############################################

#   DO NOT RUN VARIANT_RECALIBRATOR IF YOU HAVE LESS THAN 30 EXOME SAMPLES
#   GATK has empirically found in human data that you need at least 1 whole genome or 30 exome samples to have enough variant sites for decent modeling.
#   What matters is having a large number of variant sites, so this minimum number of samples is only an estimate.
#   See documentation here for details: https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR-

#   What are our Qsub settings for Variant_Recalibrator?
#       Below are the recommended settings
VR_QSUB="mem=250gb,nodes=1:ppn=4,walltime=24:00:00"

#   Which queue are we submitting the job to?
VR_QUEUE="ram1t"

#   What reference genome are we using?
#       Include the full file path.
#       If barley AND running GATK3, make sure to use the pseudomolecular reference
#           NOT the parts reference used above
#       If running GATK4, make sure this is the same reference used during Haplotype_Caller and Genotype_GVCFs
VR_REF=

#   Do we have a single concatenated raw VCF file output from Create_HC_Subset?
#   If not, put "NA" below
#   GATK4 only accepts a single input variant file, unlike earlier versions of GATK, which accepted multiple input variant files.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
VR_RAW_VCF=

#   Where is the list of chromosome part VCF files from Genotype_GVCFs?
#   Leave as the default "NA" if we already have a concatenated raw VCF file. Change otherwise.
#   GATK3 accepts multiple input variant files but GATK4 does not. If we are running GATK4, the handler will
#   take the list of split VCF files and concatenate them for you.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
VR_VCF_LIST="NA"

#   Which annotations do we want to include?
#   INDEL and SNPs are recalibrated separately, for flexibility you can specify different sets of annotations to use for each below.
#   Please follow this example format to add/exclude annotations:
#       VR_ANN_INDEL="-an FS -an ReadPosRankSum -an MQRankSum -an QD -an SOR -an DP"
#   The annotations included below are used as defaults, please add/exclude annotations as you see fit.
#   Specifically, read up on GATK Variant Recalibrator's caveats for specific types of data (example: DP annotation should not be used for exome datasets)
#   See doc here for what the annotations mean: https://gatk.broadinstitute.org/hc/en-us/articles/360035890471
#   NOTE: MQ is intentionally left out when recalibrating indels, see this tutorial for more info: https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering
#   IMPORTANT: DP should NOT be used for exome datasets (see here https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR-)
VR_ANN_INDEL="-an QD -an FS -an ReadPosRankSum -an MQRankSum -an SOR -an DP"
VR_ANN_SNP="-an QD -an FS -an ReadPosRankSum -an MQ -an MQRankSum -an SOR -an DP"

#   EXTRA OPTIONS. Are there additional options you would like to include?
#   Default is "NA", modify otherwise. DO NOT leave blank!
#   Use the same options format as listed in the gatk docs except separated by space, all on one line, and surrounded by double quotes.
#       Example format: RECAL_EXTRA_OPTIONS_INDEL="-L genomic_intervals -titv 2.15"
#   Recalibration step:
#   INDEL mode: Currently, indel recalibration uses the following options:
#       -R, -V, -an (see VR_ANN_INDEL and VR_ANN_SNP above), -mode INDEL, -O, --resource, --tranches-file, --rscript-file
RECAL_EXTRA_OPTIONS_INDEL="NA"
#   SNP mode: Currently, SNP recalibration uses the following options:
#       -R, -V, -an (see VR_ANN_INDEL and VR_ANN_SNP above), -mode SNP, -O, --resource, --tranches-file, --rscript-file
RECAL_EXTRA_OPTIONS_SNP="NA"

#   ApplyVQSR step (filtering step):
#   INDEL mode: Currently, indel ApplyVQSR uses the following options:
#       -R, -V, -mode INDEL, --truth-sensitivity-filter-level, --recal-file, --tranches-file, --create-output-variant-index, -O
FILTER_EXTRA_OPTIONS_INDEL="NA"
#   SNP mode: Currently, snp ApplyVQSR uses the following options:
#       -R, -V, -mode SNP, --truth-sensitivity-filter-level, --recal-file, --tranches-file, --create-output-variant-index, -O
FILTER_EXTRA_OPTIONS_SNP="NA"

#   Where are resource VCF files and their priors (integers) to be used for training our model?
#       At least one resource (high confidence subset counts as one) and prior pair is required.
#   IMPORTANT: GATK 4 requires both training AND truth sets to run, otherwise it will return an error.
#       These can be supplied in a single resource or as separate resources (fill out the next section below).
#   Where is the high confidence subset of variants?
#   The high confidence set recommended settings: known=false,training=true,truth=false
#       If you used Create_HC_Subset, leave as the default
HC_SUBSET=${OUT_DIR}/Create_HC_Subset/${PROJECT}_high_confidence_subset.vcf
HC_PRIOR=5.0
HC_KNOWN=false
HC_TRAIN=true
HC_TRUTH=false

#   Do we have additional resource VCF files we can use?
#   "NA" is the default for missing resource files and priors, modify otherwise
#   Please specify "true" or "false" for KNOWN, TRAINING, and TRUTH for each resource below (this is currently set to defaults)
#   See GATK's documentation for definitions of known, training, and truth sets:
#       https://gatk.broadinstitute.org/hc/en-us/articles/360035890831-Known-variants-Training-resources-Truth-sets
RESOURCE_1="NA"
PRIOR_1="NA"
KNOWN_1=true
TRAINING_1=true
TRUTH_1=false

RESOURCE_2="NA"
PRIOR_2="NA"
KNOWN_2=true
TRAINING_2=true
TRUTH_2=false

RESOURCE_3="NA"
PRIOR_3="NA"
KNOWN_3=true
TRAINING_3=true
TRUTH_3=false

RESOURCE_4="NA"
PRIOR_4="NA"
KNOWN_4=true
TRAINING_4=true
TRUTH_4=false

#   What is our truth sensitivity filter level?
#       Default is set to 99.9
TS_FILTER_LEVEL="99.9"

############################################
##########    Variant_Filtering   ##########
############################################

#   What are our Qsub settings for Variant_Filtering?
#       Below are the recommended settings
VF_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Where is the VCF file to be filtered?
#	If you used Variant_Recalibrator, leave as the default
VF_VCF=${OUT_DIR}/Variant_Recalibrator/${PROJECT}_recalibrated.vcf

#   If exome capture, include the full file path to the capture regions file (BED format)
#   For barley use the pseudomolecular BED file, not the parts file
#   If not exome capture, put "NA"
VF_CAPTURE_REGIONS=

#   What is the minimum number of reads needed to support a genotype? (integer)
#       Recommended value: 5
MIN_DP=5

#   What is the maximum number of reads allowed to support a genotype? (too many = gene duplication problems) (integer)
#       Recommended value: 95th percentile of the raw DP per sample percentile table
MAX_DP=

#   What is the maximum percent deviation from 50/50 reference/alternative reads allowed in heterozygotes? (decimal)
#       For example, MAX_DEV=0.1 allows 60/40 ref/alt and also 40/60 ref/alt but not anything more skewed
#       Recommended value: 0.1
MAX_DEV=0.1

#   What is the depth per sample (DP) cutoff? (integer)
#       If a sample's DP is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 5
DP_PER_SAMPLE_CUTOFF=5

#   What is the genotyping quality (GQ) cutoff? (integer)
#       If a sample's GQ is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 10th percentile of the raw GQ percentile table
GQ_CUTOFF=

#   What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site? (integer)
#       Sites with more "bad" samples than this threshold will be filtered out.
#       Recommended value: total number of samples * 0.2 (rounded to the nearest whole number)
MAX_BAD=

#   What is the maximum number of samples at a site that can be heterozygous? (integer)
#       Recommended value: total number of samples * 0.9 (rounded to the nearest whole number)
MAX_HET=

#   What is the QUAL score cutoff? (integer)
#       Sites with a QUAL below this cutoff will be excluded.
#       Recommended value: 40
QUAL_CUTOFF=40

############################################
##########    Variant_Analysis    ##########
############################################

#   What are our Qsub settings for Variant_Analysis?
#       Below are the recommended settings
VA_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Where is the VCF file? Include the full file path.
VA_VCF=

############################################
##########      Dependencies      ##########
############################################

#   This section defines installations to
#       various dependencies for sequence_handling.
#   If you are using the Minnesota Supercomputing Institute cluster
#       then set MSI to "true" so that the 'module load' lines below are run.
#       Make sure you have access to all '_ML' modules.
#   If you need to install a dependency from source,
#       then uncomment the lines for the dependency and the
#       'export PATH=', and write the full path to the executable
#       for the program. For example:
#       PARALLEL=${HOME}/software/parallel-20151122/bin/parallel
#   If you have a system-wide installation for a program, you can
#       leave all lines commented out. sequence_handling will find
#       system-wide installed programs automatically.

#   Please visit https://github.com/MorrellLab/sequence_handling/wiki/Dependencies
#       for information on version requirements and compatibility

#   Load the sequence_handling dependencies, but only when MSI is "true".
#       Everything between this 'if' and the closing 'fi' is skipped otherwise.
#   Besides the 'module load' calls, this block sets R_LIBS_USER, GATK_JAR,
#       and PICARD_JAR.
#   NOTE(review): the commented-out '<TOOL>=' / 'export PATH=' templates also
#       sit inside this guard, so uncommenting them has no effect unless
#       MSI is "true" -- confirm this is intended for non-MSI installs.
if [[ "${MSI}" == "true" ]]; then
    #   Do we have GNU parallel installed?
    module load parallel/20190122
    #PARALLEL=
    #export PATH=${PARALLEL}:${PATH}

    #   Do we have the Fastx Toolkit installed?
    module load fastx_toolkit/0.0.14
    #FASTX_TOOLKIT=
    #export PATH=${FASTX_TOOLKIT}:${PATH}

    #   Do we have FastQC installed?
    module load fastqc/0.11.8
    #FASTQC=
    #export PATH=${FASTQC}:${PATH}

    #   Do we have Riss-util installed?
    module load riss_util/1.0
    #RISS_UTIL=
    #export PATH=${RISS_UTIL}:${PATH}

    #   Do we have Seqqs installed?
    module load seqqs_ML/3d05375
    #SEQQS=
    #export PATH=${SEQQS}:${PATH}

    #   Do we have Sickle installed?
    module load sickle_ML/1.33
    #SICKLE=
    #export PATH=${SICKLE}:${PATH}

    #   Do we have Scythe installed?
    module load scythe_ML/0.994
    #SCYTHE=
    #export PATH=${SCYTHE}:${PATH}

    #   Do we have R installed?
    module load R/3.6.3
    #R=
    #export PATH=${R}:${PATH}
    #   REQUIRED if running on MSI: Where are the local R libraries stored?
    #   To figure out the path, run the following interactively on MSI:
    #       module load R/3.6.3
    #       R # Start up R session on MSI
    #       Sys.getenv()
    #   Now, look for R_LIBS_USER and paste the path below
    #   Use ${HOME} rather than "~": a tilde inside double quotes is not
    #       expanded by the shell and would be stored literally.
    R_LIBS_USER="${HOME}/R/x86_64-pc-linux-gnu-library/3.6"

    #   Do we have BWA installed?
    module load bwa/0.7.17
    #BWA=
    #export PATH=${BWA}:${PATH}

    #   Do we have SAMTools installed?
    module load samtools/1.9
    #SAMTOOLS=
    #export PATH=${SAMTOOLS}:${PATH}

    #   Do we have BEDTools 2.17.0 installed?
    module load bedtools/2.17.0
    #BEDTOOLS=
    #export PATH=${BEDTOOLS}:${PATH}

    #   Do we have htslib 1.9 installed?
    #HTSLIB=
    #export PATH=${HTSLIB}:${PATH}
    module load htslib/1.9

    #   Do we have Freebayes 1.3.1 from commit on 20190710 installed?
    #FREEBAYES=
    #export PATH=${FREEBAYES}:${PATH}
    module load freebayes_ML/1.3.1_20190710

    #   Do we have Java installed?
    #module load java/jdk1.8.0_45
    #JAVA=
    #export PATH=${JAVA}:${PATH}
    module load java/openjdk-8_202

    #   What is the full file path for the GATK jar file?
    #   You need BOTH the jar file and the module
    #   NOTE(review): this path ends in 'gatk', not a '.jar' file -- confirm
    GATK_JAR=/panfs/roc/groups/9/morrellp/public/Software/gatk-4.1.8.0/gatk
    module load gatk_ML/4.1.8

    #   What is the full file path for the Picard jar file?
    PICARD_JAR=/panfs/roc/groups/9/morrellp/public/Software/picard_ML_2.23.1/picard.jar

    #   Do we have vcftools installed?
    module load vcftools_ML/0.1.14
    #VCFTOOLS=
    #export PATH=${VCFTOOLS}:${PATH}

    #   Do we have vcflib installed?
    module load vcflib_ML/1.0.0_rc2
    #VCFLIB=
    #export PATH=${VCFLIB}:${PATH}

    #   Do we have python 3 installed?
    module load python3_ML/3.7.1_anaconda
    #PYTHON3=
    #export PATH=${PYTHON3}:${PATH}

    #   Do we have analysis installed?
    module load analysis/0.8.2
    #ANALYSIS=
    #export PATH=${ANALYSIS}:${PATH}

    #   Do we have bcftools installed?
    module load bcftools/1.9
    #BCFTOOLS=
    #export PATH=${BCFTOOLS}:${PATH}

    #   Do we have python-epd installed?
    #module load python-epd/1.5.2
    #   (PYTHON_EPD, not PYTHON-EPD: '-' is not valid in a variable name)
    #PYTHON_EPD=
    #export PATH=${PYTHON_EPD}:${PATH}
    #   NOTE(review): module is named python3_ML but the version is 2.7.13 -- confirm
    module load python3_ML/2.7.13

    #   Do we have texlive installed?
    module load texlive/20131202
    #TEXLIVE=
    #export PATH=${TEXLIVE}:${PATH}
fi