-
Notifications
You must be signed in to change notification settings - Fork 3
The Structure of a Project Configuration File
[General]
# Directory where kakapo places all of its output.
output_directory = /home/kakapo/kakapo-output
# Short name for the analysis. Project-specific output is stored in
# [output_directory]/02-project-specific/[project_name].
project_name = kakapo-prj-01
# NCBI API key (placeholder shown). With a key, the GenBank limit is
# 10 requests/second instead of 3.
entrez_api_key = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Yes: process the reads with Rcorrector.
run_rcorrector = Yes
# Yes: submit the translated CDS sequences found by kakapo to InterProScan
# for functional annotation.
run_inter_pro_scan = No
# Yes: prepend the sample name to the assembled isoform names.
# No: keep the unaltered names produced by SPAdes (discouraged when more
# than one sample is analyzed).
prepend_assembly_name_to_sequence_name = Yes
# A value between 0 and 1. Higher values reduce the number of reads
# classified (filtered out); lower values increase it.
kraken_2_confidence = 0.20
# Days. GenBank and/or Pfam searches are not repeated if existing results
# are less than this many days old.
requery_after = 7
# Yes: add color to the log messages in the terminal.
use_colors = Yes
-
output_directory
: a path to a directory where kakapo should place all of its output. -
project_name
: a short (hopefully meaningful) name for the analysis. kakapo will create a subdirectory with this name where a bunch of project-specific output will be stored (log files, backups of the configuration files used, assembled sequences, etc.): [output_directory]/02-project-specific/[project_name]. -
entrez_api_key
: kakapo will make use of the public resources on GenBank. GenBank users are allowed 3 requests/second without an API key. With an API key, the limit is increased to 10 requests/second. You will need to generate the key here: https://www.ncbi.nlm.nih.gov/account/settings -
run_rcorrector
: if set to Yes, the reads will be processed by Rcorrector. -
run_inter_pro_scan
: if set to Yes, the translated CDS sequences found by kakapo will be submitted for functional annotation by InterProScan to https://www.ebi.ac.uk/interpro/search/sequence. -
prepend_assembly_name_to_sequence_name
: if set to Yes, will prepend the sample name to the assembled isoform names. The sample name is derived based on the type of input:
- SRA: GenBank metadata.
- FASTQ: file name.
- FASTA (user-provided assembly): file name.
If set to No, unaltered names produced by SPAdes will be used in kakapo output. I highly discourage setting this option to No when more than one sample is being analyzed. -
kraken_2_confidence
: this should be set to a value between 0 and 1. I find that a value of 0.20 works quite well. Higher confidence values will reduce the number of reads classified (filtered out), and lower values will increase it. See the discussion here for more guidance. -
requery_after
: Do not search GenBank and/or Pfam if the search was performed already and the results are less than this many days old. -
use_colors
: if set to Yes, adds color to the log messages in the terminal; this may not look great on light terminal backgrounds.
[Target filters]
# Start/stop codon requirements applied when annotating ORFs.
#
# Allow other start codons, in addition to AUG.
# A set of appropriate start codons will be chosen
# using GenBank taxonomic classification.
#
# If allow_non_aug_start_codon is set to No,
# allow_missing_start_codon will have no effect.
allow_non_aug_start_codon = No
# Annotate ORFs even if the start codon is missing.
#
# For allow_missing_start_codon to have any effect,
# allow_non_aug_start_codon must be set to Yes.
allow_missing_start_codon = No
# Annotate ORFs even if the stop codon is missing.
allow_missing_stop_codon = No
[Query taxonomic group]
# Choose between: animals, archaea, bacteria, fungi, plants, viruses
# The chosen group is written on its own line, without a key or "=".
plants
[Target SRA accessions]
# One accession per line, without a key. The entries below are
# commented-out examples.
# SRR7829961
# SRR23214014
# ...
[Target FASTQ files]
# Entries have the form "name = path". The entries below are commented-out
# examples: the pe_* paths use an R* wildcard, the se_* paths name a single
# R1 file.
# NOTE(review): presumably pe_ = paired-end and se_ = single-end — confirm.
# pe_001 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample1_R*.fastq.gz
# pe_002 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample2_R*.fastq
# pe_003 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample3_R*.fq.gz
# ...
# se_001 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample4_R1.fastq.gz
# se_002 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample5_R1.fastq
# se_003 = /home/kakapo/kakapo-input/fastq/Solanum_chilense_sample6_R1.fq
# ...
[Target assemblies: FASTA files (DNA)]
# One path to a DNA FASTA assembly per line, without a key. The entry
# below is a commented-out example.
# /home/kakapo/kakapo-input/assemblies/Matucana_madisoniorum_HBG13.fasta
# ...
[Bowtie2 filter order]
# Entries are either "name = path to a FASTA file" or a bare name
# (plastid, mitochondrion). The entries below are commented-out examples.
# NOTE(review): per the section name, filters are presumably applied in
# the order listed — confirm.
# cactus_virus_x = /home/kakapo/kakapo-input/reference_genomes/cactus_virus_x.fasta
# plastid
# mitochondrion
# ...
[Kraken2 filter order]
# One database name per line, without a key. The entries below are
# commented-out examples.
# NOTE(review): per the section name, filters are presumably applied in
# the order listed — confirm.
# 16S_Silva132
# 16S_Silva138
# viral
# mitochondrion
# plastid
# mitochondrion_and_plastid
# minikraken_8GB_2020-03-12
[BLAST SRA/FASTQ]
# BLAST settings for SRA/FASTQ targets (per the section name).
# NOTE(review): the key names match NCBI BLAST+ command-line options of
# the same names — confirm they are passed through unchanged.
evalue = 1e-5
max_hsps = 10000
qcov_hsp_perc = 1
best_hit_overhang = 0.05
best_hit_score_edge = 0.25
max_target_seqs = 1000000
# If any of these settings are present in the
# search strategies files, they will be overwritten
# for each search strategy
# NOTE(review): it is unclear whether the note above refers to the keys
# in this section, the next section, or both — confirm and move it next
# to the keys it describes.
[BLAST assemblies]
# BLAST settings for assembly targets (per the section name). Same keys
# as [BLAST SRA/FASTQ], with different values.
evalue = 1e-20
max_hsps = 4
qcov_hsp_perc = 70
best_hit_overhang = 0.15
best_hit_score_edge = 0.15
max_target_seqs = 500