-
Notifications
You must be signed in to change notification settings - Fork 1
/
CharONT2.nf
219 lines (197 loc) · 11.4 KB
/
CharONT2.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env nextflow
/*
========================================================================================
MaestSi/CharONT2
========================================================================================
MaestSi/CharONT2 analysis pipeline.
#### Homepage / Documentation
https://github.com/MaestSi/CharONT2
----------------------------------------------------------------------------------------
*/
def helpMessage() {
log.info"""
Usage:
nextflow -c CharONT2.conf run CharONT2.nf --fastq_files = "/path/to/files*.fastq" --scripts_dir = "/path/to/scripts_dir" --results_dir = "/path/to/results_dir" -profile docker
Mandatory argument:
-profile Configuration profile to use. Available: docker, singularity
Other mandatory arguments which may be specified in the CharONT2.conf file
--fastq_files Path to fastq files, use wildcards to select multiple samples
--results_dir Path to a folder where to store results
--num_alleles num_alleles represents the number of expected alleles; use --num_alleles="auto" for automatic num_alleles detection based on Silhouette coefficient
--PCRThr PCRThr is the identity threshold for in-silico PCR in case inSilicoPCR=true
--primerSeqOne primerSeqOne is a primer sequence used for in-silico PCR in case inSilicoPCR=true
--primerSeqTwo primerSeqTwo is a primer sequence used for in-silico PCR in case inSilicoPCR=true
--scripts_dir scripts_dir is the directory containing all scripts
--target_reads_consensus target_reads_consensus defines the maximum number of reads used for consensus calling
--target_reads_polishing target_reads_polishing defines the maximum number of reads used for consensus polishing
--max_reads_preliminary max_reads_preliminary defines the maximum number of reads used for preliminary clustering and consensus calling
--clustering_id_threshold identity threshold for clustering preliminary allele assembly
--plurality MAFFT plurality value: minimum fraction of aligned reads supporting a basis for including it in the preliminary consensus
--min_maf minimum minor allele frequency; if less than min_maf*100% of reads are assigned to Allele //2, the sample is assumed homozygous
--IQR_outliers_coef_precl label as candidate outliers reads with score > 3rd_QR + IQR_outliers_coef_precl*IQR or score < 1st_QR - IQR_outliers_coef_precl*IQR
--IQR_outliers_coef label as outliers reads with score > 3rd_QR + IQR_outliers_coef*IQR or score < 1st_QR - IQR_outliers_coef*IQR; IQR is computed within each cluster
--fast_alignment_flag set fast_alignment_flag=1 if you want to perform fast multiple sequence alignment; otherwise set fast_alignment_flag=0
--min_clipped_len minimum number of soft-clipped bases to be considered as a DEL or INS
--sd_noise_score sd_noise_score is the standard deviation of gaussian-distributed noise with zero mean added to score
--primers_length primers_length defines how many bases are trimmed from consensus sequences
--medaka_model medaka model for consensus polishing
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
// Input of sample names, conditions, and FAST5s path.
Channel
.fromPath(params.fastq_files)
.map {tuple( it.name.split('\\.f')[0], it )}
.set{inputFiles_inSilicoPCR}
// in-silico PCR
process inSilicoPCR {
input:
tuple val(sample), val(fastq)
output:
val sample
script:
if(params.inSilicoPCR)
"""
mkdir -p ${params.results_dir}/inSilicoPCR
sample=\$(echo \$(basename \$(realpath ${fastq})) | sed \'s/\\.fa.*//\' | sed \'s/\\.fq.*//\')
sam_file_one=\$sample"_in_silico_pcr_one.sam"
sam_file_two=\$sample"_in_silico_pcr_two.sam"
trimmed_reads_fq=${params.results_dir}"/inSilicoPCR/"\$sample"_trimmed.fastq"
export PATH=\$PATH:/opt/conda/envs/CharONT_env/bin/
mem=\$(echo ${task.memory} | sed \'s/ //\' | sed \'s/B//\' | sed \'s/G/g/\' | sed \'s/M/m/\')
/opt/conda/envs/CharONT_env/bin/msa.sh in=${fastq} out=\$sam_file_one literal=${params.primerSeqOne} qin=33 cutoff=${params.PCRThr} -Xmx\$mem
/opt/conda/envs/CharONT_env/bin/msa.sh in=${fastq} out=\$sam_file_two literal=${params.primerSeqTwo} qin=33 cutoff=${params.PCRThr} -Xmx\$mem
/opt/conda/envs/CharONT_env/bin/cutprimers.sh in=${fastq} out=\$trimmed_reads_fq sam1=\$sam_file_one sam2=\$sam_file_two qin=33 fake=f include=t fixjunk -Xmx\$mem
ln -s \$trimmed_reads_fq ./trimmed.fastq
"""
else
"""
mkdir -p ${params.results_dir}/inSilicoPCR
reads_full=\$(realpath ${fastq})
sample=\$(echo \$(basename \$(realpath ${fastq})) | sed \'s/\\.fa.*//\' | sed \'s/\\.fq.*//\')
trimmed_reads_fq=${params.results_dir}"/inSilicoPCR/"\$sample"_trimmed.fastq"
cp ${fastq} \$trimmed_reads_fq
"""
}
// Preliminary consensus sequence generation
process preliminaryConsensusCalling {
input:
val sample
output:
val sample
script:
if(params.preliminaryConsensusCalling)
"""
mkdir -p ${params.results_dir}/preliminaryConsensusCalling
export PATH=\$PATH:/opt/conda/envs/CharONT_env/bin/
/opt/conda/envs/CharONT_env/bin/Rscript ${params.scripts_dir}/Obtain_preliminary_consensus.R fastq_file=${params.results_dir}/inSilicoPCR/${sample}_trimmed.fastq TRC=${params.target_reads_consensus} max_num_reads_clustering=${params.max_reads_preliminary} THR=${params.clustering_id_threshold} PLUR=${params.plurality} num_threads=${task.cpus}
"""
else
"""
echo "Skipped."
"""
}
process readsAssignment {
input:
val sample
output:
val sample
script:
if(params.readsAssignment)
"""
mkdir -p ${params.results_dir}/readsAssignment
export PATH=\$PATH:/opt/conda/envs/CharONT_env/bin/
/opt/conda/envs/CharONT_env/bin/Rscript ${params.scripts_dir}/Cluster_reads.R fastq_file=${params.results_dir}/inSilicoPCR/${sample}_trimmed.fastq first_allele_preliminary=${params.results_dir}/preliminaryConsensusCalling/${sample}/${sample}_preliminary_allele.fasta IQR_outliers_coef_precl=${params.IQR_outliers_coef_precl} IQR_outliers_coef=${params.IQR_outliers_coef} min_clipped_len=${params.min_clipped_len} sd_noise_score=${params.sd_noise_score} num_alleles=${params.num_alleles}
"""
else
"""
echo "Skipped."
"""
}
process draftConsensusCalling {
input:
val sample
output:
val sample
script:
if(params.draftConsensusCalling)
"""
mkdir -p ${params.results_dir}/draftConsensusCalling
export PATH=\$PATH:/opt/conda/envs/CharONT_env/bin/
fastq_files=\$(find ${params.results_dir}/readsAssignment/${sample} | grep "\\d.*\\.fastq" | grep -v "outliers")
for f in \$fastq_files; do
allele_number=\$(echo \$(basename \$f) | sed \'s/.*_allele_//\' | sed \'s/\\.fastq//\')
/opt/conda/envs/CharONT_env/bin/Rscript ${params.scripts_dir}/Obtain_draft_consensus.R fastq_file=\$f allele_num=\$allele_number min_maf=${params.min_maf} TRC=${params.target_reads_consensus} PLUR=${params.plurality} num_threads=${task.cpus} fast_alignment_flag=${params.fast_alignment_flag}
done
"""
else
"""
echo "Skipped."
"""
}
process consensusPolishing {
input:
val sample
output:
val sample
script:
if(params.consensusPolishing)
"""
mkdir -p ${params.results_dir}/consensusPolishing
export PATH=\$PATH:/opt/conda/envs/CharONT_env/bin/
draft_consensus_files=\$(find ${params.results_dir}/draftConsensusCalling/${sample} | grep ${sample}"_draft_allele.*\\.fasta" | grep -v "tmp")
for f in \$draft_consensus_files; do
allele_number=\$(echo \$(basename \$f) | sed \'s/.*_allele_//\' | sed \'s/\\.fasta//\')
fastq_file=\$(find ${params.results_dir}/readsAssignment/${sample} | grep "\\d.*\\.fastq" | grep "allele_"\$allele_number)
/opt/conda/envs/CharONT_env/bin/Rscript ${params.scripts_dir}/Polish_consensus.R draft_consensus=\$f fastq_file=\$fastq_file allele_num=\$allele_number TRP=${params.target_reads_polishing} num_threads=${task.cpus} primers_length=${params.primers_length} medaka_model=${params.medaka_model}
done
"""
else
"""
mkdir -p ${params.results_dir}/consensusPolishing
draft_consensus_files=\$(find ${params.results_dir}/draftConsensusCalling/${sample} | grep ${sample}"_draft_allele.*\\.fasta" | grep -v "tmp")
for f in \$draft_consensus_files; do
polished_consensus_file=\$(echo \$f | sed \'s/draft/polished/\' | sed \'s/draftConsensusCalling/consensusPolishing/\')
/opt/conda/envs/CharONT_env/bin/seqtk trimfq \$f -b ${params.primers_length} -e ${params.primers_length} > \$polished_consensus_file
done
"""
}
process trfAnnotate {
input:
val sample
output:
script:
if(params.trfAnnotate)
"""
mkdir -p ${params.results_dir}/trfAnnotate/
polished_consensus_files=\$(find ${params.results_dir}/consensusPolishing/${sample} | grep ${sample}"_allele.*\\.fasta" | grep -v "untrimmed")
mkdir -p ${params.results_dir}/trfAnnotate/${sample}
cd ${params.results_dir}/trfAnnotate/${sample}
cp \$polished_consensus_files ${params.results_dir}/trfAnnotate/${sample}
for f in \$polished_consensus_files; do
/opt/conda/envs/CharONT_env/bin/trf \$f 2 7 7 80 10 50 500 || echo "processed \$? TRs"
/opt/conda/envs/CharONT_env/bin/trf \$f 2 3 5 80 10 50 500 || echo "processed \$? TRs"
/opt/conda/envs/CharONT_env/bin/trf \$f 2 500 500 80 10 50 500 || echo "processed \$? TRs"
done
"""
else
"""
mkdir -p ${params.results_dir}/trfAnnotate/
polished_consensus_files=\$(find ${params.results_dir}/consensusPolishing/${sample} | grep ${sample}"_allele.*\\.fasta" | grep -v "untrimmed")
mkdir -p ${params.results_dir}/trfAnnotate/${sample}
cd ${params.results_dir}/trfAnnotate/${sample}
cp \$polished_consensus_files ${params.results_dir}/trfAnnotate/${sample}
"""
}
workflow {
inSilicoPCR(inputFiles_inSilicoPCR)
preliminaryConsensusCalling(inSilicoPCR.out)
readsAssignment(preliminaryConsensusCalling.out)
draftConsensusCalling(readsAssignment.out)
consensusPolishing(draftConsensusCalling.out)
trfAnnotate(consensusPolishing.out)
}