-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
executable file
·730 lines (630 loc) · 35.5 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
#!/usr/bin/env nextflow
// Call processes from modules directory
include { KPOPCOUNT as KPOPCOUNT1 } from './modules/kpopCount'
include { KPOPCOUNT as KPOPCOUNT2 } from './modules/kpopCount'
include { KPOPCOUNT_BY_CLASS as KPOPCOUNT_BY_CLASS1 } from './modules/kpopCount'
include { KPOPCOUNT_BY_CLASS as KPOPCOUNT_BY_CLASS2 } from './modules/kpopCount'
include { KPOPCOUNT_COMBINE_CLASS_COUNTS as KPOPCOUNT_COMBINE_CLASS_COUNTS1 } from './modules/kpopCount'
include { KPOPCOUNT_COMBINE_CLASS_COUNTS as KPOPCOUNT_COMBINE_CLASS_COUNTS2 } from './modules/kpopCount'
include { KPOPCOUNT_READS_BY_CLASS } from './modules/kpopCount'
include { KPOPCOUNT_READS as KPOPCOUNT_READS1 } from './modules/kpopCount'
include { KPOPCOUNT_READS as KPOPCOUNT_READS2 } from './modules/kpopCount'
include { GENERATE_TEST_TWISTED } from './modules/generate_test_twisted'
include { GENERATE_TEST_TWISTED_FROM_KPOPCOUNTER } from './modules/generate_test_twisted'
include { KPOPPHYLO } from './modules/kpopPhylo'
include { KPOPTWIST as KPOPTWIST1 } from './modules/kpopTwist'
include { KPOPTWIST as KPOPTWIST2 } from './modules/kpopTwist'
include { PREDICT_TEST_SET } from './modules/predict_test_set'
include { ASSEMBLE_FASTQS as ASSEMBLE_FASTQS1 } from './modules/assemble_fastqs'
include { ASSEMBLE_FASTQS as ASSEMBLE_FASTQS2 } from './modules/assemble_fastqs'
include { MATCH_REFERENCE_CONTIGS as MATCH_REFERENCE_CONTIGS1 } from './modules/match_reference_contigs'
include { MATCH_REFERENCE_CONTIGS as MATCH_REFERENCE_CONTIGS2 } from './modules/match_reference_contigs'
include { ASSEMBLY_STATS as ASSEMBLY_STATS1 } from './modules/assembly_stats'
include { ASSEMBLY_STATS as ASSEMBLY_STATS2 } from './modules/assembly_stats'
include { META_COLOURED_TREE } from './modules/meta_coloured_tree'
include { DOWNLOAD_SRAS as DOWNLOAD_SRAS1 } from './modules/download_samples'
include { DOWNLOAD_SRAS as DOWNLOAD_SRAS2 } from './modules/download_samples'
include { FASTERQ_DUMP as FASTERQ_DUMP1 } from './modules/download_samples'
include { FASTERQ_DUMP as FASTERQ_DUMP2 } from './modules/download_samples'
include { KPOPTWIST_UPDATE; UPDATE_PLOT } from './modules/retwist'
include { GENERATE_KPOPTWISTED as GENERATE_KPOPTWISTED1 } from './modules/retwist'
include { GENERATE_KPOPTWISTED as GENERATE_KPOPTWISTED2 } from './modules/retwist'
include { INPUT_VALIDATION as FASTA_VALIDATION } from './modules/input_validation'
include { INPUT_VALIDATION as TEST_FASTA_VALIDATION } from './modules/input_validation'
include { INPUT_VALIDATION as FASTQ_VALIDATION } from './modules/input_validation'
include { INPUT_VALIDATION as TEST_FASTQ_VALIDATION } from './modules/input_validation'
include { CLUSTERING } from './modules/dimension_reduction'
include { FASTQ_QC as FASTQ_QC1 } from './modules/quality_control'
include { FASTQ_QC as FASTQ_QC2 } from './modules/quality_control'
// Create a help message
// Prints the pipeline usage/help text. Invoked when --help is supplied (see check below).
// Fixes over previous revision: usage line now lists --update; "metadata>" typo; garbled
// --update description; --extra_prefetch was wrongly described as a fasterq-dump option.
def helpMessage() {
log.info"""
Usage:
nextflow run main.nf [--cluster|--classify|--update] --input_dir <path/to/dir> [--test_dir <path/to/dir> (required only if --classify used)]
Required arguments:
Workflow (At least one of these; if one of these is not supplied the program will download, QC and assemble files then quit)
--cluster Data is run through clustering workflow, starting with an unknown dataset the pipeline produces \
a distance matrix and pseudophylogenetic tree showing relatedness between samples. Creates \
.KPopTwister and KPopTwisted files
--classify Data is run through classification workflow, starting with separate training and test datasets, \
a model is created using the training dataset and known class metadata. This model is used to \
predict the classes of the unknown test dataset. Requires --test_dir argument. Creates \
.KPopTwister and KPopTwisted files
--update New data is added to an existing database, creating updated .KPopTwister and KPopTwisted files. \
The data type should match the original data type that the database was built on. \
In other words, avoid updating a database built from fasta files with fastqs and vice versa
Input (At least one of these)
--input_dir Path to directory containing fasta/fastq files. Paired-end fastqs require "R1" and "R2" in filenames. \
Gzipped files are allowed. \
If --classify used this directory is the training dataset. \
If --update used this directory is the new dataset used to update
--accession_list Supply a list of SRA IDs to download as input samples in the form of a text file, with one SRA per line
--test_dir Directory containing unseen test dataset. Only required if --classify workflow invoked
--test_accession_list Supply a list of SRA IDs to download as test samples in the form of a text file, with one SRA per line. \
Only required if --classify workflow invoked
--twisted_file Full path to .KPopTwisted file. Only required for --update workflow
--twister_file Full path to .KPopTwister file. Only required for --update workflow
--kpopcount_input .KPopCounter file as an input. \
Incompatible with --no_assembly, --match_reference, --input_dir, --accession_list, --max_dim and --meta_data options
--kpopcount_test_input .KPopCounter file as a test input. Only required if --classify workflow invoked. \
Incompatible with --no_assembly, --match_reference, --input_dir, --accession_list, --max_dim and --meta_data options
Optional arguments:
General arguments
--kmer_len Length of k-mer to use when generating spectra. For choosing best k size for DNA we recommend using the base 4 log of the genome size. \
E.g. If the genome size is 5Mbp, then base 4 log of 5000000 is 11.13, so optimal kmer size is likely 11 or 12 [12]
--output_dir Path to output directory. If directory doesn't exist then a new directory will be created. [projectDir/results]
--output_prefix Prefix for output files [output]
--no_assembly Do not perform assembly on the reads, the workflow will count the number of kmers from the raw reads directly instead of assemblies
--no_qc Do not perform quality control using trim_galore
--validate_inputs Perform validation check on fastq and fasta inputs to ensure they are formatted correctly, incorrectly formatted files will be skipped
--cpu_num Number of CPUs used per process [8]
--meta_data Tsv file with two required columns with defined headers; "fileName" and "class". \
"fileName" is file name if a fasta or fasta.gz file, or file prefix if paired-end fastqs. E.g. sample1.fasta.gz if fasta file or \
sample1 if sample1_R1.fastq.gz and sample1_R2.fastq.gz. Additional columns allowed
--match_reference Full path to reference fasta file, used to select contigs that only match the supplied reference
--max_dim Maximum number of dimensions used to separate data. Choosing 0 uses all available dimensions, which will be one less than the number of samples \
for --cluster or one less than the number of classes if --classify. A lower number will reduce memory usage. \
If the data cannot be separated into the number chosen, less dimensions will be chosen automatically. Must not be a number above the maximum number of samples [0]
--min_contig_match_len Minimum number of query contig base pairs that match the reference. Only used with --match_reference option [250]
--min_contig_match_proportion Minimum fraction of query contig base pairs that match reference. Only used with --match_reference option [0.6]
--pred_class_num Specify the top n number of best predictions to be included in .KPopSummary file. E.g. 2 would choose the top two closest classes [all]
--tree_type Specify the type of tree generated by ggtree - 'rectangular' or 'circular' ['rectangular']
--tree_label_size Specify the size of the labels on the tree generated by ggtree, choose 0 to remove labels [3]
--help Print help instructions
Nextflow arguments
-profile conda Install the required conda environment automatically from the environment.yml file found in the same directory as main.nf. Slower than installing it manually.
Flash arguments (https://pubmed.ncbi.nlm.nih.gov/21903629/)
--flash_minOverlap The minimum required overlap length between two reads to provide a confident overlap. Only used on fastq inputs [20]
--flash_maxOverlap Maximum overlap length expected in approximately 90% of read pairs. Only used on fastq inputs [1000]
--extra_flash Any additional arguments for flash. E.g. --extra_flash '-O -x 0.35'
Megahit arguments (https://pubmed.ncbi.nlm.nih.gov/25609793/)
--extra_megahit Any additional arguments for Megahit. E.g. --extra_megahit '--k-min 25'
TrimGalore arguments (https://github.com/FelixKrueger/TrimGalore)
--extra_trimGalore Any additional arguments for TrimGalore. E.g. --extra_trimGalore '--quality 40'
SRA-toolkit arguments (https://github.com/ncbi/sra-tools)
--extra_prefetch Any additional arguments for prefetch. E.g. --extra_prefetch '--verify'
--extra_fasterq_dump Any additional arguments for fasterq-dump. E.g. --extra_fasterq_dump '--concatenate-reads'
KPop arguments (https://www.biorxiv.org/content/10.1101/2022.06.22.497172v2)
--extra_kpopCount Any additional arguments for KPopCount. E.g. --extra_kpopCount
--extra_kpopCountDB Any additional arguments for KPopCountDB. E.g. --extra_kpopCountDB
--extra_kpopTwist Any additional arguments for KPopTwist. E.g. --extra_kpopTwist
--extra_kpopTwistDB Any additional arguments for KPopTwistDB. E.g. --extra_kpopTwistDB
--kpopPhylo_power Set the external power when computing distances [2]
--kpopPhylo_distance Distance measure to be used. This must be one of 'euclidean', 'maximum', 'manhattan', 'canberra', 'binary' or 'minkowski'. ['euclidean']
--kpopPhylo_magic Cluster-related variable (Not currently implemented) ['1.']
--kpopScale_power Set the external power when computing distances [2]
"""
.stripIndent()
}
// Print important information for each run at the start.
// Uses the implicit `workflow` object (manifest/commandLine/start/configFiles)
// and user-supplied `params`; values are resolved at script-evaluation time.
log.info"""
${workflow.manifest.name} v${workflow.manifest.version}
==========================
input from : ${params.input_dir}
output to : ${params.output_dir}
kmer length : ${params.kmer_len}
--
run as : ${workflow.commandLine}
cpus used : ${params.cpu_num}
started at : ${workflow.start}
config files : ${workflow.configFiles}
"""
.stripIndent()
// Print help message if --help invoked, then stop successfully (exit 0 is correct here).
if (params.help){
helpMessage()
exit 0
}
// Define warnings — non-fatal notices about how the run will behave.
if (!params.cluster && !params.classify && !params.update){
log.warn"""Current run will only perform steps before KPop analysis. --cluster, --classify or --update required for KPop workflows. Use --help for more information.
""".stripIndent()
}
if (params.classify && params.meta_data == "" && params.kpopcount_input == ""){
log.warn"""Each individual file in --input_dir will be treated as a class as --meta_data not specified. If you want to combine files into classes or rename classes please use --meta_data. Use --help for more information.
""".stripIndent()
}
// Define errors — invalid parameter combinations abort the run before any process executes.
// NOTE(review): these error paths use `exit 0`, so shells/CI see a success status — confirm
// whether `exit 1` was intended before changing, as callers may depend on the current code.
if (params.input_dir == "" && params.accession_list == "" && params.kpopcount_input == ""){
log.error"""--input_dir or --accession_list or --kpopcount_input required for workflow. Use --help for more information.
""".stripIndent()
exit 0
}
if (params.classify && params.test_dir == "" && params.test_accession_list == "" && params.kpopcount_test_input == ""){
log.error"""--test_dir, --test_accession_list or --kpopcount_test_input required for --classify workflow. Use --help for more information.
""".stripIndent()
exit 0
}
if (params.kpopcount_input != "" && (params.no_assembly == true || params.match_reference != "" || params.input_dir != "" || params.accession_list != "" || params.max_dim != 0 || params.meta_data != "")){
log.error"""--kpopcount_input is incompatible with --no_assembly, --match_reference, --input_dir, --accession_list, --max_dim and --meta_data options. Use --help for more information.
""".stripIndent()
exit 0
}
if (params.input_dir != "") { // If an input directory is supplied
// Channels created for all fastas, paired fastas and all paired fastqs in input_dir
FASTAS = "${params.input_dir}/*.{fasta,fa,fasta.gz,fa.gz}"
FASTQS = "${params.input_dir}/*.{fastq,fq,fastq.gz,fq.gz}"
PAIRED_FASTQS = "${params.input_dir}/*_{R1,R2}.{fastq,fq,fastq.gz,fq.gz}"
// fasta_files emits tuples of [ [fileName: <basename>], <file> ]
Channel
.fromPath( FASTAS )
.map { it -> [[fileName: it.toString().split("/")[-1]], file(it)]}
.set {fasta_files}
// paired_fastq_files emits [ [fileName: <pair prefix>], [R1, R2] ]
Channel
.fromFilePairs( PAIRED_FASTQS )
.map { it -> [[fileName: it[0].toString()], it[1]]}
.set {paired_fastq_files}
// Single-end fastqs: drop anything whose name matches R1/R2 (those are handled as
// pairs above), then merge the paired channel in.
// NOTE(review): at the .filter stage each item is the [metaMap, file] tuple produced by
// .map, so `it.fileName` is a Groovy property-spread over the list — verify this drops
// the intended items rather than relying on the metaMap alone.
Channel
.fromPath( FASTQS )
.map { it -> [[fileName: it.toString().split("/")[-1]], file(it)]}
.filter { !(it.fileName =~ /(R1|R2)/) }
.concat(paired_fastq_files)
.set {fastq_files}
// file(<glob>) returns a (possibly empty) list; empty lists are falsy in Groovy,
// so this fails when no sequence files match any of the three patterns.
if (!file( FASTAS ) && !file( FASTQS ) && !file( PAIRED_FASTQS)){
log.error"""--input_dir needs to contain at least one fasta or fastq file. Please check specified directory again. Use --help for more information.
""".stripIndent()
exit 0
}
// Dangling validation chain: its only purpose is to throw if any input file name
// contains glob/whitespace characters that would break downstream commands.
fasta_files
.concat(fastq_files)
.map {it -> it[0].fileName}
.filter{ if (it =~/[\[\]\?\*\ ]/) {throw new IllegalArgumentException("Value '$it' contains illegal character")} else { it }}
} else {
// Create empty channels if no fasta or fastq files found in input_dir, this allows for downloading of data from SRA
Channel
.empty()
.set {fasta_files}
Channel
.empty()
.set {paired_fastq_files}
Channel
.empty()
.set {fastq_files}
}
// Create channel that includes class meta data for training
if (params.meta_data != "") {
// Validate the TSV header: both 'fileName' and 'class' columns must exist
// (checked against the keys of the first parsed row).
if (!("fileName" in file(params.meta_data).splitCsv(header:true, sep: "\t").first().keySet()) || !("class" in file(params.meta_data).splitCsv(header:true, sep: "\t").first().keySet())) {
log.error"""File given with --meta_data needs to have two columns named 'fileName' and 'class' in it. Please check header names in specified file again. Use --help for more information.
""".stripIndent()
exit 0
}
// meta_file emits [ [fileName: <basename>], [meta_class: <class>] ] per TSV row
Channel
.fromPath(params.meta_data)
.splitCsv(header:true, sep: "\t")
.map { row -> meta = [[fileName: row.fileName.toString().split("/")[-1]], [meta_class: row.class]] }
.set {meta_file}
// Join fastas to their metadata on the [fileName: ...] key, then group files by class:
// meta_fasta_files emits [ [meta_class: <class>], [file, file, ...] ]
fasta_files
.join(meta_file)
.map {it -> [it[2], it[1]]}
.groupTuple(by: [0])
.set {meta_fasta_files}
}
// Channels created for all fastas, paired fastas and all paired fastqs in test_dir.
// Mirrors the --input_dir channel setup above, but for the --classify test dataset.
if (params.test_dir != "") {
TEST_FASTAS = "${params.test_dir}/*.{fasta,fa,fasta.gz,fa.gz}"
TEST_FASTQS = "${params.test_dir}/*.{fastq,fq,fastq.gz,fq.gz}"
TEST_PAIRED_FASTQS = "${params.test_dir}/*_{R1,R2}.{fastq,fq,fastq.gz,fq.gz}"
// file(<glob>) returns a (possibly empty) list; empty lists are falsy in Groovy.
// An empty test_dir is still allowed when test samples come from --test_accession_list.
if (!file( TEST_FASTAS ) && !file( TEST_FASTQS ) && !file( TEST_PAIRED_FASTQS ) && params.test_accession_list == ""){
log.error"""--test_dir needs to contain at least one fasta or fastq file. Please check specified directory again. Use --help for more information.
""".stripIndent()
exit 0
}
// test_fasta_files emits tuples of [ [fileName: <basename>], <file> ]
Channel
.fromPath( TEST_FASTAS )
.map { it -> [[fileName: it.toString().split("/")[-1]], file(it)]}
.set {test_fasta_files}
// test_paired_fastq_files emits [ [fileName: <pair prefix>], [R1, R2] ]
Channel
.fromFilePairs( TEST_PAIRED_FASTQS )
.map { it -> [[fileName: it[0].toString()], it[1]]}
.set {test_paired_fastq_files}
// Single-end fastqs (no R1/R2 in the name) merged with the paired channel
Channel
.fromPath( TEST_FASTQS )
.map { it -> [[fileName: it.toString().split("/")[-1]], file(it)]}
.filter { !(it.fileName =~ /(R1|R2)/) }
.concat(test_paired_fastq_files)
.set {test_fastq_files}
// Dangling validation chain: throws if any test file name contains glob/whitespace
// characters. The `else { it }` branch matches the equivalent --input_dir check so the
// closure returns the name (truthy) for valid files instead of null.
test_fasta_files
.concat(test_fastq_files)
.map {it -> it[0].fileName}
.filter{ if (it =~/[\[\]\?\*\ ]/) {throw new IllegalArgumentException("Value '$it' contains illegal character")} else { it }}
} else {
// Create empty channels if no fasta or fastq files found in test_dir, this allows for downloading of data from SRA
Channel
.empty()
.set {test_fasta_files}
Channel
.empty()
.set {test_paired_fastq_files}
Channel
.empty()
.set {test_fastq_files}
}
// Main entry workflow. Stages: optional SRA download -> optional validation -> optional QC
// -> optional assembly -> one or more of the --cluster / --classify / --update workflows.
// NOTE(review): several aliased processes (e.g. KPOPTWIST1, GENERATE_KPOPTWISTED1,
// KPOPCOUNT_READS1) are invoked from more than one conditional branch; confirm the
// branches are mutually exclusive at runtime, since DSL2 forbids calling the same
// process twice in one run.
workflow {
/// Download training set
if (params.accession_list != "") {
// Prefetch each listed SRA accession, then dump to fastq and merge into fastq_files
DOWNLOAD_SRAS1(file(params.accession_list))
.flatten()
.set {SRA_FILES}
FASTERQ_DUMP1(SRA_FILES)
.map {it -> [[fileName: it[0].toString()], it[1]]}
.concat(fastq_files)
.set {fastq_files}
}
/// Download test set
if (params.test_accession_list != "") {
DOWNLOAD_SRAS2(file(params.test_accession_list))
.flatten()
.set {TEST_SRA_FILES}
FASTERQ_DUMP2(TEST_SRA_FILES)
.map {it -> [[fileName: it[0].toString()], it[1]]}
.concat(test_fastq_files)
.set {test_fastq_files}
}
/// Input Validation
if (params.validate_inputs) {
// Each validation process re-emits only the correctly formatted files
FASTQ_VALIDATION(fastq_files)
.set {fastq_files}
TEST_FASTQ_VALIDATION(test_fastq_files)
.set {test_fastq_files}
FASTA_VALIDATION(fasta_files)
.set {fasta_files}
TEST_FASTA_VALIDATION(test_fasta_files)
.set {test_fasta_files}
}
if (!params.no_qc) {
/// FASTQ Quality Control
FASTQ_QC1(fastq_files)
.set { fastq_files }
FASTQ_QC2(test_fastq_files)
.set { test_fastq_files }
}
/// Either assemble the reads (default) or use fastqs if '--no_assembly' is false
if (params.no_assembly) {
assembled_fastas = fastq_files
} else {
ASSEMBLE_FASTQS1(fastq_files)
.map(it -> [it[0], it[1]])
.set {assembled_fastas}
ASSEMBLY_STATS1(assembled_fastas)
}
/// Clustering workflow
if (params.cluster) {
if (params.kpopcount_input != "") { // If .KPopCount is supplied as input
Channel
.fromPath( params.kpopcount_input )
.map(it -> [it, params.output_prefix])
.set {kpopcount_file}
} else if (params.no_assembly) { // If starting from reads
//TO DO: USE READS THAT MAP ONLY TO A REFERENCE
fastq_files
.map(it -> it[1].toString().replace(", ", "?")) //Need to replace the ", " with something unique so we can separate the files in KPOPCOUNT_READS
.toSortedList()
.map(it -> [it, params.output_prefix])
.set {fastq_list}
KPOPCOUNT_READS1(fastq_list)
.map(it -> [it, params.output_prefix])
.set {kpopcount_file}
} else { // If starting with fastas
assembled_fastas.concat(fasta_files)
.set {concat_fasta_files}
// Only use sequences that match with reference if reference provided
if (params.match_reference != "") {
// Strip sequence-file extensions from the name so the matched output keeps a clean prefix
concat_fasta_files
.map(it -> [it[1], it[0].fileName.toString().split("/")[-1]
.replace(".fasta.gz", "").replace(".fastq.gz", "")
.replace(".fasta", "").replace(".fastq", "")
.replace(".fa.gz", "").replace(".fq.gz", "")
.replace(".fa", "").replace(".fq", ""), file(params.match_reference, checkIfExists: true)])
.set {adjusted_concat_fasta_files}
MATCH_REFERENCE_CONTIGS1(adjusted_concat_fasta_files)
.map(it -> [[fileName: it[1]], it[0]])
.set {concat_fasta_files}
}
concat_fasta_files
.map(it -> it[1])
.toSortedList()
.map(it -> [it, params.output_prefix])
.set {fasta_list}
KPOPCOUNT1(fasta_list)
.map(it -> [it, params.output_prefix])
.set {kpopcount_file}
}
if (params.max_dim != 0) { // If the dimension size is specified, else run all dimensions available
// Generate KPopCounts and KPopTwister for the number of dimensions
if (params.no_assembly) { // If starting from reads
// randomSample uses a fixed seed (1), so the max_dim+1 subsample is reproducible
fastq_files
| randomSample( params.max_dim+1, 1)
| map(it -> it[1].toString().replace(", ", "?")) //Need to replace the ", " with something unique so we can separate the files in KPOPCOUNT_READS
| toSortedList
| map(it -> [it, "reduced_dim"])
| KPOPCOUNT_READS2
| map(it -> [it, "reduced_dim"])
| KPOPTWIST1
| set {red_dim_kpoptwist_files}
// Twisting all samples with reduced twister
GENERATE_KPOPTWISTED1(kpopcount_file.combine(red_dim_kpoptwist_files))
.map(it -> [it[0], "reduced_temp", it[2], it[3]])
.set {retwisted_files}
/// Create embeddings using KPopScale
CLUSTERING(retwisted_files)
.set {cluster_file}
// KPopCount per cluster
cluster_file
.splitCsv(header:true, sep: "\t")
.map { row -> meta = [[fileName: row.fileName.toString().split("/")[-1]], [new_class: "clusteredClass_${row.class}"]] }
.set {cluster_meta_file}
// NOTE(review): this .map result is never assigned or consumed — dead chain?
cluster_meta_file
.map(it -> it[0])
// Re-key fastqs by extension-stripped name so they join the cluster assignments,
// then group files per cluster: emits [ [files...], <new_class> ]
fastq_files
.map( it -> [[fileName: it[0].fileName.toString()
.replace(".fastq.gz", "")
.replace(".fastq", "")
.replace(".fq.gz", "")
.replace(".fq", "")
.replace("_matched", "")], it[1].toString().replace(", ", "?")]) //Need to replace the ", " with something unique so we can separate the files in KPOPCOUNT_READS_BY_CLASS
.join(cluster_meta_file, by: [0])
.map {it -> [it[2], it[1]]}
.groupTuple(by: [0])
.map {it -> [it[1], it[0].new_class]}
.set {cluster_fastq_list}
KPOPCOUNT_READS_BY_CLASS(cluster_fastq_list)
.collect()
.map(it -> [it, "clustered"])
.set {raw_fastq_cluster_count_list}
KPOPCOUNT_COMBINE_CLASS_COUNTS1(raw_fastq_cluster_count_list)
.map(it -> [it, "clustered"])
.set {cluster_kpopcount_file}
} else {
// Fasta route: same reduced-dimension scheme as above but counting from assemblies
concat_fasta_files
| randomSample( params.max_dim+1, 1)
| map(it -> it[1])
| toSortedList()
| map(it -> [it, "reduced_dim"])
| KPOPCOUNT2
| map(it -> [it, "reduced_dim"])
| KPOPTWIST1
| set {red_dim_kpoptwist_files}
// Twisting all samples with reduced twister
GENERATE_KPOPTWISTED1(kpopcount_file.combine(red_dim_kpoptwist_files))
.map(it -> [it[0], "reduced_temp", it[2], it[3]])
.set {retwisted_files}
/// Create embeddings using KPopScale
CLUSTERING(retwisted_files)
.set {cluster_file}
// KPopCount per cluster
cluster_file
.splitCsv(header:true, sep: "\t")
.map { row -> meta = [[fileName: row.fileName.toString().split("/")[-1]], [new_class: "clusteredClass_${row.class}"]] }
.set {cluster_meta_file}
concat_fasta_files
.map( it -> [[fileName: it[1].toString().split("/")[-1]
.replace(".fasta.gz", "")
.replace(".fasta", "")
.replace(".fa.gz", "")
.replace(".fa", "")
.replace("_matched", "")], it[1]])
.join(cluster_meta_file, by: [0])
.map {it -> [it[2], it[1]]}
.groupTuple(by: [0])
.map {it -> [it[1], it[0].new_class]}
.set {cluster_fasta_list}
KPOPCOUNT_BY_CLASS1(cluster_fasta_list)
.collect()
.map(it -> [it, "clustered"])
.set {raw_cluster_count_list}
KPOPCOUNT_COMBINE_CLASS_COUNTS1(raw_cluster_count_list)
.map(it -> [it, "clustered"])
.set {cluster_kpopcount_file}
}
// Generate Twister file based on clusters
KPOPTWIST2(cluster_kpopcount_file)
.set {clustered_kpoptwist_files}
// Twist all samples using cluster twister
GENERATE_KPOPTWISTED2(kpopcount_file.combine(clustered_kpoptwist_files))
.map(it -> [it[2], it[0], params.output_prefix])
.set {kpoptwist_files}
} else { // Use all samples and dimensions instead
KPOPTWIST1(kpopcount_file)
.map(it -> [it[0], it[1], params.output_prefix])
.set {kpoptwist_files}
}
// Generate tree and additional output files produced by KPopPhylo
KPOPPHYLO(kpoptwist_files).nwk_file
.set {phylo_nwk_file}
// Generate tree with leaves coloured by metadata
if (params.meta_data != "") {
phylo_nwk_file
.map(it -> [file(params.meta_data), it, params.output_prefix])
.set {meta_ch}
META_COLOURED_TREE(meta_ch)
}
}
/// Classification workflow
if (params.classify) {
//TO DO: USE READS THAT MAP ONLY TO A REFERENCE
// Linking data to supplied metadata file if available, if not then each sample is treated as own class
if (params.meta_data != "") {
assembled_fastas
.join(meta_file)
.map {it -> [it[2], it[1]]}
.concat(meta_fasta_files.transpose())
.set {concat_meta_fasta_files}
} else {
assembled_fastas
.concat(fasta_files)
.map {it -> [[meta_class: it[0].fileName], it[1]]}
.set {concat_meta_fasta_files}
}
// Only use train sequences that match with reference if reference provided
if (params.match_reference != "") {
// NOTE(review): class is recovered below by splitting on "_", so class names
// containing underscores would be truncated — confirm class naming constraints
concat_meta_fasta_files
.map(it -> [it[1], "${it[0].meta_class}_${it[1].getBaseName(file(it[1]).name.endsWith('.gz')? 2: 1)}", file(params.match_reference, checkIfExists: true)])
.set {adjusted_concat_meta_fasta_files}
MATCH_REFERENCE_CONTIGS1(adjusted_concat_meta_fasta_files)
.map(it -> [[meta_class: it[1].toString().split("_")[0]], it[0]])
.set {concat_meta_fasta_files}
}
// Assemble test samples, calculate assembly stats and combine all files
ASSEMBLE_FASTQS2(test_fastq_files)
.set {test_assembled_fastas}
ASSEMBLY_STATS2(test_assembled_fastas)
test_assembled_fastas.concat(test_fasta_files)
.set {test_concat_fasta_files}
// Only use test sequences that match with reference if reference provided
if (params.match_reference != "") {
test_concat_fasta_files
.map(it -> [it[1], it[0].fileName.toString().split("/")[-1]
.replace(".fasta.gz", "").replace(".fastq.gz", "")
.replace(".fasta", "").replace(".fastq", "")
.replace(".fa.gz", "").replace(".fq.gz", "")
.replace(".fa", "").replace(".fq", ""), file(params.match_reference, checkIfExists: true)])
.set {test_adjusted_concat_fasta_files}
MATCH_REFERENCE_CONTIGS2(test_adjusted_concat_fasta_files)
.map(it -> [[fileName: it[1]], it[0]])
.set {test_concat_fasta_files}
}
test_concat_fasta_files
.map(it -> it[1])
.toSortedList()
.map(it -> [it, "test"])
.set {test_fasta_list}
// Combine and count kmers training data
if (params.kpopcount_input != "") { // If .KPopCount is supplied as input
Channel
.fromPath( params.kpopcount_input )
.map(it -> [it, "train"])
.set {train_kpopcount_file}
} else {
concat_meta_fasta_files
.groupTuple(by: [0])
.map(it -> [it[1], it[0].meta_class])
.set {train_fasta_list}
KPOPCOUNT_BY_CLASS1(train_fasta_list)
.collect()
.map(it -> [it, "train"])
.set {raw_class_count_list}
KPOPCOUNT_COMBINE_CLASS_COUNTS1(raw_class_count_list)
.map(it -> [it, "train"])
.set {train_kpopcount_file}
}
if (params.max_dim != 0) { // If the dimension size is specified, else run all dimensions available
// randomSample uses a fixed seed (1), so the max_dim+1 subsample is reproducible
concat_meta_fasta_files
| randomSample( params.max_dim+1, 1)
| map(it -> it[1])
| toSortedList()
| map(it -> [it, "reduced_dim"])
| KPOPCOUNT2
| map(it -> [it, "reduced_dim"])
| KPOPTWIST1
| set {train_red_dim_kpoptwist_files}
// Twisting all samples with reduced twister
GENERATE_KPOPTWISTED1(train_kpopcount_file.combine(train_red_dim_kpoptwist_files))
.map(it -> [it[0], "reduced_temp", it[2], it[3]])
.set {train_retwisted_files}
/// Create embeddings using KPopScale
CLUSTERING(train_retwisted_files)
.set {train_cluster_file}
// KPopCount per cluster
train_cluster_file
.splitCsv(header:true, sep: "\t")
.map { row -> meta = [[meta_class: row.fileName], [new_class: "clusteredClass_${row.class}"]] }
.set {train_cluster_meta_file}
// NOTE(review): `by: [0][0]` evaluates to the integer 0 (the literal list [0] indexed
// at 0), i.e. join on element 0 — presumably `by: [0]` was intended; confirm.
concat_meta_fasta_files
.groupTuple(by: [0])
.join(train_cluster_meta_file, by: [0][0])
.map {it -> [it[1], it[2]]}
.groupTuple(by: [1])
.map {it -> [it[0].flatten().toList(), it[1].new_class]}
.set {train_cluster_fasta_list}
KPOPCOUNT_BY_CLASS2(train_cluster_fasta_list)
.collect()
.map(it -> [it, "clustered"])
.set {raw_cluster_count_list}
KPOPCOUNT_COMBINE_CLASS_COUNTS2(raw_cluster_count_list)
.map(it -> [it, "clustered"])
.set {train_cluster_kpopcount_file}
// Generate Twister file based on clusters
KPOPTWIST2(train_cluster_kpopcount_file)
.set {train_clustered_kpoptwist_files}
// Twist all samples using cluster twister
GENERATE_KPOPTWISTED2(train_kpopcount_file.combine(train_clustered_kpoptwist_files))
.map(it -> [[it[2], it[0]], "train"])
.set {train_kpoptwist_files}
} else {
// Twist training data
KPOPTWIST1(train_kpopcount_file)
.map(it -> [it, "train"])
.set {train_kpoptwist_files}
}
// Twist test files based on training twister and use the positional information within multidimensional space to predict classes
if (params.kpopcount_input != "") { // If .KPopCount is supplied as input
train_test_files = train_kpoptwist_files
.map(it -> [it[0][0], it[0][1], file(params.kpopcount_test_input)])
train_test_kpop_files = GENERATE_TEST_TWISTED_FROM_KPOPCOUNTER(train_test_files)
} else {
train_test_files = train_kpoptwist_files
.combine(test_fasta_list)
.map(it -> [it[0][0], it[0][1], it[2]])
train_test_kpop_files = GENERATE_TEST_TWISTED(train_test_files)
}
PREDICT_TEST_SET(train_test_kpop_files)
}
/// Update workflow
if (params.update) {
if (params.no_assembly) { // If starting from reads
//TO DO: USE READS THAT MAP ONLY TO A REFERENCE
fastq_files
.map(it -> it[1].toString().replace(", ", "?")) //Need to replace the ", " with something unique so we can separate the files in KPOPCOUNT_READS
.toSortedList()
.map(it -> [it, params.output_prefix])
.set {fastq_list}
KPOPCOUNT_READS1(fastq_list)
.map(it -> [it, "temp", file(params.twister_file), file(params.twisted_file)])
.set {kpopcount_file}
} else {
assembled_fastas.concat(fasta_files)
.set {concat_fasta_files}
if (params.match_reference != "") {
concat_fasta_files
.map(it -> [it[1], it[0].fileName, file(params.match_reference, checkIfExists: true)])
.set {adjusted_concat_fasta_files}
MATCH_REFERENCE_CONTIGS1(adjusted_concat_fasta_files)
.map(it -> [[fileName: it[1]], it[0]])
.set {concat_fasta_files}
}
concat_fasta_files
.map(it -> it[1])
.toSortedList()
.map(it -> [it, params.output_prefix])
.set {fasta_list}
KPOPCOUNT1(fasta_list)
.map(it -> [it, "temp", file(params.twister_file), file(params.twisted_file)])
.set {kpopcount_file}
}
// Re-twist the new counts with the existing twister/twisted pair, then update and plot
GENERATE_KPOPTWISTED1(kpopcount_file)
.map(it -> [it[0], params.output_prefix, it[2], it[3]])
.set {updating_file}
KPOPTWIST_UPDATE(updating_file)
.set {updated_files}
UPDATE_PLOT(updated_files)
}
}