main.py
# -*- coding: UTF-8 -*-
import argparse
import re
import subprocess
import datetime
import json
import os
import sys
import time
from multiprocessing import cpu_count
from module.Util import Logger, file_exist, read_fasta
current_folder = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.join(current_folder, ".")
if __name__ == '__main__':
# We define the default parameters for HiTE.
default_threads = int(cpu_count())
default_fixed_extend_base_threshold = 1000
default_chunk_size = 400
default_tandem_region_cutoff = 0.5
default_max_single_repeat_len = 30000
default_plant = 1
default_te_type = 'all'
default_curated_lib = None
default_recover = 0
default_annotate = 0
default_intact_annotate = 0
default_search_struct = 1
default_BM_RM2 = 0
default_BM_EDTA = 0
default_BM_HiTE = 0
default_EDTA_home = ''
default_coverage_threshold = 0.95
default_skip_HiTE = 0
default_flanking_len = 50
default_is_denovo_nonltr = 1
default_debug = 0
default_chrom_seg_length = 100000
default_classified = 1
default_domain = 0
default_miu = str(1.3e-8)
default_remove_nested = 1
default_use_HybridLTR = 1
default_use_NeuralTE = 1
default_is_wicker = 0
default_is_output_LTR_lib = 1
version_num = '3.3'
describe_image = '\n' + \
' __ __ __ ______ ______ \n' + \
' /\ \_\ \ /\ \ /\__ _\ /\ ___\ \n' + \
' \ \ __ \ \ \ \ \/_/\ \/ \ \ __\ \n' + \
' \ \_\ \_\ \ \_\ \ \_\ \ \_____\ \n' + \
' \/_/\/_/ \/_/ \/_/ \/_____/ ' + \
'version ' + str(version_num) + '\n\n'
print(describe_image)
# 1.parse args
describe_info = '########################## HiTE, version ' + str(version_num) + ' ##########################'
parser = argparse.ArgumentParser(description=describe_info)
parser.add_argument('--genome', required=True, metavar='genome', help='Input genome assembly path')
parser.add_argument('--outdir', required=True, metavar='output_dir', help='The path of the output directory; it is recommended to use a new directory to avoid automatic deletion of important files.')
parser.add_argument('--thread', metavar='thread_num', help='Number of threads, default = [ '+str(default_threads)+' ]')
parser.add_argument('--chunk_size', metavar='chunk_size', help='The chunk size of genome, default = [ ' + str(default_chunk_size) + ' MB ]')
parser.add_argument('--miu', metavar='miu', help='The neutral mutation rate (per bp per ya), default = [ ' + str(default_miu) + ' ]')
parser.add_argument('--plant', metavar='is_plant', help='Is it a plant genome, 1: true, 0: false. default = [ ' + str(default_plant) + ' ]')
parser.add_argument('--te_type', metavar='te_type', help='Retrieve specific type of TE output [ltr|tir|helitron|non-ltr|all]. default = [ ' + str(default_te_type) + ' ]')
parser.add_argument('--curated_lib', metavar='curated_lib', help='Provide a fully trusted curated library, which will be used to pre-mask highly homologous sequences in the genome. We recommend using TE libraries from Repbase. default = [ ' + str(default_curated_lib) + ' ]')
# parser.add_argument('--classified', metavar='is_classified', help='Whether to classify TE models, HiTE uses RepeatClassifier from RepeatModeler to classify TEs, 1: true, 0: false. default = [ ' + str(default_classified) + ' ]')
parser.add_argument('--remove_nested', metavar='is_remove_nested',help='Whether to remove nested TE, 1: true, 0: false. default = [ ' + str(default_remove_nested) + ' ]')
parser.add_argument('--domain', metavar='is_domain', help='Whether to obtain TE domains, HiTE uses RepeatPeps.lib from RepeatMasker to obtain TE domains, 1: true, 0: false. default = [ ' + str(default_domain) + ' ]')
parser.add_argument('--recover', metavar='is_recover', help='Whether to enable recovery mode to avoid starting from the beginning, 1: true, 0: false. default = [ ' + str(default_recover) + ' ]')
parser.add_argument('--annotate', metavar='is_annotate', help='Whether to annotate the genome using the TE library generated, 1: true, 0: false. default = [ ' + str(default_annotate) + ' ]')
parser.add_argument('--intact_anno', metavar='intact_anno', help='Whether to generate annotation of full-length TEs, 1: true, 0: false. default = [ ' + str(default_intact_annotate) + ' ]')
parser.add_argument('--search_struct', metavar='search_struct', help='Whether to search for the structural information of full-length copies, 1: true, 0: false. default = [ ' + str(default_search_struct) + ' ]')
parser.add_argument('--BM_RM2', metavar='BM_RM2', help='Whether to conduct benchmarking of RepeatModeler2, 1: true, 0: false. default = [ ' + str(default_BM_RM2) + ' ]')
parser.add_argument('--BM_EDTA', metavar='BM_EDTA', help='Whether to conduct benchmarking of EDTA, 1: true, 0: false. default = [ ' + str(default_BM_EDTA) + ' ]')
parser.add_argument('--BM_HiTE', metavar='BM_HiTE', help='Whether to conduct benchmarking of HiTE, 1: true, 0: false. default = [ ' + str(default_BM_HiTE) + ' ]')
parser.add_argument('--EDTA_home', metavar='EDTA_home', help='When conducting benchmarking of EDTA, you will be asked to input EDTA home path.')
parser.add_argument('--coverage_threshold', metavar='coverage_threshold', help='The coverage threshold of benchmarking methods.')
parser.add_argument('--species', metavar='species', help='The species on which to conduct benchmarking; six species are supported (dmel, rice, cb, zebrafish, maize, ath).')
parser.add_argument('--skip_HiTE', metavar='skip_HiTE', help='Whether to skip running HiTE, 1: true, 0: false. default = [ ' + str(default_skip_HiTE) + ' ]')
parser.add_argument('--is_denovo_nonltr', metavar='is_denovo_nonltr', help='Whether to detect non-LTR TEs de novo, 1: true, 0: false. default = [ ' + str(default_is_denovo_nonltr) + ' ]')
parser.add_argument('--debug', metavar='is_debug', help='Enable debug mode, in which temporary files are kept, 1: true, 0: false. default = [ ' + str(default_debug) + ' ]')
parser.add_argument('--use_HybridLTR', metavar='use_HybridLTR', help='Whether to use HybridLTR to identify LTRs, 1: true, 0: false. default = [' + str(default_use_HybridLTR) + ' ]')
parser.add_argument('--use_NeuralTE', metavar='use_NeuralTE', help='Whether to use NeuralTE to classify TEs, 1: true, 0: false. default = [' + str(default_use_NeuralTE) + ' ]')
parser.add_argument('--is_wicker', metavar='is_wicker', help='Use Wicker or RepeatMasker classification labels, 1: Wicker, 0: RepeatMasker. default = [ ' + str(default_is_wicker) + ' ]')
parser.add_argument('--is_output_LTR_lib', metavar='is_output_LTR_lib', help='Whether to output LTR library. default = [ ' + str(default_is_output_LTR_lib) + ' ]')
parser.add_argument('--flanking_len', metavar='flanking_len', help='The flanking length of candidates to find the true boundaries, default = [ ' + str(default_flanking_len) + ' ]')
parser.add_argument('--fixed_extend_base_threshold', metavar='fixed_extend_base_threshold', help='The maximum length of variation that can be tolerated during pairwise alignment, default = [ '+str(default_fixed_extend_base_threshold)+' ]')
parser.add_argument('--tandem_region_cutoff', metavar='tandem_region_cutoff', help='Cutoff above which a candidate is regarded as a tandem region, default = [ '+str(default_tandem_region_cutoff)+' ]')
parser.add_argument('--max_repeat_len', metavar='max_repeat_len', help='The maximum length of a single repeat, default = [ ' + str(default_max_single_repeat_len) + ' ]')
parser.add_argument('--chrom_seg_length', metavar='chrom_seg_length', help='The length of genome segments, default = [ ' + str(default_chrom_seg_length) + ' ]')
args = parser.parse_args()
reference = args.genome
threads = args.thread
output_dir = args.outdir
fixed_extend_base_threshold = args.fixed_extend_base_threshold
chunk_size = args.chunk_size
tandem_region_cutoff = args.tandem_region_cutoff
max_repeat_len = args.max_repeat_len
chrom_seg_length = args.chrom_seg_length
flanking_len = args.flanking_len
plant = args.plant
te_type = args.te_type
curated_lib = args.curated_lib
# classified = args.classified
remove_nested = args.remove_nested
domain = args.domain
miu = args.miu
recover = args.recover
annotate = args.annotate
intact_anno = args.intact_anno
search_struct = args.search_struct
BM_RM2 = args.BM_RM2
BM_EDTA = args.BM_EDTA
BM_HiTE = args.BM_HiTE
EDTA_home = args.EDTA_home
coverage_threshold = args.coverage_threshold
species = args.species
skip_HiTE = args.skip_HiTE
is_denovo_nonltr = args.is_denovo_nonltr
debug = args.debug
use_HybridLTR = args.use_HybridLTR
use_NeuralTE = args.use_NeuralTE
is_wicker = args.is_wicker
is_output_LTR_lib = args.is_output_LTR_lib
i = datetime.datetime.now()
# tmp_output_dir = output_dir + '/CRD.' + str(i.date()) + '.' + str(i.hour) + '-' + str(i.minute) + '-' + str(i.second)
tmp_output_dir = os.path.abspath(output_dir + '/')
if not os.path.exists(tmp_output_dir):
os.makedirs(tmp_output_dir)
log = Logger(tmp_output_dir+'/HiTE.log', level='debug')
if reference is None:
log.logger.error('\nreference path cannot be empty')
parser.print_help()
exit(-1)
if output_dir is None:
output_dir = project_dir + '/output'
log.logger.warning('\noutput directory path is empty, set to: ' + str(output_dir))
if not os.path.isabs(reference):
reference = os.path.abspath(reference)
if not os.path.isabs(output_dir):
output_dir = os.path.abspath(output_dir)
if not os.path.exists(reference):
log.logger.error('\nCannot find input genome assembly: ' + str(reference))
parser.print_help()
exit(-1)
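# Fill in defaults for any command-line options the user did not set, casting the raw argparse strings to their expected types.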
if threads is None:
threads = int(default_threads)
else:
threads = int(threads)
if use_HybridLTR is None:
use_HybridLTR = int(default_use_HybridLTR)
else:
use_HybridLTR = int(use_HybridLTR)
if use_NeuralTE is None:
use_NeuralTE = int(default_use_NeuralTE)
else:
use_NeuralTE = int(use_NeuralTE)
if is_wicker is None:
is_wicker = int(default_is_wicker)
else:
is_wicker = int(is_wicker)
if fixed_extend_base_threshold is None:
fixed_extend_base_threshold = default_fixed_extend_base_threshold
else:
fixed_extend_base_threshold = int(fixed_extend_base_threshold)
if chunk_size is None:
chunk_size = default_chunk_size
else:
chunk_size = float(chunk_size)
if tandem_region_cutoff is None:
tandem_region_cutoff = default_tandem_region_cutoff
else:
tandem_region_cutoff = float(tandem_region_cutoff)
if flanking_len is None:
flanking_len = default_flanking_len
else:
flanking_len = int(flanking_len)
if plant is None:
plant = default_plant
else:
plant = int(plant)
if te_type is None:
te_type = default_te_type
else:
te_type = te_type
if curated_lib is None:
curated_lib = default_curated_lib
else:
curated_lib = curated_lib
if is_output_LTR_lib is None:
is_output_LTR_lib = default_is_output_LTR_lib
else:
is_output_LTR_lib = int(is_output_LTR_lib)
if remove_nested is None:
remove_nested = default_remove_nested
else:
remove_nested = int(remove_nested)
if domain is None:
domain = default_domain
else:
domain = int(domain)
if miu is None:
miu = default_miu
else:
miu = str(miu)
if recover is None:
recover = default_recover
else:
recover = int(recover)
if annotate is None:
annotate = default_annotate
else:
annotate = int(annotate)
if intact_anno is None:
intact_anno = default_intact_annotate
else:
intact_anno = int(intact_anno)
if search_struct is None:
search_struct = default_search_struct
else:
search_struct = int(search_struct)
if BM_RM2 is None:
BM_RM2 = default_BM_RM2
else:
BM_RM2 = int(BM_RM2)
if BM_EDTA is None:
BM_EDTA = default_BM_EDTA
else:
BM_EDTA = int(BM_EDTA)
if BM_HiTE is None:
BM_HiTE = default_BM_HiTE
else:
BM_HiTE = int(BM_HiTE)
if EDTA_home is None:
EDTA_home = default_EDTA_home
else:
EDTA_home = str(EDTA_home)
if coverage_threshold is None:
coverage_threshold = default_coverage_threshold
else:
coverage_threshold = float(coverage_threshold)
if skip_HiTE is None:
skip_HiTE = default_skip_HiTE
else:
skip_HiTE = int(skip_HiTE)
if is_denovo_nonltr is None:
is_denovo_nonltr = default_is_denovo_nonltr
else:
is_denovo_nonltr = int(is_denovo_nonltr)
if debug is None:
debug = default_debug
else:
debug = int(debug)
if max_repeat_len is None:
max_repeat_len = default_max_single_repeat_len
else:
max_repeat_len = int(max_repeat_len)
if chrom_seg_length is None:
chrom_seg_length = default_chrom_seg_length
else:
chrom_seg_length = int(chrom_seg_length)
partitions_num = int(threads)
total_starttime = time.time()
tools_dir = project_dir + '/tools'
(ref_dir, ref_filename) = os.path.split(reference)
(ref_name, ref_extension) = os.path.splitext(ref_filename)
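# Work on a copy of the genome placed inside the output directory, so that all intermediate files stay under tmp_output_dir.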
os.system('cp ' + reference + ' ' + tmp_output_dir)
reference = tmp_output_dir + '/' + ref_filename
# Preset steps: there is no need to re-execute time-consuming jobs
# when the run is retried after abnormal termination.
is_recover = False
recover = int(recover)
if recover == 1:
is_recover = True
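# With --recover 1, every step below first checks whether its expected output file already exists (via file_exist) and skips the computation if it does.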
all_te_types = ['ltr', 'tir', 'helitron', 'non-ltr', 'all']
if te_type not in all_te_types:
log.logger.error('Specified an invalid TE type: ' + te_type + '. Please choose from ' + str(all_te_types))
sys.exit(-1)
if curated_lib is not None:
curated_lib = os.path.realpath(curated_lib)
curated_names, curated_contigs = read_fasta(curated_lib)
if len(curated_names) <= 0:
log.logger.error('You have provided an invalid or empty curated library: ' + str(curated_lib) + '. Please check and correct it.')
sys.exit(-1)
if BM_EDTA == 1 and not os.path.exists(EDTA_home + '/lib-test.pl'):
print('Cannot conduct benchmarking of EDTA; invalid EDTA home: ' + EDTA_home)
sys.exit(-1)
log.logger.info('\n-------------------------------------------------------------------------------------------\n'
'Copyright (C) 2022 Kang Hu ( [email protected] )\n'
'Hunan Provincial Key Lab on Bioinformatics, School of Computer Science and \n'
'Engineering, Central South University, Changsha 410083, P.R. China.\n'
'-------------------------------------------------------------------------------------------')
log.logger.info('\nParameters configuration\n'
'====================================System settings========================================\n'
' [Setting] Reference sequences / assemblies path = [ ' + str(reference) + ' ]\n'
# ' [Setting] Is classified = [ ' + str(classified) + ' ] Default( ' + str(default_classified) + ' )\n'
' [Setting] Is remove nested TE = [ ' + str(remove_nested) + ' ] Default( ' + str(default_remove_nested) + ' )\n'
' [Setting] Is getting domain = [ ' + str(domain) + ' ] Default( ' + str(default_domain) + ' )\n'
' [Setting] The neutral mutation rate (per bp per ya) = [ ' + str(miu) + ' ] Default( ' + str(default_miu) + ' )\n'
' [Setting] Threads = [ ' + str(threads) + ' ] Default( ' + str(default_threads) + ' )\n'
' [Setting] The chunk size of large genome = [ ' + str(chunk_size) + ' ] MB Default( ' + str(default_chunk_size) + ' ) MB\n'
' [Setting] Is plant genome = [ ' + str(plant) + ' ] Default( ' + str(default_plant) + ' )\n'
' [Setting] Retrieve specific type of TE output = [ ' + str(te_type) + ' ] Default( ' + str(default_te_type) + ' )\n'
' [Setting] Curated library = [ ' + str(curated_lib) + ' ] Default( ' + str(default_curated_lib) + ' )\n'
' [Setting] recover = [ ' + str(recover) + ' ] Default( ' + str(default_recover) + ' )\n'
' [Setting] annotate = [ ' + str(annotate) + ' ] Default( ' + str(default_annotate) + ' )\n'
' [Setting] intact_anno = [ ' + str(intact_anno) + ' ] Default( ' + str(default_intact_annotate) + ' )\n'
' [Setting] search_struct = [ ' + str(search_struct) + ' ] Default( ' + str(default_search_struct) + ' )\n'
' [Setting] BM_RM2 = [ ' + str(BM_RM2) + ' ] Default( ' + str(default_BM_RM2) + ' )\n'
' [Setting] BM_EDTA = [ ' + str(BM_EDTA) + ' ] Default( ' + str(default_BM_EDTA) + ' )\n'
' [Setting] BM_HiTE = [ ' + str(BM_HiTE) + ' ] Default( ' + str(default_BM_HiTE) + ' )\n'
' [Setting] EDTA_home = [' + str(EDTA_home) + ']\n'
' [Setting] coverage_threshold = [ ' + str(coverage_threshold) + ' ] Default( ' + str(default_coverage_threshold) + ' )\n'
' [Setting] skip_HiTE = [ ' + str(skip_HiTE) + ' ] Default( ' + str(default_skip_HiTE) + ' )\n'
' [Setting] is_denovo_nonltr = [ ' + str(is_denovo_nonltr) + ' ] Default( ' + str(default_is_denovo_nonltr) + ' )\n'
' [Setting] use_HybridLTR = [ ' + str(use_HybridLTR) + ' ] Default( ' + str(default_use_HybridLTR) + ' )\n'
' [Setting] use_NeuralTE = [ ' + str(use_NeuralTE) + ' ] Default( ' + str(default_use_NeuralTE) + ' )\n'
' [Setting] is_wicker = [ ' + str(is_wicker) + ' ] Default( ' + str(default_is_wicker) + ' )\n'
' [Setting] debug = [ ' + str(debug) + ' ] Default( ' + str(default_debug) + ' )\n'
' [Setting] Output Directory = [' + str(output_dir) + ']\n'
' [Setting] Fixed extend bases threshold = [ ' + str(fixed_extend_base_threshold) + ' ] Default( ' + str(default_fixed_extend_base_threshold) + ' )\n'
' [Setting] Flanking length of TE = [ ' + str(flanking_len) + ' ] Default( ' + str(default_flanking_len) + ' )\n'
' [Setting] Cutoff of the repeat regarded as tandem sequence = [ ' + str(tandem_region_cutoff) + ' ] Default( ' + str(default_tandem_region_cutoff) + ' )\n'
' [Setting] The length of genome segments = [ ' + str(chrom_seg_length) + ' ] Default( ' + str(default_chrom_seg_length) + ' )'
)
TRsearch_dir = tools_dir
test_home = project_dir + '/module'
# The HiTE pipeline performs recognition of LTR, Non-LTR, TIR, and Helitron transposons.
# The organizational structure of HiTE is as follows:
# Pipeline: main.py
# ├──LTR: judge_LTR_transposons.py
# ├──Homology-Non-LTR: judge_Other_transposons.py
# ├──split genome into chunks: split_genome_chunks.py
# ├──De novo TE searching: coarse_boundary.py
# ├──TIR: judge_TIR_transposons.py
# ├──Helitron: judge_Helitron_transposons.py
# └──De novo-Non-LTR: judge_Non_LTR_transposons.py
# ├──generate TE library: get_nonRedundant_lib.py
# └──unwrap nested TE: remove_nested_lib.py
# ├──genome annotation: annotate_genome.py
# ├──benchmarking reproduction: benchmarking.py
# └──clean temporary files: clean_lib.py
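# A minimal example invocation of this pipeline (illustrative only: the genome and
# output paths are placeholders, and any option omitted here falls back to the
# defaults defined above):
#   python main.py --genome ./genome.fa --outdir ./hite_output --thread 40 \
#       --plant 0 --annotate 1 --miu 1.3e-8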
pipeline_starttime = time.time()
if skip_HiTE != 1:
log.logger.info('Start step0: Structure-based LTR searching')
confident_ltr_cut_path = tmp_output_dir + '/confident_ltr_cut.fa'
resut_file = confident_ltr_cut_path
if not is_recover or not file_exist(resut_file):
if te_type == 'all' or te_type == 'ltr':
starttime = time.time()
TEClass_home = project_dir + '/classification'
LTR_identification_command = 'judge_LTR_transposons.py ' \
+ ' -g ' + reference + ' -t ' + str(threads) \
+ ' --tmp_output_dir ' + tmp_output_dir + ' --recover ' + str(recover) \
+ ' --miu ' + str(miu) + ' --use_HybridLTR ' + str(use_HybridLTR) \
+ ' --use_NeuralTE ' + str(use_NeuralTE) + ' --is_wicker ' + str(is_wicker) \
+ ' --is_output_lib ' + str(is_output_LTR_lib) + ' --debug ' + str(debug)
log.logger.info(LTR_identification_command)
os.system(LTR_identification_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step0: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
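# Step1 searches for other TEs by homology and writes the hits to confident_other.fa.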
confident_other_path = tmp_output_dir + '/confident_other.fa'
resut_file = confident_other_path
if not is_recover or not file_exist(resut_file):
if te_type == 'all' or te_type == 'non-ltr':
starttime = time.time()
log.logger.info('Start step1: homology-based other TE searching')
other_identification_command = 'judge_Other_transposons.py ' \
+ ' -r ' + reference \
+ ' -t ' + str(threads) \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --recover ' + str(recover)
log.logger.info(other_identification_command)
os.system(other_identification_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step1: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
# --------------------------------------------------------------------------------------
starttime = time.time()
log.logger.info('Start step2.0: Splitting genome assembly into chunks')
split_genome_command = 'split_genome_chunks.py -g ' \
+ reference + ' --tmp_output_dir ' + tmp_output_dir \
+ ' --chrom_seg_length ' + str(chrom_seg_length) + ' --chunk_size ' + str(chunk_size)
log.logger.info(split_genome_command)
os.system(split_genome_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step2.0: %.8s s" % (dtime))
reg_str = r'genome.cut(\d+).fa$'
cut_references = []
for filename in os.listdir(tmp_output_dir):
match = re.search(reg_str, filename)
if match:
ref_index = match.group(1)
cut_references.append((ref_index, tmp_output_dir + '/' + filename))
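# cut_references now holds (chunk_index, chunk_path) pairs, one per genome chunk produced by split_genome_chunks.py.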
# Using identified TEs to mask the genome in order to reduce computational load in all-vs-all alignments.
prev_TE = tmp_output_dir + '/prev_TE.fa'
if os.path.exists(confident_ltr_cut_path):
os.system('cat ' + confident_ltr_cut_path + ' > ' + prev_TE)
if curated_lib is not None and os.path.exists(curated_lib):
os.system('cat ' + curated_lib + ' >> ' + prev_TE)
# The outcomes of homology-based methods can only serve as supplementary information and should not be used as masks,
# as doing so could obscure many genuine non-LTR loci and leave the de novo method unable to identify them.
# os.system('cat ' + confident_other_path + ' >> ' + prev_TE)
split_ref_dir = tmp_output_dir + '/ref_chr'
for cut_reference_item in cut_references:
ref_index = cut_reference_item[0]
cut_reference = cut_reference_item[1]
log.logger.info('Current chunk: ' + str(ref_index))
longest_repeats_flanked_path = tmp_output_dir + '/longest_repeats_' + str(ref_index) + '.flanked.fa'
longest_repeats_path = tmp_output_dir + '/longest_repeats_' + str(ref_index) + '.fa'
resut_file = longest_repeats_path
if not is_recover or not file_exist(resut_file) or not file_exist(longest_repeats_flanked_path):
if te_type != 'ltr':
starttime = time.time()
log.logger.info('Start step2.1: Coarse-grained boundary mapping')
coarse_boundary_command = 'coarse_boundary.py ' \
+ ' -g ' + cut_reference + ' --tmp_output_dir ' + tmp_output_dir \
+ ' --prev_TE ' + str(prev_TE) \
+ ' --fixed_extend_base_threshold ' + str(fixed_extend_base_threshold) \
+ ' --max_repeat_len ' + str(max_repeat_len) \
+ ' --thread ' + str(threads) \
+ ' --flanking_len ' + str(flanking_len) \
+ ' --tandem_region_cutoff ' + str(tandem_region_cutoff) \
+ ' --ref_index ' + str(ref_index) \
+ ' -r ' + reference + ' --recover ' + str(recover) \
+ ' --debug ' + str(debug)
log.logger.info(coarse_boundary_command)
os.system(coarse_boundary_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step2.1: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
longest_repeats_flanked_path = tmp_output_dir + '/longest_repeats_' + str(ref_index) + '.flanked.fa'
resut_file = tmp_output_dir + '/confident_tir_'+str(ref_index)+'.fa'
if not is_recover or not file_exist(resut_file):
if te_type == 'all' or te_type == 'tir':
starttime = time.time()
log.logger.info('Start step2.2: determine fine-grained TIR')
tir_identification_command = 'judge_TIR_transposons.py ' \
+ ' --seqs ' + longest_repeats_flanked_path \
+ ' -t ' + str(threads) \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --tandem_region_cutoff ' + str(tandem_region_cutoff) \
+ ' --ref_index ' + str(ref_index) \
+ ' --plant ' + str(plant) \
+ ' --flanking_len ' + str(flanking_len) \
+ ' --recover ' + str(recover) \
+ ' --debug ' + str(debug) \
+ ' -r ' + reference \
+ ' --split_ref_dir ' + split_ref_dir \
+ ' --prev_TE ' + prev_TE
log.logger.debug(tir_identification_command)
os.system(tir_identification_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step2.2: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
resut_file = tmp_output_dir + '/confident_helitron_'+str(ref_index)+'.fa'
if not is_recover or not file_exist(resut_file):
if te_type == 'all' or te_type == 'helitron':
starttime = time.time()
log.logger.info('Start step2.3: determine fine-grained Helitron')
helitron_identification_command = 'judge_Helitron_transposons.py --seqs ' \
+ longest_repeats_flanked_path + ' -r ' + reference + ' -t ' + str(threads) \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --ref_index ' + str(ref_index) + ' --flanking_len ' + str(flanking_len) \
+ ' --recover ' + str(recover) + ' --debug ' + str(debug) + ' --split_ref_dir ' + split_ref_dir \
+ ' --prev_TE ' + prev_TE
log.logger.info(helitron_identification_command)
os.system(helitron_identification_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step2.3: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
resut_file = tmp_output_dir + '/confident_non_ltr_' + str(ref_index) + '.fa'
if not is_recover or not file_exist(resut_file):
if te_type == 'all' or te_type == 'non-ltr':
starttime = time.time()
log.logger.info('Start step2.4: determine fine-grained Non-LTR')
non_ltr_identification_command = 'judge_Non_LTR_transposons.py'\
+ ' --seqs ' + longest_repeats_flanked_path + ' -t ' + str(threads) \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --recover ' + str(recover) \
+ ' --plant ' + str(plant) \
+ ' --debug ' + str(debug) \
+ ' --flanking_len ' + str(flanking_len) \
+ ' --ref_index ' + str(ref_index) \
+ ' --is_denovo_nonltr ' + str(is_denovo_nonltr) \
+ ' -r ' + reference \
+ ' --prev_TE ' + prev_TE
log.logger.debug(non_ltr_identification_command)
os.system(non_ltr_identification_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step2.4: %.8s s" % (dtime))
else:
log.logger.info(resut_file + ' exists, skip...')
# Merging all TEs generated from individual chunks.
confident_tir_path = tmp_output_dir + '/confident_tir_merge.fa'
confident_helitron_path = tmp_output_dir + '/confident_helitron_merge.fa'
confident_non_ltr_path = tmp_output_dir + '/confident_non_ltr_merge.fa'
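# Remove stale merged files from a previous run before concatenating the per-chunk results.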
os.system('rm -f ' + confident_tir_path)
os.system('rm -f ' + confident_helitron_path)
os.system('rm -f ' + confident_non_ltr_path)
for ref_index, ref_rename_path in enumerate(cut_references):
cur_confident_tir_path = tmp_output_dir + '/confident_tir_' + str(ref_index) + '.fa'
cur_confident_helitron_path = tmp_output_dir + '/confident_helitron_' + str(ref_index) + '.fa'
cur_confident_non_ltr_path = tmp_output_dir + '/confident_non_ltr_' + str(ref_index) + '.fa'
if os.path.exists(cur_confident_tir_path):
os.system('cat ' + cur_confident_tir_path + ' >> ' + confident_tir_path)
if os.path.exists(cur_confident_helitron_path):
os.system('cat ' + cur_confident_helitron_path + ' >> ' + confident_helitron_path)
if os.path.exists(cur_confident_non_ltr_path):
os.system('cat ' + cur_confident_non_ltr_path + ' >> ' + confident_non_ltr_path)
starttime = time.time()
log.logger.info('Start step3: generate non-redundant library')
TEClass_home = project_dir + '/classification'
generate_lib_command = 'get_nonRedundant_lib.py' \
+ ' --confident_ltr_cut ' + confident_ltr_cut_path \
+ ' --confident_tir ' + confident_tir_path \
+ ' --confident_helitron ' + confident_helitron_path \
+ ' --confident_non_ltr ' + confident_non_ltr_path \
+ ' --confident_other ' + confident_other_path \
+ ' -t ' + str(threads) + ' --tmp_output_dir ' + tmp_output_dir \
+ ' --use_NeuralTE ' + str(use_NeuralTE) \
+ ' --is_wicker ' + str(is_wicker) \
+ ' --domain ' + str(domain) \
+ ' --curated_lib ' + str(curated_lib)
log.logger.info(generate_lib_command)
os.system(generate_lib_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step3: %.8s s" % (dtime))
if intact_anno == 1:
starttime = time.time()
log.logger.info('Start step4: get full-length TE annotation')
chr_name_map = tmp_output_dir + '/chr_name.map'
ltr_list = tmp_output_dir + '/genome.rename.fa.pass.list'
confident_tir_path = tmp_output_dir + '/confident_tir.fa'
confident_helitron_path = tmp_output_dir + '/confident_helitron.fa'
confident_non_ltr_path = tmp_output_dir + '/confident_non_ltr.fa'
confident_other_path = tmp_output_dir + '/confident_other.fa'
classified_TE_path = tmp_output_dir + '/TE_merge_tmp.fa.classified'
full_length_anno_command = 'get_full_length_annotation.py' \
+ ' -t ' + str(threads) + ' --ltr_list ' + ltr_list \
+ ' --tir_lib ' + str(confident_tir_path) \
+ ' --helitron_lib ' + confident_helitron_path \
+ ' --nonltr_lib ' + confident_non_ltr_path \
+ ' --other_lib ' + confident_other_path \
+ ' --chr_name_map ' + chr_name_map \
+ ' -r ' + reference \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --search_struct ' + str(search_struct) \
+ ' --classified_TE_path ' + str(classified_TE_path)
log.logger.info(full_length_anno_command)
os.system(full_length_anno_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step4: %.8s s" % (dtime))
confident_TE_consensus = tmp_output_dir + '/confident_TE.cons.fa'
if not os.path.exists(confident_TE_consensus):
log.logger.error('Error, cannot find the TE library: ' + confident_TE_consensus)
else:
starttime = time.time()
log.logger.info('Start step5: annotate genome')
TEClass_home = project_dir + '/classification'
annotate_genome_command = 'annotate_genome.py' \
+ ' -t ' + str(threads) + ' --classified_TE_consensus ' + confident_TE_consensus \
+ ' --annotate ' + str(annotate) \
+ ' -r ' + reference \
+ ' --tmp_output_dir ' + tmp_output_dir
log.logger.info(annotate_genome_command)
os.system(annotate_genome_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step5: %.8s s" % (dtime))
starttime = time.time()
log.logger.info('Start step6: conduct benchmarking of RepeatModeler2, EDTA, and HiTE')
benchmarking_command = 'benchmarking.py' \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --BM_RM2 ' + str(BM_RM2) + ' --BM_EDTA ' + str(BM_EDTA) + ' --BM_HiTE ' + str(BM_HiTE) \
+ ' --coverage_threshold ' + str(coverage_threshold) + ' -t ' + str(threads) \
+ ' --TE_lib ' + str(confident_TE_consensus) \
+ ' -r ' + reference + ' --recover ' + str(recover)
if EDTA_home is not None and EDTA_home.strip() != '':
benchmarking_command += ' --EDTA_home ' + str(EDTA_home)
if species is None or species.strip() == '':
benchmarking_command += ' --species test'
else:
benchmarking_command += ' --species ' + str(species)
log.logger.info(benchmarking_command)
os.system(benchmarking_command)
endtime = time.time()
dtime = endtime - starttime
log.logger.info("Running time of step6: %.8s s" % (dtime))
clean_lib_command = 'clean_lib.py' \
+ ' --tmp_output_dir ' + tmp_output_dir \
+ ' --debug ' + str(debug)
os.system(clean_lib_command)
pipeline_endtime = time.time()
dtime = pipeline_endtime - pipeline_starttime
log.logger.info("Running time of the whole pipeline: %.8s s" % (dtime))