-
Notifications
You must be signed in to change notification settings - Fork 10
/
uLTRA
executable file
·566 lines (465 loc) · 34.7 KB
/
uLTRA
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
#! /usr/bin/env python
from __future__ import print_function
import os
import sys
import glob
from time import time
import itertools
from itertools import islice, chain
from struct import *
import shutil
import gzip
import argparse
import errno
import math
# import pickle
import dill as pickle
import gffutils
import pysam
import multiprocessing as mp
from multiprocessing import Pool
from collections import defaultdict
# from collections import OrderedDict
# from modules import create_splice_graph as splice_graph
# from modules import graph_chainer
from modules import create_augmented_gene as augmented_gene
from modules import seed_wrapper
from modules import colinear_solver
from modules import help_functions
from modules import classify_read_with_mams
from modules import classify_alignment2
from modules import sam_output
from modules import pc
from modules import prefilter_genomic_reads
def load_reference(args):
refs = {acc : seq for acc, (seq, _) in help_functions.readfq(open(args.ref,"r"))}
refs_lengths = { acc : len(seq) for acc, seq in refs.items()}
return refs, refs_lengths
def prep_splicing(args, refs_lengths):
if args.index:
index_folder = args.index
help_functions.mkdir_p(index_folder)
else:
index_folder = args.outfolder
database = os.path.join(index_folder,'database.db')
if os.path.isfile(database):
print("Database found in directory using this one.")
print("If you want to recreate the database, please remove the file: {0}".format(database))
print()
db = gffutils.FeatureDB(database, keep_order=True)
# sys.exit()
elif not args.disable_infer:
db = gffutils.create_db(args.gtf, dbfn=database, force=True, keep_order=True, merge_strategy='merge',
sort_attribute_values=True)
db = gffutils.FeatureDB(database, keep_order=True)
else:
db = gffutils.create_db(args.gtf, dbfn=database, force=True, keep_order=True, merge_strategy='merge',
sort_attribute_values=True, disable_infer_genes=True, disable_infer_transcripts=True)
db = gffutils.FeatureDB(database, keep_order=True)
# segment_to_ref, parts_to_segments, splices_to_transcripts, \
# transcripts_to_splices, all_splice_pairs_annotations, \
# all_splice_sites_annotations, segment_id_to_choordinates, \
# segment_to_gene, gene_to_small_segments, flank_choordinates, \
# max_intron_chr, exon_choordinates_to_id, chr_to_id, id_to_chr, tiling_structures = augmented_gene.create_graph_from_exon_parts(db, args.flank_size, args.small_exon_threshold, args.min_segm, refs_lengths)
segment_to_ref, parts_to_segments, splices_to_transcripts, \
transcripts_to_splices, all_splice_pairs_annotations, \
all_splice_sites_annotations, segment_id_to_choordinates, \
segment_to_gene, gene_to_small_segments, flank_choordinates, \
max_intron_chr, exon_choordinates_to_id, chr_to_id, id_to_chr = augmented_gene.create_graph_from_exon_parts(db, args.flank_size, args.small_exon_threshold, args.min_segm, refs_lengths)
# dump to pickle here! Both graph and reference seqs
# help_functions.pickle_dump(args, genes_to_ref, 'genes_to_ref.pickle')
help_functions.pickle_dump(index_folder, segment_to_ref, 'segment_to_ref.pickle')
help_functions.pickle_dump(index_folder, splices_to_transcripts, 'splices_to_transcripts.pickle')
help_functions.pickle_dump(index_folder, transcripts_to_splices, 'transcripts_to_splices.pickle')
help_functions.pickle_dump(index_folder, parts_to_segments, 'parts_to_segments.pickle')
help_functions.pickle_dump(index_folder, all_splice_pairs_annotations, 'all_splice_pairs_annotations.pickle')
help_functions.pickle_dump(index_folder, all_splice_sites_annotations, 'all_splice_sites_annotations.pickle')
help_functions.pickle_dump(index_folder, segment_id_to_choordinates, 'segment_id_to_choordinates.pickle')
help_functions.pickle_dump(index_folder, segment_to_gene, 'segment_to_gene.pickle')
help_functions.pickle_dump(index_folder, gene_to_small_segments, 'gene_to_small_segments.pickle')
help_functions.pickle_dump(index_folder, flank_choordinates, 'flank_choordinates.pickle')
help_functions.pickle_dump(index_folder, max_intron_chr, 'max_intron_chr.pickle')
help_functions.pickle_dump(index_folder, exon_choordinates_to_id, 'exon_choordinates_to_id.pickle')
help_functions.pickle_dump(index_folder, chr_to_id, 'chr_to_id.pickle')
help_functions.pickle_dump(index_folder, id_to_chr, 'id_to_chr.pickle')
# tiling_segment_id_to_choordinates, tiling_segment_to_gene, tiling_segment_to_ref, tiling_parts_to_segments, tiling_gene_to_small_segments = tiling_structures # unpacking tiling structures
# help_functions.pickle_dump(args, tiling_segment_id_to_choordinates, 'tiling_segment_id_to_choordinates.pickle')
# help_functions.pickle_dump(args, tiling_segment_to_gene, 'tiling_segment_to_gene.pickle')
# help_functions.pickle_dump(args, tiling_segment_to_ref, 'tiling_segment_to_ref.pickle')
# help_functions.pickle_dump(args, tiling_parts_to_segments, 'tiling_parts_to_segments.pickle')
# help_functions.pickle_dump(args, tiling_gene_to_small_segments, 'tiling_gene_to_small_segments.pickle')
def prep_seqs(args, refs, refs_lengths):
if args.index:
index_folder = args.index
else:
index_folder = args.outfolder
parts_to_segments = help_functions.pickle_load( os.path.join(index_folder, 'parts_to_segments.pickle') )
segment_id_to_choordinates = help_functions.pickle_load( os.path.join(index_folder, 'segment_id_to_choordinates.pickle') )
segment_to_ref = help_functions.pickle_load( os.path.join(index_folder, 'segment_to_ref.pickle') )
flank_choordinates = help_functions.pickle_load( os.path.join(index_folder, 'flank_choordinates.pickle') )
exon_choordinates_to_id = help_functions.pickle_load( os.path.join(index_folder, 'exon_choordinates_to_id.pickle') )
chr_to_id = help_functions.pickle_load( os.path.join(index_folder, 'chr_to_id.pickle') )
id_to_chr = help_functions.pickle_load( os.path.join(index_folder, 'id_to_chr.pickle') )
# for chr_id in id_to_chr:
# print(chr_id, id_to_chr[chr_id])
# tiling_parts_to_segments = help_functions.pickle_load( os.path.join(args.outfolder, 'tiling_parts_to_segments.pickle') )
# tiling_segment_id_to_choordinates = help_functions.pickle_load( os.path.join(args.outfolder, 'tiling_segment_id_to_choordinates.pickle') )
# tiling_segment_to_ref = help_functions.pickle_load( os.path.join(args.outfolder, 'tiling_segment_to_ref.pickle') )
print( "Number of ref seqs in gff:", len(parts_to_segments.keys()))
refs_id = {}
not_in_annot = set()
for acc, seq in refs.items():
if acc not in chr_to_id:
not_in_annot.add(acc)
else:
acc_id = chr_to_id[acc]
refs_id[acc_id] = seq
refs_id_lengths = { acc_id : len(seq) for acc_id, seq in refs_id.items()}
help_functions.pickle_dump(index_folder, refs_id_lengths, 'refs_id_lengths.pickle')
help_functions.pickle_dump(index_folder, refs_lengths, 'refs_lengths.pickle')
print( "Number of ref seqs in fasta:", len(refs.keys()))
not_in_ref = set(chr_to_id.keys()) - set(refs.keys())
if not_in_ref:
print("Warning: Detected {0} sequences that are in annotation but not in reference fasta. Using only sequences present in fasta. The following sequences cannot be detected in reference fasta:\n".format(len(not_in_ref)))
for s in not_in_ref:
print(s)
if not_in_annot:
print("Warning: Detected {0} sequences in reference fasta that are not in annotation:\n".format(len(not_in_annot)))
for s in not_in_annot:
print(s, "with length:{0}".format(len(refs[s])))
# ref_part_sequences, ref_flank_sequences = augmented_gene.get_part_sequences_from_choordinates(parts_to_segments, flank_choordinates, refs_id)
ref_part_sequences = augmented_gene.get_sequences_from_choordinates(parts_to_segments, refs_id)
ref_flank_sequences = augmented_gene.get_sequences_from_choordinates(flank_choordinates, refs_id)
# print([unpack('LLL',t) for t in ref_flank_sequences.keys()])
ref_part_sequences = help_functions.update_nested(ref_part_sequences, ref_flank_sequences)
ref_segment_sequences = augmented_gene.get_sequences_from_choordinates(segment_id_to_choordinates, refs_id)
# ref_flank_sequences = augmented_gene.get_sequences_from_choordinates(flank_choordinates, refs_id)
ref_exon_sequences = augmented_gene.get_sequences_from_choordinates(exon_choordinates_to_id, refs_id)
help_functions.pickle_dump(index_folder, segment_id_to_choordinates, 'segment_id_to_choordinates.pickle')
help_functions.pickle_dump(index_folder, ref_part_sequences, 'ref_part_sequences.pickle')
help_functions.pickle_dump(index_folder, ref_segment_sequences, 'ref_segment_sequences.pickle')
help_functions.pickle_dump(index_folder, ref_flank_sequences, 'ref_flank_sequences.pickle')
help_functions.pickle_dump(index_folder, ref_exon_sequences, 'ref_exon_sequences.pickle')
# tiling_ref_segment_sequences = augmented_gene.get_sequences_from_choordinates(tiling_segment_id_to_choordinates, refs_id)
# help_functions.pickle_dump(args, tiling_ref_segment_sequences, 'tiling_ref_segment_sequences.pickle')
def batch(dictionary, size, batch_type):
# if batch_type == 'nt':
# total_nt = sum([len(seq) for seq in dictionary.values() ])
batches = []
sub_dict = {}
curr_nt_count = 0
for i, (acc, seq) in enumerate(dictionary.items()):
curr_nt_count += len(seq)
if curr_nt_count >= size:
sub_dict[acc] = seq
batches.append(sub_dict)
sub_dict = {}
curr_nt_count = 0
else:
sub_dict[acc] = seq
if curr_nt_count/size != 0:
sub_dict[acc] = seq
batches.append(sub_dict)
return batches
def score(cigartuples):
diffs = {1,2,8} # cigar IDs for INS, DEL, SUBS
matches = sum([length for type_, length in cigartuples if type_ == 7])
diffs = sum([length for type_, length in cigartuples if type_ in diffs])
return (matches - diffs)
def output_final_alignments( ultra_alignments_path, path_indexed_aligned, path_unindexed_aligned):
"""
This function have been improved in memory usage at the cost of speed.
As opposed ot previous solution, we now never keep a full set of SAM records
in memory for comparison as in previous functions. Steps:
1. Read in dictionary {read_acc : score} from uLTRA's alignments
2. Steam over alternative aligner's (only mm2 impl currently) reads and log in a set {read_acc} where uLtra's alignments are better.
3. Delete dictionary {read_acc : score} from step 1.
4. Stream over uLTRA's reads and save {read_acc : read (full SAM record)} for uLTRA's primary alignments with better score
5. Steam over alternative aligners records and replace the records from step 4 with uLTRA's
Instead of parsing two SAM files (keeping one in RAM for coparison) as previous solution,
this approach contains four parses at but is only keeping either all the read accessions plus an integer (cores)
or all uLTRA alignments and therir full sam record as max memory.
"""
replaced_unaligned_cntr = 0
equal_score = 0
slightly_worse_score = 0
worse_score = 0
ultra_unmapped = 0
# Step 1
ultra_SAM = pysam.AlignmentFile( ultra_alignments_path, "r" )
# take only primary alignments
ultra_scores = { read.query_name : score(read.cigartuples) for read in ultra_SAM.fetch() if (not read.is_secondary) and (not read.is_unmapped) }
ultra_SAM.close()
# Step 2
mm2_SAM = pysam.AlignmentFile(path_indexed_aligned, "r", check_sq=False)
ultra_better = set()
for read in mm2_SAM.fetch(until_eof=True):
if read.query_name not in ultra_scores:
ultra_unmapped += 1
continue
if read.is_unmapped:
if read.query_name in ultra_scores: # uLTRA aligned the read
ultra_better.add(read.query_name)
replaced_unaligned_cntr += 1
else:
pass
if (not read.is_secondary): # primary alignment in both files
mm2_score = score(read.cigartuples)
if mm2_score < ultra_scores[read.query_name]:
ultra_better.add(read.query_name)
elif mm2_score == ultra_scores[read.query_name]:
equal_score += 1
elif mm2_score <= ultra_scores[read.query_name] + 10:
slightly_worse_score += 1
else:
worse_score += 1
if (not read.is_secondary):
del ultra_scores[read.query_name]
mm2_SAM.close()
print("Total mm2's primary alignments replaced with uLTRA:", len(ultra_better))
print("Total mm2's alignments unmapped but mapped with uLTRA:", replaced_unaligned_cntr)
# Step 3
del ultra_scores
# Step 4
ultra_SAM = pysam.AlignmentFile( ultra_alignments_path, "r" )
ultra_better_records = { read.query_name: read for read in ultra_SAM.fetch() if (not read.is_secondary) and (read.query_name in ultra_better) }
tmp_merged_sam = pysam.AlignmentFile( ultra_alignments_path.decode()+ 'tmp', "w", template= ultra_SAM)
ultra_SAM.close()
# Step 5
mm2_SAM = pysam.AlignmentFile(path_indexed_aligned, "r", check_sq=False)
for read in mm2_SAM.fetch(until_eof=True):
if not read.is_secondary: # replacing primary alignments
if read.query_name in ultra_better_records:
read = ultra_better_records[read.query_name]
tmp_merged_sam.write(read)
mm2_SAM.close()
# add all reads that we did not attempt to align with uLTRA
# these reads had a primary alignment to unindexed regions by other pre-processing aligner (minimap2 as of now)
not_attempted_cntr = 0
unindexed = pysam.AlignmentFile(path_unindexed_aligned, "r")
for read in unindexed.fetch():
tmp_merged_sam.write(read)
if not read.is_secondary:
not_attempted_cntr += 1
unindexed.close()
tmp_merged_sam.close()
print("{0} primary alignments had better score with uLTRA.".format(len(ultra_better_records)))
print("{0} primary alignments had equal score with alternative aligner.".format(equal_score))
print("{0} primary alignments had slightly better score with alternative aligner (typically ends bonus giving better scoring in ends, which needs to be implemented in uLTRA).".format(slightly_worse_score))
print("{0} primary alignments had significantly better score with alternative aligner.".format(worse_score))
print("{0} reads were unmapped with ultra but not by alternative aligner.".format(ultra_unmapped))
print("{0} reads were not attempted to be aligned with ultra (unindexed regions), instead alternative aligner was used.".format(not_attempted_cntr))
shutil.move(ultra_alignments_path.decode()+ 'tmp', ultra_alignments_path)
#########################
def align_reads(args):
if args.index:
if os.path.isdir(args.index):
index_folder = args.index
else:
print("The index folder specified for alignment is not found. You specified: ", args.index )
print("Build the index to this folder, or specify another forder where the index has been built." )
sys.exit()
else:
index_folder = args.outfolder
ref_part_sequences = help_functions.pickle_load( os.path.join(index_folder, 'ref_part_sequences.pickle') )
refs_id_lengths = help_functions.pickle_load( os.path.join(index_folder, 'refs_id_lengths.pickle') )
refs_lengths = help_functions.pickle_load( os.path.join(index_folder, 'refs_lengths.pickle') )
if not args.disable_mm2:
print("Filtering reads aligned to unindexed regions with minimap2 ")
print("Running minimap2...")
mm2_start = time()
nr_reads_to_ignore, path_reads_to_align = prefilter_genomic_reads.main(ref_part_sequences, args.ref, args.reads, args.outfolder, index_folder, args.nr_cores, args.genomic_frac, args.mm2_ksize)
args.reads = path_reads_to_align
print("Done filtering. Reads filtered:{0}".format(nr_reads_to_ignore))
print("Time for minimap2:{0} seconds.".format(time() - mm2_start))
ref_path = os.path.join(args.outfolder, "refs_sequences.fa")
refs_file = open(ref_path, 'w')
for sequence_id, seq in ref_part_sequences.items():
chr_id, start, stop = unpack('LLL',sequence_id)
refs_file.write(">{0}\n{1}\n".format(str(chr_id) + str("^") + str(start) + "^" + str(stop), seq))
refs_file.close()
del ref_part_sequences
######### FIND MEMS WITH NAMFINDER ##########
#############################################
#############################################
print("Processing reads for NAM finding")
processing_start = time()
reads_tmp = gzip.open(os.path.join(args.outfolder, 'reads_tmp.fa.gz'), 'wb', compresslevel=1)
for acc, (seq, qual) in help_functions.readfq(open(args.reads, 'r')):
seq_mod, _ = help_functions.remove_read_polyA_ends(seq, qual, args.reduce_read_ployA, 5)
record = '>{0}\n{1}\n'.format(acc, seq_mod )
reads_tmp.write(record.encode('utf-8'))
reads_tmp.close()
print("Completed processing poly-A tails")
print("Time for processing reads:{0} seconds.".format(time() - processing_start))
args.reads_tmp = reads_tmp.name
namfinder_start = time()
seed_file_name = seed_wrapper.find_nams_namfinder(args.outfolder, args.reads_tmp, ref_path, args.outfolder, args.nr_cores, args.s, args.thinning)
print("Time for namfinder to find seeds:{0} seconds.".format(time() - namfinder_start))
#############################################
#############################################
#############################################
# ## For development omitting prefilter and namfinder steps.
# seed_file_name = os.path.join(args.outfolder, "seeds.txt.gz")
# args.reads = os.path.join(args.outfolder, "reads_after_genomic_filtering.fasta")
# ############################################################
print("Starting aligning reads.")
aln_file_name = os.path.join(args.outfolder, args.prefix+".sam")
alignment_outfile = pysam.AlignmentFile(aln_file_name , "w", reference_names=list(refs_lengths.keys()), reference_lengths=list(refs_lengths.values()) ) #, template=samfile)
alignment_outfile.close()
aligning_start = time()
tot_counts = pc.main(args.reads, seed_file_name, aln_file_name, args)
print("Time to align reads:{0} seconds.".format(time()-aligning_start))
alignment_outfile.close()
alignment_outfile_name = alignment_outfile.filename
# need to merge genomic/unindexed alignments with the uLTRA-aligned alignments
if not args.disable_mm2:
output_start = time()
path_indexed_aligned = os.path.join(args.outfolder, "indexed.sam")
path_unindexed_aligned = os.path.join(args.outfolder, "unindexed.sam")
output_final_alignments(alignment_outfile_name, path_indexed_aligned, path_unindexed_aligned)
print("Time for selecting final best alignments (selecting the best of mm2's vs uLTRA's alignments):{0} seconds.".format(time() - output_start))
print("FSM : {0}, NO_SPLICE : {1}, Insufficient_junction_coverage_unclassified : {2}, ISM/NIC_known : {3}, NIC_novel : {4}, NNC : {5}".format(tot_counts[1], *tot_counts[3:]))
print("total alignment coverage:", tot_counts[0])
if not args.keep_temporary_files:
print("Deleting temporary files...")
seeds = glob.glob(os.path.join(args.outfolder, "seeds_*"))
mum = glob.glob(os.path.join(args.outfolder, "mummer*"))
sla = glob.glob(os.path.join(args.outfolder, "slamem*"))
reads_tmp = glob.glob(os.path.join(args.outfolder, "reads_batch*"))
minimap_tmp = glob.glob(os.path.join(args.outfolder, "minimap2*"))
ultra_tmp = glob.glob(os.path.join(args.outfolder, "uLTRA_batch*"))
f1 = os.path.join(args.outfolder, "reads_after_genomic_filtering.fastq")
f2 = os.path.join(args.outfolder, "indexed.sam")
f3 = os.path.join(args.outfolder, "unindexed.sam")
f4 = os.path.join(args.outfolder, "refs_sequences.fa")
f5 = os.path.join(args.outfolder, "refs_sequences.fa")
f6 = os.path.join(args.outfolder, "reads_rc.fq")
f7 = os.path.join(args.outfolder, "reads_tmp.fa")
misc_files = [f1,f2,f3,f4,f5,f6,f7]
for f in seeds + mum + sla + reads_tmp + minimap_tmp + ultra_tmp+ misc_files:
if os.path.isfile(f):
os.remove(f)
print("removed:", f)
print("Done.")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="uLTRA -- Align and classify long transcriptomic reads based on colinear chaining algorithms to gene regions")
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
subparsers = parser.add_subparsers(help='Subcommands for eaither constructing a graph, or align reads')
# parser.add_argument("-v", help='Different subcommands for eaither constructing a graph, or align reads')
pipeline_parser = subparsers.add_parser('pipeline', help= "Perform all in one: prepare splicing database and reference sequences and align reads.")
indexing_parser = subparsers.add_parser('index', help= "Construct data structures used for alignment.")
align_reads_parser = subparsers.add_parser('align', help="Classify and align reads with colinear chaining to DAGs")
pipeline_parser.add_argument('ref', type=str, help='Reference genome (fasta)')
pipeline_parser.add_argument('gtf', type=str, help='Path to gtf file with gene models.')
pipeline_parser.add_argument('reads', type=str, help='Path to fasta/fastq file with reads.')
pipeline_parser.add_argument('outfolder', type=str, help='Path to output folder.')
group = pipeline_parser.add_mutually_exclusive_group()
group.add_argument('--ont', action="store_true", help='Set parameters suitable for ONT (Sets: --min_seed 18 --s 9 --min_acc 0.6 --alignment_threshold 0.5).')
group.add_argument('--isoseq', action="store_true", help='Set parameters suitable for IsoSeq (Sets: --min_seed 20 --s 10 --min_acc 0.8 --alignment_threshold 0.5).')
pipeline_parser.add_argument('--min_segm', type=int, default=25, help='Threshold for minimum segment size considered.')
pipeline_parser.add_argument('--min_acc', type=float, default=0.5, help='Minimum accuracy of MAM to be considered in mam chaining.')
pipeline_parser.add_argument('--flank_size', type=int, default=1000, help='Size of genomic regions surrounding genes.')
pipeline_parser.add_argument('--max_intron', type=int, default=1200000, help='Set global maximum size between mems considered in chaining solution. This is otherwise inferred from GTF file per chromosome.')
pipeline_parser.add_argument('--small_exon_threshold', type=int, default=200, help='Considered in MAM solution even if they cont contain MEMs.')
pipeline_parser.add_argument('--reduce_read_ployA', type=int, default=8, help='Reduces polyA tails longer than X bases (default 8) in reads to 5bp before MEM finding. This helps MEM matching to spurios regions but does not affect final read alignments.')
pipeline_parser.add_argument('--alignment_threshold', type=int, default=0.5, help='Lower threshold for considering an alignment. \
Counted as the difference between total match score and total mismatch penalty. \
If a read has 25%% errors (under edit distance scoring), the difference between \
matches and mismatches would be (very roughly) 0.75 - 0.25 = 0.5 with default alignment parameters \
match =2, subs=-2, gap open 3, gap ext=1. Default val (0.5) sets that a score\
higher than 2*0.5*read_length would be considered an alignment, otherwise unaligned.')
pipeline_parser.add_argument('--t', dest = 'nr_cores', type=int, default=3, help='Number of cores.')
pipeline_parser.add_argument('--index', type=str, default="", help='Path to where index files will be written to (in indexing step) and read from (in alignment step) [default is the outfolder path].')
pipeline_parser.add_argument('--prefix', type=str, default="reads", help='Outfile prefix [default=reads]. "--prefix sample_X" will output a file sample_X.sam.')
pipeline_parser.add_argument('--non_covered_cutoff', type=int, default=15, help='Threshold for what is counted as varation/intron in alignment as opposed to deletion.')
pipeline_parser.add_argument('--dropoff', type=float, default=0.95, help='Ignore alignment to hits with read coverage of this fraction less than the best hit.')
pipeline_parser.add_argument('--max_loc', type=float, default=5, help='Limit read to be aligned to at most max_loc places (default 5).\
This prevents time blowup for reads from highly repetitive regions (e.g. some genomic intra-priming reads)\
but may limit all posible alignments to annotated gene families with many highly similar copies.')
pipeline_parser.add_argument('--ignore_rc', action='store_true', help='Ignore to map to reverse complement.')
pipeline_parser.add_argument('--disable_infer', action='store_true', help='Makes splice creation step much faster. This parameter can be set if gene and transcript name fields are provided in gtf file, which is standard for the ones provided by GENCODE and Ensemble.')
pipeline_parser.add_argument('--disable_mm2', action='store_true', help='Disables utilizing minimap2 to detect genomic primary alignments and to quality check uLTRAs primary alignments.\
An alignment is classified as genomic if more than --genomic_frac (default 10%%) of its aligned length is outside\
regions indexed by uLTRA. Note that uLTRA indexes flank regions such as 3 prime, 5 prime and (parts of) introns.')
pipeline_parser.add_argument('--genomic_frac', type=float, default=0.1, help='If parameter prefilter_genomic is set, this is the threshild for fraction of aligned portion of read that is outside uLTRA indexed regions to be considered genomic (default 0.1).')
pipeline_parser.add_argument('--keep_temporary_files', action='store_true', help='Keeps all intermediate files used for the alignment. This parameter is manily good for bugfixing and development.')
pipeline_parser.add_argument('--thinning', type=int, default=0, help='Seed thinning level 0-2: 0 (deactivated, default) 1 (in expectation every third base) or 2 (in expectation every fifth base). Thinning makes seedins step much faster and less memory consuming at cost of accuracy.')
pipeline_parser.add_argument('--s', type=int, default=10, help='Strobe size (default s=10). Uses strobemers of (2, s, s+1, 35).')
pipeline_parser.set_defaults(which='pipeline')
indexing_parser.add_argument('ref', type=str, help='Reference genome (fasta)')
indexing_parser.add_argument('gtf', type=str, help='Path to gtf or gtf file with gene models.')
indexing_parser.add_argument('outfolder', type=str, help='Path to output folder.')
indexing_parser.add_argument('--min_segm', type=int, default=25, help='Threshold for minimum segment size considered.')
indexing_parser.add_argument('--flank_size', type=int, default=1000, help='Size of genomic regions surrounding genes.')
indexing_parser.add_argument('--small_exon_threshold', type=int, default=200, help='Considered in MAM solution even if they cont contain MEMs.')
indexing_parser.add_argument('--disable_infer', action='store_true', help='Makes splice creation step much faster. Thes parameter can be set if gene and transcript name fields are provided in gtf file.')
indexing_parser.add_argument('--thinning', type=int, default=0, help='Seed thinning level 0-2: 0 (deactivated, default) 1 (in expectation every third base) or 2 (in expectation every fifth base). Thinning makes seedins step much faster and less memory consuming at cost of accuracy.')
indexing_parser.add_argument('--s', type=int, default=10, help='Strobe size (default s=10). Uses strobemers of (2, s, s+1, 35).')
indexing_parser.set_defaults(which='index')
align_reads_parser.add_argument('ref', type=str, help='Reference genome (fasta).')
align_reads_parser.add_argument('reads', type=str, help='Path to fasta/fastq file with reads.')
align_reads_parser.add_argument('outfolder', type=str, help='Path to output folder.')
align_reads_parser.add_argument('--t', dest = 'nr_cores', type=int, default=3, help='Number of cores.')
align_reads_parser.add_argument('--index', type=str, default="", help='Path to where index files will be read from [default is the outfolder path].')
align_reads_parser.add_argument('--prefix', default="reads", type=str, help='Outfile prefix [default=reads]. "--prefix sample_X" will output a file sample_X.sam.')
align_reads_parser.add_argument('--max_intron', type=int, default=1200000, help='Set global maximum size between mems considered in chaining solution. This is otherwise inferred from GTF file per chromosome.')
align_reads_parser.add_argument('--reduce_read_ployA', type=int, default=8, help='Reduces polyA tails longer than X bases (default 8) in reads to 5bp before MEM finding. This helps MEM matching to spurios regions but does not affect final read alignments.')
align_reads_parser.add_argument('--alignment_threshold', type=int, default=0.5, help='Lower threshold for considering an alignment. \
Counted as the difference between total match score and total mismatch penalty. \
If a read has 25%% errors (under edit distance scoring), the difference between \
matches and mismatches would be (very roughly) 0.75 - 0.25 = 0.5 with default alignment parameters \
match =2, subs=-2, gap open 3, gap ext=1. Default val (0.5) sets that a score\
higher than 2*0.5*read_length would be considered an alignment, otherwise unaligned.')
align_reads_parser.add_argument('--non_covered_cutoff', type=int, default=15, help='Threshold for what is counted as varation/intron in alignment as opposed to deletion.')
align_reads_parser.add_argument('--dropoff', type=float, default=0.95, help='Ignore alignment to hits with read coverage of this fraction less than the best hit.')
align_reads_parser.add_argument('--max_loc', type=float, default=5, help='Limit read to be aligned to at most max_loc places (default 5).\
This prevents time blowup for reads from highly repetitive regions (e.g. some genomic intra-priming reads)\
but may limit all posible alignments to annotated gene families with many highly similar copies.')
align_reads_parser.add_argument('--ignore_rc', action='store_true', help='Ignore to map to reverse complement.')
align_reads_parser.add_argument('--min_acc', type=float, default=0.5, help='Minimum accuracy of MAM to be considered in mam chaining.')
align_reads_parser.add_argument('--disable_mm2', action='store_true', help='Disables utilizing minimap2 to detect genomic primary alignments and to quality check uLTRAs primary alignments.\
An alignment is classified as genomic if more than --genomic_frac (default 10%%) of its aligned length is outside\
regions indexed by uLTRA. Note that uLTRA indexes flank regions such as 3 prime, 5 prime and (parts of) introns.')
align_reads_parser.add_argument('--genomic_frac', type=float, default=0.1, help='If parameter prefilter_genomic is set, this is the threshild for fraction of aligned portion of read that is outside uLTRA indexed regions to be considered genomic (default 0.1).')
align_reads_parser.add_argument('--keep_temporary_files', action='store_true', help='Keeps all intermediate files used for the alignment. This parameter is manily good for bugfixing and development.')
align_reads_parser.add_argument('--thinning', type=int, default=0, help='Seed thinning level (0-2): 0 (deactivated, default) 1 (in expectation every third base) or 2 (in expectation every fifth base). Thinning makes seedins step much faster and less memory consuming at cost of accuracy.')
align_reads_parser.add_argument('--s', type=int, default=10, help='Strobe size (default s=10). Uses strobemers of (2, s, s+1, 35).')
group2 = align_reads_parser.add_mutually_exclusive_group()
group2.add_argument('--ont', action="store_true", help='Set parameters suitable for ONT (Sets: --s 9 --min_acc 0.6).')
group2.add_argument('--isoseq', action="store_true", help='Set parameters suitable for IsoSeq (Sets: --s 10 --min_acc 0.8).')
align_reads_parser.set_defaults(which='align_reads')
args = parser.parse_args()
if len(sys.argv) == 1:
parser.print_help()
sys.exit()
help_functions.mkdir_p(args.outfolder)
if len(sys.argv)==1:
parser.print_help()
sys.exit()
if args.thinning < 0 or args.thinning > 2:
print("Invalid thinning level. Choose 0, 1 or 2.")
sys.exit()
if args.which == 'align_reads' or args.which == 'pipeline':
args.mm2_ksize = 15
if args.ont:
args.min_acc = 0.6
args.mm2_ksize = 14
args.s = 9
if args.isoseq:
args.min_acc = 0.8
args.s = 10
if args.which == 'index':
args.index = args.outfolder
refs, refs_lengths = load_reference(args)
prep_splicing(args, refs_lengths)
prep_seqs(args, refs, refs_lengths)
elif args.which == 'align_reads':
align_reads(args)
elif args.which == 'pipeline':
refs, refs_lengths = load_reference(args)
prep_splicing(args, refs_lengths)
prep_seqs(args, refs, refs_lengths)
align_reads(args)
else:
print('invalid call')