forked from lh3/minimap2
-
Notifications
You must be signed in to change notification settings - Fork 8
/
minimap.h
495 lines (434 loc) · 16.8 KB
/
minimap.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2017-2018 Broad Institute, Inc.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Modified Copyright (C) 2021 Intel Corporation
Contacts: Saurabh Kalikar <[email protected]>;
Vasimuddin Md <[email protected]>; Sanchit Misra <[email protected]>;
Chirag Jain <[email protected]>; Heng Li <[email protected]>
*/
#ifndef MINIMAP2_H
#define MINIMAP2_H
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
//klocwork fix -- declare constants as 64-bit
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR (0x004LL)
#define MM_F_OUT_SAM (0x008LL)
#define MM_F_NO_QUAL (0x010LL)
#define MM_F_OUT_CG (0x020LL)
#define MM_F_OUT_CS (0x040LL)
#define MM_F_SPLICE (0x080LL) // splice mode
#define MM_F_SPLICE_FOR (0x100LL) // match GT-AG
#define MM_F_SPLICE_REV (0x200LL) // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN (0x400LL)
#define MM_F_OUT_CS_LONG (0x800LL)
#define MM_F_SR (0x1000LL)
#define MM_F_FRAG_MODE (0x2000LL)
#define MM_F_NO_PRINT_2ND (0x4000LL)
#define MM_F_2_IO_THREADS (0x8000LL)
#define MM_F_LONG_CIGAR (0x10000LL)
#define MM_F_INDEPEND_SEG (0x20000LL)
#define MM_F_SPLICE_FLANK (0x40000LL)
#define MM_F_SOFTCLIP (0x80000LL)
#define MM_F_FOR_ONLY (0x100000LL)
#define MM_F_REV_ONLY (0x200000LL)
#define MM_F_HEAP_SORT (0x400000LL)
#define MM_F_ALL_CHAINS (0x800000LL)
#define MM_F_OUT_MD (0x1000000LL)
#define MM_F_COPY_COMMENT (0x2000000LL)
#define MM_F_EQX (0x4000000LL) // use =/X instead of M
#define MM_F_PAF_NO_HIT (0x8000000LL) // output unmapped reads to PAF
#define MM_F_NO_END_FLT (0x10000000LL)
#define MM_F_HARD_MLEVEL (0x20000000LL)
#define MM_F_SAM_HIT_ONLY (0x40000000LL)
#if 0
#define MM_F_NO_DIAG 0x001 // no exact diagonal hit
#define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR 0x004
#define MM_F_OUT_SAM 0x008
#define MM_F_NO_QUAL 0x010
#define MM_F_OUT_CG 0x020
#define MM_F_OUT_CS 0x040
#define MM_F_SPLICE 0x080 // splice mode
#define MM_F_SPLICE_FOR 0x100 // match GT-AG
#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN 0x400
#define MM_F_OUT_CS_LONG 0x800
#define MM_F_SR 0x1000
#define MM_F_FRAG_MODE 0x2000
#define MM_F_NO_PRINT_2ND 0x4000
#define MM_F_2_IO_THREADS 0x8000
#define MM_F_LONG_CIGAR 0x10000
#define MM_F_INDEPEND_SEG 0x20000
#define MM_F_SPLICE_FLANK 0x40000
#define MM_F_SOFTCLIP 0x80000
#define MM_F_FOR_ONLY 0x100000
#define MM_F_REV_ONLY 0x200000
#define MM_F_HEAP_SORT 0x400000
#define MM_F_ALL_CHAINS 0x800000
#define MM_F_OUT_MD 0x1000000
#define MM_F_COPY_COMMENT 0x2000000
#define MM_F_EQX 0x4000000 // use =/X instead of M
#define MM_F_PAF_NO_HIT 0x8000000 // output unmapped reads to PAF
#define MM_F_NO_END_FLT 0x10000000
#define MM_F_HARD_MLEVEL 0x20000000
#define MM_F_SAM_HIT_ONLY 0x40000000
#endif
#define MM_F_RMQ (0x80000000LL)
#define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL)
#define MM_F_NO_HASH_NAME (0x400000000LL)
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
#define MM_IDX_MAGIC "MMI\2"
#define MM_MAX_SEG 255
#define MM_CIGAR_MATCH 0
#define MM_CIGAR_INS 1
#define MM_CIGAR_DEL 2
#define MM_CIGAR_N_SKIP 3
#define MM_CIGAR_SOFTCLIP 4
#define MM_CIGAR_HARDCLIP 5
#define MM_CIGAR_PADDING 6
#define MM_CIGAR_EQ_MATCH 7
#define MM_CIGAR_X_MISMATCH 8
#define MM_CIGAR_STR "MIDNSHP=XB"
#ifdef __cplusplus
extern "C" {
#endif
// emulate 128-bit integers and arrays
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
// minimap2 index
typedef struct {
char *name; // name of the db sequence
uint64_t offset; // offset in mm_idx_t::S
uint32_t len; // length
uint32_t is_alt;
} mm_idx_seq_t;
typedef struct {
int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences
int32_t index;
int32_t n_alt;
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
struct mm_idx_intv_s *I; // intervals (hidden)
void *km, *h;
} mm_idx_t;
// minimap2 alignment
typedef struct {
uint32_t capacity; // the capacity of cigar[]
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
uint32_t n_cigar; // number of cigar operations in cigar[]
uint32_t cigar[];
} mm_extra_t;
typedef struct {
int32_t id; // ID for internal uses (see also parent below)
int32_t cnt; // number of minimizers; if on the reverse strand
int32_t rid; // reference index; if this is an alignment from inversion rescue
int32_t score; // DP alignment score
int32_t qs, qe, rs, re; // query start and end; reference start and end
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
int32_t as; // offset in the a[] array (for internal uses only)
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
int32_t n_sub; // number of suboptimal mappings
int32_t score0; // initial chaining score (before chain merging/spliting)
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
uint32_t hash;
float div;
mm_extra_t *p;
} mm_reg1_t;
// indexing and mapping options
typedef struct {
short k, w, flag, bucket_bits;
int64_t mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
typedef struct {
int64_t flag; // see MM_F_* macros
int seed;
int sdust_thres; // score threshold for SDUST; 0 to disable
int max_qlen; // max query length
int bw, bw_long; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float chain_gap_scale;
float chain_skip_scale;
int rmq_size_cap, rmq_inner_dist;
int rmq_rescue_size;
float rmq_rescue_ratio;
float mask_level;
int mask_len;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
int end_bonus;
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;
int anchor_ext_len, anchor_ext_shift;
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
int rank_min_len;
float rank_frac;
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
float q_occ_frac;
int32_t min_mid_occ, max_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ, max_max_occ, occ_dist;
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;
int64_t cap_kalloc;
const char *split_prefix;
} mm_mapopt_t;
// index reader
typedef struct {
int is_idx, n_parts;
int64_t idx_size;
mm_idxopt_t opt;
FILE *fp_out;
union {
struct mm_bseq_file_s *seq;
FILE *idx;
} fp;
} mm_idx_reader_t;
// memory buffer for thread-local storage during mapping
typedef struct mm_tbuf_s mm_tbuf_t;
// global variables
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
extern double mm_realtime0; // wall-clock timer
/**
* Set default or preset parameters
*
* @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters
* @param io pointer to indexing parameters
* @param mo pointer to mapping parameters
*
* @return 0 if success; -1 if _present_ unknown
*/
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
/**
* Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
*
* If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
* more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
* occurrence.
*
* @param opt mapping parameters
* @param mi minimap2 index
*/
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
/**
* Initialize an index reader
*
* @param fn index or fasta/fastq file name (this function tests the file type)
* @param opt indexing parameters
* @param fn_out if not NULL, write built index to this file
*
* @return an index reader on success; NULL if fail to open _fn_
*/
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
/**
* Read/build an index
*
* If the input file is an index file, this function reads one part of the
* index and returns. If the input file is a sequence file (fasta or fastq),
* this function constructs the index for about mm_idxopt_t::batch_size bases.
* Importantly, for a huge collection of sequences, this function may only
* return an index for part of sequences. It needs to be repeatedly called
* to traverse the entire index/sequence file.
*
* @param r index reader
* @param n_threads number of threads for constructing index
*
* @return an index on success; NULL if reaching the end of the input file
*/
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
/**
* Destroy/deallocate an index reader
*
* @param r index reader
*/
void mm_idx_reader_close(mm_idx_reader_t *r);
int mm_idx_reader_eof(const mm_idx_reader_t *r);
/**
* Check whether the file contains a minimap2 index
*
* @param fn file name
*
* @return the file size if fn is an index file; 0 if fn is not.
*/
int64_t mm_idx_is_idx(const char *fn);
/**
* Load a part of an index
*
* Given a uni-part index, this function loads the entire index into memory.
* Given a multi-part index, it loads one part only and places the file pointer
* at the end of that part.
*
* @param fp pointer to FILE object
*
* @return minimap2 index read from fp
*/
mm_idx_t *mm_idx_load(FILE *fp);
/**
* Append an index (or one part of a full index) to file
*
* @param fp pointer to FILE object
* @param mi minimap2 index
*/
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
/**
* Store hash table from minimap2 index into a file
* @param f_name File name for output file
* @param mi minimap2 index
*/
void mm_idx_dump_hash(const char* f_name, const mm_idx_t *mi);
/**
* Create an index from strings in memory
*
* @param w minimizer window size
* @param k minimizer k-mer size
* @param is_hpc use HPC k-mer if true
* @param bucket_bits number of bits for the first level of the hash table
* @param n number of sequences
* @param seq sequences in A/C/G/T
* @param name sequence names; could be NULL
*
* @return minimap2 index
*/
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
/**
* Print index statistics to stderr
*
* @param mi minimap2 index
*/
void mm_idx_stat(const mm_idx_t *idx);
/**
* Destroy/deallocate an index
*
* @param r minimap2 index
*/
void mm_idx_destroy(mm_idx_t *mi);
/**
* Destroy/deallocate an hash table index
*
* @param r minimap2 index
*/
void mm_idx_destroy_mm_hash(mm_idx_t *mi);
/**
* Destroy/deallocate target sequences
*
* @param r minimap2 index
*/
void mm_idx_destroy_seq(mm_idx_t *mi);
/**
* Initialize a thread-local buffer for mapping
*
* Each mapping thread requires a buffer specific to the thread (see mm_map()
* below). The primary purpose of this buffer is to reduce frequent heap
* allocations across threads. A buffer shall not be used by two or more
* threads.
*
* @return pointer to a thread-local buffer
*/
mm_tbuf_t *mm_tbuf_init(void);
/**
* Destroy/deallocate a thread-local buffer for mapping
*
* @param b the buffer
*/
void mm_tbuf_destroy(mm_tbuf_t *b);
void *mm_tbuf_get_km(mm_tbuf_t *b);
/**
* Align a query sequence against an index
*
* This function possibly finds multiple alignments of the query sequence.
* The returned array and the mm_reg1_t::p field of each element are allocated
* with malloc().
*
* @param mi minimap2 index
* @param l_seq length of the query sequence
* @param seq the query sequence
* @param n_regs number of hits (out)
* @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
* @param opt mapping parameters
* @param name query name, used for all-vs-all overlapping and debugging
*
* @return an array of hits which need to be deallocated with free() together
* with mm_reg1_t::p of each element. The size is written to _n_regs_.
*/
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
/**
* Align a fasta/fastq file and print alignments to stdout
*
* @param idx minimap2 index
* @param fn fasta/fastq file name
* @param opt mapping parameters
* @param n_threads number of threads
*
* @return 0 on success; -1 if _fn_ can't be read
*/
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
/**
* Generate the cs tag (new in 2.12)
*
* @param km memory blocks; set to NULL if unsure
* @param buf buffer to write the cs/MD tag; typicall NULL on the first call
* @param max_len max length of the buffer; typically set to 0 on the first call
* @param mi index
* @param r alignment
* @param seq query sequence
* @param no_iden true to use : instead of =
*
* @return the length of cs
*/
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
#ifdef __cplusplus
}
#endif
#endif // MINIMAP2_H