-
Notifications
You must be signed in to change notification settings - Fork 24
/
pandaseq-common.h
638 lines (582 loc) · 16 KB
/
pandaseq-common.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
/* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers.
Copyright (C) 2011-2012 Andre Masella
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _PANDASEQ_COMMON_H
# define _PANDASEQ_COMMON_H
/*
 * Linkage wrappers. Under a C++ compiler EXTERN_C_BEGIN opens an
 * extern "C" block and EXTERN_C_END closes it; under a C compiler both
 * expand to nothing.
 */
#if defined(__cplusplus)
#define EXTERN_C_BEGIN extern "C" {
#define EXTERN_C_END }
#else
#define EXTERN_C_BEGIN
#define EXTERN_C_END
#endif
/*
 * Declaration prefix. Outside Windows it is plain extern. On Windows it
 * adds __declspec(dllexport) when PANDA_LIB_COMPILING is defined and
 * __declspec(dllimport) otherwise.
 */
#if !defined(_WIN32)
#define PANDA_EXTERN extern
#elif defined(PANDA_LIB_COMPILING)
#define PANDA_EXTERN extern __declspec(dllexport)
#else
#define PANDA_EXTERN extern __declspec(dllimport)
#endif
# include <stdarg.h>
# include <stdio.h>
# include <stdbool.h>
EXTERN_C_BEGIN
/*
 * Accessor behind PANDA_MAX_LEN.
 * NOTE(review): declared with plain `extern` rather than PANDA_EXTERN;
 * confirm this is intended for Windows DLL builds.
 */
extern size_t panda_max_len(void);

/**
 * Maximum length of a sequence.
 *
 * This expands to a function call, so it is not a compile-time constant.
 */
#define PANDA_MAX_LEN (panda_max_len())

/* Size of the tag buffer in panda_seq_identifier. */
#define PANDA_TAG_LEN 50
/* === Objects === */
/**
 * Opaque handle to the algorithm used to select the overlap.
 */
typedef struct panda_algorithm *PandaAlgorithm;
/**
 * Pointer to the read-only class template of an algorithm.
 *
 * This is how the library imitates inheritance in C.
 */
typedef const struct panda_algorithm_class *PandaAlgorithmClass;
/**
 * Opaque handle to the manager for an assembly.
 */
typedef struct panda_assembler *PandaAssembler;
/**
 * Opaque handle to the standard argument handler for a pair of Illumina FASTQ files.
 */
typedef struct panda_args_fastq *PandaArgsFastq;
/**
 * Opaque handle to the standard argument handler for the overhanging read pair trimmer.
 */
typedef struct panda_args_hang *PandaArgsHang;
/**
 * Opaque iterator that walks a sequence and presents every k-mer free of Ns or other degenerate bases.
 *
 * Iterators are not reference counted.
 */
typedef struct panda_iter *PandaIter;
/**
 * Opaque structure used to read lines from an input stream.
 */
typedef struct panda_linebuf *PandaLineBuf;
/**
 * Opaque proxy object for logging.
 */
typedef struct panda_log_proxy *PandaLogProxy;
/**
 * Opaque handle to a sequence validity checker.
 */
typedef struct panda_module *PandaModule;
/**
 * Opaque thread-safe wrapper that lets several assemblers share one data source.
 */
typedef struct panda_mux *PandaMux;
/**
 * Opaque set of sequence identifiers to match against.
 */
typedef struct panda_idset *PandaSet;
/**
 * Opaque transaction stream writer that improves output throughput when using threads.
 *
 * Writes are buffered into small transactions which are then written to the
 * output in groups. This is meant to reduce contention, because a thread does
 * not have to take the output lock for every individual write.
 */
typedef struct panda_writer *PandaWriter;
/* === Enum and Flags === */
/**
 * Condition codes reported to the logger during assembly.
 *
 * Some codes denote errors; others are purely informational. The numeric
 * values follow declaration order starting at zero, so new codes must be
 * appended to keep existing values stable.
 */
typedef enum {
    PANDA_CODE_BAD_NT = 0,
    PANDA_CODE_BEST_OVERLAP,
    PANDA_CODE_BUILD_FORWARD,
    PANDA_CODE_BUILD_OVERLAP,
    PANDA_CODE_BUILD_REVERSE,
    PANDA_CODE_FORWARD_KMER,
    PANDA_CODE_ID_PARSE_FAILURE,
    PANDA_CODE_INSUFFICIENT_KMER_TABLE,
    PANDA_CODE_LOST_KMER,
    PANDA_CODE_LOW_QUALITY_REJECT,
    PANDA_CODE_MISMATCHED_BASE,
    PANDA_CODE_MOD_INFO,
    PANDA_CODE_NEGATIVE_SEQUENCE_LENGTH,
    PANDA_CODE_NO_DATA,
    PANDA_CODE_NO_FILE,
    PANDA_CODE_NO_FORWARD_PRIMER,
    PANDA_CODE_NO_QUALITY_INFO,
    PANDA_CODE_NO_REVERSE_PRIMER,
    PANDA_CODE_NOT_PAIRED,
    PANDA_CODE_OVERLAP_POSSIBILITY,
    PANDA_CODE_PARSE_FAILURE,
    PANDA_CODE_PREMATURE_EOF,
    PANDA_CODE_READ_TOO_LONG,
    PANDA_CODE_RECONSTRUCTION_PARAM,
    PANDA_CODE_REJECT_STAT,
    PANDA_CODE_REVERSE_KMER,
    PANDA_CODE_SEQUENCE_TOO_LONG,
    PANDA_CODE_PHRED_OFFSET
} PandaCode;
/**
 * Selects which kinds of messages are passed to the logger.
 */
typedef unsigned int PandaDebug;
/**
 * Policy for Illumina tags/barcodes in sequence names.
 */
typedef enum {
    /** Parsing should return an error if the sequence does not have a tag. */
    PANDA_TAG_PRESENT = 0,
    /** Parsing should return an error if the sequence has a tag. */
    PANDA_TAG_ABSENT,
    /** Parsing should not care whether the sequence has a tag. */
    PANDA_TAG_OPTIONAL
} PandaTagging;
/**
 * One nucleotide.
 */
typedef char panda_nt;
/**
 * Format of a FASTQ header.
 *
 * Values follow declaration order starting at zero.
 */
typedef enum {
    PANDA_IDFMT_UNKNOWN = 0,
    PANDA_IDFMT_SRA,
    PANDA_IDFMT_CASAVA_1_4,
    PANDA_IDFMT_CASAVA_1_7,
    PANDA_IDFMT_EBI_SRA,
    PANDA_IDFMT_CASAVA_CONVERTED
} PandaIdFmt;
/* === Structures === */
/**
 * A k-mer paired with its position in the original sequence.
 */
typedef struct {
    /** The k-mer value. */
    size_t kmer;
    /** Position of the k-mer in the original sequence. */
    size_t posn;
} panda_kmer;
/**
 * One nucleotide together with its quality information.
 */
typedef struct {
    /** The nucleotide. */
    panda_nt nt;
    /** The quality score, expressed as a PHRED score. */
    char qual;
} panda_qual;
/**
 * One nucleotide together with its quality as a log probability.
 */
typedef struct {
    /** The nucleotide. */
    panda_nt nt;
    /** The quality score, expressed as a log probability. */
    double p;
} panda_result;
/**
 * Illumina sequence information taken from the FASTQ header.
 *
 * The text fields are fixed-size buffers embedded in the structure.
 */
typedef struct {
    char instrument[100];
    char run[100];
    char flowcell[100];
    int lane;
    int tile;
    int x;
    int y;
    /** Tag buffer of PANDA_TAG_LEN bytes. */
    char tag[PANDA_TAG_LEN];
} panda_seq_identifier;
/**
 * Description of a command line option.
 */
typedef struct panda_tweak_general {
    /** The command line option. */
    char flag;
    /** Whether the flag needs to be specified. Only the help output uses this. */
    bool optional;
    /** Name of the argument as shown in the help. If null, the argument is assumed to be boolean. */
    const char *takes_argument;
    /** Help information displayed to the user. */
    const char *help;
    /** Whether the argument can be repeated. Only considered when this is not a boolean flag. */
    bool repeatable;
} panda_tweak_general;
/**
 * A reconstructed sequence together with its meta information.
 */
typedef struct {
    /** Calculated quality score as the log of the geometric mean of the product of the Illumina quality scores of the included bases. */
    double quality;
    /** Count of uncalled bases in the sequence. */
    size_t degenerates;
    /** Identification information for the sequence. */
    panda_seq_identifier name;
    /** The reconstructed sequence, with quality information. */
    panda_result *sequence;
    size_t sequence_length;
    /** The original forward sequence. */
    panda_qual const *forward;
    size_t forward_length;
    /** The original reverse sequence. */
    panda_qual const *reverse;
    size_t reverse_length;
    /** How many nucleotides were clipped from the forward sequence. */
    size_t forward_offset;
    /** How many nucleotides were clipped from the reverse sequence. */
    size_t reverse_offset;
    /** How many mismatches occur in the overlap region. */
    size_t overlap_mismatches;
    /** How many overlaps were examined to determine the one finally used. */
    size_t overlaps_examined;
    /** The overlap that was chosen. */
    size_t overlap;
    /** Probability, by the original estimation, that the overlap region is the correct one. */
    double estimated_overlap_probability;
} panda_result_seq;
/* === Function Pointers === */
/**
 * Build a new algorithm from a string of arguments.
 * @args: (allow-none): The arguments from the user.
 */
typedef PandaAlgorithm (*PandaAlgorithmCreate)(const char *args);
/**
 * Produce an assembled sequence from a read pair.
 *
 * While doing diffs this is normally panda_assembler_next, but a mock can be substituted if needed.
 */
typedef const panda_result_seq *(*PandaAssemble)(void *user_data,
                                                 panda_seq_identifier *id,
                                                 const panda_qual *forward,
                                                 size_t forward_length,
                                                 const panda_qual *reverse,
                                                 size_t reverse_length);
/**
 * Validity check applied to a sequence after reconstruction.
 */
typedef bool (*PandaCheck)(PandaLogProxy logger,
                           const panda_result_seq *sequence,
                           void *user_data);
/**
 * Compute the probability of a match or mismatch from two quality scores.
 *
 * @private_data: (closure): the private data for the algorithm
 * @match: whether the nucleotides match
 * @a: the PHRED score of one base
 * @b: the PHRED score of the other base
 * Returns: the log probability of the match.
 */
typedef double (*PandaComputeMatch)(void *private_data, bool match, char a, char b);
/**
 * Compute the probability that an offset is a good one.
 *
 * @private_data: (closure): the private data for the algorithm
 * @forward: (array length=forward_length): the forward read
 * @reverse: (array length=reverse_length): the reverse read
 * @overlap: the overlap length to check
 * Returns: the log probability that the overlap is correct.
 */
typedef double (*PandaComputeOverlap)(void *private_data,
                                      const panda_qual *forward,
                                      size_t forward_length,
                                      const panda_qual *reverse,
                                      size_t reverse_length,
                                      size_t overlap);
/**
 * Release user data.
 *
 * Every method that accepts user data alongside a function pointer calls a
 * destroy function once the user data is no longer needed, so that its memory
 * can be freed if necessary. A destroy function may always be null, in which
 * case memory management is the caller's responsibility.
 */
typedef void (*PandaDestroy)(void *user_data);
/**
 * Handler for a failed alignment.
 *
 * Invoked when an assembler cannot align a sequence because it is unable to compute a reasonable overlap.
 * @assembler: the assembler that made the attempt
 * @id: the sequence id of the failed pair
 * @forward: (array length=forward_length): the forward read
 * @reverse: (array length=reverse_length): the reverse read
 * @user_data: (closure): context data
 */
typedef void (*PandaFailAlign)(PandaAssembler assembler,
                               const panda_seq_identifier *id,
                               const panda_qual *forward,
                               size_t forward_length,
                               const panda_qual *reverse,
                               size_t reverse_length,
                               void *user_data);
/**
 * Fetch the next characters from an input stream.
 *
 * When assembling from an alternate source of data, this function reads data from the stream.
 * @buffer:(array length=buffer_length): the buffer to fill
 * @read:(out): the number of bytes successfully read from the buffer
 * @data: (closure): some user context data provided
 * Returns: false if an error occurred, true otherwise
 */
typedef bool (*PandaBufferRead)(char *buffer, size_t buffer_length, size_t *read, void *data);
/**
 * Send data to an output stream.
 * @buffer:(array length=buffer_length): the buffer to write
 * @data: (closure): some user context data provided
 */
typedef void (*PandaBufferWrite)(const char *buffer, size_t buffer_length, void *data);
/**
 * Handle one key-value pair.
 *
 * Returns: whether processing was successful.
 */
typedef bool (*PandaKeyParsed)(const char *key, const char *value, void *data);
/**
 * Callback used when iterating over the current modules.
 * @assembler: the assembler which is being queried
 * @module: the module selected
 * @rejected: the number of sequences this module has rejected in the context of the current assembler.
 * @data: (closure): some user context data provided
 * Returns: true to continue iterating, false to stop
 */
typedef bool (*PandaModuleCallback)(PandaAssembler assembler,
                                    PandaModule module,
                                    size_t rejected,
                                    void *data);
/**
 * Fetch the next sequence pair.
 *
 * When assembling from a non-FASTQ text source, this function can supply the next sequence. It must provide the sequences and metadata for assembly by modifying the values of its parameters.
 * @id: (out caller-allocates): the identifier information for the sequence pair
 * @forward: (array length=forward_length) (allow-none): the location of the parsed sequence data of the forward read. This memory is not managed by the assembler.
 * @reverse: (array length=reverse_length) (allow-none): the location of the parsed sequence data of the reverse read. This memory is not managed by the assembler.
 * Returns: true if a sequence is available, in which case all the parameters must be set correctly. If false, no more sequences will be read and the values in the parameters are ignored.
 */
typedef bool (*PandaNextSeq)(panda_seq_identifier *id,
                             const panda_qual **forward,
                             size_t *forward_length,
                             const panda_qual **reverse,
                             size_t *reverse_length,
                             void *user_data);
/**
 * Write a finished sequence to an appropriate place.
 * @sequence: the sequence from assembly
 * @user_data: (closure): the context provided
 *
 * NOTE(review): the meaning of the boolean result is not documented here; confirm against the callers.
 */
typedef bool (*PandaOutputSeq)(const panda_result_seq *sequence, void *user_data);
/**
 * Validity check applied to a read pair before reconstruction.
 * @forward: (array length=forward_length): The forward read.
 * @reverse: (array length=reverse_length): The reverse read.
 */
typedef bool (*PandaPreCheck)(PandaLogProxy logger,
                              const panda_seq_identifier *id,
                              const panda_qual *forward,
                              size_t forward_length,
                              const panda_qual *reverse,
                              size_t reverse_length,
                              void *user_data);
/**
 * Output function in the style of printf.
 *
 * @data: (closure): The context for the error logging.
 * @format: (printf-like): The format string for use by printf.
 */
typedef void (*PandaPrintf)(void *data, const char *format, ...);
/**
 * Create a sequence reader once argument parsing is done.
 *
 * Yields a sequence source and a failure handler so that assembly can proceed.
 * @logger: The logging proxy to use, if needed.
 * @fail: (closure fail_data) (transfer full) (allow-none) (out callee-allocates) (scope notified): the handler for any sequences which do not align, if desired.
 * @user_data:(closure): the context
 * Returns: (closure next_data) (scope notified) (allow-none): the sequence source, or null to indicate a failure
 */
typedef PandaNextSeq (*PandaOpener)(void *user_data,
                                    PandaLogProxy logger,
                                    PandaFailAlign *fail,
                                    void **fail_data,
                                    PandaDestroy *fail_destroy,
                                    void **next_data,
                                    PandaDestroy *next_destroy);
/**
 * Apply any modifications to the assembler after it has been created.
 */
typedef bool (*PandaSetup)(void *user_data, PandaAssembler assembler);
/**
 * Handle a command-line flag specified by the user.
 * @assembler: the assembler to which to make the adjustments
 * @flag: the command line flag specified
 * @argument: (transfer full) (allow-none): the command line argument, or null if not set.
 * Returns: whether the flag was parsed successfully
 */
typedef bool (*PandaTweakAssembler)(PandaAssembler assembler, char flag, char *argument);
/**
 * Handle a command-line flag specified by the user.
 * @user_data: (closure): the context
 * @flag: the command line flag specified
 * @argument: the option passed with the flag, if requested.
 * Returns: whether the flag was parsed successfully
 */
typedef bool (*PandaTweakGeneral)(void *user_data, char flag, const char *argument);
/* === Round 2 Structures === */
/**
 * Base structure describing an assembly algorithm.
 */
struct panda_algorithm_class {
    /** Number of bytes to allocate for the private data. */
    size_t data_size;
    /** Name of the algorithm. */
    const char *name;
    /** Constructs an algorithm from a string of arguments. */
    PandaAlgorithmCreate create;
    /** Destroy function; NOTE(review): presumably applied to the private data, confirm. */
    PandaDestroy data_destroy;
    /** Computes the log probability that an overlap is correct. */
    PandaComputeOverlap overlap_probability;
    /** Computes the log probability of a match or mismatch from two PHRED scores. */
    PandaComputeMatch match_probability;
    /** NOTE(review): presumably the probability used for unpaired bases; confirm against the algorithm implementations. */
    const double prob_unpaired;
};
/**
 * Description of a command line option that can be applied to an assembler.
 */
typedef struct panda_tweak_assembler {
    /** The command line option. */
    char flag;
    /** Name of the argument as shown in the help. If null, the argument is assumed to be boolean. */
    const char *takes_argument;
    /** Description of the option. */
    const char *help;
    /** Callback that makes the appropriate changes to the assembler. */
    PandaTweakAssembler setup;
    /** Whether the argument can be repeated. Only considered when this is not a boolean flag. */
    bool repeatable;
} panda_tweak_assembler;
/**
 * An assembler tweak paired with an argument string.
 *
 * NOTE(review): presumably @arg is the argument the user supplied for @tweak; confirm ownership of the string with the argument parser.
 */
typedef struct {
    /** The option description this entry refers to. */
    const panda_tweak_assembler *tweak;
    /** The argument string associated with the tweak. */
    char *arg;
} panda_tweak_assembler_opt;
EXTERN_C_END
#endif