-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodify_alleles.cpp
492 lines (437 loc) · 16.1 KB
/
modify_alleles.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
// ******************************************************
// vcfCTools (c) 2011 Alistair Ward
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ------------------------------------------------------
// Last modified: 18 February 2011
// ------------------------------------------------------
// Tools for modifying the alleles. This includes
// trimming the reference and alternate alleles as well
// as using the Smith-Waterman algorithm for aligning
// alleles to each other.
// ******************************************************
#define GAP '-'
#define MATCH '='
#define MISMATCH 'X'
#define INSERTION 'I'
#define DELETION 'D'
#include "modify_alleles.h"
using namespace std;
using namespace vcfCTools;
// Constructor
modifyAlleles::modifyAlleles(string& refSeq, int pos, string ref, string alt) {
originalPosition = pos;
originalRef = ref;
originalAlt = alt;
referenceSequence = refSeq;
};
// Destructor.
modifyAlleles::~modifyAlleles(void) {};
// Trim the reference and alternate alleles to produce an unambiguous
// description of the alleles. For example, a SNP could be represented
// as CAT -> CAG as a result of other variants appearing in the same
// record. If this is the case, the unambiguous description results when
// the alleles are trimmed to reveal T -> G.
void modifyAlleles::trim() {
modifiedPosition = originalPosition;
workingRef = originalRef;
workingAlt = originalAlt;
// Start at the end and work backwards.
string::reverse_iterator rRefIter = workingRef.rbegin();
string::reverse_iterator rAltIter = workingAlt.rbegin();
while ( ( rRefIter != workingRef.rend() ) && ( *rRefIter == *rAltIter) && workingRef.length() > 1 && workingAlt.length() > 1) {
workingRef.erase( --(rRefIter.base()) );
rRefIter = workingRef.rbegin();
workingAlt.erase( --(rAltIter.base()) );
rAltIter = workingAlt.rbegin();
}
// Start at the beginning and work forwards.
string::iterator refIter = workingRef.begin();
string::iterator altIter = workingAlt.begin();
while ( ( refIter != workingRef.end() ) && ( *refIter == *altIter) && workingRef.length() > 1 && workingAlt.length() > 1 ) {
modifiedPosition++;
refIter = workingRef.erase(refIter);
altIter = workingAlt.erase(altIter);
}
// Update the modified alleles.
modifiedRef = workingRef;
modifiedAlt = workingAlt;
}
// Extend the reference and alternate alleles using both the reference sequence
// and a string of Z's.
void modifyAlleles::extendAlleles() {
string anchor;
anchor.assign(50, 'Z');
flankLength = 50;
getFlankingReference();
// Build the alleles to align.
workingRef = anchor + flankFront + modifiedRef + flankEnd + anchor;
workingAlt = anchor + flankFront + modifiedAlt + flankEnd + anchor;
}
// Align the alleles to each other.
void modifyAlleles::alignAlleles() {
// Align the alleles to each other.
smithWaterman();
// Generate the CIGAR string separating matches and mismatches.
generateCigar();
cerr << modifiedRef << endl << modifiedAlt << endl << endl;
cerr << workingRef << endl << workingAlt << endl << endl;
cerr << cigar << endl;
exit(0);
// The variant type is known, so check that the CIGAR contains the expected
// values.
//checkAlignment();
// Determine if the allele is fully left aligned and realign if not. Find
// the genomic coordinate of the new alleles.
//processAlignment();
//leftAlign();
}
// Use fastahack to find the reference flanking the ref allele.
void modifyAlleles::getFlankingReference() {
// Add flanking sequence to the ref and alt allele in order to perform a Smith
// Waterman alignemnt.
int frontPos = modifiedPosition - flankLength - 1;
int length = modifiedRef.length() + 2 * flankLength;
if (frontPos <= 0) {frontPos = 1;}
FastaReference* fr = new FastaReference(fasta);
flank = fr->getSubSequence(referenceSequence, frontPos, length);
delete fr;
flankFront = flank.substr(0, flankLength);
flankEnd = flank.substr(flankLength + modifiedRef.length(), flankLength);
}
// Set parameters and use the Smith-Waterman algorithm to align the reference
// to the alternate allele.
void modifyAlleles::smithWaterman() {
// Initialise Smith-Waterman parameters.
unsigned int referencePos;
float matchScore = 10.0f;
float mismatchScore = -9.0f;
float gapOpenPenalty = 15.0f;
float gapExtendPenalty = 6.66f;
const char* reference = workingRef.c_str();
const char* query = workingAlt.c_str();
const unsigned int referenceLen = strlen(reference);
const unsigned int queryLen = strlen(query);
// Call the Smith-Waterman routines.
CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
sw.Align(referencePos, workingRef, workingAlt, reference, referenceLen, query, queryLen);
}
// After the alignment has taken place, generate the CIGAR string.
// Distinguish between matches and mismatches ('=' and 'X') instead
// of the standard 'M' for all.
void modifyAlleles::generateCigar() {
bool gapRegion = false;
bool matchRegion = false;
bool mismatchRegion = false;
numberMatch = 0;
numberMismatch = 0;
numberDeletion = 0;
numberInsertion = 0;
matchGroups = 0;
mismatchGroups = 0;
insertionGroups = 0;
deletionGroups = 0;
ostringstream oCigar;
unsigned int m = 0, x = 0, i = 0, d = 0;
// Strip the leading and lagging Z's.
workingRef = workingRef.substr(15, workingRef.size() - 30);
workingAlt = workingAlt.substr(15, workingAlt.size() - 30);
string::iterator refIter = workingRef.begin();
string::iterator altIter = workingAlt.begin();
for (; refIter != workingRef.end(); refIter++) {
// If both ref and alt alleles are bases and not gaps.
if (*refIter != GAP && *altIter != GAP) {
// Write out to the CIGAR string if the sequence has
// changed from gapped to match/mismatch.
if (gapRegion) {
if (d != 0) {
oCigar << d << "D";
deletionGroups++;
} else {
oCigar << i << "I";
insertionGroups++;
}
}
gapRegion = false;
// If the bases match, add one to the m value.
if (*refIter == *altIter) {
if (!matchRegion && x != 0) {
oCigar << x << "X";
mismatchGroups++;
}
m++;
numberMatch++;
x = 0;
matchRegion = true;
mismatchRegion = false;
// If the bases mismatch.
} else {
if (!mismatchRegion && m != 0) {
oCigar << m << "=";
matchGroups++;
}
x++;
numberMismatch++;
m = 0;
matchRegion = false;
mismatchRegion = true;
}
// Reset the deletion and insertion lengths to zero since the bases matched/mismatched
// at this position.
d = 0;
i = 0;
// If the ref and alt both have a gap at this position.
} else {
if (!gapRegion) {
// If not already in a gap region, flush the matches/mismatches to
// the CIGAR.
if (m != 0) {
oCigar << m << '=';
matchGroups++;
} else if (x != 0) {
oCigar << x << 'X';
mismatchGroups++;
}
}
gapRegion = true;
matchRegion = false;
mismatchRegion = false;
m = 0;
x = 0;
if (*refIter == GAP ) {
if (d != 0) {
oCigar << d << 'D';
deletionGroups++;
}
i++;
numberInsertion++;
d = 0;
} else {
if (i != 0) {
oCigar << i << 'I';
insertionGroups++;
}
d++;
numberDeletion++;
i = 0;
}
}
altIter++;
}
if (m != 0) {
oCigar << m << '=';
matchGroups++;
} else if (x != 0) {
oCigar << d << 'X';
mismatchGroups++;
} else if (d != 0) {
oCigar << d << 'D';
deletionGroups++;
} else if (i != 0) {
oCigar << i << 'I';
insertionGroups++;
}
cigar = oCigar.str();
}
// Check that the alignment has produced the expected results. For example,
// if the variant is a deletion, there should only be matches and deletions
// in the CIGAR string. There should also only be a single contiguous
// deletion.
void modifyAlleles::checkAlignment() {
// SNPs.
if (type.isBiallelicSnp || type.isTriallelicSnp || type.isQuadallelicSnp) {
if ( (numberInsertion + numberDeletion) != 0 || numberMismatch != 1) {
cerr << "ERROR: Alignment of SNP failed. CIGAR: " << cigar << endl;
exit(1);
}
// MNPs.
} else if (type.isMnp) {
if ( (numberInsertion + numberDeletion) != 0) {
cerr << "ERROR: Alignment of MNP failed. CIGAR: " << cigar << endl;
exit(1);
}
// Insertions.
} else if (type.isInsertion) {
if ( (numberMismatch + numberDeletion) != 0 || insertionGroups != 1) {
cerr << "ERROR: Alignment of insertion failed. CIGAR: " << cigar << endl;
exit(1);
}
// Deletions.
} else if (type.isDeletion) {
if ( (numberMismatch + numberInsertion) != 0 || deletionGroups != 1) {
cerr << "ERROR: Alignment of deletion failed at " << referenceSequence << ":";
cerr << originalPosition << ". CIGAR: " << cigar << endl;
exit(1);
}
// Complex events.
} else if (type.isComplex) {
}
}
// Left align the allele.
void modifyAlleles::leftAlign() {
int altpos = 0;
int refpos = 0;
int len;
string slen;
vector<pair<int, char> > cigarData;
cerr << "START: " << endl << workingRef << endl << workingAlt << endl << cigar << endl;
for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
cerr << *c << endl;
switch (*c) {
case 'I':
len = atoi(slen.c_str());
slen.clear();
cigarData.push_back(make_pair(len, *c));
cerr << "CASE I: " << workingAlt.substr(altpos, len) << " " << refpos + workingPosition << endl;
//variants.push_back(VariantAllele("", alternateQuery.substr(altpos, len), refpos - paddingLen + position));
altpos += len;
break;
case 'D':
len = atoi(slen.c_str());
slen.clear();
cigarData.push_back(make_pair(len, *c));
cerr << "CASE D: " << workingRef.substr(refpos, len) << " " << refpos + workingPosition << endl;
//variants.push_back(VariantAllele(reference.substr(refpos, len), "", refpos - paddingLen + position));
refpos += len;
break;
case '=':
cerr << "MATCH" << endl;
{
len = atoi(slen.c_str());
cerr << "LEN=" << len << endl;
slen.clear();
cigarData.push_back(make_pair(len, *c));
//string refmatch = reference.substr(refpos, len);
//string altmatch = alternateQuery.substr(altpos, len);
string refmatch = workingRef.substr(refpos, len);
string altmatch = workingAlt.substr(altpos, len);
cerr << refmatch << " " << altmatch << endl;
bool inmismatch = false;
int mismatchStart = 0;
for (int i = 0; i < refmatch.size(); ++i) {
cerr << "COMPARE: " << refmatch.at(i) << " " << altmatch.at(i) << endl;
if (refmatch.at(i) == altmatch.at(i)) {
cerr << "EQUAL " << inmismatch << endl;
if (inmismatch) {
//variants.push_back(VariantAllele(
//refmatch.substr(mismatchStart, i - mismatchStart),
//altmatch.substr(mismatchStart, i - mismatchStart),
//mismatchStart - paddingLen + position));
cerr << "CASE M: " << refmatch.substr(mismatchStart, i - mismatchStart) << " ";
cerr << altmatch.substr(mismatchStart, i - mismatchStart) << " ";
cerr << mismatchStart + workingPosition << endl;
}
inmismatch = false;
} else {
cerr << "NOT EQUAL" << endl;
if (!inmismatch) {
mismatchStart = i;
inmismatch = true;
}
}
++refpos;
++altpos;
}
}
break;
case 'S':
len = atoi(slen.c_str());
slen.clear();
cigarData.push_back(make_pair(len, *c));
refpos += len;
altpos += len;
break;
default:
len = 0;
slen += *c;
break;
}
}
}
// If the alleles can be further left aligned, update the working
// alleles and send back to the alignment process.
void modifyAlleles::updateWorkingAlleles() {
// Double the size of the flanking reference sequence.
flankLength = 2 * flankLength;
getFlankingReference();
// Build the alleles to align.
workingRef = "ZZZZZZZZZZZZZZZ" + flankFront + modifiedRef + flankEnd + "ZZZZZZZZZZZZZZZ";
workingAlt = "ZZZZZZZZZZZZZZZ" + flankFront + modifiedAlt + flankEnd + "ZZZZZZZZZZZZZZZ";
}
// Attempt to place the indel further left than reported in the vcf file.
// Use fastahack to get flanking reference sequence.
void modifyAlleles::stepAlleles() {
bool leftAligned = false;
bool reset;
unsigned int count;
unsigned int refPosition;
unsigned int indelPosition;
string anchor;
string laggingBase;
// Get the flanking reference sequence. The variable sequence is
// populated with the inserted/deleted bases.
FastaReference* fr = new FastaReference(fasta);
if (type.isInsertion) {
flankLength = 30 * modifiedAlt.length();
sequence = modifiedAlt.substr(1, modifiedAlt.length());
flank = fr->getSubSequence(referenceSequence, originalPosition - 1 - flankLength, flankLength);
flank += modifiedAlt;
laggingBase = fr->getSubSequence(referenceSequence, originalPosition, 1);
} else if (type.isDeletion) {
flankLength = 30 * modifiedRef.length();
sequence = modifiedRef.substr(1, modifiedRef.length());
flank = fr->getSubSequence(referenceSequence, originalPosition - 1 - flankLength, flankLength);
flank += modifiedRef;
laggingBase = fr->getSubSequence(referenceSequence, originalPosition + sequence.length(), 1);
}
// Try stepping the inserted/deleted alleles backwards through the
// reference sequence. The first base is an anchor base and may be
// permitted to change. For example, consider the deletion of an
// AC in a dinucleotide repeat. If the alleles are represented as
// CAC -> C, the starting C in the ref allele may be subject to
// change. If the context is ACGACAC and the coordinate referes to
// deletion of the second AC, this can be replaced by GAC -> G and
// thus the starting C is not preserved.
count = 0;
refPosition = flank.length() - sequence.length() - 1;
workingPosition = modifiedPosition;
while (count < sequence.length()) {
indelPosition = 0;
while (sequence[indelPosition] == flank[refPosition + indelPosition] && indelPosition < sequence.length()) {
// If the whole sequence matches the reference then the sequence
// can be moved to this position.
if (indelPosition == sequence.length() - 1) {
workingPosition = workingPosition - count - 1;
count = 0;
anchor = flank[refPosition - 1];
// Ensure that replacing the reported allele representation with
// the left-aligned one results in the same alleles.
string oldAllele, newAllele;
if (type.isInsertion) {
oldAllele = flank.substr(refPosition - 1, flankLength - refPosition + 2) + sequence + laggingBase;
newAllele = anchor[0] + flank.substr(refPosition - 1 + sequence.length() - 1, flankLength - refPosition + 1) + sequence + laggingBase;
} else if (type.isDeletion) {
oldAllele = flank.substr(refPosition - 1, flankLength - refPosition + 2) + laggingBase;
newAllele = anchor[0] + flank.substr(refPosition - 1 + sequence.length() - 1, flankLength - refPosition + 1) + laggingBase;
}
if (oldAllele == newAllele) {
leftAligned = true;
reset = true;
}
}
indelPosition++;
}
refPosition--;
if (reset) {reset = false;}
else {count++;}
}
if (leftAligned) {
// Change the first base in the reference and alternate alleles to
// the anchor base. This is the leftmost base that both alleles
// share.
modifiedRef[0] = anchor[0];
modifiedAlt[0] = anchor[0];
modifiedPosition = workingPosition;
}
delete fr;
}