-
Notifications
You must be signed in to change notification settings - Fork 0
/
intersect.cpp~
263 lines (223 loc) · 12.2 KB
/
intersect.cpp~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// ******************************************************
// vcfCTools (c) 2011 Alistair Ward
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ------------------------------------------------------
// Last modified: 18 February 2011
// ------------------------------------------------------
// Calculate the intersection of two vcf files or a vcf
// file and a bed file.
// ******************************************************
#include "intersect.h"
using namespace std;
using namespace vcfCTools;
// Intersect two vcf files. Intersect by variant position only.
void intersectVcf(vcf& v1, variant& var1, vcf& v2, variant& var2, bool findUnion, bool findUnique, bool annotate, string writeFrom, ostream* output) {
// The following Boolean flags are used when adding variants to the map. They
// determine whether or not to write out records that are unique to one of the
// files. write1 is set to true if the vcf file is being annotated. This
// ensures that every record from the vcf file is written out regardless of
// whether it intersects with the other vcf file. In the case that the two
// files intersect, the additional annotation routine will be called.
bool write1 = ( (findUnique && writeFrom == "a") || findUnion || annotate) ? true : false;
bool write2 = ( (findUnique && writeFrom == "b") || findUnion) ? true : false;
// Build the variant structures for each vcf file.
v1.success = var1.buildVariantStructure(v1);
v2.success = var2.buildVariantStructure(v2);
// Set the pointers to the start of each variant map.
var1.vmIter = var1.variantMap.begin();
var2.vmIter = var2.variantMap.begin();
// Parse and compare the two variant structures until the end of one of the files
// is reached and the variant structure for that file is empty.
while ( !(var1.variantMap.size() == 0 && !v1.success) && !(var2.variantMap.size() == 0 && !v2.success) ) {
// If the two variant structures are built with the same reference sequence, compare
// the contents and parse through all varians for this reference sequence.
if (var1.vmIter->second.referenceSequence == var2.vmIter->second.referenceSequence) {
string currentReferenceSequence = var1.vmIter->second.referenceSequence;
while (var1.variantMap.size() != 0 && var2.variantMap.size() != 0) {
// Variants at the same locus.
if (var1.vmIter->first == var2.vmIter->first) {
if (!findUnique) {
if (annotate) {var1.annotateRecordVcf(var2.vmIter->second, v2.dbsnpVcf);}
var1.writeVariants(output);
}
// Clear the compared variants from the structure and add the next one from
// the file into the structure if it is from the same reference sequence.
var1.variantMap.erase(var1.vmIter);
if (v1.variantRecord.referenceSequence == currentReferenceSequence && v1.success) {
var1.addVariantToStructure(v1.position, v1.variantRecord);
v1.success = v1.getRecord(currentReferenceSequence);
}
if (var1.variantMap.size() != 0) {var1.vmIter = var1.variantMap.begin();}
var2.variantMap.erase(var2.vmIter);
if (v2.variantRecord.referenceSequence == currentReferenceSequence && v2.success) {
var2.addVariantToStructure(v2.position, v2.variantRecord);
v2.success = v2.getRecord(currentReferenceSequence);
}
if (var2.variantMap.size() != 0) {var2.vmIter = var2.variantMap.begin();}
// Variant from the first vcf file is at a larger coordinate than that in the
// second vcf file. Parse through the second file until the position is greater
// than or equal to that in the second file.
} else if (var1.vmIter->first > var2.vmIter->first) {
if (write2) {var2.writeVariants(output);}
var2.variantMap.erase(var2.vmIter);
if (v2.variantRecord.referenceSequence == currentReferenceSequence && v2.success) {
var2.addVariantToStructure(v2.position, v2.variantRecord);
v2.success = v2.getRecord(currentReferenceSequence);
}
if (var2.variantMap.size() != 0) {var2.vmIter = var2.variantMap.begin();}
// Variant from the first vcf file is at a smaller coordinate than that in the
// second vcf file.
} else if (var1.vmIter->first < var2.vmIter->first) {
if (write1) {var1.writeVariants(output);}
var1.variantMap.erase(var1.vmIter);
if (v1.variantRecord.referenceSequence == currentReferenceSequence && v1.success) {
var1.addVariantToStructure(v1.position, v1.variantRecord);
v1.success = v1.getRecord(currentReferenceSequence);
}
if (var1.variantMap.size() != 0) {var1.vmIter = var1.variantMap.begin();}
}
}
// Having finished comparing, there may still be variants left from one of the two files.
// Check that the two variant structures are empty and if not, finish processing the
// remaining variants for this reference sequence.
if (var1.variantMap.size() != 0) {var1.clearReferenceSequence(v1, currentReferenceSequence, write1, output);}
if (var2.variantMap.size() != 0) {var2.clearReferenceSequence(v2, currentReferenceSequence, write2, output);}
// Now both variant maps are exhausted, so rebuild the maps with the variants from the
// next reference sequence in the file.
v1.success = var1.buildVariantStructure(v1);
v2.success = var2.buildVariantStructure(v2);
if (var1.variantMap.size() != 0) {var1.vmIter = var1.variantMap.begin();}
if (var2.variantMap.size() != 0) {var2.vmIter = var2.variantMap.begin();}
// If the variant structures are from different reference sequences, parse through the
// second vcf file until the next reference sequence is found. If finding the union
// of the variants unique to the second vcf file, write them out.
} else {
if (var2.variantMap.size() != 0) {var2.clearReferenceSequence(v2, var2.vmIter->second.referenceSequence, write2, output);}
var2.buildVariantStructure(v2);
if (var2.variantMap.size() != 0) {var2.vmIter = var2.variantMap.begin();}
}
}
// If the variant structures are not empty, there was a problem and a warning is given.
if (var1.variantMap.size() != 0 || var2.variantMap.size() != 0) {
cerr << "WARNING: Not all records were flushed out of the variant structure." << endl;
}
}
// Intersect a vcf file and a bed file. It is assumed that the
// two files are sorted by genomic coordinates and the reference
// sequences are in the same order. Do not group together variants
// in common reference sequence.
void intersectVcfBed(vcf& v, variant& var, bed& b, bedStructure& bs, bool findUnique, bool annotate, ostream* output) {
map<int, bedRecord>::iterator bmNext;
bool lastBedInterval = false;
// Build the variant structures for the vcf and bed files.
v.success = var.buildVariantStructure(v);
b.success = bs.buildBedStructure(b);
// Set the pointers to the start of each variant map.
var.vmIter = var.variantMap.begin();
bs.bmIter = bs.bedMap.begin();
bmNext = bs.bedMap.begin();
bmNext++;
if (bmNext == bs.bedMap.end()) {lastBedInterval = true;}
// Check if the end position of the current interval is larger than the start
// position of the next interval in the structure. If so, the intervals
// overlap and need to be split up.
if (!lastBedInterval) {
if (bs.bmIter->second.end >= bmNext->first) {
bs.resolveOverlaps(annotate);
bs.bmIter = bs.bedMap.begin();
}
}
// Define the current reference sequence as that from the first entry in the
// variant map.
string currentReferenceSequence = var.vmIter->second.referenceSequence;
// Parse through the vcf file until it is finished and the variant structure is empty.
//while (var.variantMap.size() != 0 && bs.bedMap.size() != 0) {
while (var.variantMap.size() != 0 || v.success) {
if (var.vmIter->second.referenceSequence == bs.bmIter->second.referenceSequence) {
// Variant is prior to the bed interval.
if (var.vmIter->first < bs.bmIter->first) {
if (findUnique || annotate) {var.writeVariants(output);}
var.variantMap.erase(var.vmIter);
if (v.variantRecord.referenceSequence == currentReferenceSequence && v.success) {
var.addVariantToStructure(v.position, v.variantRecord);
v.success = v.getRecord(currentReferenceSequence);
}
if (var.variantMap.size() != 0) {var.vmIter = var.variantMap.begin();}
// Variant has start coordinate past the bed interval.
} else if (var.vmIter->first > bs.bmIter->second.end) {
bs.bedMap.erase(bs.bmIter);
if (b.bRecord.referenceSequence == currentReferenceSequence && b.success) {
bs.addIntervalToStructure(b.bRecord);
b.success = b.getRecord();
}
if (bs.bedMap.size() != 0) {
bs.bmIter = bs.bedMap.begin();
if (bs.bedMap.begin()++ != bs.bedMap.end()) {
bmNext = bs.bedMap.begin();
bmNext++;
} else {lastBedInterval = true;}
// Check if the end position of the current interval is larger than the start
// position of the next interval in the structure. If so, the intervals
// overlap and need to be split up.
if (!lastBedInterval) {
if (bs.bmIter->second.end >= bmNext->first) {
bs.resolveOverlaps(annotate);
bs.bmIter = bs.bedMap.begin();
}
}
// If the bed map is exhausted, clear out the variant map and move to the next
// reference sequence.
} else {
// If there are variants in the variant map, clear them out and set the current
// reference sequence to the next reference sequence in the vcf file.
if (var.variantMap.size() != 0) {
bool write = (findUnique || annotate) ? true : false;
var.clearReferenceSequence(v, var.vmIter->second.referenceSequence, write, output);
}
v.success = var.buildVariantStructure(v);
if (var.variantMap.size() != 0) {var.vmIter = var.variantMap.begin();}
lastBedInterval = false;
// Build the new bed structure.
if (b.success) {
b.success = bs.buildBedStructure(b);
if (bs.bedMap.size() != 0) {bs.bmIter = bs.bedMap.begin();}
}
}
// Variant lies within the bed interval.
} else {
if (annotate) {var.annotateRecordBed(bs.bmIter->second);}
if (!findUnique) {var.writeVariants(output);}
var.variantMap.erase(var.vmIter);
if (v.variantRecord.referenceSequence == currentReferenceSequence && v.success) {
var.addVariantToStructure(v.position, v.variantRecord);
v.success = v.getRecord(currentReferenceSequence);
}
if (var.variantMap.size() != 0) {var.vmIter = var.variantMap.begin();}
}
// If the reference sequence of the bed interval is not the same as that of the vcf
// record,
} else {
// If there are variants in the variant map, clear them out and set the current
// reference sequence to the next reference sequence in the vcf file.
if (var.variantMap.size() != 0) {
bool write = (findUnique || annotate) ? true : false;
var.clearReferenceSequence(v, var.vmIter->second.referenceSequence, write, output);
}
v.success = var.buildVariantStructure(v);
if (var.variantMap.size() != 0) {var.vmIter = var.variantMap.begin();}
lastBedInterval = false;
// Clear the bed map.
bs.bedMap.clear();
// Read through the remaining records until the correct reference sequence is
// found. There is no need to put the records in the map, since the vcf will
// not be intersected with these records.
while (b.bRecord.referenceSequence != currentReferenceSequence && b.success) {b.success = b.getRecord();}
// Build the new bed structure.
b.success = bs.buildBedStructure(b);
if (bs.bedMap.size() != 0) {bs.bmIter = bs.bedMap.begin();}
}
}
// If the variant structure is not empty, not all of the records were parsed.
if (var.variantMap.size() != 0) {cerr << "WARNING: Not all records were flushed out of the variant structure." << endl;}
}