forked from kevinjlang/cpc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfm85Merging.c
325 lines (269 loc) · 12 KB
/
fm85Merging.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// Copyright 2018, Kevin Lang, Oath Research
#include "fm85Merging.h"
UG85 * ug85Make (Short lgK) {
assert (lgK >= 4);
UG85 * self = (UG85 *) malloc (sizeof(UG85));
assert (self != NULL);
self->lgK = lgK;
// We begin with the accumulator holding an EMPTY sketch object.
// As an optimization the accumulator could start as NULL, but that would require changes elsewhere.
self->accumulator = fm85Make (lgK);
self->bitMatrix = NULL;
return (self);
}
/*******************************************************************************************/
void ug85Free (UG85 * self) {
if (self != NULL) {
if (self->accumulator != NULL) { fm85Free (self->accumulator); }
if (self->bitMatrix != NULL) { free (self->bitMatrix); }
free (self);
}
}
/*******************************************************************************************/
// This is used for testing purposes only.
U64 * bitMatrixOfUG85 (UG85 * self, Boolean * needToFreePtr) {
if (self->bitMatrix != NULL) { // return the matrix
assert (self->accumulator == NULL);
*needToFreePtr = 0; // or we could make a copy of the bitmatrix
return (self->bitMatrix);
}
else { // construct a matrix
assert (self->accumulator != NULL);
*needToFreePtr = 1;
return (bitMatrixOfSketch (self->accumulator));
}
}
/*******************************************************************************************/
/*******************************************************************************************/
void walkTableUpdatingSketch (FM85 * dest, u32Table * table) {
U32 * slots = table->slots;
Long numSlots = (1LL << table->lgSize);
assert (dest->lgK <= 26);
U32 destMask = (((1 << dest->lgK) - 1) << 6) | 63; // downsamples when destlgK < srcLgK
// Using a golden ratio stride fixes the snowplow effect.
double golden = 0.6180339887498949025;
Long stride = (Long) (golden * ((double) numSlots));
assert (stride >= 2);
if (stride == ((stride >> 1) << 1)) { stride += 1; }; // force the stride to be odd
assert (stride >= 3 && stride < numSlots);
Long i,j;
for (i = 0, j = 0; i < numSlots; i++, j += stride) {
j &= (numSlots - 1LL);
U32 rowCol = slots[j];
if (rowCol != ALL32BITS) {
fm85RowColUpdate (dest, rowCol & destMask);
}
}
}
/*******************************************************************************************/
void orTableIntoMatrix (U64 * bitMatrix, Short destLgK, u32Table * table) {
U32 * slots = table->slots;
Long numSlots = (1LL << table->lgSize);
Long destMask = (1LL << destLgK) - 1LL; // downsamples when destlgK < srcLgK
Long i = 0;
for (i = 0; i < numSlots; i++) {
U32 rowCol = slots[i];
if (rowCol != ALL32BITS) {
Short col = (Short) (rowCol & 63);
Long row = (Long) (rowCol >> 6);
bitMatrix[row & destMask] |= (1ULL << col); // Set the bit.
}
}
}
/*******************************************************************************************/
void orWindowIntoMatrix (U64 * destMatrix, Short destLgK, U8 * srcWindow, Short srcOffset, Short srcLgK) {
assert (destLgK <= srcLgK);
Long destMask = (1LL << destLgK) - 1LL; // downsamples when destlgK < srcLgK
Long srcK = (1LL << srcLgK);
Long srcRow = 0;
for (srcRow = 0; srcRow < srcK; srcRow++) {
destMatrix[srcRow & destMask] |= (((U64) srcWindow[srcRow]) << srcOffset);
}
}
/*******************************************************************************************/
void orMatrixIntoMatrix (U64 * destMatrix, Short destLgK, U64 * srcMatrix, Short srcLgK) {
assert (destLgK <= srcLgK);
Long destMask = (1LL << destLgK) - 1LL; // downsamples when destlgK < srcLgK
Long srcK = (1LL << srcLgK);
Long srcRow = 0;
for (srcRow = 0; srcRow < srcK; srcRow++) {
destMatrix[srcRow & destMask] |= srcMatrix[srcRow];
}
}
/*******************************************************************************************/
void ug85ReduceK (UG85 * unioner, Short newLgK) {
assert (newLgK < unioner->lgK);
assert (unioner->accumulator != NULL || unioner->bitMatrix != NULL);
if (unioner->bitMatrix != NULL) { // downsample the unioner's bit matrix
assert (unioner->accumulator == NULL);
Long newK = (1LL << newLgK);
U64 * newMatrix = (U64 *) malloc ((size_t) (newK * sizeof(U64)));
assert (newMatrix != NULL);
Long i = 0;
for (i = 0; i < newK; i++) { newMatrix[i] = 0LL; } // clear the bit matrix
orMatrixIntoMatrix (newMatrix, newLgK, unioner->bitMatrix, unioner->lgK);
free (unioner->bitMatrix);
unioner->bitMatrix = newMatrix;
unioner->lgK = newLgK;
return;
}
if (unioner->accumulator != NULL) { // downsample the unioner's sketch
assert (unioner->bitMatrix == NULL);
FM85 * oldSketch = unioner->accumulator;
if (oldSketch->numCoupons == 0) { // if the accumulator is EMPTY, simply change its K.
assert (oldSketch->surprisingValueTable == NULL);
oldSketch->lgK = newLgK;
unioner->lgK = newLgK;
return;
}
FM85 * newSketch = fm85Make (newLgK);
assert (oldSketch->slidingWindow == NULL && oldSketch->surprisingValueTable != NULL);
walkTableUpdatingSketch (newSketch, oldSketch->surprisingValueTable);
enum flavorType finalNewFlavor = determineSketchFlavor(newSketch);
assert (finalNewFlavor != EMPTY);
if (finalNewFlavor == SPARSE) {
unioner->accumulator = newSketch;
unioner->lgK = newLgK;
fm85Free (oldSketch);
return;
}
else { // the new sketch has graduated beyond sparse, so convert to bitMatrix
unioner->accumulator = NULL;
unioner->bitMatrix = bitMatrixOfSketch (newSketch);
unioner->lgK = newLgK;
fm85Free (oldSketch);
fm85Free (newSketch);
return;
}
}
assert (1 == 0); // in other words, fail if this point is reached.
}
/*******************************************************************************************/
void ug85MergeInto (UG85 * unioner, FM85 * source) {
if (NULL == unioner) { FATAL_ERROR ("unionerMergeInto(NULL)"); }
if (NULL == source) return;
enum flavorType sourceFlavor = determineSketchFlavor(source);
if (EMPTY == sourceFlavor) return;
if (source->lgK < unioner->lgK) { ug85ReduceK (unioner, source->lgK); }
assert (source->lgK >= unioner->lgK);
assert (unioner->accumulator != NULL || unioner->bitMatrix != NULL);
if (SPARSE == sourceFlavor && unioner->accumulator != NULL) { // Case A
assert (unioner->bitMatrix == NULL);
enum flavorType initialDestFlavor = determineSketchFlavor (unioner->accumulator);
assert (EMPTY == initialDestFlavor || SPARSE == initialDestFlavor);
// The following partially fixes the snowplow problem provided that the K's are equal.
// A complete fix is coming soon.
if (EMPTY == initialDestFlavor && unioner->lgK == source->lgK) {
fm85Free (unioner->accumulator);
unioner->accumulator = fm85Copy(source);
}
walkTableUpdatingSketch (unioner->accumulator, source->surprisingValueTable);
enum flavorType finalDestFlavor = determineSketchFlavor(unioner->accumulator);
// if the accumulator has graduated beyond sparse, switch to a bitMatrix representation
if (finalDestFlavor != EMPTY && finalDestFlavor != SPARSE) {
unioner->bitMatrix = bitMatrixOfSketch (unioner->accumulator);
fm85Free (unioner->accumulator);
unioner->accumulator = NULL;
}
return;
}
if (SPARSE == sourceFlavor && unioner->bitMatrix != NULL) { // Case B
assert (unioner->accumulator == NULL);
orTableIntoMatrix (unioner->bitMatrix, unioner->lgK, source->surprisingValueTable);
return;
}
assert (HYBRID == sourceFlavor || PINNED == sourceFlavor || SLIDING == sourceFlavor);
// source is past SPARSE mode, so make sure that dest is a bitMatrix.
if (unioner->accumulator != NULL) {
assert (unioner->bitMatrix == NULL);
enum flavorType destFlavor = determineSketchFlavor (unioner->accumulator);
assert (EMPTY == destFlavor || SPARSE == destFlavor);
unioner->bitMatrix = bitMatrixOfSketch (unioner->accumulator);
fm85Free (unioner->accumulator);
unioner->accumulator = NULL;
}
assert (unioner->bitMatrix != NULL);
if (HYBRID == sourceFlavor || PINNED == sourceFlavor) { // Case C
orWindowIntoMatrix (unioner->bitMatrix, unioner->lgK, source->slidingWindow, source->windowOffset, source->lgK);
orTableIntoMatrix (unioner->bitMatrix, unioner->lgK, source->surprisingValueTable);
return;
}
// SLIDING mode involves inverted logic, so we can't just walk the source sketch.
// Instead, we convert it to a bitMatrix that can be OR'ed into the destination.
assert (SLIDING == sourceFlavor); // Case D
U64 * sourceMatrix = bitMatrixOfSketch (source);
orMatrixIntoMatrix (unioner->bitMatrix, unioner->lgK, sourceMatrix, source->lgK);
free (sourceMatrix);
return;
}
/*******************************************************************************************/
FM85 * ug85GetResult (UG85 * unioner) {
assert (unioner != NULL);
assert (unioner->accumulator != NULL || unioner->bitMatrix != NULL);
if (unioner->accumulator != NULL) { // start of case where unioner contains a sketch
assert (unioner->bitMatrix == NULL);
assert (unioner->lgK == unioner->accumulator->lgK);
if (unioner->accumulator->numCoupons == 0) {
FM85 * result = fm85Make (unioner->lgK);
result->mergeFlag = 1;
return (result);
}
assert (SPARSE == determineSketchFlavor(unioner->accumulator));
FM85 * result = fm85Copy (unioner->accumulator);
result->mergeFlag = 1;
return (result);
} // end of case where unioner contains a sketch
// start of case where unioner contains a bitMatrix
assert (unioner->bitMatrix != NULL);
assert (unioner->accumulator == NULL);
U64 * matrix = unioner->bitMatrix;
Short lgK = unioner->lgK;
FM85 * result = fm85Make (unioner->lgK);
Long k = (1LL << lgK);
Long numCoupons = countBitsSetInMatrix (matrix, k);
result->numCoupons = numCoupons;
enum flavorType flavor = determineFlavor (lgK, numCoupons);
assert (flavor == HYBRID || flavor == PINNED || flavor == SLIDING);
Short offset = determineCorrectOffset (lgK, numCoupons);
result->windowOffset = offset;
U8 * window = (U8 *) malloc ((size_t) (k * sizeof(U8)));
assert (window != NULL);
// bzero ((void *) window, (size_t) k); // don't need to zero the window's memory
assert (result->slidingWindow == NULL);
result->slidingWindow = window;
// u32Table * table = u32TableMake (2, 6 + lgK); // dynamically growing caused snowplow effect
Short newTableSize = lgK - 4; // K/16; in some cases this will end up being oversized
if (newTableSize < 2) newTableSize = 2;
u32Table * table = u32TableMake (newTableSize, 6 + lgK);
assert (table != NULL);
assert (result->surprisingValueTable == NULL);
result->surprisingValueTable = table;
// I believe that the following works even when the offset is zero.
U64 maskForClearingWindow = (0xffULL << offset) ^ ALL64BITS;
U64 maskForFlippingEarlyZone = (1ULL << offset) - 1;
U64 allSurprisesORed = 0;
Long i = 0;
// The snowplow effect was caused by processing the rows in order,
// but we have fixed it by using a sufficiently large hash table.
for (i = 0; i < k; i++) {
U64 pattern = matrix[i];
window[i] = (U8) ((pattern >> offset) & 0xff);
pattern &= maskForClearingWindow;
pattern ^= maskForFlippingEarlyZone; // This flipping converts surprising 0's to 1's.
allSurprisesORed |= pattern;
while (pattern != 0) {
Short col = countTrailingZerosInUnsignedLong (pattern);
pattern = pattern ^ (1ULL << col); // erase the 1.
U32 rowCol = (i << 6) | col;
Boolean isNovel = u32TableMaybeInsert (table, rowCol);
assert (isNovel == 1);
}
}
// At this point we could shrink an oversize hash table, but the relative waste isn't very big.
result->firstInterestingColumn = countTrailingZerosInUnsignedLong (allSurprisesORed);
if (result->firstInterestingColumn > offset) result->firstInterestingColumn = offset; // corner case
// NB: the HIP-related fields will contain bogus values, but that is okay.
result->mergeFlag = 1;
return result;
// end of case where unioner contains a bitMatrix
}