-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfm85Util.c
286 lines (236 loc) · 7.82 KB
/
fm85Util.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
// Copyright 2018, Kevin Lang, Oath Research
#include "fm85Util.h"
/******************************************/
void * shallowCopy (void * oldObject, size_t numBytes) {
if (oldObject == NULL || numBytes == 0) { FATAL_ERROR ("shallowCopyObject: bad arguments"); }
void * newObject = malloc (numBytes);
if (newObject == NULL) { FATAL_ERROR ("shallowCopyObject: malloc failed"); }
memcpy (newObject, oldObject, numBytes);
return (newObject);
}
/******************************************/
U8 byteLeadingZerosTable[256];
U8 slow_count_leading_zeros_in_byte (U8 the_byte)
{
int shift;
int count = 0;
for (shift = 7; shift >= 0; shift--) {
int is_zero = !((the_byte >> shift) & 1);
if (is_zero)
count++;
else
break;
}
return count;
}
void fillByteLeadingZerosTable(void)
{
int j;
for (j = 0; j < 256; j++)
byteLeadingZerosTable[j] = slow_count_leading_zeros_in_byte ((U8) j);
}
/******************************************/
#define FCLZ_MASK_56 ((U64) 0x00ffffffffffffff)
#define FCLZ_MASK_48 ((U64) 0x0000ffffffffffff)
#define FCLZ_MASK_40 ((U64) 0x000000ffffffffff)
#define FCLZ_MASK_32 ((U64) 0x00000000ffffffff)
#define FCLZ_MASK_24 ((U64) 0x0000000000ffffff)
#define FCLZ_MASK_16 ((U64) 0x000000000000ffff)
#define FCLZ_MASK_08 ((U64) 0x00000000000000ff)
Short countLeadingZerosInUnsignedLong (U64 theInput) {
if (theInput > FCLZ_MASK_56)
return ((Short) ( 0 + byteLeadingZerosTable[(theInput >> 56) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_48)
return ((Short) ( 8 + byteLeadingZerosTable[(theInput >> 48) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_40)
return ((Short) (16 + byteLeadingZerosTable[(theInput >> 40) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_32)
return ((Short) (24 + byteLeadingZerosTable[(theInput >> 32) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_24)
return ((Short) (32 + byteLeadingZerosTable[(theInput >> 24) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_16)
return ((Short) (40 + byteLeadingZerosTable[(theInput >> 16) & FCLZ_MASK_08]));
if (theInput > FCLZ_MASK_08)
return ((Short) (48 + byteLeadingZerosTable[(theInput >> 8) & FCLZ_MASK_08]));
if (1)
return ((Short) (56 + byteLeadingZerosTable[(theInput >> 0) & FCLZ_MASK_08]));
}
/******************************************/
U8 byteTrailingZerosTable[256];
// U8 lookupByteTrailingZeros (int x) { return (byteTrailingZerosTable[x]); }
U8 slow_count_trailing_zeros_in_byte (U8 the_byte)
{
int shift;
int count = 0;
for (shift = 0; shift <= 7; shift++) {
int is_zero = !((the_byte >> shift) & 1);
if (is_zero)
count++;
else
break;
}
return count;
}
void fillByteTrailingZerosTable(void)
{
int j;
for (j = 0; j < 256; j++)
byteTrailingZerosTable[j] = slow_count_trailing_zeros_in_byte ((U8) j);
}
Short countTrailingZerosInUnsignedLong (U64 theInput) {
U64 tmp = theInput;
int byte;
int j = 0;
for (j = 0; j < 8; j++) {
byte = (tmp & 0xffULL);
if (byte != 0) return ((Short) ((j << 3) + byteTrailingZerosTable[byte]));
tmp >>= 8;
}
return ((Short) (64));
}
/******************************************/
double invPow2Tab[256];
void fillInvPow2Tab (void)
{
int j;
for (j = 0; j < 256; j++) {
invPow2Tab[j] = pow (2.0, (-1.0 * ((double) j)));
}
}
/******************************************/
double kxpByteLookup[256];
void fillKxpByteLookup (void) // must call fillInvPow2Tab() first
{
int byte, col;
for (byte = 0; byte < 256; byte++) {
double sum = 0.0;
for (col = 0; col < 8; col++) {
int bit = (byte >> col) & 1;
if (bit == 0) { // note the inverted logic
sum += invPow2Tab[col+1]; // note the "+1"
}
}
kxpByteLookup[byte] = sum;
}
}
/******************************************/
Long divideLongsRoundingUp (Long x, Long y) {
assert (x >= 0 && y > 0);
Long quotient = x / y;
if (quotient * y == x) return (quotient);
else return (quotient + 1);
}
Long longFloorLog2OfLong (Long x) {
assert (x >= 1L); // throw an exception
Long p = 0;
Long y = 1;
log2Loop:
if (y == x) return (p);
if (y > x) return (p-1);
p += 1;
y <<= 1;
goto log2Loop;
}
/***********************************/
// returns an integer that is between
// zero and ceiling(log_2(k))-1, inclusive
// Long oldGolombChooseNumberOfBaseBits (Long k, Long raw_count) {
// assert (k >= 1L);
// assert (raw_count >= 0L);
// Long count = (raw_count == 0L) ? 1L : raw_count;
// Long quotient = (k - count) / count; // integer division
// if (quotient == 0) return (0);
// else return (longFloorLog2OfLong(quotient));
// }
Long golombChooseNumberOfBaseBits (Long k, Long count) {
assert (k >= 1L);
assert (count >= 1L);
Long quotient = (k - count) / count; // integer division
if (quotient == 0) return (0);
else return (longFloorLog2OfLong(quotient));
}
/*******************************************************/
// This place-holder code was inadequate because it caused
// the cost of the post-merge getResult() operation to be O(C)
// instead of O(K). It did have the advantage of being
// very simple and trustworthy during initial testing.
Long wegnerCountBitsSetInMatrix (U64 * array, Long length) {
Long i = 0;
U64 pattern = 0;
Long count = 0;
// clock_t t0, t1;
// t0 = clock();
// Wegner's Bit-Counting Algorithm, CACM 3 (1960), p. 322.
for (i = 0; i < length; i++) {
pattern = array[i];
while (pattern != 0) {
pattern &= (pattern - 1);
count++;
}
}
// t1 = clock();
// printf ("\n(Wegner CountBitsTime %.1f)\n", ((double) (t1 - t0)) / 1000.0);
// fflush (stdout);
return count;
}
/*******************************************************/
// Note: this is an adaptation of the Java code that Lee sent me,
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
// by Henry S. Warren.
static inline Long warrenBitCount(U64 i) {
i = i - ((i >> 1) & 0x5555555555555555ULL);
i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
i = i + (i >> 8);
i = i + (i >> 16);
i = i + (i >> 32);
return (Long)i & 0x7f;
}
Long warrenCountBitsSetInMatrix (U64 * array, Long length) {
Long i = 0;
Long count = 0;
// clock_t t0, t1;
// t0 = clock();
for (i = 0; i < length; i++) {
count += warrenBitCount(array[i]);
}
// t1 = clock();
// printf ("(Warren CountBitsTime %.1f)\n", ((double) (t1 - t0)) / 1000.0);
// fflush (stdout);
return count;
}
/*******************************************************/
// This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.
#define CSA(h,l,a,b,c) {U64 u = a^b; U64 v = c; h = (a&b) | (u&v); l = u^v;}
Long countBitsSetInMatrix (U64 * A, Long length) {
assert ((length & 0x7) == 0); // the length of the array must be a multiple of 8.
// clock_t t0, t1;
// t0 = clock();
Long tot, i;
U64 ones, twos, twosA, twosB, fours, foursA, foursB, eights;
tot = 0;
fours = twos = ones = 0;
for (i = 0; i <= length - 8; i = i + 8) {
CSA(twosA, ones, ones, A[i+0], A[i+1]);
CSA(twosB, ones, ones, A[i+2], A[i+3]);
CSA(foursA, twos, twos, twosA, twosB);
CSA(twosA, ones, ones, A[i+4], A[i+5]);
CSA(twosB, ones, ones, A[i+6], A[i+7]);
CSA(foursB, twos, twos, twosA, twosB);
CSA(eights, fours, fours, foursA, foursB);
tot += warrenBitCount(eights);
}
tot = 8*tot + 4*warrenBitCount(fours) + 2*warrenBitCount(twos) + warrenBitCount(ones);
// t1 = clock();
// printf ("(CSA CountBitsTime %.1f)\n", ((double) (t1 - t0)) / 1000.0);
// fflush (stdout);
// Because I still don't fully trust this fancy version.
assert(tot == wegnerCountBitsSetInMatrix(A, length));
return (tot);
}
/*********************************************/
// Here are some timings made with quickTestMerge.c
// for the "5 5" case:
// Wegner CountBitsTime 29.3
// Warren CountBitsTime 5.3
// CSA CountBitsTime 4.3