This repository has been archived by the owner on Sep 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathhyperloglog.js
135 lines (110 loc) · 4.86 KB
/
hyperloglog.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
function compute_alpha_times_bucket_count_squared(bucket_count) {
return 0.7213 / (1 + 1.079 / bucket_count) * bucket_count * bucket_count;
}
// Create a HyperLogLog counter of 2^n buckets.
// 2^0 to 2^32 - requires that many BYTES (really 6 bit words for 64 bit hashing)
// The limit of 2^32 comes from using the first 32 bit int of the hash
// for the bucket index. Theoretically we could scale that to allow more, but that means
// more than 4GB per HLL, which is unlikely.
function HyperLogLog(n) {
var bucket_count = Math.pow(2, n);
var alpha_times_bucket_count_squared = compute_alpha_times_bucket_count_squared(bucket_count);
var buckets = new Buffer(bucket_count);
buckets.fill(0);
// Maintain some running counts so that returning cardinality is cheap.
var sum_of_inverses = bucket_count;
var count_zero_buckets = bucket_count;
var self = {
add: function add(unique_hash) {
if (unique_hash === null) {
return; // nothing to add
}
var bucket = unique_hash[0] >>> (32 - n);
var trailing_zeroes = 1;
count_zeroes:
for (var i = 3; i >= 2; --i) {
var data = unique_hash[i];
for (var j = 32; j; --j) {
if (data & 0x1) {
break count_zeroes;
}
++trailing_zeroes;
data = data >>> 1;
}
}
// Maintain a running sum of inverses for quick cardinality checking.
var old_value = buckets[bucket];
var new_value = Math.max(trailing_zeroes, old_value);
sum_of_inverses += Math.pow(2, -new_value) - Math.pow(2, -old_value);
if (new_value !== 0 && old_value === 0) {
--count_zero_buckets;
}
buckets[bucket] = new_value;
return self;
},
count: function count() {
/*var sum_of_inverses = 0;
var count_zero_buckets = 0;
for (var i = 0; i < bucket_count; ++i) {
var bucket = buckets[i];
if (bucket === 0) ++count_zero_buckets;
sum_of_inverses += 1 / Math.pow(2, bucket);
}*/
// No longer need to compute this all every time, since we keep running counts to keep this cheap.
var estimate = alpha_times_bucket_count_squared / sum_of_inverses;
// Apply small cardinality correction
if (count_zero_buckets > 0 && estimate < 5/2 * bucket_count) {
estimate = bucket_count * Math.log(bucket_count / count_zero_buckets);
}
return Math.floor(estimate + 0.5);
},
relative_error: function relative_error() {
// Estimate the relative error for this HLL.
return 1.04 / Math.sqrt(bucket_count);
},
output: function output() {
return {
n: n,
buckets: buckets
}
},
merge: function merge(data) {
if (n > data.n) {
// Fold this HLL down to the size of the incoming one.
var new_bucket_count = Math.pow(2, data.n);
var old_buckets_per_new_bucket = Math.pow(2, n - data.n);
var new_buckets = new Buffer(new_bucket_count);
for (var i = 0; i < new_bucket_count; ++i) {
var new_bucket_value = data.buckets[i];
for (var j = 0; j < old_buckets_per_new_bucket; ++j) {
new_bucket_value = Math.max(new_bucket_value, buckets[i * old_buckets_per_new_bucket + j]);
}
new_buckets[i] = new_bucket_value;
}
buckets = new_buckets;
n = data.n;
bucket_count = Math.pow(2, n);
alpha_times_bucket_count_squared = compute_alpha_times_bucket_count_squared(bucket_count);
} else {
var new_buckets_per_existing = Math.pow(2, data.n - n);
for (var i = data.buckets.length - 1; i >= 0; --i) {
var existing_bucket_index = (i / new_buckets_per_existing) | 0;
buckets[existing_bucket_index] = Math.max(buckets[existing_bucket_index], data.buckets[i]);
}
}
// Recompute running totals
sum_of_inverses = 0;
count_zero_buckets = 0;
for (var i = 0; i < bucket_count; ++i) {
var bucket = buckets[i];
if (bucket === 0) {
++count_zero_buckets;
}
sum_of_inverses += Math.pow(2, -bucket);
}
}
};
return self;
};
module.exports = HyperLogLog;
module.exports.hash = require('murmurhash3').murmur128Sync;