diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
deleted file mode 100644
index bfa696cad..000000000
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.datasketches.filters.bloomfilter;
-
-import static org.apache.datasketches.common.Util.LS;
-
-import org.apache.datasketches.common.SketchesArgumentException;
-import org.apache.datasketches.memory.Buffer;
-import org.apache.datasketches.memory.Memory;
-import org.apache.datasketches.memory.WritableMemory;
-
-/**
- * This class holds an array of bits suitable for use in a Bloom Filter
- *
- * <p>Rounds the number of bits up to the smallest multiple of 64 (one long)
- * that is not smaller than the specified number.
- */
-abstract class BitArray {
- // MAX_BITS using longs, based on array indices being capped at Integer.MAX_VALUE
- protected static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE;
-
- protected BitArray() {}
-
- static BitArray heapify(final Buffer mem, final boolean isEmpty) {
- return HeapBitArray.heapify(mem, isEmpty);
- }
-
- static BitArray wrap(final Memory mem, final boolean isEmpty) {
- return DirectBitArrayR.wrap(mem, isEmpty);
- }
-
- static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) {
- return DirectBitArray.writableWrap(wmem, isEmpty);
- }
-
- boolean isEmpty() {
- return !isDirty() && getNumBitsSet() == 0;
- }
-
- abstract boolean hasMemory();
-
- abstract boolean isDirect();
-
- abstract boolean isReadOnly();
-
- abstract boolean getBit(final long index);
-
- abstract boolean getAndSetBit(final long index);
-
- abstract void setBit(final long index);
-
- abstract long getNumBitsSet();
-
- abstract void reset();
-
- abstract long getCapacity();
-
- abstract int getArrayLength();
-
- abstract void union(final BitArray other);
-
- abstract void intersect(final BitArray other);
-
- abstract void invert();
-
- // prints the raw BitArray as 0s and 1s, one long per row
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder();
- for (int i = 0; i < getArrayLength(); ++i) {
- sb.append(i + ": ")
- .append(printLong(getLong(i)))
- .append(LS);
- }
- return sb.toString();
- }
-
- long getSerializedSizeBytes() {
- // We only really need an int for array length but this will keep everything
- // aligned to 8 bytes.
- // Always write array length, but write numBitsSet only if empty
- return Long.BYTES * (isEmpty() ? 1L : (2L + getArrayLength()));
- }
-
- // returns the number of bytes needed for a non-empty BitArray of the requested size
- static long getSerializedSizeBytes(final long numBits) {
- if (numBits <= 0) {
- throw new SketchesArgumentException("Requested number of bits must be strictly positive");
- }
- if (numBits > MAX_BITS) {
- throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. "
- + "Requested: " + numBits + ", maximum: " + MAX_BITS);
- }
- final int numLongs = (int) Math.ceil(numBits / 64.0);
- return Long.BYTES * (numLongs + 2L);
- }
-
- abstract protected boolean isDirty();
-
- // used to get a long from the array regardless of underlying storage
- // NOT used to query individual bits
- abstract protected long getLong(final int arrayIndex);
-
- // used to set a long in the array regardless of underlying storage
- // NOT used to set individual bits
- abstract protected void setLong(final int arrayIndex, final long value);
-
- // prints a long as a series of 0s and 1s as little endian
- protected static String printLong(final long val) {
- final StringBuilder sb = new StringBuilder();
- for (int j = 0; j < Long.SIZE; ++j) {
- sb.append((val & (1L << j)) != 0 ? "1" : "0");
- if (j % 8 == 7) { sb.append(" "); }
- }
- return sb.toString();
- }
-
-}
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index 3ea73b9bd..10829d7b7 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -26,6 +26,9 @@
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.filters.common.BitArray;
+import org.apache.datasketches.filters.common.DirectBitArray;
+import org.apache.datasketches.filters.common.HeapBitArray;
import org.apache.datasketches.memory.Buffer;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableBuffer;
diff --git a/src/main/java/org/apache/datasketches/filters/common/BitArray.java b/src/main/java/org/apache/datasketches/filters/common/BitArray.java
new file mode 100644
index 000000000..8320a369f
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/common/BitArray.java
@@ -0,0 +1,305 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.common;
+
+import static org.apache.datasketches.common.Util.LS;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.memory.Buffer;
+import org.apache.datasketches.memory.Memory;
+import org.apache.datasketches.memory.WritableMemory;
+
+/**
+ * This class holds an array of bits and should be suitable for use in
+ * the various membership filters. The representation is not compressed and
+ * is designed to fit in a single array, meaning that the maximum number
+ * of bits is limited by the maximum size of an array of longs in Java.
+ *
+ * <p>Rounds the number of bits up to the smallest multiple of 64 (one long)
+ * that is not smaller than the specified number.
+ */
+public abstract class BitArray {
+
+ /**
+ * The maximum number of bits that can be represented using longs,
+ * based on array indices being capped at Integer.MAX_VALUE
+ * and allowing room for encoding both the size and the number of bits set.
+ */
+ protected static final long MAX_BITS = (Integer.MAX_VALUE - 1) * (long) Long.SIZE;
+
+ /**
+ * Constructs a new BitArray.
+ */
+ BitArray() {}
+
+ /**
+ * Creates a BitArray from a given Buffer.
+ *
+ * @param mem The Buffer to heapify.
+ * @param isEmpty Indicates whether the BitArray is empty.
+ * @return The heapified BitArray.
+ */
+ public static BitArray heapify(final Buffer mem, final boolean isEmpty) {
+ return HeapBitArray.heapify(mem, isEmpty);
+ }
+
+ /**
+ * Creates a BitArray from a given Memory.
+ *
+ * @param mem The Memory to wrap.
+ * @param isEmpty Indicates whether the BitArray is empty.
+ * @return The wrapped BitArray.
+ */
+ public static BitArray wrap(final Memory mem, final boolean isEmpty) {
+ return DirectBitArrayR.wrap(mem, isEmpty);
+ }
+
+ /**
+ * Creates a writable BitArray from a given WritableMemory.
+ *
+ * @param wmem The WritableMemory to wrap.
+ * @param isEmpty Indicates whether the BitArray is empty.
+ * @return The writable wrapped BitArray.
+ */
+ public static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) {
+ return DirectBitArray.writableWrap(wmem, isEmpty);
+ }
+
+ /**
+ * Checks if the BitArray is empty.
+ *
+ * @return True if the BitArray is empty, false otherwise.
+ */
+ public boolean isEmpty() {
+ return !isDirty() && getNumBitsSet() == 0;
+ }
+
+ /**
+ * Checks if the BitArray has a backing Memory.
+ *
+ * @return True if the BitArray has a backing Memory, false otherwise.
+ */
+ public abstract boolean hasMemory();
+
+ /**
+ * Checks if the BitArray is direct.
+ *
+ * @return True if the BitArray is direct, false otherwise.
+ */
+ public abstract boolean isDirect();
+
+ /**
+ * Checks if the BitArray is read-only.
+ *
+ * @return True if the BitArray is read-only, false otherwise.
+ */
+ public abstract boolean isReadOnly();
+
+ /**
+ * Gets the value of a bit at the specified index.
+ *
+ * @param index The index of the bit.
+ * @return The value of the bit at the specified index.
+ */
+ public abstract boolean getBit(final long index);
+
+ /**
+ * Gets a specified number of bits starting at the given index. Limited
+ * to a single long (64 bits).
+ *
+ * @param index The starting index.
+ * @param numBits The number of bits to return.
+ * @return The value of the requested bits, starting at bit 0 of the result.
+ */
+ public abstract long getBits(final long index, final int numBits);
+
+ /**
+ * Gets the value of a bit at the specified index and sets it to true.
+ *
+ * @param index The index of the bit.
+ * @return The previous value of the bit at the specified index.
+ */
+ public abstract boolean getAndSetBit(final long index);
+
+ /**
+ * Assigns the value of a bit at the specified index to true.
+ *
+ * @param index The index of the bit.
+ */
+ public abstract void setBit(final long index);
+
+ /**
+ * Assigns the value of a bit at the specified index to false.
+ *
+ * @param index The index of the bit.
+ */
+ public abstract void clearBit(final long index);
+
+ /**
+ * Assigns the given value of a bit at the specified index.
+ *
+ * @param index The index of the bit.
+ * @param value The value to set the bit to.
+ */
+ public abstract void assignBit(final long index, final boolean value);
+
+ /**
+ * Sets {@code numBits} bits, starting from {@code index}, to the low-order
+ * bits of the specified value.
+ * Limited to a single long (64 bits).
+ *
+ * @param index the starting index of the range (inclusive)
+ * @param numBits the number of bits to write
+ * @param bits the value to set the bits to, starting with bit 0
+ */
+ public abstract void setBits(final long index, final int numBits, final long bits);
+
+ /**
+ * Gets the number of bits that are set to true in the BitArray.
+ *
+ * @return The number of bits set to true.
+ */
+ public abstract long getNumBitsSet();
+
+ /**
+ * Resets the BitArray, setting all bits to false.
+ */
+ public abstract void reset();
+
+ /**
+ * Gets the capacity of the BitArray in bits.
+ *
+ * @return The capacity of the BitArray in bits
+ */
+ public abstract long getCapacity();
+
+ /**
+ * Gets the length of the underlying array in longs.
+ *
+ * @return The length of the underlying array in longs.
+ */
+ public abstract int getArrayLength();
+
+ /**
+ * Performs a union operation with another BitArray.
+ *
+ * @param other The other BitArray to perform the union with.
+ */
+ public abstract void union(final BitArray other);
+
+ /**
+ * Performs an intersection operation with another BitArray.
+ *
+ * @param other The other BitArray to perform the intersection with.
+ */
+ public abstract void intersect(final BitArray other);
+
+ /**
+ * Inverts the BitArray, flipping all bits.
+ */
+ public abstract void invert();
+
+ /**
+ * Returns a string representation of the BitArray.
+ *
+ * @return A string representation of the BitArray.
+ */
+  @Override
+  public String toString() {
+    // one row per backing long: "<array index>: <64 bits, least-significant bit first>"
+    final StringBuilder out = new StringBuilder();
+    final int numLongs = getArrayLength();
+    for (int row = 0; row < numLongs; row++) {
+      out.append(row).append(": ").append(printLong(getLong(row))).append(LS);
+    }
+    return out.toString();
+  }
+
+ /**
+ * Gets the serialized size of the BitArray in bytes.
+ *
+ * @return The serialized size of the BitArray in bytes.
+ */
+ public long getSerializedSizeBytes() {
+ // The array length only needs an int, but it is stored in a full long so that
+ // everything stays aligned to 8 bytes.
+ // Always write array length, but write numBitsSet and the bit data only if not empty
+ return Long.BYTES * (isEmpty() ? 1L : (2L + getArrayLength()));
+ }
+
+ /**
+ * Gets the serialized size of a non-empty BitArray of the specified size in bytes.
+ *
+ * @param numBits The number of bits in the BitArray.
+ * @return The serialized size of the BitArray in bytes.
+ * @throws SketchesArgumentException If the requested number of bits is not strictly positive
+ * or exceeds the maximum allowed.
+ */
+  public static long getSerializedSizeBytes(final long numBits) {
+    if (numBits <= 0) {
+      throw new SketchesArgumentException("Requested number of bits must be strictly positive");
+    } else if (numBits > MAX_BITS) {
+      throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. "
+          + "Requested: " + numBits + ", maximum: " + MAX_BITS);
+    }
+    // header (2 longs) plus ceil(numBits / 64) data longs, in exact integer arithmetic
+    final long dataLongs = (numBits + (Long.SIZE - 1)) >>> 6;
+    return Long.BYTES * (dataLongs + 2L);
+  }
+
+ /**
+ * Checks if the BitArray has changes not reflected in state variables.
+ *
+ * @return True if the BitArray is dirty, false otherwise.
+ */
+ abstract boolean isDirty();
+
+ /**
+ * Gets the long value at the specified array index.
+ *
+ * @param arrayIndex The index of the long value in the array.
+ * @return The long value at the specified array index.
+ */
+ abstract long getLong(final int arrayIndex);
+
+ /**
+ * Sets the long value at the specified array index.
+ *
+ * @param arrayIndex The index of the long value in the array.
+ * @param value The value to set the long to.
+ */
+ abstract void setLong(final int arrayIndex, final long value);
+
+ /**
+ * Returns a string representation of a long value as a series of 0s and 1s (little endian).
+ *
+ * @param val The long value to print.
+ * @return A string representation of the long value.
+ */
+  public static String printLong(final long val) {
+    final StringBuilder bits = new StringBuilder(Long.SIZE + Long.BYTES); // 64 digits + 8 separators
+    for (int pos = 0; pos < Long.SIZE; pos++) {
+      bits.append(((val >>> pos) & 1L) == 1L ? '1' : '0'); // least-significant bit first
+      if ((pos & 7) == 7) { bits.append(' '); } // a space closes every group of 8 bits
+    }
+    return bits.toString();
+  }
+
+}
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java
similarity index 62%
rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java
rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java
index 77c24f027..25521672e 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java
@@ -17,21 +17,21 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.memory.WritableMemory;
-final class DirectBitArray extends DirectBitArrayR {
+public final class DirectBitArray extends DirectBitArrayR {
- DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) {
+ public DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) {
super(dataLength, 0, wmem); // we'll set numBitsSet_ ourselves so pass 0
// can recompute later if needed
numBitsSet_ = storedNumBitsSet;
}
- DirectBitArray(final int dataLength, final WritableMemory wmem) {
+ public DirectBitArray(final int dataLength, final WritableMemory wmem) {
super(dataLength, 0, wmem);
wmem_.putInt(0, dataLength_);
@@ -39,7 +39,7 @@ final class DirectBitArray extends DirectBitArrayR {
wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES);
}
- static DirectBitArray initialize(final long numBits, final WritableMemory wmem) {
+ public static DirectBitArray initialize(final long numBits, final WritableMemory wmem) {
if (numBits <= 0) {
throw new SketchesArgumentException("Number of bits must be strictly positive. Found: " + numBits);
}
@@ -58,7 +58,7 @@ static DirectBitArray initialize(final long numBits, final WritableMemory wmem)
return new DirectBitArray(arrayLength, wmem);
}
- static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) {
+ public static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) {
final int arrayLength = mem.getInt(0);
final long storedNumBitsSet = isEmpty ? 0L : mem.getLong(NUM_BITS_OFFSET);
@@ -81,7 +81,7 @@ static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmp
}
@Override
- long getNumBitsSet() {
+ public long getNumBitsSet() {
// update numBitsSet and store in array
if (isDirty()) {
numBitsSet_ = 0;
@@ -95,17 +95,17 @@ long getNumBitsSet() {
}
@Override
- protected boolean isDirty() {
+ public boolean isDirty() {
return numBitsSet_ == -1;
}
@Override
- boolean getBit(final long index) {
+ public boolean getBit(final long index) {
return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0;
}
@Override
- protected long getLong(final int arrayIndex) {
+ public long getLong(final int arrayIndex) {
return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3));
}
@@ -115,21 +115,83 @@ public boolean isReadOnly() {
}
@Override
- void reset() {
+ public void reset() {
setNumBitsSet(0);
wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES);
}
@Override
- void setBit(final long index) {
+ public void setBit(final long index) {
final long memoryOffset = DATA_OFFSET + ((int) index >>> 3);
final byte val = wmem_.getByte(memoryOffset);
- wmem_.setBits(memoryOffset, (byte) (val | (1 << (index & 0x07))));
+ wmem_.putByte(memoryOffset, (byte) (val | (1 << (index & 0x07))));
setNumBitsSet(-1); // mark dirty
}
@Override
- boolean getAndSetBit(final long index) {
+  public void clearBit(final long index) {
+    final long memoryOffset = DATA_OFFSET + (index >>> 3); // shift as long: (int) index would drop high bits
+    final byte val = wmem_.getByte(memoryOffset);
+    wmem_.putByte(memoryOffset, (byte) (val & ~(1 << (index & 0x07))));
+    setNumBitsSet(-1); // mark dirty: isDirty() tests for this -1 sentinel
+  }
+
+  @Override
+  public void assignBit(final long index, final boolean value) {
+    if (!value) { // false clears the bit, true sets it
+      clearBit(index);
+      return;
+    }
+    setBit(index);
+  }
+
+  @Override
+  public void setBits(final long index, final int numBits, final long bits) {
+    if (numBits < 0 || numBits > 64) {
+      throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)");
+    } else if (index + numBits > getCapacity()) {
+      throw new SketchesArgumentException("End of range exceeds capacity");
+    }
+    if (numBits == 0) { return; } // nothing to write; also keeps endBit below from becoming index - 1
+
+    // TODO: since Memory provides byte offsets even when reading a long, we can be sure
+    // that the result always fits in a single long. We can potentially optimize this, but
+    // need to handle cases where a long would read beyond the end of the Memory.
+    final long endBit = index + numBits - 1;
+
+    // Indices into the conceptual long[] array. Shift before narrowing so that bit
+    // positions of 2^32 and above are not truncated by the cast.
+    final int fromIndex = (int) (index >>> 6);
+    final int toIndex = (int) (endBit >>> 6);
+    // byte offsets of those longs within wmem_, computed as long to avoid int overflow
+    final long fromByte = DATA_OFFSET + ((long) fromIndex << 3);
+    final long toByte = DATA_OFFSET + ((long) toIndex << 3);
+    final long fromOffset = index & 0x3F;
+    final long toOffset = endBit & 0x3F;
+
+    setNumBitsSet(-1); // mark dirty
+    // within a single long
+    if (fromIndex == toIndex) {
+      final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L;
+      final long fromMask = (1L << fromOffset) - 1L;
+      final long mask = toMask - fromMask;
+      final long maskedVal = wmem_.getLong(fromByte) & ~mask;
+      wmem_.putLong(fromByte, maskedVal | ((bits << fromOffset) & mask));
+      return;
+    }
+
+    // spans longs, need to set bits in two longs
+    final long splitBit = Long.SIZE - fromOffset; // number of bits that land in the first long
+    final long fromMask = (1L << fromOffset) - 1; // bits of the first long to preserve
+    final long toMask = (1L << (toOffset + 1)) - 1; // bits of the second long to overwrite
+    final long maskedFromVal = wmem_.getLong(fromByte) & fromMask;
+    final long maskedToVal = wmem_.getLong(toByte) & ~toMask;
+    wmem_.putLong(fromByte, maskedFromVal | ((bits << fromOffset) & ~fromMask));
+    wmem_.putLong(toByte, maskedToVal | ((bits >>> splitBit) & toMask));
+  }
+
+ @Override
+ public boolean getAndSetBit(final long index) {
final long memoryOffset = DATA_OFFSET + ((int) index >>> 3);
final byte mask = (byte) (1 << (index & 0x07));
final byte val = wmem_.getByte(memoryOffset);
@@ -143,7 +205,7 @@ boolean getAndSetBit(final long index) {
}
@Override
- void intersect(final BitArray other) {
+ public void intersect(final BitArray other) {
if (getCapacity() != other.getCapacity()) {
throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths");
}
@@ -158,7 +220,7 @@ void intersect(final BitArray other) {
}
@Override
- void union(final BitArray other) {
+ public void union(final BitArray other) {
if (getCapacity() != other.getCapacity()) {
throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths");
}
@@ -173,7 +235,7 @@ void union(final BitArray other) {
}
@Override
- void invert() {
+ public void invert() {
if (isDirty()) {
numBitsSet_ = 0;
for (int i = 0; i < dataLength_; ++i) {
@@ -191,7 +253,7 @@ void invert() {
}
@Override
- protected void setLong(final int arrayIndex, final long value) {
+ void setLong(final int arrayIndex, final long value) {
wmem_.putLong(DATA_OFFSET + (arrayIndex << 3), value);
}
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java
similarity index 58%
rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java
rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java
index 8acc36be2..6d0d4bad3 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java
+++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.SketchesReadOnlyException;
@@ -35,7 +35,7 @@ public class DirectBitArrayR extends BitArray {
final protected WritableMemory wmem_; // for inheritance; we won't write to it
protected long numBitsSet_; // could be final here but writable direct will update it
- protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) {
+ public DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) {
super();
dataLength_ = dataLength;
@@ -53,7 +53,7 @@ protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, fin
// assumes we have a region with only the portion of Memory
// the BitArray cares about
- static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) {
+ public static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) {
final int arrayLength = mem.getInt(0);
final long storedNumBitsSet = isEmpty ? 0L : mem.getLong(NUM_BITS_OFFSET);
@@ -71,34 +71,73 @@ static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) {
}
@Override
- long getCapacity() {
+ public long getCapacity() {
return (long) dataLength_ * Long.SIZE;
}
@Override
- long getNumBitsSet() {
+ public long getNumBitsSet() {
return numBitsSet_;
}
@Override
- protected boolean isDirty() {
+ public boolean isDirty() {
// read-only so necessarily false
return false;
}
@Override
- int getArrayLength() {
+ public int getArrayLength() {
return dataLength_;
}
@Override
- boolean getBit(final long index) {
+ public boolean getBit(final long index) {
if (isEmpty()) { return false; }
return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0;
}
@Override
- protected long getLong(final int arrayIndex) {
+  public long getBits(final long index, final int numBits) {
+    if (numBits < 0 || numBits > 64) {
+      throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)");
+    } else if (index + numBits > getCapacity()) {
+      throw new SketchesArgumentException("End of range exceeds capacity");
+    }
+    if (numBits == 0 || isEmpty()) { return 0L; } // zero-width reads have no bits to return
+
+    // TODO: since Memory provides byte offsets even when reading a long, we can be sure
+    // that the result always fits in a single long. We can potentially optimize this, but
+    // need to handle cases where a long would read beyond the end of the Memory.
+    final long endBit = index + numBits - 1;
+
+    // Indices into the conceptual long[] array. Shift before narrowing so that bit
+    // positions of 2^32 and above are not truncated by the cast.
+    final int fromIndex = (int) (index >>> 6);
+    final int toIndex = (int) (endBit >>> 6);
+    final long fromOffset = index & 0x3F;
+    final long toOffset = endBit & 0x3F;
+    final long fromVal = wmem_.getLong(DATA_OFFSET + ((long) fromIndex << 3)); // long math for byte offset
+
+    // within a single long
+    if (fromIndex == toIndex) {
+      final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L;
+      final long fromMask = (1L << fromOffset) - 1L;
+      return (fromVal & (toMask - fromMask)) >>> fromOffset;
+    }
+
+    // spans longs, need to combine bits from two longs
+    final long splitBit = Long.SIZE - fromOffset; // number of bits contributed by the first long
+    final long fromMask = ~((1L << fromOffset) - 1);
+    final long toMask = (1L << (toOffset + 1)) - 1;
+    final long toVal = wmem_.getLong(DATA_OFFSET + ((long) toIndex << 3));
+    long result = (fromVal & fromMask) >>> fromOffset;
+    result |= (toVal & toMask) << splitBit;
+    return result;
+  }
+
+ @Override
+ long getLong(final int arrayIndex) {
if (isEmpty()) { return 0L; }
return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3));
}
@@ -119,37 +158,52 @@ public boolean isReadOnly() {
}
@Override
- void reset() {
+ public void reset() {
throw new SketchesReadOnlyException("Attempt to call reset() on read-only memory");
}
@Override
- void setBit(final long index) {
+ public void setBit(final long index) {
+ throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory");
+ }
+
+ @Override
+ public void clearBit(final long index) {
+ throw new SketchesReadOnlyException("Attempt to call clearBit() on read-only memory");
+ }
+
+ @Override
+ public void setBits(final long index, final int numBits, final long bits) {
+ throw new SketchesReadOnlyException("Attempt to call setBits() on read-only memory");
+ }
+
+ @Override
+  public void assignBit(final long index, final boolean value) {
-    throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory");
+    throw new SketchesReadOnlyException("Attempt to call assignBit() on read-only memory");
   }
@Override
- boolean getAndSetBit(final long index) {
+ public boolean getAndSetBit(final long index) {
throw new SketchesReadOnlyException("Attempt to call getAndSetBit() on read-only memory");
}
@Override
- void intersect(final BitArray other) {
+ public void intersect(final BitArray other) {
throw new SketchesReadOnlyException("Attempt to call intersect() on read-only memory");
}
@Override
- void union(final BitArray other) {
+ public void union(final BitArray other) {
throw new SketchesReadOnlyException("Attempt to call union() on read-only memory");
}
@Override
- void invert() {
+ public void invert() {
throw new SketchesReadOnlyException("Attempt to call invert() on read-only memory");
}
@Override
- protected void setLong(final int arrayIndex, final long value) {
+ void setLong(final int arrayIndex, final long value) {
throw new SketchesReadOnlyException("Attempt to call setLong() on read-only memory");
}
}
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java
similarity index 57%
rename from src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java
rename to src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java
index 4048b6775..ca81ae073 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import java.util.Arrays;
@@ -31,13 +31,13 @@
 * <p>Rounds the number of bits up to the smallest multiple of 64 (one long)
* that is not smaller than the specified number.
*/
-final class HeapBitArray extends BitArray {
+public final class HeapBitArray extends BitArray {
private long numBitsSet_; // if -1, need to recompute value
private boolean isDirty_;
final private long[] data_;
// creates an array of a given size
- HeapBitArray(final long numBits) {
+ public HeapBitArray(final long numBits) {
super();
if (numBits <= 0) {
@@ -54,7 +54,7 @@ final class HeapBitArray extends BitArray {
}
// uses the provided array
- HeapBitArray(final long numBitsSet, final long[] data) {
+ public HeapBitArray(final long numBitsSet, final long[] data) {
super();
data_ = data;
@@ -64,7 +64,7 @@ final class HeapBitArray extends BitArray {
// reads a serialized image, but the BitArray is not fully self-describing so requires
// a flag to indicate whether the array is empty
- static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) {
+ public static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) {
final int numLongs = buffer.getInt();
if (numLongs < 0) {
throw new SketchesArgumentException("Possible corruption: Must have strictly positive array size. Found: " + numLongs);
@@ -85,40 +85,124 @@ static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) {
}
@Override
- protected boolean isDirty() {
+ public boolean isDirty() {
return isDirty_;
}
@Override
- boolean hasMemory() {
+ public boolean hasMemory() {
return false;
}
@Override
- boolean isDirect() {
+ public boolean isDirect() {
return false;
}
@Override
- boolean isReadOnly() { return false; }
+ public boolean isReadOnly() { return false; }
// queries a single bit in the array
@Override
- boolean getBit(final long index) {
+ public boolean getBit(final long index) {
return (data_[(int) index >>> 6] & (1L << index)) != 0 ? true : false;
}
+  @Override
+  public long getBits(final long index, final int numBits) {
+    if (numBits < 0 || numBits > 64) {
+      throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)");
+    } else if (index + numBits > getCapacity()) {
+      throw new SketchesArgumentException("End of range exceeds capacity");
+    }
+    if (numBits == 0) { return 0; }
+
+    final long endBit = index + numBits - 1; // last bit of the range, inclusive
+
+    // shift before narrowing: (int) index >>> 6 would cast first and drop bits 32 and above
+    final int fromIndex = (int) (index >>> 6);
+    final int toIndex = (int) (endBit >>> 6);
+    final long fromOffset = index & 0x3F;
+    final long toOffset = endBit & 0x3F;
+
+    // within a single long
+    if (fromIndex == toIndex) {
+      final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L;
+      final long fromMask = (1L << fromOffset) - 1L;
+      return (data_[fromIndex] & (toMask - fromMask)) >>> fromOffset;
+    }
+
+    // spans longs, need to combine bits from two longs
+    final long splitBit = Long.SIZE - fromOffset; // number of bits contributed by the first long
+    final long fromMask = ~((1L << fromOffset) - 1);
+    final long toMask = (1L << (toOffset + 1)) - 1;
+    long result = (data_[fromIndex] & fromMask) >>> fromOffset;
+    result |= (data_[toIndex] & toMask) << splitBit;
+    return result;
+  }
+
// sets a single bit in the array without querying, meaning the method
// cannot properly track the number of bits set so set isDirty = true
@Override
- void setBit(final long index) {
+ public void setBit(final long index) {
- data_[(int) index >>> 6] |= 1L << index;
+ // Shift the 64-bit index before narrowing to int: "(int) index >>> 6" casts first and
+ // therefore writes to the wrong long for any index >= 2^31.
+ data_[(int) (index >>> 6)] |= 1L << index;
isDirty_ = true;
}
+  // clears a single bit in the array without querying, so the cached count of set bits
+  // can no longer be trusted and isDirty_ is raised
+  @Override
+  public void clearBit(final long index) {
+    // shift the 64-bit index before narrowing to int: "(int) index >>> 6" casts first and
+    // therefore writes to the wrong long for any index >= 2^31
+    data_[(int) (index >>> 6)] &= ~(1L << index);
+    isDirty_ = true;
+  }
+
+  // writes the given value into a single bit, delegating to setBit or clearBit;
+  // the previous value of the bit is never read
+  @Override
+  public void assignBit(final long index, final boolean value) {
+    if (!value) {
+      clearBit(index);
+      return;
+    }
+    setBit(index);
+  }
+
+  /**
+   * Overwrites up to 64 consecutive bits starting at {@code index} with the low-order
+   * {@code numBits} bits of {@code bits}; bit 0 of {@code bits} lands at {@code index}.
+   * Bits outside the range are left untouched. Marks the array dirty unless {@code numBits} is 0.
+   *
+   * @param index position of the first bit to write
+   * @param numBits number of bits to write, between 0 and 64 inclusive
+   * @param bits source of the bit values, right-aligned
+   * @throws SketchesArgumentException if {@code numBits} is outside [0, 64], if {@code index}
+   *     is negative, or if the range runs past the capacity
+   */
+  @Override
+  public void setBits(final long index, final int numBits, final long bits) {
+    if (numBits < 0 || numBits > 64) {
+      throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)");
+    } else if (index < 0) {
+      // a negative index would otherwise be narrowed to some unrelated array position
+      throw new SketchesArgumentException("index must not be negative");
+    } else if (index > getCapacity() - numBits) { // overflow-safe form of index + numBits > capacity
+      throw new SketchesArgumentException("End of range exceeds capacity");
+    }
+    if (numBits == 0) { return; }
+
+    isDirty_ = true;
+    final long endBit = index + numBits - 1;
+
+    // shift the 64-bit positions before narrowing: "(int) index >>> 6" would truncate first
+    final int fromIndex = (int) (index >>> 6);
+    final int toIndex = (int) (endBit >>> 6);
+    final long fromOffset = index & 0x3F;
+    final long toOffset = endBit & 0x3F;
+
+    // within a single long
+    if (fromIndex == toIndex) {
+      final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L;
+      final long fromMask = (1L << fromOffset) - 1L;
+      final long mask = toMask - fromMask; // exactly the bits [fromOffset, toOffset]
+      data_[fromIndex] = (data_[fromIndex] & ~mask) | ((bits << fromOffset) & mask);
+      return;
+    }
+
+    // spans longs, need to set bits in two longs
+    // (fromOffset is at least 1 here, so splitBit is at most 63 and toOffset is at most 62)
+    final long splitBit = Long.SIZE - fromOffset;
+    final long fromMask = (1L << fromOffset) - 1; // inverse mask: the bits below the range, to be kept
+    final long toMask = (1L << (toOffset + 1)) - 1;
+
+    data_[fromIndex] = (data_[fromIndex] & fromMask) | ((bits << fromOffset) & ~fromMask);
+    data_[toIndex] = (data_[toIndex] & ~toMask) | ((bits >>> splitBit) & toMask);
+  }
+
// returns existing value of bit
@Override
- boolean getAndSetBit(final long index) {
+ public boolean getAndSetBit(final long index) {
final int offset = (int) index >>> 6;
final long mask = 1L << index;
if ((data_[offset] & mask) != 0) {
@@ -134,7 +218,7 @@ boolean getAndSetBit(final long index) {
// O(1) if only getAndSetBit() has been used
// O(data_.length) if setBit() has ever been used
@Override
- long getNumBitsSet() {
+ public long getNumBitsSet() {
if (isDirty_) {
numBitsSet_ = 0;
for (final long val : data_) {
@@ -145,14 +229,14 @@ long getNumBitsSet() {
}
// Capacity in bits: the number of backing longs times Long.SIZE (widened to long before multiplying).
@Override
- long getCapacity() { return (long) data_.length * Long.SIZE; }
+ public long getCapacity() { return (long) data_.length * Long.SIZE; }
// Number of longs in the backing array.
@Override
- int getArrayLength() { return data_.length; }
+ public int getArrayLength() { return data_.length; }
// applies logical OR
@Override
- void union(final BitArray other) {
+ public void union(final BitArray other) {
if (getCapacity() != other.getCapacity()) {
throw new SketchesArgumentException("Cannot union bit arrays with unequal lengths");
}
@@ -168,7 +252,7 @@ void union(final BitArray other) {
// applies logical AND
@Override
- void intersect(final BitArray other) {
+ public void intersect(final BitArray other) {
if (getCapacity() != other.getCapacity()) {
throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths");
}
@@ -184,7 +268,7 @@ void intersect(final BitArray other) {
// applies bitwise inversion
@Override
- void invert() {
+ public void invert() {
if (isDirty_) {
numBitsSet_ = 0;
for (int i = 0; i < data_.length; ++i) {
@@ -200,7 +284,7 @@ void invert() {
}
}
- void writeToBuffer(final WritableBuffer wbuf) {
+ public void writeToBuffer(final WritableBuffer wbuf) {
wbuf.putInt(data_.length);
wbuf.putInt(0); // unused
@@ -211,18 +295,18 @@ void writeToBuffer(final WritableBuffer wbuf) {
}
// Returns the raw 64-bit word at the given position of the backing array (no bounds check of its own).
@Override
- protected long getLong(final int arrayIndex) {
+ public long getLong(final int arrayIndex) {
return data_[arrayIndex];
}
// Overwrites the raw 64-bit word at the given position of the backing array.
// NOTE(review): unlike setBit/clearBit/setBits this does not touch isDirty_ or numBitsSet_;
// confirm that callers account for the cached bit count themselves.
@Override
- protected void setLong(final int arrayIndex, final long value) {
+ public void setLong(final int arrayIndex, final long value) {
data_[arrayIndex] = value;
}
// clears the array
@Override
- void reset() {
+ public void reset() {
Arrays.fill(data_, 0);
numBitsSet_ = 0;
isDirty_ = false;
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java
new file mode 100644
index 000000000..53dfd1c4b
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+
+
+import org.apache.datasketches.memory.XxHash;
+
+
+/**
+ * Base class for filters that work on a 64-bit hash of each key.
+ *
+ * <p>The public {@link #insert(long)} and {@link #search(long)} methods hash the key with
+ * {@code XxHash.hashLong(key, 0L)} and hand the hash to the subclass hooks
+ * {@link #_insert(long)} and {@link #_search(long)}.</p>
+ */
+public abstract class Filter {
+
+  // TODO: earlier drafts of this class sketched further API that is not implemented yet:
+  // delete(long/String/byte[]), insert/search overloads for String and byte[], selectable hash
+  // types, rejuvenate/expand hooks, utilization / bits-per-entry metrics and binary
+  // pretty-printing helpers. Re-introduce them deliberately rather than as commented-out code.
+
+  /**
+   * Subclass hook that stores an already-hashed key.
+   *
+   * @param large_hash the hash of the key
+   * @return the outcome as defined by the subclass
+   */
+  protected abstract boolean _insert(long large_hash);
+
+  /**
+   * Subclass hook that looks up an already-hashed key.
+   *
+   * @param large_hash the hash of the key
+   * @return the outcome as defined by the subclass
+   */
+  protected abstract boolean _search(long large_hash);
+
+  /**
+   * Hashes the key and inserts the hash.
+   *
+   * @param input the key
+   * @return whatever {@link #_insert(long)} returns for the hash of {@code input}
+   */
+  public boolean insert(long input) {
+    return _insert(get_hash(input));
+  }
+
+  /**
+   * Hashes the key and searches for the hash.
+   *
+   * @param input the key
+   * @return whatever {@link #_search(long)} returns for the hash of {@code input}
+   */
+  public boolean search(long input) {
+    return _search(get_hash(input));
+  }
+
+  // hashes a key with the DataSketches XxHash function, always using seed 0 so that
+  // inserts and searches agree on the hash
+  long get_hash(long input) {
+    return XxHash.hashLong(input, 0L);
+  }
+
+  /**
+   * Returns the space used by the filter. This base implementation always returns 0;
+   * subclasses override it.
+   *
+   * @return 0
+   */
+  public long getSpaceUse() {
+    return 0;
+  }
+
+}
+
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java
new file mode 100644
index 000000000..e04e6cd12
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import java.util.ArrayDeque;
+import java.util.Queue;
+
+/**
+ * Cursor over the non-empty slots of a {@link QuotientFilter}, visited in increasing slot index
+ * from slot 0 up to the last slot, without wrapping around.
+ *
+ * <p>After each successful {@link #next()}, {@code fingerprint} holds the slot contents above the
+ * three flag bits and {@code bucket_index} holds the slot index the entry is attributed to
+ * (presumably the canonical slot of the entry's run; confirm against QuotientFilter).</p>
+ */
+public class Iterator {
+
+  QuotientFilter qf;
+  long index;         // next slot to examine
+  long bucket_index;  // -1 until the first run start has been seen
+  long fingerprint;   // -1 until the first entry has been returned
+  Queue<Long> s;      // indices of occupied slots that have been seen but not yet retired
+
+  Iterator(QuotientFilter new_qf) {
+    qf = new_qf;
+    s = new ArrayDeque<Long>();
+    index = 0;
+    bucket_index = -1;
+    fingerprint = -1;
+  }
+
+  // rewinds the cursor to slot 0 and forgets all pending occupied slots
+  void clear() {
+    s.clear();
+    index = 0;
+    bucket_index = -1;
+    fingerprint = -1;
+  }
+
+  /**
+   * Advances to the next non-empty slot and updates {@code bucket_index} and {@code fingerprint}.
+   *
+   * @return false once every slot has been examined, true otherwise
+   */
+  boolean next() {
+    final long numSlots = qf.getNumSlots();
+    if (index == numSlots) {
+      return false;
+    }
+
+    // skip empty slots, i.e. those with all three flag bits clear
+    long slot = qf.getSlot(index);
+    while ((slot & 7) == 0) {
+      index++;
+      if (index == numSlots) {
+        return false;
+      }
+      slot = qf.getSlot(index);
+    }
+    final boolean occupied = (slot & 1) != 0;
+    final boolean continuation = (slot & 2) != 0;
+    final boolean shifted = (slot & 4) != 0;
+
+    if (occupied && !continuation && !shifted) {
+      // an unshifted run start: the entry sits in the slot it is attributed to
+      s.clear();
+      s.add(index);
+      bucket_index = index;
+    }
+    else if (occupied && continuation && shifted) {
+      // still inside the current run; this slot is also occupied, so queue it
+      s.add(index);
+    }
+    else if (!occupied && !continuation && shifted) {
+      // a shifted run start: retire the previous queued slot and move to the next one
+      // NOTE(review): remove() throws and peek() yields null (NPE on unboxing) if the queue runs
+      // dry, e.g. when slot 0 lies in the middle of a cluster that wraps around the end
+      s.remove();
+      bucket_index = s.peek();
+    }
+    else if (!occupied && continuation && shifted) {
+      // still inside the current run, nothing to update
+    }
+    else if (occupied && !continuation && shifted) {
+      // a shifted run start on a slot that is itself occupied
+      s.add(index);
+      s.remove();
+      bucket_index = s.peek();
+    }
+    fingerprint = slot >> 3; // drop the three flag bits
+    index++;
+    return true;
+  }
+
+  // debugging aid: prints the cursor position and the current bucket index
+  void print() {
+    System.out.println("original slot: " + index + " " + bucket_index);
+  }
+
+}
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
new file mode 100644
index 000000000..b6ff86731
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
@@ -0,0 +1,569 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import static org.apache.datasketches.common.Util.LS;
+
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Queue;
+import java.util.Set;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.common.SketchesException;
+import org.apache.datasketches.filters.common.BitArray;
+import org.apache.datasketches.filters.common.HeapBitArray;
+
+/**
+ * Quotient filter stored in a packed bit array.
+ *
+ * <p>The filter has {@code 2^lgQ} slots. Each slot takes {@code numFingerprintBits + 3} bits:
+ * bit 0 is the is-occupied flag, bit 1 the is-continuation flag, bit 2 the is-shifted flag, and
+ * the remaining bits hold a fingerprint. A hash is split into a fingerprint (its low
+ * {@code numFingerprintBits} bits) and a canonical slot index (the next {@code lgQ} bits).
+ * Slot indices wrap around at the end of the table.</p>
+ *
+ * <p>When the number of entries reaches {@code numSlots * loadFactor} after an insert, the filter
+ * expands: the number of slots doubles and every fingerprint gives up its top bit to the slot
+ * index.</p>
+ */
+public class QuotientFilter extends Filter {
+
+  public static final float DEFAULT_LOAD_FACTOR = 0.8f;
+
+  int lgQ_;                 // log2 of the number of slots
+  int numFingerprintBits_;  // fingerprint bits stored in each slot
+  float loadFactor_;        // fraction of the slots that may fill up before expanding
+  int numEntries_;          // number of fingerprints currently stored
+  int numExpansions_;       // number of completed expansions
+  BitArray bitArray_;       // the packed slots
+
+  /**
+   * Creates a filter with the default load factor.
+   *
+   * @param lgQ log2 of the number of slots
+   * @param numFingerprintBits number of fingerprint bits per slot
+   */
+  public QuotientFilter(final int lgQ, final int numFingerprintBits) {
+    this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR);
+  }
+
+  /**
+   * Creates a filter. The arguments are not validated here.
+   *
+   * @param lgQ log2 of the number of slots
+   * @param numFingerprintBits number of fingerprint bits per slot
+   * @param loadFactor fraction of the slots that may fill up before the filter expands
+   */
+  public QuotientFilter(final int lgQ, final int numFingerprintBits, final float loadFactor) {
+    lgQ_ = lgQ;
+    numFingerprintBits_ = numFingerprintBits;
+    loadFactor_ = loadFactor;
+    // getNumSlots() and getNumBitsPerEntry() read the two fields assigned above
+    bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry());
+    numExpansions_ = 0;
+  }
+
+  /**
+   * Placeholder: currently does nothing.
+   *
+   * @param key ignored
+   * @return always false
+   */
+  public boolean rejuvenate(final long key) {
+    return false;
+  }
+
+  /**
+   * @return the number of fingerprints currently stored
+   */
+  public long getNumEntries() {
+    return numEntries_;
+  }
+
+  /**
+   * @return how many times the filter has expanded
+   */
+  public int getNumExpansions() {
+    return numExpansions_;
+  }
+
+  /**
+   * @return the entry count at which an insert triggers an expansion:
+   *     the number of slots times the load factor, truncated
+   */
+  public long getMaxEntriesBeforeExpansion() {
+    return (long)(getNumSlots() * loadFactor_);
+  }
+
+  // allocates the bit array that backs initSize slots of bitsPerEntry bits each
+  BitArray makeFilter(final long initSize, final int bitsPerEntry) {
+    return new HeapBitArray(initSize * bitsPerEntry);
+  }
+
+  /**
+   * @return the number of fingerprint bits per slot
+   */
+  public int getFingerprintLength() {
+    return numFingerprintBits_;
+  }
+
+  // Doubles the number of slots. Every stored entry is re-inserted into a filter with one more
+  // quotient bit and one less fingerprint bit: the top bit of the old fingerprint becomes the
+  // lowest bit of the new quotient. Throws SketchesException if fewer than 2 fingerprint bits remain.
+  void expand() {
+    if (getFingerprintLength() < 2) {
+      throw new SketchesException("for expansion value must have at least 2 bits");
+    }
+    final QuotientFilter other = new QuotientFilter(lgQ_ + 1, numFingerprintBits_ - 1, loadFactor_);
+
+    // start at the beginning of a cluster so that the first entry met belongs to an occupied slot
+    long i = 0;
+    if (!isSlotEmpty(i)) { i = findClusterStart(i); }
+
+    // canonical slots (quotients) whose runs have not been fully traversed yet, oldest first
+    final Queue<Long> fifo = new LinkedList<Long>();
+    long count = 0;
+    while (count < numEntries_) {
+      if (!isSlotEmpty(i)) {
+        if (isOccupied(i)) { fifo.add(i); }
+        final long fingerprint = getFingerprint(i);
+        final long newQuotient = (fifo.element() << 1) | (fingerprint >> other.getFingerprintLength());
+        final long newFingerprint = fingerprint & other.getFingerprintMask();
+        other.insert(newFingerprint, newQuotient);
+        count++;
+      }
+      i = (i + 1) & getSlotMask();
+      // the next slot does not continue the current run, so that run's quotient is finished
+      if (!fifo.isEmpty() && ! isContinuation(i)) { fifo.remove(); }
+    }
+    lgQ_++;
+    numFingerprintBits_--;
+    bitArray_ = other.bitArray_;
+    numExpansions_++;
+  }
+
+  /**
+   * @return log2 of the number of slots
+   */
+  public int getLgQ() {
+    return lgQ_;
+  }
+
+  /**
+   * @return the load factor given at construction
+   */
+  public float getLoadFactor() {
+    return loadFactor_;
+  }
+
+  /**
+   * @return the number of slots in the filter, {@code 2^lgQ}
+   */
+  public long getNumSlots() {
+    return 1L << lgQ_;
+  }
+
+  // mask that wraps a slot index into [0, numSlots)
+  long getSlotMask() {
+    return getNumSlots() - 1;
+  }
+
+  // mask that keeps the low numFingerprintBits bits of a value
+  long getFingerprintMask() {
+    return (1L << getFingerprintLength()) - 1;
+  }
+
+  // sets the metadata flag bits for a given slot index
+  void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, final long index) {
+    setOccupied(index, isOccupied);
+    setContinuation(index, isContinuation);
+    setShifted(index, isShifted);
+  }
+
+  // sets the fingerprint for a given slot index (the fingerprint starts after the 3 flag bits)
+  void setFingerprint(final long index, final long fingerprint) {
+    bitArray_.setBits(index * getNumBitsPerEntry() + 3, getFingerprintLength(), fingerprint);
+  }
+
+  /**
+   * Renders every bit of the filter as '0' or '1', slot by slot, with a space between the three
+   * flag bits and the fingerprint bits.
+   *
+   * @param vertical if true, each slot is put on its own line, prefixed with its index
+   * @return the rendered filter, terminated by a newline
+   */
+  public String getPrettyStr(final boolean vertical) {
+    final StringBuilder sbr = new StringBuilder();
+    final int bitsPerEntry = getNumBitsPerEntry();
+    final long numBits = getNumSlots() * bitsPerEntry;
+    for (long i = 0; i < numBits; i++) {
+      final long remainder = i % bitsPerEntry;
+      if (remainder == 0) { // first bit of a slot
+        final long slot = i / bitsPerEntry;
+        sbr.append(" ");
+        if (vertical) {
+          sbr.append("\n").append(String.format("%-10d", slot)).append("\t");
+        }
+      }
+      if (remainder == 3) { // first bit of the fingerprint
+        sbr.append(" ");
+      }
+      sbr.append(bitArray_.getBit(i) ? "1" : "0");
+    }
+    sbr.append("\n");
+    return sbr.toString();
+  }
+
+  /**
+   * Prints the one-slot-per-line rendering of the filter to standard output.
+   */
+  public void prettyPrint() {
+    System.out.print(getPrettyStr(true));
+  }
+
+  // return a fingerprint in a given slot index
+  long getFingerprint(final long index) {
+    return bitArray_.getBits(index * getNumBitsPerEntry() + 3, getFingerprintLength());
+  }
+
+  // return an entire slot representation, including metadata flags (low 3 bits) and fingerprint
+  long getSlot(final long index) {
+    return bitArray_.getBits(index * getNumBitsPerEntry(), getNumBitsPerEntry());
+  }
+
+  // compare a fingerprint input to the fingerprint in some slot index
+  protected boolean compare(final long index, final long fingerprint) {
+    return getFingerprint(index) == fingerprint;
+  }
+
+  // modify the flags and fingerprint of a given slot
+  void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted,
+      final long index, final long fingerprint) {
+    modifySlot(isOccupied, isContinuation, isShifted, index);
+    setFingerprint(index, fingerprint);
+  }
+
+  /**
+   * @return a multi-line summary of the filter's parameters and current load
+   */
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder();
+    final long slots = getNumSlots();
+    final long numBits = slots * getNumBitsPerEntry();
+    sb.append("***Quotient Filter Summary***").append(LS);
+    sb.append("lgQ: " + lgQ_).append(LS);
+    sb.append("FP length: " + getFingerprintLength()).append(LS);
+    sb.append("load factor: " + getLoadFactor()).append(LS);
+    sb.append("bits: " + numBits).append(LS);
+    sb.append("bits/entry: " + numBits / (double)numEntries_).append(LS);
+    sb.append("entries: " + numEntries_).append(LS);
+    sb.append("expansions: " + numExpansions_).append(LS);
+    sb.append("load: " + numEntries_ / (double)(slots)).append(LS);
+    sb.append("*********End Summary*********").append(LS);
+    return sb.toString();
+  }
+
+  /**
+   * @return the number of bits used for the filter: slots times bits per slot
+   */
+  @Override
+  public long getSpaceUse() {
+    return getNumSlots() * getNumBitsPerEntry();
+  }
+
+  /**
+   * @return the number of bits per slot: the fingerprint bits plus 3 flag bits
+   */
+  public int getNumBitsPerEntry() {
+    return numFingerprintBits_ + 3;
+  }
+
+  // flag bit 0 of the slot
+  boolean isOccupied(final long index) {
+    return bitArray_.getBit(index * getNumBitsPerEntry());
+  }
+
+  // flag bit 1 of the slot
+  boolean isContinuation(final long index) {
+    return bitArray_.getBit(index * getNumBitsPerEntry() + 1);
+  }
+
+  // flag bit 2 of the slot
+  boolean isShifted(final long index) {
+    return bitArray_.getBit(index * getNumBitsPerEntry() + 2);
+  }
+
+  void setOccupied(final long index, final boolean val) {
+    bitArray_.assignBit(index * getNumBitsPerEntry(), val);
+  }
+
+  void setContinuation(final long index, final boolean val) {
+    bitArray_.assignBit(index * getNumBitsPerEntry() + 1, val);
+  }
+
+  void setShifted(final long index, final boolean val) {
+    bitArray_.assignBit(index * getNumBitsPerEntry() + 2, val);
+  }
+
+  // a slot is empty when all three of its flags are clear
+  boolean isSlotEmpty(final long index) {
+    return !isOccupied(index) && !isContinuation(index) && !isShifted(index);
+  }
+
+  // scan the cluster leftwards until finding the start of the cluster and returning its slot index
+  // used by deletes
+  long findClusterStart(long index) {
+    while (isShifted(index)) {
+      index = (index - 1) & getSlotMask();
+    }
+    return index;
+  }
+
+  // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides
+  // since the run might have been shifted to the right due to collisions
+  long findRunStart(long index) {
+    // walk left to the cluster start, counting the occupied slots passed on the way:
+    // each of them owns a run that is stored before the run of slot A
+    int numRunsToSkip = 0;
+    while (isShifted(index)) {
+      index = (index - 1) & getSlotMask();
+      if (isOccupied(index)) {
+        numRunsToSkip++;
+      }
+    }
+    // walk right again, skipping that many run starts
+    while (numRunsToSkip > 0) {
+      index = (index + 1) & getSlotMask();
+      if (!isContinuation(index)) {
+        numRunsToSkip--;
+      }
+    }
+    return index;
+  }
+
+  // given the start of a run, scan the run and return the index of the first matching fingerprint
+  // if not found returns the insertion position as bitwise complement to make it negative
+  // (the scan stops early at the first larger fingerprint, which relies on runs being kept sorted)
+  long findFirstFingerprintInRun(long index, final long fingerprint) {
+    assert !isContinuation(index);
+    do {
+      final long fingerprintAtIndex = getFingerprint(index);
+      if (fingerprintAtIndex == fingerprint) {
+        return index;
+      } else if (fingerprintAtIndex > fingerprint) {
+        return ~index;
+      }
+      index = (index + 1) & getSlotMask();
+    } while (isContinuation(index));
+    return ~index;
+  }
+
+  // given the start of a run, return the index of the last matching fingerprint in the run,
+  // or -1 if the run holds no matching fingerprint
+  long decideWhichFingerprintToDelete(long index, final long fingerprint) {
+    assert !isContinuation(index);
+    long matchingFingerprintIndex = -1;
+    do {
+      if (compare(index, fingerprint)) {
+        matchingFingerprintIndex = index;
+      }
+      index = (index + 1) & getSlotMask();
+    } while (isContinuation(index));
+    return matchingFingerprintIndex;
+  }
+
+  // given the start of a run, find the last slot index that still belongs to this run
+  long findRunEnd(long index) {
+    while (isContinuation((index + 1) & getSlotMask())) {
+      index = (index + 1) & getSlotMask();
+    }
+    return index;
+  }
+
+  // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it
+  boolean search(final long fingerprint, final long index) {
+    if (!isOccupied(index)) {
+      return false;
+    }
+    final long runStartIndex = findRunStart(index);
+    final long foundIndex = findFirstFingerprintInRun(runStartIndex, fingerprint);
+    return foundIndex >= 0;
+  }
+
+  // Given a canonical slot index, find the corresponding run and return all fingerprints in the run.
+  // This method is only used for testing purposes.
+  Set<Long> getAllFingerprints(final long bucketIndex) {
+    final boolean doesRunExist = isOccupied(bucketIndex);
+    final HashSet<Long> set = new HashSet<Long>();
+    if (!doesRunExist) {
+      return set;
+    }
+    long runIndex = findRunStart(bucketIndex);
+    do {
+      set.add(getFingerprint(runIndex));
+      runIndex = (runIndex + 1) & getSlotMask();
+    } while (isContinuation(runIndex));
+    return set;
+  }
+
+  // Inserts a fingerprint into the run of the given canonical slot.
+  // Returns false, without changing anything, if the slot index is out of range, if every slot
+  // is already in use, or if the run already holds the same fingerprint; returns true otherwise.
+  boolean insert(final long fingerprint, final long index) {
+    if (index >= getNumSlots() || numEntries_ == getNumSlots()) {
+      return false;
+    }
+    final long runStart = findRunStart(index);
+    if (!isOccupied(index)) {
+      // no run exists for this canonical slot yet: start a new one
+      insertFingerprintAndPushAllElse(fingerprint, runStart, index, true, true);
+      return true;
+    }
+    final long foundIndex = findFirstFingerprintInRun(runStart, fingerprint);
+    if (foundIndex >= 0) {
+      return false;
+    }
+    // ~foundIndex is the insertion position reported by findFirstFingerprintInRun
+    insertFingerprintAndPushAllElse(fingerprint, ~foundIndex, index, false, ~foundIndex == runStart);
+    return true;
+  }
+
+  // Writes the fingerprint at slot "index" and pushes every following entry of the cluster one
+  // slot to the right until an empty slot absorbs the last one. "canonical" is the slot the
+  // fingerprint hashes to, "isNewRun" tells whether this entry starts a run that did not exist,
+  // and "isRunStart" tells whether the entry becomes the first one of its run.
+  void insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical,
+      final boolean isNewRun, final boolean isRunStart) {
+    // in the first shifted entry set isContinuation flag if inserting at the start of the existing run
+    // otherwise just shift the existing flag as it is
+    boolean forceContinuation = !isNewRun && isRunStart;
+
+    // prepare flags for the current slot
+    boolean isContinuation = !isRunStart;
+    boolean isShifted = index != canonical;
+
+    // remember the existing entry from the current slot to be shifted to the next slot
+    // isOccupied flag belongs to the slot, therefore it is never shifted
+    // isShifted flag is always true for all shifted entries, no need to remember it
+    long existingFingerprint = getFingerprint(index);
+    boolean existingIsContinuation = isContinuation(index);
+
+    while (!isSlotEmpty(index)) {
+      // set the current slot
+      setFingerprint(index, fingerprint);
+      setContinuation(index, isContinuation);
+      setShifted(index, isShifted);
+
+      // prepare values for the next slot
+      fingerprint = existingFingerprint;
+      isContinuation = existingIsContinuation | forceContinuation;
+      isShifted = true;
+
+      index = (index + 1) & getSlotMask();
+
+      // remember the existing entry to be shifted
+      existingFingerprint = getFingerprint(index);
+      existingIsContinuation = isContinuation(index);
+
+      forceContinuation = false; // this is needed for the first shift only
+    }
+    // at this point the current slot is empty, so just populate with prepared values
+    // either the incoming fingerprint or the last shifted one
+    setFingerprint(index, fingerprint);
+    setContinuation(index, isContinuation);
+    setShifted(index, isShifted);
+
+    if (isNewRun) {
+      setOccupied(canonical, true);
+    }
+    numEntries_++;
+  }
+
+  // Removes the entry at matchingFingerprintIndex from the run that starts at runStartIndex and
+  // belongs to canonicalSlot, then pulls the following shifted runs of the cluster back by one slot.
+  // Always returns true. The entry counter is not touched here; _delete() decrements it.
+  boolean delete(final long canonicalSlot, final long runStartIndex, final long matchingFingerprintIndex) {
+    long runEnd = findRunEnd(matchingFingerprintIndex);
+
+    // the run has only one entry, we need to disable its is_occupied flag
+    // we just remember we need to do this here, and we do it later to not interfere with counts
+    final boolean turnOffOccupied = runStartIndex == runEnd;
+
+    // First thing to do is move everything else in the run back by one slot
+    for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getSlotMask()) {
+      final long f = getFingerprint((i + 1) & getSlotMask());
+      setFingerprint(i, f);
+    }
+
+    // for each slot, we want to know by how much the entry there is shifted
+    // we can do this by counting the number of continuation flags set to true
+    // and the number of occupied flags set to false from the start of the cluster to the given cell
+    // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted
+    final long clusterStart = findClusterStart(canonicalSlot);
+    long numShiftedCount = 0;
+    long numNonOccupied = 0;
+    for (long i = clusterStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) {
+      if (isContinuation(i)) {
+        numShiftedCount++;
+      }
+      if (!isOccupied(i)) {
+        numNonOccupied++;
+      }
+    }
+
+    // the last slot of the run is now free
+    setFingerprint(runEnd, 0);
+    setShifted(runEnd, false);
+    setContinuation(runEnd, false);
+
+    // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster.
+    // the inner for loop iterates over cells of particular runs, pushing entries one slot back.
+    do {
+      // we first check if the next run actually exists and if it is shifted.
+      // only if both conditions hold, we need to shift it back one slot.
+      if (isSlotEmpty((runEnd + 1) & getSlotMask()) || !isShifted((runEnd + 1) & getSlotMask())) {
+        if (turnOffOccupied) {
+          // if we eliminated a run and now need to turn the isOccupied flag off, we do it at the end to not interfere in our counts
+          setOccupied(canonicalSlot, false);
+        }
+        return true;
+      }
+
+      // we now find the start and end of the next run
+      final long nextRunStart = (runEnd + 1) & getSlotMask();
+      runEnd = findRunEnd(nextRunStart);
+
+      // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot
+      // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot,
+      // meaning it is now back in its proper place
+      if (isOccupied((nextRunStart - 1) & getSlotMask()) && numShiftedCount - numNonOccupied == 1) {
+        setShifted((nextRunStart - 1) & getSlotMask(), false);
+      } else {
+        setShifted((nextRunStart - 1) & getSlotMask(), true);
+      }
+
+      for (long i = nextRunStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) {
+        final long f = getFingerprint(i);
+        setFingerprint((i - 1) & getSlotMask(), f);
+        if (isContinuation(i)) {
+          setContinuation((i - 1) & getSlotMask(), true);
+        }
+        if (!isOccupied(i)) {
+          numNonOccupied++;
+        }
+        if (i != nextRunStart) {
+          numShiftedCount++;
+        }
+      }
+      // the last slot of the run that was just moved is now free
+      setFingerprint(runEnd, 0);
+      setShifted(runEnd, false);
+      setContinuation(runEnd, false);
+    } while (true);
+  }
+
+  // Deletes one entry with the given fingerprint from the run of the given canonical slot.
+  // Returns false if the run does not exist or holds no matching fingerprint.
+  boolean delete(final long fingerprint, final long canonicalSlot) {
+    // if the run doesn't exist, the key can't have possibly been inserted
+    final boolean doesRunExist = isOccupied(canonicalSlot);
+    if (!doesRunExist) {
+      return false;
+    }
+    final long runStartIndex = findRunStart(canonicalSlot);
+    final long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint);
+    if (matchingFingerprintIndex == -1) {
+      // we didn't find a matching fingerprint
+      return false;
+    }
+    return delete(canonicalSlot, runStartIndex, matchingFingerprintIndex);
+  }
+
+  // the canonical slot is taken from the lgQ bits just above the fingerprint bits of the hash
+  long getSlotFromHash(final long largeHash) {
+    return (largeHash >> getFingerprintLength()) & getSlotMask();
+  }
+
+  // the fingerprint is taken from the lowest bits of the hash
+  long getFingerprintFromHash(final long largeHash) {
+    return largeHash & getFingerprintMask();
+  }
+
+  /**
+   * Inserts an already-hashed key. The public {@code insert(long)} of the base class hashes the
+   * key and then calls this method, so {@code largeHash} is a hash value, not a raw key.
+   * If the number of entries equals the expansion threshold after the insert, the filter expands.
+   *
+   * @param largeHash the hash of the key
+   * @return false if the filter is full or already holds the same fingerprint in the same run,
+   *     true if the fingerprint was stored
+   */
+  @Override
+  protected boolean _insert(final long largeHash) {
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    final boolean success = insert(fingerprint, slotIndex);
+
+    if (numEntries_ == getMaxEntriesBeforeExpansion()) {
+      expand();
+    }
+    return success;
+  }
+
+  /**
+   * Deletes an already-hashed key and, on success, decrements the entry count.
+   *
+   * @param largeHash the hash of the key
+   * @return true if a matching fingerprint was found and removed
+   */
+  protected boolean _delete(final long largeHash) {
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    final boolean success = delete(fingerprint, slotIndex);
+    if (success) {
+      numEntries_--;
+    }
+    return success;
+  }
+
+  /**
+   * Looks up an already-hashed key.
+   *
+   * @param largeHash the hash of the key
+   * @return true if the run of the key's canonical slot holds the key's fingerprint
+   */
+  @Override
+  protected boolean _search(final long largeHash) {
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    return search(fingerprint, slotIndex);
+  }
+
+  /**
+   * @param offset a bit position in the underlying bit array
+   * @return the raw bit at that position
+   */
+  public boolean getBitAtOffset(final int offset) {
+    return bitArray_.getBit(offset);
+  }
+
+  /**
+   * Inserts every entry of another filter into this one.
+   *
+   * @param other the filter to read from; it is not modified
+   * @throws SketchesArgumentException if the two filters do not use the same total number of
+   *     hash bits (lgQ plus fingerprint bits)
+   */
+  public void merge(final QuotientFilter other) {
+    if (lgQ_ + numFingerprintBits_ != other.lgQ_ + other.numFingerprintBits_) {
+      throw new SketchesArgumentException("incompatible sketches in merge");
+    }
+    // start at the beginning of a cluster so that the first entry met belongs to an occupied slot
+    long i = 0;
+    if (!other.isSlotEmpty(i)) { i = other.findClusterStart(i); }
+
+    // canonical slots (quotients) of the other filter whose runs have not been fully traversed yet
+    final Queue<Long> fifo = new LinkedList<Long>();
+    long count = 0;
+    while (count < other.numEntries_) {
+      if (!other.isSlotEmpty(i)) {
+        if (other.isOccupied(i)) { fifo.add(i); }
+        final long quotient = fifo.element();
+        final long fingerprint = other.getFingerprint(i);
+        // rebuild the hash bits the entry was derived from and let this filter split them its own way
+        final long hash = quotient << other.getFingerprintLength() | fingerprint;
+        _insert(hash);
+        count++;
+      }
+      i = (i + 1) & other.getSlotMask();
+      // the next slot does not continue the current run, so that run's quotient is finished
+      if (!fifo.isEmpty() && ! other.isContinuation(i)) { fifo.remove(); }
+    }
+  }
+}
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
new file mode 100644
index 000000000..6a1b83a81
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR;
+import org.apache.datasketches.common.SketchesArgumentException;
+
+/**
+ * This class provides methods to help estimate the correct parameters when
+ * creating a Quotient filter, and methods to create the filter using those values.
+ *
+ * <p>The underlying math is described in the
+ * <a href="https://en.wikipedia.org/wiki/Quotient_filter">Wikipedia article on Quotient filters</a>.
+ */
+public final class QuotientFilterBuilder {
+
+ /**
+  * Suggests the fingerprint length, in bits, for a target false positive probability.
+  *
+  * <p>The fingerprint length is related to the targetFalsePositiveProb roughly by
+  * 2^(-fingerprint_length), so the suggestion is ceil(log2(1 / targetFalsePositiveProb)).
+  * This, after rounding up, is the same as the more sophisticated expression which involves the capacity
+  * from https://en.wikipedia.org/wiki/Quotient_filter#Probability_of_false_positives.
+  *
+  * @param targetFalsePositiveProb A desired false positive probability per item, strictly between 0 and 1
+  * @return The suggested fingerprint length in bits
+  * @throws SketchesArgumentException if targetFalsePositiveProb is NaN or not strictly between
+  *     0 and 1, or if it is so small that the length does not fit in a byte
+  */
+ public static byte suggestFingerprintLength(final double targetFalsePositiveProb) {
+   // Negated conjunction so that NaN, which fails every comparison, is rejected as well.
+   if (!(targetFalsePositiveProb > 0. && targetFalsePositiveProb < 1.)) {
+     throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+   }
+   final double numBits = Math.ceil(-Math.log(targetFalsePositiveProb) / Math.log(2));
+   // Probabilities below 2^-127 need more bits than a byte holds; a bare cast would wrap silently.
+   if (numBits > Byte.MAX_VALUE) {
+     throw new SketchesArgumentException("targetFalsePositiveProb is too small: fingerprint length "
+         + (long) numBits + " exceeds " + Byte.MAX_VALUE + " bits");
+   }
+   return (byte) numBits;
+ }
+
+ /**
+  * This method suggests the number of slots in the filter for a given input size and load factor.
+  * There is no load factor checking internally within the filter, so this method is used to map between the
+  * number of items we insert into a sketch and the number of slots we need to allocate.
+  * A design feature of Niv's implementation is that 2^j +2*j slots are allocated. This asymptotically approaches
+  * 2^j slots as j grows, and the canonical number of slots is 2^j. Therefore, we will only check against
+  * loadFactor*2^j slots.
+  * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter.
+  * @param loadFactor The fraction of the slots that may be filled; must be larger than 0.
+  * @return The log-base-2 of the number of slots in the filter.
+  * @throws SketchesArgumentException if maxDistinctItems is not strictly positive, if loadFactor is
+  *     NaN or not larger than 0, or if the result would be 31 or more
+  */
+ public static byte suggestLgNumSlots(final long maxDistinctItems, final float loadFactor) {
+   if (maxDistinctItems <= 0) {
+     throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+   }
+   // Negated comparison so that NaN is rejected together with zero and negative values;
+   // without this the logarithm below yields NaN or infinity and the cast returns garbage.
+   if (!(loadFactor > 0)) {
+     throw new SketchesArgumentException("loadFactor must be larger than 0");
+   }
+   // Range-check the double before narrowing: casting first could wrap a huge or infinite
+   // value into the accepted range.
+   final double lgNumSlots = Math.ceil(Math.log(maxDistinctItems / loadFactor) / Math.log(2));
+   if (lgNumSlots >= 31) {
+     // Largest address space for a Java array is 2^31 - 1
+     throw new SketchesArgumentException("Largest address space for a Java array is 2^31 - 1");
+   }
+   return (byte) lgNumSlots;
+ }
+
+ /**
+  * Suggests the log-base-2 number of slots for the given number of items, using
+  * {@code DEFAULT_LOAD_FACTOR} as the load factor.
+  * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter.
+  * @return The result of the two-argument overload called with {@code DEFAULT_LOAD_FACTOR}.
+  */
+ public static byte suggestLgNumSlots(final long maxDistinctItems) {
+   final byte lgNumSlots = suggestLgNumSlots(maxDistinctItems, DEFAULT_LOAD_FACTOR);
+   return lgNumSlots;
+ }
+
+ /*
+ Returns the largest number of unique items that can be inserted into the filter.
+ We use a predefined load factor of 0.9 compared to the number of slots as 2^j.
+ @param lgNumSlots The log-base-2 of the number of slots in the filter
+ @return The maximum number of items that can be inserted into the filter
+ */
+ public static long suggestMaxNumItemsFromNumSlots(int lgNumSlots, float loadFactor) {
+ if (lgNumSlots <= 0) {
+ throw new SketchesArgumentException("lgNumSlots must be at least 1.");
+ } else if (lgNumSlots >= 31) {
+ throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1.");
+ }
+ return (long) (loadFactor * (1L<= 1.0) {
+ throw new SketchesArgumentException("loadFactor must be larger than 0 and less than 1");
+ }
+ if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+ throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+ }
+ }
+
+ /**
+  * Immutable value holder for the two parameters that configure a Quotient filter:
+  * the log-base-2 of the number of slots (lgNumSlots) and the fingerprint length.
+  */
+ public static class QFPair {
+   /** The log-base-2 of the number of slots. */
+   public final byte lgNumSlots;
+   /** The fingerprint length. */
+   public final byte fingerprintLength;
+
+   /**
+    * Stores the given pair of parameters unchanged.
+    * @param lgNumSlots the log-base-2 of the number of slots
+    * @param fingerprintLength the fingerprint length
+    */
+   public QFPair(final byte lgNumSlots, final byte fingerprintLength) {
+     this.fingerprintLength = fingerprintLength;
+     this.lgNumSlots = lgNumSlots;
+   }
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java
index 4b1461876..c204751f2 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketches.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketches.java
@@ -80,7 +80,7 @@ public static int getMaxAnotBResultBytes(final int maxNomEntries) {
/**
* Returns the maximum number of storage bytes required for a CompactSketch with the given
- * number of actual entries. Note that this assumes the worse case of the sketch in
+ * number of actual entries. Note that this assumes the worst case of the sketch in
* estimation mode, which requires storing theta and count.
* @param numberOfEntries the actual number of entries stored with the CompactSketch.
* @return the maximum number of storage bytes required for a CompactSketch with the given number
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java
similarity index 85%
rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java
rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java
index 521019e62..ea02ad21a 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java
+++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
@@ -99,6 +99,27 @@ public void basicOperationTest() {
assertTrue(dba.isReadOnly());
}
+ /**
+  * Reads bit ranges through a read-only wrapper: a whole long, ranges inside one long,
+  * and a range that straddles the boundary between the two longs.
+  */
+ @Test
+ public void getBitsFromToTest() {
+   final long alternating = 0x5555555555555555L;
+   // bits 0-13 and 26-63 set, bits 14-25 clear
+   final long onesWithGap = 0xFFFFFFFFFC003FFFL;
+
+   final HeapBitArray source = new HeapBitArray(128);
+   source.setBit(1); // will override, but this forces non-empty
+   source.setLong(0, alternating);
+   source.setLong(1, onesWithGap);
+   final Memory image = bitArrayToMemory(source);
+   final DirectBitArrayR readOnly = DirectBitArrayR.wrap(image, source.isEmpty());
+
+   // one complete long
+   assertEquals(readOnly.getBits(0, 64), alternating);
+
+   // ranges within the second long: the whole long, only the cleared stretch,
+   // then the cleared stretch plus one set bit on either side (2^13 + 1)
+   assertEquals(readOnly.getBits(64, 64), onesWithGap);
+   assertEquals(readOnly.getBits(78, 12), 0);
+   assertEquals(readOnly.getBits(77, 14), 8193);
+
+   // top 4 bits of the first long followed by the low 16 bits of the second
+   assertEquals(readOnly.getBits(60, 20), 0x3FFF5);
+ }
+
@Test
public void countBitsWhenDirty() {
// like basicOperationTest but with setBit which does
@@ -159,6 +180,9 @@ public void checkInvalidMethods() {
// all of these try to modify a read-only memory
assertThrows(SketchesReadOnlyException.class, () -> dba.setBit(14));
+ assertThrows(SketchesReadOnlyException.class, () -> dba.clearBit(7));
+ assertThrows(SketchesReadOnlyException.class, () -> dba.assignBit(924, false));
+ assertThrows(SketchesReadOnlyException.class, () -> dba.setBits(100, 30, 0xFF));
assertThrows(SketchesReadOnlyException.class, () -> dba.getAndSetBit(100));
assertThrows(SketchesReadOnlyException.class, () -> dba.reset());
assertThrows(SketchesReadOnlyException.class, () -> dba.invert());
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java
similarity index 78%
rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java
rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java
index a45bcbb82..4cc229c50 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java
+++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
@@ -68,6 +68,8 @@ public void tooSmallCapacityTest() {
}
// no text of max size because the BitArray allows up to Integer.MAX_VALUE
+ // bits, which is the maximum size of an array in Java -- can't use it all,
+ // (need 2 longs for preamble) but also can't allocate that large to test on most machines
@Test
public void initializeTooSmallTest() {
@@ -134,6 +136,70 @@ public void basicWritableWrapTest() {
dba.setBit(100);
assertTrue(dba.getAndSetBit(100));
assertEquals(dba.getNumBitsSet(), 8);
+
+ dba.reset();
+ assertTrue(dba.isEmpty());
+ assertEquals(dba.getNumBitsSet(), 0);
+
+ dba.setBit(0);
+ dba.setLong(0, -1);
+ assertTrue(dba.getBit(60));
+ dba.clearBit(60);
+ assertFalse(dba.getBit(60));
+
+ assertTrue(dba.getBit(35));
+ dba.assignBit(35, false);
+ assertFalse(dba.getBit(35));
+ dba.assignBit(35, true);
+ assertTrue(dba.getBit(35));
+ }
+
+ /**
+  * Reads bit ranges from a writable direct bit array: a whole long, ranges inside one
+  * long, and a range that straddles the boundary between the two longs.
+  */
+ @Test
+ public void getBitsFromToTest() {
+   final long alternating = 0x5555555555555555L;
+   // bits 0-13 and 26-63 set, bits 14-25 clear
+   final long onesWithGap = 0xFFFFFFFFFC003FFFL;
+
+   final WritableMemory backing = WritableMemory.writableWrap(new byte[32]);
+   final DirectBitArray bits = DirectBitArray.initialize(128, backing);
+
+   // one complete long; the second long has not been written yet
+   bits.setBit(0); // useless but forces non-empty when using setLong()
+   bits.setLong(0, alternating);
+   assertEquals(bits.getBits(0, 64), alternating);
+   assertEquals(bits.getBits(64, 64), 0);
+
+   // ranges within the second long: the whole long, only the cleared stretch,
+   // then the cleared stretch plus one set bit on either side (2^13 + 1)
+   bits.setLong(1, onesWithGap);
+   assertEquals(bits.getBits(64, 64), onesWithGap);
+   assertEquals(bits.getBits(78, 12), 0);
+   assertEquals(bits.getBits(77, 14), 8193);
+
+   // top 4 bits of the first long followed by the low 16 bits of the second
+   assertEquals(bits.getBits(60, 20), 0x3FFF5);
+ }
+
+ /**
+  * Writes bit ranges into a writable direct bit array: a whole long, a range inside one
+  * long, a range straddling two longs, and one regression case on a larger array.
+  */
+ @Test
+ public void setBitsFromToTest() {
+   final WritableMemory smallMem = WritableMemory.writableWrap(new byte[32]);
+   final DirectBitArray small = DirectBitArray.initialize(128, smallMem);
+
+   // fill exactly the first long
+   small.setBits(0, 64, 0x80000000DAB8C730L);
+   assertEquals(small.getLong(0), 0x80000000DAB8C730L);
+   assertEquals(small.getLong(1), 0);
+
+   // overwrite one byte (bits 40-47) inside the first long
+   small.setBits(40, 8, 0xA6);
+   assertEquals(small.getLong(0), 0x8000A600DAB8C730L);
+
+   // 20 bits starting at bit 60: 4 land in the first long, 16 in the second
+   small.setBits(60, 20, 0x3FFF5);
+   assertEquals(small.getLong(0), 0x5000A600DAB8C730L);
+   assertEquals(small.getLong(1), 0x3FFFL);
+
+   // found specific failure with this test
+   final WritableMemory largeMem = WritableMemory.writableWrap(new byte[1272]);
+   final DirectBitArray large = DirectBitArray.initialize(10000, largeMem);
+   final int regressionOffset = 601 * 10 + 3;
+   large.setBits(regressionOffset, 7, 125);
+   assertEquals(large.getBits(regressionOffset, 7), 125);
}
@Test
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java
similarity index 79%
rename from src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java
rename to src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java
index 0e91788ea..a55f98a30 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java
+++ b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.datasketches.filters.bloomfilter;
+package org.apache.datasketches.filters.common;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
@@ -75,9 +75,62 @@ public void basicOperationTest() {
assertTrue(ba.isEmpty());
assertEquals(ba.getNumBitsSet(), 0);
+ ba.setLong(0, -1);
+ assertTrue(ba.getBit(60));
+ ba.clearBit(60);
+ assertFalse(ba.getBit(60));
+
+ assertTrue(ba.getBit(35));
+ ba.assignBit(35, false);
+ assertFalse(ba.getBit(35));
+ ba.assignBit(35, true);
+ assertTrue(ba.getBit(35));
+
assertTrue(String.valueOf(ba).length() > 0);
}
+ /**
+  * Reads bit ranges from a heap bit array: a whole long, ranges inside one long, and a
+  * range that straddles the boundary between the two longs.
+  */
+ @Test
+ public void getBitsFromToTest() {
+   final long alternating = 0x5555555555555555L;
+   // bits 0-13 and 26-63 set, bits 14-25 clear
+   final long onesWithGap = 0xFFFFFFFFFC003FFFL;
+   final HeapBitArray bits = new HeapBitArray(128);
+
+   // one complete long; the second long has not been written yet
+   bits.setLong(0, alternating);
+   assertEquals(bits.getBits(0, 64), alternating);
+   assertEquals(bits.getBits(64, 64), 0);
+
+   // ranges within the second long: the whole long, only the cleared stretch,
+   // then the cleared stretch plus one set bit on either side (2^13 + 1)
+   bits.setLong(1, onesWithGap);
+   assertEquals(bits.getBits(64, 64), onesWithGap);
+   assertEquals(bits.getBits(78, 12), 0);
+   assertEquals(bits.getBits(77, 14), 8193);
+
+   // top 4 bits of the first long followed by the low 16 bits of the second
+   assertEquals(bits.getBits(60, 20), 0x3FFF5);
+ }
+
+ /**
+  * Writes bit ranges into a heap bit array: a whole long, a range inside one long, a
+  * range straddling two longs, and one regression case on a larger array.
+  */
+ @Test
+ public void setBitsFromToTest() {
+   final HeapBitArray small = new HeapBitArray(128);
+
+   // fill exactly the first long
+   small.setBits(0, 64, 0x80000000DAB8C730L);
+   assertEquals(small.getLong(0), 0x80000000DAB8C730L);
+   assertEquals(small.getLong(1), 0);
+
+   // overwrite one byte (bits 40-47) inside the first long
+   small.setBits(40, 8, 0xA6);
+   assertEquals(small.getLong(0), 0x8000A600DAB8C730L);
+
+   // 20 bits starting at bit 60: 4 land in the first long, 16 in the second
+   small.setBits(60, 20, 0x3FFF5);
+   assertEquals(small.getLong(0), 0x5000A600DAB8C730L);
+   assertEquals(small.getLong(1), 0x3FFFL);
+
+   // found specific failure with this test
+   final HeapBitArray large = new HeapBitArray(10000);
+   final int regressionOffset = 601 * 10 + 3;
+   large.setBits(regressionOffset, 7, 125);
+   assertEquals(large.getBits(regressionOffset, 7), 125);
+ }
+
@Test
public void bitAddresOutOfBoundsTest() {
final HeapBitArray ba = new HeapBitArray(1024);
diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java
new file mode 100644
index 000000000..432e5a6df
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import org.testng.annotations.Test;
+import static org.testng.Assert.assertTrue;
+
+import java.util.BitSet;
+
+/**
+ * Tests for QuotientFilter.delete. Each test builds a small filter with explicit
+ * (fingerprint, slot) inserts, deletes some entries, and compares the filter against a
+ * hand-built BitSet of the expected slot contents.
+ *
+ * NOTE(review): the three boolean arguments of QuotientFilterTest.set_slot_in_test are
+ * presumably the occupied / continuation / shifted metadata bits, in that order - confirm
+ * against that helper, which is not shown here.
+ */
+public class DeletionTests {
+
+ /**
+ * Deletes every entry of an overflowing run and checks what is left.
+ * The test works by:
+ * 1. Inserting five fingerprints into slot 1 (more than one slot can hold), plus one
+ *    fingerprint each into slots 2 and 4.
+ * 2. Deleting all five fingerprints that were inserted into slot 1.
+ * 3. Checking that only fp2 and fp3 remain, stored at slots 2 and 4, the slots they
+ *    were inserted with.
+ */
+ @Test
+ static public void BasicDeletions() {
+ int fingerprint_len_bits = 5;
+ int num_entries_power = 3;
+ // number of slots in the filter (2^num_entries_power); sizes the expected BitSet below
+ int num_entries = 1 << num_entries_power;
+ QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits);
+
+ long fp1 = 1 << 4;
+ long fp2 = 1 << 3;
+ long fp3 = 1 << 2;
+ long fp4 = 31;
+
+ // five entries for slot 1, interleaved with one entry for slot 2, then one for slot 4
+ qf.insert(fp4, 1);
+ qf.insert(fp1, 1);
+ qf.insert(fp1, 1);
+ qf.insert(fp2, 2);
+ qf.insert(fp1, 1);
+ qf.insert(fp1, 1);
+ qf.insert(fp3, 4);
+
+
+ // remove all five slot-1 entries (31 is the value of fp4)
+ qf.delete(31, 1);
+ qf.delete(fp1, 1);
+ qf.delete(fp1, 1);
+ qf.delete(fp1, 1);
+ qf.delete(fp1, 1);
+
+ // expected: only slots 2 and 4 are populated
+ BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry());
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, false, false, fp2);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, false, false, fp3);
+ assertTrue(QuotientFilterTest.check_equality(qf, result, true));
+ }
+
+ /**
+ * Deletes entries from the middle of a run of adjacent entries.
+ * The test works by:
+ * 1. Attempting eleven inserts into an 8-slot filter, with fingerprints 1..11 spread over
+ *    slots 1, 2, 3, 6 and 7.
+ * 2. Deleting fingerprint 3 (slot 2) and fingerprint 5 (slot 3).
+ * 3. Checking the result against an expected layout that holds exactly six fingerprints
+ *    (1, 2, 4, 6, 7, 8) in slots 1 through 6, with slots 0 and 7 empty.
+ * The expected layout contains none of 9, 10 or 11, so those inserts did not take effect -
+ * presumably because the filter was already full; confirm against QuotientFilter.insert.
+ */
+ @Test
+ static public void Deletions() {
+ int fingerprint_len_bits = 5;
+ int num_entries_power = 3;
+ // number of slots in the filter (2^num_entries_power); sizes the expected BitSet below
+ int num_entries = (int)Math.pow(2, num_entries_power);
+ QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits);
+
+ qf.insert(1, 1);
+ qf.insert(2, 1);
+ qf.insert(3, 2);
+ qf.insert(4, 2);
+ qf.insert(5, 3);
+ qf.insert(6, 3);
+ qf.insert(7, 3);
+ qf.insert(8, 6);
+ qf.insert(9, 6); // these are ignored: 9, 10 and 11 are absent from the expected layout below
+ qf.insert(10, 6);
+ qf.insert(11, 7);
+
+ qf.delete(3, 2);
+ qf.delete(5, 3);
+
+ // expected slot-by-slot contents after the two deletions
+ BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry());
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, true, false, true, 4);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, false, false, true, 6);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, false, true, true, 7);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, true, false, false, 8);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, false, 0);
+
+ assertTrue(QuotientFilterTest.check_equality(qf, result, true));
+ }
+
+ /**
+ * This is a test for deleting items from the QuotientFilter when the filter is completely
+ * full.
+ * The test works by:
+ * 1. Inserting eight fingerprints (1..8) into an 8-slot filter using slots 1 through 5,
+ *    which fills every slot; since no entry targets slot 0, the entries presumably wrap
+ *    around the end of the table into slot 0.
+ * 2. Deleting fingerprint 5 from slot 3.
+ * 3. Checking the result against an expected layout with the seven remaining
+ *    fingerprints in slots 1 through 7 and slot 0 empty.
+ */
+ @Test
+ static public void DeletionsWithWrap() {
+ int fingerprint_len_bits = 5;
+ int num_entries_power = 3;
+ // number of slots in the filter (2^num_entries_power); sizes the expected BitSet below
+ int num_entries = (int)Math.pow(2, num_entries_power);
+ QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits);
+
+ qf.insert(1, 1);
+ qf.insert(2, 1);
+ qf.insert(3, 2);
+ qf.insert(4, 2);
+ qf.insert(5, 3);
+ qf.insert(6, 4);
+ qf.insert(7, 4);
+ qf.insert(8, 5);
+
+ //qf.pretty_print();
+ qf.delete(5, 3);
+ //qf.pretty_print();
+
+ // expected slot-by-slot contents after the deletion
+ BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry());
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, false, false, true, 3);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, true, true, 4);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, true, false, true, 6);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, false, true, true, 7);
+ result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, true, 8);
+ assertTrue(QuotientFilterTest.check_equality(qf, result, true));
+ }
+}
diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java
new file mode 100644
index 000000000..8f708455b
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.*;
+/**
+ * Tests for the parameter-suggestion helpers of QuotientFilterBuilder: argument
+ * validation plus tables of hand-computed expected values.
+ */
+public class QuotientFilterBuilderTest {
+
+  /** Checks rejection of the boundary probabilities 0 and 1 and a table of expected lengths. */
+  @Test
+  public void testSuggestFingerprintLengthFromFPP(){
+    // invalid false positive rate
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(0.));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(1.));
+
+    // manually computed values based on formula using ceil(log2(1/targetFalsePositiveProb))
+    // fpps and results must stay the same length: one expected value per probability
+    double[] fpps = {0.1, 0.01, 0.001, 0.0001, 1E-5, 1E-6, 1E-7, 1E-8, 1E-9};
+    byte[] results = {4, 7, 10, 14, 17, 20, 24, 27, 30};
+    assertEquals(fpps.length, results.length);
+    for (int i = 0; i < fpps.length; i++) {
+      assertEquals(QuotientFilterBuilder.suggestFingerprintLength(fpps[i]), results[i]);
+    }
+  }
+
+  /** Checks both suggestLgNumSlots overloads: invalid item counts and a table of expected values. */
+  @Test
+  public static void testSuggestLgNumSlots(){
+    // invalid number of items
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9f));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L));
+
+    long[] numItems = {1, 100, 1000, 1000000L};
+    int[] results = {1, 7, 11, 21};
+
+    for (int i = 0; i < numItems.length; i++) {
+      long num = numItems[i];
+      // explicit 0.9 load factor
+      byte result = QuotientFilterBuilder.suggestLgNumSlots(num, 0.9f);
+      assertEquals(result, results[i]);
+      // default load factor: the same expected values are asserted for these inputs
+      result = QuotientFilterBuilder.suggestLgNumSlots(num);
+      assertEquals(result, results[i]);
+    }
+  }
+
+  /** Checks suggestMaxNumItemsFromNumSlots: invalid slot exponents and two load factors. */
+  @Test
+  public static void testSuggestMaxNumItems(){
+    // invalid number of slots
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32));
+
+    int[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30};
+    long[] results_ninety_pc = {1, 3, 7, 57, 922, 29504, 30212096, 966787072};
+    long[] results_eighty_pc = {1, 3, 6, 51, 820, 26240, 26869760, 859832320};
+
+    // load capacities arbitrarily chosen using powers of two for exact arithmetic
+    float ninety_pc_appx = 922f / 1024f; // ≈ 0.9
+    float eighty_pc_appx = 820f / 1024f; // ≈ 0.8
+
+    for (int i = 0; i < lgNumSlots.length; i++) {
+      long result_ninety = QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], ninety_pc_appx);
+      long result_eighty = QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], eighty_pc_appx);
+      assertEquals(result_ninety, results_ninety_pc[i]);
+      assertEquals(result_eighty, results_eighty_pc[i]);
+    }
+  }
+
+  /** Checks suggestParamsFromMaxDistinctsFPP: invalid inputs and expected parameter pairs. */
+  @Test
+  public static void testSuggestParamsFromMaxDistinctsFPP(){
+    // invalid inputs: too many distinct items, or an out-of-range false positive probability
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.));
+
+    byte lgNumSlots;
+    byte fingerprintLength;
+    long[] numItems = {1L, 900L, 500_000_000L};
+    double[] fpp = {1E-10, 1E-2, 1e-7};
+
+    // expected outcomes
+    byte[] expected_lgNumSlotsNinety = {1, 10, 30};
+    byte[] expected_lgNumSlotsEighty = {1, 11, 30};
+    byte[] expected_fingerprintLength = {34, 7, 24};
+
+    for (int i = 0; i < numItems.length; i++) {
+      // explicit 90% load; TestNG's assertEquals takes (actual, expected)
+      QuotientFilterBuilder.QFPair pair = QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9f, fpp[i]);
+      lgNumSlots = pair.lgNumSlots;
+      fingerprintLength = pair.fingerprintLength;
+      assertEquals(lgNumSlots, expected_lgNumSlotsNinety[i]);
+      assertEquals(fingerprintLength, expected_fingerprintLength[i]);
+
+      // default load factor overload; the expected values assume the default is 80% - confirm
+      pair = QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]);
+      lgNumSlots = pair.lgNumSlots;
+      fingerprintLength = pair.fingerprintLength;
+      assertEquals(lgNumSlots, expected_lgNumSlotsEighty[i]);
+      assertEquals(fingerprintLength, expected_fingerprintLength[i]);
+    }
+  }
+}
diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
new file mode 100644
index 000000000..9ddf9a28c
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertEquals;
+
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.Random;
+
+
+public class QuotientFilterTest {
+ // Returns whether bit `index` of `fingerprint` is set (moved here from Bitmap; used only by these tests).
+ public static boolean get_fingerprint_bit(long index, long fingerprint) {
+   // 1L keeps the shift in 64-bit arithmetic; an int `1 << index` only uses the low 5 bits of index
+   final long mask = 1L << index;
+   return (fingerprint & mask) != 0;
+ }
+
+ /*
+  * Replays the worked example from the "Algorithm Description" section of
+  * https://en.wikipedia.org/wiki/Quotient_filter and verifies that the filter ends up in the same state.
+  * The keys are inserted in the order b, e, f, c, d, a, hashed into slots as
+  * (b,1), (e,4), (f,7), (c,1), (d,2), (a,1).
+  */
+ @Test
+ public void WikiInsertionTest() {
+   // 3 bits fingerprint => 6 bits per entry, resolved internally in the filter
+   final int fingerprintBits = 3;
+   final int lgSlots = 3;
+   final QuotientFilter qf = new QuotientFilter(lgSlots, fingerprintBits, 1.0f);
+
+   // small integers stand in for the fingerprints of the keys a..f
+   final int A = 1;
+   final int B = 2;
+   final int C = 3;
+   final int D = 4;
+   final int E = 5;
+   final int F = 6;
+
+   // {fingerprint, slot} pairs in the insertion order of the example
+   final int[][] insertions = {{B, 1}, {E, 4}, {F, 7}, {C, 1}, {D, 2}, {A, 1}};
+   for (final int[] insertion : insertions) {
+     qf.insert(insertion[0], insertion[1]);
+   }
+   assertEquals(qf.getNumEntries(), 6);
+
+   // expected metadata per slot, packed by getState as occupied|continuation|shifted
+   final int[] expectedState = {
+     0, 0b100, 0b111, 0b011,
+     0b101, 0b001, 0, 0b100
+   };
+   // expected fingerprint per slot; slots 0 and 6 are expected to read 0
+   final int[] expectedFingerprint = {
+     0, A, B, C,
+     D, E, 0, F
+   };
+
+   // same check order as a slot-by-slot listing: state first, then fingerprint
+   for (int slot = 0; slot < expectedState.length; slot++) {
+     assertEquals(getState(qf, slot), expectedState[slot]);
+     assertEquals(qf.getFingerprint(slot), expectedFingerprint[slot]);
+   }
+ }
+
+ // Packs a slot's metadata flags into one int: bit 2 = occupied, bit 1 = continuation, bit 0 = shifted.
+ public int getState(QuotientFilter filter, int slot) {
+   final int upper = (filter.isOccupied(slot) ? 0b100 : 0) | (filter.isContinuation(slot) ? 0b010 : 0);
+   return upper | (filter.isShifted(slot) ? 0b001 : 0);
+ }
+
+ /*
+  * Reproduces Figure 2 of https://vldb.org/pvldb/vol5/p1627_michaelabender_vldb2012.pdf:
+  * the same insertions are performed and the resulting slot metadata is compared with the figure.
+  */
+ @Test
+ public void PaperInsertionTest() {
+   final int fingerprintBits = 5;
+   final int lgSlots = 4;
+   final int numSlots = 1 << lgSlots;
+   final QuotientFilter qf = new QuotientFilter(lgSlots, fingerprintBits);
+
+   // small integers stand in for the fingerprints of the keys a..h
+   final int A = 1;
+   final int B = 2;
+   final int C = 3;
+   final int D = 4;
+   final int E = 5;
+   final int F = 6;
+   final int G = 7;
+   final int H = 8;
+
+   // {fingerprint, slot}: (a,1), (b,1), (c,3), (d,3), (e,3), (f,4), (g,6), (h,6)
+   final int[][] insertions = {{A, 1}, {B, 1}, {C, 3}, {D, 3}, {E, 3}, {F, 4}, {G, 6}, {H, 6}};
+   for (final int[] insertion : insertions) {
+     qf.insert(insertion[0], insertion[1]);
+   }
+
+   // expected layout; arguments are: slot, is_occupied, is_continuation, is_shifted, fingerprint
+   final int bitsPerEntry = qf.getNumBitsPerEntry();
+   final BitSet expected = new BitSet(numSlots * bitsPerEntry);
+   set_slot_in_test(expected, bitsPerEntry, 0, false, false, false, 0);
+   set_slot_in_test(expected, bitsPerEntry, 1, true, false, false, A);
+   set_slot_in_test(expected, bitsPerEntry, 2, false, true, true, B);
+   set_slot_in_test(expected, bitsPerEntry, 3, true, false, false, C);
+   set_slot_in_test(expected, bitsPerEntry, 4, true, true, true, D);
+   set_slot_in_test(expected, bitsPerEntry, 5, false, true, true, E);
+   set_slot_in_test(expected, bitsPerEntry, 6, true, false, true, F);
+   set_slot_in_test(expected, bitsPerEntry, 7, false, false, true, G);
+   set_slot_in_test(expected, bitsPerEntry, 8, false, true, true, H);
+   // false: only the three metadata bits of each slot are compared, not the fingerprints
+   assertTrue(check_equality(qf, expected, false));
+ }
+
+ // Loads a filter with pseudo-random keys and checks that every successfully inserted key is found again.
+ @Test
+ public void FalseNegativeTest() {
+   final int fingerprintBits = 7;
+   final int lgSlots = 10;
+   final int numKeys = (int) ((1 << lgSlots) * 0.8); // 80% of 2^10
+   final QuotientFilter filter = new QuotientFilter(lgSlots, fingerprintBits);
+   assertTrue(test_no_false_negatives(filter, numKeys));
+ }
+
+
+ /**
+  * Inserts the fingerprint 0x1F at slots 2, 3, 3, 4, 15 and 16 of a filter created with
+  * num_entries_power = 4, then walks the filter with an {@code Iterator}. Each entry the iterator
+  * visits must report the next bucket index of {2, 3, 4, 15}; a mismatch fails the test through
+  * assertEquals. The number of visited entries is not checked.
+  */
+ @Test
+ public void testQuotientFilterInsertionAndIteration() {
+   final int fingerprintBits = 5;
+   final int lgSlots = 4;
+   final QuotientFilter qf = new QuotientFilter(lgSlots, fingerprintBits);
+
+   // slot 3 is used twice; 15 is the last slot in the filter, 16 is outside the bounds
+   final int[] slots = {2, 3, 3, 4, 15, 16};
+   for (final int slot : slots) {
+     qf.insert(0x1F, slot);
+   }
+
+   final Iterator it = new Iterator(qf);
+   final int[] expectedBuckets = {2, 3, 4, 15};
+   int visited = 0;
+   while (it.next()) {
+     assertEquals(it.bucket_index, expectedBuckets[visited]);
+     visited++;
+   }
+ }
+
+ // Inserts fingerprint 0 at slots 1, 4, 7, 1, 2, 1, 15; the iterator must then report buckets 1, 2, 4, 7, 15 in that order.
+ @Test
+ public void testQuotientFilterIterator() {
+   int fingerprint_len_bits = 5;
+   int num_entries_power = 4;
+   QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits);
+   qf.insert(0, 1);
+   qf.insert(0, 4);
+   qf.insert(0, 7);
+   qf.insert(0, 1);
+   qf.insert(0, 2);
+   qf.insert(0, 1);
+   qf.insert(0, 15);
+
+   Iterator it = new Iterator(qf);
+   int[] arr = new int[] {1, 2, 4, 7, 15};
+   int arr_index = 0;
+   // TestNG's assertEquals takes (actual, expected); keeps failure messages consistent with the sibling test
+   while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);}
+ }
+
+
+ // Helper functions
+
+ /**
+  * Writes one slot into an expected-state BitSet. The slot starts at bit {@code bits_per_entry * slot};
+  * its first three bits receive the occupied, continuation and shifted flags, and the remaining
+  * {@code bits_per_entry - 3} bits receive the fingerprint, least-significant bit first.
+  *
+  * @param result the BitSet to modify
+  * @param bits_per_entry number of bits each slot takes up in the BitSet
+  * @param slot index of the slot to write
+  * @param is_occupied value for the slot's occupied flag
+  * @param is_continuation value for the slot's continuation flag
+  * @param is_shifted value for the slot's shifted flag
+  * @param fingerprint fingerprint whose low bits are copied into the slot
+  * @return the same BitSet instance that was passed in
+  */
+ public static BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, long fingerprint) {
+   final int base = bits_per_entry * slot;
+   result.set(base, is_occupied);
+   result.set(base + 1, is_continuation);
+   result.set(base + 2, is_shifted);
+   final int fingerprintBits = bits_per_entry - 3;
+   for (int bit = 0; bit < fingerprintBits; bit++) {
+     result.set(base + 3 + bit, get_fingerprint_bit(bit, fingerprint));
+   }
+   return result;
+ }
+
+ // String overload: character i of `fingerprint` equal to '1' sets bit i; every other character leaves it clear.
+ public static BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, String fingerprint) {
+   long l_fingerprint = 0;
+   for (int i = 0; i < fingerprint.length(); i++) {
+     if (fingerprint.charAt(i) == '1') {
+       l_fingerprint |= 1L << i; // long shift: an int `1 << i` would set the wrong bit for i >= 31
+     }
+   }
+   return set_slot_in_test(result, bits_per_entry, slot, is_occupied, is_continuation, is_shifted, l_fingerprint);
+ }
+
+ // Returns true when the filter and the BitSet agree on every compared bit in [0, bs.size()).
+ // Unless check_also_fingerprints is set, only the first three bits of each entry are compared.
+ public static boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) {
+   for (int bit = 0; bit < bs.size(); bit++) {
+     if ((check_also_fingerprints || bit % qf.getNumBitsPerEntry() < 3) && qf.getBitAtOffset(bit) != bs.get(bit)) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /*
+  * Helper that inserts num_entries pseudo-random ints (fixed seed) and returns false as soon as a
+  * key whose insertion succeeded is not found afterwards. Failed insertions are only reported on stdout.
+  */
+ public static boolean test_no_false_negatives(QuotientFilter filter, int num_entries) {
+   // typed set: iterating a raw HashSet as Integer does not compile
+   final HashSet<Integer> added = new HashSet<>();
+   final int seed = 5;
+   final Random rand = new Random(seed);
+
+   for (int i = 0; i < num_entries; i++) {
+     final int rand_num = rand.nextInt();
+     if (filter.insert(rand_num)) {
+       added.add(rand_num);
+     } else {
+       System.out.println("insertion failed");
+     }
+   }
+
+   // every key the filter accepted must be reported as present
+   for (final Integer key : added) {
+     if (!filter.search((long) key)) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // 30 sequential keys in a filter built with (5, 9): exactly one expansion, no lost keys, at most one false positive.
+ @Test
+ public void smallExpansion() {
+   final int numKeys = 30;
+   final QuotientFilter qf = new QuotientFilter(5, 9);
+   for (int key = 0; key < numKeys; key++) { qf.insert(key); }
+   System.out.println(qf.toString());
+   assertEquals(qf.getNumExpansions(), 1);
+   assertEquals(qf.getNumEntries(), numKeys);
+   // query the same keys: all of them must be found
+   int hits = 0;
+   for (int key = 0; key < numKeys; key++) { hits += qf.search(key) ? 1 : 0; }
+   assertEquals(hits, numKeys);
+
+   // query novel keys: anything found here is a false positive
+   int falsePositives = 0;
+   for (int key = 0; key < numKeys; key++) { falsePositives += qf.search(key + numKeys) ? 1 : 0; }
+   assertTrue(falsePositives < 2);
+ }
+
+ // 60000 sequential keys in a filter built with (16, 13): one expansion, all keys found, fewer than 6 false positives.
+ @Test
+ public void expansion() {
+   final int numKeys = 60000;
+   final QuotientFilter qf = new QuotientFilter(16, 13);
+   for (int key = 0; key < numKeys; key++) { qf.insert(key); }
+   assertEquals(qf.getNumExpansions(), 1);
+   assertTrue(qf.getNumEntries() > numKeys * 0.99); // allow a few hash collisions
+
+   // query the same keys: all of them must be found
+   int hits = 0;
+   for (int key = 0; key < numKeys; key++) { hits += qf.search(key) ? 1 : 0; }
+   assertEquals(hits, numKeys);
+
+   // query novel keys: anything found here is a false positive
+   int falsePositives = 0;
+   for (int key = 0; key < numKeys; key++) { falsePositives += qf.search(key + numKeys) ? 1 : 0; }
+   assertTrue(falsePositives < 6);
+ }
+
+ // Merging an empty filter into another empty one must leave lgQ, fingerprint length and entry count as constructed.
+ @Test
+ public void mergeEmpty() {
+   final QuotientFilter target = new QuotientFilter(4, 3);
+   target.merge(new QuotientFilter(4, 3));
+
+   assertEquals(target.getLgQ(), 4);
+   assertEquals(target.getFingerprintLength(), 3);
+   assertEquals(target.getNumEntries(), 0);
+ }
+
+ // Splits keys [0, n) across two identically configured filters, merges them and checks the combined content.
+ @Test
+ public void merge() {
+   final int numKeys = 50000;
+   final QuotientFilter left = new QuotientFilter(16, 13);
+   final QuotientFilter right = new QuotientFilter(16, 13);
+   for (int low = 0, high = numKeys / 2; low < numKeys / 2; low++, high++) {
+     left.insert(low);
+     right.insert(high);
+   }
+   left.merge(right);
+   assertEquals(left.getNumExpansions(), 0);
+   assertTrue(left.getNumEntries() > numKeys * 0.99); // allow a few hash collisions
+
+   // query the same keys: all of them must be found in the merged filter
+   int hits = 0;
+   for (int key = 0; key < numKeys; key++) { hits += left.search(key) ? 1 : 0; }
+   assertEquals(hits, numKeys);
+
+   // query novel keys: anything found here is a false positive
+   int falsePositives = 0;
+   for (int key = 0; key < numKeys; key++) { falsePositives += left.search(key + numKeys) ? 1 : 0; }
+   assertTrue(falsePositives < 4);
+ }
+
+ @Test
+ public void mergeDifferentConfiguration() {
+   final QuotientFilter narrow = new QuotientFilter(3, 4); // lgQ = 3, fingerprint length = 4
+   final QuotientFilter wide = new QuotientFilter(4, 3); // lgQ = 4, fingerprint length = 3
+   narrow.insert(4);
+   wide.insert(4);
+   narrow.merge(wide);
+   assertEquals(narrow.getNumEntries(), 1); // expected: the key inserted into both filters is a single entry
+ }
+
+ @Test(expectedExceptions = SketchesArgumentException.class)
+ public void mergeIncompatible() {
+   // same lgQ (4) but fingerprint lengths 4 and 3: the merge is expected to throw
+   final QuotientFilter target = new QuotientFilter(4, 4);
+   target.merge(new QuotientFilter(4, 3));
+ }
+
+}