diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java index bfb1ad25d..53dfd1c4b 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -99,7 +99,7 @@ long get_hash(long input) { return XxHash.hashLong(input, 0L) ; // CD edit for datasketches hash function using same seed. } - public long get_space_use() { return 0 ; } + public long getSpaceUse() { return 0 ; } // public int get_bits_per_entry() { return 0 ; } // // public abstract long get_num_entries(boolean include_all_internal_filters); diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java index 6b76a1789..e04e6cd12 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -49,22 +49,22 @@ void clear() { boolean next() { - if (index == qf.get_num_slots()) { + if (index == qf.getNumSlots()) { return false; } - long slot = qf.get_slot(index); + long slot = qf.getSlot(index); boolean occupied = (slot & 1) != 0; boolean continuation = (slot & 2) != 0; boolean shifted = (slot & 4) != 0; - while (!occupied && !continuation && !shifted && index < qf.get_num_slots()) { + while (!occupied && !continuation && !shifted && index < qf.getNumSlots()) { index++; - if (index == qf.get_num_slots()) { + if (index == qf.getNumSlots()) { return false; } - slot = qf.get_slot(index); + slot = qf.getSlot(index); occupied = (slot & 1) != 0; continuation = (slot & 2) != 0; shifted = (slot & 4) != 0; diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index d90f243c6..bd8c2abd9 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -28,756 +28,613 @@ public class QuotientFilter extends Filter { - int bitPerEntry; - int fingerprintLength; - int power_of_two_size; - int num_entries; - BitArray filter; - - double expansion_threshold; - long max_entries_before_expansion; - boolean expand_autonomously; - boolean is_full; - - // statistics, computed in the compute_statistics method. method should be called before these are used - long num_runs; - long num_clusters; - public double avg_run_length; - public double avg_cluster_length; - - int original_fingerprint_size; - int num_expansions; - - - public QuotientFilter(int power_of_two, int bits_per_entry) { - power_of_two_size = power_of_two; - bitPerEntry = bits_per_entry; - fingerprintLength = bits_per_entry - 3; - final long init_size = 1L << power_of_two; - filter = make_filter(init_size, bits_per_entry); - - expansion_threshold = 0.8; - max_entries_before_expansion = (int) (init_size * expansion_threshold); - expand_autonomously = false; - is_full = false; - - original_fingerprint_size = fingerprintLength; - num_expansions = 0; - //hash_type = XxHash.hashLong ; //HashType.xxh; - } - - public boolean rejuvenate(long key) { - return false; - } - - public long get_num_entries() { - return num_entries; - } - - public long get_max_entries_before_expansion() { - return max_entries_before_expansion; - } - - public boolean expand_autonomously() { - return expand_autonomously; - } - - public void set_expand_autonomously(boolean val) { - expand_autonomously = val; - } - - BitArray make_filter(long init_size, int bits_per_entry) { -// System.out.println(init_size ) ; -// System.out.println(num_extension_slots); -// System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); - //return new QuickBitVectorWrapper(bits_per_entry, init_size); - return new HeapBitArray(init_size * bits_per_entry); - } - - public int get_fingerprint_length() { - return fingerprintLength; - } - - QuotientFilter(int power_of_two, int bits_per_entry, BitArray bitmap) { - power_of_two_size = power_of_two; - bitPerEntry = bits_per_entry; - fingerprintLength = bits_per_entry - 3; - filter = bitmap; - } - - boolean expand() { - is_full = true; - return false; - } - - // measures the number of bits per entry for the filter - public double measure_num_bits_per_entry() { - return measure_num_bits_per_entry(this, new ArrayList()); - } - - // measures the number of bits per entry for the filter - // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects - protected static double measure_num_bits_per_entry(QuotientFilter current, ArrayList other_filters) { - //System.out.println("--------------------------"); - //current.print_filter_summary(); - //System.out.println(); - double num_entries = current.get_num_entries(); - for (QuotientFilter q : other_filters) { - //q.print_filter_summary(); - //System.out.println(); - num_entries += q.get_num_entries(); - } - long init_size = 1L << current.power_of_two_size; - long num_bits = current.bitPerEntry * init_size; - for (QuotientFilter q : other_filters) { - init_size = 1L << q.power_of_two_size; - num_bits += q.bitPerEntry * init_size; - } - //System.out.println("total entries: \t\t" + num_entries); - //System.out.println("total bits: \t\t" + num_bits); - final double bits_per_entry = num_bits / num_entries; - //System.out.println("total bits/entry: \t" + bits_per_entry); - //System.out.println(); - return bits_per_entry; - } - - // returns the fraction of occupied slots in the filter - public double get_utilization() { - long num_logical_slots = 1L << power_of_two_size; - double util = num_entries / (double) num_logical_slots; - return util; - } - - // returns the number of slots in the filter without the extension/buffer slots - public long get_num_slots() { - return 1L << power_of_two_size; - } - - long getMask() { - return get_num_slots() - 1; - } - - // sets the metadata flag bits for a given slot index - void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, - long index) { - set_occupied(index, is_occupied); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - } - - // sets the fingerprint for a given slot index - void set_fingerprint(long index, long fingerprint) { - filter.setBits(index * bitPerEntry + 3, fingerprintLength, fingerprint); - } - - // print a nice representation of the filter that can be understood. - // if vertical is on, each line will represent a slot - public String get_pretty_str(boolean vertical) { - StringBuffer sbr = new StringBuffer(); - - long numBits = get_num_slots() * bitPerEntry; - - for (long i = 0; i < numBits; i++) { - long remainder = i % bitPerEntry; - if (remainder == 0) { - long slot_num = i/bitPerEntry; - sbr.append(" "); - if (vertical) { - sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); - } - } - if (remainder == 3) { - sbr.append(" "); - } - sbr.append(filter.getBit(i) ? "1" : "0"); - } - sbr.append("\n"); - return sbr.toString(); - } - - // print a nice representation of the filter that can be humanly read. - public void pretty_print() { - System.out.print(get_pretty_str(true)); - } - - // return a fingerprint in a given slot index - long get_fingerprint(long index) { - return filter.getBits(index * bitPerEntry + 3, fingerprintLength); - } - - // return an entire slot representation, including metadata flags and fingerprint - long get_slot(long index) { - return filter.getBits(index * bitPerEntry, bitPerEntry); - } - - // compare a fingerprint input to the fingerprint in some slot index - protected boolean compare(long index, long fingerprint) { - return get_fingerprint(index) == fingerprint; - } - - // modify the flags and fingerprint of a given slot - void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, - long index, long fingerprint) { - modify_slot(is_occupied, is_continuation, is_shifted, index); - set_fingerprint(index, fingerprint); - } - - // summarize some statistical measures about the filter - public void print_filter_summary() { - long slots = get_num_slots(); - long num_bits = slots * bitPerEntry; - System.out.println("slots:\t" + slots); - System.out.println("entries:\t" + num_entries); - System.out.println("bits\t:" + num_bits); - System.out.println("bits/entry\t:" + num_bits / (double)num_entries); - System.out.println("FP length:\t" + fingerprintLength); - System.out.println("Is full?\t" + is_full); - double capacity = num_entries / (double)(slots) ; - System.out.println("Capacity\t" + capacity); - compute_statistics(); - //System.out.println("num runs: \t\t" + num_runs); - //System.out.println("avg run length: \t" + avg_run_length); - //System.out.println("num clusters: \t\t" + num_clusters); - //System.out.println("avg cluster length: \t" + avg_cluster_length); - } - - /* - Returns the number of bits used for the filter - */ - @Override - public long get_space_use() { - long num_bits = get_num_slots() * bitPerEntry; - return num_bits; - } - - public int get_bits_per_entry() { - return bitPerEntry; - } - - boolean is_occupied(long index) { - return filter.getBit(index * bitPerEntry); - } - - boolean is_continuation(long index) { - return filter.getBit(index * bitPerEntry + 1); - } - - boolean is_shifted(long index) { - return filter.getBit(index * bitPerEntry + 2); - } - - void set_occupied(long index, boolean val) { - filter.assignBit(index * bitPerEntry, val); - } - - void set_continuation(long index, boolean val) { - filter.assignBit(index * bitPerEntry + 1, val); - } - - void set_shifted(long index, boolean val) { - filter.assignBit(index * bitPerEntry + 2, val); - } - - boolean is_slot_empty(long index) { - return !is_occupied(index) && !is_continuation(index) && !is_shifted(index); - } - - // scan the cluster leftwards until finding the start of the cluster and returning its slot index - // used by deletes - long find_cluster_start(long index) { - long current_index = index; - while (is_shifted(current_index)) { - current_index = (current_index - 1) & getMask(); - } - return current_index; - } - - // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides - // since the run might have been shifted to the right due to collisions - long find_run_start(long index) { - int num_runs_to_skip = 0; - while (is_shifted(index)) { - index = (index - 1) & getMask(); - if (is_occupied(index)) { - num_runs_to_skip++; + int bitPerEntry_; + int fingerprintLength_; + int powerOfTwoSize_; + int numEntries_; + BitArray bitArray_; + + double expansionThreshold_; + long maxEntriesBeforeExpansion_; + boolean expandAutonomously_; + boolean isFull_; + + // statistics, computed in the compute_statistics method. method should be called before these are used + long numRuns_; + long numClusters_; + public double avgRunLength_; + public double avgClusterLength_; + + int originalFingerprintSize_; + int numExpansions_; + + public QuotientFilter(final int powerOfTwo, final int bitsPerEntry) { + powerOfTwoSize_ = powerOfTwo; + bitPerEntry_ = bitsPerEntry; + fingerprintLength_ = bitsPerEntry - 3; + final long init_size = 1L << powerOfTwo; + bitArray_ = makeFilter(init_size, bitsPerEntry); + + expansionThreshold_ = 0.8; + maxEntriesBeforeExpansion_ = (int) (init_size * expansionThreshold_); + expandAutonomously_ = false; + isFull_ = false; + + originalFingerprintSize_ = fingerprintLength_; + numExpansions_ = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; + } + + public boolean rejuvenate(final long key) { + return false; + } + + public long getNumEntries() { + return numEntries_; + } + + public long getMaxEntriesBeforeExpansion() { + return maxEntriesBeforeExpansion_; + } + + public boolean expandAutonomously() { + return expandAutonomously_; + } + + public void setExpandAutonomously(final boolean val) { + expandAutonomously_ = val; + } + + BitArray makeFilter(final long initSize, final int bitsPerEntry) { + return new HeapBitArray(initSize * bitsPerEntry); + } + + public int getFingerprintLength() { + return fingerprintLength_; + } + + QuotientFilter(final int powerOfTwo, final int bitsPerEntry, final BitArray bitArray) { + powerOfTwoSize_ = powerOfTwo; + bitPerEntry_ = bitsPerEntry; + fingerprintLength_ = bitsPerEntry - 3; + bitArray_ = bitArray; + } + + boolean expand() { + isFull_ = true; + return false; + } + + // measures the number of bits per entry for the filter + public double measureNumBitsPerEntry() { + return measureNumBitsPerEntry(this, new ArrayList()); + } + + // measures the number of bits per entry for the filter + // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects + protected static double measureNumBitsPerEntry(final QuotientFilter current, final ArrayList otherFilters) { + //System.out.println("--------------------------"); + //current.print_filter_summary(); + //System.out.println(); + double numEntries = current.getNumEntries(); + for (QuotientFilter q : otherFilters) { + //q.print_filter_summary(); + //System.out.println(); + numEntries += q.getNumEntries(); + } + long init_size = 1L << current.powerOfTwoSize_; + long numBits = current.bitPerEntry_ * init_size; + for (QuotientFilter q : otherFilters) { + init_size = 1L << q.powerOfTwoSize_; + numBits += q.bitPerEntry_ * init_size; + } + //System.out.println("total entries: \t\t" + num_entries); + //System.out.println("total bits: \t\t" + num_bits); + final double bits_per_entry = numBits / numEntries; + //System.out.println("total bits/entry: \t" + bits_per_entry); + //System.out.println(); + return bits_per_entry; + } + + // returns the fraction of occupied slots in the filter + public double getUtilization() { + return numEntries_ / (double) getNumSlots(); + } + + // returns the number of slots in the filter without the extension/buffer slots + public long getNumSlots() { + return 1L << powerOfTwoSize_; + } + + long getMask() { + return getNumSlots() - 1; + } + + // sets the metadata flag bits for a given slot index + void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, final long index) { + setOccupied(index, isOccupied); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + } + + // sets the fingerprint for a given slot index + void setFingerprint(final long index, final long fingerprint) { + bitArray_.setBits(index * bitPerEntry_ + 3, fingerprintLength_, fingerprint); + } + + // print a nice representation of the filter that can be understood. + // if vertical is on, each line will represent a slot + public String get_pretty_str(final boolean vertical) { + final StringBuffer sbr = new StringBuffer(); + final long numBits = getNumSlots() * bitPerEntry_; + for (long i = 0; i < numBits; i++) { + final long remainder = i % bitPerEntry_; + if (remainder == 0) { + final long slot = i / bitPerEntry_; + sbr.append(" "); + if (vertical) { + sbr.append("\n" + String.format("%-10d", slot) + "\t"); } } - while (num_runs_to_skip > 0) { - index = (index + 1) & getMask(); - if (!is_continuation(index)) { - num_runs_to_skip--; - } + if (remainder == 3) { + sbr.append(" "); + } + sbr.append(bitArray_.getBit(i) ? "1" : "0"); + } + sbr.append("\n"); + return sbr.toString(); + } + + // print a representation of the filter that can be humanly read. + public void prettyPrint() { + System.out.print(get_pretty_str(true)); + } + + // return a fingerprint in a given slot index + long getFingerprint(final long index) { + return bitArray_.getBits(index * bitPerEntry_ + 3, fingerprintLength_); + } + + // return an entire slot representation, including metadata flags and fingerprint + long getSlot(final long index) { + return bitArray_.getBits(index * bitPerEntry_, bitPerEntry_); + } + + // compare a fingerprint input to the fingerprint in some slot index + protected boolean compare(final long index, final long fingerprint) { + return getFingerprint(index) == fingerprint; + } + + // modify the flags and fingerprint of a given slot + void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, + final long index, final long fingerprint) { + modifySlot(isOccupied, isContinuation, isShifted, index); + setFingerprint(index, fingerprint); + } + + // summarize some statistical measures about the filter + public void printFilterSummary() { + final long slots = getNumSlots(); + final long num_bits = slots * bitPerEntry_; + System.out.println("slots:\t" + slots); + System.out.println("entries:\t" + numEntries_); + System.out.println("bits\t:" + num_bits); + System.out.println("bits/entry\t:" + num_bits / (double)numEntries_); + System.out.println("FP length:\t" + fingerprintLength_); + System.out.println("Is full?\t" + isFull_); + final double capacity = numEntries_ / (double)(slots) ; + System.out.println("Capacity\t" + capacity); + computeStatistics(); + //System.out.println("num runs: \t\t" + num_runs); + //System.out.println("avg run length: \t" + avg_run_length); + //System.out.println("num clusters: \t\t" + num_clusters); + //System.out.println("avg cluster length: \t" + avg_cluster_length); + } + + /* + * Returns the number of bits used for the filter + */ + @Override + public long getSpaceUse() { + return getNumSlots() * bitPerEntry_; + } + + public int getBitsPerEntry() { + return bitPerEntry_; + } + + boolean isOccupied(final long index) { + return bitArray_.getBit(index * bitPerEntry_); + } + + boolean isContinuation(final long index) { + return bitArray_.getBit(index * bitPerEntry_ + 1); + } + + boolean isShifted(final long index) { + return bitArray_.getBit(index * bitPerEntry_ + 2); + } + + void setOccupied(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_, val); + } + + void setContinuation(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_ + 1, val); + } + + void setShifted(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_ + 2, val); + } + + boolean isSlotEmpty(final long index) { + return !isOccupied(index) && !isContinuation(index) && !isShifted(index); + } + + // scan the cluster leftwards until finding the start of the cluster and returning its slot index + // used by deletes + long findClusterStart(long index) { + while (isShifted(index)) { + index = (index - 1) & getMask(); + } + return index; + } + + // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides + // since the run might have been shifted to the right due to collisions + long findRunStart(long index) { + int numRunsToSkip = 0; + while (isShifted(index)) { + index = (index - 1) & getMask(); + if (isOccupied(index)) { + numRunsToSkip++; } - return index; - } - - // given the start of a run, scan the run and return the index of the first matching fingerprint - // if not found returns the insertion position as bitwise complement to make it negative - long find_first_fingerprint_in_run(long index, long fingerprint) { - assert(!is_continuation(index)); - do { - final long fingerprintAtIndex = get_fingerprint(index); - if (fingerprintAtIndex == fingerprint) { - return index; - } else if (fingerprintAtIndex > fingerprint) { - return ~index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return ~index; } - - // delete the last matching fingerprint in the run - long decide_which_fingerprint_to_delete(long index, long fingerprint) { - assert(!is_continuation(index)); - long matching_fingerprint_index = -1; - do { - if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); - matching_fingerprint_index = index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return matching_fingerprint_index; + while (numRunsToSkip > 0) { + index = (index + 1) & getMask(); + if (!isContinuation(index)) { + numRunsToSkip--; + } } + return index; + } - // given the start of a run, find the last slot index that still belongs to this run - long find_run_end(long index) { - while (is_continuation((index + 1) & getMask())) { - index = (index + 1) & getMask(); - } + // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative + long findFirstFingerprintInRun(long index, final long fingerprint) { + assert(!isContinuation(index)); + do { + final long fingerprintAtIndex = getFingerprint(index); + if (fingerprintAtIndex == fingerprint) { return index; - } - - // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it - boolean search(long fingerprint, long index) { - boolean does_run_exist = is_occupied(index); - if (!does_run_exist) { - return false; - } - long run_start_index = find_run_start(index); - long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); - return found_index >= 0; - } - - // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. - // This method is only used for testing purposes. - Set get_all_fingerprints(long bucket_index) { - boolean does_run_exist = is_occupied(bucket_index); - HashSet set = new HashSet(); - if (!does_run_exist) { - return set; - } - long run_index = find_run_start(bucket_index); - do { - set.add(get_fingerprint(run_index)); - run_index = (run_index + 1) & getMask(); - } while (is_continuation(run_index)); - return set; - } - - boolean insert(long fingerprint, long index) { - if (index >= get_num_slots() || num_entries == get_num_slots()) { - return false; - } - final long run_start = find_run_start(index); - if (!is_occupied(index)) { - return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true); + } else if (fingerprintAtIndex > fingerprint) { + return ~index; } - final long found_index = find_first_fingerprint_in_run(run_start, fingerprint); - if (found_index >= 0) { - return false; + index = (index + 1) & getMask(); + } while (isContinuation(index)); + return ~index; + } + + // delete the last matching fingerprint in the run + long decideWhichFingerprintToDelete(long index, final long fingerprint) { + assert(!isContinuation(index)); + long matchingFingerprintIndex = -1; + do { + if (compare(index, fingerprint)) { + //System.out.println("found matching FP at index " + index); + matchingFingerprintIndex = index; } - return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start); - } - - boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { - // in the first shifted entry set is_continuation flag if inserting at the start of the existing run - // otherwise just shift the existing flag as it is - boolean force_continuation = !is_new_run && is_run_start; - - // prepare flags for the current slot - boolean is_continuation = !is_run_start; - boolean is_shifted = index != canonical; - - // remember the existing entry from the current slot to be shifted to the next slot - // is_occupied flag belongs to the slot, therefore it is never shifted - // is_shifted flag is always true for all shifted entries, no need to remember it - long existing_fingerprint = get_fingerprint(index); - boolean existing_is_continuation = is_continuation(index); - boolean existing_is_empty = is_slot_empty(index); - - while (!existing_is_empty) { - // set the current slot - set_fingerprint(index, fingerprint); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - - // prepare values for the next slot - fingerprint = existing_fingerprint; - is_continuation = existing_is_continuation | force_continuation; - is_shifted = true; - - index = (index + 1) & getMask(); - - // save the existing entry to be shifted - existing_fingerprint = get_fingerprint(index); - existing_is_continuation = is_continuation(index); - existing_is_empty = is_slot_empty(index); - - force_continuation = false; // this is needed for the first shift only + index = (index + 1) & getMask(); + } while (isContinuation(index)); + return matchingFingerprintIndex; + } + + // given the start of a run, find the last slot index that still belongs to this run + long findRunEnd(long index) { + while (isContinuation((index + 1) & getMask())) { + index = (index + 1) & getMask(); + } + return index; + } + + // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it + boolean search(long fingerprint, long index) { + final boolean doesRunExist = isOccupied(index); + if (!doesRunExist) { + return false; + } + final long runStartIndex = findRunStart(index); + final long foundIndex = findFirstFingerprintInRun(runStartIndex, fingerprint); + return foundIndex >= 0; + } + + // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. + // This method is only used for testing purposes. + Set getAllFingerprints(final long bucketIndex) { + final boolean doesRunExist = isOccupied(bucketIndex); + final HashSet set = new HashSet(); + if (!doesRunExist) { + return set; + } + long runIndex = findRunStart(bucketIndex); + do { + set.add(getFingerprint(runIndex)); + runIndex = (runIndex + 1) & getMask(); + } while (isContinuation(runIndex)); + return set; + } + + boolean insert(final long fingerprint, final long index) { + if (index >= getNumSlots() || numEntries_ == getNumSlots()) { + return false; + } + final long run_start = findRunStart(index); + if (!isOccupied(index)) { + return insertFingerprintAndPushAllElse(fingerprint, run_start, index, true, true); + } + final long found_index = findFirstFingerprintInRun(run_start, fingerprint); + if (found_index >= 0) { + return false; + } + return insertFingerprintAndPushAllElse(fingerprint, ~found_index, index, false, ~found_index == run_start); + } + + boolean insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, + final boolean isNewRun, final boolean isRunStart) { + // in the first shifted entry set isContinuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean forceContinuation = !isNewRun && isRunStart; + + // prepare flags for the current slot + boolean isContinuation = !isRunStart; + boolean isShifted = index != canonical; + + // remember the existing entry from the current slot to be shifted to the next slot + // isOccupied flag belongs to the slot, therefore it is never shifted + // isShifted flag is always true for all shifted entries, no need to remember it + long existingFingerprint = getFingerprint(index); + boolean existingIsContinuation = isContinuation(index); + + while (!isSlotEmpty(index)) { + // set the current slot + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + // prepare values for the next slot + fingerprint = existingFingerprint; + isContinuation = existingIsContinuation | forceContinuation; + isShifted = true; + + index = (index + 1) & getMask(); + + // remember the existing entry to be shifted + existingFingerprint = getFingerprint(index); + existingIsContinuation = isContinuation(index); + + forceContinuation = false; // this is needed for the first shift only + } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + if (isNewRun) { + setOccupied(canonical, true); + } + numEntries_++; + return true; + } + + boolean delete(final long fingerprint, final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { + long runEnd = findRunEnd(matchingFingerprintIndex); + + // the run has only one entry, we need to disable its is_occupied flag + // we just remember we need to do this here, and we do it later to not interfere with counts + boolean turnOffOccupied = runStartIndex == runEnd; + + // First thing to do is move everything else in the run back by one slot + for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getMask()) { + long f = getFingerprint((i + 1) & getMask()); + setFingerprint(i, f); + } + + // for each slot, we want to know by how much the entry there is shifted + // we can do this by counting the number of continuation flags set to true + // and the number of occupied flags set to false from the start of the cluster to the given cell + // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted + long clusterStart = findClusterStart(canonicalSlot); + long numShiftedCount = 0; + long numNonOccupied = 0; + for (long i = clusterStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + if (isContinuation(i)) { + numShiftedCount++; } - // at this point the current slot is empty, so just populate with prepared values - // either the incoming fingerprint or the last shifted one - set_fingerprint(index, fingerprint); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - - if (is_new_run) { - set_occupied(canonical, true); + if (!isOccupied(i)) { + numNonOccupied++; } - num_entries++; - return true; } - boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - long run_end = find_run_end(matching_fingerprint_index); - - // the run has only one entry, we need to disable its is_occupied flag - // we just remember we need to do this here, and we do it later to not interfere with counts - boolean turn_off_occupied = run_start_index == run_end; - - // First thing to do is move everything else in the run back by one slot - for (long i = matching_fingerprint_index; i != run_end; i = (i + 1) & getMask()) { - long f = get_fingerprint((i + 1) & getMask()); - set_fingerprint(i, f); + setFingerprint(runEnd, 0); + setShifted(runEnd, false); + setContinuation(runEnd, false); + + // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. + // the inner for loop iterates over cells of particular runs, pushing entries one slot back. + do { + // we first check if the next run actually exists and if it is shifted. + // only if both conditions hold, we need to shift it back one slot. + //boolean does_next_run_exist = !is_slot_empty(run_end + 1); + //boolean is_next_run_shifted = is_shifted(run_end + 1); + //if (!does_next_run_exist || !is_next_run_shifted) { + if (isSlotEmpty((runEnd + 1) & getMask()) || !isShifted((runEnd + 1) & getMask())) { + if (turnOffOccupied) { + // if we eliminated a run and now need to turn the isOccupied flag off, we do it at the end to not interfere in our counts + setOccupied(canonicalSlot, false); } + return true; + } - // for each slot, we want to know by how much the entry there is shifted - // we can do this by counting the number of continuation flags set to true - // and the number of occupied flags set to false from the start of the cluster to the given cell - // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted - long cluster_start = find_cluster_start(canonical_slot); - long num_shifted_count = 0; - long num_non_occupied = 0; - for (long i = cluster_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { - if (is_continuation(i)) { - num_shifted_count++; - } - if (!is_occupied(i)) { - num_non_occupied++; - } - } + // we now find the start and end of the next run + final long nextRunStart = (runEnd + 1) & getMask(); + runEnd = findRunEnd(nextRunStart); - set_fingerprint(run_end, 0); - set_shifted(run_end, false); - set_continuation(run_end, false); - - // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. - // the inner for loop iterates over cells of particular runs, pushing entries one slot back. - do { - // we first check if the next run actually exists and if it is shifted. - // only if both conditions hold, we need to shift it back one slot. - //boolean does_next_run_exist = !is_slot_empty(run_end + 1); - //boolean is_next_run_shifted = is_shifted(run_end + 1); - //if (!does_next_run_exist || !is_next_run_shifted) { - if (is_slot_empty((run_end + 1) & getMask()) || !is_shifted((run_end + 1) & getMask())) { - if (turn_off_occupied) { - // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts - set_occupied(canonical_slot, false); - } - return true; - } - - // we now find the start and end of the next run - final long next_run_start = (run_end + 1) & getMask(); - run_end = find_run_end(next_run_start); - - // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot - // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if (is_occupied((next_run_start - 1) & getMask()) && num_shifted_count - num_non_occupied == 1) { - set_shifted((next_run_start - 1) & getMask(), false); - } - else { - set_shifted((next_run_start - 1) & getMask(), true); - } - - for (long i = next_run_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { - long f = get_fingerprint(i); - set_fingerprint((i - 1) & getMask(), f); - if (is_continuation(i)) { - set_continuation((i - 1) & getMask(), true); - } - if (!is_occupied(i)) { - num_non_occupied++; - } - if (i != next_run_start) { - num_shifted_count++; - } - } - set_fingerprint(run_end, 0); - set_shifted(run_end, false); - set_continuation(run_end, false); - } while (true); - } + // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot + // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place + if (isOccupied((nextRunStart - 1) & getMask()) && numShiftedCount - numNonOccupied == 1) { + setShifted((nextRunStart - 1) & getMask(), false); + } else { + setShifted((nextRunStart - 1) & getMask(), true); + } - boolean delete(long fingerprint, long canonical_slot) { - // if the run doesn't exist, the key can't have possibly been inserted - boolean does_run_exist = is_occupied(canonical_slot); - if (!does_run_exist) { - return false; + for (long i = nextRunStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + long f = getFingerprint(i); + setFingerprint((i - 1) & getMask(), f); + if (isContinuation(i)) { + setContinuation((i - 1) & getMask(), true); } - long run_start_index = find_run_start(canonical_slot); - - long matching_fingerprint_index = decide_which_fingerprint_to_delete(run_start_index, fingerprint); - - if (matching_fingerprint_index == -1) { - // we didn't find a matching fingerprint - return false; + if (!isOccupied(i)) { + numNonOccupied++; } - - return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); - } - - /* - Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index - */ - long get_slot_index(long large_hash) { - long slot_index_mask = (1L << power_of_two_size) - 1; - long slot_index = large_hash & slot_index_mask; - //System.out.format("\n**get_slot_index(): [total_hash:index_hash:int_index] --> [%016x:%016x:%016x]\n", large_hash, (int)large_hash, slot_index); - return slot_index; - } - - long gen_fingerprint(long large_hash) { - long fingerprint_mask = (1L << fingerprintLength) - 1L; - fingerprint_mask = fingerprint_mask << power_of_two_size; - long fingerprint = (large_hash & fingerprint_mask) >> power_of_two_size; - //System.out.format("\n**gen_fingerprint(): [total_hash:fingerprint_hash:int_fingerprint] --> [%016x:%016x:%016x]\n", large_hash, ((int)(large_hash>>32)), fingerprint); - return fingerprint; - } - -// void print_key(int input) { -// long large_hash = HashFunctions.normal_hash(input); -// long slot_index = get_slot_index(large_hash); -// long fingerprint = gen_fingerprint(large_hash); -// -// System.out.println("num : " + input); -// System.out.print("hash : "); -// print_long_in_binary(large_hash, fingerprintLength + power_of_two_size); -// //print_int_in_binary(slot_index_mask, 31); -// System.out.print("bucket: "); -// print_long_in_binary(slot_index, power_of_two_size); -// System.out.print("FP : "); -// //print_int_in_binary(fingerprint_mask, 31); -// print_long_in_binary(fingerprint, fingerprintLength); -// System.out.println(); -// -// } -// -// void set_expansion_threshold(double thresh) { -// expansion_threshold = thresh; -// max_entries_before_expansion = (long)(Math.pow(2, power_of_two_size) * expansion_threshold); -// } -// - /* - This is the main insertion function accessed externally. - It calls the underlying filter _insert function which hashes the input - item internally. - Hence, the `large_hash` argument is already a hash key that has been generated - by the hashing library (eg xxhash). - */ - protected boolean _insert(long large_hash) { - //System.out.println("Inserting long hash " + large_hash); - if (is_full) { - return false; + if (i != nextRunStart) { + numShiftedCount++; } - long slot_index = get_slot_index(large_hash); - long fingerprint = gen_fingerprint(large_hash); - - /*print_long_in_binary(large_hash, 64); - print_long_in_binary(slot_index, 32); - print_long_in_binary((int)fingerprint, 64); - System.out.println(slot_index + " " + fingerprint ); - System.out.println(); */ - - boolean success = insert(fingerprint, slot_index); - /*if (!success) { - System.out.println("insertion failure"); - System.out.println(input + "\t" + slot_index + "\t" + get_fingerprint_str(fingerprint, fingerprintLength)); - pretty_print(); - System.exit(1); - }*/ - -// if (expand_autonomously && num_entries >= max_entries_before_expansion) { -// boolean expanded = expand(); -// if (expanded) { -// num_expansions++; -// } -// } - return success; - } -// -// protected boolean _delete(long large_hash) { -// long slot_index = get_slot_index(large_hash); -// long fp_long = gen_fingerprint(large_hash); -// boolean success = delete(fp_long, slot_index); -// if (success) { -// num_entries--; -// } -// return success; -// } -// - protected boolean _search(long large_hash) { - long slot_index = get_slot_index(large_hash); - long fingerprint = gen_fingerprint(large_hash); - return search(fingerprint, slot_index); - } - - public boolean get_bit_at_offset(int offset) { - return filter.getBit(offset); + } + setFingerprint(runEnd, 0); + setShifted(runEnd, false); + setContinuation(runEnd, false); + } while (true); + } + + boolean delete(final long fingerprint, final long canonicalSlot) { + // if the run doesn't exist, the key can't have possibly been inserted + boolean doesRunExist = isOccupied(canonicalSlot); + if (!doesRunExist) { + return false; + } + long runStartIndex = findRunStart(canonicalSlot); + long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint); + if (matchingFingerprintIndex == -1) { + // we didn't find a matching fingerprint + return false; + } + return delete(fingerprint, canonicalSlot, runStartIndex, matchingFingerprintIndex); + } + + /* + * Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index + */ + long getSlotIndex(final long largeHash) { + return largeHash & getMask(); + } + + long genFingerprint(final long largeHash) { + long fingerprintMask = (1L << fingerprintLength_) - 1L; + fingerprintMask = fingerprintMask << powerOfTwoSize_; + return (largeHash & fingerprintMask) >> powerOfTwoSize_; + } + + void setExpansionThreshold(double thresh) { + expansionThreshold_ = thresh; + maxEntriesBeforeExpansion_ = (long)(Math.pow(2, powerOfTwoSize_) * expansionThreshold_); + } + + /* + This is the main insertion function accessed externally. + It calls the underlying filter _insert function which hashes the input + item internally. + Hence, the `large_hash` argument is already a hash key that has been generated + by the hashing library (eg xxhash). + */ + protected boolean _insert(long large_hash) { + if (isFull_) { + return false; + } + final long slotIndex = getSlotIndex(large_hash); + final long fingerprint = genFingerprint(large_hash); + boolean success = insert(fingerprint, slotIndex); + + if (expandAutonomously_ && numEntries_ >= maxEntriesBeforeExpansion_) { + final boolean expanded = expand(); + if (expanded) { + numExpansions_++; + } } - - public void compute_statistics() { - num_runs = 0; - num_clusters = 0; - double sum_run_lengths = 0; - double sum_cluster_lengths = 0; - - int current_run_length = 0; - int current_cluster_length = 0; - - long num_slots = get_num_slots(); - for (long i = 0; i < num_slots; i++) { - - boolean occupied = is_occupied(i); - boolean continuation = is_continuation(i); - boolean shifted = is_shifted(i); - - if ( !occupied && !continuation && !shifted ) { // empty slot - sum_cluster_lengths += current_cluster_length; - current_cluster_length = 0; - sum_run_lengths += current_run_length; - current_run_length = 0; - } - else if ( !occupied && !continuation && shifted ) { // start of new run - num_runs++; - sum_run_lengths += current_run_length; - current_run_length = 1; - current_cluster_length++; - } - else if ( !occupied && continuation && !shifted ) { - // not used - } - else if ( !occupied && continuation && shifted ) { // continuation of run - current_cluster_length++; - current_run_length++; - } - else if ( occupied && !continuation && !shifted ) { // start of new cluster & run - num_runs++; - num_clusters++; - sum_cluster_lengths += current_cluster_length; - sum_run_lengths += current_run_length; - current_cluster_length = 1; - current_run_length = 1; - } - else if (occupied && !continuation && shifted ) { // start of new run - num_runs++; - sum_run_lengths += current_run_length; - current_run_length = 1; - current_cluster_length++; - } - else if (occupied && continuation && !shifted ) { - // not used - } - else if (occupied && continuation && shifted ) { // continuation of run - current_cluster_length++; - current_run_length++; - } - } - avg_run_length = sum_run_lengths / num_runs; - avg_cluster_length = sum_cluster_lengths / num_clusters; + return success; + } + + protected boolean _delete(final long largeHash) { + final long slotIndex = getSlotIndex(largeHash); + long fingerprint = genFingerprint(largeHash); + boolean success = delete(fingerprint, slotIndex); + if (success) { + numEntries_--; + } + return success; + } + + protected boolean _search(final long largeHash) { + final long slotIndex = getSlotIndex(largeHash); + long fingerprint = genFingerprint(largeHash); + return search(fingerprint, slotIndex); + } + + public boolean getBitAtOffset(final int offset) { + return bitArray_.getBit(offset); + } + + public void computeStatistics() { + numRuns_ = 0; + numClusters_ = 0; + double sumRunLengths = 0; + double sumClusterLengths = 0; + + int currentRunLength = 0; + int currentCluster_length = 0; + + final long numSlots = getNumSlots(); + for (long i = 0; i < numSlots; i++) { + final boolean occupied = isOccupied(i); + final boolean continuation = isContinuation(i); + final boolean shifted = isShifted(i); + + if (!occupied && !continuation && !shifted) { // empty slot + sumClusterLengths += currentCluster_length; + currentCluster_length = 0; + sumRunLengths += currentRunLength; + currentRunLength = 0; + } else if ( !occupied && !continuation && shifted ) { // start of new run + numRuns_++; + sumRunLengths += currentRunLength; + currentRunLength = 1; + currentCluster_length++; + } else if ( !occupied && continuation && !shifted ) { + // not used + } else if ( !occupied && continuation && shifted ) { // continuation of run + currentCluster_length++; + currentRunLength++; + } else if ( occupied && !continuation && !shifted ) { // start of new cluster & run + numRuns_++; + numClusters_++; + sumClusterLengths += currentCluster_length; + sumRunLengths += currentRunLength; + currentCluster_length = 1; + currentRunLength = 1; + } else if (occupied && !continuation && shifted ) { // start of new run + numRuns_++; + sumRunLengths += currentRunLength; + currentRunLength = 1; + currentCluster_length++; + } else if (occupied && continuation && !shifted ) { + // not used + } else if (occupied && continuation && shifted ) { // continuation of run + currentCluster_length++; + currentRunLength++; + } } - -// -// void ar_sum1(ArrayList ar, int index) -// { -// int s = ar.size(); -// if (s <= index) -// { -// for (int i = s; i measure_cluster_length() -// { -// ArrayList ar = new ArrayList(); -// -// num_runs = 0; -// num_clusters = 0; -// -// int current_run_length = 0; -// int current_cluster_length = 0; -// -// int cnt = 0; -// -// for (int i = 0; i < get_logical_num_slots_plus_extensions(); i++) { -// -// boolean occupied = is_occupied(i); -// boolean continuation = is_continuation(i); -// boolean shifted = is_shifted(i); -// -// if (!occupied && !continuation && !shifted ) { // empty slot -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// current_cluster_length = 0; -// current_run_length = 0; -// } -// else if (!occupied && !continuation && shifted ) { // start of new run -// num_runs++; -// current_run_length = 1; -// current_cluster_length++; -// } -// else if (!occupied && continuation && shifted ) { // continuation of run -// current_cluster_length++; -// current_run_length++; -// } -// else if (occupied && !continuation && !shifted ) { // start of new cluster & run -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// num_runs++; -// num_clusters++; -// //if(current_cluster_length == 0) cnt++; -// current_cluster_length = 1; -// current_run_length = 1; -// } -// else if (occupied && !continuation && shifted ) { // start of new run -// num_runs++; -// current_run_length = 1; -// current_cluster_length++; -// } -// else if (occupied && continuation && shifted ) { // continuation of run -// current_cluster_length++; -// current_run_length++; -// } -// } -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// //System.out.println("CNT = " + cnt); -// return ar; -// } -// -// /* -// @charlied -// Returns the fraction of the filter that is occupied by inserted items. -// Extension slots are omitted from the calculation of the load factor because they are used to accomodate -// items in the filter at the top end of the filter. -// Asymptotically, these will make little-to-no difference to the load in these calculations as the slots -// contributed 2*j / (2^j) --> 0 entries. -// */ -// public double get_load() { -// return num_entries / (double) get_logical_num_slots(); -// } + avgRunLength_ = sumRunLengths / numRuns_; + avgClusterLength_ = sumClusterLengths / numClusters_; + } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 4f68cb1f4..60db5be84 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -62,30 +62,30 @@ public void WikiInsertionTest() { qf.insert(C, 1); qf.insert(D, 2); qf.insert(A, 1); - assertEquals(qf.get_num_entries(), 6); + assertEquals(qf.getNumEntries(), 6); assertEquals(getState(qf, 0), 0); - assertEquals(qf.get_fingerprint(0), 0); + assertEquals(qf.getFingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), A); + assertEquals(qf.getFingerprint(1), A); assertEquals(getState(qf, 2), 0b111); - assertEquals(qf.get_fingerprint(2), B); + assertEquals(qf.getFingerprint(2), B); assertEquals(getState(qf, 3), 0b011); - assertEquals(qf.get_fingerprint(3), C); + assertEquals(qf.getFingerprint(3), C); assertEquals(getState(qf, 4), 0b101); - assertEquals(qf.get_fingerprint(4), D); + assertEquals(qf.getFingerprint(4), D); assertEquals(getState(qf, 5), 0b001); - assertEquals(qf.get_fingerprint(5), E); + assertEquals(qf.getFingerprint(5), E); assertEquals(getState(qf, 6), 0); - assertEquals(qf.get_fingerprint(6), 0); + assertEquals(qf.getFingerprint(6), 0); assertEquals(getState(qf, 7), 0b100); - assertEquals(qf.get_fingerprint(7), F); + assertEquals(qf.getFingerprint(7), F); } public int getState(QuotientFilter filter, int slot) { - return (filter.is_occupied(slot) ? 1 : 0) << 2 - | (filter.is_continuation(slot) ? 1 : 0) << 1 - | (filter.is_shifted(slot) ? 1 : 0); + return (filter.isOccupied(slot) ? 1 : 0) << 2 + | (filter.isContinuation(slot) ? 1 : 0) << 1 + | (filter.isShifted(slot) ? 1 : 0); } /* @@ -157,13 +157,13 @@ public void OverflowTest() { final long fp1 = 1; final long fp2 = 1 << fingerprint_size - 1; qf.insert(fp1, num_entries - 1); - assertEquals(qf.get_fingerprint(num_entries - 1), fp1); + assertEquals(qf.getFingerprint(num_entries - 1), fp1); assertEquals(getState(qf, num_entries - 1), 0b100); qf.insert(fp2, num_entries - 1); - assertEquals(qf.get_fingerprint(0), fp2); + assertEquals(qf.getFingerprint(0), fp2); assertEquals(getState(qf, 0), 0b011); qf.delete(fp2, num_entries - 1); - assertEquals(qf.get_fingerprint(0), 0); + assertEquals(qf.getFingerprint(0), 0); assertEquals(getState(qf, 0), 0b000); final boolean found = qf.search(fp1, num_entries - 1); assertTrue(found); @@ -190,7 +190,7 @@ public void testQuotientFilterInsertionAndIteration() { qf.insert(0x1F, 4); qf.insert(0x1F, 15); // last slot in the filter qf.insert(0x1F, 16); // outside the bounds - qf.pretty_print() ; +// qf.pretty_print(); Iterator it = new Iterator(qf); int[] arr = new int[] {2, 3, 4, 15}; @@ -261,8 +261,8 @@ static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slo static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) { for (int i = 0; i < bs.size(); i++) { - if (check_also_fingerprints || (i % qf.bitPerEntry == 0 || i % qf.bitPerEntry == 1 || i % qf.bitPerEntry == 2)) { - if (qf.get_bit_at_offset(i) != bs.get(i)) { + if (check_also_fingerprints || (i % qf.getBitsPerEntry() == 0 || i % qf.getBitsPerEntry() == 1 || i % qf.getBitsPerEntry() == 2)) { + if (qf.getBitAtOffset(i) != bs.get(i)) { return false; } }