From 29b40b59bbd306782672fa8c876b9c0215fdc438 Mon Sep 17 00:00:00 2001 From: lrhodes Date: Thu, 28 Mar 2019 12:00:26 -0700 Subject: [PATCH 01/16] Updated Intersection and AnotB so far. --- .../java/com/yahoo/sketches/theta/AnotB.java | 41 +- .../sketches/theta/DirectCompactSketch.java | 1 + .../sketches/theta/ForwardCompatibility.java | 9 +- .../com/yahoo/sketches/theta/HeapAnotB.java | 235 ++++++------ .../sketches/theta/HeapCompactSketch.java | 5 +- .../yahoo/sketches/theta/Intersection.java | 25 ++ .../sketches/theta/IntersectionImpl.java | 144 +++++--- .../sketches/theta/IntersectionImplR.java | 38 +- .../sketches/theta/PairwiseSetOperations.java | 349 ++++++------------ .../yahoo/sketches/theta/PreambleUtil.java | 85 ++++- .../yahoo/sketches/theta/SetOperation.java | 16 +- .../java/com/yahoo/sketches/theta/Sketch.java | 30 ++ .../com/yahoo/sketches/theta/UnionImpl.java | 119 +++--- .../com/yahoo/sketches/theta/EmptyTest.java | 4 +- .../theta/ForwardCompatibilityTest.java | 8 +- .../yahoo/sketches/theta/HeapAnotBTest.java | 22 +- .../sketches/theta/HeapIntersectionTest.java | 9 + .../theta/PairwiseCornerCasesTest.java | 193 +++++++--- .../theta/PairwiseSetOperationsTest.java | 26 +- 19 files changed, 785 insertions(+), 574 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/AnotB.java b/src/main/java/com/yahoo/sketches/theta/AnotB.java index 73f5196c1..f4e2d9282 100644 --- a/src/main/java/com/yahoo/sketches/theta/AnotB.java +++ b/src/main/java/com/yahoo/sketches/theta/AnotB.java @@ -21,6 +21,8 @@ *

Calling the update function a second time essentially clears the internal state and updates * with the new pair of sketches. * + *

As an alternative, one can use the aNotB method that returns the result immediately. + * * @author Lee Rhodes */ public abstract class AnotB extends SetOperation { @@ -30,10 +32,16 @@ public Family getFamily() { return Family.A_NOT_B; } + /** + * Gets the result of this operation as an ordered CompactSketch on the Java heap + * @return the result of this operation as an ordered CompactSketch on the Java heap + */ + public abstract CompactSketch getResult(); + /** * Gets the result of this set operation as a CompactSketch of the chosen form * @param dstOrdered - * See Destination Ordered + * See Destination Ordered. * * @param dstMem * See Destination Memory. @@ -42,12 +50,6 @@ public Family getFamily() { */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); - /** - * Gets the result of this operation as an ordered CompactSketch on the Java heap - * @return the result of this operation as an ordered CompactSketch on the Java heap - */ - public abstract CompactSketch getResult(); - /** * Perform A-and-not-B set operation on the two given sketches. * A null sketch is interpreted as an empty sketch. @@ -57,4 +59,29 @@ public Family getFamily() { */ public abstract void update(Sketch a, Sketch b); + /** + * Perform A-and-not-B set operation on the two given sketches and return the result as an + * ordered CompactSketch on the heap. + * @param a The incoming sketch for the first argument + * @param b The incoming sketch for the second argument + * @return an ordered CompactSketch on the heap + */ + public CompactSketch aNotB(final Sketch a, final Sketch b) { + return aNotB(a, b, true, null); + } + + /** + * Perform A-and-not-B set operation on the two given sketches and return the result as a + * CompactSketch. + * @param a The incoming sketch for the first argument + * @param b The incoming sketch for the second argument + * @param dstOrdered + * See Destination Ordered. + * @param dstMem + * See Destination Memory. 
+ * @return the result as a CompactSketch. + */ + public abstract CompactSketch aNotB(Sketch a, Sketch b, boolean dstOrdered, + WritableMemory dstMem); + } diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java index f8a6394b3..b270f23f3 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java @@ -22,6 +22,7 @@ abstract class DirectCompactSketch extends CompactSketch { DirectCompactSketch(final Memory mem) { mem_ = mem; + checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java index f2c98359a..9e681b0c0 100644 --- a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java +++ b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java @@ -11,6 +11,7 @@ import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import com.yahoo.memory.Memory; import com.yahoo.sketches.SketchesArgumentException; @@ -59,7 +60,7 @@ static final CompactSketch heapify1to3(final Memory srcMem, final long seed) { final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(24, compactOrderedCache, 0, curCount); - + checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, false, seedHash, curCount, thetaLong); } @@ -93,11 +94,11 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) { validateInputSize(reqBytesIn, memCap); final long thetaLong = (mdLongs < 3) ? 
Long.MAX_VALUE : srcMem.getLong(THETA_LONG); - final boolean empty = (srcMem.getByte(FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; - + boolean empty = (srcMem.getByte(FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; + empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); //force true final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(mdLongs << 3, compactOrderedCache, 0, curCount); - + checkEmptyState(empty, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, empty, seedHash, curCount, thetaLong); } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java b/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java index ba320f1f3..e51789ff0 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java @@ -14,7 +14,6 @@ import com.yahoo.memory.Memory; import com.yahoo.memory.WritableMemory; -import com.yahoo.sketches.HashOperations; import com.yahoo.sketches.Util; /** @@ -28,7 +27,7 @@ final class HeapAnotB extends AnotB { private long thetaLong_; private boolean empty_; private long[] cache_; // no match set - private int curCount_ = 0; + private int curCount_; private int lgArrLongsHT_; //for Hash Table only. may not need to be member after refactoring private long[] bHashTable_; //may not need to be member after refactoring. @@ -39,15 +38,17 @@ final class HeapAnotB extends AnotB { * @param seed See seed */ HeapAnotB(final long seed) { - seedHash_ = Util.computeSeedHash(seed); - a_ = null; - b_ = null; - thetaLong_ = Long.MAX_VALUE; - empty_ = true; - cache_ = null; - curCount_ = 0; - lgArrLongsHT_ = 5; - bHashTable_ = null; + this(Util.computeSeedHash(seed)); + } + + /** + * Construct a new AnotB SetOperation on the java heap. Called by PairwiseSetOperation. + * + * @param seedHash 16 bit hash of the chosen update seed. 
+ */ + HeapAnotB(final short seedHash) { + seedHash_ = seedHash; + reset(); } @Override @@ -63,6 +64,18 @@ public void update(final Sketch a, final Sketch b) { compute(); } + @Override + public CompactSketch aNotB(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + update(a, b); + return getResult(dstOrdered, dstMem); + } + + @Override + public CompactSketch getResult() { + return getResult(true, null); + } + @Override public CompactSketch getResult(final boolean dstOrdered, final WritableMemory dstMem) { final long[] compactCache = (curCount_ <= 0) @@ -72,17 +85,13 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds Arrays.sort(compactCache); } //Create the CompactSketch + final boolean empty = (curCount_ == 0) && (thetaLong_ == Long.MAX_VALUE); final CompactSketch comp = createCompactSketch( - compactCache, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); + compactCache, empty, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); reset(); return comp; } - @Override - public CompactSketch getResult() { - return getResult(true, null); - } - @Override int getRetainedEntries(final boolean valid) { return curCount_; @@ -109,149 +118,135 @@ void compute() { // NOTES: // In the table below, A and B refer to the two input sketches in the order A-not-B. - // The Theta rule: min( ThetaA, ThetaB) - // The Empty rule: Whatever A is: E(a) + // The Theta rule: min(ThetaA, ThetaB) + // The Empty rule: Whatever the empty state of A is: E(A) // The Return triple is defined as: (Theta, Count, EmptyFlag). // bHashTable temporarily stores the values of B. - // A sketch in stored form can be in one of 5 states + // A sketch in stored form can be in one of 5 states. // Null is not actually a state, but is included for completeness. // Null is interpreted as {Theta = 1.0, count = 0, empty = true}. - // The empty state may have Theta < 1.0, but count must be zero. 
+ // The empty state may have Theta < 1.0 but it is ignored; count must be zero. // State: - // 0 N null + // 0 N Null // 1 E Empty // 2 C Compact, not ordered // 3 O Compact Ordered // 4 H Hash-Table // - //A B swA swB Case Action + //A B swA swB Case Actions //N N 0 0 0 Return (1.0, 0, T) - //N E 0 1 1 Return B: (ThB, 0, T) - //N C 0 2 2 Return (ThB, 0, T) - //N O 0 3 3 Return (ThB, 0, T) - //N H 0 4 4 Return (ThB, 0, T) - //E N 1 0 8 Return A: (ThA, 0, T) - //E E 1 1 9 Return (minT, 0, T) - //E C 1 2 10 Return (minT, 0, T) - //E O 1 3 11 Return (minT, 0, T) - //E H 1 4 12 Return (minT, 0, T) - //C N 2 0 16 Return A: (ThA, |A|, E(a)) - //C E 2 1 17 Return (minT, |A| < minT, E(a)) - //C C 2 2 18 B -> H; => C,H - //C O 2 3 19 B -> H; => C,H - //C H 2 4 20 scan all A, search B, on nomatch -> list (same as HH) - //O N 3 0 24 Return A: (ThA, |A|, E(a)) - //O E 3 1 25 Return (minT, |A| < minT, E(a)) - //O C 3 2 26 B -> H; => O,H - //O O 3 3 27 B -> H; => O,H - //O H 3 4 28 scan A early stop, search B, on nomatch -> list - //H N 4 0 32 Return A: (ThA, |A|, E(a)) - //H E 4 1 33 Return (minT, |A|< minT, E(a)) - //H C 4 2 34 B -> H; => H,H - //H O 4 3 35 B -> H; => H,H - //H H 4 4 36 scan all A, search B, on nomatch -> list + //N E 0 1 1 CheckB, Return (1.0, 0, T) + //N C 0 2 2 CheckB, Return (1.0, 0, T) + //N O 0 3 3 CheckB, Return (1.0, 0, T) + //N H 0 4 4 CheckB, Return (1.0, 0, T) + //E N 1 0 8 CheckA, Return (1.0, 0, T) + //E E 1 1 9 CheckAB, Return (1.0, 0, T) + //E C 1 2 10 CheckAB, Return (1.0, 0, T) + //E O 1 3 11 CheckAB, Return (1.0, 0, T) + //E H 1 4 12 CheckAB, Return (1.0, 0, T) + //C N 2 0 16 CheckA, Return (ThA, |A|, F), copyA + //C E 2 1 17 CheckAB, Return (ThA, |A|, F)), copyA + //C C 2 2 18 CheckAB, B -> H; => C,H; scanAllAsearchB() + //C O 2 3 19 CheckAB, B -> H; => C,H; scanAllAsearchB() + //C H 2 4 20 CheckAB, scanAllAsearchB() + //O N 3 0 24 CheckA, Return (ThA, |A|, F), copyA + //O E 3 1 25 CheckAB, Return (ThA, |A|, F), copyA + //O C 3 2 26 
CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + //O O 3 3 27 CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + //O H 3 4 28 CheckAB, scanEarlyStopAsearchB() + //H N 4 0 32 CheckA, Return (ThA, |A|, F), copyA + //H E 4 1 33 CheckAB, Return (ThA, |A|, F), copyA + //H C 4 2 34 CheckAB, B -> H; => H,H; scanAllAsearchB() + //H O 4 3 35 CheckAB, B -> H; => H,H; scanAllAsearchB() + //H H 4 4 36 CheckAB, scanAllAsearchB() switch (sw) { - case 0 : { //A and B are null. + case 0 : //A Null, B Null; Return (1.0, 0, T) thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{1.0, 0, T} - } - case 1: - case 2: - case 3: - case 4: { //A is null, B is valid + break; + + case 10: //A Empty, B Compact; CheckAB, Return (1.0, 0, T) + case 11: //A Empty, B Ordered; CheckAB, Return (1.0, 0, T) + case 12: //A Empty, B HashTbl; CheckAB, Return (1.0, 0, T) + Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] + //$FALL-THROUGH$ + case 1: //A Null, B Empty; CheckB, Return (1.0, 0, T) + case 2: //A Null, B Compact; CheckB, Return (1.0, 0, T) + case 3: //A Null, B Ordered; CheckB, Return (1.0, 0, T) + case 4: //A Null, B HashTbl; CheckB, Return (1.0, 0, T) Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = b_.getThetaLong(); + thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{ThB, 0, T} - } - case 8: { //A is empty, B is null + break; + + case 9: //A Empty, B Empty; CheckAB, Return (1.0, 0, T) + Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] + //$FALL-THROUGH$ + case 8: //A Empty, B Null; CheckA, Return (1.0, 0, T) Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = a_.getThetaLong(); + thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{ThA, 0, T} - } - case 9: - case 10: - case 11: - case 12: { //A empty, B valid - Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm 
[java/dereferenced-value-may-be-null] + break; + + case 17: //A Compact, B Empty; CheckAB, Return (ThA, |A|, F), copyA + case 25: //A Ordered, B Empty; CheckAB, Return (ThA, |A|, F), copyA + case 33: //A HashTbl, B Empty; CheckAB, Return (ThA, |A|, F), copyA Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = true; - break; //{minT, 0, T} - } - case 16: - case 24: - case 32: { //A valid, B null + //$FALL-THROUGH$ + case 16: //A Compact, B Null; CheckA, Return (ThA, |A|, F), copyA + case 24: //A Ordered, B Null; CheckA, Return (ThA, |A|, F), copyA + case 32: //A HashTbl, B Null; CheckA, Return (ThA, |A|, F), copyA Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = a_.getThetaLong(); - empty_ = a_.isEmpty(); - //move A to cache + empty_ = false; curCount_ = a_.getRetainedEntries(true); cache_ = compactCache(a_.getCache(), curCount_, thetaLong_, false); - break; //{ThA, |A|, E(a)} - } - case 17: - case 25: - case 33: { //A valid, B empty - Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //move A < theta to cache - final long[] cache = a_.getCache(); - curCount_ = HashOperations.count(cache, thetaLong_); - cache_ = compactCache(cache, curCount_, thetaLong_, false); - break; //{minT, |A| < minT , E(a)} - } - case 18: - case 19: - case 34: - case 35: { //A compact or HT, B compact or ordered + break; + + case 18: //A Compact, B Compact; CheckAB, B -> H; => C,H; scanAllAsearchB() + case 19: //A Compact, B Ordered; CheckAB, B -> H; => C,H; scanAllAsearchB() + case 34: //A HashTbl, B Compact; CheckAB, B -> H; => H,H; scanAllAsearchB() + case 35: //A HashTbl, B Ordered; CheckAB, B -> 
H; => H,H; scanAllAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //must convert B to HT - convertBtoHT(); //builds HT from B - scanAllAsearchB(); //builds cache, curCount from A, HT - break; //{minT, n, E(a)} - } - case 26: - case 27: { //A ordered early stop, B compact or ordered + empty_ = false; + convertBtoHT(); + scanAllAsearchB(); + break; + + case 26: //A Ordered, B Compact; CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + case 27: //A Ordered, B Ordered; CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - convertBtoHT(); //builds HT from B + empty_ = false; + convertBtoHT(); scanEarlyStopAsearchB(); - break; //{minT, n, E(a)} - } - case 20: - case 36: { //A compact or HT, B is already HT + break; + + case 20: //A Compact, B HashTbl; CheckAB, scanAllAsearchB() + case 36: //A HashTbl, B HashTbl; CheckAB, scanAllAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //b is already HT + empty_ = false; lgArrLongsHT_ = ((UpdateSketch)b_).getLgArrLongs(); - bHashTable_ = b_.getCache(); //safe as bHashTable is read-only - scanAllAsearchB(); //builds cache, curCount from A, HT - break; //{minT, n, E(a)} - } - case 28: { //A ordered early stop, B is already hashtable + bHashTable_ = b_.getCache(); + scanAllAsearchB(); + break; + + case 28: 
//A Ordered, B HashTbl; CheckAB, scanEarlyStopAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //b is already HT + empty_ = false; lgArrLongsHT_ = ((UpdateSketch)b_).getLgArrLongs(); - bHashTable_ = b_.getCache(); //safe as bHashTable is read-only + bHashTable_ = b_.getCache(); scanEarlyStopAsearchB(); - break; //{minT, n, E(a)} - } + break; + //default: //This cannot happen and cannot be tested } } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java index 72f60bb61..29543bc85 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java @@ -40,10 +40,11 @@ abstract class HeapCompactSketch extends CompactSketch { final int curCount, final long thetaLong) { empty_ = empty; seedHash_ = seedHash; - curCount_ = curCount; - thetaLong_ = thetaLong; + curCount_ = empty ? 0 : curCount; + thetaLong_ = empty ? Long.MAX_VALUE : thetaLong; cache_ = cache; preLongs_ = computeCompactPreLongs(thetaLong, empty, curCount); + checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/Intersection.java b/src/main/java/com/yahoo/sketches/theta/Intersection.java index 5686bc81d..d4bee3d2c 100644 --- a/src/main/java/com/yahoo/sketches/theta/Intersection.java +++ b/src/main/java/com/yahoo/sketches/theta/Intersection.java @@ -70,4 +70,29 @@ public Family getFamily() { */ public abstract void update(Sketch sketchIn); + /** + * Perform intersect set operation on the two given sketch arguments and return the result as an + * ordered CompactSketch on the heap. 
+ * @param a The first sketch argument + * @param b The second sketch argument + * @return an ordered CompactSketch on the heap + */ + public CompactSketch intersect(final Sketch a, final Sketch b) { + return intersect(a, b, true, null); + } + + /** + * Perform intersect set operation on the two given sketches and return the result as a + * CompactSketch. + * @param a The first sketch argument + * @param b The second sketch argument + * @param dstOrdered + * See Destination Ordered. + * @param dstMem + * See Destination Memory. + * @return the result as a CompactSketch. + */ + public abstract CompactSketch intersect(Sketch a, Sketch b, boolean dstOrdered, + WritableMemory dstMem); + } diff --git a/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java b/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java index 817ff00ee..efa3643f3 100644 --- a/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java @@ -41,6 +41,10 @@ private IntersectionImpl(final WritableMemory wmem, final long seed, final boole super(wmem, seed, newMem); } + IntersectionImpl(final short seedHash) { + super(seedHash); + } + /** * Construct a new Intersection target on the java heap. * @@ -57,6 +61,7 @@ static IntersectionImpl initNewHeapInstance(final long seed) { return impl; } + /** * Construct a new Intersection target direct to the given destination Memory. * Called by SetOperation.Builder. @@ -163,83 +168,118 @@ static IntersectionImpl wrapInstance(final WritableMemory srcMem, final long see @Override public void update(final Sketch sketchIn) { - final boolean firstCall = curCount_ < 0; - - //Corner cases - if (sketchIn == null) { //null -> Th = 1.0, count = 0, empty = true - //No seedHash to check - //Because of the def of null above and the Empty Rule (which is OR) empty_ must be true. + if (sketchIn != null) { + Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); + } + //Null / Empty cases. 
+ //Note: null == empty := Th = 1.0, count = 0, empty = true + if ((sketchIn == null) || sketchIn.isEmpty() || empty_) { //empty rule + //Because of the def of null above and the Empty Rule (which is OR), empty_ must be true. + //Whatever the current internal state, we make it empty. empty_ = true; - thetaLong_ = firstCall ? Long.MAX_VALUE : thetaLong_; //if Nth call, stays the same + thetaLong_ = Long.MAX_VALUE; curCount_ = 0; + lgArrLongs_ = 0; + maxLgArrLongs_ = 0; + hashTable_ = null; if (mem_ != null) { - PreambleUtil.setEmpty(mem_); + PreambleUtil.setEmpty(mem_); //true insertThetaLong(mem_, thetaLong_); insertCurCount(mem_, 0); + insertLgArrLongs(mem_, lgArrLongs_); } return; } - //Checks - Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); - thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule - empty_ = empty_ || sketchIn.isEmpty(); //Empty rule - + empty_ = false; if (mem_ != null) { insertThetaLong(mem_, thetaLong_); - if (empty_) { PreambleUtil.setEmpty(mem_); } - else { clearEmpty(mem_); } + PreambleUtil.clearEmpty(mem_); //false } final int sketchInEntries = sketchIn.getRetainedEntries(true); - // The truth table for the following state machine for corner cases: - // Case CurCount SketchInEntries | Actions - // 1 <0 0 | CurCount = 0; HT = null; exit + // The truth table for the following state machine + // Case curCount sketchInEntries | Actions + // 1 <0 0 | First update, curCount = 0; HT = null; exit // 2 0 0 | CurCount = 0; HT = null; exit // 3 >0 0 | CurCount = 0; HT = null; exit - // 4 <0 >0 | Clone SketchIn; exit - // 5 0 >0 | CurCount = 0; HT = null; exit - // 6 >0 >0 | Perform full intersect - - if ((curCount_ == 0) || (sketchInEntries == 0)) { //Cases 1,2,3,5 - //All future intersections result in zero data, but theta can still be reduced. - curCount_ = 0; - if (mem_ != null) { insertCurCount(mem_, 0); } - hashTable_ = null; //No need for a HT. 
Don't bother clearing mem if valid - } - else if (firstCall) { //Case 4: Clone the incoming sketch - curCount_ = sketchIn.getRetainedEntries(true); - final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); - final int priorLgArrLongs = lgArrLongs_; //prior only used in error message - lgArrLongs_ = requiredLgArrLongs; - - if (mem_ != null) { //Off heap, check if current dstMem is large enough - insertCurCount(mem_, curCount_); - insertLgArrLongs(mem_, lgArrLongs_); - if (requiredLgArrLongs <= maxLgArrLongs_) { //OK - mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required + // 5 <0 >0 | First update, clone SketchIn; exit + // 6 0 >0 | CurCount = 0; HT = null; exit + // 7 >0 >0 | Perform full intersect + final int sw = ((curCount_ < 0) ? 1 : (curCount_ == 0) ? 2 : 3) + | (((sketchInEntries > 0) ? 1 : 0) << 2) ; + switch (sw) { + case 1: + case 2: + case 3: + case 6: { //(curCount_ == 0) || (sketchInEntries == 0) + //All future intersections result in zero data, but theta can still be reduced. + curCount_ = 0; + if (mem_ != null) { insertCurCount(mem_, 0); } + hashTable_ = null; //No need for a HT. 
Don't bother clearing mem if valid + break; + } + case 5: { // curCount_ < 0; This is the 1st update, clone the incoming sketch + curCount_ = sketchIn.getRetainedEntries(true); + final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); + final int priorLgArrLongs = lgArrLongs_; //prior only used in error message + lgArrLongs_ = requiredLgArrLongs; + + if (mem_ != null) { //Off heap, check if current dstMem is large enough + insertCurCount(mem_, curCount_); + insertLgArrLongs(mem_, lgArrLongs_); + if (requiredLgArrLongs <= maxLgArrLongs_) { //OK + mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required + } + else { //not enough space in dstMem + throw new SketchesArgumentException( + "Insufficient dstMem hash table space: " + + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); + } } - else { //not enough space in dstMem - throw new SketchesArgumentException( - "Insufficient dstMem hash table space: " - + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); + else { //On the heap, allocate a HT + hashTable_ = new long[1 << lgArrLongs_]; } + moveDataToTgt(sketchIn.getCache(), curCount_); + break; } - else { //On the heap, allocate a HT - hashTable_ = new long[1 << lgArrLongs_]; + case 7: { // (curCount > 0) && (sketchInEntries > 0); Perform full intersect + //Sets resulting hashTable, curCount and adjusts lgArrLongs + performIntersect(sketchIn); + break; } - - moveDataToTgt(sketchIn.getCache(), curCount_); + //default: not possible } - else { //Case 6: Perform full intersect - //Sets resulting hashTable, curCount and adjusts lgArrLongs - performIntersect(sketchIn); + } + + @Override + public CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + reset(); + update(a); + update(b); + return getResult(dstOrdered, dstMem); + } + + @Override + public void reset() { + curCount_ = -1; + thetaLong_ = Long.MAX_VALUE; + empty_ = false; + hashTable_ = 
null; + if (mem_ != null) { + insertLgArrLongs(mem_, lgArrLongs_); //make sure + insertCurCount(mem_, -1); + insertThetaLong(mem_, Long.MAX_VALUE); + clearEmpty(mem_); } } - void performIntersect(final Sketch sketchIn) { + //restricted + + private void performIntersect(final Sketch sketchIn) { // curCount and input data are nonzero, match against HT assert ((curCount_ > 0) && (!empty_)); final long[] cacheIn = sketchIn.getCache(); @@ -299,7 +339,7 @@ void performIntersect(final Sketch sketchIn) { } } - void moveDataToTgt(final long[] arr, final int count) { + private void moveDataToTgt(final long[] arr, final int count) { final int arrLongsIn = arr.length; int tmpCnt = 0; if (mem_ != null) { //Off Heap puts directly into mem diff --git a/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java b/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java index 342f72045..816308c27 100644 --- a/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java +++ b/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java @@ -20,10 +20,6 @@ import static com.yahoo.sketches.theta.PreambleUtil.SER_VER; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; -import static com.yahoo.sketches.theta.PreambleUtil.clearEmpty; -import static com.yahoo.sketches.theta.PreambleUtil.insertCurCount; -import static com.yahoo.sketches.theta.PreambleUtil.insertLgArrLongs; -import static com.yahoo.sketches.theta.PreambleUtil.insertThetaLong; import com.yahoo.memory.Memory; import com.yahoo.memory.WritableMemory; @@ -38,7 +34,7 @@ * *

This implementation uses data either on-heap or off-heap in a given Memory * that is owned and managed by the caller. - * The off-heap Memory, which if managed properly will greatly reduce the need for + * The off-heap Memory, if managed properly, will greatly reduce the need for * the JVM to perform garbage collection.

* * @author Lee Rhodes @@ -72,6 +68,16 @@ class IntersectionImplR extends Intersection { } } + IntersectionImplR(final short seedHash) { + seedHash_ = seedHash; + mem_ = null; + lgArrLongs_ = 0; + curCount_ = -1; + thetaLong_ = Long.MAX_VALUE; + empty_ = false; + hashTable_ = null; + } + /** * Wrap an Intersection target around the given source Memory containing intersection data. * @param srcMem The source Memory image. @@ -87,7 +93,7 @@ static IntersectionImplR wrapInstance(final Memory srcMem, final long seed) { static IntersectionImplR internalWrapInstance(final Memory srcMem, final IntersectionImplR impl) { //Get Preamble //Note: Intersection does not use lgNomLongs (or k), per se. - //seedHash loaded and checked in private constructor + //seedHash loaded and checked in constructor final int preLongsMem = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; final int serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF; final int famID = srcMem.getByte(FAMILY_BYTE) & 0XFF; @@ -131,7 +137,8 @@ static IntersectionImplR internalWrapInstance(final Memory srcMem, final Interse public CompactSketch getResult(final boolean dstOrdered, final WritableMemory dstMem) { if (curCount_ < 0) { throw new SketchesStateException( - "Calling getResult() with no intervening intersections is not a legal result."); + "Calling getResult() with no intervening intersections would represent the infinite set, " + + "which is not a legal result."); } long[] compactCacheR; @@ -187,16 +194,7 @@ public boolean isSameResource(final Memory that) { @Override public void reset() { - curCount_ = -1; - thetaLong_ = Long.MAX_VALUE; - empty_ = false; - hashTable_ = null; - if (mem_ != null) { - insertLgArrLongs(mem_, lgArrLongs_); //make sure - insertCurCount(mem_, -1); - insertThetaLong(mem_, Long.MAX_VALUE); - clearEmpty(mem_); - } + throw new SketchesReadOnlyException(); } @Override @@ -240,6 +238,12 @@ public void update(final Sketch sketchIn) { throw new SketchesReadOnlyException(); } + @Override + public 
CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + throw new SketchesReadOnlyException(); + } + //restricted @Override diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index 4c2d7d900..7bc5bc480 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -5,207 +5,56 @@ package com.yahoo.sketches.theta; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; + import java.util.Arrays; -import com.yahoo.sketches.HashOperations; import com.yahoo.sketches.SketchesArgumentException; -import com.yahoo.sketches.SketchesException; import com.yahoo.sketches.Util; /** * Set Operations where the arguments are presented in pairs as in C = Op(A,B). These are - * stateless operations and the result is returned immediately. These operations are designed for - * high performance and only accept ordered, CompactSketches, which may be either Heap-based or - * Direct. The returned results are always in the form of an ordered CompactSketch. + * stateless operations and the result is returned immediately. + * + *

These operations are designed for convenience and accept Sketches that may be either + * Heap-based or Direct. * * @author Lee Rhodes */ public class PairwiseSetOperations { /** - * This implements a stateless, pair-wise intersection operation on ordered, - * CompactSketches that are either Heap-based or Direct. + * This implements a stateless, pair-wise Intersect operation on sketches + * that are either Heap-based or Direct. * If both inputs are null a null is returned. - * If one is null an empty sketch is returned. - * @param skA The first ordered, CompactSketch argument. - * @param skB The second ordered, CompactSketch argument. - * @return the result as an ordered CompactSketch. + * + * @param skA The first Sketch argument. + * @param skB The second Sketch argument. + * @return the result as an ordered CompactSketch on the heap. */ - public static CompactSketch intersect(final CompactSketch skA, final CompactSketch skB) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - return HeapCompactOrderedSketch - .compact(new long[0], true, skB.getSeedHash(), 0, skB.getThetaLong()); - } - if (skB == null) { - return HeapCompactOrderedSketch - .compact(new long[0], true, skA.getSeedHash(), 0, skA.getThetaLong()); - } - - //Both sketches are valid, check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - //Full Intersection - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean emptyRule = emptyA || emptyB; //Empty rule is OR - - final long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - - if (emptyRule) { //even if emptyRule = true, theta can be < 1.0 - return HeapCompactOrderedSketch - 
.compact(new long[0], emptyRule, seedHash, 0, thetaLong); - } - - //Both sketches are non-empty - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - final long[] cacheB = (skB.isDirect()) ? skB.getCache() : skB.getCache().clone(); - final int aLen = cacheA.length; - final int bLen = cacheB.length; - - final long[] outCache = new long[Math.min(aLen, bLen)]; - - int indexA = 0; - int indexB = 0; - int outCount = 0; - - while ((indexA < aLen) && (indexB < bLen)) { - final long hashA = cacheA[indexA]; - final long hashB = cacheB[indexB]; - - if ((hashA >= thetaLong) || (hashB >= thetaLong)) { - break; - } - - if (hashA == hashB) { - outCache[outCount++] = hashA; - ++indexA; - ++indexB; - } else if (hashA < hashB) { - ++indexA; - } else { - ++indexB; - } - } - - return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outCount), emptyRule, seedHash, outCount, thetaLong); + public static CompactSketch intersect(final Sketch skA, final Sketch skB) { + if ((skA == null) && (skB == null)) { return null; } + final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash(); + final Intersection inter = new IntersectionImpl(seedHash); + return inter.intersect(skA, skB, true, null); } /** - * This implements a stateless, pair-wise A AND NOT B operation on ordered, - * CompactSketches that are either Heap-based or Direct. - * If both inputs are null a null is returned. If skA is null an empty sketch is returned. - * If skB is null or empty skA is returned. + * This implements a stateless, pair-wise A AND NOT B operation on Sketches + * that are either Heap-based or Direct. + * If both inputs are null a null is returned. * - * @param skA The first ordered, CompactSketch argument. - * @param skB The second ordered, CompactSketch argument. - * @return the result as an ordered CompactSketch. 
- */ //see HeapAnotB.compute() for return rule table - public static CompactSketch aNotB(final CompactSketch skA, final CompactSketch skB) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - if (!skB.isOrdered()) { - throw new SketchesException("skB must be ordered!"); - } - //return rule {ThB, 0, T} - return HeapCompactOrderedSketch - .compact(new long[0], true, skB.getSeedHash(), 0, skB.getThetaLong()); - } - if (skB == null) { - if (!skA.isOrdered()) { - throw new SketchesException("skA must be ordered!"); - } - return skA; //return rule {ThA, |A|, E(a)} - } - - //Both sketches are valid check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean bothEmpty = emptyA && emptyB; - - final long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - final boolean emptyRule = emptyA; //Empty rule is whatever A is - - if (emptyA || bothEmpty) { //return rule {minT, 0, T} - return HeapCompactOrderedSketch - .compact(new long[0], emptyRule, seedHash, 0, thetaLong); - } - - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - - if (emptyB) { //return rule {minT, |A| < minT , E(a)} - final int curCount = HashOperations.count(cacheA, thetaLong); - final long[] cache = CompactSketch.compactCache(cacheA, curCount, thetaLong, true); - return HeapCompactOrderedSketch - .compact(cache, emptyRule, seedHash, curCount, thetaLong); - } - - //Both are non-empty - final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); - - final int aLen = cacheA.length; - final int bLen = cacheB.length; - - final long[] outCache = new long[aLen]; - - int indexA = 0; - int indexB = 0; - int indexOut = 0; - long hashA = cacheA[indexA]; - long hashB = cacheB[indexB]; - - while ((indexA < aLen) || (indexB < bLen)) { - if (hashA == hashB) { - if (hashA < thetaLong) { - //reject - hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; - hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; - continue; - } - break; - } - else if (hashA < hashB) { - if (hashA < thetaLong) { - outCache[indexOut++] = hashA; //keep - hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; - continue; - } - break; - } - else { //hashA > hashB - if (hashB < thetaLong) { - //reject - hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; - continue; - } - break; - } - } - - final int outLen = indexOut; - - return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outLen), emptyA, seedHash, outLen, thetaLong); + * @param skA The first Sketch argument. + * @param skB The second Sketch argument. + * @return the result as an ordered CompactSketch on the heap. + */ + public static CompactSketch aNotB(final Sketch skA, final Sketch skB) { + if ((skA == null) && (skB == null)) { return null; } + final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash(); + final HeapAnotB anotb = new HeapAnotB(seedHash); + return anotb.aNotB(skA, skB, true, null); } - /** * This implements a stateless, pair-wise union operation on ordered, * CompactSketches that are either Heap-based or Direct. @@ -223,69 +72,75 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s /** * This implements a stateless, pair-wise union operation on ordered, - * CompactSketches that are either Heap-based or Direct. - * If both inputs are null a null is returned. - * If one is null the other is returned, which can be either Heap-based or Direct. 
+ * CompactSketches that are either Heap-based or Direct. The returned sketch will be cutback to + * k if required, similar to the regular Union operation. If a cutback is required, the returned + * sketch will always be on the heap. + * If both inputs are null a null is returned. If either sketch is empty its Theta is ignored. + * If one is null the other is returned, which may be either Direct or heap-based if a cutback + * is required. * * @param skA The first ordered, CompactSketch argument. * @param skB The second ordered, CompactSketch argument * @param k The upper bound of the number of entries to be retained by the sketch * @return the result as an ordered CompactSketch. */ + @SuppressWarnings("null") public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - if (!skB.isOrdered()) { //must be ordered - throw new SketchesException("skB must be ordered!"); + //Handle all corner cases with null or empty arguments + //For backward compatibility, we must allow input empties with Theta < 1.0. + final int swA = (skA == null) ? 1 : skA.isEmpty() ? 2 : 3; + final int swB = (skB == null) ? 1 : skB.isEmpty() ? 2 : 3; + final int sw = (swA << 2) | swB; + switch (sw) { + case 5: { //skA == null; skB == null; return null. Cannot determine seedhash. + return null; } - if (skB.getRetainedEntries(true) > k) { //guarantees cutback to k - final long[] cacheB = (skB.isDirect()) ? skB.getCache() : skB.getCache().clone(); - final long thetaLong = cacheB[k]; - final long[] arrB = Arrays.copyOf(cacheB, k); - return HeapCompactOrderedSketch - .compact(arrB, skB.isEmpty(), skB.getSeedHash(), k, thetaLong); + case 6: { //skA == null; skB == empty; return empty + checkOrdered(skB); + return (skB.getThetaLong() == Long.MAX_VALUE) ? 
skB : + HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); } - return skB; - } - - if (skB == null) { - if (!skA.isOrdered()) { //must be ordered - throw new SketchesException("skA must be ordered!"); + case 7: { //skA == null; skB == valid; return skB + checkOrdered(skB); + return maybeCutback(skB, k); } - if (skA.getRetainedEntries(true) > k) { //guarantees cutback to k - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - final long thetaLong = cacheA[k]; - final long[] arrA = Arrays.copyOf(cacheA, k); - return HeapCompactOrderedSketch - .compact(arrA, skA.isEmpty(), skA.getSeedHash(), k, thetaLong); + case 9: { //skA == empty; skB == null; return empty + checkOrdered(skA); + return (skA.getThetaLong() == Long.MAX_VALUE) ? skA : + HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); } - return skA; - } - - //Both sketches are valid check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean bothEmptyRule = emptyA && emptyB; //Empty rule is AND - - if (bothEmptyRule) { - return (skA.getThetaLong() < skB.getThetaLong()) ? 
skA : skB; + case 10: { //skA == empty; skB == empty; return empty + final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + if (skA.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skA); return skA; } + if (skB.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skB); return skB; } + return HeapCompactOrderedSketch.compact(new long[0], true, seedHash, 0, Long.MAX_VALUE); + } + case 11: { //skA == empty; skB == valid; return skB + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skB); + return maybeCutback(skB, k); + } + case 13: { //skA == valid; skB == null; return skA + checkOrdered(skA); + return maybeCutback(skA, k); + } + case 14: { //skA == valid; skB == empty; return skA + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skA); + return maybeCutback(skA, k); + } + case 15: { //skA == valid; skB == valid; perform full union + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skA); + checkOrdered(skB); + break; + } + //default: cannot happen } - long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - - // Attempting to shortcut this if one of the arguments is "empty" turns out to be complex. - // The theta of an empty sketch could be < 1.0 and will empact the other sketch. - + //Both sketches are valid with matching seedhashes and ordered //Full Union operation + long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); final int aLen = cacheA.length; @@ -339,10 +194,40 @@ else if (hashA < hashB) { } } - final int outLen = indexOut; - + int curCount = indexOut; + final long[] outArr; + if (indexOut > k) { + outArr = Arrays.copyOf(outCache, k); //cutback to k + curCount = k; + } else { + outArr = Arrays.copyOf(outCache, curCount); //copy only valid items + } + checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outLen), bothEmptyRule, seedHash, outLen, thetaLong); + .compact(outArr, false, skA.getSeedHash(), curCount, thetaLong); + } + + private static CompactSketch maybeCutback(final CompactSketch csk, final int k) { + final boolean empty = csk.isEmpty(); + int curCount = csk.getRetainedEntries(true); + long thetaLong = csk.getThetaLong(); + if (curCount > k) { //cutback to k + final long[] cache = (csk.isDirect()) ? csk.getCache() : csk.getCache().clone(); + thetaLong = cache[k]; + final long[] arr = Arrays.copyOf(cache, k); + curCount = k; + checkEmptyState(empty, curCount, thetaLong); + return HeapCompactOrderedSketch + .compact(arr, empty, csk.getSeedHash(), curCount, thetaLong); + } + checkEmptyState(empty, curCount, thetaLong); + return csk; + } + + private static void checkOrdered(final CompactSketch csk) { + if (!csk.isOrdered()) { + throw new SketchesArgumentException("Given sketch must be ordered."); + } } } diff --git a/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java b/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java index 15b9fbe22..c35597383 100644 --- a/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java +++ b/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java @@ -33,16 +33,79 @@ * *

An empty CompactSketch only requires 8 bytes.

* - *

A SingleItemSketch requires an 8 byte preamble plus a single hash item of 8 bytes.

+ *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
+ * 
+ * + *

A SingleItemSketch (extends CompactSketch) requires an 8 byte preamble plus a single + * hash item of 8 bytes.

+ * + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||---------------------------Single long hash----------------------------------------|
+ * 
* *

An exact (non-estimating) CompactSketch requires 16 bytes of preamble plus a compact array of * longs.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 2   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||----------------------Start of Compact Long Array----------------------------------|
+ * 
+ * *

An estimating CompactSketch requires 24 bytes of preamble plus a compact array of longs.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 3   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||------------------------------THETA_LONG-------------------------------------------|
+ *
+ *      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
+ *  3   ||----------------------Start of Compact Long Array----------------------------------|
+ *  
+ * *

An UpdateSketch requires 24 bytes of preamble plus a non-compact array of longs representing a * hash table.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |  LgArr |  lgNom | FamID  | SerVer | RF, PreLongs = 3   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||------------------------------THETA_LONG-------------------------------------------|
+ *
+ *      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
+ *  3   ||----------------------Start of Hash Table of longs---------------------------------|
+ *  
+ * *

Union objects require 32 bytes of preamble plus a non-compact array of longs representing a * hash table.

* @@ -50,7 +113,7 @@ * Long || Start Byte Adr: * Adr: * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | LgArr | lgNom | FamID | SerVer | RF, Preamble_Longs | + * 0 || Seed Hash | Flags | LgArr | lgNom | FamID | SerVer | RF, PreLongs = 4 | * * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | * 1 ||-----------------p-----------------|----------Retained Entries Count---------------| @@ -59,7 +122,11 @@ * 2 ||------------------------------THETA_LONG-------------------------------------------| * * || 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | - * 3 ||---------------------------Start of Long Array-------------------------------------| + * 3 ||---------------------------UNION THETA LONG----------------------------------------| + * + * || 39 | 38 | 37 | 36 | 35 | 34 | 33 | 32 | + * 4 ||----------------------Start of Hash Table of longs---------------------------------| + * * * * @author Lee Rhodes @@ -161,23 +228,23 @@ static String preambleToString(final Memory mem) { final int seedHash = extractSeedHash(mem); - //assumes preLongs == 1 - int curCount = singleItem ? 1 : 0; //preLongs 1 empty or singleItem + //assumes preLongs == 1; empty or singleItem + int curCount = singleItem ? 
1 : 0; float p = (float) 1.0; //preLongs 1 or 2 long thetaLong = Long.MAX_VALUE; //preLongs 1 or 2 long thetaULong = thetaLong; //preLongs 1, 2 or 3 - if (preLongs == 2) { + if (preLongs == 2) { //exact (non-estimating) CompactSketch curCount = extractCurCount(mem); p = extractP(mem); } - else if (preLongs == 3) { + else if (preLongs == 3) { //Update Sketch curCount = extractCurCount(mem); p = extractP(mem); thetaLong = extractThetaLong(mem); thetaULong = thetaLong; } - else if (preLongs == 4) { + else if (preLongs == 4) { //Union curCount = extractCurCount(mem); p = extractP(mem); thetaLong = extractThetaLong(mem); @@ -358,7 +425,7 @@ static void insertUnionThetaLong(final WritableMemory wmem, final long unionThet wmem.putLong(UNION_THETA_LONG, unionThetaLong); } - //TODO convert to set/clear/any bits + //TODO convert these to set/clear/any bits static void setEmpty(final WritableMemory wmem) { int flags = wmem.getByte(FLAGS_BYTE) & 0XFF; flags |= EMPTY_FLAG_MASK; diff --git a/src/main/java/com/yahoo/sketches/theta/SetOperation.java b/src/main/java/com/yahoo/sketches/theta/SetOperation.java index 33551ed0b..82d7109d7 100644 --- a/src/main/java/com/yahoo/sketches/theta/SetOperation.java +++ b/src/main/java/com/yahoo/sketches/theta/SetOperation.java @@ -12,6 +12,7 @@ import static com.yahoo.sketches.Util.ceilingPowerOf2; import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import static java.lang.Math.max; import com.yahoo.memory.Memory; @@ -223,29 +224,34 @@ static short computeSeedHash(final long seed) { //used only by the set operations static final CompactSketch createCompactSketch(final long[] compactCache, final boolean empty, - final short seedHash, final int curCount, final long thetaLong, final boolean dstOrdered, + final short seedHash, int curCount, long thetaLong, final boolean dstOrdered, final WritableMemory 
dstMem) { + if (empty) { + curCount = 0; + thetaLong = Long.MAX_VALUE; + } + checkEmptyState(empty, curCount, thetaLong); CompactSketch sketchOut = null; final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 1 : 0); switch (sw) { case 0: { //dst not ordered, dstMem == null sketchOut = HeapCompactUnorderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong); + thetaLong); //converts to SingleItem if curCount == 1 break; } case 1: { //dst not ordered, dstMem == valid sketchOut = DirectCompactUnorderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong, dstMem); + thetaLong, dstMem); //converts to SingleItem format if curCount == 1 break; } case 2: { //dst ordered, dstMem == null sketchOut = HeapCompactOrderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong); + thetaLong); //converts to SingleItem format if curCount == 1 break; } case 3: { //dst ordered, dstMem == valid sketchOut = DirectCompactOrderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong, dstMem); + thetaLong, dstMem); //converts to SingleItem format if curCount == 1 break; } //default: //This cannot happen and cannot be tested diff --git a/src/main/java/com/yahoo/sketches/theta/Sketch.java b/src/main/java/com/yahoo/sketches/theta/Sketch.java index e03b6bb33..23b96692b 100644 --- a/src/main/java/com/yahoo/sketches/theta/Sketch.java +++ b/src/main/java/com/yahoo/sketches/theta/Sketch.java @@ -24,6 +24,7 @@ import com.yahoo.sketches.BinomialBoundsN; import com.yahoo.sketches.Family; import com.yahoo.sketches.SketchesArgumentException; +import com.yahoo.sketches.SketchesStateException; import com.yahoo.sketches.Util; /** @@ -557,6 +558,35 @@ static final void checkSketchAndMemoryFlags(final Sketch sketch) { } } + /** + * Checks for an illegal state of the empty flag. The truth table is as follows: + *
+   *  Empty CurCount Theta State    Comments
+   *    T      0       1.0   OK     The Normal Empty State
+   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
+   *                                  but should be stored as a Normal Empty State.
+   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
+   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
+   *    F      0       1.0   Error  This conflicts with the normal empty state
+   *    F      0      <1.0   OK     This can result from set operations
+   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
+   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
+   * 
+ * + * @param empty the state of the empty flag + * @param curCount the current number of retained entries + * @param thetaLong the value of theta as a long + */ + static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { + final boolean thLT1 = thetaLong < Long.MAX_VALUE; + final boolean zeroCount = curCount == 0; + final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); + if (error) { + throw new SketchesStateException("Improper Empty State: Empty: " + empty + + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); + } + } + static final double estimate(final long thetaLong, final int curCount, final boolean empty) { if (estMode(thetaLong, empty)) { final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE; diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 939936745..47af3b9dd 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -48,7 +48,8 @@ final class UnionImpl extends Union { */ private final UpdateSketch gadget_; private final short seedHash_; //eliminates having to compute the seedHash on every update. 
- private long unionThetaLong_; //when on-heap, this is the only copy + private long unionThetaLong_ = Long.MAX_VALUE; //when on-heap, this is the only copy + private boolean unionEmpty_ = true; private UnionImpl(final UpdateSketch gadget, final long seed) { gadget_ = gadget; @@ -114,6 +115,7 @@ static UnionImpl heapifyInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = HeapQuickSelectSketch.heapifyInstance(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -130,6 +132,7 @@ static UnionImpl fastWrap(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.fastReadOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -146,6 +149,7 @@ static UnionImpl fastWrap(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.fastWritableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -162,6 +166,7 @@ static UnionImpl wrapInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.readOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -178,6 +183,7 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.writableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); 
unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -205,7 +211,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds //Compact the cache final long[] compactCacheOut = compactCache(gadgetCacheCopy, curCountOut, minThetaLong, dstOrdered); - final boolean empty = gadget_.isEmpty(); + final boolean empty = gadget_.isEmpty() && unionEmpty_; return createCompactSketch( compactCacheOut, empty, seedHash_, curCountOut, minThetaLong, dstOrdered, dstMem); } @@ -219,13 +225,16 @@ public CompactSketch getResult() { public void reset() { gadget_.reset(); unionThetaLong_ = gadget_.getThetaLong(); + unionEmpty_ = true; } @Override public byte[] toByteArray() { final byte[] gadgetByteArr = gadget_.toByteArray(); - final WritableMemory mem = WritableMemory.wrap(gadgetByteArr); - mem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta + final WritableMemory wmem = WritableMemory.wrap(gadgetByteArr); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta + final boolean empty = gadget_.isEmpty() && unionEmpty_; + if (!empty) { PreambleUtil.clearEmpty(wmem); } return gadgetByteArr; } @@ -237,12 +246,13 @@ public boolean isSameResource(final Memory that) { @Override public void update(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 - //UNION Empty Rule: AND the empty states. This does not require separate treatment. + //UNION Empty Rule: AND the empty states. - if (sketchIn == null) { - //null is interpreted as (Theta = 1.0, count = 0, empty = T). Nothing changes + if ((sketchIn == null) || sketchIn.isEmpty()) { + //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). 
Nothing changes return; } + //sketchIn is valid and not empty Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); Sketch.checkSketchAndMemoryFlags(sketchIn); @@ -250,41 +260,47 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin final long thetaLongIn = sketchIn.getThetaLong(); unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule with incoming final int curCountIn = sketchIn.getRetainedEntries(true); - - if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop - //Ordered, thus compact - if (sketchIn.isDirect()) { - final Memory skMem = ((CompactSketch) sketchIn).getMemory(); - final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; - for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preambleLongs + i) << 3; - final long hashIn = skMem.getLong(offsetBytes); - if (hashIn >= unionThetaLong_) { break; } // "early stop" - gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + if (curCountIn > 0) { + if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop + //Ordered, thus compact + if (sketchIn.isDirect()) { + final Memory skMem = ((CompactSketch) sketchIn).getMemory(); + final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; + for (int i = 0; i < curCountIn; i++ ) { + final int offsetBytes = (preambleLongs + i) << 3; + final long hashIn = skMem.getLong(offsetBytes); + if (hashIn >= unionThetaLong_) { break; } // "early stop" + gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + } } - } - else { //sketchIn is on the Java Heap or has array - final long[] cacheIn = sketchIn.getCache(); //not a copy! - for (int i = 0; i < curCountIn; i++ ) { + else { //sketchIn is on the Java Heap or has array + final long[] cacheIn = sketchIn.getCache(); //not a copy! 
+ for (int i = 0; i < curCountIn; i++ ) { + final long hashIn = cacheIn[i]; + if (hashIn >= unionThetaLong_) { break; } // "early stop" + gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + } + } + } //End ordered, compact + else { //either not-ordered compact or Hash Table form. A HT may have dirty values. + final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy + final int arrLongs = cacheIn.length; + for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { final long hashIn = cacheIn[i]; - if (hashIn >= unionThetaLong_) { break; } // "early stop" + if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + c++; //insures against invalid state inside the incoming sketch } } - } //End ordered, compact - else { //either not-ordered compact or Hash Table form. A HT may have dirty values. - final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy - final int arrLongs = cacheIn.length; - for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { - final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values - gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed - c++; //insures against invalid state inside the incoming sketch - } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule with gadget + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { 
PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -295,7 +311,7 @@ public void update(final Memory skMem) { final int cap = (int)skMem.getCapacity(); final int fam = skMem.getByte(FAMILY_BYTE); final int serVer = skMem.getByte(SER_VER_BYTE); - if (serVer == 1) { //older SetSketch, which is compact and ordered + if (serVer == 1) { //very old SetSketch, which is compact and ordered if (fam != 3) { //the original SetSketch throw new SketchesArgumentException( "Family must be old SET_SKETCH: " + Family.idToFamily(fam)); @@ -383,7 +399,7 @@ long getThetaLong() { @Override boolean isEmpty() { - return gadget_.isEmpty(); + return gadget_.isEmpty() && unionEmpty_; } //no seedHash, assumes given seed is correct. No p, no empty flag, no concept of direct @@ -399,9 +415,14 @@ private void processVer1(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -428,9 +449,14 @@ private void processVer2(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + 
unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -460,7 +486,7 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro assert curCount > 0; thetaLongIn = skMem.getLong(THETA_LONG); } - unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule + unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //theta rule final boolean ordered = (skMem.getByte(FLAGS_BYTE) & ORDERED_FLAG_MASK) != 0; if (ordered) { //must be compact for (int i = 0; i < curCount; i++ ) { @@ -480,9 +506,14 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //sync thetaLongs + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } diff --git a/src/test/java/com/yahoo/sketches/theta/EmptyTest.java b/src/test/java/com/yahoo/sketches/theta/EmptyTest.java index 43aec65bd..60b942bfb 100644 --- a/src/test/java/com/yahoo/sketches/theta/EmptyTest.java +++ b/src/test/java/com/yahoo/sketches/theta/EmptyTest.java @@ -8,9 +8,7 @@ /** - * Empty essentially means that 
the sketch has never seen data. But just because it has never - * seen data does not mean it would not impact a union operation. This would occur if P is - * set < 1.0. + * Empty essentially means that the sketch has never seen data. * * @author Lee Rhodes */ diff --git a/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java b/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java index 3ad2de4c6..1e675eafe 100644 --- a/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java +++ b/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java @@ -134,7 +134,7 @@ public void checkSerVer2_8Bytes() { } @Test - public void checkSerVer2_24Bytes_1Value() { + public void checkSerVer2_24Bytes_0Values() { byte[] byteArray = new byte[24]; WritableMemory mem = WritableMemory.wrap(byteArray); mem.putByte(0, (byte) 2); //mdLongs, RF (RR) = 0 @@ -151,7 +151,7 @@ public void checkSerVer2_24Bytes_1Value() { Memory srcMem = Memory.wrap(byteArray); Sketch sketch = Sketch.heapify(srcMem); - assertEquals(sketch.isEmpty(), false); + assertEquals(sketch.isEmpty(), true); //was forced true assertEquals(sketch.isEstimationMode(), false); assertEquals(sketch.isDirect(), false); assertEquals(sketch.isCompact(), true); @@ -161,7 +161,7 @@ public void checkSerVer2_24Bytes_1Value() { } @Test - public void checkSerVer2_32Bytes_1Value() { + public void checkSerVer2_32Bytes_0Values() { byte[] byteArray = new byte[32]; WritableMemory mem = WritableMemory.wrap(byteArray); mem.putByte(0, (byte) 3); //mdLongs, RF (RR) = 0 @@ -178,7 +178,7 @@ public void checkSerVer2_32Bytes_1Value() { Memory srcMem = Memory.wrap(byteArray); Sketch sketch = Sketch.heapify(srcMem); - assertEquals(sketch.isEmpty(), false); + assertEquals(sketch.isEmpty(), true); //forced true assertEquals(sketch.isEstimationMode(), false); assertEquals(sketch.isDirect(), false); assertEquals(sketch.isCompact(), true); diff --git a/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java 
b/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java index b364a43fa..f365bfc75 100644 --- a/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java +++ b/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java @@ -257,20 +257,17 @@ public void checkAnotBnotC() { boolean ordered = true; UpdateSketch aU = UpdateSketch.builder().setNominalEntries(k).build(); - for (int i=0; i k, true); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), true); + + csk = generate(State.CNT0_THLT1, k); + assertEquals(csk.isEmpty(), false); + assertEquals(csk.isEstimationMode(), true); + assertEquals(csk.getRetainedEntries(), 0); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), true); + + csk = generate(State.EST_HEAP_UNORDERED, k); + assertEquals(csk.isEmpty(), false); + assertEquals(csk.isEstimationMode(), true); + assertEquals(csk.getRetainedEntries() > k, true); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), false); + } + + enum State {NULL, EMPTY, EXACT, EST_HEAP, CNT0_THLT1, EST_HEAP_UNORDERED} private static CompactSketch generate(State state, int k) { UpdateSketch sk = null; @@ -279,32 +378,32 @@ private static CompactSketch generate(State state, int k) { } case EXACT : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < k; i++) sk.update(i); + for (int i = 0; i < k; i++) { + sk.update(i); + } csk = sk.compact(true, null); break; } case EST_HEAP : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4*k; i++) sk.update(i); + for (int i = 0; i < (4 * k); i++) { + sk.update(i); + } csk = sk.compact(true, null); break; } - case EST_DIR : { - sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4 * k; i++) sk.update(i); - int 
bytes = Sketch.getMaxCompactSketchBytes(sk.getRetainedEntries(true)); - byte[] byteArr = new byte[bytes]; - WritableMemory mem = WritableMemory.wrap(byteArr); - csk = sk.compact(true, mem); - break; - } - case EMPTY_THLT0 : { - csk = Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build().compact(true, null); + case CNT0_THLT1 : { + sk = Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build(); + sk.update(7); + assert(sk.getRetainedEntries() == 0); + csk = sk.compact(true, null); break; } case EST_HEAP_UNORDERED : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4 * k; i++) sk.update(i); + for (int i = 0; i < (4 * k); i++) { + sk.update(i); + } int bytes = Sketch.getMaxCompactSketchBytes(sk.getRetainedEntries(true)); byte[] byteArr = new byte[bytes]; WritableMemory mem = WritableMemory.wrap(byteArr); diff --git a/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java b/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java index 0dea5b527..a3d239736 100644 --- a/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java +++ b/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java @@ -9,8 +9,6 @@ import org.testng.annotations.Test; -import com.yahoo.sketches.SketchesArgumentException; - public class PairwiseSetOperationsTest { // Intersection @@ -70,9 +68,9 @@ public void checkIntersectionEarlyStop() { for (int t = 0; t < trials; t++) { for (int i=0; i Date: Thu, 28 Mar 2019 12:53:50 -0700 Subject: [PATCH 02/16] add empty line --- src/main/java/com/yahoo/sketches/theta/UnionImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 47af3b9dd..ce3314c1a 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -37,6 +37,7 @@ * @author Kevin Lang 
*/ final class UnionImpl extends Union { + /** * Although the gadget object is initially an UpdateSketch, in the context of a Union it is used * as a specialized buffer that happens to leverage much of the machinery of an UpdateSketch. From 7796a01e0ad5bd0b635937c4b5dfaae683f5b0c9 Mon Sep 17 00:00:00 2001 From: lrhodes Date: Thu, 28 Mar 2019 12:59:34 -0700 Subject: [PATCH 03/16] tiny edits --- .../java/com/yahoo/sketches/theta/PairwiseSetOperations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index 7bc5bc480..f4c7c4a5f 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -139,7 +139,7 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s } //Both sketches are valid with matching seedhashes and ordered - //Full Union operation + //Full Union operation: long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); From d327234ab665606c9ce2aef8ab745af658d133fc Mon Sep 17 00:00:00 2001 From: lrhodes Date: Thu, 28 Mar 2019 18:33:51 -0700 Subject: [PATCH 04/16] Removed checkEmptyState --- .../theta/DirectCompactOrderedSketch.java | 4 +- .../sketches/theta/DirectCompactSketch.java | 1 - .../theta/DirectCompactUnorderedSketch.java | 1 + .../sketches/theta/ForwardCompatibility.java | 3 - .../theta/HeapCompactOrderedSketch.java | 2 +- .../sketches/theta/HeapCompactSketch.java | 1 - .../theta/HeapCompactUnorderedSketch.java | 2 +- .../sketches/theta/PairwiseSetOperations.java | 11 +--- .../yahoo/sketches/theta/SetOperation.java | 5 +- .../java/com/yahoo/sketches/theta/Sketch.java | 63 ++++++++++--------- 10 files changed, 43 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java index a59412534..f710c8658 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java @@ -52,11 +52,11 @@ static DirectCompactOrderedSketch wrapInstance(final Memory srcMem, final long s * @param dstMem the given destination Memory. This clears it before use. * @return a DirectCompactOrderedSketch. 
*/ - static DirectCompactOrderedSketch compact(final UpdateSketch sketch, - final WritableMemory dstMem) { + static DirectCompactOrderedSketch compact(final UpdateSketch sketch, final WritableMemory dstMem) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); + //checkEmptyState(empty, curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java index b270f23f3..f8a6394b3 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java @@ -22,7 +22,6 @@ abstract class DirectCompactSketch extends CompactSketch { DirectCompactSketch(final Memory mem) { mem_ = mem; - checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java index 2eb3a7801..09c55205c 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java @@ -56,6 +56,7 @@ static DirectCompactUnorderedSketch compact(final UpdateSketch sketch, final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); + //checkEmptyState(empty, curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java 
b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java index 9e681b0c0..dc960ac6e 100644 --- a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java +++ b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java @@ -11,7 +11,6 @@ import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import com.yahoo.memory.Memory; import com.yahoo.sketches.SketchesArgumentException; @@ -60,7 +59,6 @@ static final CompactSketch heapify1to3(final Memory srcMem, final long seed) { final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(24, compactOrderedCache, 0, curCount); - checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, false, seedHash, curCount, thetaLong); } @@ -98,7 +96,6 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) { empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); //force true final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(mdLongs << 3, compactOrderedCache, 0, curCount); - checkEmptyState(empty, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, empty, seedHash, curCount, thetaLong); } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java index 031c13398..0685e8223 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java @@ -80,7 +80,7 @@ static CompactSketch compact(final UpdateSketch sketch) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - + 
//checkEmptyState(empty, curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = true; diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java index 29543bc85..238cfb4da 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java @@ -44,7 +44,6 @@ abstract class HeapCompactSketch extends CompactSketch { thetaLong_ = empty ? Long.MAX_VALUE : thetaLong; cache_ = cache; preLongs_ = computeCompactPreLongs(thetaLong, empty, curCount); - checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java index fa1ee54dc..449e93d6d 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java @@ -80,7 +80,7 @@ static CompactSketch compact(final UpdateSketch sketch) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - + //checkEmptyState(empty, curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = false; diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index f4c7c4a5f..bab3cc9e5 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -5,7 +5,7 @@ package com.yahoo.sketches.theta; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; +import static 
com.yahoo.sketches.theta.SetOperation.createCompactSketch; import java.util.Arrays; @@ -202,9 +202,7 @@ else if (hashA < hashB) { } else { outArr = Arrays.copyOf(outCache, curCount); //copy only valid items } - checkEmptyState(false, curCount, thetaLong); - return HeapCompactOrderedSketch - .compact(outArr, false, skA.getSeedHash(), curCount, thetaLong); + return createCompactSketch(outArr, false, skA.getSeedHash(), curCount, thetaLong, true, null); } private static CompactSketch maybeCutback(final CompactSketch csk, final int k) { @@ -216,11 +214,8 @@ private static CompactSketch maybeCutback(final CompactSketch csk, final int k) thetaLong = cache[k]; final long[] arr = Arrays.copyOf(cache, k); curCount = k; - checkEmptyState(empty, curCount, thetaLong); - return HeapCompactOrderedSketch - .compact(arr, empty, csk.getSeedHash(), curCount, thetaLong); + return createCompactSketch(arr, empty, csk.getSeedHash(), curCount, thetaLong, true, null); } - checkEmptyState(empty, curCount, thetaLong); return csk; } diff --git a/src/main/java/com/yahoo/sketches/theta/SetOperation.java b/src/main/java/com/yahoo/sketches/theta/SetOperation.java index 82d7109d7..6d4a7b82c 100644 --- a/src/main/java/com/yahoo/sketches/theta/SetOperation.java +++ b/src/main/java/com/yahoo/sketches/theta/SetOperation.java @@ -12,7 +12,6 @@ import static com.yahoo.sketches.Util.ceilingPowerOf2; import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import static java.lang.Math.max; import com.yahoo.memory.Memory; @@ -230,7 +229,7 @@ static final CompactSketch createCompactSketch(final long[] compactCache, final curCount = 0; thetaLong = Long.MAX_VALUE; } - checkEmptyState(empty, curCount, thetaLong); + //checkEmptyState(empty, curCount, thetaLong); CompactSketch sketchOut = null; final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 
1 : 0); switch (sw) { @@ -259,8 +258,6 @@ static final CompactSketch createCompactSketch(final long[] compactCache, final return sketchOut; } - - /** * Computes minimum lgArrLongs from a current count. * @param count the given current count diff --git a/src/main/java/com/yahoo/sketches/theta/Sketch.java b/src/main/java/com/yahoo/sketches/theta/Sketch.java index 23b96692b..d440a7de0 100644 --- a/src/main/java/com/yahoo/sketches/theta/Sketch.java +++ b/src/main/java/com/yahoo/sketches/theta/Sketch.java @@ -24,7 +24,6 @@ import com.yahoo.sketches.BinomialBoundsN; import com.yahoo.sketches.Family; import com.yahoo.sketches.SketchesArgumentException; -import com.yahoo.sketches.SketchesStateException; import com.yahoo.sketches.Util; /** @@ -558,34 +557,40 @@ static final void checkSketchAndMemoryFlags(final Sketch sketch) { } } - /** - * Checks for an illegal state of the empty flag. The truth table is as follows: - *
-   *  Empty CurCount Theta State    Comments
-   *    T      0       1.0   OK     The Normal Empty State
-   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
-   *                                  but should stored as a Normal Empty State.
-   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
-   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
-   *    F      0       1.0   Error  This conflicts with the normal empty state
-   *    F      0      <1.0   OK     This can result from set operations
-   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
-   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
-   * 
- * - * @param empty the state of the empty flag - * @param curCount the current number of retained entries - * @param thetaLong the value of theta as a long - */ - static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { - final boolean thLT1 = thetaLong < Long.MAX_VALUE; - final boolean zeroCount = curCount == 0; - final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); - if (error) { - throw new SketchesStateException("Improper Empty State: Empty: " + empty - + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); - } - } + // /** + // * Checks for an illegal state of the empty flag. The truth table is as follows: + // *
+  //   *  Empty CurCount Theta State    Comments
+  //   *    T      0       1.0   OK     The Normal Empty State
+  //   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
+  //   *                                  but should be stored as a Normal Empty State.
+  //   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
+  //   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
+  //   *    F      0       1.0   Error  This conflicts with the normal empty state
+  //   *    F      0      <1.0   OK     This can result from set operations
+  //   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
+  //   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
+  //   * 
+ // * + // * @param empty the state of the empty flag + // * @param curCount the current number of retained entries + // * @param thetaLong the value of theta as a long + // */ + // static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { + // final boolean thLT1 = thetaLong < Long.MAX_VALUE; + // final boolean zeroCount = curCount == 0; + // final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); + // if (error) { + // throw new SketchesStateException("Improper Empty State: Empty: " + empty + // + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); + // } + // } + // + // static final boolean fixEmpty(final boolean empty, final int curCount, final long thetaLong) { + // if (curCount > 0) { return false; } + // if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { return true; } + // return empty; + // } static final double estimate(final long thetaLong, final int curCount, final boolean empty) { if (estMode(thetaLong, empty)) { From 76483669065c89e8524bc756468c7f738fee31ab Mon Sep 17 00:00:00 2001 From: lrhodes Date: Fri, 29 Mar 2019 11:35:17 -0700 Subject: [PATCH 05/16] clean up pairwise logic --- .../sketches/theta/PairwiseSetOperations.java | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index bab3cc9e5..c91cf192d 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -88,51 +88,46 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) { //Handle all corner cases with null or empty arguments //For backward compatibility, we must allow input empties with Theta < 1.0. 
- final int swA = (skA == null) ? 1 : skA.isEmpty() ? 2 : 3; - final int swB = (skB == null) ? 1 : skB.isEmpty() ? 2 : 3; + final int swA, swB; + if (skA == null) { swA = 1; } else { checkOrdered(skA); swA = skA.isEmpty() ? 2 : 3; } + if (skB == null) { swB = 1; } else { checkOrdered(skB); swB = skB.isEmpty() ? 2 : 3; } final int sw = (swA << 2) | swB; switch (sw) { case 5: { //skA == null; skB == null; return null. Cannot determine seedhash. return null; } case 6: { //skA == null; skB == empty; return empty - checkOrdered(skB); - return (skB.getThetaLong() == Long.MAX_VALUE) ? skB : + return (skB.getThetaLong() == Long.MAX_VALUE) ? skB : //lgtm [java/dereferenced-value-may-be-null] HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); } case 7: { //skA == null; skB == valid; return skB - checkOrdered(skB); return maybeCutback(skB, k); } case 9: { //skA == empty; skB == null; return empty - checkOrdered(skA); - return (skA.getThetaLong() == Long.MAX_VALUE) ? skA : + return (skA.getThetaLong() == Long.MAX_VALUE) ? 
skA : //lgtm [java/dereferenced-value-may-be-null] HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); } case 10: { //skA == empty; skB == empty; return empty - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (skA.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skA); return skA; } - if (skB.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skB); return skB; } + final short seedHash = seedHashesCheck(skA, skB); + if (skA.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] + { return skA; } + if (skB.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] + { return skB; } return HeapCompactOrderedSketch.compact(new long[0], true, seedHash, 0, Long.MAX_VALUE); } case 11: { //skA == empty; skB == valid; return skB - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skB); + seedHashesCheck(skA, skB); return maybeCutback(skB, k); } case 13: { //skA == valid; skB == null; return skA - checkOrdered(skA); return maybeCutback(skA, k); } case 14: { //skA == valid; skB == empty; return skA - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skA); + seedHashesCheck(skA, skB); return maybeCutback(skA, k); } case 15: { //skA == valid; skB == valid; perform full union - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skA); - checkOrdered(skB); + seedHashesCheck(skA, skB); break; } //default: cannot happen @@ -225,4 +220,10 @@ private static void checkOrdered(final CompactSketch csk) { } } + private static short seedHashesCheck(final Sketch skA, final Sketch skB) { + final short seedHashA = skA.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] + final short seedHashB = skB.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] + return Util.checkSeedHashes(seedHashA, seedHashB); + } + } From 2b8e741f891af66dd2d7851f7991caa2d2b22e4b Mon Sep 17 00:00:00 2001 From: 
lrhodes Date: Thu, 28 Mar 2019 12:00:26 -0700 Subject: [PATCH 06/16] Updated Intersection and AnotB so far. --- .../java/com/yahoo/sketches/theta/AnotB.java | 41 +- .../sketches/theta/DirectCompactSketch.java | 1 + .../sketches/theta/ForwardCompatibility.java | 9 +- .../com/yahoo/sketches/theta/HeapAnotB.java | 235 ++++++------ .../sketches/theta/HeapCompactSketch.java | 5 +- .../yahoo/sketches/theta/Intersection.java | 25 ++ .../sketches/theta/IntersectionImpl.java | 144 +++++--- .../sketches/theta/IntersectionImplR.java | 38 +- .../sketches/theta/PairwiseSetOperations.java | 349 ++++++------------ .../yahoo/sketches/theta/PreambleUtil.java | 85 ++++- .../yahoo/sketches/theta/SetOperation.java | 16 +- .../java/com/yahoo/sketches/theta/Sketch.java | 30 ++ .../com/yahoo/sketches/theta/UnionImpl.java | 119 +++--- .../com/yahoo/sketches/theta/EmptyTest.java | 4 +- .../theta/ForwardCompatibilityTest.java | 8 +- .../yahoo/sketches/theta/HeapAnotBTest.java | 22 +- .../sketches/theta/HeapIntersectionTest.java | 9 + .../theta/PairwiseCornerCasesTest.java | 193 +++++++--- .../theta/PairwiseSetOperationsTest.java | 26 +- 19 files changed, 785 insertions(+), 574 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/AnotB.java b/src/main/java/com/yahoo/sketches/theta/AnotB.java index 73f5196c1..f4e2d9282 100644 --- a/src/main/java/com/yahoo/sketches/theta/AnotB.java +++ b/src/main/java/com/yahoo/sketches/theta/AnotB.java @@ -21,6 +21,8 @@ *

Calling the update function a second time essentially clears the internal state and updates * with the new pair of sketches. * + *

As an alternative, one can use the aNotB method that returns the result immediately. + * * @author Lee Rhodes */ public abstract class AnotB extends SetOperation { @@ -30,10 +32,16 @@ public Family getFamily() { return Family.A_NOT_B; } + /** + * Gets the result of this operation as an ordered CompactSketch on the Java heap + * @return the result of this operation as an ordered CompactSketch on the Java heap + */ + public abstract CompactSketch getResult(); + /** * Gets the result of this set operation as a CompactSketch of the chosen form * @param dstOrdered - * See Destination Ordered + * See Destination Ordered. * * @param dstMem * See Destination Memory. @@ -42,12 +50,6 @@ public Family getFamily() { */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); - /** - * Gets the result of this operation as an ordered CompactSketch on the Java heap - * @return the result of this operation as an ordered CompactSketch on the Java heap - */ - public abstract CompactSketch getResult(); - /** * Perform A-and-not-B set operation on the two given sketches. * A null sketch is interpreted as an empty sketch. @@ -57,4 +59,29 @@ public Family getFamily() { */ public abstract void update(Sketch a, Sketch b); + /** + * Perform A-and-not-B set operation on the two given sketches and return the result as an + * ordered CompactSketch on the heap. + * @param a The incoming sketch for the first argument + * @param b The incoming sketch for the second argument + * @return an ordered CompactSketch on the heap + */ + public CompactSketch aNotB(final Sketch a, final Sketch b) { + return aNotB(a, b, true, null); + } + + /** + * Perform A-and-not-B set operation on the two given sketches and return the result as a + * CompactSketch. + * @param a The incoming sketch for the first argument + * @param b The incoming sketch for the second argument + * @param dstOrdered + * See Destination Ordered. + * @param dstMem + * See Destination Memory. 
+ * @return the result as a CompactSketch. + */ + public abstract CompactSketch aNotB(Sketch a, Sketch b, boolean dstOrdered, + WritableMemory dstMem); + } diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java index f8a6394b3..b270f23f3 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java @@ -22,6 +22,7 @@ abstract class DirectCompactSketch extends CompactSketch { DirectCompactSketch(final Memory mem) { mem_ = mem; + checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java index f2c98359a..9e681b0c0 100644 --- a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java +++ b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java @@ -11,6 +11,7 @@ import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import com.yahoo.memory.Memory; import com.yahoo.sketches.SketchesArgumentException; @@ -59,7 +60,7 @@ static final CompactSketch heapify1to3(final Memory srcMem, final long seed) { final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(24, compactOrderedCache, 0, curCount); - + checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, false, seedHash, curCount, thetaLong); } @@ -93,11 +94,11 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) { validateInputSize(reqBytesIn, memCap); final long thetaLong = (mdLongs < 3) ? 
Long.MAX_VALUE : srcMem.getLong(THETA_LONG); - final boolean empty = (srcMem.getByte(FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; - + boolean empty = (srcMem.getByte(FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; + empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); //force true final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(mdLongs << 3, compactOrderedCache, 0, curCount); - + checkEmptyState(empty, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, empty, seedHash, curCount, thetaLong); } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java b/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java index ba320f1f3..e51789ff0 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapAnotB.java @@ -14,7 +14,6 @@ import com.yahoo.memory.Memory; import com.yahoo.memory.WritableMemory; -import com.yahoo.sketches.HashOperations; import com.yahoo.sketches.Util; /** @@ -28,7 +27,7 @@ final class HeapAnotB extends AnotB { private long thetaLong_; private boolean empty_; private long[] cache_; // no match set - private int curCount_ = 0; + private int curCount_; private int lgArrLongsHT_; //for Hash Table only. may not need to be member after refactoring private long[] bHashTable_; //may not need to be member after refactoring. @@ -39,15 +38,17 @@ final class HeapAnotB extends AnotB { * @param seed See seed */ HeapAnotB(final long seed) { - seedHash_ = Util.computeSeedHash(seed); - a_ = null; - b_ = null; - thetaLong_ = Long.MAX_VALUE; - empty_ = true; - cache_ = null; - curCount_ = 0; - lgArrLongsHT_ = 5; - bHashTable_ = null; + this(Util.computeSeedHash(seed)); + } + + /** + * Construct a new AnotB SetOperation on the java heap. Called by PairwiseSetOperation. + * + * @param seedHash 16 bit hash of the chosen update seed. 
+ */ + HeapAnotB(final short seedHash) { + seedHash_ = seedHash; + reset(); } @Override @@ -63,6 +64,18 @@ public void update(final Sketch a, final Sketch b) { compute(); } + @Override + public CompactSketch aNotB(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + update(a, b); + return getResult(dstOrdered, dstMem); + } + + @Override + public CompactSketch getResult() { + return getResult(true, null); + } + @Override public CompactSketch getResult(final boolean dstOrdered, final WritableMemory dstMem) { final long[] compactCache = (curCount_ <= 0) @@ -72,17 +85,13 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds Arrays.sort(compactCache); } //Create the CompactSketch + final boolean empty = (curCount_ == 0) && (thetaLong_ == Long.MAX_VALUE); final CompactSketch comp = createCompactSketch( - compactCache, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); + compactCache, empty, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); reset(); return comp; } - @Override - public CompactSketch getResult() { - return getResult(true, null); - } - @Override int getRetainedEntries(final boolean valid) { return curCount_; @@ -109,149 +118,135 @@ void compute() { // NOTES: // In the table below, A and B refer to the two input sketches in the order A-not-B. - // The Theta rule: min( ThetaA, ThetaB) - // The Empty rule: Whatever A is: E(a) + // The Theta rule: min(ThetaA, ThetaB) + // The Empty rule: Whatever the empty state of A is: E(A) // The Return triple is defined as: (Theta, Count, EmptyFlag). // bHashTable temporarily stores the values of B. - // A sketch in stored form can be in one of 5 states + // A sketch in stored form can be in one of 5 states. // Null is not actually a state, but is included for completeness. // Null is interpreted as {Theta = 1.0, count = 0, empty = true}. - // The empty state may have Theta < 1.0, but count must be zero. 
+ // The empty state may have Theta < 1.0 but it is ignored; count must be zero. // State: - // 0 N null + // 0 N Null // 1 E Empty // 2 C Compact, not ordered // 3 O Compact Ordered // 4 H Hash-Table // - //A B swA swB Case Action + //A B swA swB Case Actions //N N 0 0 0 Return (1.0, 0, T) - //N E 0 1 1 Return B: (ThB, 0, T) - //N C 0 2 2 Return (ThB, 0, T) - //N O 0 3 3 Return (ThB, 0, T) - //N H 0 4 4 Return (ThB, 0, T) - //E N 1 0 8 Return A: (ThA, 0, T) - //E E 1 1 9 Return (minT, 0, T) - //E C 1 2 10 Return (minT, 0, T) - //E O 1 3 11 Return (minT, 0, T) - //E H 1 4 12 Return (minT, 0, T) - //C N 2 0 16 Return A: (ThA, |A|, E(a)) - //C E 2 1 17 Return (minT, |A| < minT, E(a)) - //C C 2 2 18 B -> H; => C,H - //C O 2 3 19 B -> H; => C,H - //C H 2 4 20 scan all A, search B, on nomatch -> list (same as HH) - //O N 3 0 24 Return A: (ThA, |A|, E(a)) - //O E 3 1 25 Return (minT, |A| < minT, E(a)) - //O C 3 2 26 B -> H; => O,H - //O O 3 3 27 B -> H; => O,H - //O H 3 4 28 scan A early stop, search B, on nomatch -> list - //H N 4 0 32 Return A: (ThA, |A|, E(a)) - //H E 4 1 33 Return (minT, |A|< minT, E(a)) - //H C 4 2 34 B -> H; => H,H - //H O 4 3 35 B -> H; => H,H - //H H 4 4 36 scan all A, search B, on nomatch -> list + //N E 0 1 1 CheckB, Return (1.0, 0, T) + //N C 0 2 2 CheckB, Return (1.0, 0, T) + //N O 0 3 3 CheckB, Return (1.0, 0, T) + //N H 0 4 4 CheckB, Return (1.0, 0, T) + //E N 1 0 8 CheckA, Return (1.0, 0, T) + //E E 1 1 9 CheckAB, Return (1.0, 0, T) + //E C 1 2 10 CheckAB, Return (1.0, 0, T) + //E O 1 3 11 CheckAB, Return (1.0, 0, T) + //E H 1 4 12 CheckAB, Return (1.0, 0, T) + //C N 2 0 16 CheckA, Return (ThA, |A|, F), copyA + //C E 2 1 17 CheckAB, Return (ThA, |A|, F), copyA + //C C 2 2 18 CheckAB, B -> H; => C,H; scanAllAsearchB() + //C O 2 3 19 CheckAB, B -> H; => C,H; scanAllAsearchB() + //C H 2 4 20 CheckAB, scanAllAsearchB() + //O N 3 0 24 CheckA, Return (ThA, |A|, F), copyA + //O E 3 1 25 CheckAB, Return (ThA, |A|, F), copyA + //O C 3 2 26 
CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + //O O 3 3 27 CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + //O H 3 4 28 CheckAB, scanEarlyStopAsearchB() + //H N 4 0 32 CheckA, Return (ThA, |A|, F), copyA + //H E 4 1 33 CheckAB, Return (ThA, |A|, F), copyA + //H C 4 2 34 CheckAB, B -> H; => H,H; scanAllAsearchB() + //H O 4 3 35 CheckAB, B -> H; => H,H; scanAllAsearchB() + //H H 4 4 36 CheckAB, scanAllAsearchB() switch (sw) { - case 0 : { //A and B are null. + case 0 : //A Null, B Null; Return (1.0, 0, T) thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{1.0, 0, T} - } - case 1: - case 2: - case 3: - case 4: { //A is null, B is valid + break; + + case 10: //A Empty, B Compact; CheckAB, Return (1.0, 0, T) + case 11: //A Empty, B Ordered; CheckAB, Return (1.0, 0, T) + case 12: //A Empty, B HashTbl; CheckAB, Return (1.0, 0, T) + Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] + //$FALL-THROUGH$ + case 1: //A Null, B Empty; CheckB, Return (1.0, 0, T) + case 2: //A Null, B Compact; CheckB, Return (1.0, 0, T) + case 3: //A Null, B Ordered; CheckB, Return (1.0, 0, T) + case 4: //A Null, B HashTbl; CheckB, Return (1.0, 0, T) Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = b_.getThetaLong(); + thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{ThB, 0, T} - } - case 8: { //A is empty, B is null + break; + + case 9: //A Empty, B Empty; CheckAB, Return (1.0, 0, T) + Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] + //$FALL-THROUGH$ + case 8: //A Empty, B Null; CheckA, Return (1.0, 0, T) Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = a_.getThetaLong(); + thetaLong_ = Long.MAX_VALUE; empty_ = true; - break; //{ThA, 0, T} - } - case 9: - case 10: - case 11: - case 12: { //A empty, B valid - Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm 
[java/dereferenced-value-may-be-null] + break; + + case 17: //A Compact, B Empty; CheckAB, Return (ThA, |A|, F), copyA + case 25: //A Ordered, B Empty; CheckAB, Return (ThA, |A|, F), copyA + case 33: //A HashTbl, B Empty; CheckAB, Return (ThA, |A|, F), copyA Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = true; - break; //{minT, 0, T} - } - case 16: - case 24: - case 32: { //A valid, B null + //$FALL-THROUGH$ + case 16: //A Compact, B Null; CheckA, Return (ThA, |A|, F), copyA + case 24: //A Ordered, B Null; CheckA, Return (ThA, |A|, F), copyA + case 32: //A HashTbl, B Null; CheckA, Return (ThA, |A|, F), copyA Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = a_.getThetaLong(); - empty_ = a_.isEmpty(); - //move A to cache + empty_ = false; curCount_ = a_.getRetainedEntries(true); cache_ = compactCache(a_.getCache(), curCount_, thetaLong_, false); - break; //{ThA, |A|, E(a)} - } - case 17: - case 25: - case 33: { //A valid, B empty - Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] - thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //move A < theta to cache - final long[] cache = a_.getCache(); - curCount_ = HashOperations.count(cache, thetaLong_); - cache_ = compactCache(cache, curCount_, thetaLong_, false); - break; //{minT, |A| < minT , E(a)} - } - case 18: - case 19: - case 34: - case 35: { //A compact or HT, B compact or ordered + break; + + case 18: //A Compact, B Compact; CheckAB, B -> H; => C,H; scanAllAsearchB() + case 19: //A Compact, B Ordered; CheckAB, B -> H; => C,H; scanAllAsearchB() + case 34: //A HashTbl, B Compact; CheckAB, B -> H; => H,H; scanAllAsearchB() + case 35: //A HashTbl, B Ordered; CheckAB, B -> 
H; => H,H; scanAllAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //must convert B to HT - convertBtoHT(); //builds HT from B - scanAllAsearchB(); //builds cache, curCount from A, HT - break; //{minT, n, E(a)} - } - case 26: - case 27: { //A ordered early stop, B compact or ordered + empty_ = false; + convertBtoHT(); + scanAllAsearchB(); + break; + + case 26: //A Ordered, B Compact; CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() + case 27: //A Ordered, B Ordered; CheckAB, B -> H; => O,H; scanEarlyStopAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - convertBtoHT(); //builds HT from B + empty_ = false; + convertBtoHT(); scanEarlyStopAsearchB(); - break; //{minT, n, E(a)} - } - case 20: - case 36: { //A compact or HT, B is already HT + break; + + case 20: //A Compact, B HashTbl; CheckAB, scanAllAsearchB() + case 36: //A HashTbl, B HashTbl; CheckAB, scanAllAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //b is already HT + empty_ = false; lgArrLongsHT_ = ((UpdateSketch)b_).getLgArrLongs(); - bHashTable_ = b_.getCache(); //safe as bHashTable is read-only - scanAllAsearchB(); //builds cache, curCount from A, HT - break; //{minT, n, E(a)} - } - case 28: { //A ordered early stop, B is already hashtable + bHashTable_ = b_.getCache(); + scanAllAsearchB(); + break; + + case 28: 
//A Ordered, B HashTbl; CheckAB, scanEarlyStopAsearchB() Util.checkSeedHashes(seedHash_, a_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] Util.checkSeedHashes(seedHash_, b_.getSeedHash());//lgtm [java/dereferenced-value-may-be-null] thetaLong_ = min(a_.getThetaLong(), b_.getThetaLong()); - empty_ = a_.isEmpty(); - //b is already HT + empty_ = false; lgArrLongsHT_ = ((UpdateSketch)b_).getLgArrLongs(); - bHashTable_ = b_.getCache(); //safe as bHashTable is read-only + bHashTable_ = b_.getCache(); scanEarlyStopAsearchB(); - break; //{minT, n, E(a)} - } + break; + //default: //This cannot happen and cannot be tested } } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java index 72f60bb61..29543bc85 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java @@ -40,10 +40,11 @@ abstract class HeapCompactSketch extends CompactSketch { final int curCount, final long thetaLong) { empty_ = empty; seedHash_ = seedHash; - curCount_ = curCount; - thetaLong_ = thetaLong; + curCount_ = empty ? 0 : curCount; + thetaLong_ = empty ? Long.MAX_VALUE : thetaLong; cache_ = cache; preLongs_ = computeCompactPreLongs(thetaLong, empty, curCount); + checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/Intersection.java b/src/main/java/com/yahoo/sketches/theta/Intersection.java index 5686bc81d..d4bee3d2c 100644 --- a/src/main/java/com/yahoo/sketches/theta/Intersection.java +++ b/src/main/java/com/yahoo/sketches/theta/Intersection.java @@ -70,4 +70,29 @@ public Family getFamily() { */ public abstract void update(Sketch sketchIn); + /** + * Perform intersect set operation on the two given sketch arguments and return the result as an + * ordered CompactSketch on the heap. 
+ * @param a The first sketch argument + * @param b The second sketch argument + * @return an ordered CompactSketch on the heap + */ + public CompactSketch intersect(final Sketch a, final Sketch b) { + return intersect(a, b, true, null); + } + + /** + * Perform intersect set operation on the two given sketches and return the result as a + * CompactSketch. + * @param a The first sketch argument + * @param b The second sketch argument + * @param dstOrdered + * See Destination Ordered. + * @param dstMem + * See Destination Memory. + * @return the result as a CompactSketch. + */ + public abstract CompactSketch intersect(Sketch a, Sketch b, boolean dstOrdered, + WritableMemory dstMem); + } diff --git a/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java b/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java index 817ff00ee..efa3643f3 100644 --- a/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/IntersectionImpl.java @@ -41,6 +41,10 @@ private IntersectionImpl(final WritableMemory wmem, final long seed, final boole super(wmem, seed, newMem); } + IntersectionImpl(final short seedHash) { + super(seedHash); + } + /** * Construct a new Intersection target on the java heap. * @@ -57,6 +61,7 @@ static IntersectionImpl initNewHeapInstance(final long seed) { return impl; } + /** * Construct a new Intersection target direct to the given destination Memory. * Called by SetOperation.Builder. @@ -163,83 +168,118 @@ static IntersectionImpl wrapInstance(final WritableMemory srcMem, final long see @Override public void update(final Sketch sketchIn) { - final boolean firstCall = curCount_ < 0; - - //Corner cases - if (sketchIn == null) { //null -> Th = 1.0, count = 0, empty = true - //No seedHash to check - //Because of the def of null above and the Empty Rule (which is OR) empty_ must be true. + if (sketchIn != null) { + Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); + } + //Null / Empty cases. 
+ //Note: null == empty := Th = 1.0, count = 0, empty = true + if ((sketchIn == null) || sketchIn.isEmpty() || empty_) { //empty rule + //Because of the def of null above and the Empty Rule (which is OR), empty_ must be true. + //Whatever the current internal state, we make it empty. empty_ = true; - thetaLong_ = firstCall ? Long.MAX_VALUE : thetaLong_; //if Nth call, stays the same + thetaLong_ = Long.MAX_VALUE; curCount_ = 0; + lgArrLongs_ = 0; + maxLgArrLongs_ = 0; + hashTable_ = null; if (mem_ != null) { - PreambleUtil.setEmpty(mem_); + PreambleUtil.setEmpty(mem_); //true insertThetaLong(mem_, thetaLong_); insertCurCount(mem_, 0); + insertLgArrLongs(mem_, lgArrLongs_); } return; } - //Checks - Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); - thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule - empty_ = empty_ || sketchIn.isEmpty(); //Empty rule - + empty_ = false; if (mem_ != null) { insertThetaLong(mem_, thetaLong_); - if (empty_) { PreambleUtil.setEmpty(mem_); } - else { clearEmpty(mem_); } + PreambleUtil.clearEmpty(mem_); //false } final int sketchInEntries = sketchIn.getRetainedEntries(true); - // The truth table for the following state machine for corner cases: - // Case CurCount SketchInEntries | Actions - // 1 <0 0 | CurCount = 0; HT = null; exit + // The truth table for the following state machine + // Case curCount sketchInEntries | Actions + // 1 <0 0 | First update, curCount = 0; HT = null; exit // 2 0 0 | CurCount = 0; HT = null; exit // 3 >0 0 | CurCount = 0; HT = null; exit - // 4 <0 >0 | Clone SketchIn; exit - // 5 0 >0 | CurCount = 0; HT = null; exit - // 6 >0 >0 | Perform full intersect - - if ((curCount_ == 0) || (sketchInEntries == 0)) { //Cases 1,2,3,5 - //All future intersections result in zero data, but theta can still be reduced. - curCount_ = 0; - if (mem_ != null) { insertCurCount(mem_, 0); } - hashTable_ = null; //No need for a HT. 
Don't bother clearing mem if valid - } - else if (firstCall) { //Case 4: Clone the incoming sketch - curCount_ = sketchIn.getRetainedEntries(true); - final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); - final int priorLgArrLongs = lgArrLongs_; //prior only used in error message - lgArrLongs_ = requiredLgArrLongs; - - if (mem_ != null) { //Off heap, check if current dstMem is large enough - insertCurCount(mem_, curCount_); - insertLgArrLongs(mem_, lgArrLongs_); - if (requiredLgArrLongs <= maxLgArrLongs_) { //OK - mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required + // 5 <0 >0 | First update, clone SketchIn; exit + // 6 0 >0 | CurCount = 0; HT = null; exit + // 7 >0 >0 | Perform full intersect + final int sw = ((curCount_ < 0) ? 1 : (curCount_ == 0) ? 2 : 3) + | (((sketchInEntries > 0) ? 1 : 0) << 2) ; + switch (sw) { + case 1: + case 2: + case 3: + case 6: { //(curCount_ == 0) || (sketchInEntries == 0) + //All future intersections result in zero data, but theta can still be reduced. + curCount_ = 0; + if (mem_ != null) { insertCurCount(mem_, 0); } + hashTable_ = null; //No need for a HT. 
Don't bother clearing mem if valid + break; + } + case 5: { // curCount_ < 0; This is the 1st update, clone the incoming sketch + curCount_ = sketchIn.getRetainedEntries(true); + final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); + final int priorLgArrLongs = lgArrLongs_; //prior only used in error message + lgArrLongs_ = requiredLgArrLongs; + + if (mem_ != null) { //Off heap, check if current dstMem is large enough + insertCurCount(mem_, curCount_); + insertLgArrLongs(mem_, lgArrLongs_); + if (requiredLgArrLongs <= maxLgArrLongs_) { //OK + mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required + } + else { //not enough space in dstMem + throw new SketchesArgumentException( + "Insufficient dstMem hash table space: " + + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); + } } - else { //not enough space in dstMem - throw new SketchesArgumentException( - "Insufficient dstMem hash table space: " - + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); + else { //On the heap, allocate a HT + hashTable_ = new long[1 << lgArrLongs_]; } + moveDataToTgt(sketchIn.getCache(), curCount_); + break; } - else { //On the heap, allocate a HT - hashTable_ = new long[1 << lgArrLongs_]; + case 7: { // (curCount > 0) && (sketchInEntries > 0); Perform full intersect + //Sets resulting hashTable, curCount and adjusts lgArrLongs + performIntersect(sketchIn); + break; } - - moveDataToTgt(sketchIn.getCache(), curCount_); + //default: not possible } - else { //Case 6: Perform full intersect - //Sets resulting hashTable, curCount and adjusts lgArrLongs - performIntersect(sketchIn); + } + + @Override + public CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + reset(); + update(a); + update(b); + return getResult(dstOrdered, dstMem); + } + + @Override + public void reset() { + curCount_ = -1; + thetaLong_ = Long.MAX_VALUE; + empty_ = false; + hashTable_ = 
null; + if (mem_ != null) { + insertLgArrLongs(mem_, lgArrLongs_); //make sure + insertCurCount(mem_, -1); + insertThetaLong(mem_, Long.MAX_VALUE); + clearEmpty(mem_); } } - void performIntersect(final Sketch sketchIn) { + //restricted + + private void performIntersect(final Sketch sketchIn) { // curCount and input data are nonzero, match against HT assert ((curCount_ > 0) && (!empty_)); final long[] cacheIn = sketchIn.getCache(); @@ -299,7 +339,7 @@ void performIntersect(final Sketch sketchIn) { } } - void moveDataToTgt(final long[] arr, final int count) { + private void moveDataToTgt(final long[] arr, final int count) { final int arrLongsIn = arr.length; int tmpCnt = 0; if (mem_ != null) { //Off Heap puts directly into mem diff --git a/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java b/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java index 342f72045..816308c27 100644 --- a/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java +++ b/src/main/java/com/yahoo/sketches/theta/IntersectionImplR.java @@ -20,10 +20,6 @@ import static com.yahoo.sketches.theta.PreambleUtil.SER_VER; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; -import static com.yahoo.sketches.theta.PreambleUtil.clearEmpty; -import static com.yahoo.sketches.theta.PreambleUtil.insertCurCount; -import static com.yahoo.sketches.theta.PreambleUtil.insertLgArrLongs; -import static com.yahoo.sketches.theta.PreambleUtil.insertThetaLong; import com.yahoo.memory.Memory; import com.yahoo.memory.WritableMemory; @@ -38,7 +34,7 @@ * *

This implementation uses data either on-heap or off-heap in a given Memory * that is owned and managed by the caller. - * The off-heap Memory, which if managed properly will greatly reduce the need for + * The off-heap Memory, which if managed properly, will greatly reduce the need for * the JVM to perform garbage collection.

* * @author Lee Rhodes @@ -72,6 +68,16 @@ class IntersectionImplR extends Intersection { } } + IntersectionImplR(final short seedHash) { + seedHash_ = seedHash; + mem_ = null; + lgArrLongs_ = 0; + curCount_ = -1; + thetaLong_ = Long.MAX_VALUE; + empty_ = false; + hashTable_ = null; + } + /** * Wrap an Intersection target around the given source Memory containing intersection data. * @param srcMem The source Memory image. @@ -87,7 +93,7 @@ static IntersectionImplR wrapInstance(final Memory srcMem, final long seed) { static IntersectionImplR internalWrapInstance(final Memory srcMem, final IntersectionImplR impl) { //Get Preamble //Note: Intersection does not use lgNomLongs (or k), per se. - //seedHash loaded and checked in private constructor + //seedHash loaded and checked in constructor final int preLongsMem = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; final int serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF; final int famID = srcMem.getByte(FAMILY_BYTE) & 0XFF; @@ -131,7 +137,8 @@ static IntersectionImplR internalWrapInstance(final Memory srcMem, final Interse public CompactSketch getResult(final boolean dstOrdered, final WritableMemory dstMem) { if (curCount_ < 0) { throw new SketchesStateException( - "Calling getResult() with no intervening intersections is not a legal result."); + "Calling getResult() with no intervening intersections would represent the infinite set, " + + "which is not a legal result."); } long[] compactCacheR; @@ -187,16 +194,7 @@ public boolean isSameResource(final Memory that) { @Override public void reset() { - curCount_ = -1; - thetaLong_ = Long.MAX_VALUE; - empty_ = false; - hashTable_ = null; - if (mem_ != null) { - insertLgArrLongs(mem_, lgArrLongs_); //make sure - insertCurCount(mem_, -1); - insertThetaLong(mem_, Long.MAX_VALUE); - clearEmpty(mem_); - } + throw new SketchesReadOnlyException(); } @Override @@ -240,6 +238,12 @@ public void update(final Sketch sketchIn) { throw new SketchesReadOnlyException(); } + @Override + public 
CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, + final WritableMemory dstMem) { + throw new SketchesReadOnlyException(); + } + //restricted @Override diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index 4c2d7d900..7bc5bc480 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -5,207 +5,56 @@ package com.yahoo.sketches.theta; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; + import java.util.Arrays; -import com.yahoo.sketches.HashOperations; import com.yahoo.sketches.SketchesArgumentException; -import com.yahoo.sketches.SketchesException; import com.yahoo.sketches.Util; /** * Set Operations where the arguments are presented in pairs as in C = Op(A,B). These are - * stateless operations and the result is returned immediately. These operations are designed for - * high performance and only accept ordered, CompactSketches, which may be either Heap-based or - * Direct. The returned results are always in the form of an ordered CompactSketch. + * stateless operations and the result is returned immediately. + * + *

These operations are designed for convenience and accept Sketches that may be either + * Heap-based or Direct. * * @author Lee Rhodes */ public class PairwiseSetOperations { /** - * This implements a stateless, pair-wise intersection operation on ordered, - * CompactSketches that are either Heap-based or Direct. + * This implements a stateless, pair-wise Intersect operation on sketches + * that are either Heap-based or Direct. * If both inputs are null a null is returned. - * If one is null an empty sketch is returned. - * @param skA The first ordered, CompactSketch argument. - * @param skB The second ordered, CompactSketch argument. - * @return the result as an ordered CompactSketch. + * + * @param skA The first Sketch argument. + * @param skB The second Sketch argument. + * @return the result as an ordered CompactSketch on the heap. */ - public static CompactSketch intersect(final CompactSketch skA, final CompactSketch skB) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - return HeapCompactOrderedSketch - .compact(new long[0], true, skB.getSeedHash(), 0, skB.getThetaLong()); - } - if (skB == null) { - return HeapCompactOrderedSketch - .compact(new long[0], true, skA.getSeedHash(), 0, skA.getThetaLong()); - } - - //Both sketches are valid, check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - //Full Intersection - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean emptyRule = emptyA || emptyB; //Empty rule is OR - - final long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - - if (emptyRule) { //even if emptyRule = true, theta can be < 1.0 - return HeapCompactOrderedSketch - 
.compact(new long[0], emptyRule, seedHash, 0, thetaLong); - } - - //Both sketches are non-empty - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - final long[] cacheB = (skB.isDirect()) ? skB.getCache() : skB.getCache().clone(); - final int aLen = cacheA.length; - final int bLen = cacheB.length; - - final long[] outCache = new long[Math.min(aLen, bLen)]; - - int indexA = 0; - int indexB = 0; - int outCount = 0; - - while ((indexA < aLen) && (indexB < bLen)) { - final long hashA = cacheA[indexA]; - final long hashB = cacheB[indexB]; - - if ((hashA >= thetaLong) || (hashB >= thetaLong)) { - break; - } - - if (hashA == hashB) { - outCache[outCount++] = hashA; - ++indexA; - ++indexB; - } else if (hashA < hashB) { - ++indexA; - } else { - ++indexB; - } - } - - return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outCount), emptyRule, seedHash, outCount, thetaLong); + public static CompactSketch intersect(final Sketch skA, final Sketch skB) { + if ((skA == null) && (skB == null)) { return null; } + final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash(); + final Intersection inter = new IntersectionImpl(seedHash); + return inter.intersect(skA, skB, true, null); } /** - * This implements a stateless, pair-wise A AND NOT B operation on ordered, - * CompactSketches that are either Heap-based or Direct. - * If both inputs are null a null is returned. If skA is null an empty sketch is returned. - * If skB is null or empty skA is returned. + * This implements a stateless, pair-wise A AND NOT B operation on Sketches + * that are either Heap-based or Direct. + * If both inputs are null a null is returned. * - * @param skA The first ordered, CompactSketch argument. - * @param skB The second ordered, CompactSketch argument. - * @return the result as an ordered CompactSketch. 
- */ //see HeapAnotB.compute() for return rule table - public static CompactSketch aNotB(final CompactSketch skA, final CompactSketch skB) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - if (!skB.isOrdered()) { - throw new SketchesException("skB must be ordered!"); - } - //return rule {ThB, 0, T} - return HeapCompactOrderedSketch - .compact(new long[0], true, skB.getSeedHash(), 0, skB.getThetaLong()); - } - if (skB == null) { - if (!skA.isOrdered()) { - throw new SketchesException("skA must be ordered!"); - } - return skA; //return rule {ThA, |A|, E(a)} - } - - //Both sketches are valid check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean bothEmpty = emptyA && emptyB; - - final long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - final boolean emptyRule = emptyA; //Empty rule is whatever A is - - if (emptyA || bothEmpty) { //return rule {minT, 0, T} - return HeapCompactOrderedSketch - .compact(new long[0], emptyRule, seedHash, 0, thetaLong); - } - - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - - if (emptyB) { //return rule {minT, |A| < minT , E(a)} - final int curCount = HashOperations.count(cacheA, thetaLong); - final long[] cache = CompactSketch.compactCache(cacheA, curCount, thetaLong, true); - return HeapCompactOrderedSketch - .compact(cache, emptyRule, seedHash, curCount, thetaLong); - } - - //Both are non-empty - final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); - - final int aLen = cacheA.length; - final int bLen = cacheB.length; - - final long[] outCache = new long[aLen]; - - int indexA = 0; - int indexB = 0; - int indexOut = 0; - long hashA = cacheA[indexA]; - long hashB = cacheB[indexB]; - - while ((indexA < aLen) || (indexB < bLen)) { - if (hashA == hashB) { - if (hashA < thetaLong) { - //reject - hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; - hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; - continue; - } - break; - } - else if (hashA < hashB) { - if (hashA < thetaLong) { - outCache[indexOut++] = hashA; //keep - hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; - continue; - } - break; - } - else { //hashA > hashB - if (hashB < thetaLong) { - //reject - hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; - continue; - } - break; - } - } - - final int outLen = indexOut; - - return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outLen), emptyA, seedHash, outLen, thetaLong); + * @param skA The first Sketch argument. + * @param skB The second Sketch argument. + * @return the result as an ordered CompactSketch on the heap. + */ + public static CompactSketch aNotB(final Sketch skA, final Sketch skB) { + if ((skA == null) && (skB == null)) { return null; } + final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash(); + final HeapAnotB anotb = new HeapAnotB(seedHash); + return anotb.aNotB(skA, skB, true, null); } - /** * This implements a stateless, pair-wise union operation on ordered, * CompactSketches that are either Heap-based or Direct. @@ -223,69 +72,75 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s /** * This implements a stateless, pair-wise union operation on ordered, - * CompactSketches that are either Heap-based or Direct. - * If both inputs are null a null is returned. - * If one is null the other is returned, which can be either Heap-based or Direct. 
+ * CompactSketches that are either Heap-based or Direct. The returned sketch will be cutback to + * k if required, similar to the regular Union operation. If a cutback is required, the returned + * sketch will always be on the heap. + * If both inputs are null a null is returned. If either sketch is empty its Theta is ignored. + * If one is null the other is returned, which may be either Direct or heap-based if a cutback + * is required. * * @param skA The first ordered, CompactSketch argument. * @param skB The second ordered, CompactSketch argument * @param k The upper bound of the number of entries to be retained by the sketch * @return the result as an ordered CompactSketch. */ + @SuppressWarnings("null") public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) { - if ((skA == null) && (skB == null)) { return null; } //no way to construct the seedHash - - if (skA == null) { - if (!skB.isOrdered()) { //must be ordered - throw new SketchesException("skB must be ordered!"); + //Handle all corner cases with null or empty arguments + //For backward compatibility, we must allow input empties with Theta < 1.0. + final int swA = (skA == null) ? 1 : skA.isEmpty() ? 2 : 3; + final int swB = (skB == null) ? 1 : skB.isEmpty() ? 2 : 3; + final int sw = (swA << 2) | swB; + switch (sw) { + case 5: { //skA == null; skB == null; return null. Cannot determine seedhash. + return null; } - if (skB.getRetainedEntries(true) > k) { //guarantees cutback to k - final long[] cacheB = (skB.isDirect()) ? skB.getCache() : skB.getCache().clone(); - final long thetaLong = cacheB[k]; - final long[] arrB = Arrays.copyOf(cacheB, k); - return HeapCompactOrderedSketch - .compact(arrB, skB.isEmpty(), skB.getSeedHash(), k, thetaLong); + case 6: { //skA == null; skB == empty; return empty + checkOrdered(skB); + return (skB.getThetaLong() == Long.MAX_VALUE) ? 
skB : + HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); } - return skB; - } - - if (skB == null) { - if (!skA.isOrdered()) { //must be ordered - throw new SketchesException("skA must be ordered!"); + case 7: { //skA == null; skB == valid; return skB + checkOrdered(skB); + return maybeCutback(skB, k); } - if (skA.getRetainedEntries(true) > k) { //guarantees cutback to k - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - final long thetaLong = cacheA[k]; - final long[] arrA = Arrays.copyOf(cacheA, k); - return HeapCompactOrderedSketch - .compact(arrA, skA.isEmpty(), skA.getSeedHash(), k, thetaLong); + case 9: { //skA == empty; skB == null; return empty + checkOrdered(skA); + return (skA.getThetaLong() == Long.MAX_VALUE) ? skA : + HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); } - return skA; - } - - //Both sketches are valid check seedHashes and ordered - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (!skB.isOrdered()) { - throw new SketchesArgumentException("skB must be ordered!"); - } - if (!skA.isOrdered()) { - throw new SketchesArgumentException("skA must be ordered!"); - } - - final boolean emptyA = skA.isEmpty(); - final boolean emptyB = skB.isEmpty(); - final boolean bothEmptyRule = emptyA && emptyB; //Empty rule is AND - - if (bothEmptyRule) { - return (skA.getThetaLong() < skB.getThetaLong()) ? 
skA : skB; + case 10: { //skA == empty; skB == empty; return empty + final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + if (skA.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skA); return skA; } + if (skB.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skB); return skB; } + return HeapCompactOrderedSketch.compact(new long[0], true, seedHash, 0, Long.MAX_VALUE); + } + case 11: { //skA == empty; skB == valid; return skB + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skB); + return maybeCutback(skB, k); + } + case 13: { //skA == valid; skB == null; return skA + checkOrdered(skA); + return maybeCutback(skA, k); + } + case 14: { //skA == valid; skB == empty; return skA + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skA); + return maybeCutback(skA, k); + } + case 15: { //skA == valid; skB == valid; perform full union + Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); + checkOrdered(skA); + checkOrdered(skB); + break; + } + //default: cannot happen } - long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - - // Attempting to shortcut this if one of the arguments is "empty" turns out to be complex. - // The theta of an empty sketch could be < 1.0 and will empact the other sketch. - + //Both sketches are valid with matching seedhashes and ordered //Full Union operation + long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); final int aLen = cacheA.length; @@ -339,10 +194,40 @@ else if (hashA < hashB) { } } - final int outLen = indexOut; - + int curCount = indexOut; + final long[] outArr; + if (indexOut > k) { + outArr = Arrays.copyOf(outCache, k); //cutback to k + curCount = k; + } else { + outArr = Arrays.copyOf(outCache, curCount); //copy only valid items + } + checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch - .compact(Arrays.copyOf(outCache, outLen), bothEmptyRule, seedHash, outLen, thetaLong); + .compact(outArr, false, skA.getSeedHash(), curCount, thetaLong); + } + + private static CompactSketch maybeCutback(final CompactSketch csk, final int k) { + final boolean empty = csk.isEmpty(); + int curCount = csk.getRetainedEntries(true); + long thetaLong = csk.getThetaLong(); + if (curCount > k) { //cutback to k + final long[] cache = (csk.isDirect()) ? csk.getCache() : csk.getCache().clone(); + thetaLong = cache[k]; + final long[] arr = Arrays.copyOf(cache, k); + curCount = k; + checkEmptyState(empty, curCount, thetaLong); + return HeapCompactOrderedSketch + .compact(arr, empty, csk.getSeedHash(), curCount, thetaLong); + } + checkEmptyState(empty, curCount, thetaLong); + return csk; + } + + private static void checkOrdered(final CompactSketch csk) { + if (!csk.isOrdered()) { + throw new SketchesArgumentException("Given sketch must be ordered."); + } } } diff --git a/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java b/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java index 15b9fbe22..c35597383 100644 --- a/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java +++ b/src/main/java/com/yahoo/sketches/theta/PreambleUtil.java @@ -33,16 +33,79 @@ * *

An empty CompactSketch only requires 8 bytes.

* - *

A SingleItemSketch requires an 8 byte preamble plus a single hash item of 8 bytes.

+ *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
+ * 
+ * + *

A SingleItemSketch (extends CompactSketch) requires an 8 byte preamble plus a single + * hash item of 8 bytes.

+ * + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||---------------------------Single long hash----------------------------------------|
+ * 
* *

An exact (non-estimating) CompactSketch requires 16 bytes of preamble plus a compact array of * longs.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 2   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||----------------------Start of Compact Long Array----------------------------------|
+ * 
+ * *

An estimating CompactSketch requires 24 bytes of preamble plus a compact array of longs.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 3   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||------------------------------THETA_LONG-------------------------------------------|
+ *
+ *      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
+ *  3   ||----------------------Start of Compact Long Array----------------------------------|
+ *  
+ * *

An UpdateSketch requires 24 bytes of preamble plus a non-compact array of longs representing a * hash table.

* + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
+ *  0   ||    Seed Hash    | Flags  |  LgArr |  lgNom | FamID  | SerVer | RF, PreLongs = 3   |
+ *
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
+ *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+ *
+ *      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
+ *  2   ||------------------------------THETA_LONG-------------------------------------------|
+ *
+ *      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
+ *  3   ||----------------------Start of Hash Table of longs---------------------------------|
+ *  
+ * *

Union objects require 32 bytes of preamble plus a non-compact array of longs representing a * hash table.

* @@ -50,7 +113,7 @@ * Long || Start Byte Adr: * Adr: * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | LgArr | lgNom | FamID | SerVer | RF, Preamble_Longs | + * 0 || Seed Hash | Flags | LgArr | lgNom | FamID | SerVer | RF, PreLongs = 4 | * * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | * 1 ||-----------------p-----------------|----------Retained Entries Count---------------| @@ -59,7 +122,11 @@ * 2 ||------------------------------THETA_LONG-------------------------------------------| * * || 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | - * 3 ||---------------------------Start of Long Array-------------------------------------| + * 3 ||---------------------------UNION THETA LONG----------------------------------------| + * + * || 39 | 38 | 37 | 36 | 35 | 34 | 33 | 32 | + * 4 ||----------------------Start of Hash Table of longs---------------------------------| + * * * * @author Lee Rhodes @@ -161,23 +228,23 @@ static String preambleToString(final Memory mem) { final int seedHash = extractSeedHash(mem); - //assumes preLongs == 1 - int curCount = singleItem ? 1 : 0; //preLongs 1 empty or singleItem + //assumes preLongs == 1; empty or singleItem + int curCount = singleItem ? 
1 : 0; float p = (float) 1.0; //preLongs 1 or 2 long thetaLong = Long.MAX_VALUE; //preLongs 1 or 2 long thetaULong = thetaLong; //preLongs 1, 2 or 3 - if (preLongs == 2) { + if (preLongs == 2) { //exact (non-estimating) CompactSketch curCount = extractCurCount(mem); p = extractP(mem); } - else if (preLongs == 3) { + else if (preLongs == 3) { //Update Sketch curCount = extractCurCount(mem); p = extractP(mem); thetaLong = extractThetaLong(mem); thetaULong = thetaLong; } - else if (preLongs == 4) { + else if (preLongs == 4) { //Union curCount = extractCurCount(mem); p = extractP(mem); thetaLong = extractThetaLong(mem); @@ -358,7 +425,7 @@ static void insertUnionThetaLong(final WritableMemory wmem, final long unionThet wmem.putLong(UNION_THETA_LONG, unionThetaLong); } - //TODO convert to set/clear/any bits + //TODO convert these to set/clear/any bits static void setEmpty(final WritableMemory wmem) { int flags = wmem.getByte(FLAGS_BYTE) & 0XFF; flags |= EMPTY_FLAG_MASK; diff --git a/src/main/java/com/yahoo/sketches/theta/SetOperation.java b/src/main/java/com/yahoo/sketches/theta/SetOperation.java index 33551ed0b..82d7109d7 100644 --- a/src/main/java/com/yahoo/sketches/theta/SetOperation.java +++ b/src/main/java/com/yahoo/sketches/theta/SetOperation.java @@ -12,6 +12,7 @@ import static com.yahoo.sketches.Util.ceilingPowerOf2; import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; +import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import static java.lang.Math.max; import com.yahoo.memory.Memory; @@ -223,29 +224,34 @@ static short computeSeedHash(final long seed) { //used only by the set operations static final CompactSketch createCompactSketch(final long[] compactCache, final boolean empty, - final short seedHash, final int curCount, final long thetaLong, final boolean dstOrdered, + final short seedHash, int curCount, long thetaLong, final boolean dstOrdered, final WritableMemory 
dstMem) { + if (empty) { + curCount = 0; + thetaLong = Long.MAX_VALUE; + } + checkEmptyState(empty, curCount, thetaLong); CompactSketch sketchOut = null; final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 1 : 0); switch (sw) { case 0: { //dst not ordered, dstMem == null sketchOut = HeapCompactUnorderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong); + thetaLong); //converts to SingleItem if curCount == 1 break; } case 1: { //dst not ordered, dstMem == valid sketchOut = DirectCompactUnorderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong, dstMem); + thetaLong, dstMem); //converts to SingleItem format if curCount == 1 break; } case 2: { //dst ordered, dstMem == null sketchOut = HeapCompactOrderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong); + thetaLong); //converts to SingleItem format if curCount == 1 break; } case 3: { //dst ordered, dstMem == valid sketchOut = DirectCompactOrderedSketch.compact(compactCache, empty, seedHash, curCount, - thetaLong, dstMem); + thetaLong, dstMem); //converts to SingleItem format if curCount == 1 break; } //default: //This cannot happen and cannot be tested diff --git a/src/main/java/com/yahoo/sketches/theta/Sketch.java b/src/main/java/com/yahoo/sketches/theta/Sketch.java index e03b6bb33..23b96692b 100644 --- a/src/main/java/com/yahoo/sketches/theta/Sketch.java +++ b/src/main/java/com/yahoo/sketches/theta/Sketch.java @@ -24,6 +24,7 @@ import com.yahoo.sketches.BinomialBoundsN; import com.yahoo.sketches.Family; import com.yahoo.sketches.SketchesArgumentException; +import com.yahoo.sketches.SketchesStateException; import com.yahoo.sketches.Util; /** @@ -557,6 +558,35 @@ static final void checkSketchAndMemoryFlags(final Sketch sketch) { } } + /** + * Checks for an illegal state of the empty flag. The truth table is as follows: + *
+   *  Empty CurCount Theta State    Comments
+   *    T      0       1.0   OK     The Normal Empty State
+   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
+   *                                  but should be stored as a Normal Empty State.
+   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
+   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
+   *    F      0       1.0   Error  This conflicts with the normal empty state
+   *    F      0      <1.0   OK     This can result from set operations
+   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
+   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
+   * 
+ * + * @param empty the state of the empty flag + * @param curCount the current number of retained entries + * @param thetaLong the value of theta as a long + */ + static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { + final boolean thLT1 = thetaLong < Long.MAX_VALUE; + final boolean zeroCount = curCount == 0; + final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); + if (error) { + throw new SketchesStateException("Improper Empty State: Empty: " + empty + + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); + } + } + static final double estimate(final long thetaLong, final int curCount, final boolean empty) { if (estMode(thetaLong, empty)) { final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE; diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 939936745..47af3b9dd 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -48,7 +48,8 @@ final class UnionImpl extends Union { */ private final UpdateSketch gadget_; private final short seedHash_; //eliminates having to compute the seedHash on every update. 
- private long unionThetaLong_; //when on-heap, this is the only copy + private long unionThetaLong_ = Long.MAX_VALUE; //when on-heap, this is the only copy + private boolean unionEmpty_ = true; private UnionImpl(final UpdateSketch gadget, final long seed) { gadget_ = gadget; @@ -114,6 +115,7 @@ static UnionImpl heapifyInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = HeapQuickSelectSketch.heapifyInstance(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -130,6 +132,7 @@ static UnionImpl fastWrap(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.fastReadOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -146,6 +149,7 @@ static UnionImpl fastWrap(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.fastWritableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -162,6 +166,7 @@ static UnionImpl wrapInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.readOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -178,6 +183,7 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.writableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); 
unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -205,7 +211,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds //Compact the cache final long[] compactCacheOut = compactCache(gadgetCacheCopy, curCountOut, minThetaLong, dstOrdered); - final boolean empty = gadget_.isEmpty(); + final boolean empty = gadget_.isEmpty() && unionEmpty_; return createCompactSketch( compactCacheOut, empty, seedHash_, curCountOut, minThetaLong, dstOrdered, dstMem); } @@ -219,13 +225,16 @@ public CompactSketch getResult() { public void reset() { gadget_.reset(); unionThetaLong_ = gadget_.getThetaLong(); + unionEmpty_ = true; } @Override public byte[] toByteArray() { final byte[] gadgetByteArr = gadget_.toByteArray(); - final WritableMemory mem = WritableMemory.wrap(gadgetByteArr); - mem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta + final WritableMemory wmem = WritableMemory.wrap(gadgetByteArr); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta + final boolean empty = gadget_.isEmpty() && unionEmpty_; + if (!empty) { PreambleUtil.clearEmpty(wmem); } return gadgetByteArr; } @@ -237,12 +246,13 @@ public boolean isSameResource(final Memory that) { @Override public void update(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 - //UNION Empty Rule: AND the empty states. This does not require separate treatment. + //UNION Empty Rule: AND the empty states. - if (sketchIn == null) { - //null is interpreted as (Theta = 1.0, count = 0, empty = T). Nothing changes + if ((sketchIn == null) || sketchIn.isEmpty()) { + //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). 
Nothing changes return; } + //sketchIn is valid and not empty Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); Sketch.checkSketchAndMemoryFlags(sketchIn); @@ -250,41 +260,47 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin final long thetaLongIn = sketchIn.getThetaLong(); unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule with incoming final int curCountIn = sketchIn.getRetainedEntries(true); - - if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop - //Ordered, thus compact - if (sketchIn.isDirect()) { - final Memory skMem = ((CompactSketch) sketchIn).getMemory(); - final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; - for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preambleLongs + i) << 3; - final long hashIn = skMem.getLong(offsetBytes); - if (hashIn >= unionThetaLong_) { break; } // "early stop" - gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + if (curCountIn > 0) { + if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop + //Ordered, thus compact + if (sketchIn.isDirect()) { + final Memory skMem = ((CompactSketch) sketchIn).getMemory(); + final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; + for (int i = 0; i < curCountIn; i++ ) { + final int offsetBytes = (preambleLongs + i) << 3; + final long hashIn = skMem.getLong(offsetBytes); + if (hashIn >= unionThetaLong_) { break; } // "early stop" + gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + } } - } - else { //sketchIn is on the Java Heap or has array - final long[] cacheIn = sketchIn.getCache(); //not a copy! - for (int i = 0; i < curCountIn; i++ ) { + else { //sketchIn is on the Java Heap or has array + final long[] cacheIn = sketchIn.getCache(); //not a copy! 
+ for (int i = 0; i < curCountIn; i++ ) { + final long hashIn = cacheIn[i]; + if (hashIn >= unionThetaLong_) { break; } // "early stop" + gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + } + } + } //End ordered, compact + else { //either not-ordered compact or Hash Table form. A HT may have dirty values. + final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy + final int arrLongs = cacheIn.length; + for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { final long hashIn = cacheIn[i]; - if (hashIn >= unionThetaLong_) { break; } // "early stop" + if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed + c++; //insures against invalid state inside the incoming sketch } } - } //End ordered, compact - else { //either not-ordered compact or Hash Table form. A HT may have dirty values. - final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy - final int arrLongs = cacheIn.length; - for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { - final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values - gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed - c++; //insures against invalid state inside the incoming sketch - } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule with gadget + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { 
PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -295,7 +311,7 @@ public void update(final Memory skMem) { final int cap = (int)skMem.getCapacity(); final int fam = skMem.getByte(FAMILY_BYTE); final int serVer = skMem.getByte(SER_VER_BYTE); - if (serVer == 1) { //older SetSketch, which is compact and ordered + if (serVer == 1) { //very old SetSketch, which is compact and ordered if (fam != 3) { //the original SetSketch throw new SketchesArgumentException( "Family must be old SET_SKETCH: " + Family.idToFamily(fam)); @@ -383,7 +399,7 @@ long getThetaLong() { @Override boolean isEmpty() { - return gadget_.isEmpty(); + return gadget_.isEmpty() && unionEmpty_; } //no seedHash, assumes given seed is correct. No p, no empty flag, no concept of direct @@ -399,9 +415,14 @@ private void processVer1(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -428,9 +449,14 @@ private void processVer2(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + 
unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } @@ -460,7 +486,7 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro assert curCount > 0; thetaLongIn = skMem.getLong(THETA_LONG); } - unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule + unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //theta rule final boolean ordered = (skMem.getByte(FLAGS_BYTE) & ORDERED_FLAG_MASK) != 0; if (ordered) { //must be compact for (int i = 0; i < curCount; i++ ) { @@ -480,9 +506,14 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //sync thetaLongs + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule + final int gCurCount = gadget_.getRetainedEntries(); + unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule if (gadget_.isDirect()) { - ((WritableMemory)gadget_.getMemory()).putLong(UNION_THETA_LONG, unionThetaLong_); + final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); + wmem.putLong(UNION_THETA_LONG, unionThetaLong_); + if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + else { PreambleUtil.clearEmpty(wmem); } } } diff --git a/src/test/java/com/yahoo/sketches/theta/EmptyTest.java b/src/test/java/com/yahoo/sketches/theta/EmptyTest.java index 43aec65bd..60b942bfb 100644 --- a/src/test/java/com/yahoo/sketches/theta/EmptyTest.java +++ b/src/test/java/com/yahoo/sketches/theta/EmptyTest.java @@ -8,9 +8,7 @@ /** - * Empty essentially means that 
the sketch has never seen data. But just because it has never - * seen data does not mean it would not impact a union operation. This would occur if P is - * set < 1.0. + * Empty essentially means that the sketch has never seen data. * * @author Lee Rhodes */ diff --git a/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java b/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java index 3ad2de4c6..1e675eafe 100644 --- a/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java +++ b/src/test/java/com/yahoo/sketches/theta/ForwardCompatibilityTest.java @@ -134,7 +134,7 @@ public void checkSerVer2_8Bytes() { } @Test - public void checkSerVer2_24Bytes_1Value() { + public void checkSerVer2_24Bytes_0Values() { byte[] byteArray = new byte[24]; WritableMemory mem = WritableMemory.wrap(byteArray); mem.putByte(0, (byte) 2); //mdLongs, RF (RR) = 0 @@ -151,7 +151,7 @@ public void checkSerVer2_24Bytes_1Value() { Memory srcMem = Memory.wrap(byteArray); Sketch sketch = Sketch.heapify(srcMem); - assertEquals(sketch.isEmpty(), false); + assertEquals(sketch.isEmpty(), true); //was forced true assertEquals(sketch.isEstimationMode(), false); assertEquals(sketch.isDirect(), false); assertEquals(sketch.isCompact(), true); @@ -161,7 +161,7 @@ public void checkSerVer2_24Bytes_1Value() { } @Test - public void checkSerVer2_32Bytes_1Value() { + public void checkSerVer2_32Bytes_0Values() { byte[] byteArray = new byte[32]; WritableMemory mem = WritableMemory.wrap(byteArray); mem.putByte(0, (byte) 3); //mdLongs, RF (RR) = 0 @@ -178,7 +178,7 @@ public void checkSerVer2_32Bytes_1Value() { Memory srcMem = Memory.wrap(byteArray); Sketch sketch = Sketch.heapify(srcMem); - assertEquals(sketch.isEmpty(), false); + assertEquals(sketch.isEmpty(), true); //forced true assertEquals(sketch.isEstimationMode(), false); assertEquals(sketch.isDirect(), false); assertEquals(sketch.isCompact(), true); diff --git a/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java 
b/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java index b364a43fa..f365bfc75 100644 --- a/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java +++ b/src/test/java/com/yahoo/sketches/theta/HeapAnotBTest.java @@ -257,20 +257,17 @@ public void checkAnotBnotC() { boolean ordered = true; UpdateSketch aU = UpdateSketch.builder().setNominalEntries(k).build(); - for (int i=0; i k, true); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), true); + + csk = generate(State.CNT0_THLT1, k); + assertEquals(csk.isEmpty(), false); + assertEquals(csk.isEstimationMode(), true); + assertEquals(csk.getRetainedEntries(), 0); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), true); + + csk = generate(State.EST_HEAP_UNORDERED, k); + assertEquals(csk.isEmpty(), false); + assertEquals(csk.isEstimationMode(), true); + assertEquals(csk.getRetainedEntries() > k, true); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); + assertEquals(csk.isDirect(), false); + assertEquals(csk.isOrdered(), false); + } + + enum State {NULL, EMPTY, EXACT, EST_HEAP, CNT0_THLT1, EST_HEAP_UNORDERED} private static CompactSketch generate(State state, int k) { UpdateSketch sk = null; @@ -279,32 +378,32 @@ private static CompactSketch generate(State state, int k) { } case EXACT : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < k; i++) sk.update(i); + for (int i = 0; i < k; i++) { + sk.update(i); + } csk = sk.compact(true, null); break; } case EST_HEAP : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4*k; i++) sk.update(i); + for (int i = 0; i < (4 * k); i++) { + sk.update(i); + } csk = sk.compact(true, null); break; } - case EST_DIR : { - sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4 * k; i++) sk.update(i); - int 
bytes = Sketch.getMaxCompactSketchBytes(sk.getRetainedEntries(true)); - byte[] byteArr = new byte[bytes]; - WritableMemory mem = WritableMemory.wrap(byteArr); - csk = sk.compact(true, mem); - break; - } - case EMPTY_THLT0 : { - csk = Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build().compact(true, null); + case CNT0_THLT1 : { + sk = Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build(); + sk.update(7); + assert(sk.getRetainedEntries() == 0); + csk = sk.compact(true, null); break; } case EST_HEAP_UNORDERED : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - for (int i = 0; i < 4 * k; i++) sk.update(i); + for (int i = 0; i < (4 * k); i++) { + sk.update(i); + } int bytes = Sketch.getMaxCompactSketchBytes(sk.getRetainedEntries(true)); byte[] byteArr = new byte[bytes]; WritableMemory mem = WritableMemory.wrap(byteArr); diff --git a/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java b/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java index 0dea5b527..a3d239736 100644 --- a/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java +++ b/src/test/java/com/yahoo/sketches/theta/PairwiseSetOperationsTest.java @@ -9,8 +9,6 @@ import org.testng.annotations.Test; -import com.yahoo.sketches.SketchesArgumentException; - public class PairwiseSetOperationsTest { // Intersection @@ -70,9 +68,9 @@ public void checkIntersectionEarlyStop() { for (int t = 0; t < trials; t++) { for (int i=0; i Date: Thu, 28 Mar 2019 12:53:50 -0700 Subject: [PATCH 07/16] add empty line --- src/main/java/com/yahoo/sketches/theta/UnionImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 47af3b9dd..ce3314c1a 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -37,6 +37,7 @@ * @author Kevin Lang 
*/ final class UnionImpl extends Union { + /** * Although the gadget object is initially an UpdateSketch, in the context of a Union it is used * as a specialized buffer that happens to leverage much of the machinery of an UpdateSketch. From c6fe6079a489f5faa73b163cac27a9297f2beda6 Mon Sep 17 00:00:00 2001 From: lrhodes Date: Thu, 28 Mar 2019 12:59:34 -0700 Subject: [PATCH 08/16] tiny edits --- .../java/com/yahoo/sketches/theta/PairwiseSetOperations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index 7bc5bc480..f4c7c4a5f 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -139,7 +139,7 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s } //Both sketches are valid with matching seedhashes and ordered - //Full Union operation + //Full Union operation: long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.isDirect()) ? 
skB.getCache() : skB.getCache().clone(); From 4402feaf0a759159e039b1ebc718f3c52d915a42 Mon Sep 17 00:00:00 2001 From: lrhodes Date: Thu, 28 Mar 2019 18:33:51 -0700 Subject: [PATCH 09/16] Removed checkEmptyState --- .../theta/DirectCompactOrderedSketch.java | 4 +- .../sketches/theta/DirectCompactSketch.java | 1 - .../theta/DirectCompactUnorderedSketch.java | 1 + .../sketches/theta/ForwardCompatibility.java | 3 - .../theta/HeapCompactOrderedSketch.java | 2 +- .../sketches/theta/HeapCompactSketch.java | 1 - .../theta/HeapCompactUnorderedSketch.java | 2 +- .../sketches/theta/PairwiseSetOperations.java | 11 +--- .../yahoo/sketches/theta/SetOperation.java | 5 +- .../java/com/yahoo/sketches/theta/Sketch.java | 63 ++++++++++--------- 10 files changed, 43 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java index a59412534..f710c8658 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java @@ -52,11 +52,11 @@ static DirectCompactOrderedSketch wrapInstance(final Memory srcMem, final long s * @param dstMem the given destination Memory. This clears it before use. * @return a DirectCompactOrderedSketch. 
*/ - static DirectCompactOrderedSketch compact(final UpdateSketch sketch, - final WritableMemory dstMem) { + static DirectCompactOrderedSketch compact(final UpdateSketch sketch, final WritableMemory dstMem) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); + //checkEmptyState(empty, curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java index b270f23f3..f8a6394b3 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactSketch.java @@ -22,7 +22,6 @@ abstract class DirectCompactSketch extends CompactSketch { DirectCompactSketch(final Memory mem) { mem_ = mem; - checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java index 2eb3a7801..09c55205c 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java @@ -56,6 +56,7 @@ static DirectCompactUnorderedSketch compact(final UpdateSketch sketch, final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); + //checkEmptyState(empty, curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java 
b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java index 9e681b0c0..dc960ac6e 100644 --- a/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java +++ b/src/main/java/com/yahoo/sketches/theta/ForwardCompatibility.java @@ -11,7 +11,6 @@ import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import com.yahoo.memory.Memory; import com.yahoo.sketches.SketchesArgumentException; @@ -60,7 +59,6 @@ static final CompactSketch heapify1to3(final Memory srcMem, final long seed) { final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(24, compactOrderedCache, 0, curCount); - checkEmptyState(false, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, false, seedHash, curCount, thetaLong); } @@ -98,7 +96,6 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) { empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); //force true final long[] compactOrderedCache = new long[curCount]; srcMem.getLongArray(mdLongs << 3, compactOrderedCache, 0, curCount); - checkEmptyState(empty, curCount, thetaLong); return HeapCompactOrderedSketch .compact(compactOrderedCache, empty, seedHash, curCount, thetaLong); } diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java index 031c13398..0685e8223 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java @@ -80,7 +80,7 @@ static CompactSketch compact(final UpdateSketch sketch) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - + 
//checkEmptyState(empty, curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = true; diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java index 29543bc85..238cfb4da 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactSketch.java @@ -44,7 +44,6 @@ abstract class HeapCompactSketch extends CompactSketch { thetaLong_ = empty ? Long.MAX_VALUE : thetaLong; cache_ = cache; preLongs_ = computeCompactPreLongs(thetaLong, empty, curCount); - checkEmptyState(isEmpty(), getRetainedEntries(true), getThetaLong()); //TODO remove if not needed } //Sketch diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java index fa1ee54dc..449e93d6d 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java @@ -80,7 +80,7 @@ static CompactSketch compact(final UpdateSketch sketch) { final long thetaLong = sketch.getThetaLong(); final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - + //checkEmptyState(empty, curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = false; diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index f4c7c4a5f..bab3cc9e5 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -5,7 +5,7 @@ package com.yahoo.sketches.theta; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; +import static 
com.yahoo.sketches.theta.SetOperation.createCompactSketch; import java.util.Arrays; @@ -202,9 +202,7 @@ else if (hashA < hashB) { } else { outArr = Arrays.copyOf(outCache, curCount); //copy only valid items } - checkEmptyState(false, curCount, thetaLong); - return HeapCompactOrderedSketch - .compact(outArr, false, skA.getSeedHash(), curCount, thetaLong); + return createCompactSketch(outArr, false, skA.getSeedHash(), curCount, thetaLong, true, null); } private static CompactSketch maybeCutback(final CompactSketch csk, final int k) { @@ -216,11 +214,8 @@ private static CompactSketch maybeCutback(final CompactSketch csk, final int k) thetaLong = cache[k]; final long[] arr = Arrays.copyOf(cache, k); curCount = k; - checkEmptyState(empty, curCount, thetaLong); - return HeapCompactOrderedSketch - .compact(arr, empty, csk.getSeedHash(), curCount, thetaLong); + return createCompactSketch(arr, empty, csk.getSeedHash(), curCount, thetaLong, true, null); } - checkEmptyState(empty, curCount, thetaLong); return csk; } diff --git a/src/main/java/com/yahoo/sketches/theta/SetOperation.java b/src/main/java/com/yahoo/sketches/theta/SetOperation.java index 82d7109d7..6d4a7b82c 100644 --- a/src/main/java/com/yahoo/sketches/theta/SetOperation.java +++ b/src/main/java/com/yahoo/sketches/theta/SetOperation.java @@ -12,7 +12,6 @@ import static com.yahoo.sketches.Util.ceilingPowerOf2; import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; -import static com.yahoo.sketches.theta.Sketch.checkEmptyState; import static java.lang.Math.max; import com.yahoo.memory.Memory; @@ -230,7 +229,7 @@ static final CompactSketch createCompactSketch(final long[] compactCache, final curCount = 0; thetaLong = Long.MAX_VALUE; } - checkEmptyState(empty, curCount, thetaLong); + //checkEmptyState(empty, curCount, thetaLong); CompactSketch sketchOut = null; final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 
1 : 0); switch (sw) { @@ -259,8 +258,6 @@ static final CompactSketch createCompactSketch(final long[] compactCache, final return sketchOut; } - - /** * Computes minimum lgArrLongs from a current count. * @param count the given current count diff --git a/src/main/java/com/yahoo/sketches/theta/Sketch.java b/src/main/java/com/yahoo/sketches/theta/Sketch.java index 23b96692b..d440a7de0 100644 --- a/src/main/java/com/yahoo/sketches/theta/Sketch.java +++ b/src/main/java/com/yahoo/sketches/theta/Sketch.java @@ -24,7 +24,6 @@ import com.yahoo.sketches.BinomialBoundsN; import com.yahoo.sketches.Family; import com.yahoo.sketches.SketchesArgumentException; -import com.yahoo.sketches.SketchesStateException; import com.yahoo.sketches.Util; /** @@ -558,34 +557,40 @@ static final void checkSketchAndMemoryFlags(final Sketch sketch) { } } - /** - * Checks for an illegal state of the empty flag. The truth table is as follows: - *
-   *  Empty CurCount Theta State    Comments
-   *    T      0       1.0   OK     The Normal Empty State
-   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
-   *                                  but should stored as a Normal Empty State.
-   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
-   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
-   *    F      0       1.0   Error  This conflicts with the normal empty state
-   *    F      0      <1.0   OK     This can result from set operations
-   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
-   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
-   * 
- * - * @param empty the state of the empty flag - * @param curCount the current number of retained entries - * @param thetaLong the value of theta as a long - */ - static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { - final boolean thLT1 = thetaLong < Long.MAX_VALUE; - final boolean zeroCount = curCount == 0; - final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); - if (error) { - throw new SketchesStateException("Improper Empty State: Empty: " + empty - + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); - } - } + // /** + // * Checks for an illegal state of the empty flag. The truth table is as follows: + // *
+  //   *  Empty CurCount Theta State    Comments
+  //   *    T      0       1.0   OK     The Normal Empty State
+  //   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
+  //   *                                  but should stored as a Normal Empty State.
+  //   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
+  //   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
+  //   *    F      0       1.0   Error  This conflicts with the normal empty state
+  //   *    F      0      <1.0   OK     This can result from set operations
+  //   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
+  //   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
+  //   * 
+ // * + // * @param empty the state of the empty flag + // * @param curCount the current number of retained entries + // * @param thetaLong the value of theta as a long + // */ + // static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { + // final boolean thLT1 = thetaLong < Long.MAX_VALUE; + // final boolean zeroCount = curCount == 0; + // final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); + // if (error) { + // throw new SketchesStateException("Improper Empty State: Empty: " + empty + // + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); + // } + // } + // + // static final boolean fixEmpty(final boolean empty, final int curCount, final long thetaLong) { + // if (curCount > 0) { return false; } + // if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { return true; } + // return empty; + // } static final double estimate(final long thetaLong, final int curCount, final boolean empty) { if (estMode(thetaLong, empty)) { From 5d4422110d9799bd37666fcd65be0862337a0d80 Mon Sep 17 00:00:00 2001 From: lrhodes Date: Fri, 29 Mar 2019 11:35:17 -0700 Subject: [PATCH 10/16] clean up pairwise logic --- .../sketches/theta/PairwiseSetOperations.java | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index bab3cc9e5..c91cf192d 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -88,51 +88,46 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) { //Handle all corner cases with null or empty arguments //For backward compatibility, we must allow input empties with Theta < 1.0. 
- final int swA = (skA == null) ? 1 : skA.isEmpty() ? 2 : 3; - final int swB = (skB == null) ? 1 : skB.isEmpty() ? 2 : 3; + final int swA, swB; + if (skA == null) { swA = 1; } else { checkOrdered(skA); swA = skA.isEmpty() ? 2 : 3; } + if (skB == null) { swB = 1; } else { checkOrdered(skB); swB = skB.isEmpty() ? 2 : 3; } final int sw = (swA << 2) | swB; switch (sw) { case 5: { //skA == null; skB == null; return null. Cannot determine seedhash. return null; } case 6: { //skA == null; skB == empty; return empty - checkOrdered(skB); - return (skB.getThetaLong() == Long.MAX_VALUE) ? skB : + return (skB.getThetaLong() == Long.MAX_VALUE) ? skB : //lgtm [java/dereferenced-value-may-be-null] HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); } case 7: { //skA == null; skB == valid; return skB - checkOrdered(skB); return maybeCutback(skB, k); } case 9: { //skA == empty; skB == null; return empty - checkOrdered(skA); - return (skA.getThetaLong() == Long.MAX_VALUE) ? skA : + return (skA.getThetaLong() == Long.MAX_VALUE) ? 
skA : //lgtm [java/dereferenced-value-may-be-null] HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); } case 10: { //skA == empty; skB == empty; return empty - final short seedHash = Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - if (skA.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skA); return skA; } - if (skB.getThetaLong() == Long.MAX_VALUE) { checkOrdered(skB); return skB; } + final short seedHash = seedHashesCheck(skA, skB); + if (skA.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] + { return skA; } + if (skB.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] + { return skB; } return HeapCompactOrderedSketch.compact(new long[0], true, seedHash, 0, Long.MAX_VALUE); } case 11: { //skA == empty; skB == valid; return skB - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skB); + seedHashesCheck(skA, skB); return maybeCutback(skB, k); } case 13: { //skA == valid; skB == null; return skA - checkOrdered(skA); return maybeCutback(skA, k); } case 14: { //skA == valid; skB == empty; return skA - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skA); + seedHashesCheck(skA, skB); return maybeCutback(skA, k); } case 15: { //skA == valid; skB == valid; perform full union - Util.checkSeedHashes(skA.getSeedHash(), skB.getSeedHash()); - checkOrdered(skA); - checkOrdered(skB); + seedHashesCheck(skA, skB); break; } //default: cannot happen @@ -225,4 +220,10 @@ private static void checkOrdered(final CompactSketch csk) { } } + private static short seedHashesCheck(final Sketch skA, final Sketch skB) { + final short seedHashA = skA.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] + final short seedHashB = skB.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] + return Util.checkSeedHashes(seedHashA, seedHashB); + } + } From 8498ad4c42420c653e6db573c081915dcf3027e1 Mon Sep 17 00:00:00 2001 From: 
lrhodes Date: Sat, 30 Mar 2019 11:39:51 -0700 Subject: [PATCH 11/16] Incomplete --- .../java/com/yahoo/sketches/theta/UnionImpl.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index ce3314c1a..b2127dd3d 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -188,6 +188,11 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { return unionImpl; } + @Override + public CompactSketch getResult() { + return getResult(true, null); + } + @Override public CompactSketch getResult(final boolean dstOrdered, final WritableMemory dstMem) { final int gadgetCurCount = gadget_.getRetainedEntries(true); @@ -204,7 +209,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds final long unionThetaLong = (gadget_.isDirect()) ? gadget_.getMemory().getLong(UNION_THETA_LONG) : unionThetaLong_; - final long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); + long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); final int curCountOut = (minThetaLong < curGadgetThetaLong) ? 
HashOperations.count(gadgetCacheCopy, minThetaLong) : gadgetCurCount; @@ -213,15 +218,11 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds final long[] compactCacheOut = compactCache(gadgetCacheCopy, curCountOut, minThetaLong, dstOrdered); final boolean empty = gadget_.isEmpty() && unionEmpty_; + if (empty) { minThetaLong = Long.MAX_VALUE; } return createCompactSketch( compactCacheOut, empty, seedHash_, curCountOut, minThetaLong, dstOrdered, dstMem); } - @Override - public CompactSketch getResult() { - return getResult(true, null); - } - @Override public void reset() { gadget_.reset(); From 68dadd52c3bbc9f8202d5c3cfe67ba9e4eef7b25 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sun, 31 Mar 2019 16:21:49 -0700 Subject: [PATCH 12/16] fix several bugs wrt empty and hasMemory(). --- .../theta/ConcurrentHeapThetaBuffer.java | 5 + .../theta/ConcurrentSharedThetaSketch.java | 2 + .../theta/DirectCompactOrderedSketch.java | 7 +- .../theta/DirectCompactUnorderedSketch.java | 7 +- .../yahoo/sketches/theta/HeapAlphaSketch.java | 7 +- .../theta/HeapCompactOrderedSketch.java | 7 +- .../theta/HeapCompactUnorderedSketch.java | 7 +- .../sketches/theta/HeapQuickSelectSketch.java | 5 + .../sketches/theta/HeapUpdateSketch.java | 3 + .../sketches/theta/PairwiseSetOperations.java | 6 +- .../yahoo/sketches/theta/SetOperation.java | 14 +-- .../java/com/yahoo/sketches/theta/Sketch.java | 59 +++++------- .../com/yahoo/sketches/theta/UnionImpl.java | 92 ++++++++----------- ...ConcurrentDirectQuickSelectSketchTest.java | 3 + .../ConcurrentHeapQuickSelectSketchTest.java | 2 + .../theta/DirectQuickSelectSketchTest.java | 3 + .../com/yahoo/sketches/theta/EmptyTest.java | 7 +- .../theta/ForwardCompatibilityTest.java | 5 + .../theta/HeapQuickSelectSketchTest.java | 1 + .../theta/PairwiseCornerCasesTest.java | 22 +++-- .../sketches/theta/SingleItemSketchTest.java | 2 +- .../com/yahoo/sketches/theta/SketchTest.java | 2 + 22 files changed, 147 insertions(+), 121 
deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java b/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java index 46c0147a7..dbafc83ca 100644 --- a/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java +++ b/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java @@ -82,6 +82,11 @@ public double getUpperBound(final int numStdDev) { return shared.getUpperBound(numStdDev); } + @Override + public boolean hasMemory() { + return shared.hasMemory(); + } + @Override public boolean isDirect() { return shared.isDirect(); diff --git a/src/main/java/com/yahoo/sketches/theta/ConcurrentSharedThetaSketch.java b/src/main/java/com/yahoo/sketches/theta/ConcurrentSharedThetaSketch.java index 389e082ea..c0ec0cb98 100644 --- a/src/main/java/com/yahoo/sketches/theta/ConcurrentSharedThetaSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/ConcurrentSharedThetaSketch.java @@ -112,6 +112,8 @@ boolean propagate(final AtomicBoolean localPropagationInProgress, final Sketch s double getUpperBound(int numStdDev); + boolean hasMemory(); + boolean isDirect(); boolean isEmpty(); diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java index f710c8658..211825375 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactOrderedSketch.java @@ -53,10 +53,11 @@ static DirectCompactOrderedSketch wrapInstance(final Memory srcMem, final long s * @return a DirectCompactOrderedSketch. 
*/ static DirectCompactOrderedSketch compact(final UpdateSketch sketch, final WritableMemory dstMem) { - final long thetaLong = sketch.getThetaLong(); - final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - //checkEmptyState(empty, curCount, thetaLong); + long thetaLong = sketch.getThetaLong(); + boolean empty = sketch.isEmpty(); + thetaLong = thetaOnCompact(empty, curCount, thetaLong); + empty = emptyOnCompact(curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java index 09c55205c..44450a87e 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectCompactUnorderedSketch.java @@ -53,10 +53,11 @@ static DirectCompactUnorderedSketch wrapInstance(final Memory srcMem, final long */ static DirectCompactUnorderedSketch compact(final UpdateSketch sketch, final WritableMemory dstMem) { - final long thetaLong = sketch.getThetaLong(); - final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - //checkEmptyState(empty, curCount, thetaLong); + long thetaLong = sketch.getThetaLong(); + boolean empty = sketch.isEmpty(); + thetaLong = thetaOnCompact(empty, curCount, thetaLong); + empty = emptyOnCompact(curCount, thetaLong); final int preLongs = computeCompactPreLongs(thetaLong, empty, curCount); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); diff --git a/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java index 01d10b01e..db045eb84 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java +++ 
b/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java @@ -53,7 +53,7 @@ final class HeapAlphaSketch extends HeapUpdateSketch { private int hashTableThreshold_; //never serialized private int curCount_ = 0; private long thetaLong_; - private boolean empty_ = true; + boolean empty_ = true; private long[] cache_; private boolean dirty_ = false; @@ -286,6 +286,11 @@ int getLgArrLongs() { return lgArrLongs_; } + @Override + void setEmpty(final boolean empty) { + empty_ = empty; + } + @Override UpdateReturnState hashUpdate(final long hash) { HashOperations.checkHashCorruption(hash); diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java index 0685e8223..e36cf20f2 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactOrderedSketch.java @@ -77,10 +77,11 @@ static CompactSketch heapifyInstance(final Memory srcMem, final long seed) { * @return a CompactSketch */ static CompactSketch compact(final UpdateSketch sketch) { - final long thetaLong = sketch.getThetaLong(); - final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - //checkEmptyState(empty, curCount, thetaLong); + long thetaLong = sketch.getThetaLong(); + boolean empty = sketch.isEmpty(); + thetaLong = thetaOnCompact(empty, curCount, thetaLong); + empty = emptyOnCompact(curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = true; diff --git a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java index 449e93d6d..80d59877e 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapCompactUnorderedSketch.java @@ -77,10 +77,11 @@ static CompactSketch 
heapifyInstance(final Memory srcMem, final long seed) { * @return a CompactSketch */ static CompactSketch compact(final UpdateSketch sketch) { - final long thetaLong = sketch.getThetaLong(); - final boolean empty = sketch.isEmpty(); final int curCount = sketch.getRetainedEntries(true); - //checkEmptyState(empty, curCount, thetaLong); + long thetaLong = sketch.getThetaLong(); + boolean empty = sketch.isEmpty(); + thetaLong = thetaOnCompact(empty, curCount, thetaLong); + empty = emptyOnCompact(curCount, thetaLong); final short seedHash = sketch.getSeedHash(); final long[] cache = sketch.getCache(); final boolean ordered = false; diff --git a/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java index 568aae982..146dcd7d2 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java @@ -254,6 +254,11 @@ boolean isOutOfSpace(final int numEntries) { return numEntries > hashTableThreshold_; } + @Override + void setEmpty(final boolean empty) { + empty_ = empty; + } + //Must resize. Changes lgArrLongs_ and cache_. theta and count don't change. 
// Used by hashUpdate() private final void resizeCache() { diff --git a/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java index 0b6b9573a..de7f310d1 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java @@ -90,6 +90,9 @@ short getSeedHash() { return Util.computeSeedHash(getSeed()); } + //for set operations + abstract void setEmpty(boolean empty); + byte[] toByteArray(final int preLongs, final byte familyID) { if (isDirty()) { rebuild(); } final int preBytes = (preLongs << 3) & 0X3F; diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index c91cf192d..5c77db7db 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -136,8 +136,8 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s //Both sketches are valid with matching seedhashes and ordered //Full Union operation: long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule - final long[] cacheA = (skA.isDirect()) ? skA.getCache() : skA.getCache().clone(); - final long[] cacheB = (skB.isDirect()) ? skB.getCache() : skB.getCache().clone(); + final long[] cacheA = (skA.hasMemory()) ? skA.getCache() : skA.getCache().clone(); + final long[] cacheB = (skB.hasMemory()) ? skB.getCache() : skB.getCache().clone(); final int aLen = cacheA.length; final int bLen = cacheB.length; @@ -205,7 +205,7 @@ private static CompactSketch maybeCutback(final CompactSketch csk, final int k) int curCount = csk.getRetainedEntries(true); long thetaLong = csk.getThetaLong(); if (curCount > k) { //cutback to k - final long[] cache = (csk.isDirect()) ? csk.getCache() : csk.getCache().clone(); + final long[] cache = (csk.hasMemory()) ? 
csk.getCache() : csk.getCache().clone(); thetaLong = cache[k]; final long[] arr = Arrays.copyOf(cache, k); curCount = k; diff --git a/src/main/java/com/yahoo/sketches/theta/SetOperation.java b/src/main/java/com/yahoo/sketches/theta/SetOperation.java index 6d4a7b82c..7c0647744 100644 --- a/src/main/java/com/yahoo/sketches/theta/SetOperation.java +++ b/src/main/java/com/yahoo/sketches/theta/SetOperation.java @@ -12,6 +12,8 @@ import static com.yahoo.sketches.Util.ceilingPowerOf2; import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; +import static com.yahoo.sketches.theta.Sketch.emptyOnCompact; +import static com.yahoo.sketches.theta.Sketch.thetaOnCompact; import static java.lang.Math.max; import com.yahoo.memory.Memory; @@ -222,14 +224,12 @@ static short computeSeedHash(final long seed) { abstract boolean isEmpty(); //used only by the set operations - static final CompactSketch createCompactSketch(final long[] compactCache, final boolean empty, - final short seedHash, int curCount, long thetaLong, final boolean dstOrdered, + static final CompactSketch createCompactSketch(final long[] compactCache, boolean empty, + final short seedHash, final int curCount, long thetaLong, final boolean dstOrdered, final WritableMemory dstMem) { - if (empty) { - curCount = 0; - thetaLong = Long.MAX_VALUE; - } - //checkEmptyState(empty, curCount, thetaLong); + thetaLong = thetaOnCompact(empty, curCount, thetaLong); + empty = emptyOnCompact(curCount, thetaLong); + CompactSketch sketchOut = null; final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 
1 : 0); switch (sw) { diff --git a/src/main/java/com/yahoo/sketches/theta/Sketch.java b/src/main/java/com/yahoo/sketches/theta/Sketch.java index d440a7de0..0b3dc9b5c 100644 --- a/src/main/java/com/yahoo/sketches/theta/Sketch.java +++ b/src/main/java/com/yahoo/sketches/theta/Sketch.java @@ -557,40 +557,31 @@ static final void checkSketchAndMemoryFlags(final Sketch sketch) { } } - // /** - // * Checks for an illegal state of the empty flag. The truth table is as follows: - // *
-  //   *  Empty CurCount Theta State    Comments
-  //   *    T      0       1.0   OK     The Normal Empty State
-  //   *    T      0      <1.0   Error  This can be an initial on-heap state if p < 1.0,
-  //   *                                  but should stored as a Normal Empty State.
-  //   *    T     !0       1.0   Error  Empty and curCount !0 should never co-exist
-  //   *    T     !0      <1.0   Error  Empty and curCount !0 should never co-exist
-  //   *    F      0       1.0   Error  This conflicts with the normal empty state
-  //   *    F      0      <1.0   OK     This can result from set operations
-  //   *    F     !0       1.0   OK     This corresponds to a sketch in exact mode
-  //   *    F     !0      <1.0   OK     This corresponds to a sketch in estimation mode
-  //   * 
- // * - // * @param empty the state of the empty flag - // * @param curCount the current number of retained entries - // * @param thetaLong the value of theta as a long - // */ - // static final void checkEmptyState(final boolean empty, final int curCount, final long thetaLong) { - // final boolean thLT1 = thetaLong < Long.MAX_VALUE; - // final boolean zeroCount = curCount == 0; - // final boolean error = (empty && !zeroCount) || (zeroCount && (empty ^ !thLT1)); - // if (error) { - // throw new SketchesStateException("Improper Empty State: Empty: " + empty - // + ", CurCount=0: " + zeroCount + " Theta<1.0: " + thLT1); - // } - // } - // - // static final boolean fixEmpty(final boolean empty, final int curCount, final long thetaLong) { - // if (curCount > 0) { return false; } - // if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { return true; } - // return empty; - // } + /* + * The truth table for empty, curCount and theta on compact is as follows: + *
+   * Num Theta CurCount Empty State  Comments
+   *  0    1.0     0      T     OK   The Normal Empty State
+   *  1    1.0     0      F   Error  This conflicts with the normal empty state
+   *  2    1.0    !0      T   Error  Empty and curCount !0 should never co-exist
+   *  3    1.0    !0      F     OK   This corresponds to a sketch in exact mode
+   *  4   <1.0     0      T   Error  This can be an initial UpdateSketch state if p < 1.0,
+   *                                   but should be compacted as {Th = 1.0, 0, T}.
+   *  5   <1.0     0      F     OK   This can result from set operations
+   *  6   <1.0    !0      T   Error  Empty and curCount !0 should never co-exist
+   *  7   <1.0    !0      F     OK   This corresponds to a sketch in estimation mode
+   * 
+ *

thetaOnCompact() checks for only #4 and corrects theta. + *

emptyOnCompact() corrects for #1, 2, 6 if they occur + *

First apply thetaOnCompact() then emptyOnCompact(). + */ + static final long thetaOnCompact(final boolean empty, final int curCount, final long thetaLong) { + return (empty && (curCount == 0) && (thetaLong < Long.MAX_VALUE)) ? Long.MAX_VALUE : thetaLong; + } + + static final boolean emptyOnCompact(final int curCount, final long thetaLong) { + return ((curCount == 0) && (thetaLong == Long.MAX_VALUE)); + } static final double estimate(final long thetaLong, final int curCount, final boolean empty) { if (estMode(thetaLong, empty)) { diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index b2127dd3d..409133a07 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -49,8 +49,7 @@ final class UnionImpl extends Union { */ private final UpdateSketch gadget_; private final short seedHash_; //eliminates having to compute the seedHash on every update. - private long unionThetaLong_ = Long.MAX_VALUE; //when on-heap, this is the only copy - private boolean unionEmpty_ = true; + private long unionThetaLong_; //when on-heap, this is the only copy private UnionImpl(final UpdateSketch gadget, final long seed) { gadget_ = gadget; @@ -116,7 +115,6 @@ static UnionImpl heapifyInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = HeapQuickSelectSketch.heapifyInstance(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); - unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -133,7 +131,6 @@ static UnionImpl fastWrap(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.fastReadOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); - unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return 
unionImpl; } @@ -150,7 +147,6 @@ static UnionImpl fastWrap(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.fastWritableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); - unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -167,7 +163,6 @@ static UnionImpl wrapInstance(final Memory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketchR.readOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); - unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -184,7 +179,6 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { final UpdateSketch gadget = DirectQuickSelectSketch.writableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); - unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -198,7 +192,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds final int gadgetCurCount = gadget_.getRetainedEntries(true); final int k = 1 << gadget_.getLgNomLongs(); final long[] gadgetCacheCopy = - (gadget_.isDirect()) ? gadget_.getCache() : gadget_.getCache().clone(); + (gadget_.hasMemory()) ? gadget_.getCache() : gadget_.getCache().clone(); //Pull back to k final long curGadgetThetaLong = gadget_.getThetaLong(); @@ -206,10 +200,10 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds ? selectExcludingZeros(gadgetCacheCopy, gadgetCurCount, k + 1) : curGadgetThetaLong; //Finalize Theta and curCount - final long unionThetaLong = (gadget_.isDirect()) + final long unionThetaLong = (gadget_.hasMemory()) ? 
gadget_.getMemory().getLong(UNION_THETA_LONG) : unionThetaLong_; - long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); + final long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); final int curCountOut = (minThetaLong < curGadgetThetaLong) ? HashOperations.count(gadgetCacheCopy, minThetaLong) : gadgetCurCount; @@ -217,8 +211,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds //Compact the cache final long[] compactCacheOut = compactCache(gadgetCacheCopy, curCountOut, minThetaLong, dstOrdered); - final boolean empty = gadget_.isEmpty() && unionEmpty_; - if (empty) { minThetaLong = Long.MAX_VALUE; } + final boolean empty = gadget_.isEmpty(); return createCompactSketch( compactCacheOut, empty, seedHash_, curCountOut, minThetaLong, dstOrdered, dstMem); } @@ -227,16 +220,13 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds public void reset() { gadget_.reset(); unionThetaLong_ = gadget_.getThetaLong(); - unionEmpty_ = true; } @Override public byte[] toByteArray() { final byte[] gadgetByteArr = gadget_.toByteArray(); - final WritableMemory wmem = WritableMemory.wrap(gadgetByteArr); - wmem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta - final boolean empty = gadget_.isEmpty() && unionEmpty_; - if (!empty) { PreambleUtil.clearEmpty(wmem); } + final WritableMemory mem = WritableMemory.wrap(gadgetByteArr); + mem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta return gadgetByteArr; } @@ -258,14 +248,13 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); Sketch.checkSketchAndMemoryFlags(sketchIn); - final long thetaLongIn = sketchIn.getThetaLong(); unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule with incoming final int curCountIn = sketchIn.getRetainedEntries(true); if (curCountIn > 0) { if (sketchIn.isOrdered()) { 
//Only true if Compact. Use early stop //Ordered, thus compact - if (sketchIn.isDirect()) { + if (sketchIn.hasMemory()) { final Memory skMem = ((CompactSketch) sketchIn).getMemory(); final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; for (int i = 0; i < curCountIn; i++ ) { @@ -295,15 +284,14 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin } } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule - final int gCurCount = gadget_.getRetainedEntries(); - unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule - if (gadget_.isDirect()) { - final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); - wmem.putLong(UNION_THETA_LONG, unionThetaLong_); - if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule with gadget + final boolean empty = gadget_.isEmpty() && sketchIn.isEmpty(); //Empty rule + if (gadget_.hasMemory()) { + final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); + //OK to modify empty but NOT thetaLong + if (empty) { PreambleUtil.setEmpty(wmem); } else { PreambleUtil.clearEmpty(wmem); } - } + } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } } @Override @@ -401,7 +389,7 @@ long getThetaLong() { @Override boolean isEmpty() { - return gadget_.isEmpty() && unionEmpty_; + return gadget_.isEmpty(); } //no seedHash, assumes given seed is correct. 
No p, no empty flag, no concept of direct @@ -417,15 +405,15 @@ private void processVer1(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule - final int gCurCount = gadget_.getRetainedEntries(); - unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule - if (gadget_.isDirect()) { - final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); - wmem.putLong(UNION_THETA_LONG, unionThetaLong_); - if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule + final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); + final boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule + if (gadget_.hasMemory()) { + final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); + //OK to modify empty but NOT thetaLong + if (empty) { PreambleUtil.setEmpty(wmem); } else { PreambleUtil.clearEmpty(wmem); } - } + } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } } //has seedHash and p, could have 0 entries & theta, @@ -451,15 +439,15 @@ private void processVer2(final Memory skMem) { if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule - final int gCurCount = gadget_.getRetainedEntries(); - unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule - if (gadget_.isDirect()) { - final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); - wmem.putLong(UNION_THETA_LONG, unionThetaLong_); - if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); + final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); + final 
boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule + if (gadget_.hasMemory()) { + final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); + //OK to modify empty but NOT thetaLong + if (empty) { PreambleUtil.setEmpty(wmem); } else { PreambleUtil.clearEmpty(wmem); } - } + } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } } //has seedHash, p, could have 0 entries & theta, @@ -508,15 +496,15 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } } - unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //theta rule - final int gCurCount = gadget_.getRetainedEntries(); - unionEmpty_ = (gCurCount == 0) && (unionThetaLong_ == Long.MAX_VALUE); //empty rule - if (gadget_.isDirect()) { - final WritableMemory wmem = (WritableMemory)gadget_.getMemory(); - wmem.putLong(UNION_THETA_LONG, unionThetaLong_); - if (unionEmpty_) { PreambleUtil.setEmpty(wmem); } + unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //sync thetaLongs + final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); + final boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule + if (gadget_.hasMemory()) { + final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); + //OK to modify empty but NOT thetaLong + if (empty) { PreambleUtil.setEmpty(wmem); } else { PreambleUtil.clearEmpty(wmem); } - } + } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } } } diff --git a/src/test/java/com/yahoo/sketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/com/yahoo/sketches/theta/ConcurrentDirectQuickSelectSketchTest.java index e3affbca6..0e0be1ca6 100644 --- a/src/test/java/com/yahoo/sketches/theta/ConcurrentDirectQuickSelectSketchTest.java +++ b/src/test/java/com/yahoo/sketches/theta/ConcurrentDirectQuickSelectSketchTest.java @@ -126,6 +126,7 @@ public void checkHeapifyMemoryEstimating() { 
assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer"); int curCount1 = shared.getRetainedEntries(true); assertTrue(local.isDirect()); + assertTrue(local.hasMemory()); assertEquals(local.getCurrentPreambleLongs(false), 3); UpdateSketch sharedHeap = Sketches.heapifyUpdateSketch(mem); @@ -142,6 +143,7 @@ public void checkHeapifyMemoryEstimating() { int cacheCount = HashOperations.count(cache, thetaLong); assertEquals(curCount1, cacheCount); assertFalse(sharedHeap.isDirect()); + assertFalse(sharedHeap.hasMemory()); } } @@ -334,6 +336,7 @@ public void checkDQStoCompactForms() { assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer"); assertTrue(local.isDirect()); + assertTrue(local.hasMemory()); for (int i=0; i k, true); assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); assertEquals(csk.isDirect(), false); + assertEquals(csk.hasMemory(), false); assertEquals(csk.isOrdered(), true); csk = generate(State.CNT0_THLT1, k); @@ -350,18 +354,20 @@ public void checkGenerate() { assertEquals(csk.getRetainedEntries(), 0); assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); assertEquals(csk.isDirect(), false); + assertEquals(csk.hasMemory(), false); assertEquals(csk.isOrdered(), true); - csk = generate(State.EST_HEAP_UNORDERED, k); + csk = generate(State.EST_MEMORY_UNORDERED, k); assertEquals(csk.isEmpty(), false); assertEquals(csk.isEstimationMode(), true); assertEquals(csk.getRetainedEntries() > k, true); assertEquals(csk.getThetaLong() < Long.MAX_VALUE, true); assertEquals(csk.isDirect(), false); + assertEquals(csk.hasMemory(), true); assertEquals(csk.isOrdered(), false); } - enum State {NULL, EMPTY, EXACT, EST_HEAP, CNT0_THLT1, EST_HEAP_UNORDERED} + enum State {NULL, EMPTY, EXACT, EST_HEAP, CNT0_THLT1, EST_MEMORY_UNORDERED} private static CompactSketch generate(State state, int k) { UpdateSketch sk = null; @@ -399,7 +405,7 @@ private static CompactSketch generate(State state, int k) { csk = sk.compact(true, null); break; } - 
case EST_HEAP_UNORDERED : { + case EST_MEMORY_UNORDERED : { sk = Sketches.updateSketchBuilder().setNominalEntries(k).build(); for (int i = 0; i < (4 * k); i++) { sk.update(i); diff --git a/src/test/java/com/yahoo/sketches/theta/SingleItemSketchTest.java b/src/test/java/com/yahoo/sketches/theta/SingleItemSketchTest.java index 59baa2977..fca3c478e 100644 --- a/src/test/java/com/yahoo/sketches/theta/SingleItemSketchTest.java +++ b/src/test/java/com/yahoo/sketches/theta/SingleItemSketchTest.java @@ -113,8 +113,8 @@ public void checkSketchInterface() { assertEquals(sis.getRetainedEntries(true), 1); assertEquals(sis.getUpperBound(1), 1.0); assertFalse(sis.isDirect()); - assertFalse(sis.isEmpty()); assertFalse(sis.hasMemory()); + assertFalse(sis.isEmpty()); assertTrue(sis.isOrdered()); } diff --git a/src/test/java/com/yahoo/sketches/theta/SketchTest.java b/src/test/java/com/yahoo/sketches/theta/SketchTest.java index 9516d5611..d791ae6b2 100644 --- a/src/test/java/com/yahoo/sketches/theta/SketchTest.java +++ b/src/test/java/com/yahoo/sketches/theta/SketchTest.java @@ -290,11 +290,13 @@ public void checkWrapToHeapifyConversion1() { Memory v1mem = ForwardCompatibilityTest.convertSerV3toSerV1(v3mem); Sketch csk2 = Sketch.wrap(v1mem); assertFalse(csk2.isDirect()); + assertFalse(csk2.hasMemory()); assertEquals(uest1, csk2.getEstimate(), 0.0); Memory v2mem = ForwardCompatibilityTest.convertSerV3toSerV2(v3mem); csk2 = Sketch.wrap(v2mem); assertFalse(csk2.isDirect()); + assertFalse(csk2.hasMemory()); assertEquals(uest1, csk2.getEstimate(), 0.0); } From efd59a262e01a6a70324333460b0b9b63acf686a Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 1 Apr 2019 17:27:34 -0700 Subject: [PATCH 13/16] Fix initialization bug for DirectQuickSelectSketch in Union gadget mode. 
--- .../theta/ConcurrentHeapThetaBuffer.java | 22 +++-- .../theta/DirectQuickSelectSketch.java | 4 + .../yahoo/sketches/theta/HeapAlphaSketch.java | 7 +- .../sketches/theta/HeapQuickSelectSketch.java | 5 - .../sketches/theta/HeapUpdateSketch.java | 3 - .../com/yahoo/sketches/theta/UnionImpl.java | 94 ++++++++----------- .../sketches/theta/UpdateReturnState.java | 24 ++++- .../sketches/theta/UpdateSketchBuilder.java | 7 +- .../theta/PairwiseCornerCasesTest.java | 33 +++++-- 9 files changed, 106 insertions(+), 93 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java b/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java index dbafc83ca..80de35eb4 100644 --- a/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java +++ b/src/main/java/com/yahoo/sketches/theta/ConcurrentHeapThetaBuffer.java @@ -5,7 +5,8 @@ package com.yahoo.sketches.theta; -import static com.yahoo.sketches.theta.UpdateReturnState.InsertedCountIncremented; +import static com.yahoo.sketches.theta.UpdateReturnState.ConcurrentBufferInserted; +import static com.yahoo.sketches.theta.UpdateReturnState.ConcurrentPropagated; import static com.yahoo.sketches.theta.UpdateReturnState.RejectedOverTheta; import java.util.concurrent.atomic.AtomicBoolean; @@ -27,11 +28,6 @@ */ final class ConcurrentHeapThetaBuffer extends HeapQuickSelectSketch { - private static int computeLogBufferSize(final int lgNomLongs, final long exactSize, - final int maxNumLocalBuffers) { - return Math.min(lgNomLongs, (int)Math.log(Math.sqrt(exactSize) / (2 * maxNumLocalBuffers))); - } - // Shared sketch consisting of the global sample set and theta value. 
private final ConcurrentSharedThetaSketch shared; @@ -60,6 +56,11 @@ private static int computeLogBufferSize(final int lgNomLongs, final long exactSi localPropagationInProgress = new AtomicBoolean(false); } + private static int computeLogBufferSize(final int lgNomLongs, final long exactSize, + final int maxNumLocalBuffers) { + return Math.min(lgNomLongs, (int)Math.log(Math.sqrt(exactSize) / (2 * maxNumLocalBuffers))); + } + //Sketch overrides @Override @@ -134,18 +135,21 @@ UpdateReturnState hashUpdate(final long hash) { } HashOperations.checkHashCorruption(hash); if ((getHashTableThreshold() == 0) || isExactMode ) { - final long thetaLong = getThetaLong(); //The over-theta and zero test - if (HashOperations.continueCondition(thetaLong, hash)) { + if (HashOperations.continueCondition(getThetaLong(), hash)) { return RejectedOverTheta; //signal that hash was rejected due to theta or zero. } if (propagateToSharedSketch(hash)) { - return InsertedCountIncremented; //not totally correct + return ConcurrentPropagated; } } final UpdateReturnState state = super.hashUpdate(hash); if (isOutOfSpace(getRetainedEntries() + 1)) { propagateToSharedSketch(); + return ConcurrentPropagated; + } + if (state == UpdateReturnState.InsertedCountIncremented) { + return ConcurrentBufferInserted; } return state; } diff --git a/src/main/java/com/yahoo/sketches/theta/DirectQuickSelectSketch.java b/src/main/java/com/yahoo/sketches/theta/DirectQuickSelectSketch.java index 2362bb38f..94b77d640 100644 --- a/src/main/java/com/yahoo/sketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/DirectQuickSelectSketch.java @@ -30,6 +30,7 @@ import static com.yahoo.sketches.theta.PreambleUtil.insertSeedHash; import static com.yahoo.sketches.theta.PreambleUtil.insertSerVer; import static com.yahoo.sketches.theta.PreambleUtil.insertThetaLong; +import static com.yahoo.sketches.theta.PreambleUtil.insertUnionThetaLong; import static 
com.yahoo.sketches.theta.Rebuilder.actLgResizeFactor; import static com.yahoo.sketches.theta.Rebuilder.moveAndResize; import static com.yahoo.sketches.theta.Rebuilder.quickSelectAndRebuild; @@ -132,6 +133,9 @@ private DirectQuickSelectSketch( insertP(dstMem, p); //bytes 12-15 final long thetaLong = (long)(p * MAX_THETA_LONG_AS_DOUBLE); insertThetaLong(dstMem, thetaLong); //bytes 16-23 + if (unionGadget) { + insertUnionThetaLong(dstMem, thetaLong); + } //@formatter:on //clear hash table area diff --git a/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java index db045eb84..01d10b01e 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapAlphaSketch.java @@ -53,7 +53,7 @@ final class HeapAlphaSketch extends HeapUpdateSketch { private int hashTableThreshold_; //never serialized private int curCount_ = 0; private long thetaLong_; - boolean empty_ = true; + private boolean empty_ = true; private long[] cache_; private boolean dirty_ = false; @@ -286,11 +286,6 @@ int getLgArrLongs() { return lgArrLongs_; } - @Override - void setEmpty(final boolean empty) { - empty_ = empty; - } - @Override UpdateReturnState hashUpdate(final long hash) { HashOperations.checkHashCorruption(hash); diff --git a/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java index 146dcd7d2..568aae982 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapQuickSelectSketch.java @@ -254,11 +254,6 @@ boolean isOutOfSpace(final int numEntries) { return numEntries > hashTableThreshold_; } - @Override - void setEmpty(final boolean empty) { - empty_ = empty; - } - //Must resize. Changes lgArrLongs_ and cache_. theta and count don't change. 
// Used by hashUpdate() private final void resizeCache() { diff --git a/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java b/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java index de7f310d1..0b6b9573a 100644 --- a/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java +++ b/src/main/java/com/yahoo/sketches/theta/HeapUpdateSketch.java @@ -90,9 +90,6 @@ short getSeedHash() { return Util.computeSeedHash(getSeed()); } - //for set operations - abstract void setEmpty(boolean empty); - byte[] toByteArray(final int preLongs, final byte familyID) { if (isDirty()) { rebuild(); } final int preBytes = (preLongs << 3) & 0X3F; diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 409133a07..2611312ee 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -1,5 +1,5 @@ /* - * Copyright 2015-16, Yahoo! Inc. + * Copyright 2015, Yahoo! Inc. * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. */ @@ -19,6 +19,10 @@ import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE; import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG; import static com.yahoo.sketches.theta.PreambleUtil.UNION_THETA_LONG; +import static com.yahoo.sketches.theta.PreambleUtil.clearEmpty; +import static com.yahoo.sketches.theta.PreambleUtil.extractFamilyID; +import static com.yahoo.sketches.theta.PreambleUtil.extractUnionThetaLong; +import static com.yahoo.sketches.theta.PreambleUtil.insertUnionThetaLong; import static java.lang.Math.min; import com.yahoo.memory.Memory; @@ -50,6 +54,7 @@ final class UnionImpl extends Union { private final UpdateSketch gadget_; private final short seedHash_; //eliminates having to compute the seedHash on every update. 
private long unionThetaLong_; //when on-heap, this is the only copy + private boolean unionEmpty_; //when on-heap, this is the only copy private UnionImpl(final UpdateSketch gadget, final long seed) { gadget_ = gadget; @@ -72,6 +77,7 @@ static UnionImpl initNewHeapInstance(final int lgNomLongs, final long seed, fina lgNomLongs, seed, p, rf, true); //create with UNION family final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = gadget.getThetaLong(); + unionImpl.unionEmpty_ = gadget.isEmpty(); return unionImpl; } @@ -98,23 +104,24 @@ static UnionImpl initNewDirectInstance( lgNomLongs, seed, p, rf, memReqSvr, dstMem, true); //create with UNION family final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = gadget.getThetaLong(); - dstMem.putLong(UNION_THETA_LONG, gadget.getThetaLong()); + unionImpl.unionEmpty_ = gadget.isEmpty(); return unionImpl; } /** - * Heapify a Union from a Memory object containing data. + * Heapify a Union from a Memory Union object containing data. * Called by SetOperation. - * @param srcMem The source Memory object. + * @param srcMem The source Memory Union object. 
* See Memory * @param seed See seed * @return this class */ static UnionImpl heapifyInstance(final Memory srcMem, final long seed) { - Family.UNION.checkFamilyID(srcMem.getByte(FAMILY_BYTE)); + Family.UNION.checkFamilyID(extractFamilyID(srcMem)); final UpdateSketch gadget = HeapQuickSelectSketch.heapifyInstance(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); - unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionThetaLong_ = extractUnionThetaLong(srcMem); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -127,10 +134,11 @@ static UnionImpl heapifyInstance(final Memory srcMem, final long seed) { * @return this class */ static UnionImpl fastWrap(final Memory srcMem, final long seed) { - Family.UNION.checkFamilyID(srcMem.getByte(FAMILY_BYTE)); + Family.UNION.checkFamilyID(extractFamilyID(srcMem)); final UpdateSketch gadget = DirectQuickSelectSketchR.fastReadOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); - unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionThetaLong_ = extractUnionThetaLong(srcMem); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -143,10 +151,11 @@ static UnionImpl fastWrap(final Memory srcMem, final long seed) { * @return this class */ static UnionImpl fastWrap(final WritableMemory srcMem, final long seed) { - Family.UNION.checkFamilyID(srcMem.getByte(FAMILY_BYTE)); + Family.UNION.checkFamilyID(extractFamilyID(srcMem)); final UpdateSketch gadget = DirectQuickSelectSketch.fastWritableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); - unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionThetaLong_ = extractUnionThetaLong(srcMem); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -159,10 +168,11 @@ static UnionImpl fastWrap(final WritableMemory srcMem, final long seed) { * @return this class */ static 
UnionImpl wrapInstance(final Memory srcMem, final long seed) { - Family.UNION.checkFamilyID(srcMem.getByte(FAMILY_BYTE)); + Family.UNION.checkFamilyID(extractFamilyID(srcMem)); final UpdateSketch gadget = DirectQuickSelectSketchR.readOnlyWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); - unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionThetaLong_ = extractUnionThetaLong(srcMem); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -175,10 +185,11 @@ static UnionImpl wrapInstance(final Memory srcMem, final long seed) { * @return this class */ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { - Family.UNION.checkFamilyID(srcMem.getByte(FAMILY_BYTE)); + Family.UNION.checkFamilyID(extractFamilyID(srcMem)); final UpdateSketch gadget = DirectQuickSelectSketch.writableWrap(srcMem, seed); final UnionImpl unionImpl = new UnionImpl(gadget, seed); - unionImpl.unionThetaLong_ = srcMem.getLong(UNION_THETA_LONG); + unionImpl.unionThetaLong_ = extractUnionThetaLong(srcMem); + unionImpl.unionEmpty_ = PreambleUtil.isEmpty(srcMem); return unionImpl; } @@ -211,7 +222,7 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds //Compact the cache final long[] compactCacheOut = compactCache(gadgetCacheCopy, curCountOut, minThetaLong, dstOrdered); - final boolean empty = gadget_.isEmpty(); + final boolean empty = gadget_.isEmpty() && unionEmpty_; return createCompactSketch( compactCacheOut, empty, seedHash_, curCountOut, minThetaLong, dstOrdered, dstMem); } @@ -220,13 +231,18 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds public void reset() { gadget_.reset(); unionThetaLong_ = gadget_.getThetaLong(); + unionEmpty_ = gadget_.isEmpty(); } @Override public byte[] toByteArray() { final byte[] gadgetByteArr = gadget_.toByteArray(); final WritableMemory mem = WritableMemory.wrap(gadgetByteArr); - 
mem.putLong(UNION_THETA_LONG, unionThetaLong_); // union theta + insertUnionThetaLong(mem, unionThetaLong_); + if (gadget_.isEmpty() != unionEmpty_) { + clearEmpty(mem); + unionEmpty_ = false; + } return gadgetByteArr; } @@ -248,8 +264,8 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); Sketch.checkSketchAndMemoryFlags(sketchIn); - final long thetaLongIn = sketchIn.getThetaLong(); - unionThetaLong_ = min(unionThetaLong_, thetaLongIn); //Theta rule with incoming + unionThetaLong_ = min(unionThetaLong_, sketchIn.getThetaLong()); //Theta rule + unionEmpty_ &= sketchIn.isEmpty(); final int curCountIn = sketchIn.getRetainedEntries(true); if (curCountIn > 0) { if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop @@ -285,13 +301,6 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin } } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule with gadget - final boolean empty = gadget_.isEmpty() && sketchIn.isEmpty(); //Empty rule - if (gadget_.hasMemory()) { - final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); - //OK to modify empty but NOT thetaLong - if (empty) { PreambleUtil.setEmpty(wmem); } - else { PreambleUtil.clearEmpty(wmem); } - } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } } @Override @@ -389,7 +398,7 @@ long getThetaLong() { @Override boolean isEmpty() { - return gadget_.isEmpty(); + return gadget_.isEmpty() && unionEmpty_; } //no seedHash, assumes given seed is correct. 
No p, no empty flag, no concept of direct @@ -406,14 +415,7 @@ private void processVer1(final Memory skMem) { gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule - final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); - final boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule - if (gadget_.hasMemory()) { - final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); - //OK to modify empty but NOT thetaLong - if (empty) { PreambleUtil.setEmpty(wmem); } - else { PreambleUtil.clearEmpty(wmem); } - } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } + unionEmpty_ &= gadget_.isEmpty(); } //has seedHash and p, could have 0 entries & theta, @@ -423,10 +425,10 @@ private void processVer2(final Memory skMem) { final int preLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; final int curCount = skMem.getInt(RETAINED_ENTRIES_INT); final long thetaLongIn; - if (preLongs == 1) { + if (preLongs == 1) { //does not change anything {1.0, 0, T} return; } - if (preLongs == 2) { + if (preLongs == 2) { //exact mode assert curCount > 0; thetaLongIn = Long.MAX_VALUE; } else { //prelongs == 3, curCount may be 0 (e.g., from intersection) @@ -440,14 +442,7 @@ private void processVer2(final Memory skMem) { gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); - final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); - final boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule - if (gadget_.hasMemory()) { - final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); - //OK to modify empty but NOT thetaLong - if (empty) { PreambleUtil.setEmpty(wmem); } - else { PreambleUtil.clearEmpty(wmem); } - } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } + unionEmpty_ &= gadget_.isEmpty(); } //has seedHash, p, could have 0 entries & theta, @@ 
-463,7 +458,7 @@ private void processVer3(final Memory skMem) { curCount = 1; thetaLongIn = Long.MAX_VALUE; } else { - return; //otherwise an empty sketch + return; //otherwise an empty sketch {1.0, 0, T} } } else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be from intersection. @@ -497,14 +492,7 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. Cannot be fro } } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //sync thetaLongs - final boolean emptyIn = (curCount == 0) && (thetaLongIn == Long.MAX_VALUE); - final boolean empty = gadget_.isEmpty() && emptyIn; //Empty rule - if (gadget_.hasMemory()) { - final WritableMemory wmem = (WritableMemory) gadget_.getMemory(); - //OK to modify empty but NOT thetaLong - if (empty) { PreambleUtil.setEmpty(wmem); } - else { PreambleUtil.clearEmpty(wmem); } - } else { ((HeapUpdateSketch) gadget_).setEmpty(empty); } + unionEmpty_ &= gadget_.isEmpty(); } } diff --git a/src/main/java/com/yahoo/sketches/theta/UpdateReturnState.java b/src/main/java/com/yahoo/sketches/theta/UpdateReturnState.java index 8c0748585..4881250b8 100644 --- a/src/main/java/com/yahoo/sketches/theta/UpdateReturnState.java +++ b/src/main/java/com/yahoo/sketches/theta/UpdateReturnState.java @@ -13,27 +13,41 @@ public enum UpdateReturnState { /** - * Indicates a normal sketch update response and the action that was taken by the sketch. + * Indicates that the value was accepted into the sketch and the retained count was incremented. */ InsertedCountIncremented, //all UpdateSketches /** - * Indicates a normal sketch update response and the action that was taken by the sketch. + * Indicates that the value was accepted into the sketch and the retained count was not incremented. */ InsertedCountNotIncremented, //used by enhancedHashInsert for Alpha /** - * Indicates a normal sketch update response and the action that was taken by the sketch. 
+ * Indicates that the value was inserted into the local concurrent buffer, + * but has not yet been propagated to the concurrent shared sketch. + */ + ConcurrentBufferInserted, //used by ConcurrentHeapThetaBuffer + + /** + * Indicates that the value has been propagated to the concurrent shared sketch. + * This does not reflect the action taken by the shared sketch. + */ + ConcurrentPropagated, //used by ConcurrentHeapThetaBuffer + + /** + * Indicates that the value was rejected as a duplicate. */ RejectedDuplicate, //all UpdateSketches hashUpdate(), enhancedHashInsert /** - * Indicates a normal sketch update response and the action that was taken by the sketch. + * Indicates that the value was rejected because it was null or empty. */ RejectedNullOrEmpty, //UpdateSketch.update(arr[]) /** - * Indicates a normal sketch update response and the action that was taken by the sketch. + * Indicates that the value was rejected because the hash value was negative, zero or + * greater than theta. 
*/ RejectedOverTheta; //all UpdateSketches.hashUpdate() + } diff --git a/src/main/java/com/yahoo/sketches/theta/UpdateSketchBuilder.java b/src/main/java/com/yahoo/sketches/theta/UpdateSketchBuilder.java index daabc8cbd..f5e3a5127 100644 --- a/src/main/java/com/yahoo/sketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/com/yahoo/sketches/theta/UpdateSketchBuilder.java @@ -435,11 +435,12 @@ private ConcurrentSharedThetaSketch buildSharedInternal(final WritableMemory dst } } - private ConcurrentSharedThetaSketch buildSharedFromSketchInternal(final UpdateSketch sketch, - final WritableMemory dstMem) { + private ConcurrentSharedThetaSketch buildSharedFromSketchInternal( + final UpdateSketch sketch, final WritableMemory dstMem) { ConcurrentPropagationService.NUM_POOL_THREADS = bNumPoolThreads; if (sketch instanceof HeapQuickSelectSketch) { - return new ConcurrentHeapQuickSelectSketch((HeapQuickSelectSketch)sketch, bSeed, bMaxConcurrencyError); + return new ConcurrentHeapQuickSelectSketch( + (HeapQuickSelectSketch)sketch, bSeed, bMaxConcurrencyError); } if (sketch instanceof DirectQuickSelectSketch) { return new ConcurrentDirectQuickSelectSketch((DirectQuickSelectSketch)sketch, bSeed, diff --git a/src/test/java/com/yahoo/sketches/theta/PairwiseCornerCasesTest.java b/src/test/java/com/yahoo/sketches/theta/PairwiseCornerCasesTest.java index f3b2c429f..d812adf50 100644 --- a/src/test/java/com/yahoo/sketches/theta/PairwiseCornerCasesTest.java +++ b/src/test/java/com/yahoo/sketches/theta/PairwiseCornerCasesTest.java @@ -92,9 +92,9 @@ public void compareCornerCases() { } } - //@Test - public void checkNull_CNT0_THLT1() { - cornerCaseChecks(State.NULL, State.CNT0_THLT1, 64); + @Test + public void checkNull_THLT1_CNT0_FALSE() { + cornerCaseChecks(State.NULL, State.THLT1_CNT0_FALSE, 64); } private static void cornerCaseChecks(State stateA, State stateB, int k) { @@ -126,7 +126,7 @@ private static void cornerCaseChecks(State stateA, State stateB, int k) { } else { 
Assert.assertEquals(pwEst, stdEst, 0.0); } - + assert pwEmpty == stdEmpty; Assert.assertEquals(pwEmpty, stdEmpty); Assert.assertEquals(pwTheta, stdTheta, 0.0); Assert.assertEquals(pwEnt, stdEnt); @@ -348,7 +348,7 @@ public void checkGenerate() { assertEquals(csk.hasMemory(), false); assertEquals(csk.isOrdered(), true); - csk = generate(State.CNT0_THLT1, k); + csk = generate(State.THLT1_CNT0_FALSE, k); assertEquals(csk.isEmpty(), false); assertEquals(csk.isEstimationMode(), true); assertEquals(csk.getRetainedEntries(), 0); @@ -357,6 +357,15 @@ public void checkGenerate() { assertEquals(csk.hasMemory(), false); assertEquals(csk.isOrdered(), true); + csk = generate(State.THEQ1_CNT0_TRUE, k); + assertEquals(csk.isEmpty(), true); + assertEquals(csk.isEstimationMode(), false); + assertEquals(csk.getRetainedEntries(), 0); + assertEquals(csk.getThetaLong() < Long.MAX_VALUE, false); + assertEquals(csk.isDirect(), false); + assertEquals(csk.hasMemory(), false); + assertEquals(csk.isOrdered(), true); + csk = generate(State.EST_MEMORY_UNORDERED, k); assertEquals(csk.isEmpty(), false); assertEquals(csk.isEstimationMode(), true); @@ -367,7 +376,7 @@ public void checkGenerate() { assertEquals(csk.isOrdered(), false); } - enum State {NULL, EMPTY, EXACT, EST_HEAP, CNT0_THLT1, EST_MEMORY_UNORDERED} + enum State {NULL, EMPTY, EXACT, EST_HEAP, THLT1_CNT0_FALSE, THEQ1_CNT0_TRUE, EST_MEMORY_UNORDERED} private static CompactSketch generate(State state, int k) { UpdateSketch sk = null; @@ -398,11 +407,17 @@ private static CompactSketch generate(State state, int k) { csk = sk.compact(true, null); break; } - case CNT0_THLT1 : { + case THLT1_CNT0_FALSE : { sk = Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build(); - sk.update(7); + sk.update(7); //above theta assert(sk.getRetainedEntries() == 0); - csk = sk.compact(true, null); + csk = sk.compact(true, null); //compact as {Th < 1.0, 0, F} + break; + } + case THEQ1_CNT0_TRUE : { + sk = 
Sketches.updateSketchBuilder().setP((float)0.5).setNominalEntries(k).build(); + assert(sk.getRetainedEntries() == 0); + csk = sk.compact(true, null); //compact as {Th < 1.0, 0, T} break; } case EST_MEMORY_UNORDERED : { From 4a973a7518844f08f5a0a0a864cfb48f548f0241 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Apr 2019 10:02:55 -0700 Subject: [PATCH 14/16] Added unit tests --- .../theta/DirectIntersectionTest.java | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/test/java/com/yahoo/sketches/theta/DirectIntersectionTest.java b/src/test/java/com/yahoo/sketches/theta/DirectIntersectionTest.java index 2ca62cccb..b72a98c8e 100644 --- a/src/test/java/com/yahoo/sketches/theta/DirectIntersectionTest.java +++ b/src/test/java/com/yahoo/sketches/theta/DirectIntersectionTest.java @@ -12,6 +12,7 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; import org.testng.annotations.Test; @@ -19,6 +20,7 @@ import com.yahoo.memory.WritableMemory; import com.yahoo.sketches.Family; import com.yahoo.sketches.SketchesArgumentException; +import com.yahoo.sketches.SketchesReadOnlyException; import com.yahoo.sketches.SketchesStateException; /** @@ -750,14 +752,25 @@ public void checkOverlappedDirect() { } WritableMemory memIn1 = WritableMemory.wrap(new byte[memBytes]); WritableMemory memIn2 = WritableMemory.wrap(new byte[memBytes]); - WritableMemory memOut = WritableMemory.wrap(new byte[memBytes]); + WritableMemory memInter = WritableMemory.wrap(new byte[memBytes]); + WritableMemory memComp = WritableMemory.wrap(new byte[memBytes]); CompactSketch csk1 = sk1.compact(true, memIn1); CompactSketch csk2 = sk2.compact(true, memIn2); - Intersection inter = Sketches.setOperationBuilder().buildIntersection(memOut); + Intersection inter = Sketches.setOperationBuilder().buildIntersection(memInter); inter.update(csk1); 
inter.update(csk2); - CompactSketch cskOut = inter.getResult(true, memOut); + CompactSketch cskOut = inter.getResult(true, memComp); assertEquals(cskOut.getEstimate(), 2.0, 0.0); + + Intersection interRO = (Intersection) SetOperation.wrap((Memory)memInter); + try { + interRO.intersect(sk1, sk2); + fail(); + } catch (SketchesReadOnlyException e) { } + try { + interRO.reset(); + fail(); + } catch (SketchesReadOnlyException e) { } } @Test From f0477af1db1bd4b8ea7f389ee1c43b87d7500b8d Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Apr 2019 10:51:46 -0700 Subject: [PATCH 15/16] Fix FindBugs warnings --- .../sketches/quantiles/DoublesSketch.java | 16 ++++++++++------ .../com/yahoo/sketches/theta/UnionImpl.java | 8 ++++---- .../sketches/quantiles/DebugUnionTest.java | 18 ++++++------------ 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/quantiles/DoublesSketch.java b/src/main/java/com/yahoo/sketches/quantiles/DoublesSketch.java index 6d5efe832..e71b09509 100644 --- a/src/main/java/com/yahoo/sketches/quantiles/DoublesSketch.java +++ b/src/main/java/com/yahoo/sketches/quantiles/DoublesSketch.java @@ -128,23 +128,27 @@ public abstract class DoublesSketch { static final int MIN_K = 2; static final int MAX_K = 1 << 15; - /** - * Parameter that controls space usage of sketch and accuracy of estimates. - */ - final int k_; - /** * Setting the seed makes the results of the sketch deterministic if the input values are * received in exactly the same order. This is only useful when performing test comparisons, * otherwise is not recommended. */ - public static Random rand = new Random(); + static Random rand = new Random(); + + /** + * Parameter that controls space usage of sketch and accuracy of estimates. 
+ */ + final int k_; DoublesSketch(final int k) { Util.checkK(k); k_ = k; } + synchronized static void setRandom(final long seed) { + DoublesSketch.rand = new Random(seed); + } + /** * Returns a new builder * @return a new builder diff --git a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java index 2611312ee..78d0a5d73 100644 --- a/src/main/java/com/yahoo/sketches/theta/UnionImpl.java +++ b/src/main/java/com/yahoo/sketches/theta/UnionImpl.java @@ -265,7 +265,7 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin Sketch.checkSketchAndMemoryFlags(sketchIn); unionThetaLong_ = min(unionThetaLong_, sketchIn.getThetaLong()); //Theta rule - unionEmpty_ &= sketchIn.isEmpty(); + unionEmpty_ = unionEmpty_ && sketchIn.isEmpty(); final int curCountIn = sketchIn.getRetainedEntries(true); if (curCountIn > 0) { if (sketchIn.isOrdered()) { //Only true if Compact. Use early stop @@ -415,7 +415,7 @@ private void processVer1(final Memory skMem) { gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //Theta rule - unionEmpty_ &= gadget_.isEmpty(); + unionEmpty_ = unionEmpty_ && gadget_.isEmpty(); } //has seedHash and p, could have 0 entries & theta, @@ -442,7 +442,7 @@ private void processVer2(final Memory skMem) { gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); - unionEmpty_ &= gadget_.isEmpty(); + unionEmpty_ = unionEmpty_ && gadget_.isEmpty(); } //has seedHash, p, could have 0 entries & theta, @@ -492,7 +492,7 @@ else if (preLongs == 2) { //curCount has to be > 0 and exact mode. 
Cannot be fro } } unionThetaLong_ = min(unionThetaLong_, gadget_.getThetaLong()); //sync thetaLongs - unionEmpty_ &= gadget_.isEmpty(); + unionEmpty_ = unionEmpty_ && gadget_.isEmpty(); } } diff --git a/src/test/java/com/yahoo/sketches/quantiles/DebugUnionTest.java b/src/test/java/com/yahoo/sketches/quantiles/DebugUnionTest.java index 9f27a3564..88d3d7d1d 100644 --- a/src/test/java/com/yahoo/sketches/quantiles/DebugUnionTest.java +++ b/src/test/java/com/yahoo/sketches/quantiles/DebugUnionTest.java @@ -10,7 +10,6 @@ import static org.testng.Assert.assertTrue; import java.util.HashSet; -import java.util.Random; import org.testng.annotations.Test; @@ -32,20 +31,20 @@ public void test() { UpdateDoublesSketch[] sketchArr = new UpdateDoublesSketch[numSketches]; //builds the input sketches, all on heap - DoublesSketch.rand = new Random(1); //make deterministic for test + DoublesSketch.setRandom(1); //make deterministic for test final HashSet set = new HashSet<>(); //holds input values for (int s = 0; s < numSketches; s++) { sketchArr[s] = buildHeapSketch(sketchK, n, valueLimit, set); } //loads the on heap union - DoublesSketch.rand = new Random(1); //make deterministic for test + DoublesSketch.setRandom(1); //make deterministic for test DoublesUnion hUnion = DoublesUnion.builder().setMaxK(unionK).build(); for (int s = 0; s < numSketches; s++) { hUnion.update(sketchArr[s]); } DoublesSketch hSketch = hUnion.getResult(); //loads the direct union - DoublesSketch.rand = new Random(1); //make deterministic for test + DoublesSketch.setRandom(1); //make deterministic for test DoublesUnion dUnion; DoublesSketch dSketch; try ( WritableDirectHandle wdh = WritableMemory.allocateDirect(10_000_000) ) { @@ -61,32 +60,27 @@ public void test() { assertEquals(hCount, dCount); //Retained items must be the same - double[] heapItems = new double[hCount]; - double[] directItems = new double[dCount]; int hErrors = 0; int dErrors = 0; DoublesSketchIterator hit = hSketch.iterator(); 
DoublesSketchIterator dit = dSketch.iterator(); - int i = 0; + while (hit.next() && dit.next()) { double v = hit.getValue(); - heapItems[i] = v; if (!set.contains(v)) { hErrors++; } double w = dit.getValue(); - directItems[i] = w; if (!set.contains(w)) { dErrors++; } - i++; assertEquals(v, w, 0); //Items must be returned in same order and be equal } assertTrue(hErrors == 0); assertTrue(dErrors == 0); - //println("HeapUnion : Values: " + hCount + ", errors: " + hErrors); + println("HeapUnion : Values: " + hCount + ", errors: " + hErrors); //println(hSketch.toString(true, true)); - //println("DirectUnion: Values: " + dCount + ", errors: " + dErrors); + println("DirectUnion: Values: " + dCount + ", errors: " + dErrors); //println(dSketch.toString(true, true)); } From 65da0ff567f5daa6de8d5e1d8be761e4b1d98027 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Apr 2019 11:49:19 -0700 Subject: [PATCH 16/16] Suppress LGTM warnings --- .../sketches/theta/PairwiseSetOperations.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java index 5c77db7db..d78251224 100644 --- a/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java +++ b/src/main/java/com/yahoo/sketches/theta/PairwiseSetOperations.java @@ -97,22 +97,24 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s return null; } case 6: { //skA == null; skB == empty; return empty - return (skB.getThetaLong() == Long.MAX_VALUE) ? skB : //lgtm [java/dereferenced-value-may-be-null] - HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); + final long thetaLong = skB.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + return (thetaLong == Long.MAX_VALUE) ? 
skB + : HeapCompactOrderedSketch.compact(new long[0], true, skB.getSeedHash(), 0, Long.MAX_VALUE); } case 7: { //skA == null; skB == valid; return skB return maybeCutback(skB, k); } case 9: { //skA == empty; skB == null; return empty - return (skA.getThetaLong() == Long.MAX_VALUE) ? skA : //lgtm [java/dereferenced-value-may-be-null] - HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); + final long thetaLong = skA.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + return (thetaLong == Long.MAX_VALUE) ? skA + : HeapCompactOrderedSketch.compact(new long[0], true, skA.getSeedHash(), 0, Long.MAX_VALUE); } case 10: { //skA == empty; skB == empty; return empty final short seedHash = seedHashesCheck(skA, skB); - if (skA.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] - { return skA; } - if (skB.getThetaLong() == Long.MAX_VALUE) //lgtm [java/dereferenced-value-may-be-null] - { return skB; } + long thetaLong = skA.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + if (thetaLong == Long.MAX_VALUE) { return skA; } + thetaLong = skB.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + if (thetaLong == Long.MAX_VALUE) { return skB; } return HeapCompactOrderedSketch.compact(new long[0], true, seedHash, 0, Long.MAX_VALUE); } case 11: { //skA == empty; skB == valid; return skB @@ -135,7 +137,9 @@ public static CompactSketch union(final CompactSketch skA, final CompactSketch s //Both sketches are valid with matching seedhashes and ordered //Full Union operation: - long thetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong()); //Theta rule + final long thetaLongA = skA.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + final long thetaLongB = skB.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] + long thetaLong = Math.min(thetaLongA, thetaLongB); //Theta rule final long[] cacheA = (skA.hasMemory()) ? 
skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.hasMemory()) ? skB.getCache() : skB.getCache().clone(); final int aLen = cacheA.length;