diff --git a/pom.xml b/pom.xml
index 93ca18b71..f5ab892b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,7 +32,7 @@ under the License.
Presenting an intersection with a null argument will throw an exception.
* - * * @param dstOrdered * See Destination Ordered * * @param dstMem * See Destination Memory. * - * @return the result of this operation as a CompactSketch of the chosen form + * @return the result of this operation as a CompactSketch stored in the given dstMem, + * which can be either on or off-heap. */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); @@ -90,7 +94,7 @@ public CompactSketch getResult() { /** * Resets this Intersection for stateful operations only. * The seed remains intact, otherwise reverts to - * the Universal Set, theta of 1.0 and empty = false. + * the Universal Set: theta = 1.0, no retained data and empty = false. */ public abstract void reset(); diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java index b51140968..4a8811ca1 100644 --- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java @@ -219,7 +219,7 @@ static IntersectionImpl wrapInstance( @Override public CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, final WritableMemory dstMem) { - if ((wmem_ != null) && readOnly_) { throw new SketchesReadOnlyException(); } + if (wmem_ != null && readOnly_) { throw new SketchesReadOnlyException(); } hardReset(); intersect(a); intersect(b); @@ -233,7 +233,7 @@ public void intersect(final Sketch sketchIn) { if (sketchIn == null) { throw new SketchesArgumentException("Intersection argument must not be null."); } - if ((wmem_ != null) && readOnly_) { throw new SketchesReadOnlyException(); } + if (wmem_ != null && readOnly_) { throw new SketchesReadOnlyException(); } if (empty_ || sketchIn.isEmpty()) { //empty rule //Because of the def of null above and the Empty Rule (which is OR), empty_ must be true. //Whatever the current internal state, we make our local empty. 
@@ -262,14 +262,14 @@ public void intersect(final Sketch sketchIn) { final int sketchInEntries = sketchIn.getRetainedEntries(true); //states 1,2,3,6 - if ((curCount_ == 0) || (sketchInEntries == 0)) { + if (curCount_ == 0 || sketchInEntries == 0) { curCount_ = 0; if (wmem_ != null) { insertCurCount(wmem_, 0); } hashTable_ = null; //No need for a HT. Don't bother clearing mem if valid } //end of states 1,2,3,6 // state 5 - else if ((curCount_ < 0) && (sketchInEntries > 0)) { + else if (curCount_ < 0 && sketchInEntries > 0) { curCount_ = sketchIn.getRetainedEntries(true); final int requiredLgArrLongs = minLgHashTableSize(curCount_, REBUILD_THRESHOLD); final int priorLgArrLongs = lgArrLongs_; //prior only used in error message @@ -295,7 +295,7 @@ else if ((curCount_ < 0) && (sketchInEntries > 0)) { } //end of state 5 //state 7 - else if ((curCount_ > 0) && (sketchInEntries > 0)) { + else if (curCount_ > 0 && sketchInEntries > 0) { //Sets resulting hashTable, curCount and adjusts lgArrLongs performIntersect(sketchIn); } //end of state 7 @@ -339,6 +339,16 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds dstMem, compactCache); } + @Override + public boolean hasResult() { + return wmem_ != null ? wmem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; + } + + @Override + public boolean isSameResource(final Memory that) { + return wmem_ != null ? wmem_.isSameResource(that) : false; + } + @Override public void reset() { hardReset(); @@ -347,7 +357,7 @@ public void reset() { @Override public byte[] toByteArray() { final int preBytes = CONST_PREAMBLE_LONGS << 3; - final int dataBytes = (curCount_ > 0) ? 8 << lgArrLongs_ : 0; + final int dataBytes = curCount_ > 0 ? 
8 << lgArrLongs_ : 0; final byte[] byteArrOut = new byte[preBytes + dataBytes]; if (wmem_ != null) { wmem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes); @@ -376,16 +386,6 @@ public byte[] toByteArray() { return byteArrOut; } - @Override - public boolean hasResult() { - return (wmem_ != null) ? wmem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; - } - - @Override - public boolean isSameResource(final Memory that) { - return (wmem_ != null) ? wmem_.isSameResource(that) : false; - } - //restricted /** @@ -405,7 +405,7 @@ boolean isEmpty() { @Override long[] getCache() { if (wmem_ == null) { - return (hashTable_ != null) ? hashTable_ : new long[0]; + return hashTable_ != null ? hashTable_ : new long[0]; } //Direct final int arrLongs = 1 << lgArrLongs_; @@ -426,7 +426,7 @@ long getThetaLong() { private void performIntersect(final Sketch sketchIn) { // curCount and input data are nonzero, match against HT - assert ((curCount_ > 0) && (!empty_)); + assert curCount_ > 0 && !empty_; final long[] cacheIn = sketchIn.getCache(); final int arrLongsIn = cacheIn.length; final long[] hashTable; @@ -458,7 +458,7 @@ private void performIntersect(final Sketch sketchIn) { //either unordered compact or hash table for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= thetaLong_)) { continue; } + if (hashIn <= 0L || hashIn >= thetaLong_) { continue; } final int foundIdx = hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; @@ -505,7 +505,7 @@ private void moveDataToTgt(final long[] arr, final int count) { tmpCnt++; } } - assert (tmpCnt == count) : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; + assert tmpCnt == count : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; } private void hardReset() { diff --git a/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java 
b/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java index 8abea8800..4a5d33708 100644 --- a/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java +++ b/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java @@ -54,7 +54,7 @@ public final class JaccardSimilarity { */ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { //Corner case checks - if ((sketchA == null) || (sketchB == null)) { return ZEROS.clone(); } + if (sketchA == null || sketchB == null) { return ZEROS.clone(); } if (sketchA == sketchB) { return ONES.clone(); } if (sketchA.isEmpty() && sketchB.isEmpty()) { return ONES.clone(); } if (sketchA.isEmpty() || sketchB.isEmpty()) { return ZEROS.clone(); } @@ -68,8 +68,8 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { final int newK = max(min(ceilingPowerOf2(countA + countB), maxK), minK); final Union union = SetOperation.builder().setNominalEntries(newK).buildUnion(); - union.update(sketchA); - union.update(sketchB); + union.union(sketchA); + union.union(sketchB); final Sketch unionAB = union.getResult(false, null); final long thetaLongUAB = unionAB.getThetaLong(); final long thetaLongA = sketchA.getThetaLong(); @@ -77,8 +77,8 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { final int countUAB = unionAB.getRetainedEntries(true); //Check for identical data - if ((countUAB == countA) && (countUAB == countB) - && (thetaLongUAB == thetaLongA) && (thetaLongUAB == thetaLongB)) { + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { return ONES.clone(); } @@ -105,7 +105,7 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { */ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { //Corner case checks - if ((sketchA == null) || (sketchB == null)) { return false; } + if (sketchA == null || sketchB == null) { return false; } if 
(sketchA == sketchB) { return true; } if (sketchA.isEmpty() && sketchB.isEmpty()) { return true; } if (sketchA.isEmpty() || sketchB.isEmpty()) { return false; } @@ -116,8 +116,8 @@ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { //Create the Union final Union union = SetOperation.builder().setNominalEntries(ceilingPowerOf2(countA + countB)).buildUnion(); - union.update(sketchA); - union.update(sketchB); + union.union(sketchA); + union.union(sketchB); final Sketch unionAB = union.getResult(); final long thetaLongUAB = unionAB.getThetaLong(); final long thetaLongA = sketchA.getThetaLong(); @@ -125,8 +125,8 @@ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { final int countUAB = unionAB.getRetainedEntries(true); //Check for identical counts and thetas - if ((countUAB == countA) && (countUAB == countB) - && (thetaLongUAB == thetaLongA) && (thetaLongUAB == thetaLongB)) { + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { return true; } return false; diff --git a/src/main/java/org/apache/datasketches/theta/Union.java b/src/main/java/org/apache/datasketches/theta/Union.java index 1d74a2f3e..1e9ea7a40 100644 --- a/src/main/java/org/apache/datasketches/theta/Union.java +++ b/src/main/java/org/apache/datasketches/theta/Union.java @@ -35,6 +35,14 @@ public Family getFamily() { return Family.UNION; } + /** + * Gets the result of this operation as an ordered CompactSketch on the Java heap. + * This does not disturb the underlying data structure of the union. + * Therefore, it is OK to continue updating the union after this operation. + * @return the result of this operation as an ordered CompactSketch on the Java heap + */ + public abstract CompactSketch getResult(); + /** * Gets the result of this operation as a CompactSketch of the chosen form. * This does not disturb the underlying data structure of the union. 
@@ -50,14 +58,6 @@ public Family getFamily() { */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); - /** - * Gets the result of this operation as an ordered CompactSketch on the Java heap. - * This does not disturb the underlying data structure of the union. - * Therefore, it is OK to continue updating the union after this operation. - * @return the result of this operation as an ordered CompactSketch on the Java heap - */ - public abstract CompactSketch getResult(); - /** * Resets this Union. The seed remains intact, otherwise reverts back to its virgin state. */ @@ -108,6 +108,20 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO * * @param sketchIn The incoming sketch. */ + public abstract void union(Sketch sketchIn); + + + /** + * Perform a Union operation with this union and the given on-heap sketch of the Theta Family. + * This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015). + * + *This method can be repeatedly called. + * If the given sketch is null it is interpreted as an empty sketch.
+ * + * @param sketchIn The incoming sketch. + * @deprecated 2.0.0. Use {@link #union(Sketch)} instead. + */ + @Deprecated public abstract void update(Sketch sketchIn); /** @@ -120,6 +134,20 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO * * @param mem Memory image of sketch to be merged */ + public abstract void union(Memory mem); + + /** + * Perform a Union operation with this union and the given Memory image of any sketch of the + * Theta Family. The input image may be from earlier versions of the Theta Compact Sketch, + * called the SetSketch (circa 2012), which predates Open Source; such images are compact and ordered. + * + *This method can be repeatedly called. + * If the given sketch is null it is interpreted as an empty sketch.
+ * + * @param mem Memory image of sketch to be merged + * @deprecated 2.0.0. Use {@link #union(Memory)} instead. + */ + @Deprecated public abstract void update(Memory mem); /** diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index 72515f671..700ee71d9 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -210,7 +210,7 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { @Override public boolean isSameResource(final Memory that) { - return (gadget_ instanceof DirectQuickSelectSketchR) + return gadget_ instanceof DirectQuickSelectSketchR ? gadget_.getMemory().isSameResource(that) : false; } @@ -224,19 +224,19 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds final int gadgetCurCount = gadget_.getRetainedEntries(true); final int k = 1 << gadget_.getLgNomLongs(); final long[] gadgetCacheCopy = - (gadget_.hasMemory()) ? gadget_.getCache() : gadget_.getCache().clone(); + gadget_.hasMemory() ? gadget_.getCache() : gadget_.getCache().clone(); //Pull back to k final long curGadgetThetaLong = gadget_.getThetaLong(); - final long adjGadgetThetaLong = (gadgetCurCount > k) + final long adjGadgetThetaLong = gadgetCurCount > k ? selectExcludingZeros(gadgetCacheCopy, gadgetCurCount, k + 1) : curGadgetThetaLong; //Finalize Theta and curCount - final long unionThetaLong = (gadget_.hasMemory()) + final long unionThetaLong = gadget_.hasMemory() ? gadget_.getMemory().getLong(UNION_THETA_LONG) : unionThetaLong_; final long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); - final int curCountOut = (minThetaLong < curGadgetThetaLong) + final int curCountOut = minThetaLong < curGadgetThetaLong ? 
HashOperations.count(gadgetCacheCopy, minThetaLong) : gadgetCurCount; @@ -277,11 +277,17 @@ public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boo return getResult(dstOrdered, dstMem); } + @Deprecated @Override - public void update(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 + public void update(final Sketch sketchIn) { + union(sketchIn); + } + + @Override + public void union(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 //UNION Empty Rule: AND the empty states. - if ((sketchIn == null) || sketchIn.isEmpty()) { + if (sketchIn == null || sketchIn.isEmpty()) { //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). Nothing changes return; } @@ -303,7 +309,7 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin final Memory skMem = ((CompactSketch) sketchIn).getMemory(); final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preambleLongs + i) << 3; + final int offsetBytes = preambleLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed @@ -321,9 +327,9 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin else { //either not-ordered compact or Hash Table form. A HT may have dirty values. 
final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy final int arrLongs = cacheIn.length; - for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { + for (int i = 0, c = 0; i < arrLongs && c < curCountIn; i++ ) { final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values + if (hashIn <= 0L || hashIn >= unionThetaLong_) { continue; } //rejects dirty values gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed c++; //ensures against invalid state inside the incoming sketch } @@ -337,8 +343,14 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin } } + @Deprecated @Override public void update(final Memory skMem) { + union(skMem); + } + + @Override + public void union(final Memory skMem) { if (skMem == null) { return; } final int cap = (int) skMem.getCapacity(); if (cap < 16) { return; } //empty or garbage @@ -346,7 +358,7 @@ public void update(final Memory skMem) { final int fam = extractFamilyID(skMem); if (serVer == 3) { //The OpenSource sketches (Aug 4, 2015) starts with serVer = 3 - if ((fam < 1) || (fam > 3)) { + if (fam < 1 || fam > 3) { throw new SketchesArgumentException( "Family must be Alpha, QuickSelect, or Compact: " + Family.idToFamily(fam)); } @@ -407,7 +419,7 @@ private void processVer3(final Memory skMem) { if (ordered) { //must be compact for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preLongs + i) << 3; + final int offsetBytes = preLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed @@ -416,12 +428,12 @@ private void processVer3(final Memory skMem) { else { //not-ordered, could be compact or hash-table form final boolean compact = (flags & COMPACT_FLAG_MASK) != 0; - final int size = (compact) ? 
curCountIn : 1 << extractLgArrLongs(skMem); + final int size = compact ? curCountIn : 1 << extractLgArrLongs(skMem); for (int i = 0; i < size; i++ ) { - final int offsetBytes = (preLongs + i) << 3; + final int offsetBytes = preLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } + if (hashIn <= 0L || hashIn >= unionThetaLong_) { continue; } gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } } diff --git a/src/main/java/org/apache/datasketches/theta/package-info.java b/src/main/java/org/apache/datasketches/theta/package-info.java index 12c886f5b..9268109bf 100644 --- a/src/main/java/org/apache/datasketches/theta/package-info.java +++ b/src/main/java/org/apache/datasketches/theta/package-info.java @@ -18,46 +18,12 @@ */ /** - *The theta package contains all the sketch classes that are members of the - * Theta Sketch Framework. - * The basic sketching functionality in this package is also - * accessible from Hadoop Pig UDFs found in the sketches-pig repository, + *
The theta package contains all the sketch classes that are members of the + * Theta Sketch Framework. + * The basic sketching functionality in this package is also + * accessible from Hadoop Pig UDFs found in the sketches-pig repository, * and from Hadoop Hive UADFs and UDFs found in the sketches-hive repository. *
- *- public void SimpleCountingSketch() { - int k = 4096; - int u = 1000000; - - UpdateSketch sketch = UpdateSketch.builder().build(k); - for (int i = 0; i < u; i++) { - sketch.update(i); - } - - println(sketch.toString()); - } - -### HeapQuickSelectSketch SUMMARY: - Nominal Entries (k) : 4096 - Estimate : 1002714.745231455 - Upper Bound, 95% conf : 1027777.3354974985 - Lower Bound, 95% conf : 978261.4472857157 - p : 1.0 - Theta (double) : 0.00654223948655085 - Theta (long) : 60341508738660257 - Theta (long, hex : 00d66048519437a1 - EstMode? : true - Empty? : false - Resize Factor : 8 - Array Size Entries : 8192 - Retained Entries : 6560 - Update Seed : 9001 - Seed Hash : ffff93cc -### END SKETCH SUMMARY -* * @author Lee Rhodes */ diff --git a/src/main/java/org/apache/datasketches/tuple/AnotB.java b/src/main/java/org/apache/datasketches/tuple/AnotB.java index 10202bebc..31a83e26b 100644 --- a/src/main/java/org/apache/datasketches/tuple/AnotB.java +++ b/src/main/java/org/apache/datasketches/tuple/AnotB.java @@ -199,7 +199,7 @@ public void notB(final org.apache.datasketches.theta.Sketch skB) { * * @param reset If true, clears this operator to the empty state after this result is * returned. Set this to false if you wish to obtain an intermediate result. - * @return the result of this operation as a {@link CompactSketch}. + * @return the result of this operation as an unordered {@link CompactSketch}. */ public CompactSketch