From c6f371098426d73fe00990408b514d1ddd5a2d84 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Feb 2021 16:57:52 -0800 Subject: [PATCH 1/3] Start Release of 2.0.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 93ca18b71..f5ab892b9 100644 --- a/pom.xml +++ b/pom.xml @@ -32,7 +32,7 @@ under the License. org.apache.datasketches datasketches-java - 1.4.0-SNAPSHOT + 2.0.0 jar ${project.artifactId} From ccfc4497ca67b554a5c2630d9b7a4608f0c83d8a Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 4 Feb 2021 12:32:31 -0800 Subject: [PATCH 2/3] Fixes in preparation for 2.0.0-RC2 --- .../datasketches/theta/package-info.java | 42 +-------- .../datasketches/tuple/Intersection.java | 12 +-- .../datasketches/tuple/UpdatableSummary.java | 5 +- .../tuple/adouble/DoubleSummary.java | 7 +- .../tuple/aninteger/IntegerSummary.java | 10 +- .../tuple/strings/ArrayOfStringsSummary.java | 8 +- .../datasketches/theta/ExamplesTest.java | 59 +++++++++++- .../datasketches/tuple/TupleExamplesTest.java | 93 +++++++++++++++++++ 8 files changed, 172 insertions(+), 64 deletions(-) create mode 100644 src/test/java/org/apache/datasketches/tuple/TupleExamplesTest.java diff --git a/src/main/java/org/apache/datasketches/theta/package-info.java b/src/main/java/org/apache/datasketches/theta/package-info.java index 12c886f5b..9268109bf 100644 --- a/src/main/java/org/apache/datasketches/theta/package-info.java +++ b/src/main/java/org/apache/datasketches/theta/package-info.java @@ -18,46 +18,12 @@ */ /** - *

The theta package contains all the sketch classes that are members of the - * Theta Sketch Framework. - * The basic sketching functionality in this package is also - * accessible from Hadoop Pig UDFs found in the sketches-pig repository, + *

The theta package contains all the sketch classes that are members of the + * Theta Sketch Framework. + * The basic sketching functionality in this package is also + * accessible from Hadoop Pig UDFs found in the sketches-pig repository, * and from Hadoop Hive UADFs and UDFs found in the sketches-hive repository. *

- *

Simple Java Example

- * Note: The complete example code can be found in the parallel package under src/test/java and - * with the class name "ExamplesTest.java". -
-  public void SimpleCountingSketch() {
-    int k = 4096;
-    int u = 1000000;
-    
-    UpdateSketch sketch = UpdateSketch.builder().build(k);
-    for (int i = 0; i < u; i++) {
-      sketch.update(i);
-    }
-    
-    println(sketch.toString());
-  }
-
-### HeapQuickSelectSketch SUMMARY: 
-   Nominal Entries (k)     : 4096
-   Estimate                : 1002714.745231455
-   Upper Bound, 95% conf   : 1027777.3354974985
-   Lower Bound, 95% conf   : 978261.4472857157
-   p                       : 1.0
-   Theta (double)          : 0.00654223948655085
-   Theta (long)            : 60341508738660257
-   Theta (long, hex        : 00d66048519437a1
-   EstMode?                : true
-   Empty?                  : false
-   Resize Factor           : 8
-   Array Size Entries      : 8192
-   Retained Entries        : 6560
-   Update Seed             : 9001
-   Seed Hash               : ffff93cc
-### END SKETCH SUMMARY
-
* * @author Lee Rhodes */ diff --git a/src/main/java/org/apache/datasketches/tuple/Intersection.java b/src/main/java/org/apache/datasketches/tuple/Intersection.java index c10cad7b5..1dad7d244 100644 --- a/src/main/java/org/apache/datasketches/tuple/Intersection.java +++ b/src/main/java/org/apache/datasketches/tuple/Intersection.java @@ -47,7 +47,7 @@ public class Intersection { private final SummarySetOperations summarySetOps_; private boolean empty_; private long thetaLong_; - private HashTables hashTables_; + private final HashTables hashTables_; private boolean firstCall_; /** @@ -76,7 +76,7 @@ public void update(final Sketch sketchIn) { final int countIn = sketchIn.getRetainedEntries(); thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule // Empty rule extended in case incoming sketch does not have empty bit properly set - empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE); + empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE; if (countIn == 0) { hashTables_.clear(); return; @@ -128,7 +128,7 @@ public void update(final Sketch sketchIn) { * Updates the internal set by intersecting it with the given Theta sketch. * @param sketchIn input Theta Sketch to intersect with the internal state. It may not be null. * @param summary the given proxy summary for the Theta Sketch, which doesn't have one. - * It may not be null. + * It will be copied for each matching index. It may not be null. 
*/ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) { if (sketchIn == null) { throw new SketchesArgumentException("Sketch may not be null"); } @@ -140,7 +140,7 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S final int countIn = sketchIn.getRetainedEntries(true); thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule // Empty rule extended in case incoming sketch does not have empty bit properly set - empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE); + empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE; if (countIn == 0) { hashTables_.clear(); return; @@ -180,7 +180,7 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S matchSummaries = (S[]) Array.newInstance(summaryType, maxMatchSize); } matchHashArr[matchCount] = hash; - matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)mySummary.copy()); + matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)summary.copy()); matchCount++; } hashTables_.fromArrays(matchHashArr, matchSummaries, matchCount); @@ -207,7 +207,7 @@ public CompactSketch getResult() { int cnt = 0; for (int i = 0; i < tableSize; i++) { final long hash = hashTables_.hashTable_[i]; - if ((hash == 0) || (hash > thetaLong_)) { continue; } + if (hash == 0 || hash > thetaLong_) { continue; } final S summary = hashTables_.summaryTable_[i]; if (summaries == null) { summaries = (S[]) Array.newInstance(summaryType, hashTables_.count_); diff --git a/src/main/java/org/apache/datasketches/tuple/UpdatableSummary.java b/src/main/java/org/apache/datasketches/tuple/UpdatableSummary.java index ac5ce9333..4ddb71d1b 100644 --- a/src/main/java/org/apache/datasketches/tuple/UpdatableSummary.java +++ b/src/main/java/org/apache/datasketches/tuple/UpdatableSummary.java @@ -27,9 +27,10 @@ public interface UpdatableSummary extends Summary { /** * This is to provide a method of updating summaries. 
- * This should not be called by the user. + * This is primarily used internally. * @param value update value + * @return this */ - public void update(U value); + UpdatableSummary update(U value); } diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSummary.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSummary.java index ae6b7c09b..76c16330e 100644 --- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSummary.java +++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSummary.java @@ -37,7 +37,7 @@ public final class DoubleSummary implements UpdatableSummary { /** * The aggregation modes for this Summary */ - public static enum Mode { + public enum Mode { /** * The aggregation mode is the summation function. @@ -97,10 +97,10 @@ public DoubleSummary(final Mode mode) { } @Override - public void update(final Double value) { + public DoubleSummary update(final Double value) { switch (mode_) { case Sum: - value_ += value.doubleValue(); + value_ += value; break; case Min: if (value < value_) { value_ = value; } @@ -112,6 +112,7 @@ public void update(final Double value) { value_ = 1.0; break; } + return this; } @Override diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSummary.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSummary.java index a0e3e2982..3af888b10 100644 --- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSummary.java +++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSummary.java @@ -37,7 +37,7 @@ public class IntegerSummary implements UpdatableSummary { /** * The aggregation modes for this Summary */ - public static enum Mode { + public enum Mode { /** * The aggregation mode is the summation function. @@ -96,13 +96,8 @@ public IntegerSummary(final Mode mode) { } } - /** - * Updates an instance of IntegerSummary with the given value. - * This should not be called by the user. - * @param value The given value. 
- */ @Override - public void update(final Integer value) { + public IntegerSummary update(final Integer value) { switch (mode_) { case Sum: value_ += value; @@ -117,6 +112,7 @@ public void update(final Integer value) { value_ = 1; break; } + return this; } @Override diff --git a/src/main/java/org/apache/datasketches/tuple/strings/ArrayOfStringsSummary.java b/src/main/java/org/apache/datasketches/tuple/strings/ArrayOfStringsSummary.java index 48cfbc6fb..75ab751b5 100644 --- a/src/main/java/org/apache/datasketches/tuple/strings/ArrayOfStringsSummary.java +++ b/src/main/java/org/apache/datasketches/tuple/strings/ArrayOfStringsSummary.java @@ -90,11 +90,11 @@ public byte[] toByteArray() { //From UpdatableSummary @Override - public void update(final String[] value) { + public ArrayOfStringsSummary update(final String[] value) { if (nodesArr == null) { nodesArr = value.clone(); } - //otherwise do not update. + return this; } //From Object @@ -106,7 +106,7 @@ public int hashCode() { @Override public boolean equals(final Object summary) { - if ((summary == null) || !(summary instanceof ArrayOfStringsSummary)) { + if (summary == null || !(summary instanceof ArrayOfStringsSummary)) { return false; } final String thatStr = stringConcat(((ArrayOfStringsSummary) summary).nodesArr); @@ -152,7 +152,7 @@ private static class ComputeBytes { nodeLengthsArr_[i] = nodeBytesArr_[i].length; sumNodeBytes += nodeLengthsArr_[i]; } - totBytes_ = sumNodeBytes + ((numNodes_ + 1) * Integer.BYTES) + 1; + totBytes_ = sumNodeBytes + (numNodes_ + 1) * Integer.BYTES + 1; } } diff --git a/src/test/java/org/apache/datasketches/theta/ExamplesTest.java b/src/test/java/org/apache/datasketches/theta/ExamplesTest.java index 64f618aab..d4b308025 100644 --- a/src/test/java/org/apache/datasketches/theta/ExamplesTest.java +++ b/src/test/java/org/apache/datasketches/theta/ExamplesTest.java @@ -29,10 +29,10 @@ public class ExamplesTest { @Test public void simpleCountingSketch() { - int k = 4096; - int u = 
1000000; + final int k = 4096; + final int u = 1000000; - UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(); for (int i = 0; i < u; i++) { sketch.update(i); } @@ -59,6 +59,57 @@ Nominal Entries (k) : 4096 ### END SKETCH SUMMARY */ + @Test + public void theta2dot0Examples() { + //Load source sketches + final UpdateSketchBuilder bldr = UpdateSketch.builder(); + final UpdateSketch skA = bldr.build(); + final UpdateSketch skB = bldr.build(); + for (int i = 1; i <= 1000; i++) { + skA.update(i); + skB.update(i + 250); + } + + //Union Stateless: + Union union = SetOperation.builder().buildUnion(); + CompactSketch csk = union.union(skA, skB); + assert csk.getEstimate() == 1250; + + //Union Stateful: + union = SetOperation.builder().buildUnion(); + union.update(skA); //first call + union.update(skB); //2nd through nth calls + //... + csk = union.getResult(); + assert csk.getEstimate() == 1250; + + //Intersection Stateless: + Intersection inter = SetOperation.builder().buildIntersection(); + csk = inter.intersect(skA, skB); + assert csk.getEstimate() == 750; + + //Intersection Stateful: + inter = SetOperation.builder().buildIntersection(); + inter.intersect(skA); //first call + inter.intersect(skB); //2nd through nth calls + //... + csk = inter.getResult(); + assert csk.getEstimate() == 750; + + //AnotB Stateless: + AnotB diff = SetOperation.builder().buildANotB(); + csk = diff.aNotB(skA, skB); + assert csk.getEstimate() == 250; + + //AnotB Stateful: + diff = SetOperation.builder().buildANotB(); + diff.setA(skA); //first call + diff.notB(skB); //2nd through nth calls + //... 
+ csk = diff.getResult(true); + assert csk.getEstimate() == 250; + } + @Test public void printlnTest() { println("PRINTING: "+this.getClass().getName()); @@ -67,7 +118,7 @@ public void printlnTest() { /** * @param s value to print */ - static void println(String s) { + static void println(final String s) { //System.out.println(s); //enable/disable here } diff --git a/src/test/java/org/apache/datasketches/tuple/TupleExamplesTest.java b/src/test/java/org/apache/datasketches/tuple/TupleExamplesTest.java new file mode 100644 index 000000000..35d67b662 --- /dev/null +++ b/src/test/java/org/apache/datasketches/tuple/TupleExamplesTest.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.tuple; + +import org.apache.datasketches.theta.UpdateSketch; +import org.apache.datasketches.theta.UpdateSketchBuilder; +import org.apache.datasketches.tuple.aninteger.IntegerSummary; +import org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode; +import org.apache.datasketches.tuple.aninteger.IntegerSummaryFactory; +import org.apache.datasketches.tuple.aninteger.IntegerSummarySetOperations; +import org.testng.annotations.Test; + + +@SuppressWarnings("javadoc") +public class TupleExamplesTest { + private final IntegerSummary.Mode umode = Mode.Sum; + private final IntegerSummary.Mode imode = Mode.AlwaysOne; + private final IntegerSummarySetOperations isso = new IntegerSummarySetOperations(umode, imode); + private final IntegerSummaryFactory ufactory = new IntegerSummaryFactory(umode); + private final IntegerSummaryFactory ifactory = new IntegerSummaryFactory(imode); + private final UpdateSketchBuilder thetaBldr = UpdateSketch.builder(); + private final UpdatableSketchBuilder tupleBldr = + new UpdatableSketchBuilder<>(ufactory); + + + @Test + public void tuple2dot0Examples() { + //Load source sketches + final UpdatableSketch tupleSk = tupleBldr.build(); + final UpdateSketch thetaSk = thetaBldr.build(); + for (int i = 1; i <= 12; i++) { + tupleSk.update(i, 1); + thetaSk.update(i + 3); + } + + //Union + final Union union = new Union<>(isso); + union.update(tupleSk); + union.update(thetaSk, ufactory.newSummary().update(1)); + final CompactSketch ucsk = union.getResult(); + int entries = ucsk.getRetainedEntries(); + println("Union: " + entries); + final SketchIterator uiter = ucsk.iterator(); + int counter = 1; + while (uiter.next()) { + final int i = uiter.getSummary().getValue(); + println(counter++ + ", " + i); //9 entries = 2, 6 entries = 1 + } + + //Intersection + final Intersection inter = new Intersection<>(isso); + inter.update(tupleSk); + inter.update(thetaSk, ifactory.newSummary().update(1)); + final CompactSketch 
icsk = inter.getResult(); + entries = icsk.getRetainedEntries(); + println("Intersection: " + entries); + final SketchIterator iiter = icsk.iterator(); + counter = 1; + while (iiter.next()) { + final int i = iiter.getSummary().getValue(); + println(counter++ + ", " + i); //9 entries = 1 + } + } + + @Test + public void printlnTest() { + println("PRINTING: "+this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + System.out.println(s); //enable/disable here + } +} From 9de80d7221af1c2f479997658f0e35b296e4a5cd Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 4 Feb 2021 19:39:03 -0800 Subject: [PATCH 3/3] Fixed error in Tuple Intersection.update(Sketch, S summary), line 184. Also added self return to UpdatableSummary update(U value). This simplifies user code. Also fixed naming of update methods where they should have been deprecated. --- .../org/apache/datasketches/theta/AnotB.java | 11 +- .../datasketches/theta/Intersection.java | 14 +- .../datasketches/theta/IntersectionImpl.java | 40 +- .../datasketches/theta/JaccardSimilarity.java | 20 +- .../org/apache/datasketches/theta/Union.java | 44 +- .../apache/datasketches/theta/UnionImpl.java | 42 +- .../org/apache/datasketches/tuple/AnotB.java | 8 +- .../datasketches/tuple/Intersection.java | 5 +- .../org/apache/datasketches/tuple/Union.java | 15 +- .../datasketches/theta/DirectUnionTest.java | 542 +++++++++--------- .../apache/datasketches/theta/EmptyTest.java | 68 +-- .../datasketches/theta/ExamplesTest.java | 4 +- .../datasketches/theta/HeapUnionTest.java | 412 ++++++------- .../datasketches/theta/PreambleUtilTest.java | 38 +- .../datasketches/theta/SetOperationTest.java | 232 ++++---- .../datasketches/theta/SketchesTest.java | 111 ++-- .../datasketches/theta/UnionImplTest.java | 182 +++--- .../datasketches/tuple/TupleExamplesTest.java | 11 +- 18 files changed, 928 insertions(+), 871 deletions(-) diff --git 
a/src/main/java/org/apache/datasketches/theta/AnotB.java b/src/main/java/org/apache/datasketches/theta/AnotB.java index c194cd66a..7ecc3bd73 100644 --- a/src/main/java/org/apache/datasketches/theta/AnotB.java +++ b/src/main/java/org/apache/datasketches/theta/AnotB.java @@ -108,20 +108,21 @@ public Family getFamily() { public abstract void notB(Sketch skB); /** - * Gets the result of the mutistep, stateful operation AnotB that have been executed with calls + * Gets the result of the multistep, stateful operation AnotB that has been executed with calls * to {@link #setA(Sketch)} and ({@link #notB(Sketch)} or * {@link #notB(org.apache.datasketches.theta.Sketch)}). * * @param reset If true, clears this operator to the empty state after this result is * returned. Set this to false if you wish to obtain an intermediate result. + * * @return the result of this operation as an ordered, on-heap {@link CompactSketch}. */ public abstract CompactSketch getResult(boolean reset); /** - * Gets the result of this stateful set operation as a CompactSketch of the form based on - * the input arguments. - * The stateful input operations are {@link #setA(Sketch)} and {@link #notB(Sketch)}. + * Gets the result of the multistep, stateful operation AnotB that has been executed with calls + * to {@link #setA(Sketch)} and ({@link #notB(Sketch)} or + * {@link #notB(org.apache.datasketches.theta.Sketch)}). * * @param dstOrdered If true, the result will be an ordered {@link CompactSketch}. * See Destination Ordered. @@ -132,7 +133,7 @@ public Family getFamily() { * @param reset If true, clears this operator to the empty state after this result is * returned. Set this to false if you wish to obtain an intermediate result. * - * @return the result of this operation as a {@link CompactSketch} of the chosen form. + * @return the result of this operation as a {@link CompactSketch} in the given dstMem.
*/ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem, boolean reset); diff --git a/src/main/java/org/apache/datasketches/theta/Intersection.java b/src/main/java/org/apache/datasketches/theta/Intersection.java index 9c7e16aba..b9dc47e8e 100644 --- a/src/main/java/org/apache/datasketches/theta/Intersection.java +++ b/src/main/java/org/apache/datasketches/theta/Intersection.java @@ -50,7 +50,10 @@ public Family getFamily() { /** * Gets the result of this operation as an ordered CompactSketch on the Java heap. - * The {@link #intersect(Sketch)} method must have been called at least once. + * This does not disturb the underlying data structure of this intersection. + * The {@link #intersect(Sketch)} method must have been called at least once, otherwise an + * exception will be thrown. This is because a virgin Intersection object represents the + * Universal Set, which has an infinite number of values. * @return the result of this operation as an ordered CompactSketch on the Java heap */ public CompactSketch getResult() { @@ -58,7 +61,8 @@ public CompactSketch getResult() { } /** - * Gets the result of this operation as a CompactSketch of the chosen form. + * Gets the result of this operation as a CompactSketch in the given dstMem. + * This does not disturb the underlying data structure of this intersection. * The {@link #intersect(Sketch)} method must have been called at least once, otherwise an * exception will be thrown. This is because a virgin Intersection object represents the * Universal Set, which has an infinite number of values. @@ -70,14 +74,14 @@ public CompactSketch getResult() { * *

Presenting an intersection with a null argument will throw an exception.

* - * * @param dstOrdered * See Destination Ordered * * @param dstMem * See Destination Memory. * - * @return the result of this operation as a CompactSketch of the chosen form + * @return the result of this operation as a CompactSketch stored in the given dstMem, + * which can be either on or off-heap. */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); @@ -90,7 +94,7 @@ public CompactSketch getResult() { /** * Resets this Intersection for stateful operations only. * The seed remains intact, otherwise reverts to - * the Universal Set, theta of 1.0 and empty = false. + * the Universal Set: theta = 1.0, no retained data and empty = false. */ public abstract void reset(); diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java index b51140968..4a8811ca1 100644 --- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java @@ -219,7 +219,7 @@ static IntersectionImpl wrapInstance( @Override public CompactSketch intersect(final Sketch a, final Sketch b, final boolean dstOrdered, final WritableMemory dstMem) { - if ((wmem_ != null) && readOnly_) { throw new SketchesReadOnlyException(); } + if (wmem_ != null && readOnly_) { throw new SketchesReadOnlyException(); } hardReset(); intersect(a); intersect(b); @@ -233,7 +233,7 @@ public void intersect(final Sketch sketchIn) { if (sketchIn == null) { throw new SketchesArgumentException("Intersection argument must not be null."); } - if ((wmem_ != null) && readOnly_) { throw new SketchesReadOnlyException(); } + if (wmem_ != null && readOnly_) { throw new SketchesReadOnlyException(); } if (empty_ || sketchIn.isEmpty()) { //empty rule //Because of the def of null above and the Empty Rule (which is OR), empty_ must be true. //Whatever the current internal state, we make our local empty.
@@ -262,14 +262,14 @@ public void intersect(final Sketch sketchIn) { final int sketchInEntries = sketchIn.getRetainedEntries(true); //states 1,2,3,6 - if ((curCount_ == 0) || (sketchInEntries == 0)) { + if (curCount_ == 0 || sketchInEntries == 0) { curCount_ = 0; if (wmem_ != null) { insertCurCount(wmem_, 0); } hashTable_ = null; //No need for a HT. Don't bother clearing mem if valid } //end of states 1,2,3,6 // state 5 - else if ((curCount_ < 0) && (sketchInEntries > 0)) { + else if (curCount_ < 0 && sketchInEntries > 0) { curCount_ = sketchIn.getRetainedEntries(true); final int requiredLgArrLongs = minLgHashTableSize(curCount_, REBUILD_THRESHOLD); final int priorLgArrLongs = lgArrLongs_; //prior only used in error message @@ -295,7 +295,7 @@ else if ((curCount_ < 0) && (sketchInEntries > 0)) { } //end of state 5 //state 7 - else if ((curCount_ > 0) && (sketchInEntries > 0)) { + else if (curCount_ > 0 && sketchInEntries > 0) { //Sets resulting hashTable, curCount and adjusts lgArrLongs performIntersect(sketchIn); } //end of state 7 @@ -339,6 +339,16 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds dstMem, compactCache); } + @Override + public boolean hasResult() { + return wmem_ != null ? wmem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; + } + + @Override + public boolean isSameResource(final Memory that) { + return wmem_ != null ? wmem_.isSameResource(that) : false; + } + @Override public void reset() { hardReset(); @@ -347,7 +357,7 @@ public void reset() { @Override public byte[] toByteArray() { final int preBytes = CONST_PREAMBLE_LONGS << 3; - final int dataBytes = (curCount_ > 0) ? 8 << lgArrLongs_ : 0; + final int dataBytes = curCount_ > 0 ? 
8 << lgArrLongs_ : 0; final byte[] byteArrOut = new byte[preBytes + dataBytes]; if (wmem_ != null) { wmem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes); @@ -376,16 +386,6 @@ public byte[] toByteArray() { return byteArrOut; } - @Override - public boolean hasResult() { - return (wmem_ != null) ? wmem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; - } - - @Override - public boolean isSameResource(final Memory that) { - return (wmem_ != null) ? wmem_.isSameResource(that) : false; - } - //restricted /** @@ -405,7 +405,7 @@ boolean isEmpty() { @Override long[] getCache() { if (wmem_ == null) { - return (hashTable_ != null) ? hashTable_ : new long[0]; + return hashTable_ != null ? hashTable_ : new long[0]; } //Direct final int arrLongs = 1 << lgArrLongs_; @@ -426,7 +426,7 @@ long getThetaLong() { private void performIntersect(final Sketch sketchIn) { // curCount and input data are nonzero, match against HT - assert ((curCount_ > 0) && (!empty_)); + assert curCount_ > 0 && !empty_; final long[] cacheIn = sketchIn.getCache(); final int arrLongsIn = cacheIn.length; final long[] hashTable; @@ -458,7 +458,7 @@ private void performIntersect(final Sketch sketchIn) { //either unordered compact or hash table for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= thetaLong_)) { continue; } + if (hashIn <= 0L || hashIn >= thetaLong_) { continue; } final int foundIdx = hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; @@ -505,7 +505,7 @@ private void moveDataToTgt(final long[] arr, final int count) { tmpCnt++; } } - assert (tmpCnt == count) : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; + assert tmpCnt == count : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; } private void hardReset() { diff --git a/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java 
b/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java index 8abea8800..4a5d33708 100644 --- a/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java +++ b/src/main/java/org/apache/datasketches/theta/JaccardSimilarity.java @@ -54,7 +54,7 @@ public final class JaccardSimilarity { */ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { //Corner case checks - if ((sketchA == null) || (sketchB == null)) { return ZEROS.clone(); } + if (sketchA == null || sketchB == null) { return ZEROS.clone(); } if (sketchA == sketchB) { return ONES.clone(); } if (sketchA.isEmpty() && sketchB.isEmpty()) { return ONES.clone(); } if (sketchA.isEmpty() || sketchB.isEmpty()) { return ZEROS.clone(); } @@ -68,8 +68,8 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { final int newK = max(min(ceilingPowerOf2(countA + countB), maxK), minK); final Union union = SetOperation.builder().setNominalEntries(newK).buildUnion(); - union.update(sketchA); - union.update(sketchB); + union.union(sketchA); + union.union(sketchB); final Sketch unionAB = union.getResult(false, null); final long thetaLongUAB = unionAB.getThetaLong(); final long thetaLongA = sketchA.getThetaLong(); @@ -77,8 +77,8 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { final int countUAB = unionAB.getRetainedEntries(true); //Check for identical data - if ((countUAB == countA) && (countUAB == countB) - && (thetaLongUAB == thetaLongA) && (thetaLongUAB == thetaLongB)) { + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { return ONES.clone(); } @@ -105,7 +105,7 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) { */ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { //Corner case checks - if ((sketchA == null) || (sketchB == null)) { return false; } + if (sketchA == null || sketchB == null) { return false; } if 
(sketchA == sketchB) { return true; } if (sketchA.isEmpty() && sketchB.isEmpty()) { return true; } if (sketchA.isEmpty() || sketchB.isEmpty()) { return false; } @@ -116,8 +116,8 @@ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { //Create the Union final Union union = SetOperation.builder().setNominalEntries(ceilingPowerOf2(countA + countB)).buildUnion(); - union.update(sketchA); - union.update(sketchB); + union.union(sketchA); + union.union(sketchB); final Sketch unionAB = union.getResult(); final long thetaLongUAB = unionAB.getThetaLong(); final long thetaLongA = sketchA.getThetaLong(); @@ -125,8 +125,8 @@ public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB) { final int countUAB = unionAB.getRetainedEntries(true); //Check for identical counts and thetas - if ((countUAB == countA) && (countUAB == countB) - && (thetaLongUAB == thetaLongA) && (thetaLongUAB == thetaLongB)) { + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { return true; } return false; diff --git a/src/main/java/org/apache/datasketches/theta/Union.java b/src/main/java/org/apache/datasketches/theta/Union.java index 1d74a2f3e..1e9ea7a40 100644 --- a/src/main/java/org/apache/datasketches/theta/Union.java +++ b/src/main/java/org/apache/datasketches/theta/Union.java @@ -35,6 +35,14 @@ public Family getFamily() { return Family.UNION; } + /** + * Gets the result of this operation as an ordered CompactSketch on the Java heap. + * This does not disturb the underlying data structure of the union. + * Therefore, it is OK to continue updating the union after this operation. + * @return the result of this operation as an ordered CompactSketch on the Java heap + */ + public abstract CompactSketch getResult(); + /** * Gets the result of this operation as a CompactSketch of the chosen form. * This does not disturb the underlying data structure of the union. 
@@ -50,14 +58,6 @@ public Family getFamily() { */ public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem); - /** - * Gets the result of this operation as an ordered CompactSketch on the Java heap. - * This does not disturb the underlying data structure of the union. - * Therefore, it is OK to continue updating the union after this operation. - * @return the result of this operation as an ordered CompactSketch on the Java heap - */ - public abstract CompactSketch getResult(); - /** * Resets this Union. The seed remains intact, otherwise reverts back to its virgin state. */ @@ -108,6 +108,20 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO * * @param sketchIn The incoming sketch. */ + public abstract void union(Sketch sketchIn); + + + /** + * Perform a Union operation with this union and the given on-heap sketch of the Theta Family. + * This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015). + * + *

<p>This method can be repeatedly called. + * If the given sketch is null it is interpreted as an empty sketch.</p>

+ * + * @param sketchIn The incoming sketch. + * @deprecated 2.0.0. Use {@link #union(Sketch)} instead. + */ + @Deprecated public abstract void update(Sketch sketchIn); /** @@ -120,6 +134,20 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO * * @param mem Memory image of sketch to be merged */ + public abstract void union(Memory mem); + + /** + * Perform a Union operation with this union and the given Memory image of any sketch of the + * Theta Family. The input image may be from earlier versions of the Theta Compact Sketch, + * called the SetSketch (circa 2012), which was prior to Open Source and are compact and ordered. + * + *

<p>This method can be repeatedly called. + * If the given sketch is null it is interpreted as an empty sketch.</p>

+ * + * @param mem Memory image of sketch to be merged + * @deprecated 2.0.0. Use {@link #union(Memory)} instead. + */ + @Deprecated public abstract void update(Memory mem); /** diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index 72515f671..700ee71d9 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -210,7 +210,7 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long seed) { @Override public boolean isSameResource(final Memory that) { - return (gadget_ instanceof DirectQuickSelectSketchR) + return gadget_ instanceof DirectQuickSelectSketchR ? gadget_.getMemory().isSameResource(that) : false; } @@ -224,19 +224,19 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds final int gadgetCurCount = gadget_.getRetainedEntries(true); final int k = 1 << gadget_.getLgNomLongs(); final long[] gadgetCacheCopy = - (gadget_.hasMemory()) ? gadget_.getCache() : gadget_.getCache().clone(); + gadget_.hasMemory() ? gadget_.getCache() : gadget_.getCache().clone(); //Pull back to k final long curGadgetThetaLong = gadget_.getThetaLong(); - final long adjGadgetThetaLong = (gadgetCurCount > k) + final long adjGadgetThetaLong = gadgetCurCount > k ? selectExcludingZeros(gadgetCacheCopy, gadgetCurCount, k + 1) : curGadgetThetaLong; //Finalize Theta and curCount - final long unionThetaLong = (gadget_.hasMemory()) + final long unionThetaLong = gadget_.hasMemory() ? gadget_.getMemory().getLong(UNION_THETA_LONG) : unionThetaLong_; final long minThetaLong = min(min(curGadgetThetaLong, adjGadgetThetaLong), unionThetaLong); - final int curCountOut = (minThetaLong < curGadgetThetaLong) + final int curCountOut = minThetaLong < curGadgetThetaLong ? 
HashOperations.count(gadgetCacheCopy, minThetaLong) : gadgetCurCount; @@ -277,11 +277,17 @@ public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boo return getResult(dstOrdered, dstMem); } + @Deprecated @Override - public void update(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 + public void update(final Sketch sketchIn) { + union(sketchIn); + } + + @Override + public void union(final Sketch sketchIn) { //Only valid for theta Sketches using SerVer = 3 //UNION Empty Rule: AND the empty states. - if ((sketchIn == null) || sketchIn.isEmpty()) { + if (sketchIn == null || sketchIn.isEmpty()) { //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). Nothing changes return; } @@ -303,7 +309,7 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin final Memory skMem = ((CompactSketch) sketchIn).getMemory(); final int preambleLongs = skMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preambleLongs + i) << 3; + final int offsetBytes = preambleLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed @@ -321,9 +327,9 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin else { //either not-ordered compact or Hash Table form. A HT may have dirty values. 
final long[] cacheIn = sketchIn.getCache(); //if off-heap this will be a copy final int arrLongs = cacheIn.length; - for (int i = 0, c = 0; (i < arrLongs) && (c < curCountIn); i++ ) { + for (int i = 0, c = 0; i < arrLongs && c < curCountIn; i++ ) { final long hashIn = cacheIn[i]; - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } //rejects dirty values + if (hashIn <= 0L || hashIn >= unionThetaLong_) { continue; } //rejects dirty values gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed c++; //ensures against invalid state inside the incoming sketch } @@ -337,8 +343,14 @@ public void update(final Sketch sketchIn) { //Only valid for theta Sketches usin } } + @Deprecated @Override public void update(final Memory skMem) { + union(skMem); + } + + @Override + public void union(final Memory skMem) { if (skMem == null) { return; } final int cap = (int) skMem.getCapacity(); if (cap < 16) { return; } //empty or garbage @@ -346,7 +358,7 @@ public void update(final Memory skMem) { final int fam = extractFamilyID(skMem); if (serVer == 3) { //The OpenSource sketches (Aug 4, 2015) starts with serVer = 3 - if ((fam < 1) || (fam > 3)) { + if (fam < 1 || fam > 3) { throw new SketchesArgumentException( "Family must be Alpha, QuickSelect, or Compact: " + Family.idToFamily(fam)); } @@ -407,7 +419,7 @@ private void processVer3(final Memory skMem) { if (ordered) { //must be compact for (int i = 0; i < curCountIn; i++ ) { - final int offsetBytes = (preLongs + i) << 3; + final int offsetBytes = preLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); if (hashIn >= unionThetaLong_) { break; } // "early stop" gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed @@ -416,12 +428,12 @@ private void processVer3(final Memory skMem) { else { //not-ordered, could be compact or hash-table form final boolean compact = (flags & COMPACT_FLAG_MASK) != 0; - final int size = (compact) ? 
curCountIn : 1 << extractLgArrLongs(skMem); + final int size = compact ? curCountIn : 1 << extractLgArrLongs(skMem); for (int i = 0; i < size; i++ ) { - final int offsetBytes = (preLongs + i) << 3; + final int offsetBytes = preLongs + i << 3; final long hashIn = skMem.getLong(offsetBytes); - if ((hashIn <= 0L) || (hashIn >= unionThetaLong_)) { continue; } + if (hashIn <= 0L || hashIn >= unionThetaLong_) { continue; } gadget_.hashUpdate(hashIn); //backdoor update, hash function is bypassed } } diff --git a/src/main/java/org/apache/datasketches/tuple/AnotB.java b/src/main/java/org/apache/datasketches/tuple/AnotB.java index 10202bebc..31a83e26b 100644 --- a/src/main/java/org/apache/datasketches/tuple/AnotB.java +++ b/src/main/java/org/apache/datasketches/tuple/AnotB.java @@ -199,7 +199,7 @@ public void notB(final org.apache.datasketches.theta.Sketch skB) { * * @param reset If true, clears this operator to the empty state after this result is * returned. Set this to false if you wish to obtain an intermediate result. - * @return the result of this operation as a {@link CompactSketch}. + * @return the result of this operation as an unordered {@link CompactSketch}. 
*/ public CompactSketch getResult(final boolean reset) { if (curCount_ == 0) { @@ -233,7 +233,7 @@ public CompactSketch getResult(final boolean reset) { * @param skA The incoming Tuple sketch for the first argument * @param skB The incoming Tuple sketch for the second argument * @param Type of Summary - * @return the result as a compact sketch + * @return the result as an unordered {@link CompactSketch} */ public static CompactSketch aNotB(final Sketch skA, final Sketch skB) { @@ -285,7 +285,7 @@ CompactSketch aNotB(final Sketch skA, final Sketch skB) { * @param skA The incoming Tuple sketch for the first argument * @param skB The incoming Theta sketch for the second argument * @param Type of Summary - * @return the result as a compact sketch + * @return the result as an unordered {@link CompactSketch} */ public static CompactSketch aNotB(final Sketch skA, final org.apache.datasketches.theta.Sketch skB) { @@ -470,7 +470,7 @@ public void update(final Sketch skA, final Sketch skB) { /** * Gets the result of this operation. This clears the state of this operator after the result is * returned. - * @return the result of this operation as a CompactSketch + * @return the result of this operation as an unordered {@link CompactSketch} * @deprecated v2.0.0. Instead use {@link #getResult(boolean)}. */ @Deprecated diff --git a/src/main/java/org/apache/datasketches/tuple/Intersection.java b/src/main/java/org/apache/datasketches/tuple/Intersection.java index 1dad7d244..d92f2034a 100644 --- a/src/main/java/org/apache/datasketches/tuple/Intersection.java +++ b/src/main/java/org/apache/datasketches/tuple/Intersection.java @@ -51,7 +51,8 @@ public class Intersection { private boolean firstCall_; /** - * Creates new instance + * Creates new Intersection instance with instructions on how to process two summaries that + * intersect. 
* @param summarySetOps instance of SummarySetOperations */ public Intersection(final SummarySetOperations summarySetOps) { @@ -188,7 +189,7 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S } /** - * Gets the internal set as a CompactSketch + * Gets the internal set as an unordered CompactSketch * @return result of the intersections so far */ public CompactSketch getResult() { diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java index 884faffac..5debba6a9 100644 --- a/src/main/java/org/apache/datasketches/tuple/Union.java +++ b/src/main/java/org/apache/datasketches/tuple/Union.java @@ -41,7 +41,8 @@ public class Union { private boolean empty_; /** - * Creates new instance with default nominal entries + * Creates new Union instance with instructions on how to process two summaries that + * overlap. This will have the default nominal entries (K). * @param summarySetOps instance of SummarySetOperations */ public Union(final SummarySetOperations summarySetOps) { @@ -49,8 +50,10 @@ public Union(final SummarySetOperations summarySetOps) { } /** + * Creates new Union instance with instructions on how to process two summaries that + * overlap. * Creates new instance - * @param nomEntries nominal number of entries. Forced to the nearest power of 2 greater than + * @param nomEntries nominal entries (K). Forced to the nearest power of 2 greater than * given value. * @param summarySetOps instance of SummarySetOperations */ @@ -67,7 +70,7 @@ public Union(final int nomEntries, final SummarySetOperations summarySetOps) * If null or empty, it is ignored. 
*/ public void update(final Sketch sketchIn) { - if ((sketchIn == null) || sketchIn.isEmpty()) { return; } + if (sketchIn == null || sketchIn.isEmpty()) { return; } empty_ = false; if (sketchIn.thetaLong_ < thetaLong_) { thetaLong_ = sketchIn.thetaLong_; } final SketchIterator it = sketchIn.iterator(); @@ -91,7 +94,7 @@ public void update(final Sketch sketchIn) { public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) { if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); } - if ((sketchIn == null) || sketchIn.isEmpty()) { return; } + if (sketchIn == null || sketchIn.isEmpty()) { return; } empty_ = false; final long thetaIn = sketchIn.getThetaLong(); if (thetaIn < thetaLong_) { thetaLong_ = thetaIn; } @@ -105,7 +108,7 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S } /** - * Gets the internal set as a CompactSketch + * Gets the internal set as an unordered CompactSketch * @return result of the unions so far */ @SuppressWarnings("unchecked") @@ -113,7 +116,7 @@ public CompactSketch getResult() { if (empty_) { return qsk_.compact(); } - if ((thetaLong_ >= qsk_.thetaLong_) && (qsk_.getRetainedEntries() <= qsk_.getNominalEntries())) { + if (thetaLong_ >= qsk_.thetaLong_ && qsk_.getRetainedEntries() <= qsk_.getNominalEntries()) { return qsk_.compact(); } long theta = min(thetaLong_, qsk_.thetaLong_); diff --git a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java index eb372a03b..d92e67255 100644 --- a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java @@ -46,14 +46,14 @@ public class DirectUnionTest { @Test public void checkExactUnionNoOverlap() { - int lgK = 9; //512 - int k = 1 << lgK; - int u = k; + final int lgK = 9; //512 + final int k = 1 << lgK; + final int u = k; - UpdateSketch usk1 = 
UpdateSketch.builder().setNominalEntries(k).build(); - UpdateSketch usk2 = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch usk1 = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch usk2 = UpdateSketch.builder().setNominalEntries(k).build(); - for (int i=0; i<(u/2); i++) { + for (int i=0; i uiter = ucsk.iterator(); int counter = 1; + int twos = 0; + int ones = 0; while (uiter.next()) { final int i = uiter.getSummary().getValue(); println(counter++ + ", " + i); //9 entries = 2, 6 entries = 1 + if (i == 1) { ones++; } + if (i == 2) { twos++; } } + assertEquals(ones, 6); + assertEquals(twos, 9); //Intersection final Intersection inter = new Intersection<>(isso); @@ -76,6 +84,7 @@ public void tuple2dot0Examples() { while (iiter.next()) { final int i = iiter.getSummary().getValue(); println(counter++ + ", " + i); //9 entries = 1 + assertEquals(i, 1); } } @@ -88,6 +97,6 @@ public void printlnTest() { * @param s value to print */ static void println(final String s) { - System.out.println(s); //enable/disable here + //System.out.println(s); //enable/disable here } }