Skip to content

Commit

Permalink
Merge pull request #348 from apache/Prep_for_2.0.0-RC3
Browse files Browse the repository at this point in the history
Made parallel APIs for SetOperations in Theta, and Tuple (generics).
  • Loading branch information
leerho authored Feb 9, 2021
2 parents ac00afa + 0cad491 commit 0dea9cd
Show file tree
Hide file tree
Showing 22 changed files with 1,232 additions and 492 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

=================

# DataSketches Core Java Library Component
# Apache<sup>&reg;</sup> DataSketches&trade; Core Java Library Component
This is the core Java component of the DataSketches library. It contains all of the sketching algorithms and can be accessed directly from user applications.

This component is also a dependency of other components of the library that create adaptors for target systems, such as Hadoop Pig and Hadoop Hive.
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/apache/datasketches/BinarySearch.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public final class BinarySearch {
/**
* Binary Search for the index of the exact float value in the given search range.
* If -1 is returned there are no values in the search range that equal the given value.
* @param arr The given array to search.
* @param arr The given ordered array to search.
* @param low the index of the lowest value of the search range
* @param high the index of the highest value of the search range
* @param v the value to search for
Expand All @@ -53,7 +53,7 @@ public static int find(final float[] arr, final int low, final int high, final f
/**
* Binary Search for the index of the exact double value in the given search range.
* If -1 is returned there are no values in the search range that equal the given value.
* @param arr The given array to search.
* @param arr The given ordered array to search.
* @param low the index of the lowest value of the search range
* @param high the index of the highest value of the search range
* @param v the value to search for
Expand All @@ -76,7 +76,7 @@ public static int find(final double[] arr, final int low, final int high, final
/**
* Binary Search for the index of the exact long value in the given search range.
* If -1 is returned there are no values in the search range that equal the given value.
* @param arr The given array to search.
* @param arr The given ordered array to search.
* @param low the index of the lowest value of the search range
* @param high the index of the highest value of the search range
* @param v the value to search for
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,6 @@ public enum Inequality {
GT
}

/**
* Constructs this class
*/
public GenericInequalitySearch() { }

/**
* Binary Search for the index of the generic value in the given search range that satisfies
* the given inequality.
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/apache/datasketches/theta/Intersection.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ public CompactSketch getResult() {
public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem);

/**
* Returns true if there is an intersection result available
* @return true if there is an intersection result available
* Returns true if there is a valid intersection result available
* @return true if there is a valid intersection result available
*/
public abstract boolean hasResult();

Expand Down
19 changes: 12 additions & 7 deletions src/main/java/org/apache/datasketches/theta/Union.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
import org.apache.datasketches.memory.WritableMemory;

/**
* The API for Union operations
* Compute the union of two or more theta sketches.
* A new instance represents an empty set.
*
* @author Lee Rhodes
*/
Expand Down Expand Up @@ -59,7 +60,7 @@ public Family getFamily() {
public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem);

/**
* Resets this Union. The seed remains intact, otherwise reverts back to its virgin state.
* Resets this Union. The seed remains intact, everything else reverts back to its virgin state.
*/
public abstract void reset();

Expand All @@ -71,7 +72,7 @@ public Family getFamily() {

/**
* This implements a stateless, pair-wise union operation. The returned sketch will be cut back to
* k if required, similar to the regular Union operation.
* the smaller of the two k values if required.
*
* <p>Nulls and empty sketches are ignored.</p>
*
Expand Down Expand Up @@ -104,7 +105,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
* This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015).
*
* <p>This method can be repeatedly called.
* If the given sketch is null it is interpreted as an empty sketch.</p>
*
* <p>Nulls and empty sketches are ignored.</p>
*
* @param sketchIn The incoming sketch.
*/
Expand All @@ -116,7 +118,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
* This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015).
*
* <p>This method can be repeatedly called.
* If the given sketch is null it is interpreted as an empty sketch.</p>
*
* <p>Nulls and empty sketches are ignored.</p>
*
* @param sketchIn The incoming sketch.
* @deprecated 2.0.0. Use {@link #union(Sketch)} instead.
Expand All @@ -130,7 +133,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
* called the SetSketch (circa 2012), which was prior to Open Source and are compact and ordered.
*
* <p>This method can be repeatedly called.
* If the given sketch is null it is interpreted as an empty sketch.</p>
*
* <p>Nulls and empty sketches are ignored.</p>
*
* @param mem Memory image of sketch to be merged
*/
Expand All @@ -142,7 +146,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
* called the SetSketch (circa 2012), which was prior to Open Source and are compact and ordered.
*
* <p>This method can be repeatedly called.
* If the given sketch is null it is interpreted as an empty sketch.</p>
*
* <p>Nulls and empty sketches are ignored.</p>
*
* @param mem Memory image of sketch to be merged
* @deprecated 2.0.0. Use {@link #union(Memory)} instead.
Expand Down
13 changes: 8 additions & 5 deletions src/main/java/org/apache/datasketches/theta/UnionImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,12 @@ public byte[] toByteArray() {
@Override
public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boolean dstOrdered,
    final WritableMemory dstMem) {
  // Stateless pair-wise union: start from a freshly reset union so that nothing accumulated by
  // earlier stateful union(...) calls on this instance leaks into the result.
  reset();
  try {
    union(sketchA);
    union(sketchB);
    // The result is produced before the finally clause resets this instance.
    return getResult(dstOrdered, dstMem);
  } finally {
    // Always leave the instance reset, also when one of the calls above throws, so that a
    // failed pair-wise call cannot leave a half-built union behind.
    reset();
  }
}

@Deprecated
Expand Down Expand Up @@ -369,13 +372,13 @@ public void union(final Memory skMem) {
if (serVer == 2) { //older Sketch, which is compact and ordered
Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
final CompactSketch csk = ForwardCompatibility.heapify2to3(skMem, DEFAULT_UPDATE_SEED);
update(csk);
union(csk);
return;
}

if (serVer == 1) { //much older Sketch, which is compact and ordered
final CompactSketch csk = ForwardCompatibility.heapify1to3(skMem, DEFAULT_UPDATE_SEED);
update(csk);
union(csk);
return;
}

Expand Down
117 changes: 95 additions & 22 deletions src/main/java/org/apache/datasketches/tuple/Intersection.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@


/**
* Computes an intersection of two or more generic tuple sketches.
* Computes an intersection of two or more generic tuple sketches or generic tuple sketches
* combined with theta sketches.
* A new instance represents the Universal Set. Because the Universal Set
* cannot be realized, a <i>getResult()</i> on a new instance will produce an error.
* Every intersect() computes an intersection with the internal state, which will never
Expand Down Expand Up @@ -64,28 +65,74 @@ public Intersection(final SummarySetOperations<S> summarySetOps) {
}

/**
* Updates the internal state by intersecting it with the given sketch.
* @param sketchIn input sketch to intersect with the internal state. It may not be null.
* Perform a stateless intersect set operation on the two given tuple sketches and returns the
* result as an unordered CompactSketch on the heap.
* @param tupleSketchA The first sketch argument. It must not be null.
* @param tupleSketchB The second sketch argument. It must not be null.
* @return an unordered CompactSketch on the heap
*/
public void update(final Sketch<S> sketchIn) {
if (sketchIn == null) { throw new SketchesArgumentException("Sketch may not be null"); }
public CompactSketch<S> intersect(final Sketch<S> tupleSketchA, final Sketch<S> tupleSketchB) {
  // Stateless pair-wise intersection: begin from the initial state so that nothing left over
  // from earlier stateful intersect(...) calls on this instance takes part in the result.
  reset();
  try {
    intersect(tupleSketchA);
    intersect(tupleSketchB); // rejects null with a SketchesArgumentException after A was applied
    return getResult();
  } finally {
    // Restore the initial state on every exit path. Without this, a rejected second argument
    // would leave the first sketch's state inside this supposedly stateless operation.
    reset();
  }
}

/**
 * Perform a stateless intersect set operation on a tuple sketch and a theta sketch and returns the
 * result as an unordered CompactSketch on the heap.
 *
 * <p>The internal state of this instance is reset before the operation and again afterwards,
 * including when one of the arguments is rejected, so any state accumulated by earlier stateful
 * calls is discarded.</p>
 *
 * @param tupleSketch The first sketch argument. It must not be null.
 * @param thetaSketch The second sketch argument. It must not be null.
 * @param summary the given proxy summary for the theta sketch, which doesn't have one.
 * This must not be null.
 * @return an unordered CompactSketch on the heap
 * @throws SketchesArgumentException if either sketch or the summary is null
 */
public CompactSketch<S> intersect(final Sketch<S> tupleSketch,
    final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
  reset();
  try {
    intersect(tupleSketch);
    // May throw for a null thetaSketch or summary after tupleSketch has already been applied.
    intersect(thetaSketch, summary);
    return getResult();
  } finally {
    // Restore the initial state on every exit path so a failed call leaves no residue behind.
    reset();
  }
}

/**
 * Intersects the given tupleSketch into the internal state of this operator (stateful).
 * This method only forwards to {@link #intersect(Sketch)}.
 * @param tupleSketch the sketch to be intersected with the internal state. It must not be null.
 * @deprecated 2.0.0. Please use {@link #intersect(Sketch)}.
 */
@Deprecated
public void update(final Sketch<S> tupleSketch) {
  this.intersect(tupleSketch);
}

/**
* Performs a stateful intersection of the internal set with the given tupleSketch.
* @param tupleSketch input sketch to intersect with the internal state. It must not be null.
*/
public void intersect(final Sketch<S> tupleSketch) {
if (tupleSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
final boolean firstCall = firstCall_;
firstCall_ = false;

// input sketch could be first or next call
final long thetaLongIn = sketchIn.getThetaLong();
final int countIn = sketchIn.getRetainedEntries();
final long thetaLongIn = tupleSketch.getThetaLong();
final int countIn = tupleSketch.getRetainedEntries();
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
// Empty rule extended in case incoming sketch does not have empty bit properly set
empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE;
final boolean emptyIn = countIn == 0 && thetaLongIn == Long.MAX_VALUE;
empty_ |= emptyIn; //empty rule
if (countIn == 0) {
hashTables_.clear();
return;
}
// input sketch will have valid entries > 0

if (firstCall) {
final Sketch<S> firstSketch = sketchIn;
final Sketch<S> firstSketch = tupleSketch;
//Copy firstSketch data into local instance hashTables_
hashTables_.fromSketch(firstSketch);
}
Expand All @@ -95,7 +142,7 @@ public void update(final Sketch<S> sketchIn) {
if (hashTables_.count_ == 0) {
return;
}
final Sketch<S> nextSketch = sketchIn;
final Sketch<S> nextSketch = tupleSketch;
//Match nextSketch data with local instance data, filtering by theta
final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());

Expand Down Expand Up @@ -126,29 +173,47 @@ public void update(final Sketch<S> sketchIn) {
}

/**
* Updates the internal set by intersecting it with the given Theta sketch.
* @param sketchIn input Theta Sketch to intersect with the internal state. It may not be null.
* @param summary the given proxy summary for the Theta Sketch, which doesn't have one.
* It will be copied for each matching index. It may not be null.
* Performs a stateful intersection of the internal set with the given thetaSketch by combining entries
* using the hashes from the theta sketch and summary values from the given summary and rules
* from the summarySetOps defined by the Intersection constructor.
* @param thetaSketch input theta sketch to intersect with the internal state. It must not be null.
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
* It will be copied for each matching index. It must not be null.
* @deprecated 2.0.0. Please use intersect(org.apache.datasketches.theta.Sketch, S).
*/
@Deprecated // the Javadoc above names the replacement as plain text: an {at_link} does not work there
public void update(final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
  // Pure forwarder: all argument checks and the actual work live in intersect(thetaSketch, summary).
  this.intersect(thetaSketch, summary);
}

/**
* Performs a stateful intersection of the internal set with the given thetaSketch by combining entries
* using the hashes from the theta sketch and summary values from the given summary and rules
* from the summarySetOps defined by the Intersection constructor.
* @param thetaSketch input theta sketch to intersect with the internal state. It must not be null.
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
* It will be copied for each matching index. It must not be null.
*/
public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) {
if (sketchIn == null) { throw new SketchesArgumentException("Sketch may not be null"); }
public void intersect(final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
if (thetaSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); }
final boolean firstCall = firstCall_;
firstCall_ = false;

// input sketch is not null, could be first or next call
final long thetaLongIn = sketchIn.getThetaLong();
final int countIn = sketchIn.getRetainedEntries(true);
final long thetaLongIn = thetaSketch.getThetaLong();
final int countIn = thetaSketch.getRetainedEntries(true);
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
// Empty rule extended in case incoming sketch does not have empty bit properly set
empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE;
final boolean emptyIn = countIn == 0 && thetaLongIn == Long.MAX_VALUE;
empty_ |= emptyIn; //empty rule
if (countIn == 0) {
hashTables_.clear();
return;
}
// input sketch will have valid entries > 0
if (firstCall) {
final org.apache.datasketches.theta.Sketch firstSketch = sketchIn;
final org.apache.datasketches.theta.Sketch firstSketch = thetaSketch;
//Copy firstSketch data into local instance hashTables_
hashTables_.fromSketch(firstSketch, summary);
}
Expand All @@ -158,15 +223,15 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S
if (hashTables_.count_ == 0) {
return;
}
final org.apache.datasketches.theta.Sketch nextSketch = sketchIn;
final org.apache.datasketches.theta.Sketch nextSketch = thetaSketch;
//Match nextSketch data with local instance data, filtering by theta
final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries(true));

final long[] matchHashArr = new long[maxMatchSize];
S[] matchSummaries = null;
int matchCount = 0;

final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
final org.apache.datasketches.theta.HashIterator it = thetaSketch.iterator();
final Class<S> summaryType = (Class<S>) hashTables_.summaryTable_.getClass().getComponentType();
while (it.next()) {
final long hash = it.get();
Expand Down Expand Up @@ -221,6 +286,14 @@ public CompactSketch<S> getResult() {
return new CompactSketch<>(hashArr, summaries, thetaLong_, empty_);
}

/**
 * Reports whether a valid intersection result can be obtained from this operator.
 * @return true if there is a valid intersection result available
 */
public boolean hasResult() {
  // firstCall_ is cleared by every stateful intersect(...) call; while it is still set,
  // no sketch has been intersected and there is nothing valid to return.
  final boolean stillAwaitingFirstCall = firstCall_;
  return !stillAwaitingFirstCall;
}

/**
* Resets the internal set to the initial state, which represents the Universal Set
*/
Expand Down
Loading

0 comments on commit 0dea9cd

Please sign in to comment.