Skip to content

Commit

Permalink
This commit includes a number of enhancements for the 5.0 release:
Browse files Browse the repository at this point in the history
- This is a large number of changes.

- The problem detected by the Druid team is fixed, so now the
"getPartitionBoundaries" works for input streams that are larger than
Integer.MAX_VALUE.

- This fix applies to both the KllItemsSketch and the classic
ItemsSketch.  These are the only two sketches, for now, that will
support the "getPartitionBoundaries" functionality. This is enforced via
a new "PartitioningFeature" API interface.

- In addition, there is new "partitions" package that solves the problem
of limited accuracy of our quantiles sketches when being asked to
partition very large input streams.  This package can partition very
large streams of almost unlimited size with very small variation in the
resulting partition sizes. I have tested this with streams as large as
30E12 elements.

- I have reduced code duplication in a number of places. Specifically,
All the quantile sketch sorted view classes use only 3 iterator
implementations, which are for float, double and generic. Further
consolidation of classes can be done across the sorted view classes
themselves, but that will have to be done later.

- Javadocs have been improved in a number of places and I have fixed
spelling errors when I see them.
  • Loading branch information
leerho committed Nov 16, 2023
1 parent 3b4778e commit 23e36ac
Show file tree
Hide file tree
Showing 73 changed files with 2,222 additions and 1,362 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,5 +154,5 @@ In Eclipse, open the project *Properties / Java Build Path / Module Dependencies

#### SpotBugs

* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you will get a lot of false positive or low risk issues that we have examined and exliminated with this exclusion file.
* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you may get a lot of false positive or low risk issues that we have examined and eliminated with this exclusion file.

7 changes: 7 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ under the License.
<version>${testng.version}</version>
<scope>test</scope>
</dependency>
<!--
<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-java-common</artifactId>
<version>1.0.0</version>
</dependency>
-->
</dependencies>

<build>
Expand Down
83 changes: 34 additions & 49 deletions src/main/java/org/apache/datasketches/common/Util.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import static java.lang.Math.log;
import static java.lang.Math.pow;
import static java.lang.Math.round;
import static java.util.Arrays.fill;

import java.util.Comparator;

Expand Down Expand Up @@ -217,7 +218,7 @@ public static String nanoSecToString(final long nS) {

/**
* Returns the given time in milliseconds formatted as Hours:Min:Sec.mSec
* @param mS the given nanoseconds
* @param mS the given milliseconds
* @return the given time in milliseconds formatted as Hours:Min:Sec.mSec
*/
public static String milliSecToString(final long mS) {
Expand All @@ -244,40 +245,20 @@ public static String zeroPad(final String s, final int fieldLength) {

/**
* Prepend or postpend the given string with the given character to fill the given field length.
* If the given string is equal or greater than the given field length, it will be returned
* without modification.
* If the given string is equal to or greater than the given field length, it will be returned without modification.
* @param s the given string
* @param fieldLength the desired field length
* @param padChar the desired pad character
* @param postpend if true append the pacCharacters to the end of the string.
* @return prepended or postpended given string with the given character to fill the given field
* length.
* @return prepended or postpended given string with the given character to fill the given field length.
*/
public static String characterPad(final String s, final int fieldLength, final char padChar,
final boolean postpend) {
final char[] chArr = s.toCharArray();
final int sLen = chArr.length;
public static String characterPad(final String s, final int fieldLength, final char padChar, final boolean postpend) {
final int sLen = s.length();
if (sLen < fieldLength) {
final char[] out = new char[fieldLength];
final int blanks = fieldLength - sLen;

if (postpend) {
for (int i = 0; i < sLen; i++) {
out[i] = chArr[i];
}
for (int i = sLen; i < fieldLength; i++) {
out[i] = padChar;
}
} else { //prepend
for (int i = 0; i < blanks; i++) {
out[i] = padChar;
}
for (int i = blanks; i < fieldLength; i++) {
out[i] = chArr[i - blanks];
}
}

return String.valueOf(out);
final char[] cArr = new char[fieldLength - sLen];
fill(cArr, padChar);
final String addstr = String.valueOf(cArr);
return (postpend) ? s.concat(addstr) : addstr.concat(s);
}
return s;
}
Expand Down Expand Up @@ -550,56 +531,60 @@ public static double powerSeriesNextDouble(final int ppb, final double curPoint,
}

/**
* Computes the ceiling power of given <i>base</i> and <i>n</i> as doubles.
* This is the smallest positive power
* of <i>base</i> that equal to or greater than the given <i>n</i> and equal to a mathematical integer.
* Returns the ceiling of a given <i>n</i> given a <i>radix</i>, where the ceiling is an integral power of the radix.
* This is the smallest positive power of <i>radix</i> that is equal to or greater than the given <i>n</i>
* and equal to a mathematical integer.
* The result of this function is consistent with {@link #ceilingIntPowerOf2(int)} for values
* less than one. I.e., if <i>n &lt; 1,</i> the result is 1.
*
* @param base The base in the expression &#8968;base<sup>n</sup>&#8969;.
* <p>The formula is: <i>radix<sup>ceiling(log<sub>radix</sub>(x))</sup></i></p>
*
* @param radix The base of the number system.
* @param n The input argument.
* @return the ceiling power of <i>base</i> as a double and equal to a mathematical integer.
* @return the ceiling power of <i>radix</i> as a double and equal to a mathematical integer.
*/
public static double ceilingPowerBaseOfDouble(final double base, final double n) {
public static double ceilingPowerBaseOfDouble(final double radix, final double n) {
final double x = n < 1.0 ? 1.0 : n;
return pow(base, ceil(logBaseOfX(base, x)));
return Math.round(pow(radix, ceil(logBaseOfX(radix, x))));
}

/**
* Computes the floor power of given <i>base</i> and <i>n</i> as doubles.
* This is the largest positive power
* of <i>base</i> that equal to or less than the given n and equal to a mathematical integer.
* Computes the floor of a given <i>n</i> given <i>radix</i>, where the floor is an integral power of the radix.
* This is the largest positive power of <i>radix</i> that is equal to or less than the given <i>n</i>
* and equal to a mathematical integer.
* The result of this function is consistent with {@link #floorPowerOf2(int)} for values
* less than one. I.e., if <i>n &lt; 1,</i> the result is 1.
*
* @param base The base in the expression &#8970;base<sup>n</sup>&#8971;.
* <p>The formula is: <i>radix<sup>floor(log<sub>radix</sub>(x))</sup></i></p>
*
* @param radix The base of the number system.
* @param n The input argument.
* @return the floor power of 2 and equal to a mathematical integer.
*/
public static double floorPowerBaseOfDouble(final double base, final double n) {
public static double floorPowerBaseOfDouble(final double radix, final double n) {
final double x = n < 1.0 ? 1.0 : n;
return pow(base, floor(logBaseOfX(base, x)));
return Math.round(pow(radix, floor(logBaseOfX(radix, x))));
}

// Logarithm related

/**
* The log base 2 of the value
* The log<sub>2</sub>(value)
* @param value the given value
* @return The log base 2 of the value
* @return log<sub>2</sub>(value)
*/
public static double log2(final double value) {
return log(value) / LOG2;
}

/**
* Returns the logarithm_logBase of x. Example: logB(2.0, x) = log(x) / log(2.0).
* @param logBase the base of the logarithm used
* Returns the log<sub>radix</sub>(x). Example: logB(2.0, x) = log(x) / log(2.0).
* @param radix the base of the number system
* @param x the given value
* @return the logarithm_logBase of x: Example: logB(2.0, x) = log(x) / log(2.0).
* @return the log<sub>radix</sub>(x): Example: logB(2.0, x) = log(x) / log(2.0).
*/
public static double logBaseOfX(final double logBase, final double x) {
return log(x) / log(logBase);
public static double logBaseOfX(final double radix, final double x) {
return log(x) / log(radix);
}

/**
Expand Down
16 changes: 0 additions & 16 deletions src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import static org.apache.datasketches.common.ByteArrayUtil.putDoubleLE;
import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE;
import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH;
import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles;

import java.util.Objects;

Expand Down Expand Up @@ -175,21 +174,6 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria
return kllDoublesSV.getCDF(splitPoints, searchCrit);
}

@Override
public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted,
final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); }
final double[] ranks = equallySpacedDoubles(numEquallyWeighted);
final double[] boundaries = getQuantiles(ranks, searchCrit);
boundaries[0] = getMinItem();
boundaries[boundaries.length - 1] = getMaxItem();
final DoublesPartitionBoundaries dpb = new DoublesPartitionBoundaries();
dpb.N = this.getN();
dpb.ranks = ranks;
dpb.boundaries = boundaries;
return dpb;
}

@Override
public double[] getPMF(final double[] splitPoints, final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,55 +24,17 @@
/**
* Iterator over KllDoublesSketch. The order is not defined.
*/
public final class KllDoublesSketchIterator implements QuantilesDoublesSketchIterator {
public final class KllDoublesSketchIterator extends KllSketchIterator implements QuantilesDoublesSketchIterator {
private final double[] quantiles;
private final int[] levelsArr;
private final int numLevels;
private int level;
private int index;
private long weight;
private boolean isInitialized;

KllDoublesSketchIterator(final double[] quantiles, final int[] levelsArr, final int numLevels) {
super(levelsArr, numLevels);
this.quantiles = quantiles;
this.levelsArr = levelsArr;
this.numLevels = numLevels;
this.isInitialized = false;
}

@Override
public double getQuantile() {
return quantiles[index];
}

@Override
public long getWeight() {
return weight;
}

@Override
public boolean next() {
if (!isInitialized) {
level = 0;
index = levelsArr[level];
weight = 1;
isInitialized = true;
} else {
index++;
}
if (index < levelsArr[level + 1]) {
return true;
}
// go to the next non-empty level
do {
level++;
if (level == numLevels) {
return false; // run out of levels
}
weight *= 2;
} while (levelsArr[level] == levelsArr[level + 1]);
index = levelsArr[level];
return true;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.quantilescommon.DoublesSortedView;
import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator;
import org.apache.datasketches.quantilescommon.InequalitySearch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.datasketches.quantilescommon.QuantilesUtil;
Expand All @@ -40,73 +41,91 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView {
private final double[] quantiles;
private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights
private final long totalN;
private final double[] normRanks;
private final double maxItem;
private final double minItem;

/**
* Construct from elements for testing.
* @param quantiles sorted array of quantiles
* @param cumWeights sorted, monotonically increasing cumulative weights.
* @param totalN the total number of items presented to the sketch.
*/
KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN) {
KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN,
final double maxItem, final double minItem) {
this.quantiles = quantiles;
this.cumWeights = cumWeights;
this.totalN = totalN;
this.maxItem = maxItem;
this.minItem = minItem;
final int len = cumWeights.length;
final double[] normRanks = new double[len];
for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; }
this.normRanks = normRanks;
}

/**
* Constructs this Sorted View given the sketch
* @param sk the given KllDoublesSketch.
* @param sketch the given KllDoublesSketch.
*/
public KllDoublesSketchSortedView(final KllDoublesSketch sk) {
this.totalN = sk.getN();
final double[] srcQuantiles = sk.getDoubleItemsArray();
final int[] srcLevels = sk.levelsArr;
final int srcNumLevels = sk.getNumLevels();
public KllDoublesSketchSortedView(final KllDoublesSketch sketch) {
if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); }
this.totalN = sketch.getN();
this.maxItem = sketch.getMaxItem();
this.minItem = sketch.getMinItem();
final double[] srcQuantiles = sketch.getDoubleItemsArray();
final int[] srcLevels = sketch.levelsArr;
final int srcNumLevels = sketch.getNumLevels();

if (!sk.isLevelZeroSorted()) {
if (!sketch.isLevelZeroSorted()) {
Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1]);
if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); }
if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); }
}

final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage
quantiles = new double[numQuantiles];
cumWeights = new long[numQuantiles];
populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles);
final double[] normRanks = new double[numQuantiles];
for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; }
this.normRanks = normRanks;
}

@Override
public long[] getCumulativeWeights() {
return cumWeights.clone();
}

@Override
public double getMaxItem() {
return maxItem;
}

@Override
public double getMinItem() {
return minItem;
}

@Override
public long getN() {
return totalN;
}

@Override
public double[] getNormalizedRanks() {
return normRanks;
}

@Override
public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); }
QuantilesUtil.checkNormalizedRankBounds(rank);
final int len = cumWeights.length;
final double naturalRank = getNaturalRank(rank, totalN);
final double naturalRank = getNaturalRank(rank, totalN, searchCrit);
final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT;
final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit);
if (index == -1) {
return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0;
}
return quantiles[index];
}

/**
* Special version of getQuantile to support the getPartitionBoundaries(int) function.
* @param weight ultimately comes from selected integral weights computed by the sketch.
* @param searchCrit If INCLUSIVE, the given rank includes all quantiles &le;
* the quantile directly corresponding to the given weight internal to the sketch.
* @return the approximate quantile given the weight.
*/
double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); }
final int len = cumWeights.length;
final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT;
final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit);
if (index == -1) {
return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0;
return quantiles[len - 1]; //EXCLUSIVE (GT) case: normRank == 1.0;
}
return quantiles[index];
}
Expand Down Expand Up @@ -134,8 +153,8 @@ public boolean isEmpty() {
}

@Override
public KllDoublesSketchSortedViewIterator iterator() {
return new KllDoublesSketchSortedViewIterator(quantiles, cumWeights);
public DoublesSortedViewIterator iterator() {
return new DoublesSortedViewIterator(quantiles, cumWeights);
}

//restricted methods
Expand Down
Loading

0 comments on commit 23e36ac

Please sign in to comment.