diff --git a/README.md b/README.md index 8da5faac3..3190036d1 100644 --- a/README.md +++ b/README.md @@ -154,5 +154,5 @@ In Eclipse, open the project *Properties / Java Build Path / Module Dependencies #### SpotBugs -* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you will get a lot of false positive or low risk issues that we have examined and exliminated with this exclusion file. +* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you may get a lot of false positive or low risk issues that we have examined and eliminated with this exclusion file. diff --git a/pom.xml b/pom.xml index 75016d7f7..02765d07f 100644 --- a/pom.xml +++ b/pom.xml @@ -150,6 +150,13 @@ under the License. <version>${testng.version}</version> <scope>test</scope> </dependency> + <!-- + <dependency> + <groupId>org.apache.datasketches</groupId> + <artifactId>datasketches-java-common</artifactId> + <version>1.0.0</version> + </dependency> + --> </dependencies> <build> diff --git a/src/main/java/org/apache/datasketches/common/Util.java b/src/main/java/org/apache/datasketches/common/Util.java index 602b40b0b..c9a749e55 100644 --- a/src/main/java/org/apache/datasketches/common/Util.java +++ b/src/main/java/org/apache/datasketches/common/Util.java @@ -24,6 +24,7 @@ import static java.lang.Math.log; import static java.lang.Math.pow; import static java.lang.Math.round; +import static java.util.Arrays.fill; import java.util.Comparator; @@ -217,7 +218,7 @@ public static String nanoSecToString(final long nS) { /** * Returns the given time in milliseconds formatted as Hours:Min:Sec.mSec - * @param mS the given nanoseconds + * @param mS the given milliseconds * @return the given time in milliseconds formatted as Hours:Min:Sec.mSec */ public static String milliSecToString(final long mS) { @@ -244,40 +245,20 @@ public static String zeroPad(final String s, final int fieldLength) { /** * Prepend or postpend the given string 
with the given character to fill the given field length. - * If the given string is equal or greater than the given field length, it will be returned - * without modification. + * If the given string is equal to or greater than the given field length, it will be returned without modification. * @param s the given string * @param fieldLength the desired field length * @param padChar the desired pad character * @param postpend if true append the pacCharacters to the end of the string. - * @return prepended or postpended given string with the given character to fill the given field - * length. + * @return prepended or postpended given string with the given character to fill the given field length. */ - public static String characterPad(final String s, final int fieldLength, final char padChar, - final boolean postpend) { - final char[] chArr = s.toCharArray(); - final int sLen = chArr.length; + public static String characterPad(final String s, final int fieldLength, final char padChar, final boolean postpend) { + final int sLen = s.length(); if (sLen < fieldLength) { - final char[] out = new char[fieldLength]; - final int blanks = fieldLength - sLen; - - if (postpend) { - for (int i = 0; i < sLen; i++) { - out[i] = chArr[i]; - } - for (int i = sLen; i < fieldLength; i++) { - out[i] = padChar; - } - } else { //prepend - for (int i = 0; i < blanks; i++) { - out[i] = padChar; - } - for (int i = blanks; i < fieldLength; i++) { - out[i] = chArr[i - blanks]; - } - } - - return String.valueOf(out); + final char[] cArr = new char[fieldLength - sLen]; + fill(cArr, padChar); + final String addstr = String.valueOf(cArr); + return (postpend) ? s.concat(addstr) : addstr.concat(s); } return s; } @@ -550,56 +531,60 @@ public static double powerSeriesNextDouble(final int ppb, final double curPoint, } /** - * Computes the ceiling power of given <i>base</i> and <i>n</i> as doubles. 
- * This is the smallest positive power - * of <i>base</i> that equal to or greater than the given <i>n</i> and equal to a mathematical integer. + * Returns the ceiling of a given <i>n</i> given a <i>radix</i>, where the ceiling is an integral power of the radix. + * This is the smallest positive power of <i>radix</i> that is equal to or greater than the given <i>n</i> + * and equal to a mathematical integer. * The result of this function is consistent with {@link #ceilingIntPowerOf2(int)} for values * less than one. I.e., if <i>n < 1,</i> the result is 1. * - * @param base The base in the expression ⌈base<sup>n</sup>⌉. + * <p>The formula is: <i>radix<sup>ceiling(log<sub>radix</sub>(n))</sup></i></p> + * + * @param radix The base of the number system. * @param n The input argument. - * @return the ceiling power of <i>base</i> as a double and equal to a mathematical integer. + * @return the ceiling power of <i>radix</i> as a double and equal to a mathematical integer. */ - public static double ceilingPowerBaseOfDouble(final double base, final double n) { + public static double ceilingPowerBaseOfDouble(final double radix, final double n) { final double x = n < 1.0 ? 1.0 : n; - return pow(base, ceil(logBaseOfX(base, x))); + return Math.rint(pow(radix, ceil(logBaseOfX(radix, x)))); } /** - * Computes the floor power of given <i>base</i> and <i>n</i> as doubles. - * This is the largest positive power - * of <i>base</i> that equal to or less than the given n and equal to a mathematical integer. + * Computes the floor of a given <i>n</i> given <i>radix</i>, where the floor is an integral power of the radix. + * This is the largest positive power of <i>radix</i> that is equal to or less than the given <i>n</i> + * and equal to a mathematical integer. * The result of this function is consistent with {@link #floorPowerOf2(int)} for values * less than one. I.e., if <i>n < 1,</i> the result is 1. * - * @param base The base in the expression ⌊base<sup>n</sup>⌋.
+ * <p>The formula is: <i>radix<sup>floor(log<sub>radix</sub>(n))</sup></i></p> + * + * @param radix The base of the number system. * @param n The input argument. * @return the floor power of 2 and equal to a mathematical integer. */ - public static double floorPowerBaseOfDouble(final double base, final double n) { + public static double floorPowerBaseOfDouble(final double radix, final double n) { final double x = n < 1.0 ? 1.0 : n; - return pow(base, floor(logBaseOfX(base, x))); + return Math.rint(pow(radix, floor(logBaseOfX(radix, x)))); } // Logarithm related /** - * The log base 2 of the value + * The log<sub>2</sub>(value) * @param value the given value - * @return The log base 2 of the value + * @return log<sub>2</sub>(value) */ public static double log2(final double value) { return log(value) / LOG2; } /** - * Returns the logarithm_logBase of x. Example: logB(2.0, x) = log(x) / log(2.0). - * @param logBase the base of the logarithm used + * Returns the log<sub>radix</sub>(x). Example: logB(2.0, x) = log(x) / log(2.0). + * @param radix the base of the number system * @param x the given value - * @return the logarithm_logBase of x: Example: logB(2.0, x) = log(x) / log(2.0).
*/ - public static double logBaseOfX(final double logBase, final double x) { - return log(x) / log(logBase); + public static double logBaseOfX(final double radix, final double x) { + return log(x) / log(radix); } /** diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 213544021..7c175512a 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -24,7 +24,6 @@ import static org.apache.datasketches.common.ByteArrayUtil.putDoubleLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -175,21 +174,6 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria return kllDoublesSV.getCDF(splitPoints, searchCrit); } - @Override - public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final double[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final DoublesPartitionBoundaries dpb = new DoublesPartitionBoundaries(); - dpb.N = this.getN(); - dpb.ranks = ranks; - dpb.boundaries = boundaries; - return dpb; - } - @Override public double[] getPMF(final double[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java index 
473d5f1bb..bc18c5347 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllDoublesSketch. The order is not defined. */ -public final class KllDoublesSketchIterator implements QuantilesDoublesSketchIterator { +public final class KllDoublesSketchIterator extends KllSketchIterator implements QuantilesDoublesSketchIterator { private final double[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized; KllDoublesSketchIterator(final double[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized = false; } @Override @@ -45,34 +37,4 @@ public double getQuantile() { return quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index 8f8ae5d63..e8bed53eb 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -27,6 +27,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import 
org.apache.datasketches.quantilescommon.DoublesSortedView; +import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -40,6 +41,9 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final double maxItem; + private final double minItem; /** * Construct from elements for testing. @@ -47,31 +51,44 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. */ - KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN) { + KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN, + final double maxItem, final double minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given KllDoublesSketch. + * @param sketch the given KllDoublesSketch. 
*/ - public KllDoublesSketchSortedView(final KllDoublesSketch sk) { - this.totalN = sk.getN(); - final double[] srcQuantiles = sk.getDoubleItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); + public KllDoublesSketchSortedView(final KllDoublesSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + final double[] srcQuantiles = sketch.getDoubleItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); - if (!sk.isLevelZeroSorted()) { + if (!sketch.isLevelZeroSorted()) { Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1]); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage quantiles = new double[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + final double[] normRanks = new double[numQuantiles]; + for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } @Override @@ -79,34 +96,36 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public double getMaxItem() { + return maxItem; + } + + @Override + public double getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + @Override public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double
naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } @@ -134,8 +153,8 @@ public boolean isEmpty() { } @Override - public KllDoublesSketchSortedViewIterator iterator() { - return new KllDoublesSketchSortedViewIterator(quantiles, cumWeights); + public DoublesSortedViewIterator iterator() { + return new DoublesSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java deleted file mode 100644 index 29131bd2c..000000000 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.kll; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over KllDoublesSketchSortedView - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class KllDoublesSketchSortedViewIterator implements DoublesSortedViewIterator { - - private final double[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - KllDoublesSketchSortedViewIterator(final double[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public double getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index e2e4d808a..5484e8bf1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -24,7 +24,6 @@ import static org.apache.datasketches.common.ByteArrayUtil.putFloatLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -175,21 +174,6 @@ public double[] getCDF(final float[] splitPoints, final QuantileSearchCriteria s return kllFloatsSV.getCDF(splitPoints, searchCrit); } - @Override - public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final float[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final FloatsPartitionBoundaries fpb = new FloatsPartitionBoundaries(); - fpb.N = this.getN(); - fpb.ranks = ranks; - fpb.boundaries = boundaries; - return fpb; - } - @Override 
public double[] getPMF(final float[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java index 8c5808ead..accf039de 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllFloatsSketch. The order is not defined. */ -public final class KllFloatsSketchIterator implements QuantilesFloatsSketchIterator { +public final class KllFloatsSketchIterator extends KllSketchIterator implements QuantilesFloatsSketchIterator { private final float[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized; KllFloatsSketchIterator(final float[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized = false; } @Override @@ -45,34 +37,4 @@ public float getQuantile() { return quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java 
b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index 8f47a8da7..08678503c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -27,6 +27,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -40,6 +41,9 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { private final float[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final float maxItem; + private final float minItem; /** * Construct from elements for testing. @@ -47,44 +51,80 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. */ - KllFloatsSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN) { + KllFloatsSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN, + final float maxItem, final float minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given KllFloatsSketch. 
+ * @param sketch the given KllFloatsSketch. */ - public KllFloatsSketchSortedView(final KllFloatsSketch sk) { - this.totalN = sk.getN(); - final float[] srcQuantiles = sk.getFloatItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); + public KllFloatsSketchSortedView(final KllFloatsSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + final float[] srcQuantiles = sketch.getFloatItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); - if (!sk.isLevelZeroSorted()) { + if (!sketch.isLevelZeroSorted()) { Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1]); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage quantiles = new float[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } + //end of constructors + @Override public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public float getMaxItem() { + return maxItem; + } + + @Override + public float getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double
naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -116,8 +156,8 @@ public boolean isEmpty() { } @Override - public KllFloatsSketchSortedViewIterator iterator() { - return new KllFloatsSketchSortedViewIterator(quantiles, cumWeights); + public FloatsSortedViewIterator iterator() { + return new FloatsSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java deleted file mode 100644 index 87c2e88bd..000000000 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.kll; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over KllFloatsSketchSortedView - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class KllFloatsSketchSortedViewIterator implements FloatsSortedViewIterator { - - private final float[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - KllFloatsSketchSortedViewIterator(final float[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public float getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index d5f73b00d..f0e923fbd 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -23,7 +23,6 @@ import static java.lang.Math.min; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.ITEMS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.lang.reflect.Array; import java.util.Comparator; @@ -34,7 +33,10 @@ import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.MemoryRequestServer; import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; import org.apache.datasketches.quantilescommon.QuantilesGenericSketchIterator; @@ -46,7 +48,7 @@ * @see org.apache.datasketches.kll.KllSketch */ @SuppressWarnings("unchecked") -public abstract class KllItemsSketch<T> extends KllSketch implements 
QuantilesGenericAPI<T> { +public abstract class KllItemsSketch<T> extends KllSketch implements QuantilesGenericAPI<T>, PartitioningFeature<T> { private KllItemsSketchSortedView<T> kllItemsSV = null; final Comparator<? super T> comparator; final ArrayOfItemsSerDe<T> serDe; @@ -150,18 +152,11 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc } @Override - public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallyWeighted, + public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final Object[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final GenericPartitionBoundaries<T> gpb = new GenericPartitionBoundaries<>(); - gpb.N = this.getN(); - gpb.ranks = ranks; - gpb.boundaries = (T[])boundaries; - return gpb; + if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); } + refreshSortedView(); + return kllItemsSV.getPartitionBoundaries(numEquallySized, searchCrit); } @Override diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java index 4adb9d79b..3a0a8da0f 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllItemsSketch. The order is not defined.
*/ -public final class KllItemsSketchIterator<T> implements QuantilesGenericSketchIterator<T> { +public final class KllItemsSketchIterator<T> extends KllSketchIterator implements QuantilesGenericSketchIterator<T> { private final Object[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized_; KllItemsSketchIterator(final Object[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized_ = false; } @SuppressWarnings("unchecked") @@ -46,34 +38,4 @@ public T getQuantile() { return (T)quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized_) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized_ = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index 7c066dff1..4b901f54a 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -22,6 +22,7 @@ import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static 
org.apache.datasketches.quantilescommon.QuantilesUtil.evenlySpacedDoubles; import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; @@ -31,10 +32,13 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -43,13 +47,15 @@ * @author Alexander Saydakov * @author Lee Rhodes */ -@SuppressWarnings("unchecked") -public class KllItemsSketchSortedView<T> implements GenericSortedView<T> { - private final Object[] quantiles; +public class KllItemsSketchSortedView<T> implements GenericSortedView<T>, PartitioningFeature<T> { + private final T[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final Comparator<? super T> comparator; + private final T maxItem; private final T minItem; - private final Comparator<? super T> comp; + private final Class<T> clazz; + private final double[] normRanks; /** * Construct from elements for testing only. 
@@ -59,49 +65,59 @@ public class KllItemsSketchSortedView<T> implements GenericSortedView<T> { * @param minItem used to extract the type of T * @param comparator the Comparator for type T */ + @SuppressWarnings("unchecked") KllItemsSketchSortedView( final T[] quantiles, final long[] cumWeights, final long totalN, - final T minItem, - final Comparator<? super T> comparator) { + final Comparator<? super T> comparator, + final T maxItem, + final T minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.comparator = comparator; + this.maxItem = maxItem; this.minItem = minItem; - this.comp = comparator; + this.clazz = (Class<T>)quantiles[0].getClass(); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** * Constructs this Sorted View given the sketch - * @param sk the given KllItemsSketch. + * @param sketch the given KllItemsSketch. */ - KllItemsSketchSortedView(final KllItemsSketch<T> sk) { - this.totalN = sk.getN(); - this.minItem = sk.getMinItem(); - final Object[] srcQuantiles = sk.getTotalItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); - this.comp = sk.comparator; + @SuppressWarnings("unchecked") + KllItemsSketchSortedView(final KllItemsSketch<T> sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + final T[] srcQuantiles = sketch.getTotalItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); + this.comparator = sketch.comparator; + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + this.clazz = (Class<T>)sketch.serDe.getClassOfT(); if (totalN == 0) { throw new SketchesArgumentException(EMPTY_MSG); } - if (!sk.isLevelZeroSorted()) { - Arrays.sort((T[])srcQuantiles, srcLevels[0], srcLevels[1], comp); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.isLevelZeroSorted()) { + 
Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1], comparator); + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage - quantiles = new Object[numQuantiles]; + quantiles = (T[]) Array.newInstance(sketch.serDe.getClassOfT(), numQuantiles); cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } //end of constructors - @Override //implemented here because it needs the comparator + @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - GenericSortedView.validateItems(splitPoints, comp); + GenericSortedView.validateItems(splitPoints, comparator); final int len = splitPoints.length + 1; final double[] buckets = new double[len]; for (int i = 0; i < len - 1; i++) { @@ -116,10 +132,66 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } - @Override //implemented here because it needs the comparator + @Override + public T getMaxItem() { + return maxItem; + } + + @Override + public T getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + + @Override + @SuppressWarnings("unchecked") + public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final long totalN = this.totalN; + final int svLen = cumWeights.length; + //adjust ends of sortedView arrays + cumWeights[0] = 1L; + cumWeights[svLen - 1] = totalN; + normRanks[0] = 1.0 / totalN; + normRanks[svLen - 1] = 1.0; + quantiles[0] = this.getMinItem(); + quantiles[svLen - 1] = this.getMaxItem(); + + final 
double[] evSpNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1); + final int len = evSpNormRanks.length; + final T[] evSpQuantiles = (T[]) Array.newInstance(clazz, len); + + final long[] evSpNatRanks = new long[len]; + for (int i = 0; i < len; i++) { + final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); + evSpQuantiles[i] = getQuantileFromIndex(index); + evSpNatRanks[i] = getCumWeightFromIndex(index); + } + final GenericPartitionBoundaries<T> gpb = new GenericPartitionBoundaries<>( + this.totalN, + evSpQuantiles.clone(), + evSpNatRanks.clone(), + evSpNormRanks.clone(), + getMaxItem(), + getMinItem(), + searchCrit); + return gpb; + } + + @Override public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - GenericSortedView.validateItems(splitPoints, comp); + GenericSortedView.validateItems(splitPoints, comparator); final double[] buckets = getCDF(splitPoints, searchCrit); final int len = buckets.length; for (int i = len; i-- > 1; ) { @@ -132,35 +204,36 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); + final int index = getQuantileIndex(rank, searchCrit); + return getQuantileFromIndex(index); + } + + private T getQuantileFromIndex(final int index) { return quantiles[index]; } + + private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } + + private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); - if (index == -1) { - return (T) quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return (T) quantiles[index]; + if (index == -1) { return len - 1; } + return index; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return (T) quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + @SuppressWarnings("unchecked") + public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final int len = ranks.length; + final T[] quants = (T[]) Array.newInstance(clazz, len); + for (int i = 0; i < len; i++) { + quants[i] = getQuantile(ranks[i], searchCrit); } - return (T) quantiles[index]; + return quants; } @Override + @SuppressWarnings("unchecked") public T[] getQuantiles() { final T[] quants = (T[]) Array.newInstance(minItem.getClass(), quantiles.length); System.arraycopy(quantiles, 0, quants, 0, quantiles.length); @@ -172,7 +245,7 @@ public double getRank(final T quantile, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new 
SketchesArgumentException(EMPTY_MSG); } final int len = quantiles.length; final Inequality crit = (searchCrit == INCLUSIVE) ? Inequality.LE : Inequality.LT; - final int index = find((T[])quantiles, 0, len - 1, quantile, crit, comp); + final int index = find(quantiles, 0, len - 1, quantile, crit, comparator); if (index == -1) { return 0; //EXCLUSIVE (LT) case: quantile <= minQuantile; INCLUSIVE (LE) case: quantile < minQuantile } @@ -185,12 +258,19 @@ public boolean isEmpty() { } @Override - public KllItemsSketchSortedViewIterator<T> iterator() { - return new KllItemsSketchSortedViewIterator<>((T[])quantiles, cumWeights); + public GenericSortedViewIterator<T> iterator() { + return new GenericSortedViewIterator<>(quantiles, cumWeights); } //restricted methods + private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + return normRanks; + } + private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels, final int srcNumLevels, final int numItems) { final int[] myLevels = new int[srcNumLevels + 1]; @@ -212,7 +292,7 @@ private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLeve weight *= 2; } final int numLevels = dstLevel; - blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, comp); //create unit weights + blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, comparator); //create unit weights KllHelper.convertToCumulative(cumWeights); } @@ -255,6 +335,7 @@ private static <T> void blockyTandemMergeSortRecursion( startingLevel2, numLevels2, comp); } + @SuppressWarnings("unchecked") private static <T> void tandemMerge( final Object[] quantilesSrc, final long[] weightsSrc, final Object[] quantilesDst, final long[] weightsDst, @@ -290,15 +371,4 @@ private static <T> void tandemMerge( } } - /** - * 
Iterator over KllItemsSketchSortedView. - * @param <T> type of quantile (item) - */ - public static final class KllItemsSketchSortedViewIterator<T> extends GenericSortedViewIterator<T> { - - KllItemsSketchSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - super(quantiles, cumWeights); - } - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java new file mode 100644 index 000000000..feaf33f53 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import org.apache.datasketches.quantilescommon.QuantilesSketchIterator; + +/** + * The base implementation for the KLL sketch iterator hierarchy used for viewing the + * non-ordered quantiles retained by a sketch. 
+ * + * <p>Prototype example of the recommended iteration loop:</p> + * <pre>{@code + * SketchIterator itr = sketch.iterator(); + * while (itr.next()) { + * ...get*(); + * } + * }</pre> + * + * @author Lee Rhodes + */ +public class KllSketchIterator implements QuantilesSketchIterator { + protected final int[] levelsArr; + protected final int numLevels; + protected int level; + protected int index; + protected long weight; + protected boolean isInitialized_; + + KllSketchIterator(final int[] levelsArr, final int numLevels) { + this.levelsArr = levelsArr; + this.numLevels = numLevels; + this.isInitialized_ = false; + } + + @Override + public long getWeight() { + return weight; + } + + @Override + public boolean next() { + if (!isInitialized_) { + level = 0; + index = levelsArr[level]; + weight = 1; + isInitialized_ = true; + } else { + index++; + } + if (index < levelsArr[level + 1]) { + return true; + } + // go to the next non-empty level + do { + level++; + if (level == numLevels) { + return false; // run out of levels + } + weight *= 2; + } while (levelsArr[level] == levelsArr[level + 1]); + index = levelsArr[level]; + return true; + } + +} diff --git a/src/main/java/org/apache/datasketches/partitions/BoundsRule.java b/src/main/java/org/apache/datasketches/partitions/BoundsRule.java new file mode 100644 index 000000000..68dc87bc1 --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/BoundsRule.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +public enum BoundsRule { + + /** + * Include both the upper and lower bounds + */ + INCLUDE_BOTH, + + /** + * Include only the upper bound but not the lower bound + */ + INCLUDE_UPPER, + /** + * Include only the lower bound but not the upper bound + */ + INCLUDE_LOWER +} diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java new file mode 100644 index 000000000..65577385a --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.partitions; + +import static java.lang.Math.ceil; +import static java.lang.Math.log; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.pow; +import static java.lang.Math.round; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; +import org.apache.datasketches.quantilescommon.Stack; + +/** + * A partitioning process that can partition very large data sets into thousands to millions + * of partitions of approximately the same size. + * @param <T> the data type + * @param <S> the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. + */ +//@SuppressWarnings("unused") +public class Partitioner<T, S extends QuantilesGenericAPI<T> & PartitioningFeature<T>> { + private static final QuantileSearchCriteria defaultCriteria = INCLUSIVE; + private final long tgtPartitionSize; + private final int maxPartsPerSk; + private final SketchFillRequest<T, S> fillReq; + private final QuantileSearchCriteria criteria; + private final Stack<StackElement<T>> stack = new Stack<>(); + + //computed once at the beginning + private int numLevels; + private int partitionsPerSk; + //output + private final List<PartitionBoundsRow<T>> finalPartitionList = new ArrayList<>(); + + /** + * This constructor assumes a QuantileSearchCriteria of INCLUSIVE. + * @param tgtPartitionSize the target size of the resulting partitions in number of items.
+ * @param maxPartsPerPass The maximum number of partitions to request from the sketch. The smaller this number is + * the smaller the variance will be of the resulting partitions, but this will increase the number of passes of the + * source data set. + * @param fillReq This is an implementation of the SketchFillRequest call-back supplied by the user and implements + * the SketchFillRequest interface. + */ + public Partitioner( + final long tgtPartitionSize, + final int maxPartsPerPass, + final SketchFillRequest<T,S> fillReq) { + this(tgtPartitionSize, maxPartsPerPass, fillReq, defaultCriteria); + } + + /** + * This constructor includes the QuantileSearchCriteria criteria as a parameter. + * @param tgtPartitionSize the target size of the resulting partitions in number of items. + * @param maxPartsPerSk The maximum number of partitions to request from the sketch. The smaller this number is + * the smaller the variance will be of the resulting partitions, but this will increase the number of passes of the + * source data set. + * @param fillReq This is an implementation of the SketchFillRequest call-back supplied by the user. + * @param criteria This is the desired QuantileSearchCriteria to be used. + */ + public Partitioner( + final long tgtPartitionSize, + final int maxPartsPerSk, + final SketchFillRequest<T,S> fillReq, + final QuantileSearchCriteria criteria) { + this.tgtPartitionSize = tgtPartitionSize; + this.maxPartsPerSk = maxPartsPerSk; + this.fillReq = fillReq; + this.criteria = criteria; + } + + /** + * This initiates the partitioning process + * @param sk A sketch of the entire data set.
+ * @return the final partitioning list + */ + public List<PartitionBoundsRow<T>> partition(final S sk) { + if (sk.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + final long inputN = sk.getN(); + final double guessNumParts = max(1.0, ceil((double)inputN / tgtPartitionSize)); + this.numLevels = (int)max(1, ceil(log(guessNumParts) / log(maxPartsPerSk))); + final int partsPerSk = (int)round(pow(guessNumParts, 1.0 / numLevels)); + this.partitionsPerSk = min(partsPerSk, maxPartsPerSk); + final GenericPartitionBoundaries<T> gpb = sk.getPartitionBoundaries(partitionsPerSk, criteria); + final StackElement<T> se = new StackElement<>(gpb, stack.size() + 1, 0, "1"); + stack.push(se); + partitionSearch(stack); + return finalPartitionList; + } + + private void partitionSearch(final Stack<StackElement<T>> stack) { + if (stack.isEmpty()) { + return; + } + final StackElement<T> se = stack.peek(); + final GenericPartitionBoundaries<T> gpb = se.gpb; + final int numParts = gpb.getNumPartitions(); + + if (stack.size() == numLevels) { //at max level + while (++se.part <= numParts) { //add rows to final partition list + final PartitionBoundsRow<T> row = new PartitionBoundsRow<>(se); + finalPartitionList.add(row); + } + stack.pop(); + partitionSearch(stack); + } + else { //not at max level + if (++se.part <= numParts) { + final PartitionBoundsRow<T> row = new PartitionBoundsRow<>(se); + final S sk = fillReq.getRange(row.lowerBound, row.upperBound, row.rule); + final GenericPartitionBoundaries<T> gpb2 = sk.getPartitionBoundaries(this.partitionsPerSk, criteria); + final int level = stack.size() + 1; + final String partId = se.partId + "." 
+ se.part + "," + level; + final StackElement<T> se2 = new StackElement<>(gpb2, level, 0, partId); + stack.push(se2); + partitionSearch(stack); + } + //done with all parts at this level + if (stack.isEmpty()) { + return; + } + stack.pop(); + partitionSearch(stack); + } + } + + /** + * Holds data for a Stack element + */ + public static class StackElement<T> { + public final GenericPartitionBoundaries<T> gpb; + public int part; + public String partId; + + public StackElement(final GenericPartitionBoundaries<T> gpb, final int level, final int part, final String partId) { + this.gpb = gpb; + this.part = part; + this.partId = partId; + } + } + + /** + * Defines a row for List of PartitionBounds. + */ + public static class PartitionBoundsRow<T> { + public int part; + public String partId; + public long approxNumDeltaItems; + public BoundsRule rule; + public T lowerBound; + public T upperBound; + + public PartitionBoundsRow(final StackElement<T> se) { + final GenericPartitionBoundaries<T> gpb = se.gpb; + this.part = se.part; + this.partId = se.partId + "." 
+ part; + final QuantileSearchCriteria searchCrit = gpb.getSearchCriteria(); + final T[] boundaries = gpb.getBoundaries(); + final int numParts = gpb.getNumPartitions(); + if (searchCrit == INCLUSIVE) { + if (part == 1) { + lowerBound = gpb.getMinItem(); + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_BOTH; + } else { + lowerBound = boundaries[part - 1]; + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_UPPER; + } + } else { //EXCLUSIVE + if (part == numParts) { + lowerBound = boundaries[part - 1]; + upperBound = gpb.getMaxItem(); + rule = BoundsRule.INCLUDE_BOTH; + } else { + lowerBound = boundaries[part - 1]; + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_LOWER; + } + } + approxNumDeltaItems = gpb.getNumDeltaItems()[part]; + } + } + +} diff --git a/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java b/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java new file mode 100644 index 000000000..d005561d0 --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.partitions; + +import org.apache.datasketches.quantilescommon.PartitioningFeature; +import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; + +/** + * This is a callback request to the data source to fill a quantiles sketch, + * which is returned to the caller. + * + * @author Lee Rhodes + */ +public interface SketchFillRequest<T, S extends QuantilesGenericAPI<T> & PartitioningFeature<T>> { + + /** + * This is a callback request to the data source to fill a quantiles sketch + * with a range of data between upper and lower bounds. Which of these bounds are to be included is determined by + * the <i>BoundsRule</i>. + * + * <p>This range of data may or may not be subsequently further partitioned.</p> + * @param lowerQuantile the lowest quantile of a range + * @param upperQuantile the highest quantile of a range + * @param boundsRule determines which quantile bounds to include + * @return a quantiles sketch filled from the given upper and lower bounds. + */ + public S getRange(final T lowerQuantile, final T upperQuantile, final BoundsRule boundsRule); + +} diff --git a/src/main/java/org/apache/datasketches/partitions/package-info.java b/src/main/java/org/apache/datasketches/partitions/package-info.java new file mode 100644 index 000000000..cee11ec1d --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * + */ +package org.apache.datasketches.partitions; diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java index b3a78d5af..bbcdf44f7 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.quantiles.ClassicUtil.checkK; import static org.apache.datasketches.quantiles.ClassicUtil.computeNumLevelsNeeded; import static org.apache.datasketches.quantiles.ClassicUtil.computeRetainedItems; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Random; @@ -170,21 +169,6 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria @Override public abstract double getMinItem(); - @Override - public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final double[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final DoublesPartitionBoundaries dpb = new DoublesPartitionBoundaries(); - dpb.N = this.getN(); - dpb.ranks = ranks; - dpb.boundaries = boundaries; - return dpb; - } - @Override public double[] getPMF(final double[] 
splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java index ef250fe5f..b746bae15 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java @@ -27,8 +27,10 @@ import java.util.Arrays; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; import org.apache.datasketches.quantilescommon.DoublesSortedView; +import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -42,6 +44,9 @@ public final class DoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final double maxItem; + private final double minItem; /** * Construct from elements for testing. @@ -49,10 +54,17 @@ public final class DoublesSketchSortedView implements DoublesSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. 
*/ - DoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN) { + DoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN, + final double maxItem, final double minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** @@ -60,7 +72,10 @@ public final class DoublesSketchSortedView implements DoublesSortedView { * @param sketch the given Classic Quantiles DoublesSketch */ public DoublesSketchSortedView(final DoublesSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); final int k = sketch.getK(); final int numQuantiles = sketch.getNumRetained(); quantiles = new double[numQuantiles]; @@ -77,6 +92,34 @@ public DoublesSketchSortedView(final DoublesSketch sketch) { if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. 
TotalN does not match cumWeights."); } + final double[] normRanks = new double[numQuantiles]; + for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; + } + + @Override + public long[] getCumulativeWeights() { + return cumWeights.clone(); + } + + @Override + public double getMaxItem() { + return maxItem; + } + + @Override + public double getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); } @Override @@ -84,29 +127,11 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } @@ -123,11 +148,6 @@ public double getRank(final double quantile, final QuantileSearchCriteria search return (double)cumWeights[index] / totalN; } - @Override - public long[] getCumulativeWeights() { - return cumWeights.clone(); - } - @Override public double[] getQuantiles() { return quantiles.clone(); @@ -139,8 +159,8 @@ public boolean isEmpty() { } @Override - public DoublesSketchSortedViewIterator iterator() { - return new DoublesSketchSortedViewIterator(quantiles, cumWeights); + public DoublesSortedViewIterator iterator() { + return new DoublesSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java deleted file mode 100644 index f834fb2aa..000000000 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.quantiles; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over DoublesSketchSortedView. - */ -public final class DoublesSketchSortedViewIterator implements DoublesSortedViewIterator { - - private final double[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - DoublesSketchSortedViewIterator(final double[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public double getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java index 64f66fde2..6b247347a 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java @@ -36,10 +36,7 @@ import static org.apache.datasketches.quantiles.PreambleUtil.extractN; import static org.apache.datasketches.quantiles.PreambleUtil.extractPreLongs; import static org.apache.datasketches.quantiles.PreambleUtil.extractSerVer; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedLongs; -import java.lang.reflect.Array; import java.util.Arrays; import java.util.Comparator; import java.util.Objects; @@ -49,7 +46,8 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; -import org.apache.datasketches.quantilescommon.GenericSortedView; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesAPI; import 
org.apache.datasketches.quantilescommon.QuantilesGenericAPI; @@ -74,25 +72,13 @@ * * @param <T> The sketch data type */ -public final class ItemsSketch<T> implements QuantilesGenericAPI<T> { - +public final class ItemsSketch<T> implements QuantilesGenericAPI<T>, PartitioningFeature<T> { final Class<T> clazz; - private final Comparator<? super T> comparator_; - final int k_; - long n_; - - /** - * The largest item ever seen in the stream. - */ - T maxItem_; - - /** - * The smallest item ever seen in the stream. - */ - T minItem_; + T maxItem_; //The largest item ever seen in the stream. + T minItem_; //The smallest item ever seen in the stream. /** * In the initial on-heap version, equals combinedBuffer_.length. @@ -132,7 +118,7 @@ public final class ItemsSketch<T> implements QuantilesGenericAPI<T> { /** * Setting the seed makes the results of the sketch deterministic if the input items are * received in exactly the same order. This is only useful when performing test comparisons, - * otherwise is not recommended. + * otherwise, it is not recommended. 
*/ public static final Random rand = new Random(); @@ -220,7 +206,6 @@ public static <T> ItemsSketch<T> getInstance( final boolean empty = checkPreLongsFlagsCap(preambleLongs, flags, memCapBytes); checkFamilyID(familyID); - final ItemsSketch<T> sk = getInstance(clazz, k, comparator); //checks k if (empty) { return sk; } @@ -265,10 +250,7 @@ static <T> ItemsSketch<T> copy(final ItemsSketch<T> sketch) { return qsCopy; } - @Override - public double[] getCDF(final T[] splitPoints) { - return getCDF(splitPoints, INCLUSIVE); - } + //END of Constructors @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { @@ -295,25 +277,11 @@ public T getMinItem() { } @Override - public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallyWeighted, + public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } refreshSortedView(); - final long[] weights = equallySpacedLongs(1, getN(), numEquallyWeighted); - final T[] boundaries = getQuantiles(weights, searchCrit); - final GenericPartitionBoundaries<T> gpb = new GenericPartitionBoundaries<>(); - gpb.N = this.getN(); - gpb.boundaries = boundaries; - gpb.weights = weights; - final double[] ranks = new double[weights.length]; - for (int i = 0; i < weights.length; i++) { ranks[i] = (double)weights[i] / getN(); } - gpb.ranks = ranks; - return gpb; - } - - @Override - public double[] getPMF(final T[] splitPoints) { - return getPMF(splitPoints, INCLUSIVE); + return classicQisSV.getPartitionBoundaries(numEquallySized, searchCrit); } @Override @@ -323,11 +291,6 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc return classicQisSV.getPMF(splitPoints, searchCrit); } - @Override - public T getQuantile(final double rank) { - return getQuantile(rank, INCLUSIVE); - } - @Override public T getQuantile(final 
double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } @@ -348,36 +311,10 @@ public T getQuantileUpperBound(final double rank) { } @Override - public T[] getQuantiles(final double[] ranks) { - return getQuantiles(ranks, INCLUSIVE); - } - - @Override - @SuppressWarnings("unchecked") public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } refreshSortedView(); - final int len = ranks.length; - final T[] quantiles = (T[]) Array.newInstance(minItem_.getClass(), len); - for (int i = 0; i < len; i++) { - quantiles[i] = classicQisSV.getQuantile(ranks[i], searchCrit); - } - return quantiles; - } - - @SuppressWarnings("unchecked") - private T[] getQuantiles(final long[] weights, final QuantileSearchCriteria crit) { - final int len = weights.length; - final T[] quantiles = (T[]) Array.newInstance(minItem_.getClass(), len); - for (int i = 0; i < len; i++) { - quantiles[i] = classicQisSV.getQuantile(weights[i], crit); - } - return quantiles; - } - - @Override - public double getRank(final T quantile) { - return getRank(quantile, INCLUSIVE); + return classicQisSV.getQuantiles(ranks, searchCrit); } @Override @@ -397,11 +334,6 @@ public double getRankUpperBound(final double rank) { return min(1.0, rank + getNormalizedRankError(k_, false)); } - @Override - public double[] getRanks(final T[] quantiles) { - return getRanks(quantiles, INCLUSIVE); - } - @Override public double[] getRanks(final T[] quantiles, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } @@ -522,11 +454,6 @@ public byte[] toByteArray(final boolean ordered, final ArrayOfItemsSerDe<T> serD return ItemsByteArrayImpl.toByteArray(this, ordered, serDe); } - @Override - public String toString() { - return toString(true, false); - } - /** * Returns summary 
information about this sketch. Used for debugging. * @param sketchSummary if true includes sketch summary @@ -592,7 +519,7 @@ public void putMemory(final WritableMemory dstMem, final ArrayOfItemsSerDe<T> se } @Override - public GenericSortedView<T> getSortedView() { + public ItemsSketchSortedView<T> getSortedView() { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } return refreshSortedView(); } diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index 68ec30e36..869b68021 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -22,18 +22,23 @@ import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.evenlySpacedDoubles; import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; import java.util.Arrays; import java.util.Comparator; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import 
org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -42,11 +47,15 @@ * @author Kevin Lang * @author Alexander Saydakov */ -public class ItemsSketchSortedView<T> implements GenericSortedView<T> { +public class ItemsSketchSortedView<T> implements GenericSortedView<T>, PartitioningFeature<T> { private final T[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; private final Comparator<? super T> comparator; + private final T maxItem; + private final T minItem; + private final Class<T> clazz; + private final double[] normRanks; /** * Construct from elements for testing. @@ -55,15 +64,22 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { * @param totalN the total number of items presented to the sketch. * @param comparator comparator for type T */ + @SuppressWarnings("unchecked") ItemsSketchSortedView( final T[] quantiles, - final long[] cumWeights, + final long[] cumWeights, //or Natural Ranks final long totalN, - final Comparator<T> comparator) { + final Comparator<T> comparator, + final T maxItem, + final T minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; this.comparator = comparator; + this.maxItem = maxItem; + this.minItem = minItem; + this.clazz = (Class<T>)quantiles[0].getClass(); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** @@ -72,12 +88,16 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { */ @SuppressWarnings("unchecked") ItemsSketchSortedView(final ItemsSketch<T> sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } this.totalN = sketch.getN(); final int k = sketch.getK(); final int numQuantiles = sketch.getNumRetained(); - quantiles = (T[]) Array.newInstance(sketch.clazz, numQuantiles); + this.quantiles = (T[]) Array.newInstance(sketch.clazz, 
numQuantiles); + this.minItem = sketch.minItem_; + this.maxItem = sketch.maxItem_; cumWeights = new long[numQuantiles]; comparator = sketch.getComparator(); + clazz = sketch.clazz; final Object[] combinedBuffer = sketch.getCombinedBuffer(); final int baseBufferCount = sketch.getBaseBufferCount(); @@ -94,9 +114,12 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. TotalN does not match cumWeights."); } + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } - @Override //implemented here because it needs the comparator + //end of constructors + + @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); @@ -114,7 +137,62 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } - @Override //implemented here because it needs the comparator + @Override + public T getMaxItem() { + return maxItem; + } + + @Override + public T getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + + @Override + @SuppressWarnings("unchecked") + public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final long totalN = this.totalN; + final int svLen = cumWeights.length; + //adjust ends of sortedView arrays + cumWeights[0] = 1L; + cumWeights[svLen - 1] = totalN; + normRanks[0] = 1.0 / totalN; + normRanks[svLen - 1] = 1.0; + quantiles[0] = this.getMinItem(); + quantiles[svLen - 1] = this.getMaxItem(); + + final double[] evSpNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1); + 
final int len = evSpNormRanks.length; + final T[] evSpQuantiles = (T[]) Array.newInstance(clazz, len); + final long[] evSpNatRanks = new long[len]; + for (int i = 0; i < len; i++) { + final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); + evSpQuantiles[i] = getQuantileFromIndex(index); + evSpNatRanks[i] = getCumWeightFromIndex(index); + } + final GenericPartitionBoundaries<T> gpb = new GenericPartitionBoundaries<>( + this.totalN, + evSpQuantiles.clone(), + evSpNatRanks.clone(), + evSpNormRanks.clone(), + getMaxItem(), + getMinItem(), + searchCrit); + return gpb; + } + + @Override public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); @@ -130,32 +208,32 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); + final int index = getQuantileIndex(rank, searchCrit); + return getQuantileFromIndex(index); + } + + private T getQuantileFromIndex(final int index) { return quantiles[index]; } + + private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } + + private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; + if (index == -1) { return len - 1; } + return index; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + @SuppressWarnings("unchecked") + public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final int len = ranks.length; + final T[] quants = (T[]) Array.newInstance(clazz, len); + for (int i = 0; i < len; i++) { + quants[i] = getQuantile(ranks[i], searchCrit); } - return quantiles[index]; + return quants; } @Override @@ -181,8 +259,8 @@ public boolean isEmpty() { } @Override - public ItemsSketchSortedViewIterator<T> iterator() { - return new ItemsSketchSortedViewIterator<>(quantiles, cumWeights); + public GenericSortedViewIterator<T> iterator() { + return new GenericSortedViewIterator<>(quantiles, cumWeights); } //restricted methods @@ -236,6 +314,13 @@ private final static <T> 
void populateFromItemsSketch( Arrays.sort(quantilesArr, startOfBaseBufferBlock, numQuantiles, comparator); } + private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + return normRanks; + } + /** * Convert the individual weights into cumulative weights. * An array of {1,1,1,1} becomes {1,2,3,4} @@ -251,15 +336,4 @@ private static long convertToCumulative(final long[] array) { return subtotal; } - /** - * Iterator over ItemsSketchSortedView. - * @param <T> type of quantile (item) - */ - public static final class ItemsSketchSortedViewIterator<T> extends GenericSortedViewIterator<T> { - - ItemsSketchSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - super(quantiles, cumWeights); - } - } - } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java index 8c299321e..bdc3cc75c 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java @@ -20,7 +20,7 @@ package org.apache.datasketches.quantilescommon; /** - * The Sorted View for quantiles of primitive type double. + * The Sorted View for quantile sketches of primitive type double. * @see SortedView * @author Alexander Saydakov * @author Lee Rhodes @@ -71,6 +71,24 @@ default double[] getCDF(double[] splitPoints, QuantileSearchCriteria searchCrit) return buckets; } + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + double getMaxItem(); + + /** + * Returns the minimum item of the stream. 
This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + double getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], diff --git a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java index df9c41f23..da112dc2e 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java @@ -20,12 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * The quantiles SortedView iterator for type double. - * @see SortedViewIterator - * @author Alexander Saydakov - * @author Lee Rhodes + * Iterator over quantile sketches of primitive type <i>double</i>. */ -public interface DoublesSortedViewIterator extends SortedViewIterator { +public final class DoublesSortedViewIterator extends SortedViewIterator { + private final double[] quantiles; + + public DoublesSortedViewIterator(final double[] quantiles, final long[] cumWeights) { + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + } /** * Gets the quantile at the current index. @@ -35,7 +38,8 @@ public interface DoublesSortedViewIterator extends SortedViewIterator { * * @return the quantile at the current index. 
*/ - double getQuantile(); + public double getQuantile() { + return quantiles[index]; + } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java index 7127b5928..0a0c54b5a 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java @@ -71,6 +71,24 @@ default double[] getCDF(float[] splitPoints, QuantileSearchCriteria searchCrit) return buckets; } + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + float getMaxItem(); + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + float getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], diff --git a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java index ff6203f45..a40bacef1 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java @@ -20,12 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * The quantiles SortedView Iterator for type float. - * @see SortedViewIterator - * @author Alexander Saydakov - * @author Lee Rhodes + * Iterator over quantile sketches of primitive type <i>float</i>. 
*/ -public interface FloatsSortedViewIterator extends SortedViewIterator { +public final class FloatsSortedViewIterator extends SortedViewIterator { + private final float[] quantiles; + + public FloatsSortedViewIterator(final float[] quantiles, final long[] cumWeights) { + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + } /** * Gets the quantile at the current index. @@ -35,7 +38,8 @@ public interface FloatsSortedViewIterator extends SortedViewIterator { * * @return the quantile at the current index. */ - float getQuantile(); + public float getQuantile() { + return quantiles[index]; + } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java new file mode 100644 index 000000000..733f7846d --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.quantilescommon; + +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import org.apache.datasketches.common.SketchesStateException; + +/** + * Implements PartitionBoundaries + */ +public class GenericPartitionBoundaries<T> implements PartitionBoundaries { + private long totalN; //totalN of source sketch + private T[] boundaries; //quantiles at the boundaries + private long[] natRanks; //natural ranks at the boundaries + private double[] normRanks; //normalized ranks at the boundaries + private T maxItem; //of the source sketch + private T minItem; //of the source sketch + private QuantileSearchCriteria searchCrit; //of the source sketch query to getPartitionBoundaries. + //computed + private long[] numDeltaItems; //num of items in each part + private int numPartitions; //num of partitions + + public GenericPartitionBoundaries( + final long totalN, + final T[] boundaries, + final long[] natRanks, + final double[] normRanks, + final T maxItem, + final T minItem, + final QuantileSearchCriteria searchCrit) { + this.totalN = totalN; + this.boundaries = boundaries; + this.natRanks = natRanks; + this.normRanks = normRanks; + this.maxItem = maxItem; + this.minItem = minItem; + this.searchCrit = searchCrit; + //check and compute + final int len = boundaries.length; + if (len < 2) { throw new SketchesStateException("Source sketch is empty"); } + numDeltaItems = new long[len]; + numDeltaItems[0] = 0; // index 0 is always 0 + for (int i = 1; i < len; i++) { + final int addOne = ( (i == 1 && (this.searchCrit == INCLUSIVE)) + || ((i == (len - 1)) && this.searchCrit == EXCLUSIVE) ) ? 
1 : 0; + numDeltaItems[i] = natRanks[i] - natRanks[i - 1] + addOne; + } + this.numPartitions = len - 1; + } + + @Override + public long getN() { return totalN; } + + /** + * Gets an ordered array of boundaries that sequentially define the upper and lower boundaries of partitions. + * These partitions are to be constructed by an external process. Each boundary is essentially a reference and + * should uniquely identify an item or a set of identical items from the original stream of data fed to the + * originating sketch. + * + * <p>Assume boundaries array has size N + 1. Let the indices be sequentially numbered from 0 to N. + * The number of partitions is always one less than the size of the boundaries array. + * Let the partitions be sequentially numbered from 1 to N. + * + * <p>If these results were computed using QuantileSearchCriteria.INCLUSIVE then these sequential boundaries + * are to be interpreted as follows: + * <ul> + * <li>Partition 1: include all items >= index 0 and <= index 1.</li> + * <li>Partition 2: include all items > index 1 and <= index 2.</li> + * <li>Partition N: include all items > index N-1 and <= index N.</li> + * </ul> + * + * <p>If these results were computed using QuantileSearchCriteria.EXCLUSIVE then these sequential boundaries + * are to be interpreted as follows: + * <ul> + * <li>Partition 1: include all items >= index 0 and < index 1.</li> + * <li>Partition 2: include all items >= index 1 and < index 2.</li> + * <li>Partition N: include all items >= index N-1 and <= index N.</li> + * </ul> + * + * @return an array of boundaries that sequentially define the upper and lower boundaries of partitions. 
+ */ + public T[] getBoundaries() { return boundaries; } + + @Override + public long[] getNaturalRanks() { return natRanks; } + + @Override + public double[] getNormalizedRanks() { return normRanks; } + + @Override + public long[] getNumDeltaItems() { return numDeltaItems; } + + @Override + public int getNumPartitions() { return numPartitions; } + + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + public T getMaxItem() { return maxItem; } + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + public T getMinItem() { return minItem; } + + @Override + public QuantileSearchCriteria getSearchCriteria() { return searchCrit; } + +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java index 452467bb7..e3d89a6e2 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java @@ -69,6 +69,24 @@ public interface GenericSortedView<T> extends SortedView { */ double[] getCDF(T[] splitPoints, QuantileSearchCriteria searchCrit); + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + T getMaxItem(); + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. 
+ * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + T getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], @@ -126,8 +144,8 @@ public interface GenericSortedView<T> extends SortedView { T getQuantile(double rank, QuantileSearchCriteria searchCrit); /** - * Returns the array of quantiles. - * @return the array of quantiles. + * Returns the full array of quantiles. + * @return the full array of quantiles. */ T[] getQuantiles(); @@ -169,4 +187,3 @@ static <T> void validateItems(final T[] items, final Comparator<? super T> compa } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java index 69b454a92..5a5c00e26 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java @@ -19,58 +19,28 @@ package org.apache.datasketches.quantilescommon; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - /** - * The quantiles SortedView Iterator for generic types. - * @see SortedViewIterator + * Iterator over quantile sketches of generic type. 
* @param <T> The generic quantile type - * @author Alexander Saydakov - * @author Lee Rhodes */ -public class GenericSortedViewIterator<T> implements SortedViewIterator { +public class GenericSortedViewIterator<T> extends SortedViewIterator { private final T[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; public GenericSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter - this.cumWeights = cumWeights; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 0 : cumWeights[index - 1]; + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter } + /** + * Gets the quantile at the current index. + * + * <p>Don't call this before calling next() for the first time + * or after getting false from next().</p> + * + * @return the quantile at the current index. 
+ */ public T getQuantile() { return quantiles[index]; } - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java new file mode 100644 index 000000000..e3c59d2c7 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +/** + * This defines a set of results computed from the getPartitionBoundaries() function and + * encapsulates the basic methods needed to construct actual partitions based on generic items. 
+ */ +public interface PartitionBoundaries { + + /** + * Gets the length of the input stream offered to the underlying sketch. + * @return the length of the input stream offered to the underlying sketch. + */ + long getN(); + + /** + * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Natural ranks are integral values on the interval [1, N] + * @return an array of natural ranks. + */ + long[] getNaturalRanks(); + + /** + * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0]. + * @return an array of normalized ranks. + */ + double[] getNormalizedRanks(); + + /** + * Gets the number of items to be included for each partition as an array. + * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at + * index 0 and index 1, is at index 1 in this array, etc. + * @return the number of items to be included for each partition as an array. + */ + long[] getNumDeltaItems(); + + /** + * Gets the number of partitions + * @return the number of partitions + */ + int getNumPartitions(); + + /** + * Gets the search criteria specified for the source sketch + * @return The search criteria specified for the source sketch + */ + QuantileSearchCriteria getSearchCriteria(); +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java new file mode 100644 index 000000000..3ff51a3b4 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +/** + * This enables the special functions for performing efficient partitioning of massive data. + */ +public interface PartitioningFeature<T> { + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + * <p>This method is equivalent to + * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallySized, INCLUSIVE)}. + * </p> + * + * @param numEquallySized an integer that specifies the number of equally sized partitions between + * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * This must be a positive integer greater than zero. + * <ul> + * <li>A 1 will return: minItem, maxItem.</li> + * <li>A 2 will return: minItem, median quantile, maxItem.</li> + * <li>Etc.</li> + * </ul> + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. 
+ * @throws IllegalArgumentException if sketch is empty. + * @throws IllegalArgumentException if <i>numEquallySized</i> is less than 1. + */ + default GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallySized) { + return getPartitionBoundaries(numEquallySized, INCLUSIVE); + } + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + * @param numEquallySized an integer that specifies the number of equally sized partitions between + * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * This must be a positive integer greater than zero. + * <ul> + * <li>A 1 will return: minItem, maxItem.</li> + * <li>A 2 will return: minItem, median quantile, maxItem.</li> + * <li>Etc.</li> + * </ul> + * + * @param searchCrit + * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally sized partitions + * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. + * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally sized partitions + * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. + * @throws IllegalArgumentException if sketch is empty. + * @throws IllegalArgumentException if <i>numEquallySized</i> is less than 1. 
+ */ + GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallySized, QuantileSearchCriteria searchCrit); + +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java index 74e5d8061..38502ecaa 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java @@ -219,8 +219,8 @@ public interface QuantilesAPI { int getK(); /** - * Gets the length of the input stream. - * @return the length of the input stream. + * Gets the length of the input stream offered to the sketch. + * @return the length of the input stream offered to the sketch. */ long getN(); diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java index a70b08372..31a5bedf9 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java @@ -92,56 +92,6 @@ default double[] getCDF(double[] splitPoints) { */ double getMinItem(); - /** - * This method returns an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * <p>This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - * </p> - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. 
- * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @return an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. - */ - default DoublesPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. 
- */ - DoublesPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(double[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of <i>m</i> unique, monotonically increasing items. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java index c6ea484cc..2fcbdd99f 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java @@ -91,56 +91,6 @@ default double[] getCDF(float[] splitPoints) { */ float getMinItem(); - /** - * This method returns an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * <p>This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - * </p> - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @return an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. 
- */ - default FloatsPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. - */ - FloatsPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(float[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of <i>m</i> unique, monotonically increasing items. 
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java index f8dd8e62d..fbd7f691f 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java @@ -92,58 +92,6 @@ default double[] getCDF(T[] splitPoints) { */ T getMinItem(); - /** - * This method returns an instance of - * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * <p>This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - * </p> - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. - */ - default GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of - * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. 
- * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - * <ul> - * <li>A 1 will return: minItem, maxItem.</li> - * <li>A 2 will return: minItem, median quantile, maxItem.</li> - * <li>Etc.</li> - * </ul> - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallyWeighted</i> is less than 1. - */ - GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(Object[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of <i>m</i> unique, monotonically increasing items. @@ -337,47 +285,5 @@ default double[] getRanks(T[] quantiles) { */ void update(T item); - /** - * This encapsulates the essential information needed to construct actual partitions and is returned from the - * <i>getPartitionBoundaries(int, QuantileSearchCritera)</i> method. - * @param <T> generic value T for the item type - */ - static class GenericPartitionBoundaries<T> { - - /** - * The total number of items presented to the sketch. 
- * - * <p>To compute the weight or density of a specific - * partition <i>i</i> where <i>i</i> varies from 1 to <i>m</i> partitions: - * <pre>{@code - * long N = getN(); - * double[] ranks = getRanks(); - * long weight = Math.round((ranks[i] - ranks[i - 1]) * N); - * }</pre> - */ - public long N; - - /** - * The normalized ranks that correspond to the returned boundaries. - * The returned array is of size <i>(m + 1)</i>, where <i>m</i> is the requested number of partitions. - * Index 0 of the returned array is always 0.0, and index <i>m</i> is always 1.0. - */ - public double[] ranks; - - /** - * The cumulative weights that correspond to the returned boundaries. - * The returned array is of size <i>(m + 1)</i>, where <i>m</i> is the requested number of partitions. - * Index 0 of the returned array is always 1, and index <i>m</i> is always <i>n</i>. - */ - public long[] weights; - - /** - * The partition boundaries as quantiles. - * The returned array is of size <i>(m + 1)</i>, where <i>m</i> is the requested number of partitions. - * Index 0 of the returned array is always {@link #getMinItem() getMinItem()}, and index <i>m</i> is always - * {@link #getMaxItem() getMaxItem()}. 
- */ - public T[] boundaries; - } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java index 848ee3105..a35aa27cd 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java @@ -21,6 +21,7 @@ import static java.lang.Math.log; import static java.lang.Math.pow; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.Objects; @@ -208,18 +209,17 @@ public static double[] evenlyLogSpaced(final double value1, final double value2, return arr; } - public static double maxPrecision; - - public static double getNaturalRank(final double normalizedRank, final long totalN) { - final double naturalRank = normalizedRank * totalN; - if (totalN <= 1_000_000L) { - final double precision = Util.ceilingPowerBaseOfDouble(10.0, totalN) ; - maxPrecision = precision; - final double trimmedNatRank = Math.round(naturalRank * precision) / precision; - return trimmedNatRank; - } else { - return naturalRank; + public static final double tailRoundingFactor = 1e7; + + public static double getNaturalRank( + final double normalizedRank, + final long totalN, + final QuantileSearchCriteria searchCrit) { + double naturalRank = (normalizedRank * totalN); + if (totalN <= tailRoundingFactor) { + naturalRank = Math.round(naturalRank * tailRoundingFactor) / tailRoundingFactor; } + return (searchCrit == INCLUSIVE) ? 
(long)Math.ceil(naturalRank) : (long)Math.floor(naturalRank); } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java index 434b548a9..92acfb2d4 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java @@ -20,19 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * This is the base interface for the Sorted View interface hierarchy. + * This is the base interface for the Sorted View interface hierarchy and defines the methods that are type independent. * - * <p>The Sorted View provides a view of the data retained by a quantiles-type sketch - * that would be cumbersome to get any other way. - * One can iterate over the contents of the sketch using the sketch's iterator, but the result is not sorted.</p> + * <p>The SortedView interface hierarchy provides a sorted view of the data retained by a quantiles-type sketch that + * would be cumbersome to get any other way. + * One could use the sketch's iterator to iterate over the contents of the sketch, + * but the result would not be sorted.</p> * - * <p>Once this sorted view has been created, it provides not only a sorted view of the data retained by the sketch - * but also the basic queries, such as getRank(), getQuantile(), and getCDF() and getPMF(). - * In addition, the iterator obtained from this sorted view provides useful detailed information about each entry.</p> - * - * <p>The data from a Sorted view is an unbiased sample of the input stream that can be used for other kinds of - * analysis not directly provided by the sketch. 
For example, comparing two sketches using the Kolmogorov-Smirnov - * test.</p> + * <p>The data from a Sorted view is an unbiased random sample of the input stream that can be used for other kinds of + * analysis not directly provided by the sketch.</p> * * @author Alexander Saydakov * @author Lee Rhodes @@ -40,11 +36,25 @@ public interface SortedView { /** - * Returns the array of cumulative weights - * @return the array of cumulative weights + * Returns the array of cumulative weights from the sketch. + * Also known as the natural ranks, which are the Natural Numbers on the interval [1, N]. + * @return the array of cumulative weights (or natural ranks). */ long[] getCumulativeWeights(); + /** + * Returns the array of normalized ranks. The normalized ranks are the natural ranks divided by N. + * The normalized ranks are fractional numbers on the interval (0,1.0]. + * @return the array of normalized ranks. + */ + double[] getNormalizedRanks(); + + /** + * Returns the total number of items presented to the sourcing sketch. + * @return the total number of items presented to the sourcing sketch. + */ + long getN(); + /** * Returns true if this sorted view is empty. * @return true if this sorted view is empty. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java index b36a2594e..06c298d4e 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java @@ -19,6 +19,8 @@ package org.apache.datasketches.quantilescommon; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + /** * This is the base interface for the SortedViewIterator hierarchy used with a SortedView obtained * from a quantile-type sketch. 
This provides an ordered iterator over the retained quantiles of @@ -35,30 +37,47 @@ * @author Alexander Saydakov * @author Lee Rhodes */ -public interface SortedViewIterator { +public class SortedViewIterator { + protected final long[] cumWeights; + protected long totalN; + protected int index; + + SortedViewIterator(final long[] cumWeights) { + this.cumWeights = cumWeights; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; + index = -1; + } /** - * Gets the cumulative weight at the current index (or previous index) based on the chosen search criterion. - * This is also referred to as the "Natural Rank". + * Gets the natural rank at the current index (or previous index) based on the chosen search criterion. + * This is also referred to as the "cumulative weight". The natural rank is a number in the range <i>[1, N]</i>, + * where <i>N</i> ({@link #getN()}) is the total number of items fed to the sketch. * * <p>Don't call this before calling next() for the first time * or after getting false from next().</p> * - * @param searchCrit if INCLUSIVE, includes the weight at the current index in the cumulative sum. - * Otherwise, it will return the cumulative weight of the previous index. - * @return cumulative weight at the current index on the chosen search criterion. + * @param searchCrit if INCLUSIVE, includes the weight of the item at the current index in the computation of + * the natural rank. + * Otherwise, it will return the natural rank of the previous index. + * @return the natural rank at the current index (or previous index) based on the chosen search criterion. */ - long getCumulativeWeight(QuantileSearchCriteria searchCrit); + public long getNaturalRank(final QuantileSearchCriteria searchCrit) { + if (searchCrit == INCLUSIVE) { return cumWeights[index]; } + return (index == 0) ? 
0 : cumWeights[index - 1]; + } /** * Gets the total count of all items presented to the sketch. * @return the total count of all items presented to the sketch. */ - long getN(); + public long getN() { + return totalN; + } /** * Gets the normalized rank at the current index (or previous index) - * based on the chosen search criterion. + * based on the chosen search criterion. Where <i>normalized rank = natural rank / N</i> ({@link #getN()}) + * and is a fraction in the range (0,1.0]. * * <p>Don't call this before calling next() for the first time * or after getting false from next().</p> @@ -68,24 +87,32 @@ public interface SortedViewIterator { * @return the normalized rank at the current index (or previous index) * based on the chosen search criterion. */ - double getNormalizedRank(QuantileSearchCriteria searchCrit); + public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { + return (double) getNaturalRank(searchCrit) / totalN; + } /** - * Gets the natural weight at the current index. + * Gets the weight contribution of the item at the current index. * * <p>Don't call this before calling next() for the first time * or after getting false from next().</p> * - * @return the natural weight at the current index. + * @return the weight contribution of the item at the current index. */ - long getWeight(); + public long getWeight() { + if (index == 0) { return cumWeights[0]; } + return cumWeights[index] - cumWeights[index - 1]; + } /** * Advances the index and checks if it is valid. * The state of this iterator is undefined before the first call of this method. * @return true if the next index is valid. 
*/ - boolean next(); + public boolean next() { + index++; + return index < cumWeights.length; + } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/Stack.java b/src/main/java/org/apache/datasketches/quantilescommon/Stack.java new file mode 100644 index 000000000..68d6378b5 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/Stack.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import java.util.ArrayList; + +import org.apache.datasketches.common.SketchesStateException; + +/** + * A classic LIFO stack based on ArrayList (as opposed to Vector). + * All of the methods of ArrayList are available. + */ +public class Stack<E> extends ArrayList<E> { + private static final long serialVersionUID = 1L; + + /** + * Creates an empty stack. + */ + public Stack() { } + + /** + * Pushes an item onto the stack + * @param item the given item + * @return the given element + */ + public E push(final E item) { + add(item); + return item; + } + + /** + * Removes the item at the top of the stack. + * @return the item at the top of the stack. 
+ */ + public E pop() { + final E item = peek(); + remove(size() - 1); + return item; + } + + /** + * Allows examination of the top item without removing it. + * @return the top item without removing it + */ + public E peek() { + final int len = size(); + if (len == 0) { throw new SketchesStateException("Stack is empty"); } + return get(len - 1); + } + +} diff --git a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java index 7c11ee2ab..e587cd633 100644 --- a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java +++ b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java @@ -19,11 +19,8 @@ package org.apache.datasketches.req; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; - import org.apache.datasketches.quantilescommon.FloatsSortedView; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; -import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesFloatsAPI; import org.apache.datasketches.quantilescommon.QuantilesFloatsSketchIterator; @@ -62,21 +59,6 @@ abstract class BaseReqSketch implements QuantilesFloatsAPI { @Override public abstract float getMinItem(); - @Override - public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final float[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final FloatsPartitionBoundaries fpb = new FloatsPartitionBoundaries(); - fpb.N = this.getN(); - fpb.ranks = ranks; - fpb.boundaries = boundaries; - return fpb; - } - /** * Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). 
* Derived from Lemma 12 in https://arxiv.org/abs/2004.01668v2, but the constant factors were diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java index f06461650..dbf14be6d 100644 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java @@ -20,11 +20,14 @@ package org.apache.datasketches.req; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.List; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesAPI; @@ -39,6 +42,9 @@ public final class ReqSketchSortedView implements FloatsSortedView { private float[] quantiles; private long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final float maxItem; + private final float minItem; /** * Construct from elements for testing. @@ -46,60 +52,76 @@ public final class ReqSketchSortedView implements FloatsSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. 
*/ - ReqSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN) { + ReqSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN, + final float maxItem, final float minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given ReqSketch + * @param sketch the given ReqSketch */ - public ReqSketchSortedView(final ReqSketch sk) { - totalN = sk.getN(); - buildSortedViewArrays(sk); + public ReqSketchSortedView(final ReqSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + buildSortedViewArrays(sketch); + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } + //end of constructors + @Override public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public float getMaxItem() { + return maxItem; + } + + @Override + public float getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks; + } + @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, 
totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; ///EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; ///EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - float getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - - @Override public float[] getQuantiles() { return quantiles.clone(); @@ -123,8 +145,8 @@ public boolean isEmpty() { } @Override - public ReqSketchSortedViewIterator iterator() { - return new ReqSketchSortedViewIterator(quantiles, cumWeights); + public FloatsSortedViewIterator iterator() { + return new FloatsSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java deleted file mode 100644 index 6dbc63222..000000000 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.req; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over ReqSketchSortedView. - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class ReqSketchSortedViewIterator implements FloatsSortedViewIterator { - - private final float[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - ReqSketchSortedViewIterator(final float[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public float getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/test/java/org/apache/datasketches/common/UtilTest.java b/src/test/java/org/apache/datasketches/common/UtilTest.java index a68671685..50112a315 100644 --- a/src/test/java/org/apache/datasketches/common/UtilTest.java +++ b/src/test/java/org/apache/datasketches/common/UtilTest.java @@ -263,9 +263,14 @@ public void checkZeroPad() { @Test public void checkCharacterPad() { - final String s = "Pad 30, postpend z:"; - final String out = characterPad(s, 30, 'z', true); + String s = "Pad 30, postpend z:"; + String out = characterPad(s, 30, 'z', true); println(out); + assertEquals(out, "Pad 30, postpend z:zzzzzzzzzzz"); + s = "Pad 30, prepend z:"; + out = characterPad(s, 30, 'z', false); + println(out); + assertEquals(out,"zzzzzzzzzzzzPad 30, prepend z:"); } @Test diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java index bc7651b14..ccfb52533 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java @@ -96,8 +96,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "1"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + 
assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -105,8 +105,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "2"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java index e4e349205..a8ca4145e 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java @@ -21,7 +21,6 @@ import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -423,21 +422,6 @@ public void nanSplitPoint() { sketch.getCDF(new double[] {Double.NaN}); } - @Test - public void getQuantiles() { - final KllDoublesSketch sketch = getUpdatableDirectDoublesSketch(200, 0); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - double[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - double[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, 
INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkSimpleMergeDirect() { //used for troubleshooting int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java index 6f9ea0ba5..3013e6295 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java @@ -21,7 +21,6 @@ import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -423,21 +422,6 @@ public void nanSplitPoint() { sketch.getCDF(new float[] {Float.NaN}); } - @Test - public void getQuantiles() { - final KllFloatsSketch sketch = getUpdatableDirectFloatSketch(200, 0); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkSimpleMergeDirect() { //used for troubleshooting int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java index d428cd259..7a12d8466 100644 --- 
a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java @@ -77,8 +77,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 1.0); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -86,8 +86,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 2.0); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java index ba63e8bef..8aeabb8bf 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java @@ -391,21 +391,6 @@ public void nanSplitPoint() { sketch.getCDF(new double[] {Double.NaN}); } - @Test - public void getQuantiles() { - final KllDoublesSketch sketch = KllDoublesSketch.newHeapInstance(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - double[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - double[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = 
sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkReset() { KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(20); @@ -456,18 +441,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 1); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 2); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 3); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java index e511de562..88003b836 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java @@ -77,8 +77,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 1.0f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); 
assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -86,8 +86,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 2.0f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java index 161ee4318..846965cb8 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java @@ -391,21 +391,6 @@ public void nanSplitPoint() { sketch.getCDF(new float[] {Float.NaN}); } - @Test - public void getQuantiles() { - final KllFloatsSketch sketch = KllFloatsSketch.newHeapInstance(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkReset() { KllFloatsSketch sk = KllFloatsSketch.newHeapInstance(20); @@ -456,18 +441,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 1); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), 
true); assertEquals(itr.getQuantile(), 2); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 3); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java index 5eb513aa8..b0024420c 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java @@ -30,8 +30,9 @@ public KllItemsSketchSortedViewString( final String[] quantiles, final long[] cumWeights, final long totalN, - final String minItem, - final Comparator<String> comparator) { - super(quantiles, cumWeights, totalN, minItem, comparator); + final Comparator<String> comparator, + final String maxItem, + final String minItem) { + super(quantiles, cumWeights, totalN, comparator, maxItem, minItem); } } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java index a980841b6..deb3cb9c8 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java @@ -461,10 +461,10 @@ public void getQuantiles() { sketch.update("C"); sketch.update("D"); String[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - String[] quantiles2 = sketch.getPartitionBoundaries(2, 
EXCLUSIVE).boundaries; + String[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; + quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } @@ -528,18 +528,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "A"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "AB"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "ABC"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java index 0607ff5d7..f97eb2320 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java @@ -82,8 +82,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "1"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - 
assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -91,8 +91,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "2"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java index 45feb7637..28095dda0 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java @@ -58,19 +58,6 @@ public void checkBounds() { assertTrue(rest - restLB < (2 * eps)); } - @Test - public void checkMisc() { - final int k = 8; - final KllDoublesSketch sk = getDirectDoublesSketch(k, 0); - try { sk.getPartitionBoundaries(10); fail(); } catch (SketchesArgumentException e) {} - for (int i = 0; i < 20; i++) { sk.update(i); } - final double[] items = sk.getDoubleItemsArray(); - assertEquals(items.length, 16); - final int[] levels = sk.getLevelsArray(sk.sketchStructure); - assertEquals(levels.length, 3); - assertEquals(sk.getNumLevels(), 2); - } - //@Test //enable static println(..) 
for visual checking public void visualCheckToString() { final int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java index 6f042ce06..5f88baed4 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java @@ -58,19 +58,6 @@ public void checkBounds() { assertTrue(rest - restLB < (2 * eps)); } - @Test - public void checkMisc() { - final int k = 8; - final KllFloatsSketch sk = getDirectFloatsSketch(k, 0); - try { sk.getPartitionBoundaries(10); fail(); } catch (SketchesArgumentException e) {} - for (int i = 0; i < 20; i++) { sk.update(i); } - final float[] items = sk.getFloatItemsArray(); - assertEquals(items.length, 16); - final int[] levels = sk.getLevelsArray(sk.sketchStructure); - assertEquals(levels.length, 3); - assertEquals(sk.getNumLevels(), 2); - } - //@Test //enable static println(..) 
for visual checking public void visualCheckToString() { final int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java index 35d73fce3..0524db725 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java @@ -30,7 +30,7 @@ import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.kll.KllItemsSketchSortedView.KllItemsSketchSortedViewIterator; +import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; import org.testng.annotations.Test; @@ -201,7 +201,7 @@ public void viewCompactionAndSortedView() { for (int i = 1; i <= n; i++) { sk.update(Util.intToFixedLengthString(i, digits)); } println(sk.toString(true, true)); KllItemsSketchSortedView<String> sv = sk.getSortedView(); - KllItemsSketchSortedViewIterator<String> itr = sv.iterator(); + GenericSortedViewIterator<String> itr = sv.iterator(); println("### SORTED VIEW"); printf("%12s%12s\n", "Value", "CumWeight"); while (itr.next()) { diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java new file mode 100644 index 000000000..f26031465 --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.common.Util.milliSecToString; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import java.util.List; + +import org.apache.datasketches.partitions.Partitioner; +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.quantiles.ItemsSketch; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class ClassicPartitionsTest { + private final int k = 1 << 15; + private final long totalN = 100_000_000L; + private final long tgtPartitionSize = (long)3e6; + private final int maxPartsPerSk = 100; + + //@Test //visual only: prints a partition report and asserts nothing + public void checkClassicPartitioner() { + println("Classic ItemsSketch Partitions Test"); + printf("Sketch K :%,20d\n", k); + printf("Total N :%,20d\n", totalN); + printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); + printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + + final long startTime_mS = System.currentTimeMillis(); + final ItemsSketchFillRequestLongAsString fillReq = new ItemsSketchFillRequestLongAsString(k, totalN); + final ItemsSketch<String> sk = fillReq.getRange(1L, totalN, INCLUDE_BOTH); + final long endFillInitialSketchTime_mS = System.currentTimeMillis(); + final Partitioner<String, 
ItemsSketch<String>> partitioner = new Partitioner<>( + tgtPartitionSize, + maxPartsPerSk, + fillReq, + INCLUSIVE); + final List<PartitionBoundsRow<String>> list = partitioner.partition(sk); + outputList(list); + + final long endTime_mS = System.currentTimeMillis(); + final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; + final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; + final long totalTime_mS = endTime_mS - startTime_mS; + println(""); + println("FillInitialSketchTime: " + milliSecToString(fillInitialSketchTime_mS)); + println("PartitioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); + } + + private static final String[] hdr = + { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; + private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; + private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; + + void outputList(final List<PartitionBoundsRow<String>> list) { //prints one row per partition followed by the size-error summary + printf(hdrFmt, (Object[]) hdr); + final int numParts = list.size(); + final double meanPartSize = (double)totalN / numParts; + double size = 0; + double sumSizes = 0; + double sumAbsRelErr = 0; + double sumSqErr = 0; + for (int i = 0; i < numParts; i++) { + final PartitionBoundsRow<String> row = list.get(i); + printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + size = row.approxNumDeltaItems; + sumSizes += size; + sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); + final double absErr = size - meanPartSize; + sumSqErr += absErr * absErr; + } + final double meanAbsRelErr = sumAbsRelErr / numParts; + final double meanSqErr = sumSqErr / numParts; //intermediate value + final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value + final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. 
Normalized RMS Error or NRMSE + + printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); + printf("Mean Partition Size :%,20.1f\n",meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java new file mode 100644 index 000000000..2b966051f --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_UPPER; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.digits; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; + +import java.util.Comparator; +import java.util.Random; + +import org.apache.datasketches.quantiles.ItemsSketch; + +/** + * This is a simulated data set with a given N used for testing. Each requested range of longs is loaded into an ItemsSketch as Strings produced by getString(i, numDigits). + * @author Lee Rhodes + */ +public class ItemsSketchFillRequestLongAsString implements SketchFillRequest<String, ItemsSketch<String>> { + private int k; + private int numDigits; + private Random rand = new Random(); + + public ItemsSketchFillRequestLongAsString() { + k = 1 << 10; + numDigits = 3; + } + + public ItemsSketchFillRequestLongAsString(final int k, final long totalN) { + this.k = k; + this.numDigits = digits(totalN); + } + + @Override + public ItemsSketch<String> getRange(final String lowerQuantile, final String upperQuantile, + final BoundsRule bounds) { + final ItemsSketch<String> sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + final long lower = Long.parseLong(lowerQuantile.trim()); + final long upper = Long.parseLong(upperQuantile.trim()); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public ItemsSketch<String> getRange(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { + final ItemsSketch<String> sk = 
ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + final long lower = lowerQuantile; + final long upper = upperQuantile; + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public ItemsSketch<String> getRangeRandom(final long lowerQuantile, final long upperQuantile, + final BoundsRule bounds) { //same number of updates as getRange, but every value is drawn by randBetween + final ItemsSketch<String> sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + final long lower = lowerQuantile; + final long upper = upperQuantile; + this.rand = new Random(); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } + return sk; + } + + private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { + final double r = rand.nextDouble(); + final long range; + final long offset; + if (bounds == INCLUDE_BOTH) { + range = ub - lb; + offset = lb; + } + else if (bounds == INCLUDE_UPPER) { + range = ub - lb - 1; + offset = lb + 1; + } else { //INCLUDE_LOWER + range = ub - lb - 1; + offset = lb; + } + return offset + (long) (r * (range + 1)); //floor over range + 1 slots: rounding gave the two end points only half the weight of interior values + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java new file 
mode 100644 index 000000000..53d80190f --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_UPPER; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.digits; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; + +import java.util.Comparator; +import java.util.Random; + +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; + +/** + * This is a simulated data set with a given N used for testing. Each requested range of longs is loaded into a KllItemsSketch as Strings produced by getString(i, numDigits). 
+ * @author Lee Rhodes + */ +public class KllItemsSketchFillRequestLongAsString implements SketchFillRequest<String, KllItemsSketch<String>> { + private int k; + private int numDigits; + private Random rand = new Random(); + + public KllItemsSketchFillRequestLongAsString() { + k = 1 << 10; + numDigits = 3; + } + + public KllItemsSketchFillRequestLongAsString(final int k, final long totalN) { + this.k = k; + this.numDigits = digits(totalN); + } + + @Override + public KllItemsSketch<String> getRange(final String lowerQuantile, final String upperQuantile, + final BoundsRule bounds) { + KllItemsSketch<String> sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); + long lower = Long.parseLong(lowerQuantile.trim()); + long upper = Long.parseLong(upperQuantile.trim()); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public KllItemsSketch<String> getRange(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { + KllItemsSketch<String> sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); + long lower = lowerQuantile; + long upper = upperQuantile; + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public KllItemsSketch<String> getRangeRandom(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { //same number of updates as getRange, but every value is drawn by randBetween + KllItemsSketch<String> sk = 
KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); + long lower = lowerQuantile; + long upper = upperQuantile; + this.rand = new Random(); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } + return sk; + } + + private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { + double r = rand.nextDouble(); + long range; + long offset; + if (bounds == INCLUDE_BOTH) { + range = ub - lb; + offset = lb; + } + else if (bounds == INCLUDE_UPPER) { + range = ub - lb - 1; + offset = lb + 1; + } else { //INCLUDE_LOWER + range = ub - lb - 1; + offset = lb; + } + return offset + (long) (r * (range + 1)); //floor over range + 1 slots: rounding gave the two end points only half the weight of interior values + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java new file mode 100644 index 000000000..3b44d9988 --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.common.Util.milliSecToString; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import java.util.List; + +import org.apache.datasketches.partitions.Partitioner; +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.kll.KllItemsSketch; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class KllPartitionsTest { + private final int k = 1 << 15; + private final long totalN = 100_000_000L; + private final long tgtPartitionSize = (long)3e6; + private final int maxPartsPerSk = 100; + + //@Test //visual only: prints a partition report and asserts nothing + public void checkKllPartitioner() { + println("KllItemsSketch Partitions Test"); + printf("Sketch K :%,20d\n", k); + printf("Total N :%,20d\n", totalN); + printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); + printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + + final long startTime_mS = System.currentTimeMillis(); + final KllItemsSketchFillRequestLongAsString fillReq = new KllItemsSketchFillRequestLongAsString(k, totalN); + final KllItemsSketch<String> sk = fillReq.getRange(1L, totalN, INCLUDE_BOTH); + final long endFillInitialSketchTime_mS = System.currentTimeMillis(); + final Partitioner<String, KllItemsSketch<String>> partitioner = new Partitioner<>( + tgtPartitionSize, + maxPartsPerSk, + fillReq, + INCLUSIVE); + final List<PartitionBoundsRow<String>> list = 
partitioner.partition(sk); + outputList(list); + + final long endTime_mS = System.currentTimeMillis(); + final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; + final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; + final long totalTime_mS = endTime_mS - startTime_mS; + println(""); + println("FillInitialSketchTime: " + milliSecToString(fillInitialSketchTime_mS)); + println("PartitioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); + } + + private static final String[] hdr = + { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; + private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; + private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; + + void outputList(final List<PartitionBoundsRow<String>> list) { //prints one row per partition followed by the size-error summary + printf(hdrFmt, (Object[]) hdr); + final int numParts = list.size(); + final double meanPartSize = (double)totalN / numParts; + double size = 0; + double sumSizes = 0; + double sumAbsRelErr = 0; + double sumSqErr = 0; + for (int i = 0; i < numParts; i++) { + final PartitionBoundsRow<String> row = list.get(i); + printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + size = row.approxNumDeltaItems; + sumSizes += size; + sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); + final double absErr = size - meanPartSize; + sumSqErr += absErr * absErr; + } + final double meanAbsRelErr = sumAbsRelErr / numParts; + final double meanSqErr = sumSqErr / numParts; //intermediate value + final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value + final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. 
Normalized RMS Error or NRMSE + + printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); + printf("Mean Partition Size :%,20.1f\n",meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + +} diff --git a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java index 216b91f72..d3193883b 100644 --- a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java @@ -91,7 +91,7 @@ public void checkQuantilesV400() { double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, EXCLUSIVE); assertEquals(qEst, qTrue); double rawNatRank = normRankIn * N; - double trimNatRank = getNaturalRank(normRankIn, N); + double trimNatRank = getNaturalRank(normRankIn, N, EXCLUSIVE); printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } @@ -120,7 +120,7 @@ public void checkQuantilesV400() { double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, INCLUSIVE); assertEquals(qEst, qTrue); double rawNatRank = normRankIn * N; - double trimNatRank = getNaturalRank(normRankIn, N); + double trimNatRank = 
getNaturalRank(normRankIn, N, INCLUSIVE); printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java index fdd7918d1..d4f549ebe 100644 --- a/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java @@ -134,7 +134,6 @@ public void checkEmptyExceptions() { try { uds.getMaxItem(); fail(); } catch (IllegalArgumentException e) {} try { uds.getMinItem(); fail(); } catch (IllegalArgumentException e) {} try { uds.getRank(1.0); fail(); } catch (IllegalArgumentException e) {} - try { uds.getPartitionBoundaries(5); fail(); } catch (IllegalArgumentException e) {} try { uds.getPMF(new double[] { 0, 0.5, 1.0 }); fail(); } catch (IllegalArgumentException e) {} try { uds.getCDF(new double[] { 0, 0.5, 1.0 }); fail(); } catch (IllegalArgumentException e) {} } @@ -199,15 +198,15 @@ public void sortedView() { Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 1); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 1); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 1); Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 2); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 2); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 2); Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 3); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 3); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 3); Assert.assertEquals(it.next(), false); } } diff --git 
a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java index b5fd7b2d3..eba9f6b55 100644 --- a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java @@ -782,31 +782,6 @@ public void testIt() { assertTrue(qsk2.isEmpty()); } - @Test - public void checkEvenlySpacedQuantiles() { - DoublesSketch qsk = buildAndLoadQS(32, 1001); - double[] values = qsk.getPartitionBoundaries(10).boundaries; - for (int i = 0; i<values.length; i++) { - println(""+values[i]); - } - assertEquals(values.length, 11); - } - - @Test - public void getQuantiles() { - final DoublesSketch sketch = buildAndLoadQS(32,0); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - double[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - double[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkEquallySpacedRanks() { int n = 10; @@ -954,11 +929,10 @@ public void tenItems() { assertEquals(sketch.getN(), 10); assertEquals(sketch.getNumRetained(), 10); for (int i = 1; i <= 10; i++) { - assertEquals(sketch.getRank(i, EXCLUSIVE), (i - 1) / 10.0); assertEquals(sketch.getRank(i, EXCLUSIVE), (i - 1) / 10.0); assertEquals(sketch.getRank(i, INCLUSIVE), i / 10.0); } - // inclusive = false (default) + // inclusive = false assertEquals(sketch.getQuantile(0, EXCLUSIVE), 1); assertEquals(sketch.getQuantile(0.1, EXCLUSIVE), 2); assertEquals(sketch.getQuantile(0.2, EXCLUSIVE), 3); diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchSortedViewString.java 
b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchSortedViewString.java index 6a4934f97..0dbf5bde2 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchSortedViewString.java +++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchSortedViewString.java @@ -30,7 +30,9 @@ public ItemsSketchSortedViewString( final String[] quantiles, final long[] cumWeights, final long totalN, - final Comparator<String> comparator) { - super(quantiles, cumWeights, totalN, comparator); + final Comparator<String> comparator, + final String maxItem, + final String minItem) { + super(quantiles, cumWeights, totalN, comparator, maxItem, minItem); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java index f123b01bd..0d8527bbf 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java @@ -599,15 +599,15 @@ public void sortedView() { assertEquals(it.next(), true); assertEquals(it.getQuantile(), 1); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(it.getNaturalRank(INCLUSIVE), 1); assertEquals(it.next(), true); assertEquals(it.getQuantile(), 2); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(it.getNaturalRank(INCLUSIVE), 2); assertEquals(it.next(), true); assertEquals(it.getQuantile(), 3); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(it.getNaturalRank(INCLUSIVE), 3); assertEquals(it.next(), false); } } @@ -617,7 +617,7 @@ public void sortedView2() { Double[] qArr = {8.0, 10.0, 10.0, 20.0}; long[] cwArr = {1, 3, 4, 5}; Comparator<Double> comp = Comparator.naturalOrder(); - ItemsSketchSortedView<Double> sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp); + ItemsSketchSortedView<Double> sv = new 
ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0); double[] ranks = {0, .1, .2, .3, .6, .7, .8, .9, 1.0}; Double[] qOut = new Double[9]; for (int i = 0; i < ranks.length; i++) { @@ -640,10 +640,10 @@ public void getQuantiles() { sketch.update(3); sketch.update(4); Integer[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; + Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; + quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } diff --git a/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java b/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java new file mode 100644 index 000000000..d27911cab --- /dev/null +++ b/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.quantiles; + +import java.util.Comparator; + +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.*; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.testng.annotations.Test; + +/** + * Prints the sorted view and the partition boundaries of an ItemsSketch that was fed a run of consecutive values plus many copies of a single value; it asserts nothing. + */ +@SuppressWarnings("unused") +public class SkewedDataTest { + static String[] hdr = {"N", "MaxItem", "MinItem", "NumParts", "SearchCriteria"}; + static String hdrfmt = "%6s %10s %10s %10s %15s\n"; + static String hdrdfmt = "%6d %10s %10s %10d %15s\n"; + + static String[] rowhdr = {"Row", "NormRanks", "NatRanks", "Boundaries", "DeltaItems"}; + static String rowhdrfmt = "%5s %12s %12s %12s %12s\n"; + static String rowdfmt = "%5d %12.8f %12d %12s %12d\n"; + + static String[] rowhdr2 = {"Row", "NormRanks", "NatRanks", "Boundaries"}; + static String rowhdrfmt2= "%5s %12s %12s %12s\n"; + static String rowdfmt2 = "%5d %12.8f %12d %12s\n"; + + //@Test //visual only + public void checkWithSkew() { + int n = 2050; + int k = 1 << 15; + int n2 = 200; + int totalN = n + n2; + int numDigits = digits(totalN); + long v2 = 1000L; + int numParts = 22; + QuantileSearchCriteria searchCrit = QuantileSearchCriteria.INCLUSIVE; + ItemsSketch<String> sk = ItemsSketch.getInstance(String.class,k, Comparator.naturalOrder()); + + for (long i = 1; i <= n; i++) { sk.update(getString(i, numDigits)); } + for (long i = 1; i <= n2; i++) { sk.update(getString(v2, numDigits)); } + ItemsSketchSortedView<String> sv = sk.getSortedView(); + GenericSortedViewIterator<String> itr = sv.iterator(); + println("SORTED VIEW:"); + printf(rowhdrfmt2, (Object[])rowhdr2); + int j = 0; + while (itr.next()) { + printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), 
itr.getNaturalRank(searchCrit), itr.getQuantile()); + } + + GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundaries(numParts, searchCrit); + int arrLen = gpb.getBoundaries().length; + double[] normRanks = gpb.getNormalizedRanks(); + long[] natRanks = gpb.getNaturalRanks(); + String[] boundaries = gpb.getBoundaries(); + long[] numDeltaItems = gpb.getNumDeltaItems(); + println(""); + println("GET PARTITION BOUNDARIES:"); + printf(hdrfmt, (Object[]) hdr); + printf(hdrdfmt, totalN, gpb.getMaxItem(), gpb.getMinItem(), numParts, searchCrit.toString()); + println(""); + printf(rowhdrfmt, (Object[]) rowhdr); + for (int i = 0; i < arrLen; i++) { + printf(rowdfmt, i, normRanks[i], natRanks[i], boundaries[i], numDeltaItems[i]); + } + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + + +} diff --git a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java index 5f4c4c753..df151c8ce 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java @@ -77,7 +77,6 @@ */ public class CrossCheckQuantilesTest { private ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe(); - private final String minItem = "10"; private final Comparator<String> comparator = Comparator.naturalOrder(); private final static int k 
= 32; //all sketches are in exact mode @@ -121,6 +120,14 @@ public class CrossCheckQuantilesTest { {2,1,2,1,2,1,2,1} }; + final float[] svMaxFValues = { 10, 10, 40, 50, 40 }; + final float[] svMinFValues = { 10, 10, 10, 10, 10 }; + final double[] svMaxDValues = { 10, 10, 40, 50, 40 }; + final double[] svMinDValues = { 10, 10, 10, 10, 10 }; + final String[] svMaxIValues = { "10", "10", "40", "50", "40" }; + final String[] svMinIValues = { "10", "10", "10", "10", "10" }; + + int numSets; long[][] svCumWeights; @@ -329,32 +336,44 @@ private void buildSketches(int set) { /*******BUILD & LOAD SVs***********/ private void buildSVs(int set) throws Exception { - reqFloatsSV = getRawReqSV(svFValues[set], svCumWeights[set], totalN[set]); - kllFloatsSV = getRawKllFloatsSV(svFValues[set], svCumWeights[set], totalN[set]); - kllDoublesSV = getRawKllDoublesSV(svDValues[set], svCumWeights[set], totalN[set]); - classicDoublesSV = getRawClassicDoublesSV(svDValues[set], svCumWeights[set], totalN[set]); - kllItemsSV = new KllItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], minItem, comparator); - itemsSV = new ItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], comparator); + reqFloatsSV = getRawReqSV(svFValues[set], svCumWeights[set], totalN[set], + svMaxFValues[set], svMinFValues[set]); + kllFloatsSV = getRawKllFloatsSV(svFValues[set], svCumWeights[set], totalN[set], + svMaxFValues[set], svMinFValues[set]); + kllDoublesSV = getRawKllDoublesSV(svDValues[set], svCumWeights[set], totalN[set], + svMaxDValues[set], svMinDValues[set]); + classicDoublesSV = getRawClassicDoublesSV(svDValues[set], svCumWeights[set], totalN[set], + svMaxDValues[set], svMinDValues[set]); + String svImax = svIValues[set][svIValues[set].length - 1]; + String svImin = svIValues[set][0]; + kllItemsSV = new KllItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], + comparator, svImax, svImin); + itemsSV = new 
ItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], + comparator, svImax, svImin); } private final static ReqSketchSortedView getRawReqSV( - final float[] values, final long[] cumWeights, final long totalN) throws Exception { - return (ReqSketchSortedView) REQ_SV_CTOR.newInstance(values, cumWeights, totalN); + final float[] values, final long[] cumWeights, final long totalN, final float maxItem, final float minItem) + throws Exception { + return (ReqSketchSortedView) REQ_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static KllFloatsSketchSortedView getRawKllFloatsSV( - final float[] values, final long[] cumWeights, final long totalN) throws Exception { - return (KllFloatsSketchSortedView) KLL_FLOATS_SV_CTOR.newInstance(values, cumWeights, totalN); + final float[] values, final long[] cumWeights, final long totalN, final float maxItem, final float minItem) + throws Exception { + return (KllFloatsSketchSortedView) KLL_FLOATS_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static KllDoublesSketchSortedView getRawKllDoublesSV( - final double[] values, final long[] cumWeights, final long totalN) throws Exception { - return (KllDoublesSketchSortedView) KLL_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN); + final double[] values, final long[] cumWeights, final long totalN, final double maxItem, final double minItem) + throws Exception { + return (KllDoublesSketchSortedView) KLL_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static DoublesSketchSortedView getRawClassicDoublesSV( - final double[] values, final long[] cumWeights, final long totalN) throws Exception { - return (DoublesSketchSortedView) CLASSIC_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN); + final double[] values, final long[] cumWeights, final long totalN, final double maxItem, final double minItem) + throws Exception { + return 
(DoublesSketchSortedView) CLASSIC_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } /********BUILD DATA SETS**********/ diff --git a/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java new file mode 100644 index 000000000..d8eb60d56 --- /dev/null +++ b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import static java.lang.Math.ceil; +import static java.lang.Math.log; +import static org.apache.datasketches.common.Util.characterPad; + +/** + * Creates a string from a positive long value that is orderable in the + * same order as its long value. + */ +public final class LongsAsOrderableStrings { + + /** + * Converts the given long into a string with leading spaces based on the given numDigits. + * This allows the strings to be ordered as if they were longs. + * @param value the value to convert + * @param numDigits the maximum required number of total spaces for digits. 
+ * @return the given long as a string padded with leading spaces + */ + public static String getString(final long value, final int numDigits) { + return characterPad(Long.toString(value), numDigits, ' ', false); + } + + /** + * Converts the given String back to a long by trimming any leading or trailing spaces. + * @param value the given string to convert + * @return the long value parsed from the given String + */ + public static long getLong(final String value) { + return Long.parseLong(value.trim()); + } + + /** + * Computes the number of digits required to display the given positive long value. + * This does not include commas or other digit separators. + * This works with longs less than 1E15. + * @param maxValue the maximum anticipated long value. + * @return the number of required display digits + */ + public static int digits(final long maxValue) { + if (maxValue <= 0) { return 1; } + return (int) ceil(log(maxValue + 1) / log(10.0)); + } + +} diff --git a/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java b/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java index b756c5da1..191629fbe 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java @@ -50,10 +50,14 @@ private ReflectUtilityTest() {} KLL_DOUBLES_SV = getClass("org.apache.datasketches.kll.KllDoublesSketchSortedView"); CLASSIC_DOUBLES_SV = getClass("org.apache.datasketches.quantiles.DoublesSketchSortedView"); - REQ_SV_CTOR = getConstructor(REQ_SV, float[].class, long[].class, long.class); - KLL_FLOATS_SV_CTOR = getConstructor(KLL_FLOATS_SV, float[].class, long[].class, long.class); - KLL_DOUBLES_SV_CTOR = getConstructor(KLL_DOUBLES_SV, double[].class, long[].class, long.class); - CLASSIC_DOUBLES_SV_CTOR = getConstructor(CLASSIC_DOUBLES_SV, double[].class, long[].class, long.class); + REQ_SV_CTOR = + getConstructor(REQ_SV, float[].class, 
long[].class, long.class, float.class, float.class); + KLL_FLOATS_SV_CTOR = + getConstructor(KLL_FLOATS_SV, float[].class, long[].class, long.class, float.class, float.class); + KLL_DOUBLES_SV_CTOR = + getConstructor(KLL_DOUBLES_SV, double[].class, long[].class, long.class, double.class, double.class); + CLASSIC_DOUBLES_SV_CTOR = + getConstructor(CLASSIC_DOUBLES_SV, double[].class, long[].class, long.class, double.class, double.class); } @Test //Example @@ -62,7 +66,7 @@ public static void checkCtr() throws Exception { long[] larr = { 1, 2, 3 }; long n = 3; ReqSketchSortedView reqSV = - (ReqSketchSortedView) REQ_SV_CTOR.newInstance(farr, larr, n); + (ReqSketchSortedView) REQ_SV_CTOR.newInstance(farr, larr, n, 10f, 30f); float q = reqSV.getQuantile(1.0, INCLUSIVE); assertEquals(q, 30f); } diff --git a/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java b/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java index eb75790e5..003a53c3b 100644 --- a/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java +++ b/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java @@ -21,12 +21,12 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.apache.datasketches.quantilescommon.FloatsSortedView; import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.testng.Assert; import org.testng.annotations.Test; /** @@ -39,13 +39,6 @@ public class ReqSketchSortedViewTest { private final int dup = 2; private final int n = numV * dup; - @Test - public void emptySketch() { - ReqSketch sketch = ReqSketch.builder().build(); - FloatsSortedViewIterator itr = sketch.getSortedView().iterator(); - Assert.assertFalse(itr.next()); - } - @Test public void twoValueSketch() { ReqSketch 
sketch = ReqSketch.builder().build(); @@ -57,8 +50,8 @@ public void twoValueSketch() { assertEquals(itr.getQuantile(), 1f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -66,8 +59,8 @@ public void twoValueSketch() { assertEquals(itr.getQuantile(), 2f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } @@ -111,9 +104,9 @@ private static void printIterator(final FloatsSortedViewIterator itr) { while (itr.next()) { float v = itr.getQuantile(); long wt = itr.getWeight(); - long cumWtNotInc = itr.getCumulativeWeight(EXCLUSIVE); + long cumWtNotInc = itr.getNaturalRank(EXCLUSIVE); double nRankNotInc = itr.getNormalizedRank(EXCLUSIVE); - long cumWtInc = itr.getCumulativeWeight(INCLUSIVE); + long cumWtInc = itr.getNaturalRank(INCLUSIVE); double nRankInc = itr.getNormalizedRank(INCLUSIVE); printf(fmt, v, wt, cumWtNotInc, nRankNotInc, cumWtInc, nRankInc); } diff --git a/src/test/java/org/apache/datasketches/req/ReqSketchTest.java b/src/test/java/org/apache/datasketches/req/ReqSketchTest.java index 4db9112a8..78b321e1d 100644 --- a/src/test/java/org/apache/datasketches/req/ReqSketchTest.java +++ b/src/test/java/org/apache/datasketches/req/ReqSketchTest.java @@ -29,6 +29,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import 
org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesFloatsSketchIterator; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -152,13 +153,13 @@ private static void checkGetRanks(final ReqSketch sk, final int max, final int i private static void checkSortedView(final ReqSketch sk, final int iDebug) { final ReqSketchSortedView sv = new ReqSketchSortedView(sk); - final ReqSketchSortedViewIterator itr = sv.iterator(); + final FloatsSortedViewIterator itr = sv.iterator(); final int retainedCount = sk.getNumRetained(); final long totalN = sk.getN(); int count = 0; long cumWt = 0; while (itr.next()) { - cumWt = itr.getCumulativeWeight(INCLUSIVE); + cumWt = itr.getNaturalRank(INCLUSIVE); count++; } assertEquals(cumWt, totalN); @@ -234,21 +235,6 @@ private static void checkMerge(final ReqSketch sk, final int iDebug) { //specific tests - @Test - public void getQuantiles() { - final ReqSketch sketch = ReqSketch.builder().setK(12).build(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void merge() { final ReqSketch s1 = ReqSketch.builder().setK(12).build();