From e6741622c796046d6bdb9368ac903f1beffe3358 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 31 Oct 2024 17:59:53 -0700 Subject: [PATCH 1/3] implemented getPMF() and getCDF() --- .../datasketches/tdigest/TDigestDouble.java | 47 +++++++++++++++++++ .../tdigest/TDigestDoubleTest.java | 13 +++-- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java index 1e3408511..478b80124 100644 --- a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java +++ b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java @@ -32,6 +32,7 @@ import org.apache.datasketches.memory.WritableBuffer; import org.apache.datasketches.memory.WritableMemory; import org.apache.datasketches.quantilescommon.QuantilesAPI; +import org.apache.datasketches.quantilescommon.QuantilesUtil; /** * t-Digest for estimating quantiles and ranks. @@ -125,6 +126,7 @@ public void merge(final TDigestDouble other) { /** * Process buffered values and merge centroids if needed */ + // this method will become private in the next major version public void compress() { if (numBuffered_ == 0) { return; } final int num = numBuffered_ + numCentroids_; @@ -277,6 +279,51 @@ public double getQuantile(final double rank) { return weightedAverage(centroidWeights_[numCentroids_ - 1], w1, maxValue_, w2); } + /** + * Returns an approximation to the Probability Mass Function (PMF) of the input stream + * given a set of split points. + * + * @param splitPoints an array of m unique, monotonically increasing values + * that divide the input domain into m+1 consecutive disjoint intervals (bins). + * + * @return an array of m+1 doubles each of which is an approximation + * to the fraction of the input stream values (the mass) that fall into one of those intervals. + * @throws SketchesStateException if sketch is empty. 
+ */ + public double[] getPMF(final double[] splitPoints) { + final double[] buckets = getCDF(splitPoints); + for (int i = buckets.length; i-- > 1; ) { + buckets[i] -= buckets[i - 1]; + } + return buckets; + } + + /** + * Returns an approximation to the Cumulative Distribution Function (CDF), which is the + * cumulative analog of the PMF, of the input stream given a set of split points. + * + * @param splitPoints an array of m unique, monotonically increasing values + * that divide the input domain into m+1 consecutive disjoint intervals. + * + * @return an array of m+1 doubles, which are a consecutive approximation to the CDF + * of the input stream given the splitPoints. The value at array position j of the returned + * CDF array is the sum of the returned values in positions 0 through j of the returned PMF + * array. This can be viewed as an array of ranks of the given split points plus one more value + * that is always 1. + * @throws SketchesStateException if sketch is empty. + */ + public double[] getCDF(final double[] splitPoints) { + if (isEmpty()) { throw new SketchesStateException(QuantilesAPI.EMPTY_MSG); } + QuantilesUtil.checkDoublesSplitPointsOrder(splitPoints); + final int len = splitPoints.length + 1; + final double[] ranks = new double[len]; + for (int i = 0; i < len - 1; i++) { + ranks[i] = getRank(splitPoints[i]); + } + ranks[len - 1] = 1.0; + return ranks; + } + /** * Computes size needed to serialize the current state. 
* @return size in bytes needed to serialize this tdigest diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java index db043cff6..55baa83e7 100644 --- a/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java +++ b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java @@ -41,6 +41,8 @@ public void empty() { assertThrows(SketchesStateException.class, () -> td.getMaxValue()); assertThrows(SketchesStateException.class, () -> td.getRank(0)); assertThrows(SketchesStateException.class, () -> td.getQuantile(0.5)); + assertThrows(SketchesStateException.class, () -> td.getPMF(new double[]{0})); + assertThrows(SketchesStateException.class, () -> td.getCDF(new double[]{0})); } @Test @@ -65,9 +67,6 @@ public void manyValues() { final TDigestDouble td = new TDigestDouble(); final int n = 10000; for (int i = 0; i < n; i++) td.update(i); -// System.out.println(td.toString(true)); -// td.compress(); -// System.out.println(td.toString(true)); assertFalse(td.isEmpty()); assertEquals(td.getTotalWeight(), n); assertEquals(td.getMinValue(), 0); @@ -82,6 +81,14 @@ public void manyValues() { assertEquals(td.getQuantile(0.9), n * 0.9, n * 0.9 * 0.01); assertEquals(td.getQuantile(0.95), n * 0.95, n * 0.95 * 0.01); assertEquals(td.getQuantile(1), n - 1); + final double[] pmf = td.getPMF(new double[] {n / 2}); + assertEquals(pmf.length, 2); + assertEquals(pmf[0], 0.5, 0.0001); + assertEquals(pmf[1], 0.5, 0.0001); + final double[] cdf = td.getCDF(new double[] {n / 2}); + assertEquals(cdf.length, 2); + assertEquals(cdf[0], 0.5, 0.0001); + assertEquals(cdf[1], 1.0); } @Test From 9d1d45b53ebbf6159491a29677ec4d7ee210e520 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 31 Oct 2024 18:01:01 -0700 Subject: [PATCH 2/3] corrected the exception thrown --- .../quantilescommon/QuantilesDoublesAPI.java | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 
deletions(-) diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java index e8e5310f5..e584a5c68 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java @@ -33,7 +33,7 @@ public interface QuantilesDoublesAPI extends QuantilesAPI { * This is equivalent to {@link #getCDF(double[], QuantileSearchCriteria) getCDF(splitPoints, INCLUSIVE)} * @param splitPoints an array of m unique, monotonically increasing items. * @return a discrete CDF array of m+1 double ranks (or cumulative probabilities) on the interval [0.0, 1.0]. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double[] getCDF(double[] splitPoints) { return getCDF(splitPoints, INCLUSIVE); @@ -70,7 +70,7 @@ default double[] getCDF(double[] splitPoints) { * * @param searchCrit the desired search criteria. * @return a discrete CDF array of m+1 double ranks (or cumulative probabilities) on the interval [0.0, 1.0]. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ double[] getCDF(double[] splitPoints, QuantileSearchCriteria searchCrit); @@ -79,7 +79,7 @@ default double[] getCDF(double[] splitPoints) { * item returned by getQuantile(1.0). * * @return the maximum item of the stream - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ double getMaxItem(); @@ -88,7 +88,7 @@ default double[] getCDF(double[] splitPoints) { * item returned by getQuantile(0.0). * * @return the minimum item of the stream - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. 
*/ double getMinItem(); @@ -96,7 +96,7 @@ default double[] getCDF(double[] splitPoints) { * This is equivalent to {@link #getPMF(double[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of m unique, monotonically increasing items. * @return a PMF array of m+1 probability masses as doubles on the interval [0.0, 1.0]. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double[] getPMF(double[] splitPoints) { return getPMF(splitPoints, INCLUSIVE); @@ -140,7 +140,7 @@ default double[] getPMF(double[] splitPoints) { * * @param searchCrit the desired search criteria. * @return a PMF array of m+1 probability masses as doubles on the interval [0.0, 1.0]. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ double[] getPMF(double[] splitPoints, QuantileSearchCriteria searchCrit); @@ -148,7 +148,7 @@ default double[] getPMF(double[] splitPoints) { * This is equivalent to {@link #getQuantile(double, QuantileSearchCriteria) getQuantile(rank, INCLUSIVE)} * @param rank the given normalized rank, a double in the range [0.0, 1.0]. * @return the approximate quantile given the normalized rank. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double getQuantile(double rank) { return getQuantile(rank, INCLUSIVE); @@ -163,7 +163,7 @@ default double getQuantile(double rank) { * If EXCLUSIVE, he given rank includes all quantiles < * the quantile directly corresponding to the given rank. * @return the approximate quantile given the normalized rank. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. 
* @see org.apache.datasketches.quantilescommon.QuantileSearchCriteria */ double getQuantile(double rank, QuantileSearchCriteria searchCrit); @@ -180,7 +180,7 @@ default double getQuantile(double rank) { * @param rank the given normalized rank * @return the lower bound of the quantile confidence interval in which the quantile of the * given rank exists. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ double getQuantileLowerBound(double rank); @@ -196,7 +196,7 @@ default double getQuantile(double rank) { * @param rank the given normalized rank * @return the upper bound of the quantile confidence interval in which the true quantile of the * given rank exists. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ double getQuantileUpperBound(double rank); @@ -205,7 +205,7 @@ default double getQuantile(double rank) { * @param ranks the given array of normalized ranks, each of which must be * in the interval [0.0,1.0]. * @return an array of quantiles corresponding to the given array of normalized ranks. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double[] getQuantiles(double[] ranks) { return getQuantiles(ranks, INCLUSIVE); @@ -219,7 +219,7 @@ default double[] getQuantiles(double[] ranks) { * @param searchCrit if INCLUSIVE, the given ranks include all quantiles ≤ * the quantile directly corresponding to each rank. * @return an array of quantiles corresponding to the given array of normalized ranks. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. 
* @see org.apache.datasketches.quantilescommon.QuantileSearchCriteria */ double[] getQuantiles(double[] ranks, QuantileSearchCriteria searchCrit); @@ -228,7 +228,7 @@ default double[] getQuantiles(double[] ranks) { * This is equivalent to {@link #getRank(double, QuantileSearchCriteria) getRank(quantile, INCLUSIVE)} * @param quantile the given quantile * @return the normalized rank corresponding to the given quantile - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double getRank(double quantile) { return getRank(quantile, INCLUSIVE); @@ -240,7 +240,7 @@ default double getRank(double quantile) { * @param quantile the given quantile * @param searchCrit if INCLUSIVE the given quantile is included into the rank. * @return the normalized rank corresponding to the given quantile - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. * @see org.apache.datasketches.quantilescommon.QuantileSearchCriteria */ double getRank(double quantile, QuantileSearchCriteria searchCrit); @@ -249,7 +249,7 @@ default double getRank(double quantile) { * This is equivalent to {@link #getRanks(double[], QuantileSearchCriteria) getRanks(quantiles, INCLUSIVE)} * @param quantiles the given array of quantiles * @return an array of normalized ranks corresponding to the given array of quantiles. - * @throws IllegalArgumentException if sketch is empty. + * @throws SketchesArgumentException if sketch is empty. */ default double[] getRanks(double[] quantiles) { return getRanks(quantiles, INCLUSIVE); @@ -262,7 +262,7 @@ default double[] getRanks(double[] quantiles) { * @param quantiles the given array of quantiles * @param searchCrit if INCLUSIVE, the given quantiles include the rank directly corresponding to each quantile. * @return an array of normalized ranks corresponding to the given array of quantiles. - * @throws IllegalArgumentException if sketch is empty. 
+ * @throws SketchesArgumentException if sketch is empty. * @see org.apache.datasketches.quantilescommon.QuantileSearchCriteria */ double[] getRanks(double[] quantiles, QuantileSearchCriteria searchCrit); From c07bca2bdf5053dea9edee78c5c5060388d18786 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 31 Oct 2024 21:56:27 -0700 Subject: [PATCH 3/3] added import --- .../apache/datasketches/quantilescommon/QuantilesDoublesAPI.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java index e584a5c68..3b26e93f0 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java @@ -20,6 +20,7 @@ package org.apache.datasketches.quantilescommon; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import org.apache.datasketches.common.SketchesArgumentException; /** * The Quantiles API for item type double.