From 405cb7620d820319d12e18601997523861a64de7 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 25 Oct 2023 17:07:09 -0700 Subject: [PATCH 01/13] Initial fixes for getPartitionBoundaries. --- .../quantiles/ItemsSketchSortedView.java | 3 +- .../quantilescommon/InequalitySearch.java | 255 +++++++++++++++++- .../quantilescommon/QuantilesFloatsAPI.java | 2 +- 3 files changed, 245 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index d2ccf9fdc..0dcd35fb9 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -130,8 +130,7 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/quantilescommon/InequalitySearch.java b/src/main/java/org/apache/datasketches/quantilescommon/InequalitySearch.java index 5a61e525f..51b013573 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/InequalitySearch.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/InequalitySearch.java @@ -73,6 +73,11 @@ int compare(final long[] arr, final int a, final int b, final long v) { return v <= arr[a] ? -1 : arr[b] < v ? 
1 : 0; } + @Override + int compare(final long[] arr, final int a, final int b, final double v) { + return v <= arr[a] ? -1 : arr[b] < v ? 1 : 0; + } + @Override int getIndex(final double[] arr, final int a, final int b, final double v) { return a; @@ -88,6 +93,11 @@ int getIndex(final long[] arr, final int a, final int b, final long v) { return a; } + @Override + int getIndex(final long[] arr, final int a, final int b, final double v) { + return a; + } + @Override int resolve(final double[] arr, final int lo, final int hi, final double v) { return (lo == hi) @@ -109,6 +119,13 @@ int resolve(final long[] arr, final int lo, final int hi, final long v) { : v > arr[hi] ? hi : (v > arr[lo] ? lo : -1); } + @Override + int resolve(final long[] arr, final int lo, final int hi, final double v) { + return (lo == hi) + ? (v > arr[lo] ? lo : -1) + : v > arr[hi] ? hi : (v > arr[lo] ? lo : -1); + } + @Override public String desc(final double[] arr, final int low, final int high, final double v, final int idx) { if (idx == -1) { @@ -150,6 +167,20 @@ public String desc(final long[] arr, final int low, final int high, final long v + ": arr[" + idx + "]=" + arr[idx] + " < " + v + " <= arr[" + (idx + 1) + "]=" + arr[idx + 1] + "; return arr[" + idx + "]=" + arr[idx]; } + + @Override + public String desc(final long[] arr, final int low, final int high, final double v, final int idx) { + if (idx == -1) { + return "LT: " + v + " <= arr[" + low + "]=" + arr[low] + "; return -1"; + } + if (idx == high) { + return "LT: " + v + " > arr[" + high + "]=" + arr[high] + + "; return arr[" + high + "]=" + arr[high]; + } //idx < high + return "LT: " + v + + ": arr[" + idx + "]=" + arr[idx] + " < " + v + " <= arr[" + (idx + 1) + "]=" + arr[idx + 1] + + "; return arr[" + idx + "]=" + arr[idx]; + } }, /** @@ -179,6 +210,11 @@ int compare(final long[] arr, final int a, final int b, final long v) { return v < arr[a] ? -1 : arr[b] <= v ? 
1 : 0; } + @Override + int compare(final long[] arr, final int a, final int b, final double v) { + return v < arr[a] ? -1 : arr[b] <= v ? 1 : 0; + } + @Override int getIndex(final double[] arr, final int a, final int b, final double v) { return a; @@ -194,6 +230,11 @@ int getIndex(final long[] arr, final int a, final int b, final long v) { return a; } + @Override + int getIndex(final long[] arr, final int a, final int b, final double v) { + return a; + } + @Override int resolve(final double[] arr, final int lo, final int hi, final double v) { return (lo == hi) @@ -215,6 +256,13 @@ int resolve(final long[] arr, final int lo, final int hi, final long v) { : v >= arr[hi] ? hi : (v >= arr[lo] ? lo : -1); } + @Override + int resolve(final long[] arr, final int lo, final int hi, final double v) { + return (lo == hi) + ? (v >= arr[lo] ? lo : -1) + : v >= arr[hi] ? hi : (v >= arr[lo] ? lo : -1); + } + @Override public String desc(final double[] arr, final int low, final int high, final double v, final int idx) { if (idx == -1) { @@ -256,6 +304,20 @@ public String desc(final long[] arr, final int low, final int high, final long v + ": arr[" + idx + "]=" + arr[idx] + " <= " + v + " < arr[" + (idx + 1) + "]=" + arr[idx + 1] + "; return arr[" + idx + "]=" + arr[idx]; } + + @Override + public String desc(final long[] arr, final int low, final int high, final double v, final int idx) { + if (idx == -1) { + return "LE: " + v + " < arr[" + low + "]=" + arr[low] + "; return -1"; + } + if (idx == high) { + return "LE: " + v + " >= arr[" + high + "]=" + arr[high] + + "; return arr[" + high + "]=" + arr[high]; + } + return "LE: " + v + + ": arr[" + idx + "]=" + arr[idx] + " <= " + v + " < arr[" + (idx + 1) + "]=" + arr[idx + 1] + + "; return arr[" + idx + "]=" + arr[idx]; + } }, /** @@ -281,6 +343,11 @@ int compare(final long[] arr, final int a, final int b, final long v) { return v < arr[a] ? -1 : arr[b] < v ? 
1 : 0; } + @Override + int compare(final long[] arr, final int a, final int b, final double v) { + return v < arr[a] ? -1 : arr[b] < v ? 1 : 0; + } + @Override int getIndex(final double[] arr, final int a, final int b, final double v) { return v == arr[a] ? a : v == arr[b] ? b : -1; @@ -296,6 +363,11 @@ int getIndex(final long[] arr, final int a, final int b, final long v) { return v == arr[a] ? a : v == arr[b] ? b : -1; } + @Override + int getIndex(final long[] arr, final int a, final int b, final double v) { + return v == arr[a] ? a : v == arr[b] ? b : -1; + } + @Override int resolve(final double[] arr, final int lo, final int hi, final double v) { return (lo == hi) @@ -317,6 +389,13 @@ int resolve(final long[] arr, final int lo, final int hi, final long v) { : v == arr[lo] ? lo : (v == arr[hi] ? hi : -1); } + @Override + int resolve(final long[] arr, final int lo, final int hi, final double v) { + return (lo == hi) + ? (v == arr[lo] ? lo : -1) + : v == arr[lo] ? lo : (v == arr[hi] ? hi : -1); + } + @Override public String desc(final double[] arr, final int low, final int high, final double v, final int idx) { if (idx == -1) { @@ -358,6 +437,20 @@ public String desc(final long[] arr, final int low, final int high, final long v } return "EQ: " + v + " == arr[" + idx + "]; return arr[" + idx + "]=" + arr[idx]; } + + @Override + public String desc(final long[] arr, final int low, final int high, final double v, final int idx) { + if (idx == -1) { + if (v > arr[high]) { + return "EQ: " + v + " > arr[" + high + "]; return -1"; + } + if (v < arr[low]) { + return "EQ: " + v + " < arr[" + low + "]; return -1"; + } + return "EQ: " + v + " Cannot be found within arr[" + low + "], arr[" + high + "]; return -1"; + } + return "EQ: " + v + " == arr[" + idx + "]; return arr[" + idx + "]=" + arr[idx]; + } }, /** @@ -387,6 +480,11 @@ int compare(final long[] arr, final int a, final int b, final long v) { return v <= arr[a] ? -1 : arr[b] < v ? 
1 : 0; } + @Override + int compare(final long[] arr, final int a, final int b, final double v) { + return v <= arr[a] ? -1 : arr[b] < v ? 1 : 0; + } + @Override int getIndex(final double[] arr, final int a, final int b, final double v) { return b; @@ -402,6 +500,11 @@ int getIndex(final long[] arr, final int a, final int b, final long v) { return b; } + @Override + int getIndex(final long[] arr, final int a, final int b, final double v) { + return b; + } + @Override int resolve(final double[] arr, final int lo, final int hi, final double v) { return (lo == hi) @@ -423,6 +526,13 @@ int resolve(final long[] arr, final int lo, final int hi, final long v) { : v <= arr[lo] ? lo : (v <= arr[hi] ? hi : -1); } + @Override + int resolve(final long[] arr, final int lo, final int hi, final double v) { + return (lo == hi) + ? (v <= arr[lo] ? lo : -1) + : v <= arr[lo] ? lo : (v <= arr[hi] ? hi : -1); + } + @Override public String desc(final double[] arr, final int low, final int high, final double v, final int idx) { if (idx == -1) { @@ -464,6 +574,20 @@ public String desc(final long[] arr, final int low, final int high, final long v + ": arr[" + (idx - 1) + "]=" + arr[idx - 1] + " < " + v + " <= arr[" + idx + "]=" + arr[idx] + "; return arr[" + idx + "]=" + arr[idx]; } + + @Override + public String desc(final long[] arr, final int low, final int high, final double v, final int idx) { + if (idx == -1) { + return "GE: " + v + " > arr[" + high + "]=" + arr[high] + "; return -1"; + } + if (idx == low) { + return "GE: " + v + " <= arr[" + low + "]=" + arr[low] + + "; return arr[" + low + "]=" + arr[low]; + } //idx > low + return "GE: " + v + + ": arr[" + (idx - 1) + "]=" + arr[idx - 1] + " < " + v + " <= arr[" + idx + "]=" + arr[idx] + + "; return arr[" + idx + "]=" + arr[idx]; + } }, /** @@ -493,6 +617,11 @@ int compare(final long[] arr, final int a, final int b, final long v) { return v < arr[a] ? -1 : arr[b] <= v ? 
1 : 0; } + @Override + int compare(final long[] arr, final int a, final int b, final double v) { + return v < arr[a] ? -1 : arr[b] <= v ? 1 : 0; + } + @Override int getIndex(final double[] arr, final int a, final int b, final double v) { return b; @@ -508,6 +637,11 @@ int getIndex(final long[] arr, final int a, final int b, final long v) { return b; } + @Override + int getIndex(final long[] arr, final int a, final int b, final double v) { + return b; + } + @Override int resolve(final double[] arr, final int lo, final int hi, final double v) { return (lo == hi) @@ -529,6 +663,13 @@ int resolve(final long[] arr, final int lo, final int hi, final long v) { : v < arr[lo] ? lo : (v < arr[hi] ? hi : -1); } + @Override + int resolve(final long[] arr, final int lo, final int hi, final double v) { + return (lo == hi) + ? (v < arr[lo] ? lo : -1) + : v < arr[lo] ? lo : (v < arr[hi] ? hi : -1); + } + @Override public String desc(final double[] arr, final int low, final int high, final double v, final int idx) { if (idx == -1) { @@ -570,14 +711,28 @@ public String desc(final long[] arr, final int low, final int high, final long v + ": arr[" + (idx - 1) + "]=" + arr[idx - 1] + " <= " + v + " < arr[" + idx + "]=" + arr[idx] + "; return arr[" + idx + "]=" + arr[idx]; } + + @Override + public String desc(final long[] arr, final int low, final int high, final double v, final int idx) { + if (idx == -1) { + return "GT: " + v + " >= arr[" + high + "]=" + arr[high] + "; return -1"; + } + if (idx == low) { + return "GT: " + v + " < arr[" + low + "]=" + arr[low] + + "; return arr[" + low + "]=" + arr[low]; + } //idx > low + return "GT: " + v + + ": arr[" + (idx - 1) + "]=" + arr[idx - 1] + " <= " + v + " < arr[" + idx + "]=" + arr[idx] + + "; return arr[" + idx + "]=" + arr[idx]; + } }; /** * The call to compare index a and index b with the value v. 
- * @param arr The underlying sorted array of double values + * @param arr The underlying sorted array of values * @param a the lower index of the current pair * @param b the higher index of the current pair - * @param v the double value to search for + * @param v the value to search for * @return +1, which means we must search higher in the array, or -1, which means we must * search lower in the array, or 0, which means we have found the correct bounding pair. */ @@ -585,10 +740,10 @@ public String desc(final long[] arr, final int low, final int high, final long v /** * The call to compare index a and index b with the value v. - * @param arr The underlying sorted array of float values + * @param arr The underlying sorted array of values * @param a the lower index of the current pair * @param b the higher index of the current pair - * @param v the float value to search for + * @param v the value to search for * @return +1, which means we must search higher in the array, or -1, which means we must * search lower in the array, or 0, which means we have found the correct bounding pair. */ @@ -596,15 +751,26 @@ public String desc(final long[] arr, final int low, final int high, final long v /** * The call to compare index a and index b with the value v. - * @param arr The underlying sorted array of long values + * @param arr The underlying sorted array of values * @param a the lower index of the current pair * @param b the higher index of the current pair - * @param v the long value to search for + * @param v the value to search for * @return +1, which means we must search higher in the array, or -1, which means we must * search lower in the array, or 0, which means we have found the correct bounding pair. */ abstract int compare(long[] arr, int a, int b, long v); + /** + * The call to compare index a and index b with the value v. 
+ * @param arr The underlying sorted array of values + * @param a the lower index of the current pair + * @param b the higher index of the current pair + * @param v the value to search for + * @return +1, which means we must search higher in the array, or -1, which means we must + * search lower in the array, or 0, which means we have found the correct bounding pair. + */ + abstract int compare(long[] arr, int a, int b, double v); + /** * If the compare operation returns 0, which means "found", this returns the index of the * found value that satisfies the selected criteria. @@ -638,6 +804,17 @@ public String desc(final long[] arr, final int low, final int high, final long v */ abstract int getIndex(long[] arr, int a, int b, long v); + /** + * If the compare operation returns 0, which means "found", this returns the index of the + * found value that satisfies the selected criteria. + * @param arr the array being searched + * @param a the lower index of the current pair + * @param b the higher index of the current pair + * @param v the value being searched for. + * @return the index of the found value that satisfies the selected criteria. + */ + abstract int getIndex(long[] arr, int a, int b, double v); + /** * Called to resolve the search when the hi and lo pointers are equal or adjacent. * @param arr the array being searched @@ -668,13 +845,23 @@ public String desc(final long[] arr, final int low, final int high, final long v */ abstract int resolve(long[] arr, int lo, int hi, long v); + /** + * Called to resolve the search when the hi and lo pointers are equal or adjacent. + * @param arr the array being searched + * @param lo the current lo value + * @param hi the current hi value + * @param v the value being searched for + * @return the index of the resolution or -1, if it cannot be resolved. + */ + abstract int resolve(long[] arr, int lo, int hi, double v); + /** * Optional call that describes the details of the results of the search. 
* Used primarily for debugging. - * @param arr The underlying sorted array of double values + * @param arr The underlying sorted array of values * @param low the low index of the range * @param high the high index of the range - * @param v the double value to search for + * @param v the value to search for * @param idx the resolved index from the search * @return the descriptive string. */ @@ -683,10 +870,10 @@ public String desc(final long[] arr, final int low, final int high, final long v /** * Optional call that describes the details of the results of the search. * Used primarily for debugging. - * @param arr The underlying sorted array of double values + * @param arr The underlying sorted array of values * @param low the low index of the range * @param high the high index of the range - * @param v the double value to search for + * @param v the value to search for * @param idx the resolved index from the search * @return the descriptive string. */ @@ -695,15 +882,27 @@ public String desc(final long[] arr, final int low, final int high, final long v /** * Optional call that describes the details of the results of the search. * Used primarily for debugging. - * @param arr The underlying sorted array of double values + * @param arr The underlying sorted array of values * @param low the low index of the range * @param high the high index of the range - * @param v the double value to search for + * @param v the value to search for * @param idx the resolved index from the search * @return the descriptive string. */ public abstract String desc(long[] arr, int low, int high, long v, int idx); + /** + * Optional call that describes the details of the results of the search. + * Used primarily for debugging. + * @param arr The underlying sorted array of values + * @param low the low index of the range + * @param high the high index of the range + * @param v the value to search for + * @param idx the resolved index from the search + * @return the descriptive string. 
+ */ + public abstract String desc(long[] arr, int low, int high, double v, int idx); + /** * Binary Search for the index of the double value in the given search range that satisfies * the given InequalitySearch criterion. @@ -804,4 +1003,36 @@ public static int find(final long[] arr, final int low, final int high, return -1; //should never return here } + /** + * Binary Search for the index of the double value in the given search range that satisfies + * the given InequalitySearch criterion. + * If -1 is returned there are no values in the search range that satisfy the criterion. + * + * @param arr the given array that must be sorted. + * @param low the lowest index of the lowest value in the search range, inclusive. + * @param high the highest index of the highest value in the search range, inclusive. + * @param v the value to search for. + * @param crit one of LT, LE, EQ, GT, GE + * @return the index of the value in the given search range that satisfies the criterion + */ + public static int find(final long[] arr, final int low, final int high, + final double v, final InequalitySearch crit) { + Objects.requireNonNull(arr, "Input arr must not be null"); + Objects.requireNonNull(crit, "Input crit must not be null"); + if (arr.length == 0) { throw new SketchesArgumentException("Input array must not be empty."); } + int lo = low; + int hi = high; + while (lo <= hi) { + if (hi - lo <= 1) { + return crit.resolve(arr, lo, hi, v); + } + final int mid = lo + (hi - lo) / 2; + final int ret = crit.compare(arr, mid, mid + 1, v); + if (ret == -1 ) { hi = mid; } + else if (ret == 1) { lo = mid + 1; } + else { return crit.getIndex(arr, mid, mid + 1, v); } + } + return -1; //should never return here + } + } //End of enum diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java index ddece47e6..aebdc46b8 100644 --- 
a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java @@ -221,7 +221,7 @@ default float getQuantile(double rank) { * Gets the lower bound of the quantile confidence interval in which the quantile of the * given rank exists. * - *

<p>Although it is possible to estimate the probablity that the true quantile + *

<p>Although it is possible to estimate the probability that the true quantile * exists within the quantile confidence interval specified by the upper and lower quantile bounds, * it is not possible to guarantee the width of the quantile confidence interval * as an additive or multiplicative percent of the true quantile.</p>

From a57677487a44e2d65077d0c9717030355c29a495 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 25 Oct 2023 17:31:28 -0700 Subject: [PATCH 02/13] This applies the same fix that eliminates duplicate entries when using getPartitionBoundaries(...) in all the quantile sketches for small values of N. --- .../apache/datasketches/kll/KllDoublesSketchSortedView.java | 3 +-- .../org/apache/datasketches/kll/KllFloatsSketchSortedView.java | 3 +-- .../org/apache/datasketches/kll/KllItemsSketchSortedView.java | 3 +-- .../apache/datasketches/quantiles/DoublesSketchSortedView.java | 3 +-- .../datasketches/quantilescommon/QuantilesFloatsAPI.java | 2 +- .../java/org/apache/datasketches/req/ReqSketchSortedView.java | 3 +-- 6 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index 03259b952..a5d50297d 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -83,8 +83,7 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index 6a378531d..41dfb0992 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -83,8 +83,7 @@ public float getQuantile(final double rank, final QuantileSearchCriteria searchC if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index c3fb8bab6..40d8117d9 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -133,8 +133,7 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java index 02ccdd039..ec359df11 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java @@ -83,8 +83,7 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java index aebdc46b8..c6ea484cc 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java @@ -237,7 +237,7 @@ default float getQuantile(double rank) { * Gets the upper bound of the quantile confidence interval in which the true quantile of the * given rank exists. * - *

<p>Although it is possible to estimate the probablity that the true quantile + *

<p>Although it is possible to estimate the probability that the true quantile * exists within the quantile confidence interval specified by the upper and lower quantile bounds, * it is not possible to guarantee the width of the quantile interval * as an additive or multiplicative percent of the true quantile.</p>

diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java index 1b8586abf..cb8bc4a51 100644 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java @@ -70,8 +70,7 @@ public float getQuantile(final double rank, final QuantileSearchCriteria searchC if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = (searchCrit == INCLUSIVE) - ? (long)Math.ceil(rank * totalN) : (long)Math.floor(rank * totalN); + final long naturalRank = Math.round(rank * totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { From e263fe6f72820b016475b089fa7b74ff9339b0bf Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 31 Oct 2023 12:50:05 -0700 Subject: [PATCH 03/13] update Fixes for getPartitionBoundaries --- .../org/apache/datasketches/common/Util.java | 4 +- .../datasketches/kll/KllDoublesSketch.java | 4 +- .../kll/KllDoublesSketchSortedView.java | 21 +++++- .../datasketches/kll/KllFloatsSketch.java | 4 +- .../kll/KllFloatsSketchSortedView.java | 3 +- .../datasketches/kll/KllItemsSketch.java | 4 +- .../kll/KllItemsSketchSortedView.java | 28 ++++++-- .../datasketches/quantiles/DoublesSketch.java | 4 +- .../quantiles/DoublesSketchSortedView.java | 27 +++++-- .../datasketches/quantiles/ItemsSketch.java | 24 +++++-- .../quantiles/ItemsSketchSortedView.java | 35 +++++++--- .../quantilescommon/DoublesSortedView.java | 4 +- .../quantilescommon/FloatsSortedView.java | 4 +- .../quantilescommon/QuantilesAPI.java | 11 ++- .../quantilescommon/QuantilesGenericAPI.java | 7 ++ .../quantilescommon/QuantilesUtil.java | 64 ++++++++++++++--- 
.../datasketches/req/BaseReqSketch.java | 4 +- .../datasketches/req/ReqSketchSortedView.java | 23 +++++- .../datasketches/kll/KllItemsSketchTest.java | 22 +++--- .../quantiles/CustomQuantilesTest.java | 70 +++++++++++-------- .../HeapUpdateDoublesSketchTest.java | 4 +- .../quantilescommon/QuantilesUtilTest.java | 27 ++++++- 22 files changed, 295 insertions(+), 103 deletions(-) diff --git a/src/main/java/org/apache/datasketches/common/Util.java b/src/main/java/org/apache/datasketches/common/Util.java index 18e051261..602b40b0b 100644 --- a/src/main/java/org/apache/datasketches/common/Util.java +++ b/src/main/java/org/apache/datasketches/common/Util.java @@ -376,8 +376,8 @@ public static int ceilingIntPowerOf2(final int n) { } /** - * Computes the long ceiling power of 2 within the range [1, 2^30]. This is the smallest positive power - * of 2 that is equal to or greater than the given n and a mathematical integer. + * Computes the long ceiling power of 2 within the range [1, 2^62]. This is the smallest positive power + * of 2 that is equal to or greater than the given n and a mathematical long. * *

<p>For: * <ul>

    diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 183a15ba7..213544021 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -24,7 +24,7 @@ import static org.apache.datasketches.common.ByteArrayUtil.putDoubleLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -179,7 +179,7 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallyWeightedRanks(numEquallyWeighted); + final double[] ranks = equallySpacedDoubles(numEquallyWeighted); final double[] boundaries = getQuantiles(ranks, searchCrit); boundaries[0] = getMinItem(); boundaries[boundaries.length - 1] = getMaxItem(); diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index a5d50297d..8f8ae5d63 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -21,6 +21,7 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.Arrays; 
@@ -83,7 +84,7 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -92,6 +93,24 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search return quantiles[index]; } + /** + * Special version of getQuantile to support the getPartitionBoundaries(int) function. + * @param weight ultimately comes from selected integral weights computed by the sketch. + * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ + * the quantile directly corresponding to the given weight internal to the sketch. + * @return the approximate quantile given the weight. + */ + double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } + final int len = cumWeights.length; + final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; + final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); + if (index == -1) { + return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + } + return quantiles[index]; + } + @Override public double[] getQuantiles() { return quantiles.clone(); diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 6c60facae..e2e4d808a 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -24,7 +24,7 @@ import static org.apache.datasketches.common.ByteArrayUtil.putFloatLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -179,7 +179,7 @@ public double[] getCDF(final float[] splitPoints, final QuantileSearchCriteria s public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallyWeightedRanks(numEquallyWeighted); + final double[] ranks = equallySpacedDoubles(numEquallyWeighted); final float[] boundaries = getQuantiles(ranks, searchCrit); boundaries[0] = getMinItem(); boundaries[boundaries.length - 1] = getMaxItem(); diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index 41dfb0992..8f47a8da7 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ 
b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -21,6 +21,7 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.Arrays; @@ -83,7 +84,7 @@ public float getQuantile(final double rank, final QuantileSearchCriteria searchC if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 68c9a6dfd..d5f73b00d 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -23,7 +23,7 @@ import static java.lang.Math.min; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.ITEMS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.lang.reflect.Array; import java.util.Comparator; @@ -153,7 +153,7 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final 
double[] ranks = equallyWeightedRanks(numEquallyWeighted); + final double[] ranks = equallySpacedDoubles(numEquallyWeighted); final Object[] boundaries = getQuantiles(ranks, searchCrit); boundaries[0] = getMinItem(); boundaries[boundaries.length - 1] = getMaxItem(); diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index 40d8117d9..7c066dff1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -19,8 +19,10 @@ package org.apache.datasketches.kll; +import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; import java.util.Arrays; @@ -28,13 +30,11 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.quantilescommon.GenericInequalitySearch; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; -import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -84,7 +84,7 @@ public class KllItemsSketchSortedView implements GenericSortedView { final int srcNumLevels = sk.getNumLevels(); this.comp = sk.comparator; - if (totalN == 0) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); } + 
if (totalN == 0) { throw new SketchesArgumentException(EMPTY_MSG); } if (!sk.isLevelZeroSorted()) { Arrays.sort((T[])srcQuantiles, srcLevels[0], srcLevels[1], comp); if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } @@ -133,7 +133,7 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -142,6 +142,24 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) return (T) quantiles[index]; } + /** + * Special version of getQuantile to support the getPartitionBoundaries(int) function. + * @param weight ultimately comes from selected integral weights computed by the sketch. + * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ + * the quantile directly corresponding to the given weight internal to the sketch. + * @return the approximate quantile given the weight. + */ + T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } + final int len = cumWeights.length; + final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; + final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); + if (index == -1) { + return (T) quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + } + return (T) quantiles[index]; + } + @Override public T[] getQuantiles() { final T[] quants = (T[]) Array.newInstance(minItem.getClass(), quantiles.length); @@ -154,7 +172,7 @@ public double getRank(final T quantile, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } final int len = quantiles.length; final Inequality crit = (searchCrit == INCLUSIVE) ? Inequality.LE : Inequality.LT; - final int index = GenericInequalitySearch.find((T[])quantiles, 0, len - 1, quantile, crit, comp); + final int index = find((T[])quantiles, 0, len - 1, quantile, crit, comp); if (index == -1) { return 0; //EXCLUSIVE (LT) case: quantile <= minQuantile; INCLUSIVE (LE) case: quantile < minQuantile } diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java index afa660205..b3a78d5af 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java @@ -28,7 +28,7 @@ import static org.apache.datasketches.quantiles.ClassicUtil.checkK; import static org.apache.datasketches.quantiles.ClassicUtil.computeNumLevelsNeeded; import static org.apache.datasketches.quantiles.ClassicUtil.computeRetainedItems; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Random; @@ -174,7 +174,7 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if 
(isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallyWeightedRanks(numEquallyWeighted); + final double[] ranks = equallySpacedDoubles(numEquallyWeighted); final double[] boundaries = getQuantiles(ranks, searchCrit); boundaries[0] = getMinItem(); boundaries[boundaries.length - 1] = getMaxItem(); diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java index ec359df11..ef250fe5f 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java @@ -22,6 +22,8 @@ import static java.lang.System.arraycopy; import static org.apache.datasketches.quantiles.DoublesSketchAccessor.BB_LVL_IDX; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.Arrays; @@ -29,7 +31,6 @@ import org.apache.datasketches.quantilescommon.DoublesSortedView; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; -import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -80,10 +81,10 @@ public DoublesSketchSortedView(final DoublesSketch sketch) { @Override public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = 
getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -92,9 +93,27 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search return quantiles[index]; } + /** + * Special version of getQuantile to support the getPartitionBoundaries(int) function. + * @param weight ultimately comes from selected integral weights computed by the sketch. + * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ + * the quantile directly corresponding to the given weight internal to the sketch. + * @return the approximate quantile given the weight. + */ + double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } + final int len = cumWeights.length; + final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; + final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); + if (index == -1) { + return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + } + return quantiles[index]; + } + @Override public double getRank(final double quantile, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } final int len = quantiles.length; final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.LE : InequalitySearch.LT; final int index = InequalitySearch.find(quantiles, 0, len - 1, quantile, crit); diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java index cdb21ae94..64f66fde2 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java @@ -37,7 +37,7 @@ import static org.apache.datasketches.quantiles.PreambleUtil.extractPreLongs; import static org.apache.datasketches.quantiles.PreambleUtil.extractSerVer; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedLongs; import java.lang.reflect.Array; import java.util.Arrays; @@ -298,14 +298,16 @@ public T getMinItem() { public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallyWeightedRanks(numEquallyWeighted); - final T[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); + refreshSortedView(); + final long[] weights = equallySpacedLongs(1, getN(), numEquallyWeighted); + final T[] boundaries = getQuantiles(weights, searchCrit); final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>(); gpb.N = this.getN(); - gpb.ranks = ranks; gpb.boundaries = boundaries; + gpb.weights = weights; + final double[] ranks = new double[weights.length]; + for (int i = 0; i < weights.length; i++) { ranks[i] = (double)weights[i] / getN(); } + gpb.ranks = ranks; return gpb; } @@ -363,6 +365,16 @@ public T[] getQuantiles(final double[] ranks, final 
QuantileSearchCriteria searc return quantiles; } + @SuppressWarnings("unchecked") + private T[] getQuantiles(final long[] weights, final QuantileSearchCriteria crit) { + final int len = weights.length; + final T[] quantiles = (T[]) Array.newInstance(minItem_.getClass(), len); + for (int i = 0; i < len; i++) { + quantiles[i] = classicQisSV.getQuantile(weights[i], crit); + } + return quantiles; + } + @Override public double getRank(final T quantile) { return getRank(quantile, INCLUSIVE); diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index 0dcd35fb9..68ec30e36 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -19,20 +19,21 @@ package org.apache.datasketches.quantiles; +import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; import java.util.Arrays; import java.util.Comparator; import org.apache.datasketches.common.SketchesStateException; -import org.apache.datasketches.quantilescommon.GenericInequalitySearch; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; -import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -97,7 +98,7 @@ public class 
ItemsSketchSortedView implements GenericSortedView { @Override //implemented here because it needs the comparator public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); final int len = splitPoints.length + 1; final double[] buckets = new double[len]; @@ -115,7 +116,7 @@ public long[] getCumulativeWeights() { @Override //implemented here because it needs the comparator public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); final double[] buckets = getCDF(splitPoints, searchCrit); final int len = buckets.length; @@ -127,10 +128,10 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc @Override public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -139,6 +140,24 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) return quantiles[index]; } + /** + * Special version of getQuantile to support the getPartitionBoundaries(int) function. 
+ * @param weight ultimately comes from selected integral weights computed by the sketch. + * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ + * the quantile directly corresponding to the given weight internal to the sketch. + * @return the approximate quantile given the weight. + */ + T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } + final int len = cumWeights.length; + final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; + final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); + if (index == -1) { + return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + } + return quantiles[index]; + } + @Override public T[] getQuantiles() { return quantiles.clone(); @@ -146,10 +165,10 @@ public T[] getQuantiles() { @Override public double getRank(final T quantile, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } final int len = quantiles.length; final Inequality crit = (searchCrit == INCLUSIVE) ? 
Inequality.LE : Inequality.LT; - final int index = GenericInequalitySearch.find(quantiles, 0, len - 1, quantile, crit, comparator); + final int index = find(quantiles, 0, len - 1, quantile, crit, comparator); if (index == -1) { return 0; //EXCLUSIVE (LT) case: quantile <= minQuantile; INCLUSIVE (LE) case: quantile < minQuantile } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java index 6719c2f7f..8c299321e 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java @@ -135,8 +135,8 @@ default double[] getPMF(double[] splitPoints, QuantileSearchCriteria searchCrit double getQuantile(double rank, QuantileSearchCriteria searchCrit); /** - * Returns the array of quantiles. - * @return the array of quantiles. + * Returns an array of all retained quantiles by the sketch. + * @return an array of all retained quantiles by the sketch. */ double[] getQuantiles(); diff --git a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java index 0720da866..7127b5928 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java @@ -135,8 +135,8 @@ default double[] getPMF(float[] splitPoints, QuantileSearchCriteria searchCrit) float getQuantile(double rank, QuantileSearchCriteria searchCrit); /** - * Returns the array of quantiles - * @return the array of quantiles + * Returns an array of all retained quantiles by the sketch. + * @return an array of all retained quantiles by the sketch. 
*/ float[] getQuantiles(); diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java index cbb721734..74e5d8061 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java @@ -204,12 +204,11 @@ */ public interface QuantilesAPI { - String EMPTY_MSG = "The sketch must not be empty for this operation. "; - String UNSUPPORTED_MSG = "Unsupported operation for this Sketch Type. "; - String NOT_SINGLE_ITEM_MSG = "Sketch does not have just one item. "; - String MEM_REQ_SVR_NULL_MSG = "MemoryRequestServer must not be null. "; - String TGT_IS_READ_ONLY_MSG = "Target sketch is Read Only, cannot write. "; - + static String EMPTY_MSG = "The sketch must not be empty for this operation. "; + static String UNSUPPORTED_MSG = "Unsupported operation for this Sketch Type. "; + static String NOT_SINGLE_ITEM_MSG = "Sketch does not have just one item. "; + static String MEM_REQ_SVR_NULL_MSG = "MemoryRequestServer must not be null. "; + static String TGT_IS_READ_ONLY_MSG = "Target sketch is Read Only, cannot write. "; /** * Gets the user configured parameter k, which controls the accuracy of the sketch diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java index eb27ce76d..f8dd8e62d 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java @@ -364,6 +364,13 @@ static class GenericPartitionBoundaries { */ public double[] ranks; + /** + * The cumulative weights that correspond to the returned boundaries. + * The returned array is of size (m + 1), where m is the requested number of partitions. 
+ * Index 0 of the returned array is always 1, and index m is always n. + */ + public long[] weights; + /** * The partition boundaries as quantiles. * The returned array is of size (m + 1), where m is the requested number of partitions. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java index 34faefb4f..848ee3105 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java @@ -86,18 +86,18 @@ public static final void checkFloatsSplitPointsOrder(final float[] values) { } /** - * Returns a double array of ranks that defines equally weighted regions between 0.0, inclusive and 1.0, inclusive. - * The 0.0 and 1.0 end points are part of the returned array and are the getMinItem() and getMaxItem() values of the - * sketch. - * For example, if num == 2, three values will be returned: 0.0, .5, and 1, where the two equally weighted regions are - * 0.0 to 0.5, and 0.5 to 1.0. - * @param num the total number of equally weighted regions between 0.0 and 1.0 defined by the ranks in the returned - * array. num must be 1 or greater. - * @return a double array of num + 1 ranks that define the boundaries of num equally weighted - * regions between 0.0, inclusive and 1.0, inclusive. + * Returns an array of (num + 1) values that define equally sized intervals between 0.0, inclusive, and 1.0, + * inclusive. The end points 0.0 and 1.0 are part of the returned array. + * + *

    <p>For example, if num == 2, three values will be returned: 0.0, .5, and 1, where the two equally sized regions + * are {0.0,0.5}, and {0.5, 1.0}.</p>

    + * @param num the total number of equally sized intervals between 0.0, inclusive and 1.0, inclusive. + * Must be 1 or greater. + * @return a double array of (num + 1) values that define num equally sized intervals between 0.0, inclusive and 1.0, + * inclusive. * @throws IllegalArgumentException if num is less than 1. */ - public static double[] equallyWeightedRanks(final int num) { + public static double[] equallySpacedDoubles(final int num) { if (num < 1) { throw new IllegalArgumentException("num must be >= 1"); } final double[] out = new double[num + 1]; out[0] = 0.0; @@ -107,6 +107,36 @@ public static double[] equallyWeightedRanks(final int num) { return out; } + /** + * Returns an array of (num + 1) longs that define, approximately, equally spaced intervals between the given + * max, inclusive, and min, inclusive. The end points max and min are part of the + * returned array. Because the range of the values may not exactly divide into num intervals, + * the size of these intervals may vary by plus or minus one. + * @param min the lowest positive valued (or zero) number of the range + * @param max the highest positive valued number of the range. max must be greater than min + * @param num Number of requested intervals. Must be greater or equal to one, and less than or equal to + * max - min. + * + * @return an array of (num + 1) longs that are approximately equally spaced between the given min and max. 
+ */ + public static long[] equallySpacedLongs(final long min, final long max, final int num) { + if (num < 1 || min < 0 || max < 1 || (min >= max) || num > (max - min)) { + throw new SketchesArgumentException( + "Improper inputs: n < 1, min < 0, max < 1, min >= max, or n > (max - min)"); + } + final long span = (max - min); + final double[] splits = equallySpacedDoubles(num); + final int len = num + 1; + final long[] out = new long[len]; + long prev = -1L; + for (int i = 0; i < len; i++) { + long cur = Math.round(splits[i] * span); + if (cur == prev) { cur++; } else { prev = cur; } + out[i] = min + cur; + } + return out; + } + /** * Returns a float array of evenly spaced values between value1, inclusive, and value2 inclusive. * If value2 > value1, the resulting sequence will be increasing. @@ -178,5 +208,19 @@ public static double[] evenlyLogSpaced(final double value1, final double value2, return arr; } + public static double maxPrecision; + + public static double getNaturalRank(final double normalizedRank, final long totalN) { + final double naturalRank = normalizedRank * totalN; + if (totalN <= 1_000_000L) { + final double precision = Util.ceilingPowerBaseOfDouble(10.0, totalN) ; + maxPrecision = precision; + final double trimmedNatRank = Math.round(naturalRank * precision) / precision; + return trimmedNatRank; + } else { + return naturalRank; + } + } + } diff --git a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java index 39e808dff..7c11ee2ab 100644 --- a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java +++ b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java @@ -19,7 +19,7 @@ package org.apache.datasketches.req; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import org.apache.datasketches.quantilescommon.FloatsSortedView; import 
org.apache.datasketches.quantilescommon.QuantileSearchCriteria; @@ -66,7 +66,7 @@ abstract class BaseReqSketch implements QuantilesFloatsAPI { public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallyWeightedRanks(numEquallyWeighted); + final double[] ranks = equallySpacedDoubles(numEquallyWeighted); final float[] boundaries = getQuantiles(ranks, searchCrit); boundaries[0] = getMinItem(); boundaries[boundaries.length - 1] = getMaxItem(); diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java index cb8bc4a51..f06461650 100644 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java @@ -20,6 +20,7 @@ package org.apache.datasketches.req; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.List; @@ -70,7 +71,7 @@ public float getQuantile(final double rank, final QuantileSearchCriteria searchC if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final long naturalRank = Math.round(rank * totalN); + final double naturalRank = getNaturalRank(rank, totalN); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -79,6 +80,26 @@ public float getQuantile(final double rank, final QuantileSearchCriteria searchC return quantiles[index]; } + /** + * Special version of getQuantile to support the getPartitionBoundaries(int) function. + * @param weight ultimately comes from selected integral weights computed by the sketch. + * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ + * the quantile directly corresponding to the given weight internal to the sketch. + * @return the approximate quantile given the weight. + */ + float getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final int len = cumWeights.length; + final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; + final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); + if (index == -1) { + return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + } + return quantiles[index]; + } + + + @Override public float[] getQuantiles() { return quantiles.clone(); diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java index fddd5fbee..a980841b6 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java @@ -107,30 +107,32 @@ public void oneValue() { public void tenValues() { final String[] tenStr = {"A","B","C","D","E","F","G","H","I","J"}; final KllItemsSketch sketch = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); - for (int i = 1; i <= 10; i++) { sketch.update(tenStr[i - 1]); } + final int strLen = tenStr.length; + final double dblStrLen = strLen; + for (int i = 1; i <= 
strLen; i++) { sketch.update(tenStr[i - 1]); } assertFalse(sketch.isEmpty()); - assertEquals(sketch.getN(), 10); - assertEquals(sketch.getNumRetained(), 10); - for (int i = 1; i <= 10; i++) { - assertEquals(sketch.getRank(tenStr[i - 1], EXCLUSIVE), (i - 1) / 10.0); - assertEquals(sketch.getRank(tenStr[i - 1], INCLUSIVE), i / 10.0); + assertEquals(sketch.getN(), strLen); + assertEquals(sketch.getNumRetained(), strLen); + for (int i = 1; i <= strLen; i++) { + assertEquals(sketch.getRank(tenStr[i - 1], EXCLUSIVE), (i - 1) / dblStrLen); + assertEquals(sketch.getRank(tenStr[i - 1], INCLUSIVE), i / dblStrLen); } final String[] qArr = tenStr; double[] rOut = sketch.getRanks(qArr); //inclusive for (int i = 0; i < qArr.length; i++) { - assertEquals(rOut[i], (i + 1) / 10.0); + assertEquals(rOut[i], (i + 1) / dblStrLen); } rOut = sketch.getRanks(qArr, EXCLUSIVE); //exclusive for (int i = 0; i < qArr.length; i++) { assertEquals(rOut[i], i / 10.0); } - for (int i = 0; i <= 10; i++) { - double rank = i/10.0; + for (int i = 0; i <= strLen; i++) { + double rank = i/dblStrLen; String q = rank == 1.0 ? tenStr[i-1] : tenStr[i]; assertEquals(sketch.getQuantile(rank, EXCLUSIVE), q); q = rank == 0 ? 
tenStr[i] : tenStr[i - 1]; - assertEquals(sketch.getQuantile(rank, INCLUSIVE), q); + assertEquals(sketch.getQuantile(rank, INCLUSIVE), q); //ERROR } { diff --git a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java index 44b1b0c2d..216b91f72 100644 --- a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java @@ -23,27 +23,29 @@ import static org.apache.datasketches.quantilescommon.LinearRanksAndQuantiles.getTrueDoubleRank; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import static org.testng.Assert.assertEquals; import org.testng.annotations.Test; public class CustomQuantilesTest { - @Test /** * Currently, this test only exercises the classic DoublesSketch, but all the quantiles * sketches use the same code for getQuantile() and getRank() anyway. * This same pattern is also part of the CrossCheckQuantilesTest. * This structure of this test allows more detailed analysis for troubleshooting. 
*/ + @Test public void checkQuantilesV400() { - println("Classic DoubleSketch, Version 4.0.0, k=4"); + println("org.apache.datasketches.quantiles.CustomQuantilesTest:"); + println("Classic DoubleSketch, Version 4.0.0, k=4, N=12"); println(""); //The following for loop creates the following pattern for the sorted view: // Quantiles: {10,10,20,20,30,30,40,40} // Weights : { 2, 1, 2, 1, 2, 1, 2, 1} //This is easy to create from the classic quantiles sketch directly, but for the other - //quantiles sketches it would be easier to create by loading the sorted view directly via + //quantiles sketches it is easier to create by loading the sorted view directly via //a package-private constructor. int k = 4; UpdateDoublesSketch sk = DoublesSketch.builder().setK(k).build(); @@ -58,7 +60,7 @@ public void checkQuantilesV400() { long[] cumWtsArr = sv.getCumulativeWeights(); int lenQ = quantilesArr.length; println("Sorted View:"); - printf("%12s%12s%12s\n", "Quantiles", "ICumWts", "IRanks"); + printf("%13s %13s %13s\n", "QuantilesArr", "CumWtsArr", "NormRanks"); double normRank; for (int i = 0; i < lenQ; i++) { normRank = (double)cumWtsArr[i] / N; @@ -68,51 +70,59 @@ public void checkQuantilesV400() { println("GetRanks, EXCLUSIVE:"); println(" R of the largest Q at the highest index that is < q. 
If q <= smallest Q => 0"); - printf("%12s%12s\n", "Quantiles", "Ranks"); - for (int q = 0; q <= (k * 10) + 5; q += 5) { - double nr = sk.getRank(q, EXCLUSIVE); - double nrTrue = getTrueDoubleRank(cumWtsArr, quantilesArr, q, EXCLUSIVE); - assertEquals(nr, nrTrue); - printf("%12.1f%12.3f\n", (double)q, nr); + printf("%12s %12s\n", "Quantiles", "NormRanks"); + for (int q = 0; q <= (k * 10) + 5; q += 5) { //create a range of quantiles for input + double normRankEst = sk.getRank(q, EXCLUSIVE); + double normRankTrue = getTrueDoubleRank(cumWtsArr, quantilesArr, q, EXCLUSIVE); + assertEquals(normRankEst, normRankTrue); + printf("%12.1f %12.3f", (double)q, normRankEst); + if (normRankEst != normRankTrue) { println(" " + normRankEst + " != " + normRankTrue); } else { println(""); } } println(""); println("GetQuantiles, EXCLUSIVE (round down)"); println(" Q of the smallest rank > r. If r = 1.0 => null or NaN"); - printf("%12s%12s%12s\n", "Ranks", "Quantiles", "CompRank"); - double inc = 1.0 / (2 * N); - for (long j = 0; j <= (2 * N); j++) { - double nr = (j * inc); - double q = sk.getQuantile(nr, EXCLUSIVE); - double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, nr, EXCLUSIVE); - assertEquals(q, qTrue); - double nrN = Math.floor(nr * N); - printf("%12.4f%12.1f%12.1f\n", nr, q, nrN); + printf("%22s %22s %22s %13s\n", "NormRanksIn", "RawNaturalRank", "TrimmedNatRank", "QuantilesEst"); + long limit = 4 * N; + double inc = 1.0 / limit; + for (long j = 0; j <= limit; j++) { + double normRankIn = (j * inc); + double qEst = sk.getQuantile(normRankIn, EXCLUSIVE); + double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, EXCLUSIVE); + assertEquals(qEst, qTrue); + double rawNatRank = normRankIn * N; + double trimNatRank = getNaturalRank(normRankIn, N); + printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); + if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } println(""); println("GetRanks, 
INCLUSIVE:"); println(" R of the largest Q at the highest index that is <= q. If q < smallest Q => 0"); - printf("%12s%12s\n", "Quantiles", "Ranks"); + printf("%12s %12s\n", "Quantiles", "NormRanks"); for (int q = 0; q <= (k * 10) + 5; q += 5) { double nr = sk.getRank(q, INCLUSIVE); double nrTrue = getTrueDoubleRank(cumWtsArr, quantilesArr, q, INCLUSIVE); assertEquals(nr, nrTrue); - printf("%12.1f%12.3f\n", (double)q, nr); + printf("%12.1f %12.3f", (double)q, nr); + if (nr != nrTrue) { println(" " + nr + " != " +nrTrue); } else { println(""); } } println(""); println("GetQuantiles, INCLUSIVE (round up)"); println(" Q of the smallest rank >= r."); - printf("%12s%12s%12s\n", "Ranks", "Quantiles", "CompRank"); - inc = 1.0 / (2 * N); - for (long j = 0; j <= (2 * N); j++) { - double nr = (j * inc); - double q = sk.getQuantile(nr, INCLUSIVE); - double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, nr, INCLUSIVE); - assertEquals(q, qTrue); - double nrN = Math.ceil(nr * N); - printf("%12.4f%12.1f%12.1f\n", nr, q, nrN); + printf("%22s %22s %22s %13s\n", "NormRanksIn", "RawNaturalRank", "TrimmedNatRank", "QuantilesEst"); + + inc = 1.0 / limit; + for (long j = 0; j <= limit; j++) { + double normRankIn = (j * inc); + double qEst = sk.getQuantile(normRankIn, INCLUSIVE); + double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, INCLUSIVE); + assertEquals(qEst, qTrue); + double rawNatRank = normRankIn * N; + double trimNatRank = getNaturalRank(normRankIn, N); + printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); + if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } println(""); } diff --git a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java index 2529bd0d9..b5fd7b2d3 100644 --- a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java +++ 
b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java @@ -29,7 +29,7 @@ import static org.apache.datasketches.quantiles.PreambleUtil.EMPTY_FLAG_MASK; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallyWeightedRanks; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNull; @@ -810,7 +810,7 @@ public void getQuantiles() { @Test public void checkEquallySpacedRanks() { int n = 10; - double[] es = equallyWeightedRanks(n); + double[] es = equallySpacedDoubles(n); int len = es.length; for (int j=0; j Date: Thu, 16 Nov 2023 14:11:35 -0800 Subject: [PATCH 04/13] This commit includes a number of enhancements for the 5.0 release: - This is a large number of changes. - The problem detected by the Druid team is fixed, so now the "getPartitionBoundaries" works for input streams that are larger than Integer.MAX_VALUE. - This fix applies to both the KllItemsSketch and the classic ItemsSketch. These are the only two sketches, for now, that will support the "getPartitionBoundaries" functionality. This is enforced via a new "PartitioningFeature" API interface. - In addition, there is new "partitions" package that solves the problem of limited accuracy of our quantiles sketches when being asked to partition very large input streams. This package can partition very large streams of almost unlimited size with very small variation in the resulting partition sizes. I have tested this with streams as large as 30E12 elements. - I have reduced code duplication in a number of places. 
Specifically, All the quantile sketch sorted view classes use only 3 iterator implementations, which are for float, double and generic. Further consolidation of classes can be done across the sorted view classes themselves, but that will have to be done later. - Javadocs have been improved in a number of places and I have fixed spelling errors when I see them. --- README.md | 2 +- pom.xml | 7 + .../org/apache/datasketches/common/Util.java | 83 +++---- .../datasketches/kll/KllDoublesSketch.java | 16 -- .../kll/KllDoublesSketchIterator.java | 42 +--- .../kll/KllDoublesSketchSortedView.java | 81 ++++--- .../KllDoublesSketchSortedViewIterator.java | 79 ------- .../datasketches/kll/KllFloatsSketch.java | 16 -- .../kll/KllFloatsSketchIterator.java | 42 +--- .../kll/KllFloatsSketchSortedView.java | 64 +++++- .../KllFloatsSketchSortedViewIterator.java | 79 ------- .../datasketches/kll/KllItemsSketch.java | 21 +- .../kll/KllItemsSketchIterator.java | 42 +--- .../kll/KllItemsSketchSortedView.java | 186 ++++++++++----- .../datasketches/kll/KllSketchIterator.java | 82 +++++++ .../datasketches/partitions/BoundsRule.java | 37 +++ .../datasketches/partitions/Partitioner.java | 211 ++++++++++++++++++ .../partitions/SketchFillRequest.java | 46 ++++ .../datasketches/partitions/package-info.java | 23 ++ .../datasketches/quantiles/DoublesSketch.java | 16 -- .../quantiles/DoublesSketchSortedView.java | 76 ++++--- .../DoublesSketchSortedViewIterator.java | 77 ------- .../datasketches/quantiles/ItemsSketch.java | 95 +------- .../quantiles/ItemsSketchSortedView.java | 152 +++++++++---- .../quantilescommon/DoublesSortedView.java | 20 +- .../DoublesSortedViewIterator.java | 18 +- .../quantilescommon/FloatsSortedView.java | 18 ++ .../FloatsSortedViewIterator.java | 18 +- .../GenericPartitionBoundaries.java | 136 +++++++++++ .../quantilescommon/GenericSortedView.java | 23 +- .../GenericSortedViewIterator.java | 54 +---- .../quantilescommon/PartitionBoundaries.java | 67 ++++++ 
.../quantilescommon/PartitioningFeature.java | 83 +++++++ .../quantilescommon/QuantilesAPI.java | 4 +- .../quantilescommon/QuantilesDoublesAPI.java | 50 ----- .../quantilescommon/QuantilesFloatsAPI.java | 50 ----- .../quantilescommon/QuantilesGenericAPI.java | 94 -------- .../quantilescommon/QuantilesUtil.java | 22 +- .../quantilescommon/SortedView.java | 36 +-- .../quantilescommon/SortedViewIterator.java | 55 +++-- .../datasketches/quantilescommon/Stack.java | 68 ++++++ .../datasketches/req/BaseReqSketch.java | 18 -- .../datasketches/req/ReqSketchSortedView.java | 80 ++++--- .../req/ReqSketchSortedViewIterator.java | 80 ------- .../apache/datasketches/common/UtilTest.java | 9 +- ...lDirectCompactItemsSketchIteratorTest.java | 8 +- .../kll/KllDirectDoublesSketchTest.java | 16 -- .../kll/KllDirectFloatsSketchTest.java | 16 -- .../kll/KllDoublesSketchIteratorTest.java | 8 +- .../kll/KllDoublesSketchTest.java | 27 +-- .../kll/KllFloatsSketchIteratorTest.java | 8 +- .../datasketches/kll/KllFloatsSketchTest.java | 27 +-- .../kll/KllItemsSketchSortedViewString.java | 7 +- .../datasketches/kll/KllItemsSketchTest.java | 16 +- .../kll/KllItemsSketchiteratorTest.java | 8 +- .../kll/KllMiscDirectDoublesTest.java | 13 -- .../kll/KllMiscDirectFloatsTest.java | 13 -- .../datasketches/kll/KllMiscItemsTest.java | 4 +- .../partitions/ClassicPartitionsTest.java | 127 +++++++++++ .../ItemsSketchFillRequestLongAsString.java | 121 ++++++++++ ...KllItemsSketchFillRequestLongAsString.java | 121 ++++++++++ .../partitions/KllPartitionsTest.java | 127 +++++++++++ .../quantiles/CustomQuantilesTest.java | 4 +- .../quantiles/DoublesSketchTest.java | 7 +- .../HeapUpdateDoublesSketchTest.java | 28 +-- .../ItemsSketchSortedViewString.java | 6 +- .../quantiles/ItemsSketchTest.java | 12 +- .../quantiles/SkewedDataTest.java | 114 ++++++++++ .../CrossCheckQuantilesTest.java | 49 ++-- .../LongsAsOrderableStrings.java | 64 ++++++ .../quantilescommon/ReflectUtilityTest.java | 14 +- 
.../req/ReqSketchSortedViewTest.java | 21 +- .../datasketches/req/ReqSketchTest.java | 20 +- 73 files changed, 2222 insertions(+), 1362 deletions(-) delete mode 100644 src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java delete mode 100644 src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java create mode 100644 src/main/java/org/apache/datasketches/kll/KllSketchIterator.java create mode 100644 src/main/java/org/apache/datasketches/partitions/BoundsRule.java create mode 100644 src/main/java/org/apache/datasketches/partitions/Partitioner.java create mode 100644 src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java create mode 100644 src/main/java/org/apache/datasketches/partitions/package-info.java delete mode 100644 src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java create mode 100644 src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java create mode 100644 src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java create mode 100644 src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java create mode 100644 src/main/java/org/apache/datasketches/quantilescommon/Stack.java delete mode 100644 src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java create mode 100644 src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java create mode 100644 src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java create mode 100644 src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java create mode 100644 src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java create mode 100644 src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java create mode 100644 src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java diff --git a/README.md b/README.md index 
8da5faac3..3190036d1 100644 --- a/README.md +++ b/README.md @@ -154,5 +154,5 @@ In Eclipse, open the project *Properties / Java Build Path / Module Dependencies #### SpotBugs -* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you will get a lot of false positive or low risk issues that we have examined and exliminated with this exclusion file. +* Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you may get a lot of false positive or low risk issues that we have examined and eliminated with this exclusion file. diff --git a/pom.xml b/pom.xml index 75016d7f7..02765d07f 100644 --- a/pom.xml +++ b/pom.xml @@ -150,6 +150,13 @@ under the License. ${testng.version} test + diff --git a/src/main/java/org/apache/datasketches/common/Util.java b/src/main/java/org/apache/datasketches/common/Util.java index 602b40b0b..c9a749e55 100644 --- a/src/main/java/org/apache/datasketches/common/Util.java +++ b/src/main/java/org/apache/datasketches/common/Util.java @@ -24,6 +24,7 @@ import static java.lang.Math.log; import static java.lang.Math.pow; import static java.lang.Math.round; +import static java.util.Arrays.fill; import java.util.Comparator; @@ -217,7 +218,7 @@ public static String nanoSecToString(final long nS) { /** * Returns the given time in milliseconds formatted as Hours:Min:Sec.mSec - * @param mS the given nanoseconds + * @param mS the given milliseconds * @return the given time in milliseconds formatted as Hours:Min:Sec.mSec */ public static String milliSecToString(final long mS) { @@ -244,40 +245,20 @@ public static String zeroPad(final String s, final int fieldLength) { /** * Prepend or postpend the given string with the given character to fill the given field length. - * If the given string is equal or greater than the given field length, it will be returned - * without modification. 
+ * If the given string is equal to or greater than the given field length, it will be returned without modification. * @param s the given string * @param fieldLength the desired field length * @param padChar the desired pad character * @param postpend if true append the pacCharacters to the end of the string. - * @return prepended or postpended given string with the given character to fill the given field - * length. + * @return prepended or postpended given string with the given character to fill the given field length. */ - public static String characterPad(final String s, final int fieldLength, final char padChar, - final boolean postpend) { - final char[] chArr = s.toCharArray(); - final int sLen = chArr.length; + public static String characterPad(final String s, final int fieldLength, final char padChar, final boolean postpend) { + final int sLen = s.length(); if (sLen < fieldLength) { - final char[] out = new char[fieldLength]; - final int blanks = fieldLength - sLen; - - if (postpend) { - for (int i = 0; i < sLen; i++) { - out[i] = chArr[i]; - } - for (int i = sLen; i < fieldLength; i++) { - out[i] = padChar; - } - } else { //prepend - for (int i = 0; i < blanks; i++) { - out[i] = padChar; - } - for (int i = blanks; i < fieldLength; i++) { - out[i] = chArr[i - blanks]; - } - } - - return String.valueOf(out); + final char[] cArr = new char[fieldLength - sLen]; + fill(cArr, padChar); + final String addstr = String.valueOf(cArr); + return (postpend) ? s.concat(addstr) : addstr.concat(s); } return s; } @@ -550,56 +531,60 @@ public static double powerSeriesNextDouble(final int ppb, final double curPoint, } /** - * Computes the ceiling power of given base and n as doubles. - * This is the smallest positive power - * of base that equal to or greater than the given n and equal to a mathematical integer. + * Returns the ceiling of a given n given a radix, where the ceiling is an integral power of the radix. 
+ * This is the smallest positive power of radix that is equal to or greater than the given n + * and equal to a mathematical integer. * The result of this function is consistent with {@link #ceilingIntPowerOf2(int)} for values * less than one. I.e., if n < 1, the result is 1. * - * @param base The base in the expression ⌈basen⌉. + *

    The formula is: radix<sup>ceiling(log<sub>radix</sub>(x))</sup>

    + * + * @param radix The base of the number system. * @param n The input argument. - * @return the ceiling power of base as a double and equal to a mathematical integer. + * @return the ceiling power of radix as a double and equal to a mathematical integer. */ - public static double ceilingPowerBaseOfDouble(final double base, final double n) { + public static double ceilingPowerBaseOfDouble(final double radix, final double n) { final double x = n < 1.0 ? 1.0 : n; - return pow(base, ceil(logBaseOfX(base, x))); + return Math.round(pow(radix, ceil(logBaseOfX(radix, x)))); } /** - * Computes the floor power of given base and n as doubles. - * This is the largest positive power - * of base that equal to or less than the given n and equal to a mathematical integer. + * Computes the floor of a given n given radix, where the floor is an integral power of the radix. + * This is the largest positive power of radix that is equal to or less than the given n + * and equal to a mathematical integer. * The result of this function is consistent with {@link #floorPowerOf2(int)} for values * less than one. I.e., if n < 1, the result is 1. * - * @param base The base in the expression ⌊basen⌋. + *

    The formula is: radix<sup>floor(log<sub>radix</sub>(x))</sup>

    + * + * @param radix The base of the number system. * @param n The input argument. * @return the floor power of 2 and equal to a mathematical integer. */ - public static double floorPowerBaseOfDouble(final double base, final double n) { + public static double floorPowerBaseOfDouble(final double radix, final double n) { final double x = n < 1.0 ? 1.0 : n; - return pow(base, floor(logBaseOfX(base, x))); + return Math.round(pow(radix, floor(logBaseOfX(radix, x)))); } // Logarithm related /** - * The log base 2 of the value + * The log2(value) * @param value the given value - * @return The log base 2 of the value + * @return log2(value) */ public static double log2(final double value) { return log(value) / LOG2; } /** - * Returns the logarithm_logBase of x. Example: logB(2.0, x) = log(x) / log(2.0). - * @param logBase the base of the logarithm used + * Returns the logradix(x). Example: logB(2.0, x) = log(x) / log(2.0). + * @param radix the base of the number system * @param x the given value - * @return the logarithm_logBase of x: Example: logB(2.0, x) = log(x) / log(2.0). + * @return the logradix(x): Example: logB(2.0, x) = log(x) / log(2.0). 
*/ - public static double logBaseOfX(final double logBase, final double x) { - return log(x) / log(logBase); + public static double logBaseOfX(final double radix, final double x) { + return log(x) / log(radix); } /** diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 213544021..7c175512a 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -24,7 +24,6 @@ import static org.apache.datasketches.common.ByteArrayUtil.putDoubleLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -175,21 +174,6 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria return kllDoublesSV.getCDF(splitPoints, searchCrit); } - @Override - public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final double[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final DoublesPartitionBoundaries dpb = new DoublesPartitionBoundaries(); - dpb.N = this.getN(); - dpb.ranks = ranks; - dpb.boundaries = boundaries; - return dpb; - } - @Override public double[] getPMF(final double[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java index 
473d5f1bb..bc18c5347 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllDoublesSketch. The order is not defined. */ -public final class KllDoublesSketchIterator implements QuantilesDoublesSketchIterator { +public final class KllDoublesSketchIterator extends KllSketchIterator implements QuantilesDoublesSketchIterator { private final double[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized; KllDoublesSketchIterator(final double[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized = false; } @Override @@ -45,34 +37,4 @@ public double getQuantile() { return quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index 8f8ae5d63..e8bed53eb 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -27,6 +27,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import 
org.apache.datasketches.quantilescommon.DoublesSortedView; +import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -40,6 +41,9 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final double maxItem; + private final double minItem; /** * Construct from elements for testing. @@ -47,31 +51,44 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. */ - KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN) { + KllDoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN, + final double maxItem, final double minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given KllDoublesSketch. + * @param sketch the given KllDoublesSketch. 
*/ - public KllDoublesSketchSortedView(final KllDoublesSketch sk) { - this.totalN = sk.getN(); - final double[] srcQuantiles = sk.getDoubleItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); + public KllDoublesSketchSortedView(final KllDoublesSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + final double[] srcQuantiles = sketch.getDoubleItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); - if (!sk.isLevelZeroSorted()) { + if (!sketch.isLevelZeroSorted()) { Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1]); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage quantiles = new double[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + final double[] normRanks = new double[numQuantiles]; + for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } @Override @@ -79,34 +96,36 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public double getMaxItem() { + return maxItem; + } + + @Override + public double getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks; + } + @Override public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double 
naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } @@ -134,8 +153,8 @@ public boolean isEmpty() { } @Override - public KllDoublesSketchSortedViewIterator iterator() { - return new KllDoublesSketchSortedViewIterator(quantiles, cumWeights); + public DoublesSortedViewIterator iterator() { + return new DoublesSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java deleted file mode 100644 index 29131bd2c..000000000 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedViewIterator.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.kll; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over KllDoublesSketchSortedView - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class KllDoublesSketchSortedViewIterator implements DoublesSortedViewIterator { - - private final double[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - KllDoublesSketchSortedViewIterator(final double[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public double getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index e2e4d808a..5484e8bf1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -24,7 +24,6 @@ import static org.apache.datasketches.common.ByteArrayUtil.putFloatLE; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Objects; @@ -175,21 +174,6 @@ public double[] getCDF(final float[] splitPoints, final QuantileSearchCriteria s return kllFloatsSV.getCDF(splitPoints, searchCrit); } - @Override - public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final float[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final FloatsPartitionBoundaries fpb = new FloatsPartitionBoundaries(); - fpb.N = this.getN(); - fpb.ranks = ranks; - fpb.boundaries = boundaries; - return fpb; - } - @Override 
public double[] getPMF(final float[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java index 8c5808ead..accf039de 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllFloatsSketch. The order is not defined. */ -public final class KllFloatsSketchIterator implements QuantilesFloatsSketchIterator { +public final class KllFloatsSketchIterator extends KllSketchIterator implements QuantilesFloatsSketchIterator { private final float[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized; KllFloatsSketchIterator(final float[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized = false; } @Override @@ -45,34 +37,4 @@ public float getQuantile() { return quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java 
b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index 8f47a8da7..08678503c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -27,6 +27,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -40,6 +41,9 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { private final float[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final float maxItem; + private final float minItem; /** * Construct from elements for testing. @@ -47,44 +51,80 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. */ - KllFloatsSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN) { + KllFloatsSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN, + final float maxItem, final float minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given KllFloatsSketch. 
+ * @param sketch the given KllFloatsSketch. */ - public KllFloatsSketchSortedView(final KllFloatsSketch sk) { - this.totalN = sk.getN(); - final float[] srcQuantiles = sk.getFloatItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); + public KllFloatsSketchSortedView(final KllFloatsSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + final float[] srcQuantiles = sketch.getFloatItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); - if (!sk.isLevelZeroSorted()) { + if (!sketch.isLevelZeroSorted()) { Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1]); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage quantiles = new float[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } + //end of constructors + @Override public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public float getMaxItem() { + return maxItem; + } + + @Override + public float getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks; + } + @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double 
naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { @@ -116,8 +156,8 @@ public boolean isEmpty() { } @Override - public KllFloatsSketchSortedViewIterator iterator() { - return new KllFloatsSketchSortedViewIterator(quantiles, cumWeights); + public FloatsSortedViewIterator iterator() { + return new FloatsSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java deleted file mode 100644 index 87c2e88bd..000000000 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedViewIterator.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.kll; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over KllFloatsSketchSortedView - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class KllFloatsSketchSortedViewIterator implements FloatsSortedViewIterator { - - private final float[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - KllFloatsSketchSortedViewIterator(final float[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public float getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index d5f73b00d..f0e923fbd 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -23,7 +23,6 @@ import static java.lang.Math.min; import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import static org.apache.datasketches.kll.KllSketch.SketchType.ITEMS_SKETCH; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.lang.reflect.Array; import java.util.Comparator; @@ -34,7 +33,10 @@ import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.MemoryRequestServer; import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; import org.apache.datasketches.quantilescommon.QuantilesGenericSketchIterator; @@ -46,7 +48,7 @@ * @see org.apache.datasketches.kll.KllSketch */ @SuppressWarnings("unchecked") -public abstract class KllItemsSketch extends KllSketch implements 
QuantilesGenericAPI { +public abstract class KllItemsSketch extends KllSketch implements QuantilesGenericAPI, PartitioningFeature { private KllItemsSketchSortedView kllItemsSV = null; final Comparator comparator; final ArrayOfItemsSerDe serDe; @@ -150,18 +152,11 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc } @Override - public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, + public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final Object[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>(); - gpb.N = this.getN(); - gpb.ranks = ranks; - gpb.boundaries = (T[])boundaries; - return gpb; + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + refreshSortedView(); + return kllItemsSV.getPartitionBoundaries(numEquallySized, searchCrit); } @Override diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java index 4adb9d79b..3a0a8da0f 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchIterator.java @@ -24,20 +24,12 @@ /** * Iterator over KllItemsSketch. The order is not defined. 
*/ -public final class KllItemsSketchIterator implements QuantilesGenericSketchIterator { +public final class KllItemsSketchIterator extends KllSketchIterator implements QuantilesGenericSketchIterator { private final Object[] quantiles; - private final int[] levelsArr; - private final int numLevels; - private int level; - private int index; - private long weight; - private boolean isInitialized_; KllItemsSketchIterator(final Object[] quantiles, final int[] levelsArr, final int numLevels) { + super(levelsArr, numLevels); this.quantiles = quantiles; - this.levelsArr = levelsArr; - this.numLevels = numLevels; - this.isInitialized_ = false; } @SuppressWarnings("unchecked") @@ -46,34 +38,4 @@ public T getQuantile() { return (T)quantiles[index]; } - @Override - public long getWeight() { - return weight; - } - - @Override - public boolean next() { - if (!isInitialized_) { - level = 0; - index = levelsArr[level]; - weight = 1; - isInitialized_ = true; - } else { - index++; - } - if (index < levelsArr[level + 1]) { - return true; - } - // go to the next non-empty level - do { - level++; - if (level == numLevels) { - return false; // run out of levels - } - weight *= 2; - } while (levelsArr[level] == levelsArr[level + 1]); - index = levelsArr[level]; - return true; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index 7c066dff1..4b901f54a 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -22,6 +22,7 @@ import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.evenlySpacedDoubles; import 
static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; @@ -31,10 +32,13 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -43,13 +47,15 @@ * @author Alexander Saydakov * @author Lee Rhodes */ -@SuppressWarnings("unchecked") -public class KllItemsSketchSortedView implements GenericSortedView { - private final Object[] quantiles; +public class KllItemsSketchSortedView implements GenericSortedView, PartitioningFeature { + private final T[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final Comparator comparator; + private final T maxItem; private final T minItem; - private final Comparator comp; + private final Class clazz; + private final double[] normRanks; /** * Construct from elements for testing only. 
@@ -59,49 +65,59 @@ public class KllItemsSketchSortedView implements GenericSortedView { * @param minItem used to extract the type of T * @param comparator the Comparator for type T */ + @SuppressWarnings("unchecked") KllItemsSketchSortedView( final T[] quantiles, final long[] cumWeights, final long totalN, - final T minItem, - final Comparator comparator) { + final Comparator comparator, + final T maxItem, + final T minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.comparator = comparator; + this.maxItem = maxItem; this.minItem = minItem; - this.comp = comparator; + this.clazz = (Class)quantiles[0].getClass(); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** * Constructs this Sorted View given the sketch - * @param sk the given KllItemsSketch. + * @param sketch the given KllItemsSketch. */ - KllItemsSketchSortedView(final KllItemsSketch sk) { - this.totalN = sk.getN(); - this.minItem = sk.getMinItem(); - final Object[] srcQuantiles = sk.getTotalItemsArray(); - final int[] srcLevels = sk.levelsArr; - final int srcNumLevels = sk.getNumLevels(); - this.comp = sk.comparator; + @SuppressWarnings("unchecked") + KllItemsSketchSortedView(final KllItemsSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + final T[] srcQuantiles = sketch.getTotalItemsArray(); + final int[] srcLevels = sketch.levelsArr; + final int srcNumLevels = sketch.getNumLevels(); + this.comparator = sketch.comparator; + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + this.clazz = (Class)sketch.serDe.getClassOfT(); if (totalN == 0) { throw new SketchesArgumentException(EMPTY_MSG); } - if (!sk.isLevelZeroSorted()) { - Arrays.sort((T[])srcQuantiles, srcLevels[0], srcLevels[1], comp); - if (!sk.hasMemory()) { sk.setLevelZeroSorted(true); } + if (!sketch.isLevelZeroSorted()) { + Arrays.sort(srcQuantiles, srcLevels[0], srcLevels[1], 
comparator); + if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage - quantiles = new Object[numQuantiles]; + quantiles = (T[]) Array.newInstance(sketch.serDe.getClassOfT(), numQuantiles); cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } //end of constructors - @Override //implemented here because it needs the comparator + @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - GenericSortedView.validateItems(splitPoints, comp); + GenericSortedView.validateItems(splitPoints, comparator); final int len = splitPoints.length + 1; final double[] buckets = new double[len]; for (int i = 0; i < len - 1; i++) { @@ -116,10 +132,66 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } - @Override //implemented here because it needs the comparator + @Override + public T getMaxItem() { + return maxItem; + } + + @Override + public T getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + + @Override + @SuppressWarnings("unchecked") + public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final long totalN = this.totalN; + final int svLen = cumWeights.length; + //adjust ends of sortedView arrays + cumWeights[0] = 1L; + cumWeights[svLen - 1] = totalN; + normRanks[0] = 1.0 / totalN; + normRanks[svLen - 1] = 1.0; + quantiles[0] = this.getMinItem(); + quantiles[svLen - 1] = this.getMaxItem(); + + final double[] evSpNormRanks = evenlySpacedDoubles(0, 1.0, 
numEquallySized + 1); + final int len = evSpNormRanks.length; + final T[] evSpQuantiles = (T[]) Array.newInstance(clazz, len); + + final long[] evSpNatRanks = new long[len]; + for (int i = 0; i < len; i++) { + final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); + evSpQuantiles[i] = getQuantileFromIndex(index); + evSpNatRanks[i] = getCumWeightFromIndex(index); + } + final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( + this.totalN, + evSpQuantiles.clone(), + evSpNatRanks.clone(), + evSpNormRanks.clone(), + getMaxItem(), + getMinItem(), + searchCrit); + return gpb; + } + + @Override public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } - GenericSortedView.validateItems(splitPoints, comp); + GenericSortedView.validateItems(splitPoints, comparator); final double[] buckets = getCDF(splitPoints, searchCrit); final int len = buckets.length; for (int i = len; i-- > 1; ) { @@ -132,35 +204,36 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); + final int index = getQuantileIndex(rank, searchCrit); + return getQuantileFromIndex(index); + } + + private T getQuantileFromIndex(final int index) { return quantiles[index]; } + + private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } + + private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); - if (index == -1) { - return (T) quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return (T) quantiles[index]; + if (index == -1) { return len - 1; } + return index; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return (T) quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + @SuppressWarnings("unchecked") + public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final int len = ranks.length; + final T[] quants = (T[]) Array.newInstance(clazz, len); + for (int i = 0; i < len; i++) { + quants[i] = getQuantile(ranks[i], searchCrit); } - return (T) quantiles[index]; + return quants; } @Override + @SuppressWarnings("unchecked") public T[] getQuantiles() { final T[] quants = (T[]) Array.newInstance(minItem.getClass(), quantiles.length); System.arraycopy(quantiles, 0, quants, 0, quantiles.length); @@ -172,7 +245,7 @@ public double getRank(final T quantile, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new 
SketchesArgumentException(EMPTY_MSG); } final int len = quantiles.length; final Inequality crit = (searchCrit == INCLUSIVE) ? Inequality.LE : Inequality.LT; - final int index = find((T[])quantiles, 0, len - 1, quantile, crit, comp); + final int index = find(quantiles, 0, len - 1, quantile, crit, comparator); if (index == -1) { return 0; //EXCLUSIVE (LT) case: quantile <= minQuantile; INCLUSIVE (LE) case: quantile < minQuantile } @@ -185,12 +258,19 @@ public boolean isEmpty() { } @Override - public KllItemsSketchSortedViewIterator iterator() { - return new KllItemsSketchSortedViewIterator<>((T[])quantiles, cumWeights); + public GenericSortedViewIterator iterator() { + return new GenericSortedViewIterator<>(quantiles, cumWeights); } //restricted methods + private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + return normRanks; + } + private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels, final int srcNumLevels, final int numItems) { final int[] myLevels = new int[srcNumLevels + 1]; @@ -212,7 +292,7 @@ private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLeve weight *= 2; } final int numLevels = dstLevel; - blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, comp); //create unit weights + blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, comparator); //create unit weights KllHelper.convertToCumulative(cumWeights); } @@ -255,6 +335,7 @@ private static void blockyTandemMergeSortRecursion( startingLevel2, numLevels2, comp); } + @SuppressWarnings("unchecked") private static void tandemMerge( final Object[] quantilesSrc, final long[] weightsSrc, final Object[] quantilesDst, final long[] weightsDst, @@ -290,15 +371,4 @@ private static void tandemMerge( } } - /** - * Iterator over 
KllItemsSketchSortedView. - * @param type of quantile (item) - */ - public static final class KllItemsSketchSortedViewIterator extends GenericSortedViewIterator { - - KllItemsSketchSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - super(quantiles, cumWeights); - } - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java new file mode 100644 index 000000000..feaf33f53 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllSketchIterator.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import org.apache.datasketches.quantilescommon.QuantilesSketchIterator; + +/** + * The base implementation for the KLL sketch iterator hierarchy used for viewing the + * non-ordered quantiles retained by a sketch. + * + *

    Prototype example of the recommended iteration loop:

    + *
    {@code
    + *   SketchIterator itr = sketch.iterator();
    + *   while (itr.next()) {
    + *     ...get*();
    + *   }
    + * }
    + * + * @author Lee Rhodes + */ +public class KllSketchIterator implements QuantilesSketchIterator { + protected final int[] levelsArr; + protected final int numLevels; + protected int level; + protected int index; + protected long weight; + protected boolean isInitialized_; + + KllSketchIterator(final int[] levelsArr, final int numLevels) { + this.levelsArr = levelsArr; + this.numLevels = numLevels; + this.isInitialized_ = false; + } + + @Override + public long getWeight() { + return weight; + } + + @Override + public boolean next() { + if (!isInitialized_) { + level = 0; + index = levelsArr[level]; + weight = 1; + isInitialized_ = true; + } else { + index++; + } + if (index < levelsArr[level + 1]) { + return true; + } + // go to the next non-empty level + do { + level++; + if (level == numLevels) { + return false; // run out of levels + } + weight *= 2; + } while (levelsArr[level] == levelsArr[level + 1]); + index = levelsArr[level]; + return true; + } + +} diff --git a/src/main/java/org/apache/datasketches/partitions/BoundsRule.java b/src/main/java/org/apache/datasketches/partitions/BoundsRule.java new file mode 100644 index 000000000..68dc87bc1 --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/BoundsRule.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +public enum BoundsRule { + + /** + * Include both the upper and lower bounds + */ + INCLUDE_BOTH, + + /** + * Include only the upper bound but not the lower bound + */ + INCLUDE_UPPER, + /** + * Include only the lower bound but not the upper bound + */ + INCLUDE_LOWER +} diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java new file mode 100644 index 000000000..65577385a --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.partitions; + +import static java.lang.Math.ceil; +import static java.lang.Math.log; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.pow; +import static java.lang.Math.round; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; +import org.apache.datasketches.quantilescommon.Stack; + +/** + * A partitioning process that can partition very large data sets into thousands to millions + * of partitions of approximately the same size. + * @param T the data type + * @param S the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. + */ +//@SuppressWarnings("unused") +public class Partitioner & PartitioningFeature> { + private static final QuantileSearchCriteria defaultCriteria = INCLUSIVE; + private final long tgtPartitionSize; + private final int maxPartsPerSk; + private final SketchFillRequest fillReq; + private final QuantileSearchCriteria criteria; + private final Stack> stack = new Stack<>(); + + //computed once at the beginning + private int numLevels; + private int partitionsPerSk; + //output + private final List> finalPartitionList = new ArrayList<>(); + + /** + * This constructor assumes a QuantileSearchCriteria of INCLUSIVE. + * @param tgtPartitionSize the target size of the resulting partitions in number of items. + * @param maxPartsPerPass The maximum number of partitions to request from the sketch. 
The smaller this number is + * the smaller the variance will be of the resulting partitions, but this will increase the number of passes of the + * source data set. + * @param fillReq This is an implementation of the SketchFillRequest call-back supplied by the user and implements + * the SketchFillRequest interface. + */ + public Partitioner( + final long tgtPartitionSize, + final int maxPartsPerPass, + final SketchFillRequest fillReq) { + this(tgtPartitionSize, maxPartsPerPass, fillReq, defaultCriteria); + } + + /** + * This constructor includes the QuantileSearchCriteria criteria as a parameter. + * @param tgtPartitionSize the target size of the resulting partitions in number of items. + * @param maxPartsPerSk The maximum number of partitions to request from the sketch. The smaller this number is + * the smaller the variance will be of the resulting partitions, but this will increase the number of passes of the + * source data set. + * @param fillReq This is an implementation of the SketchFillRequest call-back supplied by the user. + * @param criteria This is the desired QuantileSearchCriteria to be used. + */ + public Partitioner( + final long tgtPartitionSize, + final int maxPartsPerSk, + final SketchFillRequest fillReq, + final QuantileSearchCriteria criteria) { + this.tgtPartitionSize = tgtPartitionSize; + this.maxPartsPerSk = maxPartsPerSk; + this.fillReq = fillReq; + this.criteria = criteria; + } + + /** + * This initiates the partitioning process. + * @param sk A sketch of the entire data set.
+ * @return the final partitioning list + */ + public List> partition(final S sk) { + if (sk.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + final long inputN = sk.getN(); + final double guessNumParts = max(1.0, ceil((double)inputN / tgtPartitionSize)); + this.numLevels = (int)max(1, ceil(log(guessNumParts) / log(maxPartsPerSk))); + final int partsPerSk = (int)round(pow(guessNumParts, 1.0 / numLevels)); + this.partitionsPerSk = min(partsPerSk, maxPartsPerSk); + final GenericPartitionBoundaries gpb = sk.getPartitionBoundaries(partitionsPerSk, criteria); + final StackElement se = new StackElement<>(gpb, stack.size() + 1, 0, "1"); + stack.push(se); + partitionSearch(stack); + return finalPartitionList; + } + + private void partitionSearch(final Stack> stack) { + if (stack.isEmpty()) { + return; + } + final StackElement se = stack.peek(); + final GenericPartitionBoundaries gpb = se.gpb; + final int numParts = gpb.getNumPartitions(); + + if (stack.size() == numLevels) { //at max level + while (++se.part <= numParts) { //add rows to final partition list + final PartitionBoundsRow row = new PartitionBoundsRow<>(se); + finalPartitionList.add(row); + } + stack.pop(); + partitionSearch(stack); + } + else { //not at max level + if (++se.part <= numParts) { + final PartitionBoundsRow row = new PartitionBoundsRow<>(se); + final S sk = fillReq.getRange(row.lowerBound, row.upperBound, row.rule); + final GenericPartitionBoundaries gpb2 = sk.getPartitionBoundaries(this.partitionsPerSk, criteria); + final int level = stack.size() + 1; + final String partId = se.partId + "." 
+ se.part + "," + level; + final StackElement se2 = new StackElement<>(gpb2, level, 0, partId); + stack.push(se2); + partitionSearch(stack); + } + //done with all parts at this level + if (stack.isEmpty()) { + return; + } + stack.pop(); + partitionSearch(stack); + } + } + + /** + * Holds data for a Stack element + */ + public static class StackElement { + public final GenericPartitionBoundaries gpb; + public int part; + public String partId; + + public StackElement(final GenericPartitionBoundaries gpb, final int level, final int part, final String partId) { + this.gpb = gpb; + this.part = part; + this.partId = partId; + } + } + + /** + * Defines a row for List of PartitionBounds. + */ + public static class PartitionBoundsRow { + public int part; + public String partId; + public long approxNumDeltaItems; + public BoundsRule rule; + public T lowerBound; + public T upperBound; + + public PartitionBoundsRow(final StackElement se) { + final GenericPartitionBoundaries gpb = se.gpb; + this.part = se.part; + this.partId = se.partId + "." 
+ part; + final QuantileSearchCriteria searchCrit = gpb.getSearchCriteria(); + final T[] boundaries = gpb.getBoundaries(); + final int numParts = gpb.getNumPartitions(); + if (searchCrit == INCLUSIVE) { + if (part == 1) { + lowerBound = gpb.getMinItem(); + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_BOTH; + } else { + lowerBound = boundaries[part - 1]; + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_UPPER; + } + } else { //EXCLUSIVE + if (part == numParts) { + lowerBound = boundaries[part - 1]; + upperBound = gpb.getMaxItem(); + rule = BoundsRule.INCLUDE_BOTH; + } else { + lowerBound = boundaries[part - 1]; + upperBound = boundaries[part]; + rule = BoundsRule.INCLUDE_LOWER; + } + } + approxNumDeltaItems = gpb.getNumDeltaItems()[part]; + } + } + +} diff --git a/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java b/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java new file mode 100644 index 000000000..d005561d0 --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/SketchFillRequest.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.partitions; + +import org.apache.datasketches.quantilescommon.PartitioningFeature; +import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; + +/** + * This is a callback request to the data source to fill a quantiles sketch, + * which is returned to the caller. + * + * @author Lee Rhodes + */ +public interface SketchFillRequest & PartitioningFeature> { + + /** + * This is a callback request to the data source to fill a quantiles sketch + * with a range of data between upper and lower bounds. Which of these bounds are to be included is determined by + * the BoundsRule. + * + *

    This range of data may or may not be subsequently further partitioned.

    + * @param lowerQuantile the lowest quantile of a range + * @param upperQuantile the highest quantile of a range + * @param boundsRule determines which quantile bounds to include + * @return a quantiles sketch filled from the given upper and lower bounds. + */ + public S getRange(final T lowerQuantile, final T upperQuantile, final BoundsRule boundsRule); + +} diff --git a/src/main/java/org/apache/datasketches/partitions/package-info.java b/src/main/java/org/apache/datasketches/partitions/package-info.java new file mode 100644 index 000000000..cee11ec1d --- /dev/null +++ b/src/main/java/org/apache/datasketches/partitions/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/** + * + */ +package org.apache.datasketches.partitions; diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java index b3a78d5af..bbcdf44f7 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.quantiles.ClassicUtil.checkK; import static org.apache.datasketches.quantiles.ClassicUtil.computeNumLevelsNeeded; import static org.apache.datasketches.quantiles.ClassicUtil.computeRetainedItems; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; import java.util.Random; @@ -170,21 +169,6 @@ public double[] getCDF(final double[] splitPoints, final QuantileSearchCriteria @Override public abstract double getMinItem(); - @Override - public DoublesPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = equallySpacedDoubles(numEquallyWeighted); - final double[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final DoublesPartitionBoundaries dpb = new DoublesPartitionBoundaries(); - dpb.N = this.getN(); - dpb.ranks = ranks; - dpb.boundaries = boundaries; - return dpb; - } - @Override public double[] getPMF(final double[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java index ef250fe5f..b746bae15 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java +++ 
b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java @@ -27,8 +27,10 @@ import java.util.Arrays; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; import org.apache.datasketches.quantilescommon.DoublesSortedView; +import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -42,6 +44,9 @@ public final class DoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final double maxItem; + private final double minItem; /** * Construct from elements for testing. @@ -49,10 +54,17 @@ public final class DoublesSketchSortedView implements DoublesSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. 
*/ - DoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN) { + DoublesSketchSortedView(final double[] quantiles, final long[] cumWeights, final long totalN, + final double maxItem, final double minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** @@ -60,7 +72,10 @@ public final class DoublesSketchSortedView implements DoublesSortedView { * @param sketch the given Classic Quantiles DoublesSketch */ public DoublesSketchSortedView(final DoublesSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); final int k = sketch.getK(); final int numQuantiles = sketch.getNumRetained(); quantiles = new double[numQuantiles]; @@ -77,6 +92,34 @@ public DoublesSketchSortedView(final DoublesSketch sketch) { if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. 
TotalN does not match cumWeights."); } + final double[] normRanks = new double[numQuantiles]; + for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; + } + + @Override + public long[] getCumulativeWeights() { + return cumWeights.clone(); + } + + @Override + public double getMaxItem() { + return maxItem; + } + + @Override + public double getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); } @Override @@ -84,29 +127,11 @@ public double getQuantile(final double rank, final QuantileSearchCriteria search if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - double getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } @@ -123,11 +148,6 @@ public double getRank(final double quantile, final QuantileSearchCriteria search return (double)cumWeights[index] / totalN; } - @Override - public long[] getCumulativeWeights() { - return cumWeights.clone(); - } - @Override public double[] getQuantiles() { return quantiles.clone(); @@ -139,8 +159,8 @@ public boolean isEmpty() { } @Override - public DoublesSketchSortedViewIterator iterator() { - return new DoublesSketchSortedViewIterator(quantiles, cumWeights); + public DoublesSortedViewIterator iterator() { + return new DoublesSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java deleted file mode 100644 index f834fb2aa..000000000 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedViewIterator.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.quantiles; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over DoublesSketchSortedView. - */ -public final class DoublesSketchSortedViewIterator implements DoublesSortedViewIterator { - - private final double[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - DoublesSketchSortedViewIterator(final double[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public double getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java index 64f66fde2..6b247347a 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java @@ -36,10 +36,7 @@ import static org.apache.datasketches.quantiles.PreambleUtil.extractN; import static org.apache.datasketches.quantiles.PreambleUtil.extractPreLongs; import static org.apache.datasketches.quantiles.PreambleUtil.extractSerVer; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedLongs; -import java.lang.reflect.Array; import java.util.Arrays; import java.util.Comparator; import java.util.Objects; @@ -49,7 +46,8 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; -import org.apache.datasketches.quantilescommon.GenericSortedView; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesAPI; import 
org.apache.datasketches.quantilescommon.QuantilesGenericAPI; @@ -74,25 +72,13 @@ * * @param The sketch data type */ -public final class ItemsSketch implements QuantilesGenericAPI { - +public final class ItemsSketch implements QuantilesGenericAPI, PartitioningFeature { final Class clazz; - private final Comparator comparator_; - final int k_; - long n_; - - /** - * The largest item ever seen in the stream. - */ - T maxItem_; - - /** - * The smallest item ever seen in the stream. - */ - T minItem_; + T maxItem_; //The largest item ever seen in the stream. + T minItem_; //The smallest item ever seen in the stream. /** * In the initial on-heap version, equals combinedBuffer_.length. @@ -132,7 +118,7 @@ public final class ItemsSketch implements QuantilesGenericAPI { /** * Setting the seed makes the results of the sketch deterministic if the input items are * received in exactly the same order. This is only useful when performing test comparisons, - * otherwise is not recommended. + * otherwise, it is not recommended. 
*/ public static final Random rand = new Random(); @@ -220,7 +206,6 @@ public static ItemsSketch getInstance( final boolean empty = checkPreLongsFlagsCap(preambleLongs, flags, memCapBytes); checkFamilyID(familyID); - final ItemsSketch sk = getInstance(clazz, k, comparator); //checks k if (empty) { return sk; } @@ -265,10 +250,7 @@ static ItemsSketch copy(final ItemsSketch sketch) { return qsCopy; } - @Override - public double[] getCDF(final T[] splitPoints) { - return getCDF(splitPoints, INCLUSIVE); - } + //END of Constructors @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { @@ -295,25 +277,11 @@ public T getMinItem() { } @Override - public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, + public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } refreshSortedView(); - final long[] weights = equallySpacedLongs(1, getN(), numEquallyWeighted); - final T[] boundaries = getQuantiles(weights, searchCrit); - final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>(); - gpb.N = this.getN(); - gpb.boundaries = boundaries; - gpb.weights = weights; - final double[] ranks = new double[weights.length]; - for (int i = 0; i < weights.length; i++) { ranks[i] = (double)weights[i] / getN(); } - gpb.ranks = ranks; - return gpb; - } - - @Override - public double[] getPMF(final T[] splitPoints) { - return getPMF(splitPoints, INCLUSIVE); + return classicQisSV.getPartitionBoundaries(numEquallySized, searchCrit); } @Override @@ -323,11 +291,6 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc return classicQisSV.getPMF(splitPoints, searchCrit); } - @Override - public T getQuantile(final double rank) { - return getQuantile(rank, INCLUSIVE); - } - @Override public T getQuantile(final double rank, final 
QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } @@ -348,36 +311,10 @@ public T getQuantileUpperBound(final double rank) { } @Override - public T[] getQuantiles(final double[] ranks) { - return getQuantiles(ranks, INCLUSIVE); - } - - @Override - @SuppressWarnings("unchecked") public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } refreshSortedView(); - final int len = ranks.length; - final T[] quantiles = (T[]) Array.newInstance(minItem_.getClass(), len); - for (int i = 0; i < len; i++) { - quantiles[i] = classicQisSV.getQuantile(ranks[i], searchCrit); - } - return quantiles; - } - - @SuppressWarnings("unchecked") - private T[] getQuantiles(final long[] weights, final QuantileSearchCriteria crit) { - final int len = weights.length; - final T[] quantiles = (T[]) Array.newInstance(minItem_.getClass(), len); - for (int i = 0; i < len; i++) { - quantiles[i] = classicQisSV.getQuantile(weights[i], crit); - } - return quantiles; - } - - @Override - public double getRank(final T quantile) { - return getRank(quantile, INCLUSIVE); + return classicQisSV.getQuantiles(ranks, searchCrit); } @Override @@ -397,11 +334,6 @@ public double getRankUpperBound(final double rank) { return min(1.0, rank + getNormalizedRankError(k_, false)); } - @Override - public double[] getRanks(final T[] quantiles) { - return getRanks(quantiles, INCLUSIVE); - } - @Override public double[] getRanks(final T[] quantiles, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } @@ -522,11 +454,6 @@ public byte[] toByteArray(final boolean ordered, final ArrayOfItemsSerDe serD return ItemsByteArrayImpl.toByteArray(this, ordered, serDe); } - @Override - public String toString() { - return toString(true, false); - } - /** * Returns summary information about this sketch. 
Used for debugging. * @param sketchSummary if true includes sketch summary @@ -592,7 +519,7 @@ public void putMemory(final WritableMemory dstMem, final ArrayOfItemsSerDe se } @Override - public GenericSortedView getSortedView() { + public ItemsSketchSortedView getSortedView() { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } return refreshSortedView(); } diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index 68ec30e36..869b68021 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -22,18 +22,23 @@ import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.evenlySpacedDoubles; import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.lang.reflect.Array; import java.util.Arrays; import java.util.Comparator; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequality; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; import org.apache.datasketches.quantilescommon.GenericSortedView; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; +import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesAPI; import 
org.apache.datasketches.quantilescommon.QuantilesUtil; /** @@ -42,11 +47,15 @@ * @author Kevin Lang * @author Alexander Saydakov */ -public class ItemsSketchSortedView implements GenericSortedView { +public class ItemsSketchSortedView implements GenericSortedView, PartitioningFeature { private final T[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; private final Comparator comparator; + private final T maxItem; + private final T minItem; + private final Class clazz; + private final double[] normRanks; /** * Construct from elements for testing. @@ -55,15 +64,22 @@ public class ItemsSketchSortedView implements GenericSortedView { * @param totalN the total number of items presented to the sketch. * @param comparator comparator for type T */ + @SuppressWarnings("unchecked") ItemsSketchSortedView( final T[] quantiles, - final long[] cumWeights, + final long[] cumWeights, //or Natural Ranks final long totalN, - final Comparator comparator) { + final Comparator comparator, + final T maxItem, + final T minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; this.comparator = comparator; + this.maxItem = maxItem; + this.minItem = minItem; + this.clazz = (Class)quantiles[0].getClass(); + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** @@ -72,12 +88,16 @@ public class ItemsSketchSortedView implements GenericSortedView { */ @SuppressWarnings("unchecked") ItemsSketchSortedView(final ItemsSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } this.totalN = sketch.getN(); final int k = sketch.getK(); final int numQuantiles = sketch.getNumRetained(); - quantiles = (T[]) Array.newInstance(sketch.clazz, numQuantiles); + this.quantiles = (T[]) Array.newInstance(sketch.clazz, numQuantiles); + this.minItem = sketch.minItem_; + this.maxItem = sketch.maxItem_; cumWeights = new long[numQuantiles]; 
comparator = sketch.getComparator(); + clazz = sketch.clazz; final Object[] combinedBuffer = sketch.getCombinedBuffer(); final int baseBufferCount = sketch.getBaseBufferCount(); @@ -94,9 +114,12 @@ public class ItemsSketchSortedView implements GenericSortedView { if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. TotalN does not match cumWeights."); } + this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } - @Override //implemented here because it needs the comparator + //end of constructors + + @Override public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); @@ -114,7 +137,62 @@ public long[] getCumulativeWeights() { return cumWeights.clone(); } - @Override //implemented here because it needs the comparator + @Override + public T getMaxItem() { + return maxItem; + } + + @Override + public T getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks.clone(); + } + + @Override + @SuppressWarnings("unchecked") + public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final long totalN = this.totalN; + final int svLen = cumWeights.length; + //adjust ends of sortedView arrays + cumWeights[0] = 1L; + cumWeights[svLen - 1] = totalN; + normRanks[0] = 1.0 / totalN; + normRanks[svLen - 1] = 1.0; + quantiles[0] = this.getMinItem(); + quantiles[svLen - 1] = this.getMaxItem(); + + final double[] evSpNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1); + final int len = evSpNormRanks.length; + final T[] evSpQuantiles = (T[]) Array.newInstance(clazz, len); + final long[] 
evSpNatRanks = new long[len]; + for (int i = 0; i < len; i++) { + final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); + evSpQuantiles[i] = getQuantileFromIndex(index); + evSpNatRanks[i] = getCumWeightFromIndex(index); + } + final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( + this.totalN, + evSpQuantiles.clone(), + evSpNatRanks.clone(), + evSpNormRanks.clone(), + getMaxItem(), + getMinItem(), + searchCrit); + return gpb; + } + + @Override public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } GenericSortedView.validateItems(splitPoints, comparator); @@ -130,32 +208,32 @@ public double[] getPMF(final T[] splitPoints, final QuantileSearchCriteria searc public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); + final int index = getQuantileIndex(rank, searchCrit); + return getQuantileFromIndex(index); + } + + private T getQuantileFromIndex(final int index) { return quantiles[index]; } + + private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } + + private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; + if (index == -1) { return len - 1; } + return index; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. 
- * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - T getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; + @SuppressWarnings("unchecked") + public T[] getQuantiles(final double[] ranks, final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + final int len = ranks.length; + final T[] quants = (T[]) Array.newInstance(clazz, len); + for (int i = 0; i < len; i++) { + quants[i] = getQuantile(ranks[i], searchCrit); } - return quantiles[index]; + return quants; } @Override @@ -181,8 +259,8 @@ public boolean isEmpty() { } @Override - public ItemsSketchSortedViewIterator iterator() { - return new ItemsSketchSortedViewIterator<>(quantiles, cumWeights); + public GenericSortedViewIterator iterator() { + return new GenericSortedViewIterator<>(quantiles, cumWeights); } //restricted methods @@ -236,6 +314,13 @@ private final static void populateFromItemsSketch( Arrays.sort(quantilesArr, startOfBaseBufferBlock, numQuantiles, comparator); } + private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + return normRanks; + } + /** * Convert the 
individual weights into cumulative weights. * An array of {1,1,1,1} becomes {1,2,3,4} @@ -251,15 +336,4 @@ private static long convertToCumulative(final long[] array) { return subtotal; } - /** - * Iterator over ItemsSketchSortedView. - * @param type of quantile (item) - */ - public static final class ItemsSketchSortedViewIterator extends GenericSortedViewIterator { - - ItemsSketchSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - super(quantiles, cumWeights); - } - } - } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java index 8c299321e..bdc3cc75c 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedView.java @@ -20,7 +20,7 @@ package org.apache.datasketches.quantilescommon; /** - * The Sorted View for quantiles of primitive type double. + * The Sorted View for quantile sketches of primitive type double. * @see SortedView * @author Alexander Saydakov * @author Lee Rhodes @@ -71,6 +71,24 @@ default double[] getCDF(double[] splitPoints, QuantileSearchCriteria searchCrit) return buckets; } + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + double getMaxItem(); + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. 
+ */ + double getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], diff --git a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java index df9c41f23..da112dc2e 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/DoublesSortedViewIterator.java @@ -20,12 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * The quantiles SortedView iterator for type double. - * @see SortedViewIterator - * @author Alexander Saydakov - * @author Lee Rhodes + * Iterator over quantile sketches of primitive type double. */ -public interface DoublesSortedViewIterator extends SortedViewIterator { +public final class DoublesSortedViewIterator extends SortedViewIterator { + private final double[] quantiles; + + public DoublesSortedViewIterator(final double[] quantiles, final long[] cumWeights) { + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + } /** * Gets the quantile at the current index. @@ -35,7 +38,8 @@ public interface DoublesSortedViewIterator extends SortedViewIterator { * * @return the quantile at the current index. 
*/ - double getQuantile(); + public double getQuantile() { + return quantiles[index]; + } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java index 7127b5928..0a0c54b5a 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedView.java @@ -71,6 +71,24 @@ default double[] getCDF(float[] splitPoints, QuantileSearchCriteria searchCrit) return buckets; } + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + float getMaxItem(); + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + float getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], diff --git a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java index ff6203f45..a40bacef1 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/FloatsSortedViewIterator.java @@ -20,12 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * The quantiles SortedView Iterator for type float. - * @see SortedViewIterator - * @author Alexander Saydakov - * @author Lee Rhodes + * Iterator over quantile sketches of primitive type float. 
*/ -public interface FloatsSortedViewIterator extends SortedViewIterator { +public final class FloatsSortedViewIterator extends SortedViewIterator { + private final float[] quantiles; + + public FloatsSortedViewIterator(final float[] quantiles, final long[] cumWeights) { + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + } /** * Gets the quantile at the current index. @@ -35,7 +38,8 @@ public interface FloatsSortedViewIterator extends SortedViewIterator { * * @return the quantile at the current index. */ - float getQuantile(); + public float getQuantile() { + return quantiles[index]; + } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java new file mode 100644 index 000000000..733f7846d --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.quantilescommon; + +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import org.apache.datasketches.common.SketchesStateException; + +/** + * Implements PartitionBoundaries + */ +public class GenericPartitionBoundaries implements PartitionBoundaries { + private long totalN; //totalN of source sketch + private T[] boundaries; //quantiles at the boundaries + private long[] natRanks; //natural ranks at the boundaries + private double[] normRanks; //normalized ranks at the boundaries + private T maxItem; //of the source sketch + private T minItem; //of the source sketch + private QuantileSearchCriteria searchCrit; //of the source sketch query to getPartitionBoundaries. + //computed + private long[] numDeltaItems; //num of items in each part + private int numPartitions; //num of partitions + + public GenericPartitionBoundaries( + final long totalN, + final T[] boundaries, + final long[] natRanks, + final double[] normRanks, + final T maxItem, + final T minItem, + final QuantileSearchCriteria searchCrit) { + this.totalN = totalN; + this.boundaries = boundaries; + this.natRanks = natRanks; + this.normRanks = normRanks; + this.maxItem = maxItem; + this.minItem = minItem; + this.searchCrit = searchCrit; + //check and compute + final int len = boundaries.length; + if (len < 2) { throw new SketchesStateException("Source sketch is empty"); } + numDeltaItems = new long[len]; + numDeltaItems[0] = 0; // index 0 is always 0 + for (int i = 1; i < len; i++) { + final int addOne = ( (i == 1 && (this.searchCrit == INCLUSIVE)) + || ((i == (len - 1)) && this.searchCrit == EXCLUSIVE) ) ? 
1 : 0; + numDeltaItems[i] = natRanks[i] - natRanks[i - 1] + addOne; + } + this.numPartitions = len - 1; + } + + @Override + public long getN() { return totalN; } + + /** + * Gets an ordered array of boundaries that sequentially define the upper and lower boundaries of partitions. + * These partitions are to be constructed by an external process. Each boundary is essentially a reference and + * should uniquely identify an item or a set of identical items from the original stream of data fed to the + * originating sketch. + * + *

    Assume boundaries array has size N + 1. Let the indices be sequentially numbered from 0 to N. + * The number of partitions is always one less than the size of the boundaries array. + * Let the partitions be sequentially numbered from 1 to N. + * + *

    If these results were computed using QuantileSearchCriteria.INCLUSIVE then these sequential boundaries + * are to be interpreted as follows: + *

      + *
    • Partition 1: include all items >= index 0 and <= index 1.
    • + *
    • Partition 2: include all items > index 1 and <= index 2.
    • + *
    • Partition N: include all items > index N-1 and <= index N.
    • + *
    + * + *

    If these results were computed using QuantileSearchCriteria.EXCLUSIVE then these sequential boundaries + * are to be interpreted as follows: + *

      + *
    • Partition 1: include all items >= index 0 and < index 1.
    • + *
    • Partition 2: include all items >= index 1 and < index 2.
    • + *
    • Partition N: include all items >= index N-1 and <= index N.
    • + *
    + * + * @return an array of boundaries that sequentially define the upper and lower boundaries of partitions. + */ + public T[] getBoundaries() { return boundaries; } + + @Override + public long[] getNaturalRanks() { return natRanks; } + + @Override + public double[] getNormalizedRanks() { return normRanks; } + + @Override + public long[] getNumDeltaItems() { return numDeltaItems; } + + @Override + public int getNumPartitions() { return numPartitions; } + + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + public T getMaxItem() { return maxItem; } + + /** + * Returns the minimum item of the stream. This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + public T getMinItem() { return minItem; } + + @Override + public QuantileSearchCriteria getSearchCriteria() { return searchCrit; } + +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java index 452467bb7..e3d89a6e2 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java @@ -69,6 +69,24 @@ public interface GenericSortedView extends SortedView { */ double[] getCDF(T[] splitPoints, QuantileSearchCriteria searchCrit); + /** + * Returns the maximum item of the stream. This may be distinct from the largest item retained by the + * sketch algorithm. + * + * @return the maximum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + T getMaxItem(); + + /** + * Returns the minimum item of the stream. 
This may be distinct from the smallest item retained by the + * sketch algorithm. + * + * @return the minimum item of the stream + * @throws IllegalArgumentException if sketch is empty. + */ + T getMinItem(); + /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * as an array of probability masses as doubles on the interval [0.0, 1.0], @@ -126,8 +144,8 @@ public interface GenericSortedView extends SortedView { T getQuantile(double rank, QuantileSearchCriteria searchCrit); /** - * Returns the array of quantiles. - * @return the array of quantiles. + * Returns the full array of quantiles. + * @return the full array of quantiles. */ T[] getQuantiles(); @@ -169,4 +187,3 @@ static void validateItems(final T[] items, final Comparator compa } } - diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java index 69b454a92..5a5c00e26 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java @@ -19,58 +19,28 @@ package org.apache.datasketches.quantilescommon; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - /** - * The quantiles SortedView Iterator for generic types. - * @see SortedViewIterator + * Iterator over quantile sketches of generic type. 
* @param The generic quantile type - * @author Alexander Saydakov - * @author Lee Rhodes */ -public class GenericSortedViewIterator implements SortedViewIterator { +public class GenericSortedViewIterator extends SortedViewIterator { private final T[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; public GenericSortedViewIterator(final T[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter - this.cumWeights = cumWeights; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 0 : cumWeights[index - 1]; + super(cumWeights); + this.quantiles = quantiles; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter } + /** + * Gets the quantile at the current index. + * + *

    Don't call this before calling next() for the first time + * or after getting false from next().

    + * + * @return the quantile at the current index. + */ public T getQuantile() { return quantiles[index]; } - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java new file mode 100644 index 000000000..e3c59d2c7 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +/** + * This defines a set of results computed from the getPartitionBoundaries() function and + * encapsulates the basic methods needed to construct actual partitions based on generic items. 
+ */ +public interface PartitionBoundaries { + + /** + * Gets the length of the input stream offered to the underlying sketch. + * @return the length of the input stream offered to the underlying sketch. + */ + long getN(); + + /** + * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Natural ranks are integral values on the interval [1, N] + * @return an array of natural ranks. + */ + long[] getNaturalRanks(); + + /** + * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0]. + * @return an array of normalized ranks. + */ + double[] getNormalizedRanks(); + + /** + * Gets the number of items to be included for each partition as an array. + * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at + * index 0 and index 1, is at index 1 in this array, etc. + * @return the number of items to be included for each partition as an array. + */ + long[] getNumDeltaItems(); + + /** + * Gets the number of partitions + * @return the number of partitions + */ + int getNumPartitions(); + + /** + * Gets the search criteria specified for the source sketch + * @return The search criteria specified for the source sketch + */ + QuantileSearchCriteria getSearchCriteria(); +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java new file mode 100644 index 000000000..3ff51a3b4 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +/** + * This enables the special functions for performing efficient partitioning of massive data. + */ +public interface PartitioningFeature { + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + *

    This method is equivalent to + * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallySized, INCLUSIVE)}. + *

    + * + * @param numEquallySized an integer that specifies the number of equally sized partitions between + * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * This must be a positive integer greater than zero. + *
      + *
    • A 1 will return: minItem, maxItem.
    • + *
    • A 2 will return: minItem, median quantile, maxItem.
    • + *
    • Etc.
    • + *
    + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. + * @throws IllegalArgumentException if sketch is empty. + * @throws IllegalArgumentException if numEquallySized is less than 1. + */ + default GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized) { + return getPartitionBoundaries(numEquallySized, INCLUSIVE); + } + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + * @param numEquallySized an integer that specifies the number of equally sized partitions between + * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * This must be a positive integer greater than zero. + *
      + *
    • A 1 will return: minItem, maxItem.
    • + *
    • A 2 will return: minItem, median quantile, maxItem.
    • + *
    • Etc.
    • + *
    + * + * @param searchCrit + * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally sized partitions + * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. + * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally sized partitions + * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. + * @throws IllegalArgumentException if sketch is empty. + * @throws IllegalArgumentException if numEquallySized is less than 1. + */ + GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized, QuantileSearchCriteria searchCrit); + +} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java index 74e5d8061..38502ecaa 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesAPI.java @@ -219,8 +219,8 @@ public interface QuantilesAPI { int getK(); /** - * Gets the length of the input stream. - * @return the length of the input stream. + * Gets the length of the input stream offered to the sketch. + * @return the length of the input stream offered to the sketch. 
*/ long getN(); diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java index a70b08372..31a5bedf9 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesDoublesAPI.java @@ -92,56 +92,6 @@ default double[] getCDF(double[] splitPoints) { */ double getMinItem(); - /** - * This method returns an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - *

    This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - *

    - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @return an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - default DoublesPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link DoublesPartitionBoundaries DoublesPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - DoublesPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(double[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of m unique, monotonically increasing items. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java index c6ea484cc..2fcbdd99f 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesFloatsAPI.java @@ -91,56 +91,6 @@ default double[] getCDF(float[] splitPoints) { */ float getMinItem(); - /** - * This method returns an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - *

    This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - *

    - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @return an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - default FloatsPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link FloatsPartitionBoundaries FloatsPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - FloatsPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(float[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of m unique, monotonically increasing items. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java index f8dd8e62d..fbd7f691f 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java @@ -92,58 +92,6 @@ default double[] getCDF(T[] splitPoints) { */ T getMinItem(); - /** - * This method returns an instance of - * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - *

    This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallyWeighted, INCLUSIVE)}. - *

    - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - default GenericPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted) { - return getPartitionBoundaries(numEquallyWeighted, INCLUSIVE); - } - - /** - * This method returns an instance of - * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides - * sufficient information for the user to create the given number of equally weighted partitions. - * - * @param numEquallyWeighted an integer that specifies the number of equally weighted partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. - *
      - *
    • A 1 will return: minItem, maxItem.
    • - *
    • A 2 will return: minItem, median quantile, maxItem.
    • - *
    • Etc.
    • - *
    - * - * @param searchCrit - * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally weighted partitions - * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. - * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally weighted partitions - * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. - * - * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if numEquallyWeighted is less than 1. - */ - GenericPartitionBoundaries getPartitionBoundaries(int numEquallyWeighted, QuantileSearchCriteria searchCrit); - /** * This is equivalent to {@link #getPMF(Object[], QuantileSearchCriteria) getPMF(splitPoints, INCLUSIVE)} * @param splitPoints an array of m unique, monotonically increasing items. @@ -337,47 +285,5 @@ default double[] getRanks(T[] quantiles) { */ void update(T item); - /** - * This encapsulates the essential information needed to construct actual partitions and is returned from the - * getPartitionBoundaries(int, QuantileSearchCritera) method. - * @param generic value T for the item type - */ - static class GenericPartitionBoundaries { - - /** - * The total number of items presented to the sketch. - * - *

    To compute the weight or density of a specific - * partition i where i varies from 1 to m partitions: - *

    {@code
    -     * long N = getN();
    -     * double[] ranks = getRanks();
    -     * long weight = Math.round((ranks[i] - ranks[i - 1]) * N);
    -     * }
    - */ - public long N; - - /** - * The normalized ranks that correspond to the returned boundaries. - * The returned array is of size (m + 1), where m is the requested number of partitions. - * Index 0 of the returned array is always 0.0, and index m is always 1.0. - */ - public double[] ranks; - - /** - * The cumulative weights that correspond to the returned boundaries. - * The returned array is of size (m + 1), where m is the requested number of partitions. - * Index 0 of the returned array is always 1, and index m is always n. - */ - public long[] weights; - - /** - * The partition boundaries as quantiles. - * The returned array is of size (m + 1), where m is the requested number of partitions. - * Index 0 of the returned array is always {@link #getMinItem() getMinItem()}, and index m is always - * {@link #getMaxItem() getMaxItem()}. - */ - public T[] boundaries; - } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java index 848ee3105..a35aa27cd 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesUtil.java @@ -21,6 +21,7 @@ import static java.lang.Math.log; import static java.lang.Math.pow; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.Objects; @@ -208,18 +209,17 @@ public static double[] evenlyLogSpaced(final double value1, final double value2, return arr; } - public static double maxPrecision; - - public static double getNaturalRank(final double normalizedRank, final long totalN) { - final double naturalRank = normalizedRank * totalN; - if (totalN <= 1_000_000L) { - final double precision = Util.ceilingPowerBaseOfDouble(10.0, totalN) ; - maxPrecision = precision; - final double trimmedNatRank = Math.round(naturalRank * precision) / precision; - return trimmedNatRank; - } else { - return 
naturalRank; + public static final double tailRoundingFactor = 1e7; + + public static double getNaturalRank( + final double normalizedRank, + final long totalN, + final QuantileSearchCriteria searchCrit) { + double naturalRank = (normalizedRank * totalN); + if (totalN <= tailRoundingFactor) { + naturalRank = Math.round(naturalRank * tailRoundingFactor) / tailRoundingFactor; } + return (searchCrit == INCLUSIVE) ? (long)Math.ceil(naturalRank) : (long)Math.floor(naturalRank); } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java index 434b548a9..92acfb2d4 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java @@ -20,19 +20,15 @@ package org.apache.datasketches.quantilescommon; /** - * This is the base interface for the Sorted View interface hierarchy. + * This is the base interface for the Sorted View interface hierarchy and defines the methods that are type independent. * - *

    The Sorted View provides a view of the data retained by a quantiles-type sketch - * that would be cumbersome to get any other way. - * One can iterate over the contents of the sketch using the sketch's iterator, but the result is not sorted.

    + *

    The SortedView interface hierarchy provides a sorted view of the data retained by a quantiles-type sketch that + * would be cumbersome to get any other way. + * One could use the sketch's iterator to iterate over the contents of the sketch, + * but the result would not be sorted.

    * - *

    Once this sorted view has been created, it provides not only a sorted view of the data retained by the sketch - * but also the basic queries, such as getRank(), getQuantile(), and getCDF() and getPMF(). - * In addition, the iterator obtained from this sorted view provides useful detailed information about each entry.

    - * - *

    The data from a Sorted view is an unbiased sample of the input stream that can be used for other kinds of - * analysis not directly provided by the sketch. For example, comparing two sketches using the Kolmogorov-Smirnov - * test.

    + *

    The data from a Sorted view is an unbiased random sample of the input stream that can be used for other kinds of + * analysis not directly provided by the sketch.

    * * @author Alexander Saydakov * @author Lee Rhodes @@ -40,11 +36,25 @@ public interface SortedView { /** - * Returns the array of cumulative weights - * @return the array of cumulative weights + * Returns the array of cumulative weights from the sketch. + * Also known as the natural ranks, which are the Natural Numbers on the interval [1, N]. + * @return the array of cumulative weights (or natural ranks). */ long[] getCumulativeWeights(); + /** + * Returns the array of normalized ranks. The normalized ranks are the natural ranks divided by N. + * The normalized ranks are fractional numbers on the interval (0,1.0]. + * @return the array of normalized ranks. + */ + double[] getNormalizedRanks(); + + /** + * Returns the total number of items presented to the sourcing sketch. + * @return the total number of items presented to the sourcing sketch. + */ + long getN(); + /** * Returns true if this sorted view is empty. * @return true if this sorted view is empty. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java index b36a2594e..06c298d4e 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java @@ -19,6 +19,8 @@ package org.apache.datasketches.quantilescommon; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + /** * This is the base interface for the SortedViewIterator hierarchy used with a SortedView obtained * from a quantile-type sketch. 
This provides an ordered iterator over the retained quantiles of @@ -35,30 +37,47 @@ * @author Alexander Saydakov * @author Lee Rhodes */ -public interface SortedViewIterator { +public class SortedViewIterator { + protected final long[] cumWeights; + protected long totalN; + protected int index; + + SortedViewIterator(final long[] cumWeights) { + this.cumWeights = cumWeights; //SpotBugs EI_EXPOSE_REP2 suppressed by FindBugsExcludeFilter + this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; + index = -1; + } /** - * Gets the cumulative weight at the current index (or previous index) based on the chosen search criterion. - * This is also referred to as the "Natural Rank". + * Gets the natural rank at the current index (or previous index) based on the chosen search criterion. + * This is also referred to as the "cumulative weight". The natural rank is a number in the range [1, N], + * where N ({@link #getN()}) is the total number of items fed to the sketch. * *

    Don't call this before calling next() for the first time * or after getting false from next().

    * - * @param searchCrit if INCLUSIVE, includes the weight at the current index in the cumulative sum. - * Otherwise, it will return the cumulative weight of the previous index. - * @return cumulative weight at the current index on the chosen search criterion. + * @param searchCrit if INCLUSIVE, includes the weight of the item at the current index in the computation of + * the natural rank. + * Otherwise, it will return the natural rank of the previous index. + * @return the natural rank at the current index (or previous index) based on the chosen search criterion. */ - long getCumulativeWeight(QuantileSearchCriteria searchCrit); + public long getNaturalRank(final QuantileSearchCriteria searchCrit) { + if (searchCrit == INCLUSIVE) { return cumWeights[index]; } + return (index == 0) ? 0 : cumWeights[index - 1]; + } /** * Gets the total count of all items presented to the sketch. * @return the total count of all items presented to the sketch. */ - long getN(); + public long getN() { + return totalN; + } /** * Gets the normalized rank at the current index (or previous index) - * based on the chosen search criterion. + * based on the chosen search criterion. Where normalized rank = natural rank / N ({@link #getN()}) + * and is a fraction in the range (0,1.0]. * *

    Don't call this before calling next() for the first time * or after getting false from next().

    @@ -68,24 +87,32 @@ public interface SortedViewIterator { * @return the normalized rank at the current index (or previous index) * based on the chosen search criterion. */ - double getNormalizedRank(QuantileSearchCriteria searchCrit); + public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { + return (double) getNaturalRank(searchCrit) / totalN; + } /** - * Gets the natural weight at the current index. + * Gets the weight contribution of the item at the current index. * *

    Don't call this before calling next() for the first time * or after getting false from next().

    * - * @return the natural weight at the current index. + * @return the weight contribution of the item at the current index. */ - long getWeight(); + public long getWeight() { + if (index == 0) { return cumWeights[0]; } + return cumWeights[index] - cumWeights[index - 1]; + } /** * Advances the index and checks if it is valid. * The state of this iterator is undefined before the first call of this method. * @return true if the next index is valid. */ - boolean next(); + public boolean next() { + index++; + return index < cumWeights.length; + } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/Stack.java b/src/main/java/org/apache/datasketches/quantilescommon/Stack.java new file mode 100644 index 000000000..68d6378b5 --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/Stack.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import java.util.ArrayList; + +import org.apache.datasketches.common.SketchesStateException; + +/** + * A classic LIFO stack based on ArrayList (as opposed to Vector). + * All of the methods of ArrayList are available. 
+ */ +public class Stack extends ArrayList { + private static final long serialVersionUID = 1L; + + /** + * Creates an empty stack. + */ + public Stack() { } + + /** + * Pushes an item onto the stack + * @param item the given item + * @return the given element + */ + public E push(final E item) { + add(item); + return item; + } + + /** + * Removes the item at the top of the stack. + * @return the item at the top of the stack. + */ + public E pop() { + final E item = peek(); + remove(size() - 1); + return item; + } + + /** + * Allows examination of the top item without removing it. + * @return the top item without removing it + */ + public E peek() { + final int len = size(); + if (len == 0) { throw new SketchesStateException("Stack is empty"); } + return get(len - 1); + } + +} diff --git a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java index 7c11ee2ab..e587cd633 100644 --- a/src/main/java/org/apache/datasketches/req/BaseReqSketch.java +++ b/src/main/java/org/apache/datasketches/req/BaseReqSketch.java @@ -19,11 +19,8 @@ package org.apache.datasketches.req; -import static org.apache.datasketches.quantilescommon.QuantilesUtil.equallySpacedDoubles; - import org.apache.datasketches.quantilescommon.FloatsSortedView; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; -import org.apache.datasketches.quantilescommon.QuantilesAPI; import org.apache.datasketches.quantilescommon.QuantilesFloatsAPI; import org.apache.datasketches.quantilescommon.QuantilesFloatsSketchIterator; @@ -62,21 +59,6 @@ abstract class BaseReqSketch implements QuantilesFloatsAPI { @Override public abstract float getMinItem(); - @Override - public FloatsPartitionBoundaries getPartitionBoundaries(final int numEquallyWeighted, - final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final double[] ranks = 
equallySpacedDoubles(numEquallyWeighted); - final float[] boundaries = getQuantiles(ranks, searchCrit); - boundaries[0] = getMinItem(); - boundaries[boundaries.length - 1] = getMaxItem(); - final FloatsPartitionBoundaries fpb = new FloatsPartitionBoundaries(); - fpb.N = this.getN(); - fpb.ranks = ranks; - fpb.boundaries = boundaries; - return fpb; - } - /** * Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). * Derived from Lemma 12 in https://arxiv.org/abs/2004.01668v2, but the constant factors were diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java index f06461650..dbf14be6d 100644 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java @@ -20,11 +20,14 @@ package org.apache.datasketches.req; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; import static org.apache.datasketches.quantilescommon.QuantilesUtil.getNaturalRank; import java.util.List; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.InequalitySearch; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesAPI; @@ -39,6 +42,9 @@ public final class ReqSketchSortedView implements FloatsSortedView { private float[] quantiles; private long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; + private final double[] normRanks; + private final float maxItem; + private final float minItem; /** * Construct from elements for testing. 
@@ -46,60 +52,76 @@ public final class ReqSketchSortedView implements FloatsSortedView { * @param cumWeights sorted, monotonically increasing cumulative weights. * @param totalN the total number of items presented to the sketch. */ - ReqSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN) { + ReqSketchSortedView(final float[] quantiles, final long[] cumWeights, final long totalN, + final float maxItem, final float minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; + this.maxItem = maxItem; + this.minItem = minItem; + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } /** * Constructs this Sorted View given the sketch - * @param sk the given ReqSketch + * @param sketch the given ReqSketch */ - public ReqSketchSortedView(final ReqSketch sk) { - totalN = sk.getN(); - buildSortedViewArrays(sk); + public ReqSketchSortedView(final ReqSketch sketch) { + if (sketch.isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } + this.totalN = sketch.getN(); + this.maxItem = sketch.getMaxItem(); + this.minItem = sketch.getMinItem(); + buildSortedViewArrays(sketch); + final int len = cumWeights.length; + final double[] normRanks = new double[len]; + for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } + this.normRanks = normRanks; } + //end of constructors + @Override public long[] getCumulativeWeights() { return cumWeights.clone(); } + @Override + public float getMaxItem() { + return maxItem; + } + + @Override + public float getMinItem() { + return minItem; + } + + @Override + public long getN() { + return totalN; + } + + @Override + public double[] getNormalizedRanks() { + return normRanks; + } + @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new 
IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int len = cumWeights.length; - final double naturalRank = getNaturalRank(rank, totalN); + final double naturalRank = getNaturalRank(rank, totalN, searchCrit); final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT; final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit); if (index == -1) { - return quantiles[quantiles.length - 1]; ///EXCLUSIVE (GT) case: normRank == 1.0; + return quantiles[len - 1]; ///EXCLUSIVE (GT) case: normRank == 1.0; } return quantiles[index]; } - /** - * Special version of getQuantile to support the getPartitionBoundaries(int) function. - * @param weight ultimately comes from selected integral weights computed by the sketch. - * @param searchCrit If INCLUSIVE, the given rank includes all quantiles ≤ - * the quantile directly corresponding to the given weight internal to the sketch. - * @return the approximate quantile given the weight. - */ - float getQuantile(final long weight, final QuantileSearchCriteria searchCrit) { - if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } - final int len = cumWeights.length; - final InequalitySearch crit = (searchCrit == INCLUSIVE) ? 
InequalitySearch.GE : InequalitySearch.GT; - final int index = InequalitySearch.find(cumWeights, 0, len - 1, weight, crit); - if (index == -1) { - return quantiles[quantiles.length - 1]; //EXCLUSIVE (GT) case: normRank == 1.0; - } - return quantiles[index]; - } - - - @Override public float[] getQuantiles() { return quantiles.clone(); @@ -123,8 +145,8 @@ public boolean isEmpty() { } @Override - public ReqSketchSortedViewIterator iterator() { - return new ReqSketchSortedViewIterator(quantiles, cumWeights); + public FloatsSortedViewIterator iterator() { + return new FloatsSortedViewIterator(quantiles, cumWeights); } //restricted methods diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java deleted file mode 100644 index 6dbc63222..000000000 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedViewIterator.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.req; - -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; - -import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; - -/** - * Iterator over ReqSketchSortedView. - * @author Alexander Saydakov - * @author Lee Rhodes - */ -public final class ReqSketchSortedViewIterator implements FloatsSortedViewIterator { - - private final float[] quantiles; - private final long[] cumWeights; - private final long totalN; - private int index; - - ReqSketchSortedViewIterator(final float[] quantiles, final long[] cumWeights) { - this.quantiles = quantiles; - this.cumWeights = cumWeights; - this.totalN = (cumWeights.length > 0) ? cumWeights[cumWeights.length - 1] : 0; - index = -1; - } - - @Override - public long getCumulativeWeight(final QuantileSearchCriteria searchCrit) { - if (searchCrit == INCLUSIVE) { return cumWeights[index]; } - return (index == 0) ? 
0 : cumWeights[index - 1]; - } - - @Override - public long getN() { - return totalN; - } - - @Override - public double getNormalizedRank(final QuantileSearchCriteria searchCrit) { - return (double) getCumulativeWeight(searchCrit) / totalN; - } - - @Override - public float getQuantile() { - return quantiles[index]; - } - - @Override - public long getWeight() { - if (index == 0) { return cumWeights[0]; } - return cumWeights[index] - cumWeights[index - 1]; - } - - @Override - public boolean next() { - index++; - return index < quantiles.length; - } - -} diff --git a/src/test/java/org/apache/datasketches/common/UtilTest.java b/src/test/java/org/apache/datasketches/common/UtilTest.java index a68671685..50112a315 100644 --- a/src/test/java/org/apache/datasketches/common/UtilTest.java +++ b/src/test/java/org/apache/datasketches/common/UtilTest.java @@ -263,9 +263,14 @@ public void checkZeroPad() { @Test public void checkCharacterPad() { - final String s = "Pad 30, postpend z:"; - final String out = characterPad(s, 30, 'z', true); + String s = "Pad 30, postpend z:"; + String out = characterPad(s, 30, 'z', true); println(out); + assertEquals(out, "Pad 30, postpend z:zzzzzzzzzzz"); + s = "Pad 30, prepend z:"; + out = characterPad(s, 30, 'z', false); + println(out); + assertEquals(out,"zzzzzzzzzzzzPad 30, prepend z:"); } @Test diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java index bc7651b14..ccfb52533 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectCompactItemsSketchIteratorTest.java @@ -96,8 +96,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "1"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + 
assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -105,8 +105,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "2"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java index e4e349205..a8ca4145e 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java @@ -21,7 +21,6 @@ import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -423,21 +422,6 @@ public void nanSplitPoint() { sketch.getCDF(new double[] {Double.NaN}); } - @Test - public void getQuantiles() { - final KllDoublesSketch sketch = getUpdatableDirectDoublesSketch(200, 0); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - double[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - double[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, 
INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkSimpleMergeDirect() { //used for troubleshooting int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java index 6f9ea0ba5..3013e6295 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java @@ -21,7 +21,6 @@ import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; -import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -423,21 +422,6 @@ public void nanSplitPoint() { sketch.getCDF(new float[] {Float.NaN}); } - @Test - public void getQuantiles() { - final KllFloatsSketch sketch = getUpdatableDirectFloatSketch(200, 0); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkSimpleMergeDirect() { //used for troubleshooting int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java index d428cd259..7a12d8466 100644 --- 
a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java @@ -77,8 +77,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 1.0); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -86,8 +86,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 2.0); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java index ba63e8bef..8aeabb8bf 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java @@ -391,21 +391,6 @@ public void nanSplitPoint() { sketch.getCDF(new double[] {Double.NaN}); } - @Test - public void getQuantiles() { - final KllDoublesSketch sketch = KllDoublesSketch.newHeapInstance(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - double[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - double[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = 
sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkReset() { KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(20); @@ -456,18 +441,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 1); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 2); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 3); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java index e511de562..88003b836 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java @@ -77,8 +77,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 1.0f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); 
assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -86,8 +86,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), 2.0f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java index 161ee4318..846965cb8 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java @@ -391,21 +391,6 @@ public void nanSplitPoint() { sketch.getCDF(new float[] {Float.NaN}); } - @Test - public void getQuantiles() { - final KllFloatsSketch sketch = KllFloatsSketch.newHeapInstance(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void checkReset() { KllFloatsSketch sk = KllFloatsSketch.newHeapInstance(20); @@ -456,18 +441,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 1); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), 
true); assertEquals(itr.getQuantile(), 2); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), 3); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java index 5eb513aa8..b0024420c 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchSortedViewString.java @@ -30,8 +30,9 @@ public KllItemsSketchSortedViewString( final String[] quantiles, final long[] cumWeights, final long totalN, - final String minItem, - final Comparator comparator) { - super(quantiles, cumWeights, totalN, minItem, comparator); + final Comparator comparator, + final String maxItem, + final String minItem) { + super(quantiles, cumWeights, totalN, comparator, maxItem, minItem); } } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java index a980841b6..deb3cb9c8 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchTest.java @@ -461,10 +461,10 @@ public void getQuantiles() { sketch.update("C"); sketch.update("D"); String[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - String[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; + String[] 
quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; + quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } @@ -528,18 +528,18 @@ public void sortedView() { assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "A"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "AB"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.next(), true); assertEquals(itr.getQuantile(), "ABC"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 2); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 2); + assertEquals(itr.getNaturalRank(INCLUSIVE), 3); assertEquals(itr.next(), false); } diff --git a/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java index 0607ff5d7..f97eb2320 100644 --- a/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllItemsSketchiteratorTest.java @@ -82,8 +82,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "1"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - 
assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -91,8 +91,8 @@ public void twoItemSketchForSortedViewIterator() { assertEquals(itr.getQuantile(), "2"); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java index 45feb7637..28095dda0 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDirectDoublesTest.java @@ -58,19 +58,6 @@ public void checkBounds() { assertTrue(rest - restLB < (2 * eps)); } - @Test - public void checkMisc() { - final int k = 8; - final KllDoublesSketch sk = getDirectDoublesSketch(k, 0); - try { sk.getPartitionBoundaries(10); fail(); } catch (SketchesArgumentException e) {} - for (int i = 0; i < 20; i++) { sk.update(i); } - final double[] items = sk.getDoubleItemsArray(); - assertEquals(items.length, 16); - final int[] levels = sk.getLevelsArray(sk.sketchStructure); - assertEquals(levels.length, 3); - assertEquals(sk.getNumLevels(), 2); - } - //@Test //enable static println(..) 
for visual checking public void visualCheckToString() { final int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java index 6f042ce06..5f88baed4 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDirectFloatsTest.java @@ -58,19 +58,6 @@ public void checkBounds() { assertTrue(rest - restLB < (2 * eps)); } - @Test - public void checkMisc() { - final int k = 8; - final KllFloatsSketch sk = getDirectFloatsSketch(k, 0); - try { sk.getPartitionBoundaries(10); fail(); } catch (SketchesArgumentException e) {} - for (int i = 0; i < 20; i++) { sk.update(i); } - final float[] items = sk.getFloatItemsArray(); - assertEquals(items.length, 16); - final int[] levels = sk.getLevelsArray(sk.sketchStructure); - assertEquals(levels.length, 3); - assertEquals(sk.getNumLevels(), 2); - } - //@Test //enable static println(..) 
for visual checking public void visualCheckToString() { final int k = 20; diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java index 35d73fce3..0524db725 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java @@ -30,7 +30,7 @@ import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.kll.KllItemsSketchSortedView.KllItemsSketchSortedViewIterator; +import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; import org.testng.annotations.Test; @@ -201,7 +201,7 @@ public void viewCompactionAndSortedView() { for (int i = 1; i <= n; i++) { sk.update(Util.intToFixedLengthString(i, digits)); } println(sk.toString(true, true)); KllItemsSketchSortedView sv = sk.getSortedView(); - KllItemsSketchSortedViewIterator itr = sv.iterator(); + GenericSortedViewIterator itr = sv.iterator(); println("### SORTED VIEW"); printf("%12s%12s\n", "Value", "CumWeight"); while (itr.next()) { diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java new file mode 100644 index 000000000..f26031465 --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.common.Util.milliSecToString; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import java.util.List; + +import org.apache.datasketches.partitions.Partitioner; +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.quantiles.ItemsSketch; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class ClassicPartitionsTest { + private final int k = 1 << 15; + private final long totalN = 100_000_000L; + private final long tgtPartitionSize = (long)3e6; + private final int maxPartsPerSk = 100; + + //@Test + public void checkClassicPartitioner() { + println("Classic ItemsSketch Partitions Test"); + printf("Sketch K :%,20d\n", k); + printf("Total N :%,20d\n", totalN); + printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); + printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + + final long startTime_mS = System.currentTimeMillis(); + final ItemsSketchFillRequestLongAsString fillReq = new ItemsSketchFillRequestLongAsString(k, totalN); + final ItemsSketch sk = fillReq.getRange(1L, totalN, INCLUDE_BOTH); + final long endFillInitialSketchTime_mS = System.currentTimeMillis(); + final Partitioner> partitioner = new 
Partitioner<>( + tgtPartitionSize, + maxPartsPerSk, + fillReq, + INCLUSIVE); + final List> list = partitioner.partition(sk); + outputList(list); + + final long endTime_mS = System.currentTimeMillis(); + final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; + final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; + final long totalTime_mS = endTime_mS - startTime_mS; + println(""); + println("FillInitialSketchTime: " + milliSecToString(fillInitialSketchTime_mS)); + println("PartioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); + } + + private static final String[] hdr = + { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; + private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; + private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; + + void outputList(final List> list) { + printf(hdrFmt, (Object[]) hdr); + final int numParts = list.size(); + final double meanPartSize = (double)totalN / numParts; + double size = 0; + double sumSizes = 0; + double sumAbsRelErr = 0; + double sumSqErr = 0; + for (int i = 0; i < numParts; i++) { + final PartitionBoundsRow row = list.get(i); + printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + size = row.approxNumDeltaItems; + sumSizes += size; + sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); + final double absErr = size - meanPartSize; + sumSqErr += absErr * absErr; + } + final double meanAbsRelErr = sumAbsRelErr / numParts; + final double meanSqErr = sumSqErr / numParts; //intermediate value + final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value + final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. 
Normalized RMS Error or NRMSE + + printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); + printf("Mean Partition Size :%,20.1f\n",meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java new file mode 100644 index 000000000..2b966051f --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_UPPER; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.digits; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; + +import java.util.Comparator; +import java.util.Random; + +import org.apache.datasketches.quantiles.ItemsSketch; + +/** + * This is an simulated data set with a given N used for testing. + * @author Lee Rhodes + */ +public class ItemsSketchFillRequestLongAsString implements SketchFillRequest> { + private int k; + private int numDigits; + private Random rand = new Random(); + + public ItemsSketchFillRequestLongAsString() { + k = 1 << 10; + numDigits = 3; + } + + public ItemsSketchFillRequestLongAsString(final int k, final long totalN) { + this.k = k; + this.numDigits = digits(totalN); + } + + @Override + public ItemsSketch getRange(final String lowerQuantile, final String upperQuantile, + final BoundsRule bounds) { + final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + final long lower = Long.parseLong(lowerQuantile.trim()); + final long upper = Long.parseLong(upperQuantile.trim()); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public ItemsSketch getRange(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { + final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + 
final long lower = lowerQuantile; + final long upper = upperQuantile; + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public ItemsSketch getRangeRandom(final long lowerQuantile, final long upperQuantile, + final BoundsRule bounds) { + final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + final long lower = lowerQuantile; + final long upper = upperQuantile; + this.rand = new Random(); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } + return sk; + } + + private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { + final double r = rand.nextDouble(); + final long range; + final long offset; + if (bounds == INCLUDE_BOTH) { + range = ub - lb; + offset = lb; + } + else if (bounds == INCLUDE_UPPER) { + range = ub - lb - 1; + offset = lb + 1; + } else { //INCLUDE_LOWER + range = ub - lb - 1; + offset = lb; + } + return Math.round(r * range + offset); + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java new file mode 100644 index 000000000..53d80190f --- /dev/null +++ 
b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_UPPER; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.digits; +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; + +import java.util.Comparator; +import java.util.Random; + +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; + +/** + * This is an simulated data set with a given N used for testing. 
+ * @author Lee Rhodes + */ +public class KllItemsSketchFillRequestLongAsString implements SketchFillRequest> { + private int k; + private int numDigits; + private Random rand = new Random(); + + public KllItemsSketchFillRequestLongAsString() { + k = 1 << 10; + numDigits = 3; + } + + public KllItemsSketchFillRequestLongAsString(final int k, final long totalN) { + this.k = k; + this.numDigits = digits(totalN); + } + + @Override + public KllItemsSketch getRange(final String lowerQuantile, final String upperQuantile, + final BoundsRule bounds) { + KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); + long lower = Long.parseLong(lowerQuantile.trim()); + long upper = Long.parseLong(upperQuantile.trim()); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public KllItemsSketch getRange(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { + KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); + long lower = lowerQuantile; + long upper = upperQuantile; + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { sk.update(getString(i, numDigits)); } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { sk.update(getString(i, numDigits)); } + } + return sk; + } + + public KllItemsSketch getRangeRandom(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { + KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new 
ArrayOfStringsSerDe()); + long lower = lowerQuantile; + long upper = upperQuantile; + this.rand = new Random(); + if (bounds == INCLUDE_BOTH) { + for (long i = lower; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else if (bounds == INCLUDE_UPPER) { + for (long i = lower + 1; i <= upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } else { //INCLUDE_LOWER + for (long i = lower; i < upper; i++) { + sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); + } + } + return sk; + } + + private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { + double r = rand.nextDouble(); + long range; + long offset; + if (bounds == INCLUDE_BOTH) { + range = ub - lb; + offset = lb; + } + else if (bounds == INCLUDE_UPPER) { + range = ub - lb - 1; + offset = lb + 1; + } else { //INCLUDE_LOWER + range = ub - lb - 1; + offset = lb; + } + return Math.round(r * range + offset); + } + +} diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java new file mode 100644 index 000000000..3b44d9988 --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static org.apache.datasketches.common.Util.milliSecToString; +import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import java.util.List; + +import org.apache.datasketches.partitions.Partitioner; +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.kll.KllItemsSketch; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class KllPartitionsTest { + private final int k = 1 << 15; + private final long totalN = 100_000_000L; + private final long tgtPartitionSize = (long)3e6; + private final int maxPartsPerSk = 100; + + //@Test + public void checkKllPartitioner() { + println("KllItemsSketch Partitions Test"); + printf("Sketch K :%,20d\n", k); + printf("Total N :%,20d\n", totalN); + printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); + printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + + final long startTime_mS = System.currentTimeMillis(); + final KllItemsSketchFillRequestLongAsString fillReq = new KllItemsSketchFillRequestLongAsString(k, totalN); + final KllItemsSketch sk = fillReq.getRange(1L, totalN, INCLUDE_BOTH); + final long endFillInitialSketchTime_mS = System.currentTimeMillis(); + final Partitioner> partitioner = new Partitioner<>( + tgtPartitionSize, + maxPartsPerSk, + fillReq, + INCLUSIVE); + final List> list = partitioner.partition(sk); + outputList(list); + + final long endTime_mS = 
System.currentTimeMillis(); + final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; + final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; + final long totalTime_mS = endTime_mS - startTime_mS; + println(""); + println("FillInitialSketchTime: " + milliSecToString(fillInitialSketchTime_mS)); + println("PartioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); + } + + private static final String[] hdr = + { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; + private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; + private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; + + void outputList(final List> list) { + printf(hdrFmt, (Object[]) hdr); + final int numParts = list.size(); + final double meanPartSize = (double)totalN / numParts; + double size = 0; + double sumSizes = 0; + double sumAbsRelErr = 0; + double sumSqErr = 0; + for (int i = 0; i < numParts; i++) { + final PartitionBoundsRow row = list.get(i); + printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + size = row.approxNumDeltaItems; + sumSizes += size; + sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); + final double absErr = size - meanPartSize; + sumSqErr += absErr * absErr; + } + final double meanAbsRelErr = sumAbsRelErr / numParts; + final double meanSqErr = sumSqErr / numParts; //intermediate value + final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value + final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. 
Normalized RMS Error or NRMSE + + printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); + printf("Mean Partition Size :%,20.1f\n",meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + +} diff --git a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java index 216b91f72..d3193883b 100644 --- a/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/CustomQuantilesTest.java @@ -91,7 +91,7 @@ public void checkQuantilesV400() { double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, EXCLUSIVE); assertEquals(qEst, qTrue); double rawNatRank = normRankIn * N; - double trimNatRank = getNaturalRank(normRankIn, N); + double trimNatRank = getNaturalRank(normRankIn, N, EXCLUSIVE); printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } @@ -120,7 +120,7 @@ public void checkQuantilesV400() { double qTrue = getTrueDoubleQuantile(cumWtsArr, quantilesArr, normRankIn, INCLUSIVE); assertEquals(qEst, qTrue); double rawNatRank = normRankIn * N; - double trimNatRank = getNaturalRank(normRankIn, N); + double trimNatRank = 
getNaturalRank(normRankIn, N, INCLUSIVE); printf("%22.18f %22.18f %22.18f %13.1f", normRankIn, rawNatRank, trimNatRank, qEst); if (qEst != qTrue) { println(" " + qEst + " != " +qTrue); } else { println(""); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java index fdd7918d1..d4f549ebe 100644 --- a/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/DoublesSketchTest.java @@ -134,7 +134,6 @@ public void checkEmptyExceptions() { try { uds.getMaxItem(); fail(); } catch (IllegalArgumentException e) {} try { uds.getMinItem(); fail(); } catch (IllegalArgumentException e) {} try { uds.getRank(1.0); fail(); } catch (IllegalArgumentException e) {} - try { uds.getPartitionBoundaries(5); fail(); } catch (IllegalArgumentException e) {} try { uds.getPMF(new double[] { 0, 0.5, 1.0 }); fail(); } catch (IllegalArgumentException e) {} try { uds.getCDF(new double[] { 0, 0.5, 1.0 }); fail(); } catch (IllegalArgumentException e) {} } @@ -199,15 +198,15 @@ public void sortedView() { Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 1); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 1); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 1); Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 2); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 2); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 2); Assert.assertEquals(it.next(), true); Assert.assertEquals(it.getQuantile(), 3); Assert.assertEquals(it.getWeight(), 1); - Assert.assertEquals(it.getCumulativeWeight(INCLUSIVE), 3); + Assert.assertEquals(it.getNaturalRank(INCLUSIVE), 3); Assert.assertEquals(it.next(), false); } } diff --git 
a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java index b5fd7b2d3..eba9f6b55 100644 --- a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java @@ -782,31 +782,6 @@ public void testIt() { assertTrue(qsk2.isEmpty()); } - @Test - public void checkEvenlySpacedQuantiles() { - DoublesSketch qsk = buildAndLoadQS(32, 1001); - double[] values = qsk.getPartitionBoundaries(10).boundaries; - for (int i = 0; i comparator) { - super(quantiles, cumWeights, totalN, comparator); + final Comparator comparator, + final String maxItem, + final String minItem) { + super(quantiles, cumWeights, totalN, comparator, maxItem, minItem); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java index f123b01bd..0d8527bbf 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java @@ -599,15 +599,15 @@ public void sortedView() { assertEquals(it.next(), true); assertEquals(it.getQuantile(), 1); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(it.getNaturalRank(INCLUSIVE), 1); assertEquals(it.next(), true); assertEquals(it.getQuantile(), 2); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(it.getNaturalRank(INCLUSIVE), 2); assertEquals(it.next(), true); assertEquals(it.getQuantile(), 3); assertEquals(it.getWeight(), 1); - assertEquals(it.getCumulativeWeight(INCLUSIVE), 3); + assertEquals(it.getNaturalRank(INCLUSIVE), 3); assertEquals(it.next(), false); } } @@ -617,7 +617,7 @@ public void sortedView2() { Double[] qArr = {8.0, 10.0, 10.0, 20.0}; long[] cwArr = {1, 3, 4, 5}; 
Comparator comp = Comparator.naturalOrder(); - ItemsSketchSortedView sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp); + ItemsSketchSortedView sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0); double[] ranks = {0, .1, .2, .3, .6, .7, .8, .9, 1.0}; Double[] qOut = new Double[9]; for (int i = 0; i < ranks.length; i++) { @@ -640,10 +640,10 @@ public void getQuantiles() { sketch.update(3); sketch.update(4); Integer[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; + Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; + quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } diff --git a/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java b/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java new file mode 100644 index 000000000..d27911cab --- /dev/null +++ b/src/test/java/org/apache/datasketches/quantiles/SkewedDataTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantiles; + +import java.util.Comparator; + +import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.*; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; +import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.testng.annotations.Test; + +/** + * blah + */ +@SuppressWarnings("unused") +public class SkewedDataTest { + static String[] hdr = {"N", "MaxItem", "MinItem", "NumParts", "SearchCriteria"}; + static String hdrfmt = "%6s %10s %10s %10s %15s\n"; + static String hdrdfmt = "%6d %10s %10s %10d %15s\n"; + + static String[] rowhdr = {"Row", "NormRanks", "NatRanks", "Boundaries", "DeltaItems"}; + static String rowhdrfmt = "%5s %12s %12s %12s %12s\n"; + static String rowdfmt = "%5d %12.8f %12d %12s %12d\n"; + + static String[] rowhdr2 = {"Row", "NormRanks", "NatRanks", "Boundaries"}; + static String rowhdrfmt2= "%5s %12s %12s %12s\n"; + static String rowdfmt2 = "%5d %12.8f %12d %12s\n"; + + //@Test //visual only + public void checkWithSkew() { + int n = 2050; + int k = 1 << 15; + int n2 = 200; + int totalN = n + n2; + int numDigits = digits(totalN); + long v2 = 1000L; + int numParts = 22; + QuantileSearchCriteria searchCrit = QuantileSearchCriteria.INCLUSIVE; + ItemsSketch sk = ItemsSketch.getInstance(String.class,k, Comparator.naturalOrder()); + + for (long i = 1; i <= n; i++) { sk.update(getString(i, numDigits)); } + for (long i = 1; i <= n2; i++) { sk.update(getString(v2, numDigits)); } + ItemsSketchSortedView sv = sk.getSortedView(); + GenericSortedViewIterator itr = sv.iterator(); + println("SORTED VIEW:"); + printf(rowhdrfmt2, (Object[])rowhdr2); + int j = 0; + while (itr.next()) 
{ + printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), itr.getNaturalRank(searchCrit), itr.getQuantile()); + } + + GenericPartitionBoundaries gpb = sv.getPartitionBoundaries(numParts, searchCrit); + int arrLen = gpb.getBoundaries().length; + double[] normRanks = gpb.getNormalizedRanks(); + long[] natRanks = gpb.getNaturalRanks(); + String[] boundaries = gpb.getBoundaries(); + long[] numDeltaItems = gpb.getNumDeltaItems(); + println(""); + println("GET PARTITION BOUNDARIES:"); + printf(hdrfmt, (Object[]) hdr); + printf(hdrdfmt, totalN, gpb.getMaxItem(), gpb.getMinItem(), numParts, searchCrit.toString()); + println(""); + printf(rowhdrfmt, (Object[]) rowhdr); + for (int i = 0; i < arrLen; i++) { + printf(rowdfmt, i, normRanks[i], natRanks[i], boundaries[i], numDeltaItems[i]); + } + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to print + */ + private static final void print(final Object o) { + if (enablePrinting) { System.out.print(o.toString()); } + } + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } + + +} diff --git a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java index 5f4c4c753..df151c8ce 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java @@ -77,7 +77,6 @@ */ public class CrossCheckQuantilesTest { private ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe(); - private final String minItem = "10"; private final Comparator comparator = 
Comparator.naturalOrder(); private final static int k = 32; //all sketches are in exact mode @@ -121,6 +120,14 @@ public class CrossCheckQuantilesTest { {2,1,2,1,2,1,2,1} }; + final float[] svMaxFValues = { 10, 10, 40, 50, 40 }; + final float[] svMinFValues = { 10, 10, 10, 10, 10 }; + final double[] svMaxDValues = { 10, 10, 40, 50, 40 }; + final double[] svMinDValues = { 10, 10, 10, 10, 10 }; + final String[] svMaxIValues = { "10", "10", "40", "50", "40" }; + final String[] svMinIValues = { "10", "10", "10", "10", "10" }; + + int numSets; long[][] svCumWeights; @@ -329,32 +336,44 @@ private void buildSketches(int set) { /*******BUILD & LOAD SVs***********/ private void buildSVs(int set) throws Exception { - reqFloatsSV = getRawReqSV(svFValues[set], svCumWeights[set], totalN[set]); - kllFloatsSV = getRawKllFloatsSV(svFValues[set], svCumWeights[set], totalN[set]); - kllDoublesSV = getRawKllDoublesSV(svDValues[set], svCumWeights[set], totalN[set]); - classicDoublesSV = getRawClassicDoublesSV(svDValues[set], svCumWeights[set], totalN[set]); - kllItemsSV = new KllItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], minItem, comparator); - itemsSV = new ItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], comparator); + reqFloatsSV = getRawReqSV(svFValues[set], svCumWeights[set], totalN[set], + svMaxFValues[set], svMinFValues[set]); + kllFloatsSV = getRawKllFloatsSV(svFValues[set], svCumWeights[set], totalN[set], + svMaxFValues[set], svMinFValues[set]); + kllDoublesSV = getRawKllDoublesSV(svDValues[set], svCumWeights[set], totalN[set], + svMaxDValues[set], svMinDValues[set]); + classicDoublesSV = getRawClassicDoublesSV(svDValues[set], svCumWeights[set], totalN[set], + svMaxDValues[set], svMinDValues[set]); + String svImax = svIValues[set][svIValues[set].length - 1]; + String svImin = svIValues[set][0]; + kllItemsSV = new KllItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], + comparator, svImax, 
svImin); + itemsSV = new ItemsSketchSortedViewString(svIValues[set], svCumWeights[set], totalN[set], + comparator, svImax, svImin); } private final static ReqSketchSortedView getRawReqSV( - final float[] values, final long[] cumWeights, final long totalN) throws Exception { - return (ReqSketchSortedView) REQ_SV_CTOR.newInstance(values, cumWeights, totalN); + final float[] values, final long[] cumWeights, final long totalN, final float maxItem, final float minItem) + throws Exception { + return (ReqSketchSortedView) REQ_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static KllFloatsSketchSortedView getRawKllFloatsSV( - final float[] values, final long[] cumWeights, final long totalN) throws Exception { - return (KllFloatsSketchSortedView) KLL_FLOATS_SV_CTOR.newInstance(values, cumWeights, totalN); + final float[] values, final long[] cumWeights, final long totalN, final float maxItem, final float minItem) + throws Exception { + return (KllFloatsSketchSortedView) KLL_FLOATS_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static KllDoublesSketchSortedView getRawKllDoublesSV( - final double[] values, final long[] cumWeights, final long totalN) throws Exception { - return (KllDoublesSketchSortedView) KLL_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN); + final double[] values, final long[] cumWeights, final long totalN, final double maxItem, final double minItem) + throws Exception { + return (KllDoublesSketchSortedView) KLL_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } private final static DoublesSketchSortedView getRawClassicDoublesSV( - final double[] values, final long[] cumWeights, final long totalN) throws Exception { - return (DoublesSketchSortedView) CLASSIC_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN); + final double[] values, final long[] cumWeights, final long totalN, final double maxItem, final double minItem) + throws Exception { + return 
(DoublesSketchSortedView) CLASSIC_DOUBLES_SV_CTOR.newInstance(values, cumWeights, totalN, maxItem, minItem); } /********BUILD DATA SETS**********/ diff --git a/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java new file mode 100644 index 000000000..d8eb60d56 --- /dev/null +++ b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import static java.lang.Math.ceil; +import static java.lang.Math.log; +import static org.apache.datasketches.common.Util.characterPad; + +/** + * Creates a string from a positive long value that is orderable in the + * same order as its long value. + */ +public final class LongsAsOrderableStrings { + + /** + * Converts the given long into a string with leading spaces based on the given numDigits. + * This allows the stings to be ordered as if they were longs. + * @param value the value to convert + * @param numDigits the maximum required number of total spaces for digits. 
+ * @return the given long into a string with leading spaces + */ + public static String getString(final long value, final int numDigits) { + return characterPad(Long.toString(value), numDigits, ' ', false); + } + + /** + * Converts the given String back to a long by trimming any leading or trailing spaces. + * @param value the given string to convert + * @return the given String back to a long + */ + public static long getLong(final String value) { + return Long.parseLong(value.trim()); + } + + /** + * Computes the number of digits required to display the given positive long value. + * This does not include commas or other digit separators. + * This works with longs less than 1E15. + * @param maxValue the maximum anticipated long value. + * @return the number of required display digits + */ + public static int digits(final long maxValue) { + if (maxValue <= 0) { return 1; } + return (int) ceil(log(maxValue + 1) / log(10.0)); + } + +} diff --git a/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java b/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java index b756c5da1..191629fbe 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/ReflectUtilityTest.java @@ -50,10 +50,14 @@ private ReflectUtilityTest() {} KLL_DOUBLES_SV = getClass("org.apache.datasketches.kll.KllDoublesSketchSortedView"); CLASSIC_DOUBLES_SV = getClass("org.apache.datasketches.quantiles.DoublesSketchSortedView"); - REQ_SV_CTOR = getConstructor(REQ_SV, float[].class, long[].class, long.class); - KLL_FLOATS_SV_CTOR = getConstructor(KLL_FLOATS_SV, float[].class, long[].class, long.class); - KLL_DOUBLES_SV_CTOR = getConstructor(KLL_DOUBLES_SV, double[].class, long[].class, long.class); - CLASSIC_DOUBLES_SV_CTOR = getConstructor(CLASSIC_DOUBLES_SV, double[].class, long[].class, long.class); + REQ_SV_CTOR = + getConstructor(REQ_SV, float[].class, 
long[].class, long.class, float.class, float.class); + KLL_FLOATS_SV_CTOR = + getConstructor(KLL_FLOATS_SV, float[].class, long[].class, long.class, float.class, float.class); + KLL_DOUBLES_SV_CTOR = + getConstructor(KLL_DOUBLES_SV, double[].class, long[].class, long.class, double.class, double.class); + CLASSIC_DOUBLES_SV_CTOR = + getConstructor(CLASSIC_DOUBLES_SV, double[].class, long[].class, long.class, double.class, double.class); } @Test //Example @@ -62,7 +66,7 @@ public static void checkCtr() throws Exception { long[] larr = { 1, 2, 3 }; long n = 3; ReqSketchSortedView reqSV = - (ReqSketchSortedView) REQ_SV_CTOR.newInstance(farr, larr, n); + (ReqSketchSortedView) REQ_SV_CTOR.newInstance(farr, larr, n, 10f, 30f); float q = reqSV.getQuantile(1.0, INCLUSIVE); assertEquals(q, 30f); } diff --git a/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java b/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java index eb75790e5..003a53c3b 100644 --- a/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java +++ b/src/test/java/org/apache/datasketches/req/ReqSketchSortedViewTest.java @@ -21,12 +21,12 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.apache.datasketches.quantilescommon.FloatsSortedView; import org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; -import org.testng.Assert; import org.testng.annotations.Test; /** @@ -39,13 +39,6 @@ public class ReqSketchSortedViewTest { private final int dup = 2; private final int n = numV * dup; - @Test - public void emptySketch() { - ReqSketch sketch = ReqSketch.builder().build(); - FloatsSortedViewIterator itr = sketch.getSortedView().iterator(); - Assert.assertFalse(itr.next()); - } - @Test public void twoValueSketch() { ReqSketch 
sketch = ReqSketch.builder().build(); @@ -57,8 +50,8 @@ public void twoValueSketch() { assertEquals(itr.getQuantile(), 1f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 0); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 1); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 0); + assertEquals(itr.getNaturalRank(INCLUSIVE), 1); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0); assertEquals(itr.getNormalizedRank(INCLUSIVE), 0.5); @@ -66,8 +59,8 @@ public void twoValueSketch() { assertEquals(itr.getQuantile(), 2f); assertEquals(itr.getWeight(), 1); - assertEquals(itr.getCumulativeWeight(EXCLUSIVE), 1); - assertEquals(itr.getCumulativeWeight(INCLUSIVE), 2); + assertEquals(itr.getNaturalRank(EXCLUSIVE), 1); + assertEquals(itr.getNaturalRank(INCLUSIVE), 2); assertEquals(itr.getNormalizedRank(EXCLUSIVE), 0.5); assertEquals(itr.getNormalizedRank(INCLUSIVE), 1.0); } @@ -111,9 +104,9 @@ private static void printIterator(final FloatsSortedViewIterator itr) { while (itr.next()) { float v = itr.getQuantile(); long wt = itr.getWeight(); - long cumWtNotInc = itr.getCumulativeWeight(EXCLUSIVE); + long cumWtNotInc = itr.getNaturalRank(EXCLUSIVE); double nRankNotInc = itr.getNormalizedRank(EXCLUSIVE); - long cumWtInc = itr.getCumulativeWeight(INCLUSIVE); + long cumWtInc = itr.getNaturalRank(INCLUSIVE); double nRankInc = itr.getNormalizedRank(INCLUSIVE); printf(fmt, v, wt, cumWtNotInc, nRankNotInc, cumWtInc, nRankInc); } diff --git a/src/test/java/org/apache/datasketches/req/ReqSketchTest.java b/src/test/java/org/apache/datasketches/req/ReqSketchTest.java index 4db9112a8..78b321e1d 100644 --- a/src/test/java/org/apache/datasketches/req/ReqSketchTest.java +++ b/src/test/java/org/apache/datasketches/req/ReqSketchTest.java @@ -29,6 +29,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.quantilescommon.FloatsSortedView; +import 
org.apache.datasketches.quantilescommon.FloatsSortedViewIterator; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesFloatsSketchIterator; import org.apache.datasketches.quantilescommon.QuantilesUtil; @@ -152,13 +153,13 @@ private static void checkGetRanks(final ReqSketch sk, final int max, final int i private static void checkSortedView(final ReqSketch sk, final int iDebug) { final ReqSketchSortedView sv = new ReqSketchSortedView(sk); - final ReqSketchSortedViewIterator itr = sv.iterator(); + final FloatsSortedViewIterator itr = sv.iterator(); final int retainedCount = sk.getNumRetained(); final long totalN = sk.getN(); int count = 0; long cumWt = 0; while (itr.next()) { - cumWt = itr.getCumulativeWeight(INCLUSIVE); + cumWt = itr.getNaturalRank(INCLUSIVE); count++; } assertEquals(cumWt, totalN); @@ -234,21 +235,6 @@ private static void checkMerge(final ReqSketch sk, final int iDebug) { //specific tests - @Test - public void getQuantiles() { - final ReqSketch sketch = ReqSketch.builder().setK(12).build(); - sketch.update(1); - sketch.update(2); - sketch.update(3); - sketch.update(4); - float[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - float[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).boundaries; - assertEquals(quantiles1, quantiles2); - } - @Test public void merge() { final ReqSketch s1 = ReqSketch.builder().setK(12).build(); From 4747f67f0363d42694bb86b37b9a44de0ca1e383 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Nov 2023 14:23:36 -0800 Subject: [PATCH 05/13] Fix javadoc issues. 
--- .../quantilescommon/GenericPartitionBoundaries.java | 12 ++++++------ .../quantilescommon/PartitioningFeature.java | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java index 733f7846d..4db851460 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java @@ -83,17 +83,17 @@ public GenericPartitionBoundaries( *

<p>If these results were computed using QuantileSearchCriteria.INCLUSIVE then these sequential boundaries
 * are to be interpreted as follows:
 * <ul>
- * <li>Partition 1: include all items >= index 0 and <= index 1.</li>
- * <li>Partition 2: include all items > index 1 and <= index 2.</li>
- * <li>Partition N: include all items > index N-1 and <= index N.</li>
+ * <li>Partition 1: include all items &ge; index 0 and &le; index 1.</li>
+ * <li>Partition 2: include all items &gt; index 1 and &le; index 2.</li>
+ * <li>Partition N: include all items &gt; index N-1 and &le; index N.</li>
 * </ul>
 *
 * <p>If these results were computed using QuantileSearchCriteria.EXCLUSIVE then these sequential boundaries
 * are to be interpreted as follows:
 * <ul>
- * <li>Partition 1: include all items >= index 0 and < index 1.</li>
- * <li>Partition 2: include all items >= index 1 and < index 2.</li>
- * <li>Partition N: include all items >= index N-1 and <= index N.</li>
+ * <li>Partition 1: include all items &ge; index 0 and &lt; index 1.</li>
+ * <li>Partition 2: include all items &ge; index 1 and &lt; index 2.</li>
+ * <li>Partition N: include all items &ge; index N-1 and &le; index N.</li>
 * </ul>
    * * @return an array of boundaries that sequentially define the upper and lower boundaries of partitions. diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java index 3ff51a3b4..2c36bb10a 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java @@ -37,7 +37,8 @@ public interface PartitioningFeature { *

    * * @param numEquallySized an integer that specifies the number of equally sized partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and + * {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}. * This must be a positive integer greater than zero. *
      *
    • A 1 will return: minItem, maxItem.
    • @@ -60,7 +61,8 @@ default GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized * refers to an approximately equal number of items per partition. * * @param numEquallySized an integer that specifies the number of equally sized partitions between - * {@link #getMinItem() getMinItem()} and {@link #getMaxItem() getMaxItem()}. + * {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and + * {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}. * This must be a positive integer greater than zero. *
        *
      • A 1 will return: minItem, maxItem.
      • From 9237a28d66d826c099c6d8d0e142662f732525b1 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Nov 2023 15:16:05 -0800 Subject: [PATCH 06/13] Fixed the CodeQL issues. --- .../datasketches/partitions/Partitioner.java | 20 +++++++++---------- .../partitions/ClassicPartitionsTest.java | 2 +- .../ItemsSketchFillRequestLongAsString.java | 8 ++++++-- ...KllItemsSketchFillRequestLongAsString.java | 8 ++++++-- .../partitions/KllPartitionsTest.java | 2 +- .../LongsAsOrderableStrings.java | 8 +++++++- 6 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java index 65577385a..3816d4210 100644 --- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -41,8 +41,8 @@ /** * A partitioning process that can partition very large data sets into thousands to millions * of partitions of approximately the same size. - * @param T the data type - * @param S the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. + * @param the data type + * @param the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. 
*/ //@SuppressWarnings("unused") public class Partitioner & PartitioningFeature> { @@ -108,7 +108,7 @@ public List> partition(final S sk) { final int partsPerSk = (int)round(pow(guessNumParts, 1.0 / numLevels)); this.partitionsPerSk = min(partsPerSk, maxPartsPerSk); final GenericPartitionBoundaries gpb = sk.getPartitionBoundaries(partitionsPerSk, criteria); - final StackElement se = new StackElement<>(gpb, stack.size() + 1, 0, "1"); + final StackElement se = new StackElement<>(gpb, 0, "1"); stack.push(se); partitionSearch(stack); return finalPartitionList; @@ -136,8 +136,8 @@ private void partitionSearch(final Stack> stack) { final S sk = fillReq.getRange(row.lowerBound, row.upperBound, row.rule); final GenericPartitionBoundaries gpb2 = sk.getPartitionBoundaries(this.partitionsPerSk, criteria); final int level = stack.size() + 1; - final String partId = se.partId + "." + se.part + "," + level; - final StackElement se2 = new StackElement<>(gpb2, level, 0, partId); + final String partId = se.levelPartId + "." 
+ se.part + "," + level; + final StackElement se2 = new StackElement<>(gpb2, 0, partId); stack.push(se2); partitionSearch(stack); } @@ -156,12 +156,12 @@ private void partitionSearch(final Stack> stack) { public static class StackElement { public final GenericPartitionBoundaries gpb; public int part; - public String partId; + public String levelPartId; - public StackElement(final GenericPartitionBoundaries gpb, final int level, final int part, final String partId) { + public StackElement(final GenericPartitionBoundaries gpb, final int part, final String levelPartId) { this.gpb = gpb; this.part = part; - this.partId = partId; + this.levelPartId = levelPartId; } } @@ -170,7 +170,7 @@ public StackElement(final GenericPartitionBoundaries gpb, final int level, fi */ public static class PartitionBoundsRow { public int part; - public String partId; + public String levelPartId; public long approxNumDeltaItems; public BoundsRule rule; public T lowerBound; @@ -179,7 +179,7 @@ public static class PartitionBoundsRow { public PartitionBoundsRow(final StackElement se) { final GenericPartitionBoundaries gpb = se.gpb; this.part = se.part; - this.partId = se.partId + "." + part; + this.levelPartId = se.levelPartId + "." 
+ part; final QuantileSearchCriteria searchCrit = gpb.getSearchCriteria(); final T[] boundaries = gpb.getBoundaries(); final int numParts = gpb.getNumPartitions(); diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index f26031465..83c3a8158 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -82,7 +82,7 @@ void outputList(final List> list) { double sumSqErr = 0; for (int i = 0; i < numParts; i++) { final PartitionBoundsRow row = list.get(i); - printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); size = row.approxNumDeltaItems; sumSizes += size; sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); diff --git a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java index 2b966051f..c1a33d7a5 100644 --- a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java +++ b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java @@ -27,6 +27,7 @@ import java.util.Comparator; import java.util.Random; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantiles.ItemsSketch; /** @@ -52,8 +53,11 @@ public ItemsSketchFillRequestLongAsString(final int k, final long totalN) { public ItemsSketch getRange(final String lowerQuantile, final String upperQuantile, final BoundsRule bounds) { final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); - final long lower = Long.parseLong(lowerQuantile.trim()); - final long 
upper = Long.parseLong(upperQuantile.trim()); + long upper, lower; + try { + lower = Long.parseLong(lowerQuantile.trim()); + upper = Long.parseLong(upperQuantile.trim()); + } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } if (bounds == INCLUDE_BOTH) { for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } } else if (bounds == INCLUDE_UPPER) { diff --git a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java index 53d80190f..41d3f6569 100644 --- a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java +++ b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java @@ -28,6 +28,7 @@ import java.util.Random; import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.kll.KllItemsSketch; /** @@ -53,8 +54,11 @@ public KllItemsSketchFillRequestLongAsString(final int k, final long totalN) { public KllItemsSketch getRange(final String lowerQuantile, final String upperQuantile, final BoundsRule bounds) { KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); - long lower = Long.parseLong(lowerQuantile.trim()); - long upper = Long.parseLong(upperQuantile.trim()); + long upper, lower; + try { + lower = Long.parseLong(lowerQuantile.trim()); + upper = Long.parseLong(upperQuantile.trim()); + } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } if (bounds == INCLUDE_BOTH) { for (long i = lower; i <= upper; i++) { sk.update(getString(i, numDigits)); } } else if (bounds == INCLUDE_UPPER) { diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java 
b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java index 3b44d9988..f814c98db 100644 --- a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -82,7 +82,7 @@ void outputList(final List> list) { double sumSqErr = 0; for (int i = 0; i < numParts; i++) { final PartitionBoundsRow row = list.get(i); - printf(dFmt, row.partId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); size = row.approxNumDeltaItems; sumSizes += size; sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); diff --git a/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java index d8eb60d56..10d5c9073 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/LongsAsOrderableStrings.java @@ -23,6 +23,8 @@ import static java.lang.Math.log; import static org.apache.datasketches.common.Util.characterPad; +import org.apache.datasketches.common.SketchesArgumentException; + /** * Creates a string from a positive long value that is orderable in the * same order as its long value. 
@@ -46,7 +48,11 @@ public static String getString(final long value, final int numDigits) { * @return the given String back to a long */ public static long getLong(final String value) { - return Long.parseLong(value.trim()); + long out; + try { out = Long.parseLong(value.trim()); } catch (NumberFormatException e) { + throw new SketchesArgumentException(e.toString()); + } + return out; } /** From c57b3613fe0d7fd67e531fbc8e3d5882009924fc Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Nov 2023 17:05:42 -0800 Subject: [PATCH 07/13] added Maximum absolute error % --- .../partitions/ClassicPartitionsTest.java | 30 +++++++++++-------- .../partitions/KllPartitionsTest.java | 29 +++++++++++------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index 83c3a8158..38bc22e85 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -19,13 +19,15 @@ package org.apache.datasketches.partitions; +import static java.lang.Math.abs; +import static java.lang.Math.max; +import static java.lang.Math.sqrt; import static org.apache.datasketches.common.Util.milliSecToString; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.List; -import org.apache.datasketches.partitions.Partitioner; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.apache.datasketches.quantiles.ItemsSketch; import org.testng.annotations.Test; @@ -62,9 +64,9 @@ public void checkClassicPartitioner() { final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; final long totalTime_mS = endTime_mS - startTime_mS; println(""); - println("FillInitialSketchTime: " 
+ milliSecToString(fillInitialSketchTime_mS)); - println("PartioningTime : " + milliSecToString(partitioningTime_mS)); - println("Total Time : " + milliSecToString(totalTime_mS)); + println("FillInitialSketchTime : " + milliSecToString(fillInitialSketchTime_mS)); + println("PartioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); } private static final String[] hdr = @@ -80,24 +82,28 @@ void outputList(final List> list) { double sumSizes = 0; double sumAbsRelErr = 0; double sumSqErr = 0; + double maxAbsErr = 0; for (int i = 0; i < numParts; i++) { final PartitionBoundsRow row = list.get(i); printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); size = row.approxNumDeltaItems; sumSizes += size; - sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); - final double absErr = size - meanPartSize; + sumAbsRelErr += abs(size / meanPartSize - 1.0); + final double absErr = abs(size - meanPartSize); sumSqErr += absErr * absErr; + maxAbsErr= max(absErr, maxAbsErr); } final double meanAbsRelErr = sumAbsRelErr / numParts; final double meanSqErr = sumSqErr / numParts; //intermediate value final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value - final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE - - printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); - printf("Mean Partition Size :%,20.1f\n",meanPartSize); - printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); - printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + final double rmsRelErr = sqrt(normMeanSqErr); //a.k.a. 
Normalized RMS Error or NRMSE + final double maxAbsErrFraction = maxAbsErr / meanPartSize; + + printf("Total ApproxNumItems :%,20d\n", (long)sumSizes); + printf("Mean Partition Size :%,20.1f\n", meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n", meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n", rmsRelErr * 100); + printf("Max Abs Error Fraction:%20.3f%%\n", maxAbsErrFraction * 100); } private final static boolean enablePrinting = true; diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java index f814c98db..cba50a175 100644 --- a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -19,6 +19,9 @@ package org.apache.datasketches.partitions; +import static java.lang.Math.abs; +import static java.lang.Math.max; +import static java.lang.Math.sqrt; import static org.apache.datasketches.common.Util.milliSecToString; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; @@ -62,9 +65,9 @@ public void checkKllPartitioner() { final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; final long totalTime_mS = endTime_mS - startTime_mS; println(""); - println("FillInitialSketchTime: " + milliSecToString(fillInitialSketchTime_mS)); - println("PartioningTime : " + milliSecToString(partitioningTime_mS)); - println("Total Time : " + milliSecToString(totalTime_mS)); + println("FillInitialSketchTime : " + milliSecToString(fillInitialSketchTime_mS)); + println("PartioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS)); } private static final String[] hdr = @@ -80,24 +83,28 @@ void outputList(final List> list) { double sumSizes = 0; double sumAbsRelErr = 0; double sumSqErr = 0; + 
double maxAbsErr = 0; for (int i = 0; i < numParts; i++) { final PartitionBoundsRow row = list.get(i); printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); size = row.approxNumDeltaItems; sumSizes += size; - sumAbsRelErr += Math.abs(size / meanPartSize - 1.0); - final double absErr = size - meanPartSize; + sumAbsRelErr += abs(size / meanPartSize - 1.0); + final double absErr = abs(size - meanPartSize); sumSqErr += absErr * absErr; + maxAbsErr= max(absErr, maxAbsErr); } final double meanAbsRelErr = sumAbsRelErr / numParts; final double meanSqErr = sumSqErr / numParts; //intermediate value final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value - final double rmsRelErr = Math.sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE - - printf("Total ApproxNumItems :%,20d\n",(long)sumSizes); - printf("Mean Partition Size :%,20.1f\n",meanPartSize); - printf("Mean Abs Rel Error :%20.3f%%\n",meanAbsRelErr * 100); - printf("Norm RMS Error :%20.3f%%\n",rmsRelErr * 100); + final double rmsRelErr = sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE + final double maxAbsErrFraction = maxAbsErr / meanPartSize; + + printf("Total ApproxNumItems :%,20d\n", (long)sumSizes); + printf("Mean Partition Size :%,20.1f\n", meanPartSize); + printf("Mean Abs Rel Error :%20.3f%%\n", meanAbsRelErr * 100); + printf("Norm RMS Error :%20.3f%%\n", rmsRelErr * 100); + printf("Max Abs Error Fraction:%20.3f%%\n", maxAbsErrFraction * 100); } private final static boolean enablePrinting = true; From 56e3bd9c35d16e3a8e6e3d604e20ce28d82bc1a0 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 17 Nov 2023 15:12:13 -0800 Subject: [PATCH 08/13] Improved the user access to the two partition test classes. 1. providing a main(...) 2. Initiate via TestNG 3. Direct programmatic access. I also split out the reporting method as it was duplicate code. 
--- .../datasketches/partitions/Partitioner.java | 2 +- .../partitions/ClassicPartitionsTest.java | 142 ++++++++--------- .../ItemsSketchFillRequestLongAsString.java | 42 ----- ...KllItemsSketchFillRequestLongAsString.java | 41 ----- .../partitions/KllPartitionsTest.java | 145 ++++++++---------- .../partitions/PartitionResults.java | 111 ++++++++++++++ 6 files changed, 235 insertions(+), 248 deletions(-) create mode 100644 src/test/java/org/apache/datasketches/partitions/PartitionResults.java diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java index 3816d4210..fce0becfc 100644 --- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -39,7 +39,7 @@ import org.apache.datasketches.quantilescommon.Stack; /** - * A partitioning process that can partition very large data sets into thousands to millions + * A partitioning process that can partition very large data sets into thousands * of partitions of approximately the same size. * @param the data type * @param the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. 
diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index 38bc22e85..64bf9823c 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -19,33 +19,70 @@ package org.apache.datasketches.partitions; -import static java.lang.Math.abs; -import static java.lang.Math.max; -import static java.lang.Math.sqrt; -import static org.apache.datasketches.common.Util.milliSecToString; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.List; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.apache.datasketches.quantiles.ItemsSketch; import org.testng.annotations.Test; +/** + * This classic quantiles sketch partitioner example application uses Strings formatted as numbers. + * The length of each string is the number of characters required to display the decimal digits of N, + * the number of elements of the entire set of data to be partitioned. + * As a result, there is a lot of overhead in string processing. + * Nevertheless, real applications of the approach outlined here, would have a lot of IO overhead that this simple + * test example does not have. + */ @SuppressWarnings("unused") public class ClassicPartitionsTest { - private final int k = 1 << 15; - private final long totalN = 100_000_000L; - private final long tgtPartitionSize = (long)3e6; - private final int maxPartsPerSk = 100; - //@Test + /** + * Launch the partitioner as an application with the following arguments as strings: + *
          + *
        • arg[0]: int k, the size of the sketch
        • + *
        • arg[1]: long totalN, the total size, in elements, of the data set to parse.
        • + *
        • arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.
        • + *
        • arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • + *
        + * @param args input arguments as defined above. + */ + public void main(String[] args) { + final int k, maxPartsPerSk; + final long totalN, tgtPartitionSize; + try { + k = Integer.parseInt(args[0].trim()); + totalN = Long.parseLong(args[1].trim()); + tgtPartitionSize = Long.parseLong(args[2].trim()); + maxPartsPerSk = Integer.parseInt(args[3].trim()); + } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } + classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + } + + //@Test //launch from TestNG public void checkClassicPartitioner() { - println("Classic ItemsSketch Partitions Test"); - printf("Sketch K :%,20d\n", k); - printf("Total N :%,20d\n", totalN); - printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); - printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + final int k = 1 << 15; + final long totalN = 30_000_000L; //artificially set low so it will execute fast + final long tgtPartitionSize = 3_000_000L; + final int maxPartsPerSk = 100; + classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + } + + /** + * Programmatic call to classic Partitioner + * @param k the size of the sketch. + * @param totalN the total size, in elements, of the data set to parse. + * @param tgtPartitionSize the target number of elements per resulting partition. + * @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch. 
+ */ + public void classicPartitioner( + final int k, + final long totalN, + final long tgtPartitionSize, + final int maxPartsPerSk) { final long startTime_mS = System.currentTimeMillis(); final ItemsSketchFillRequestLongAsString fillReq = new ItemsSketchFillRequestLongAsString(k, totalN); @@ -57,77 +94,20 @@ public void checkClassicPartitioner() { fillReq, INCLUSIVE); final List> list = partitioner.partition(sk); - outputList(list); - final long endTime_mS = System.currentTimeMillis(); final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; final long totalTime_mS = endTime_mS - startTime_mS; - println(""); - println("FillInitialSketchTime : " + milliSecToString(fillInitialSketchTime_mS)); - println("PartioningTime : " + milliSecToString(partitioningTime_mS)); - println("Total Time : " + milliSecToString(totalTime_mS)); - } - - private static final String[] hdr = - { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; - private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; - private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; - - void outputList(final List> list) { - printf(hdrFmt, (Object[]) hdr); - final int numParts = list.size(); - final double meanPartSize = (double)totalN / numParts; - double size = 0; - double sumSizes = 0; - double sumAbsRelErr = 0; - double sumSqErr = 0; - double maxAbsErr = 0; - for (int i = 0; i < numParts; i++) { - final PartitionBoundsRow row = list.get(i); - printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); - size = row.approxNumDeltaItems; - sumSizes += size; - sumAbsRelErr += abs(size / meanPartSize - 1.0); - final double absErr = abs(size - meanPartSize); - sumSqErr += absErr * absErr; - maxAbsErr= max(absErr, maxAbsErr); - } - final double meanAbsRelErr = sumAbsRelErr / numParts; - final 
double meanSqErr = sumSqErr / numParts; //intermediate value - final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value - final double rmsRelErr = sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE - final double maxAbsErrFraction = maxAbsErr / meanPartSize; - - printf("Total ApproxNumItems :%,20d\n", (long)sumSizes); - printf("Mean Partition Size :%,20.1f\n", meanPartSize); - printf("Mean Abs Rel Error :%20.3f%%\n", meanAbsRelErr * 100); - printf("Norm RMS Error :%20.3f%%\n", rmsRelErr * 100); - printf("Max Abs Error Fraction:%20.3f%%\n", maxAbsErrFraction * 100); - } - - private final static boolean enablePrinting = true; - - /** - * @param o the Object to print - */ - private static final void print(final Object o) { - if (enablePrinting) { System.out.print(o.toString()); } - } - - /** - * @param o the Object to println - */ - private static final void println(final Object o) { - if (enablePrinting) { System.out.println(o.toString()); } - } - - /** - * @param format the format - * @param args the args - */ - private static final void printf(final String format, final Object ...args) { - if (enablePrinting) { System.out.printf(format, args); } + PartitionResults.output( + "Classic", + list, + k, + totalN, + tgtPartitionSize, + maxPartsPerSk, + fillInitialSketchTime_mS, + partitioningTime_mS, + totalTime_mS); } } diff --git a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java index c1a33d7a5..9f3cfb9fb 100644 --- a/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java +++ b/src/test/java/org/apache/datasketches/partitions/ItemsSketchFillRequestLongAsString.java @@ -25,7 +25,6 @@ import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; import java.util.Comparator; -import java.util.Random; import 
org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantiles.ItemsSketch; @@ -37,7 +36,6 @@ public class ItemsSketchFillRequestLongAsString implements SketchFillRequest> { private int k; private int numDigits; - private Random rand = new Random(); public ItemsSketchFillRequestLongAsString() { k = 1 << 10; @@ -82,44 +80,4 @@ public ItemsSketch getRange(final long lowerQuantile, final long upperQu return sk; } - public ItemsSketch getRangeRandom(final long lowerQuantile, final long upperQuantile, - final BoundsRule bounds) { - final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); - final long lower = lowerQuantile; - final long upper = upperQuantile; - this.rand = new Random(); - if (bounds == INCLUDE_BOTH) { - for (long i = lower; i <= upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } else if (bounds == INCLUDE_UPPER) { - for (long i = lower + 1; i <= upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } else { //INCLUDE_LOWER - for (long i = lower; i < upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } - return sk; - } - - private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { - final double r = rand.nextDouble(); - final long range; - final long offset; - if (bounds == INCLUDE_BOTH) { - range = ub - lb; - offset = lb; - } - else if (bounds == INCLUDE_UPPER) { - range = ub - lb - 1; - offset = lb + 1; - } else { //INCLUDE_LOWER - range = ub - lb - 1; - offset = lb; - } - return Math.round(r * range + offset); - } - } diff --git a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java index 41d3f6569..ab6f487e6 100644 --- 
a/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java +++ b/src/test/java/org/apache/datasketches/partitions/KllItemsSketchFillRequestLongAsString.java @@ -25,7 +25,6 @@ import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; import java.util.Comparator; -import java.util.Random; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.common.SketchesArgumentException; @@ -38,7 +37,6 @@ public class KllItemsSketchFillRequestLongAsString implements SketchFillRequest> { private int k; private int numDigits; - private Random rand = new Random(); public KllItemsSketchFillRequestLongAsString() { k = 1 << 10; @@ -83,43 +81,4 @@ public KllItemsSketch getRange(final long lowerQuantile, final long uppe return sk; } - public KllItemsSketch getRangeRandom(final long lowerQuantile, final long upperQuantile, final BoundsRule bounds) { - KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), new ArrayOfStringsSerDe()); - long lower = lowerQuantile; - long upper = upperQuantile; - this.rand = new Random(); - if (bounds == INCLUDE_BOTH) { - for (long i = lower; i <= upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } else if (bounds == INCLUDE_UPPER) { - for (long i = lower + 1; i <= upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } else { //INCLUDE_LOWER - for (long i = lower; i < upper; i++) { - sk.update(getString(randBetween(lowerQuantile, upperQuantile, bounds), numDigits)); - } - } - return sk; - } - - private final long randBetween(final long lb, final long ub, final BoundsRule bounds) { - double r = rand.nextDouble(); - long range; - long offset; - if (bounds == INCLUDE_BOTH) { - range = ub - lb; - offset = lb; - } - else if (bounds == INCLUDE_UPPER) { - range = ub - lb - 1; - offset = lb + 1; - } else { //INCLUDE_LOWER - range = 
ub - lb - 1; - offset = lb; - } - return Math.round(r * range + offset); - } - } diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java index cba50a175..50eff5777 100644 --- a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -19,34 +19,70 @@ package org.apache.datasketches.partitions; -import static java.lang.Math.abs; -import static java.lang.Math.max; -import static java.lang.Math.sqrt; -import static org.apache.datasketches.common.Util.milliSecToString; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.List; -import org.apache.datasketches.partitions.Partitioner; -import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.kll.KllItemsSketch; +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.testng.annotations.Test; +/** + * This KLL quantiles sketch partitioner example application uses Strings formatted as numbers. + * The length of each string is the number of characters required to display the decimal digits of N, + * the number of elements of the entire set of data to be partitioned. + * As a result, there is a lot of overhead in string processing. + * Nevertheless, real applications of the approach outlined here, would have a lot of IO overhead that this simple + * test example does not have. 
+ */ @SuppressWarnings("unused") public class KllPartitionsTest { - private final int k = 1 << 15; - private final long totalN = 100_000_000L; - private final long tgtPartitionSize = (long)3e6; - private final int maxPartsPerSk = 100; - //@Test + /** + * Launch the partitioner as an application with the following arguments as strings: + *
          + *
        • arg[0]: int k, the size of the sketch
        • + *
        • arg[1]: long totalN, the total size, in elements, of the data set to parse.
        • + *
        • arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.
        • + *
        • arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • + *
        + * @param args input arguments as defined above. + */ + public void main(String[] args) { + final int k, maxPartsPerSk; + final long totalN, tgtPartitionSize; + try { + k = Integer.parseInt(args[0].trim()); + totalN = Long.parseLong(args[1].trim()); + tgtPartitionSize = Long.parseLong(args[2].trim()); + maxPartsPerSk = Integer.parseInt(args[3].trim()); + } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } + kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + } + + //@Test //launch from TestNG public void checkKllPartitioner() { - println("KllItemsSketch Partitions Test"); - printf("Sketch K :%,20d\n", k); - printf("Total N :%,20d\n", totalN); - printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); - printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + final int k = 1 << 15; + final long totalN = 30_000_000L; //artificially set low so it will execute fast + final long tgtPartitionSize = 3_000_000L; + final int maxPartsPerSk = 100; + kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + } + + /** + * Programmatic call to KLL Partitioner + * @param k the size of the sketch. + * @param totalN the total size, in elements, of the data set to parse. + * @param tgtPartitionSize the target number of elements per resulting partition. + * @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch. 
+ */ + public void kllPartitioner( + final int k, + final long totalN, + final long tgtPartitionSize, + final int maxPartsPerSk) { final long startTime_mS = System.currentTimeMillis(); final KllItemsSketchFillRequestLongAsString fillReq = new KllItemsSketchFillRequestLongAsString(k, totalN); @@ -58,77 +94,20 @@ public void checkKllPartitioner() { fillReq, INCLUSIVE); final List> list = partitioner.partition(sk); - outputList(list); - final long endTime_mS = System.currentTimeMillis(); final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; final long partitioningTime_mS = endTime_mS - endFillInitialSketchTime_mS; final long totalTime_mS = endTime_mS - startTime_mS; - println(""); - println("FillInitialSketchTime : " + milliSecToString(fillInitialSketchTime_mS)); - println("PartioningTime : " + milliSecToString(partitioningTime_mS)); - println("Total Time : " + milliSecToString(totalTime_mS)); - } - - private static final String[] hdr = - { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; - private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; - private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; - - void outputList(final List> list) { - printf(hdrFmt, (Object[]) hdr); - final int numParts = list.size(); - final double meanPartSize = (double)totalN / numParts; - double size = 0; - double sumSizes = 0; - double sumAbsRelErr = 0; - double sumSqErr = 0; - double maxAbsErr = 0; - for (int i = 0; i < numParts; i++) { - final PartitionBoundsRow row = list.get(i); - printf(dFmt, row.levelPartId , (i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); - size = row.approxNumDeltaItems; - sumSizes += size; - sumAbsRelErr += abs(size / meanPartSize - 1.0); - final double absErr = abs(size - meanPartSize); - sumSqErr += absErr * absErr; - maxAbsErr= max(absErr, maxAbsErr); - } - final double meanAbsRelErr = sumAbsRelErr / numParts; - final double 
meanSqErr = sumSqErr / numParts; //intermediate value - final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value - final double rmsRelErr = sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE - final double maxAbsErrFraction = maxAbsErr / meanPartSize; - - printf("Total ApproxNumItems :%,20d\n", (long)sumSizes); - printf("Mean Partition Size :%,20.1f\n", meanPartSize); - printf("Mean Abs Rel Error :%20.3f%%\n", meanAbsRelErr * 100); - printf("Norm RMS Error :%20.3f%%\n", rmsRelErr * 100); - printf("Max Abs Error Fraction:%20.3f%%\n", maxAbsErrFraction * 100); - } - - private final static boolean enablePrinting = true; - - /** - * @param o the Object to print - */ - private static final void print(final Object o) { - if (enablePrinting) { System.out.print(o.toString()); } - } - - /** - * @param o the Object to println - */ - private static final void println(final Object o) { - if (enablePrinting) { System.out.println(o.toString()); } - } - - /** - * @param format the format - * @param args the args - */ - private static final void printf(final String format, final Object ...args) { - if (enablePrinting) { System.out.printf(format, args); } + PartitionResults.output( + "KLL", + list, + k, + totalN, + tgtPartitionSize, + maxPartsPerSk, + fillInitialSketchTime_mS, + partitioningTime_mS, + totalTime_mS); } } diff --git a/src/test/java/org/apache/datasketches/partitions/PartitionResults.java b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java new file mode 100644 index 000000000..501820d78 --- /dev/null +++ b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.partitions; + +import static java.lang.Math.abs; +import static java.lang.Math.max; +import static java.lang.Math.sqrt; +import static org.apache.datasketches.common.Util.milliSecToString; + +import java.util.List; + +import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; + +/** + * Output partitioning results to console. + */ +public class PartitionResults { + private final static String LS = System.getProperty("line.separator"); + private static final String[] hdr = + { "Level.Part", "Partition", "LowerBound", "UpperBound", "ApproxNumItems", "Include Rule" }; + private static final String hdrFmt = "%15s %10s %15s %15s %15s %15s\n"; + private static final String dFmt = "%15s %10d %15s %15s %15d %15s\n"; + + public static void output( + final String sketchType, + final List> list, + final int k, + final long totalN, + final long tgtPartitionSize, + final int maxPartsPerSk, + final long fillInitialSketchTime_mS, + final long partitioningTime_mS, + final long totalTime_mS) { + printf(hdrFmt, (Object[]) hdr); + final int numParts = list.size(); + final double meanPartSize = (double)totalN / numParts; + double size = 0; + double sumSizes = 0; + double sumAbsRelErr = 0; + double sumSqErr = 0; + double maxAbsErr = 0; + for (int i = 0; i < numParts; i++) { + final PartitionBoundsRow row = list.get(i); + printf(dFmt, row.levelPartId , 
(i + 1), row.lowerBound, row.upperBound, row.approxNumDeltaItems, row.rule.name()); + size = row.approxNumDeltaItems; + sumSizes += size; + sumAbsRelErr += abs(size / meanPartSize - 1.0); + final double absErr = abs(size - meanPartSize); + sumSqErr += absErr * absErr; + maxAbsErr= max(absErr, maxAbsErr); + } + final double meanAbsRelErr = sumAbsRelErr / numParts; + final double meanSqErr = sumSqErr / numParts; //intermediate value + final double normMeanSqErr = meanSqErr / (meanPartSize * meanPartSize); //intermediate value + final double rmsRelErr = sqrt(normMeanSqErr); //a.k.a. Normalized RMS Error or NRMSE + final double maxAbsErrFraction = maxAbsErr / meanPartSize; + + println(LS + sketchType +" ItemsSketch Partitions Test"); + println(LS + "INPUT:"); + printf("Sketch K :%,20d\n", k); + printf("Total N :%,20d\n", totalN); + printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); + printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk); + + println(LS + "STATISTICS:"); + printf( "Total ApproxNumItems :%,20d\n", (long)sumSizes); + printf( "Mean Partition Size :%,20.1f\n", meanPartSize); + printf( "Mean Abs Rel Error :%20.3f%%\n", meanAbsRelErr * 100); + printf( "Norm RMS Error :%20.3f%%\n", rmsRelErr * 100); + printf( "Max Abs Error Percent :%20.3f%%\n", maxAbsErrFraction * 100); + + println(LS + "TIMINGS:"); + println("FillInitialSketchTime : " + milliSecToString(fillInitialSketchTime_mS)); + println("PartioningTime : " + milliSecToString(partitioningTime_mS)); + println("Total Time : " + milliSecToString(totalTime_mS) + LS); + } + + private final static boolean enablePrinting = true; + + /** + * @param o the Object to println + */ + private static final void println(final Object o) { + if (enablePrinting) { System.out.println(o.toString()); } + } + + /** + * @param format the format + * @param args the args + */ + private static final void printf(final String format, final Object ...args) { + if (enablePrinting) { System.out.printf(format, args); } + } +} From 
6ecbf503fd9581d586da08426944667d4a2e065b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 29 Nov 2023 15:31:03 -0800 Subject: [PATCH 09/13] This commit is the result of the first round of reviews. --- .../org/apache/datasketches/common/Util.java | 36 +++++----- .../kll/KllItemsSketchSortedView.java | 6 +- .../datasketches/partitions/Partitioner.java | 7 +- .../quantiles/ItemsSketchSortedView.java | 6 +- .../datasketches/quantilescommon/Stack.java | 68 ------------------- 5 files changed, 27 insertions(+), 96 deletions(-) delete mode 100644 src/main/java/org/apache/datasketches/quantilescommon/Stack.java diff --git a/src/main/java/org/apache/datasketches/common/Util.java b/src/main/java/org/apache/datasketches/common/Util.java index c9a749e55..f713171e6 100644 --- a/src/main/java/org/apache/datasketches/common/Util.java +++ b/src/main/java/org/apache/datasketches/common/Util.java @@ -531,39 +531,39 @@ public static double powerSeriesNextDouble(final int ppb, final double curPoint, } /** - * Returns the ceiling of a given n given a radix, where the ceiling is an integral power of the radix. - * This is the smallest positive power of radix that is equal to or greater than the given n + * Returns the ceiling of a given n given a base, where the ceiling is an integral power of the base. + * This is the smallest positive power of base that is equal to or greater than the given n * and equal to a mathematical integer. * The result of this function is consistent with {@link #ceilingIntPowerOf2(int)} for values * less than one. I.e., if n < 1, the result is 1. * - *

        The formula is: radix<sup>ceiling(log<sub>radix</sub>(x))</sup>

        + *

        The formula is: base<sup>ceiling(log<sub>base</sub>(x))</sup>

        * - * @param radix The base of the number system. + * @param base The number in the expression ⌈basen⌉. * @param n The input argument. - * @return the ceiling power of radix as a double and equal to a mathematical integer. + * @return the ceiling power of base as a double and equal to a mathematical integer. */ - public static double ceilingPowerBaseOfDouble(final double radix, final double n) { + public static double ceilingPowerBaseOfDouble(final double base, final double n) { final double x = n < 1.0 ? 1.0 : n; - return Math.round(pow(radix, ceil(logBaseOfX(radix, x)))); + return Math.round(pow(base, ceil(logBaseOfX(base, x)))); } /** - * Computes the floor of a given n given radix, where the floor is an integral power of the radix. - * This is the largest positive power of radix that is equal to or less than the given n + * Computes the floor of a given n given base, where the floor is an integral power of the base. + * This is the largest positive power of base that is equal to or less than the given n * and equal to a mathematical integer. * The result of this function is consistent with {@link #floorPowerOf2(int)} for values * less than one. I.e., if n < 1, the result is 1. * - *

        The formula is: radix<sup>floor(log<sub>radix</sub>(x))</sup>

        + *

        The formula is: base<sup>floor(log<sub>base</sub>(x))</sup>

        * - * @param radix The base of the number system. + * @param base The number in the expression ⌊basen⌋. * @param n The input argument. * @return the floor power of 2 and equal to a mathematical integer. */ - public static double floorPowerBaseOfDouble(final double radix, final double n) { + public static double floorPowerBaseOfDouble(final double base, final double n) { final double x = n < 1.0 ? 1.0 : n; - return Math.round(pow(radix, floor(logBaseOfX(radix, x)))); + return Math.round(pow(base, floor(logBaseOfX(base, x)))); } // Logarithm related @@ -578,13 +578,13 @@ public static double log2(final double value) { } /** - * Returns the logradix(x). Example: logB(2.0, x) = log(x) / log(2.0). - * @param radix the base of the number system + * Returns the logbase(x). Example, if base = 2.0: logB(2.0, x) = log(x) / log(2.0). + * @param base The number in the expression log(x) / log(base). * @param x the given value - * @return the logradix(x): Example: logB(2.0, x) = log(x) / log(2.0). 
+ * @return the logbase(x) */ - public static double logBaseOfX(final double radix, final double x) { - return log(x) / log(radix); + public static double logBaseOfX(final double base, final double x) { + return log(x) / log(base); } /** diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index 4b901f54a..3f0baf077 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -179,9 +179,9 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually } final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( this.totalN, - evSpQuantiles.clone(), - evSpNatRanks.clone(), - evSpNormRanks.clone(), + evSpQuantiles, + evSpNatRanks, + evSpNormRanks, getMaxItem(), getMinItem(), searchCrit); diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java index fce0becfc..b56356f49 100644 --- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -28,6 +28,7 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.List; @@ -36,7 +37,6 @@ import org.apache.datasketches.quantilescommon.PartitioningFeature; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; -import org.apache.datasketches.quantilescommon.Stack; /** * A partitioning process that can partition very large data sets into thousands @@ -44,14 +44,13 @@ * @param the data type * @param the quantiles sketch that implements both 
QuantilesGenericAPI and PartitioningFeature. */ -//@SuppressWarnings("unused") public class Partitioner & PartitioningFeature> { private static final QuantileSearchCriteria defaultCriteria = INCLUSIVE; private final long tgtPartitionSize; private final int maxPartsPerSk; private final SketchFillRequest fillReq; private final QuantileSearchCriteria criteria; - private final Stack> stack = new Stack<>(); + private final ArrayDeque> stack = new ArrayDeque<>(); //computed once at the beginning private int numLevels; @@ -114,7 +113,7 @@ public List> partition(final S sk) { return finalPartitionList; } - private void partitionSearch(final Stack> stack) { + private void partitionSearch(final ArrayDeque> stack) { if (stack.isEmpty()) { return; } diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index 869b68021..23cd9757a 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -183,9 +183,9 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually } final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( this.totalN, - evSpQuantiles.clone(), - evSpNatRanks.clone(), - evSpNormRanks.clone(), + evSpQuantiles, + evSpNatRanks, + evSpNormRanks, getMaxItem(), getMinItem(), searchCrit); diff --git a/src/main/java/org/apache/datasketches/quantilescommon/Stack.java b/src/main/java/org/apache/datasketches/quantilescommon/Stack.java deleted file mode 100644 index 68d6378b5..000000000 --- a/src/main/java/org/apache/datasketches/quantilescommon/Stack.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.quantilescommon; - -import java.util.ArrayList; - -import org.apache.datasketches.common.SketchesStateException; - -/** - * A classic LIFO stack based on ArrayList (as opposed to Vector). - * All of the methods of ArrayList are available. - */ -public class Stack extends ArrayList { - private static final long serialVersionUID = 1L; - - /** - * Creates an empty stack. - */ - public Stack() { } - - /** - * Pushes an item onto the stack - * @param item the given item - * @return the given element - */ - public E push(final E item) { - add(item); - return item; - } - - /** - * Removes the item at the top of the stack. - * @return the item at the top of the stack. - */ - public E pop() { - final E item = peek(); - remove(size() - 1); - return item; - } - - /** - * Allows examination of the top item without removing it. - * @return the top item without removing it - */ - public E peek() { - final int len = size(); - if (len == 0) { throw new SketchesStateException("Stack is empty"); } - return get(len - 1); - } - -} From 0aecf1f5c970d03750dd2fbd29ebd6a943435888 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 29 Nov 2023 16:24:44 -0800 Subject: [PATCH 10/13] Changes due to 2nd round of review feedback. 
--- .../datasketches/kll/KllItemsSketchSortedView.java | 10 +++------- .../datasketches/quantiles/ItemsSketchSortedView.java | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index 3f0baf077..d3336c2da 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -174,8 +174,8 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually final long[] evSpNatRanks = new long[len]; for (int i = 0; i < len; i++) { final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); - evSpQuantiles[i] = getQuantileFromIndex(index); - evSpNatRanks[i] = getCumWeightFromIndex(index); + evSpQuantiles[i] = quantiles[index]; + evSpNatRanks[i] = cumWeights[index]; } final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( this.totalN, @@ -205,13 +205,9 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int index = getQuantileIndex(rank, searchCrit); - return getQuantileFromIndex(index); + return quantiles[index]; } - private T getQuantileFromIndex(final int index) { return quantiles[index]; } - - private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } - private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; final double naturalRank = getNaturalRank(rank, totalN, searchCrit); diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index 23cd9757a..cb4bd1139 100644 --- 
a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -178,8 +178,8 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually final long[] evSpNatRanks = new long[len]; for (int i = 0; i < len; i++) { final int index = getQuantileIndex(evSpNormRanks[i], searchCrit); - evSpQuantiles[i] = getQuantileFromIndex(index); - evSpNatRanks[i] = getCumWeightFromIndex(index); + evSpQuantiles[i] = quantiles[index]; + evSpNatRanks[i] = cumWeights[index]; } final GenericPartitionBoundaries gpb = new GenericPartitionBoundaries<>( this.totalN, @@ -209,13 +209,9 @@ public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } QuantilesUtil.checkNormalizedRankBounds(rank); final int index = getQuantileIndex(rank, searchCrit); - return getQuantileFromIndex(index); + return quantiles[index]; } - private T getQuantileFromIndex(final int index) { return quantiles[index]; } - - private long getCumWeightFromIndex(final int index) { return cumWeights[index]; } - private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) { final int len = cumWeights.length; final double naturalRank = getNaturalRank(rank, totalN, searchCrit); From 7a727b4fe0e47441ca1e5ed52c8a9b0ad41ffb85 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 29 Nov 2023 17:39:25 -0800 Subject: [PATCH 11/13] Removed the normalized ranks array from all the Sorted Views. 
--- .../kll/KllDoublesSketchSortedView.java | 13 ------------- .../kll/KllFloatsSketchSortedView.java | 14 -------------- .../kll/KllItemsSketchSortedView.java | 17 ----------------- .../quantiles/DoublesSketchSortedView.java | 13 ------------- .../quantiles/ItemsSketchSortedView.java | 17 ----------------- .../quantilescommon/SortedView.java | 7 ------- .../datasketches/req/ReqSketchSortedView.java | 14 -------------- .../partitions/ClassicPartitionsTest.java | 16 +++++++++++++++- 8 files changed, 15 insertions(+), 96 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index e8bed53eb..cac663695 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -41,7 +41,6 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; - private final double[] normRanks; private final double maxItem; private final double minItem; @@ -58,10 +57,6 @@ public final class KllDoublesSketchSortedView implements DoublesSortedView { this.totalN = totalN; this.maxItem = maxItem; this.minItem = minItem; - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } /** @@ -86,9 +81,6 @@ public KllDoublesSketchSortedView(final KllDoublesSketch sketch) { quantiles = new double[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); - final double[] normRanks = new double[numQuantiles]; - for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; 
} - this.normRanks = normRanks; } @Override @@ -111,11 +103,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return normRanks; - } - @Override public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index 08678503c..ebad5f397 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -41,7 +41,6 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { private final float[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; - private final double[] normRanks; private final float maxItem; private final float minItem; @@ -58,10 +57,6 @@ public final class KllFloatsSketchSortedView implements FloatsSortedView { this.totalN = totalN; this.maxItem = maxItem; this.minItem = minItem; - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } /** @@ -86,10 +81,6 @@ public KllFloatsSketchSortedView(final KllFloatsSketch sketch) { quantiles = new float[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } //end of constructors @@ -114,11 +105,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return 
normRanks; - } - @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index d3336c2da..fffb5d704 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -55,7 +55,6 @@ public class KllItemsSketchSortedView implements GenericSortedView, Partit private final T maxItem; private final T minItem; private final Class clazz; - private final double[] normRanks; /** * Construct from elements for testing only. @@ -80,7 +79,6 @@ public class KllItemsSketchSortedView implements GenericSortedView, Partit this.maxItem = maxItem; this.minItem = minItem; this.clazz = (Class)quantiles[0].getClass(); - this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** @@ -109,7 +107,6 @@ public class KllItemsSketchSortedView implements GenericSortedView, Partit quantiles = (T[]) Array.newInstance(sketch.serDe.getClassOfT(), numQuantiles); cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); - this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } //end of constructors @@ -147,11 +144,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return normRanks.clone(); - } - @Override @SuppressWarnings("unchecked") public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, @@ -162,8 +154,6 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually //adjust ends of sortedView arrays cumWeights[0] = 1L; cumWeights[svLen - 1] = totalN; - normRanks[0] = 1.0 / totalN; - normRanks[svLen - 1] = 1.0; quantiles[0] = this.getMinItem(); quantiles[svLen - 1] = 
this.getMaxItem(); @@ -260,13 +250,6 @@ public GenericSortedViewIterator iterator() { //restricted methods - private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - return normRanks; - } - private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels, final int srcNumLevels, final int numItems) { final int[] myLevels = new int[srcNumLevels + 1]; diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java index b746bae15..a5f2d476b 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchSortedView.java @@ -44,7 +44,6 @@ public final class DoublesSketchSortedView implements DoublesSortedView { private final double[] quantiles; private final long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; - private final double[] normRanks; private final double maxItem; private final double minItem; @@ -61,10 +60,6 @@ public final class DoublesSketchSortedView implements DoublesSortedView { this.totalN = totalN; this.maxItem = maxItem; this.minItem = minItem; - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } /** @@ -92,9 +87,6 @@ public DoublesSketchSortedView(final DoublesSketch sketch) { if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. 
TotalN does not match cumWeights."); } - final double[] normRanks = new double[numQuantiles]; - for (int i = 0; i < numQuantiles; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } @Override @@ -117,11 +109,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return normRanks.clone(); - } - @Override public double getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java index cb4bd1139..9638a9a9e 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java @@ -55,7 +55,6 @@ public class ItemsSketchSortedView implements GenericSortedView, Partition private final T maxItem; private final T minItem; private final Class clazz; - private final double[] normRanks; /** * Construct from elements for testing. @@ -79,7 +78,6 @@ public class ItemsSketchSortedView implements GenericSortedView, Partition this.maxItem = maxItem; this.minItem = minItem; this.clazz = (Class)quantiles[0].getClass(); - this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } /** @@ -114,7 +112,6 @@ public class ItemsSketchSortedView implements GenericSortedView, Partition if (convertToCumulative(cumWeights) != totalN) { throw new SketchesStateException("Sorted View is misconfigured. 
TotalN does not match cumWeights."); } - this.normRanks = convertCumWtsToNormRanks(cumWeights, totalN); } //end of constructors @@ -152,11 +149,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return normRanks.clone(); - } - @Override @SuppressWarnings("unchecked") public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized, @@ -167,8 +159,6 @@ public GenericPartitionBoundaries getPartitionBoundaries(final int numEqually //adjust ends of sortedView arrays cumWeights[0] = 1L; cumWeights[svLen - 1] = totalN; - normRanks[0] = 1.0 / totalN; - normRanks[svLen - 1] = 1.0; quantiles[0] = this.getMinItem(); quantiles[svLen - 1] = this.getMaxItem(); @@ -310,13 +300,6 @@ private final static void populateFromItemsSketch( Arrays.sort(quantilesArr, startOfBaseBufferBlock, numQuantiles, comparator); } - private static double[] convertCumWtsToNormRanks(final long[] cumWeights, final long totalN) { - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - return normRanks; - } - /** * Convert the individual weights into cumulative weights. * An array of {1,1,1,1} becomes {1,2,3,4} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java index 92acfb2d4..5fb50291f 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedView.java @@ -42,13 +42,6 @@ public interface SortedView { */ long[] getCumulativeWeights(); - /** - * Returns the array of normalized ranks. The normalized ranks are the natural ranks divided by N. - * The normalized ranks are fractional numbers on the interval (0,1.0]. - * @return the array of normalized ranks. 
- */ - double[] getNormalizedRanks(); - /** * Returns the total number of items presented to the sourcing sketch. * @return the total number of items presented to the sourcing sketch. diff --git a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java index dbf14be6d..40842221b 100644 --- a/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/req/ReqSketchSortedView.java @@ -42,7 +42,6 @@ public final class ReqSketchSortedView implements FloatsSortedView { private float[] quantiles; private long[] cumWeights; //comes in as individual weights, converted to cumulative natural weights private final long totalN; - private final double[] normRanks; private final float maxItem; private final float minItem; @@ -59,10 +58,6 @@ public final class ReqSketchSortedView implements FloatsSortedView { this.totalN = totalN; this.maxItem = maxItem; this.minItem = minItem; - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } /** @@ -75,10 +70,6 @@ public ReqSketchSortedView(final ReqSketch sketch) { this.maxItem = sketch.getMaxItem(); this.minItem = sketch.getMinItem(); buildSortedViewArrays(sketch); - final int len = cumWeights.length; - final double[] normRanks = new double[len]; - for (int i = 0; i < len; i++) { normRanks[i] = (double)cumWeights[i] / totalN; } - this.normRanks = normRanks; } //end of constructors @@ -103,11 +94,6 @@ public long getN() { return totalN; } - @Override - public double[] getNormalizedRanks() { - return normRanks; - } - @Override public float getQuantile(final double rank, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } diff --git 
a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index 64bf9823c..b4098bf44 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -22,11 +22,14 @@ import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; +import java.util.Comparator; import java.util.List; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.PartitioningFeature; +import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; import org.testng.annotations.Test; /** @@ -65,12 +68,23 @@ public void main(String[] args) { //@Test //launch from TestNG public void checkClassicPartitioner() { final int k = 1 << 15; - final long totalN = 30_000_000L; //artificially set low so it will execute fast + final long totalN = 1000_000_000L; //artificially set low so it will execute fast final long tgtPartitionSize = 3_000_000L; final int maxPartsPerSk = 100; classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); + this.runPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk, sk); } + public & PartitioningFeature> + void runPartitioner(final int k, final long totalN, final long tgtPartitionSize, final int maxPartsPerSk, + ItemsSketch sketch) { + final ItemsSketchFillRequestLongAsString fillReq = new ItemsSketchFillRequestLongAsString(k, totalN); + final long startTime_mS = System.currentTimeMillis(); + } + + //SketchFillRequest> + /** * Programmatic call to classic Partitioner * @param k the size 
of the sketch. From 327d6217a92e1637e15a30eb30394d17dfb6cb78 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 29 Nov 2023 18:07:03 -0800 Subject: [PATCH 12/13] Fixed CodeQL issues --- .../partitions/ClassicPartitionsTest.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index b4098bf44..4c7e26a85 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -22,14 +22,11 @@ import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; -import java.util.Comparator; import java.util.List; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.apache.datasketches.quantiles.ItemsSketch; -import org.apache.datasketches.quantilescommon.PartitioningFeature; -import org.apache.datasketches.quantilescommon.QuantilesGenericAPI; import org.testng.annotations.Test; /** @@ -72,19 +69,8 @@ public void checkClassicPartitioner() { final long tgtPartitionSize = 3_000_000L; final int maxPartsPerSk = 100; classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); - final ItemsSketch sk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder()); - this.runPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk, sk); } - public & PartitioningFeature> - void runPartitioner(final int k, final long totalN, final long tgtPartitionSize, final int maxPartsPerSk, - ItemsSketch sketch) { - final ItemsSketchFillRequestLongAsString fillReq = new ItemsSketchFillRequestLongAsString(k, totalN); - final long startTime_mS = System.currentTimeMillis(); - } - - //SketchFillRequest> - /** * Programmatic call to 
classic Partitioner * @param k the size of the sketch. From 51aef3cca4f44cbfa92309a8025abf73a995481e Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 30 Nov 2023 11:46:34 -0800 Subject: [PATCH 13/13] Updated class javadoc to include the following comment: The code included here does work fine for moderate sized partitioning tasks. As an example, using the test code in the test branch with the partitioning task of splitting a data set of 1 billion items into 324 partitions of size 3M items completed in under 3 minutes, which was performed on a single CPU. For much larger partitioning tasks, it is recommended that this code be leveraged into a parallelized systems environment. I made some minor tweaks to the test code examples. --- .../datasketches/partitions/Partitioner.java | 6 ++++ .../partitions/ClassicPartitionsTest.java | 29 ++++++++++++------- .../partitions/KllPartitionsTest.java | 29 ++++++++++++------- .../partitions/PartitionResults.java | 3 ++ 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java index b56356f49..be256e479 100644 --- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -41,6 +41,12 @@ /** * A partitioning process that can partition very large data sets into thousands * of partitions of approximately the same size. + * + *

        The code included here works fine for moderate-sized partitioning tasks. + * As an example, using the test code in the test branch, the partitioning task of splitting + * a data set of 1 billion items into 324 partitions of size 3M items completed in under 3 minutes + * on a single CPU. For much larger partitioning tasks, it is recommended that this code be leveraged within a + * parallelized systems environment.

        * @param the data type * @param the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature. */ diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java index 4c7e26a85..52e6c50d7 100644 --- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java @@ -20,6 +20,7 @@ package org.apache.datasketches.partitions; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.List; @@ -27,6 +28,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.testng.annotations.Test; /** @@ -44,42 +46,48 @@ public class ClassicPartitionsTest { * Launch the partitioner as an application with the following arguments as strings: *
          *
        • arg[0]: int k, the size of the sketch
        • - *
        • arg[1]: long totalN, the total size, in elements, of the data set to parse.
        • - *
        • arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.
        • - *
        • arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • + *
        • arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.
        • + *
        • arg[2]: long totalN, the total size, in elements, of the data set to parse.
        • + *
        • arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.
        • + *
        • arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • *
        * @param args input arguments as defined above. */ public void main(String[] args) { final int k, maxPartsPerSk; final long totalN, tgtPartitionSize; + final QuantileSearchCriteria searchCrit; try { k = Integer.parseInt(args[0].trim()); - totalN = Long.parseLong(args[1].trim()); - tgtPartitionSize = Long.parseLong(args[2].trim()); - maxPartsPerSk = Integer.parseInt(args[3].trim()); + searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE; + totalN = Long.parseLong(args[2].trim()); + tgtPartitionSize = Long.parseLong(args[3].trim()); + maxPartsPerSk = Integer.parseInt(args[4].trim()); } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } - classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk); } //@Test //launch from TestNG public void checkClassicPartitioner() { final int k = 1 << 15; - final long totalN = 1000_000_000L; //artificially set low so it will execute fast + final QuantileSearchCriteria searchCrit = INCLUSIVE; + final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test final long tgtPartitionSize = 3_000_000L; final int maxPartsPerSk = 100; - classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk); } /** * Programmatic call to classic Partitioner * @param k the size of the sketch. + * @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE. * @param totalN the total size, in elements, of the data set to parse. * @param tgtPartitionSize the target number of elements per resulting partition. * @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch. 
*/ public void classicPartitioner( final int k, + final QuantileSearchCriteria searchCrit, final long totalN, final long tgtPartitionSize, final int maxPartsPerSk) { @@ -92,7 +100,7 @@ public void classicPartitioner( tgtPartitionSize, maxPartsPerSk, fillReq, - INCLUSIVE); + searchCrit); final List> list = partitioner.partition(sk); final long endTime_mS = System.currentTimeMillis(); final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; @@ -102,6 +110,7 @@ public void classicPartitioner( "Classic", list, k, + searchCrit, totalN, tgtPartitionSize, maxPartsPerSk, diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java index 50eff5777..3937d1648 100644 --- a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java +++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java @@ -20,6 +20,7 @@ package org.apache.datasketches.partitions; import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import java.util.List; @@ -27,6 +28,7 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.kll.KllItemsSketch; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.testng.annotations.Test; /** @@ -44,42 +46,48 @@ public class KllPartitionsTest { * Launch the partitioner as an application with the following arguments as strings: *
          *
        • arg[0]: int k, the size of the sketch
        • - *
        • arg[1]: long totalN, the total size, in elements, of the data set to parse.
        • - *
        • arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.
        • - *
        • arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • + *
        • arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.
        • + *
        • arg[2]: long totalN, the total size, in elements, of the data set to parse.
        • + *
        • arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.
        • + *
        • arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch
        • *
        * @param args input arguments as defined above. */ public void main(String[] args) { final int k, maxPartsPerSk; final long totalN, tgtPartitionSize; + final QuantileSearchCriteria searchCrit; try { k = Integer.parseInt(args[0].trim()); - totalN = Long.parseLong(args[1].trim()); - tgtPartitionSize = Long.parseLong(args[2].trim()); - maxPartsPerSk = Integer.parseInt(args[3].trim()); + searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE; + totalN = Long.parseLong(args[2].trim()); + tgtPartitionSize = Long.parseLong(args[3].trim()); + maxPartsPerSk = Integer.parseInt(args[4].trim()); } catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); } - kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk); } //@Test //launch from TestNG public void checkKllPartitioner() { final int k = 1 << 15; - final long totalN = 30_000_000L; //artificially set low so it will execute fast + final QuantileSearchCriteria searchCrit = INCLUSIVE; + final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test final long tgtPartitionSize = 3_000_000L; final int maxPartsPerSk = 100; - kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk); + kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk); } /** * Programmatic call to KLL Partitioner * @param k the size of the sketch. + * @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE. * @param totalN the total size, in elements, of the data set to parse. * @param tgtPartitionSize the target number of elements per resulting partition. * @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch. 
*/ public void kllPartitioner( final int k, + final QuantileSearchCriteria searchCrit, final long totalN, final long tgtPartitionSize, final int maxPartsPerSk) { @@ -92,7 +100,7 @@ public void kllPartitioner( tgtPartitionSize, maxPartsPerSk, fillReq, - INCLUSIVE); + searchCrit); final List> list = partitioner.partition(sk); final long endTime_mS = System.currentTimeMillis(); final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS; @@ -102,6 +110,7 @@ public void kllPartitioner( "KLL", list, k, + searchCrit, totalN, tgtPartitionSize, maxPartsPerSk, diff --git a/src/test/java/org/apache/datasketches/partitions/PartitionResults.java b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java index 501820d78..b061ce921 100644 --- a/src/test/java/org/apache/datasketches/partitions/PartitionResults.java +++ b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java @@ -27,6 +27,7 @@ import java.util.List; import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; /** * Output partitioning results to console. @@ -42,6 +43,7 @@ public static void output( final String sketchType, final List> list, final int k, + final QuantileSearchCriteria searchCrit, final long totalN, final long tgtPartitionSize, final int maxPartsPerSk, @@ -75,6 +77,7 @@ public static void output( println(LS + sketchType +" ItemsSketch Partitions Test"); println(LS + "INPUT:"); printf("Sketch K :%,20d\n", k); + printf("Search Criteria :%20s\n", searchCrit.name()); printf("Total N :%,20d\n", totalN); printf("Tgt Partition Size :%,20d\n", tgtPartitionSize); printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk);