Skip to content

Commit

Permalink
fix approx_percentile_cont() bug (apache#11934)
Browse files Browse the repository at this point in the history
  • Loading branch information
2010YOUY01 authored Aug 11, 2024
1 parent ee6910b commit 64a9280
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 2 deletions.
14 changes: 13 additions & 1 deletion datafusion/functions-aggregate-common/src/tdigest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ impl TDigest {
}

fn clamp(v: f64, lo: f64, hi: f64) -> f64 {
if lo.is_nan() && hi.is_nan() {
if lo.is_nan() || hi.is_nan() {
return v;
}
v.clamp(lo, hi)
Expand Down Expand Up @@ -539,6 +539,18 @@ impl TDigest {
let value = self.centroids[pos].mean()
+ ((rank - t) / self.centroids[pos].weight() - 0.5) * delta;

// In `merge_digests()`: `min` is initialized to Inf, `max` is initialized to -Inf
// and gets updated according to different `TDigest`s
// However, `min`/`max` won't get updated if there is only one `NaN` within `TDigest`
// The following two checks is for such edge case
if !min.is_finite() && min.is_sign_positive() {
min = f64::NEG_INFINITY;
}

if !max.is_finite() && max.is_sign_negative() {
max = f64::INFINITY;
}

Self::clamp(value, min, max)
}

Expand Down
40 changes: 39 additions & 1 deletion datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,44 @@ SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v)
----
NULL

#
# percentile_cont edge cases
#

statement ok
CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE);

statement ok
INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double);

# ISSUE: https://github.com/apache/datafusion/issues/11871
# Note `approx_median()` is using the same implementation as `approx_percentile_cont()`
query R
select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1;
----
NaN

# ISSUE: https://github.com/apache/datafusion/issues/11870
query R
select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont;
----
NaN

# ISSUE: https://github.com/apache/datafusion/issues/11869
# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()`
query R
SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(
v2,
'+Inf'::Double,
0.9
)
FROM tmp_percentile_cont;
----
NaN

statement ok
DROP TABLE tmp_percentile_cont;

# csv_query_cube_avg
query TIR
SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2
Expand Down Expand Up @@ -5553,4 +5591,4 @@ drop table employee_csv;
query I??III?T
select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
----
0 NULL NULL NULL NULL NULL NULL NULL
0 NULL NULL NULL NULL NULL NULL NULL

0 comments on commit 64a9280

Please sign in to comment.