diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs index 070ebc46483b..620a68e83ecd 100644 --- a/datafusion/functions-aggregate-common/src/tdigest.rs +++ b/datafusion/functions-aggregate-common/src/tdigest.rs @@ -233,7 +233,7 @@ impl TDigest { } fn clamp(v: f64, lo: f64, hi: f64) -> f64 { - if lo.is_nan() && hi.is_nan() { + if lo.is_nan() || hi.is_nan() { return v; } v.clamp(lo, hi) @@ -539,6 +539,18 @@ impl TDigest { let value = self.centroids[pos].mean() + ((rank - t) / self.centroids[pos].weight() - 0.5) * delta; + // In `merge_digests()`: `min` is initialized to Inf, `max` is initialized to -Inf + // and both get updated according to different `TDigest`s + // However, `min`/`max` won't get updated if there is only one `NaN` within `TDigest` + // The following two checks are for that edge case + if !min.is_finite() && min.is_sign_positive() { + min = f64::NEG_INFINITY; + } + + if !max.is_finite() && max.is_sign_negative() { + max = f64::INFINITY; + } + Self::clamp(value, min, max) } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index c68a6c345caa..322ddcdb047b 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1249,6 +1249,44 @@ SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v) ---- NULL +# +# percentile_cont edge cases +# + +statement ok +CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE); + +statement ok +INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double); + +# ISSUE: https://github.com/apache/datafusion/issues/11871 +# Note `approx_median()` is using the same implementation as `approx_percentile_cont()` +query R +select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1; +---- +NaN + +# ISSUE: https://github.com/apache/datafusion/issues/11870 +query R +select 
APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont; +---- +NaN + +# ISSUE: https://github.com/apache/datafusion/issues/11869 +# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()` +query R +SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT( + v2, + '+Inf'::Double, + 0.9 +) +FROM tmp_percentile_cont; +---- +NaN + +statement ok +DROP TABLE tmp_percentile_cont; + # csv_query_cube_avg query TIR SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2 @@ -5553,4 +5591,4 @@ drop table employee_csv; query I??III?T select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ','); ---- -0 NULL NULL NULL NULL NULL NULL NULL \ No newline at end of file +0 NULL NULL NULL NULL NULL NULL NULL