From 76b35adec49d85cf23d4a32a44588c856234f140 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:50:11 -0800 Subject: [PATCH] Ignore NaN correctly in .quantile (#17593) From an offline conversation, fixes the following discrepancy between cudf and pandas: ```python In [1]: import cudf In [2]: import numpy as np In [3]: ser = cudf.Series([np.nan, np.nan, 0.9], nan_as_null=False) In [4]: ser Out[4]: 0 NaN 1 NaN 2 0.9 dtype: float64 In [5]: ser.quantile(0.9) Out[5]: np.float64(nan) In [6]: import pandas as pd In [7]: ser = pd.Series([np.nan, np.nan, 0.9]) In [8]: ser.quantile(0.9) Out[8]: np.float64(0.9) ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17593 --- python/cudf/cudf/core/column/numerical_base.py | 7 ++++--- python/cudf/cudf/tests/test_quantiles.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index aaf2239a71e..689d5132d45 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -143,13 +143,14 @@ def quantile( ), ) else: + no_nans = self.nans_to_nulls() # get sorted indices and exclude nulls indices = sorting.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) + [no_nans], [True], "first", stable=True + ).slice(no_nans.null_count, len(no_nans)) with acquire_spill_lock(): plc_column = plc.quantiles.quantile( - self.to_pylibcudf(mode="read"), + no_nans.to_pylibcudf(mode="read"), q, plc.types.Interpolation[interpolation.upper()], indices.to_pylibcudf(mode="read"), diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 9a2816f5444..84de2ac38e7 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ 
b/python/cudf/cudf/tests/test_quantiles.py @@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation): assert expected == actual assert type(expected) is type(actual) + + +@pytest.mark.parametrize( + "data", + [ + [float("nan"), float("nan"), 0.9], + [float("nan"), float("nan"), float("nan")], + ], +) +def test_ignore_nans(data): + psr = pd.Series(data) + gsr = cudf.Series(data, nan_as_null=False) + + expected = gsr.quantile(0.9) + result = psr.quantile(0.9) + assert_eq(result, expected)