Ignore NaN correctly in .quantile (#17593)

From an offline conversation, fixes the following discrepancy between cudf and pandas:

```python
In [1]: import cudf

In [2]: import numpy as np

In [3]: ser = cudf.Series([np.nan, np.nan, 0.9], nan_as_null=False)

In [4]: ser
Out[4]:
0    NaN
1    NaN
2    0.9
dtype: float64

In [5]: ser.quantile(0.9)
Out[5]: np.float64(nan)

In [6]: import pandas as pd

In [7]: ser = pd.Series([np.nan, np.nan, 0.9])

In [8]: ser.quantile(0.9)
Out[8]: np.float64(0.9)
```

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Bradley Dice (https://github.com/bdice)

URL: #17593
rapidsai · Dec 13, 2024 · 76b35ad · 76b35ad
1 parent 34e2045
commit 76b35ad
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 3 deletions.
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
@@ -143,13 +143,14 @@ def quantile(
                 ),
             )
         else:
+            no_nans = self.nans_to_nulls()
             # get sorted indices and exclude nulls
             indices = sorting.order_by(
-                [self], [True], "first", stable=True
-            ).slice(self.null_count, len(self))
+                [no_nans], [True], "first", stable=True
+            ).slice(no_nans.null_count, len(no_nans))
             with acquire_spill_lock():
                 plc_column = plc.quantiles.quantile(
-                    self.to_pylibcudf(mode="read"),
+                    no_nans.to_pylibcudf(mode="read"),
                     q,
                     plc.types.Interpolation[interpolation.upper()],
                     indices.to_pylibcudf(mode="read"),

diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
@@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):
 
     assert expected == actual
     assert type(expected) is type(actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [float("nan"), float("nan"), 0.9],
+        [float("nan"), float("nan"), float("nan")],
+    ],
+)
+def test_ignore_nans(data):
+    """Check that ``Series.quantile`` in cudf matches pandas on NaN input.
+
+    Each ``data`` case contains NaN values, including one case that is
+    entirely NaN. The cudf Series is built with ``nan_as_null=False`` so the
+    NaNs are kept as NaN rather than converted to nulls on construction.
+    """
+    psr = pd.Series(data)
+    gsr = cudf.Series(data, nan_as_null=False)
+
+    # pandas is the reference implementation; cudf is the value under test.
+    expected = psr.quantile(0.9)
+    result = gsr.quantile(0.9)
+    assert_eq(expected, result)