From 76b35adec49d85cf23d4a32a44588c856234f140 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:50:11 -0800
Subject: [PATCH] Ignore NaN correctly in .quantile (#17593)

From an offline conversation, fixes the following discrepancy between cudf and pandas:

```python
In [1]: import cudf

In [2]: import numpy as np

In [3]: ser = cudf.Series([np.nan, np.nan, 0.9], nan_as_null=False)

In [4]: ser
Out[4]:
0    NaN
1    NaN
2    0.9
dtype: float64

In [5]: ser.quantile(0.9)
Out[5]: np.float64(nan)

In [6]: import pandas as pd

In [7]: ser = pd.Series([np.nan, np.nan, 0.9])

In [8]: ser.quantile(0.9)
Out[8]: np.float64(0.9)
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17593
---
 python/cudf/cudf/core/column/numerical_base.py |  7 ++++---
 python/cudf/cudf/tests/test_quantiles.py       | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index aaf2239a71e..689d5132d45 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -143,13 +143,14 @@ def quantile(
                 ),
             )
         else:
+            no_nans = self.nans_to_nulls()
             # get sorted indices and exclude nulls
             indices = sorting.order_by(
-                [self], [True], "first", stable=True
-            ).slice(self.null_count, len(self))
+                [no_nans], [True], "first", stable=True
+            ).slice(no_nans.null_count, len(no_nans))
             with acquire_spill_lock():
                 plc_column = plc.quantiles.quantile(
-                    self.to_pylibcudf(mode="read"),
+                    no_nans.to_pylibcudf(mode="read"),
                     q,
                     plc.types.Interpolation[interpolation.upper()],
                     indices.to_pylibcudf(mode="read"),
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 9a2816f5444..84de2ac38e7 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):
 
     assert expected == actual
     assert type(expected) is type(actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [float("nan"), float("nan"), 0.9],
+        [float("nan"), float("nan"), float("nan")],
+    ],
+)
+def test_ignore_nans(data):
+    """Check that cudf's quantile matches pandas on NaN-containing data."""
+    # pandas is the reference implementation, so it supplies `expected`.
+    expected = pd.Series(data).quantile(0.9)
+    gsr = cudf.Series(data, nan_as_null=False)
+    result = gsr.quantile(0.9)
+    assert_eq(result, expected)