From 579a167542ce664bb9d28ae6b5419e524ec5288b Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 30 May 2024 18:37:56 +0200
Subject: [PATCH] Simple NumPy 2 fixes that are clearly no behavior change
 (#15876)

I have a branch that works, but some changes may need a bit of thought to get right, so splitting out the simpler half.

(N.B. the only bigger chunk that is remaining is to make sure that `uint_series > -1` keeps working at least as well as before)

In either case, these are changes that:
* Avoid `copy=False` in `np.array()`
* Are necessary due to NumPy rejecting e.g. `uint8(-1)` now (only changed this where it is test-only)
* Are necessary due to NumPy preserving the scalar dtype things fail later (the hashing code and using `float(float32)` to avoid overflow.
  * Sorting change is the same, using `int8(-1)` gives effectively the old promotion (to float) rather than erroring to not implicit go to float based on the value.

The main noise, is that I parametrized that one test since it seemed easy enough.

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15876
---
 python/cudf/cudf/core/buffer/buffer.py        |  4 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py |  4 +-
 python/cudf/cudf/tests/test_hash_vocab.py     |  8 ++-
 python/cudf/cudf/tests/test_numerical.py      |  2 +-
 python/cudf/cudf/tests/test_replace.py        | 51 +++++--------------
 python/cudf/cudf/tests/test_sorting.py        |  3 +-
 python/cudf/cudf/utils/hash_vocab_utils.py    | 25 ++++-----
 7 files changed, 37 insertions(+), 60 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 5c2d77033b8..bf6f9f1a3c1 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create an owner from a buffer or array like object
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`
 
         The host memory is copied to a new device allocation.
 
@@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """
 
         # Convert to numpy array, this will not copy data in most cases.
-        ary = numpy.array(data, copy=False, subok=True)
+        ary = numpy.asanyarray(data)
         # Extract pointer and size
         ptr, size = get_ptr_and_size(ary.__array_interface__)
         # Copy to device memory
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index a1af3ba8c9d..49258fea9ab 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create a spillabe buffer from host memory.
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`
 
         The new buffer is marked as spilled to host memory already.
 
@@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self:
 
         # Convert to a memoryview using numpy array, this will not copy data
         # in most cases.
-        data = memoryview(numpy.array(data, copy=False, subok=True))
+        data = memoryview(numpy.asanyarray(data))
         if not data.c_contiguous:
             raise ValueError("Buffer data must be C-contiguous")
         data = data.cast("B")  # Make sure itemsize==1
diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..c98b92f7083 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import filecmp
 import os
 import warnings
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 2e3be92dbeb..03081208739 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind():
     assert data.can_cast_safely(to_dtype)
 
     data = cudf.Series(
-        [np.finfo("float32").max * 2, 1.0], dtype="float64"
+        [float(np.finfo("float32").max) * 2, 1.0], dtype="float64"
     )._column
     to_dtype = np.dtype("float32")
     assert not data.can_cast_safely(to_dtype)
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 8992c4d617b..d77ec596271 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+import operator
 import re
 from decimal import Decimal
 
@@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype):
 
 @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
 @pytest.mark.parametrize("fill_value", [100, 100.0, 128.5])
-def test_series_where(data_dtype, fill_value):
+@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt])
+def test_series_where(data_dtype, fill_value, op):
     psr = pd.Series(list(range(10)), dtype=data_dtype)
     sr = cudf.from_pandas(psr)
 
-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr > 0, fill_value)
-    else:
-        # Cast back to original dtype as pandas automatically upcasts
-        expect = psr.where(psr > 0, fill_value)
-        got = sr.where(sr > 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
+    try:
+        scalar_fits = sr.dtype.type(fill_value) == fill_value
+    except OverflowError:
+        scalar_fits = False
 
-    if sr.dtype.type(fill_value) != fill_value:
+    if not scalar_fits:
         with pytest.raises(TypeError):
-            sr.where(sr < 0, fill_value)
+            sr.where(op(sr, 0), fill_value)
     else:
-        expect = psr.where(psr < 0, fill_value)
-        got = sr.where(sr < 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
-
-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr == 0, fill_value)
-    else:
-        expect = psr.where(psr == 0, fill_value)
-        got = sr.where(sr == 0, fill_value)
+        # Cast back to original dtype as pandas automatically upcasts
+        expect = psr.where(op(psr, 0), fill_value)
+        got = sr.where(op(sr, 0), fill_value)
         # pandas returns 'float16' dtype, which is not supported in cudf
         assert_eq(
             expect,
@@ -985,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
     sr = cudf.from_pandas(psr)
 
-    if sr.dtype.kind in "ui":
-        can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
-            int(replacement), sr.dtype
-        )
-    else:
-        can_replace = np.can_cast(replacement, sr.dtype)
+    numpy_replacement = np.array(replacement).astype(sr.dtype)[()]
+    can_replace = numpy_replacement == replacement
 
     # Both Scalar
     if not can_replace:
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 618c4f30bd9..449f21721f4 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
     if asc:
         expected = np.argsort(sr.to_numpy(), kind="mergesort")
     else:
-        expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
+        # -1 multiply works around missing desc sort (may promote to float64)
+        expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
     np.testing.assert_array_equal(expected, res.to_numpy())
 
 
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
index ef078ed8c5d..babe4be2715 100644
--- a/python/cudf/cudf/utils/hash_vocab_utils.py
+++ b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = np.uint8(48)
-B_SECOND_LEVEL_POW = np.uint8(7)
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,11 +23,11 @@
 
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
-B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
     64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
 )
-BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):
 
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH,
+            A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
@@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
-        )
-        offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
+        offset_into_flattened_table[i + 1] = offset_into_flattened_table[
+            i
+        ] + np.uint64(bin_length)
         flattened_bins.extend(internal_table)
 
     print(