From 579a167542ce664bb9d28ae6b5419e524ec5288b Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 30 May 2024 18:37:56 +0200 Subject: [PATCH] Simple NumPy 2 fixes that are clearly no behavior change (#15876) I have a branch that works, but some changes may need a bit of thought to get right, so splitting out the simpler half. (N.B. the only bigger chunk that is remaining is to make sure that `uint_series > -1` keeps working at least as well as before) In either case, these are changes that: * Avoid `copy=False` in `np.array()` * Are necessary due to NumPy rejecting e.g. `uint8(-1)` now (only changed this where it is test-only) * Are necessary due to NumPy preserving the scalar dtype things fail later (the hashing code and using `float(float32)` to avoid overflow. * Sorting change is the same, using `int8(-1)` gives effectively the old promotion (to float) rather than erroring to not implicit go to float based on the value. The main noise, is that I parametrized that one test since it seemed easy enough. Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15876 --- python/cudf/cudf/core/buffer/buffer.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 4 +- python/cudf/cudf/tests/test_hash_vocab.py | 8 ++- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_replace.py | 51 +++++-------------- python/cudf/cudf/tests/test_sorting.py | 3 +- python/cudf/cudf/utils/hash_vocab_utils.py | 25 ++++----- 7 files changed, 37 insertions(+), 60 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 5c2d77033b8..bf6f9f1a3c1 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self: """Create an owner from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or - be convertible to a buffer object using `numpy.array()` + be convertible to a buffer object using `numpy.asanyarray()` The host memory is copied to a new device allocation. @@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self: """ # Convert to numpy array, this will not copy data in most cases. - ary = numpy.array(data, copy=False, subok=True) + ary = numpy.asanyarray(data) # Extract pointer and size ptr, size = get_ptr_and_size(ary.__array_interface__) # Copy to device memory diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index a1af3ba8c9d..49258fea9ab 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self: """Create a spillabe buffer from host memory. Data must implement `__array_interface__`, the buffer protocol, and/or - be convertible to a buffer object using `numpy.array()` + be convertible to a buffer object using `numpy.asanyarray()` The new buffer is marked as spilled to host memory already. @@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self: # Convert to a memoryview using numpy array, this will not copy data # in most cases. - data = memoryview(numpy.array(data, copy=False, subok=True)) + data = memoryview(numpy.asanyarray(data)) if not data.c_contiguous: raise ValueError("Buffer data must be C-contiguous") data = data.cast("B") # Make sure itemsize==1 diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py index e081119ff89..c98b92f7083 100644 --- a/python/cudf/cudf/tests/test_hash_vocab.py +++ b/python/cudf/cudf/tests/test_hash_vocab.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import filecmp import os import warnings @@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir): groundtruth_path = os.path.join(datadir, "vocab-hash.txt") output_path = tmpdir.join("cudf-vocab-hash.txt") - with warnings.catch_warnings(): - # See https://github.com/rapidsai/cudf/issues/12403 - warnings.simplefilter(action="ignore", category=RuntimeWarning) - hash_vocab(vocab_path, output_path) + warnings.simplefilter(action="ignore", category=RuntimeWarning) + hash_vocab(vocab_path, output_path) assert filecmp.cmp(output_path, groundtruth_path, shallow=False) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 2e3be92dbeb..03081208739 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind(): assert data.can_cast_safely(to_dtype) data = cudf.Series( - [np.finfo("float32").max * 2, 1.0], dtype="float64" + [float(np.finfo("float32").max) * 2, 1.0], dtype="float64" )._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 8992c4d617b..d77ec596271 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import operator import re from decimal import Decimal @@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype): @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) @pytest.mark.parametrize("fill_value", [100, 100.0, 128.5]) -def test_series_where(data_dtype, fill_value): +@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt]) +def test_series_where(data_dtype, fill_value, op): psr = pd.Series(list(range(10)), dtype=data_dtype) sr = cudf.from_pandas(psr) - if sr.dtype.type(fill_value) != fill_value: - with pytest.raises(TypeError): - sr.where(sr > 0, fill_value) - else: - # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(psr > 0, fill_value) - got = sr.where(sr > 0, fill_value) - # pandas returns 'float16' dtype, which is not supported in cudf - assert_eq( - expect, - got, - check_dtype=expect.dtype.kind not in ("f"), - ) + try: + scalar_fits = sr.dtype.type(fill_value) == fill_value + except OverflowError: + scalar_fits = False - if sr.dtype.type(fill_value) != fill_value: + if not scalar_fits: with pytest.raises(TypeError): - sr.where(sr < 0, fill_value) + sr.where(op(sr, 0), fill_value) else: - expect = psr.where(psr < 0, fill_value) - got = sr.where(sr < 0, fill_value) - # pandas returns 'float16' dtype, which is not supported in cudf - assert_eq( - expect, - got, - check_dtype=expect.dtype.kind not in ("f"), - ) - - if sr.dtype.type(fill_value) != fill_value: - with pytest.raises(TypeError): - sr.where(sr == 0, fill_value) - else: - expect = psr.where(psr == 0, fill_value) - got = sr.where(sr == 0, fill_value) + # Cast back to original dtype as pandas automatically upcasts + expect = psr.where(op(psr, 0), fill_value) + got = sr.where(op(sr, 0), fill_value) # pandas returns 'float16' dtype, which is not supported in cudf assert_eq( expect, @@ -985,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) - if sr.dtype.kind in "ui": - can_replace = np.array([replacement])[0].is_integer() and np.can_cast( - int(replacement), sr.dtype - ) - else: - can_replace = np.can_cast(replacement, sr.dtype) + numpy_replacement = np.array(replacement).astype(sr.dtype)[()] + can_replace = numpy_replacement == replacement # Both Scalar if not can_replace: diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 618c4f30bd9..449f21721f4 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc): if asc: expected = np.argsort(sr.to_numpy(), kind="mergesort") else: - expected = np.argsort(sr.to_numpy() * -1, kind="mergesort") + # -1 multiply works around missing desc sort (may promote to float64) + expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort") np.testing.assert_array_equal(expected, res.to_numpy()) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index ef078ed8c5d..babe4be2715 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -7,8 +7,8 @@ # Coefficients ranges for inner hash - This are important to set to be # large so that we have randomness in the bottom bits when modding -A_SECOND_LEVEL_POW = np.uint8(48) -B_SECOND_LEVEL_POW = np.uint8(7) +A_SECOND_LEVEL_POW = np.uint64(48) +B_SECOND_LEVEL_POW = np.uint64(7) A_LBOUND_SECOND_LEVEL_HASH = 2**16 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW @@ -23,11 +23,11 @@ # Shifts for bit packing -A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW) -B_SECOND_LEVEL_SHIFT_AMT = np.uint8( +A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW) +B_SECOND_LEVEL_SHIFT_AMT = np.uint64( 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW ) -BITS_FOR_INNER_TABLE_SIZE = np.uint8(8) +BITS_FOR_INNER_TABLE_SIZE = np.uint64(8) NOT_FOUND = -1 @@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin): while True: a = np.random.randint( - A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH + A_LBOUND_SECOND_LEVEL_HASH, + A_HBOUND_SECOND_LEVEL_HASH, ) b = np.random.randint( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH @@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant): bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( - coeff_a << A_SECOND_LEVEL_SHIFT_AMT - | coeff_b << B_SECOND_LEVEL_SHIFT_AMT - | bin_length - ) - offset_into_flattened_table[i + 1] = ( - offset_into_flattened_table[i] + bin_length + np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT + | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT + | np.uint64(bin_length) ) + offset_into_flattened_table[i + 1] = offset_into_flattened_table[ + i + ] + np.uint64(bin_length) flattened_bins.extend(internal_table) print(