Skip to content

Commit

Permalink
Simple NumPy 2 fixes that are clearly no behavior change (#15876)
Browse files Browse the repository at this point in the history
I have a branch that works, but some changes may need a bit of thought to get right, so splitting out the simpler half.

(N.B. the only bigger chunk that is remaining is to make sure that `uint_series > -1` keeps working at least as well as before)

In either case, these are changes that:
* Avoid `copy=False` in `np.array()`
* Are necessary due to NumPy rejecting e.g. `uint8(-1)` now (only changed this where it is test-only)
* Are necessary due to NumPy preserving the scalar dtype, so things fail later (the hashing code, and using `float(float32)` to avoid overflow).
  * The sorting change is the same idea: using `int8(-1)` effectively restores the old promotion (to float) rather than erroring, so we do not implicitly go to float based on the value.

The main noise is that I parametrized that one test, since it seemed easy enough.

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15876
  • Loading branch information
seberg authored May 30, 2024
1 parent c268fc1 commit 579a167
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 60 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/buffer/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self:
"""Create an owner from a buffer or array like object
Data must implement `__array_interface__`, the buffer protocol, and/or
be convertible to a buffer object using `numpy.array()`
be convertible to a buffer object using `numpy.asanyarray()`
The host memory is copied to a new device allocation.
Expand All @@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self:
"""

# Convert to numpy array, this will not copy data in most cases.
ary = numpy.array(data, copy=False, subok=True)
ary = numpy.asanyarray(data)
# Extract pointer and size
ptr, size = get_ptr_and_size(ary.__array_interface__)
# Copy to device memory
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/buffer/spillable_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self:
"""Create a spillabe buffer from host memory.
Data must implement `__array_interface__`, the buffer protocol, and/or
be convertible to a buffer object using `numpy.array()`
be convertible to a buffer object using `numpy.asanyarray()`
The new buffer is marked as spilled to host memory already.
Expand All @@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self:

# Convert to a memoryview using numpy array, this will not copy data
# in most cases.
data = memoryview(numpy.array(data, copy=False, subok=True))
data = memoryview(numpy.asanyarray(data))
if not data.c_contiguous:
raise ValueError("Buffer data must be C-contiguous")
data = data.cast("B") # Make sure itemsize==1
Expand Down
8 changes: 3 additions & 5 deletions python/cudf/cudf/tests/test_hash_vocab.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import filecmp
import os
import warnings
Expand All @@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):

groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
output_path = tmpdir.join("cudf-vocab-hash.txt")
with warnings.catch_warnings():
# See https://github.com/rapidsai/cudf/issues/12403
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)

assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind():
assert data.can_cast_safely(to_dtype)

data = cudf.Series(
[np.finfo("float32").max * 2, 1.0], dtype="float64"
[float(np.finfo("float32").max) * 2, 1.0], dtype="float64"
)._column
to_dtype = np.dtype("float32")
assert not data.can_cast_safely(to_dtype)
Expand Down
51 changes: 14 additions & 37 deletions python/cudf/cudf/tests/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import operator
import re
from decimal import Decimal

Expand Down Expand Up @@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype):

@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
@pytest.mark.parametrize("fill_value", [100, 100.0, 128.5])
def test_series_where(data_dtype, fill_value):
@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt])
def test_series_where(data_dtype, fill_value, op):
psr = pd.Series(list(range(10)), dtype=data_dtype)
sr = cudf.from_pandas(psr)

if sr.dtype.type(fill_value) != fill_value:
with pytest.raises(TypeError):
sr.where(sr > 0, fill_value)
else:
# Cast back to original dtype as pandas automatically upcasts
expect = psr.where(psr > 0, fill_value)
got = sr.where(sr > 0, fill_value)
# pandas returns 'float16' dtype, which is not supported in cudf
assert_eq(
expect,
got,
check_dtype=expect.dtype.kind not in ("f"),
)
try:
scalar_fits = sr.dtype.type(fill_value) == fill_value
except OverflowError:
scalar_fits = False

if sr.dtype.type(fill_value) != fill_value:
if not scalar_fits:
with pytest.raises(TypeError):
sr.where(sr < 0, fill_value)
sr.where(op(sr, 0), fill_value)
else:
expect = psr.where(psr < 0, fill_value)
got = sr.where(sr < 0, fill_value)
# pandas returns 'float16' dtype, which is not supported in cudf
assert_eq(
expect,
got,
check_dtype=expect.dtype.kind not in ("f"),
)

if sr.dtype.type(fill_value) != fill_value:
with pytest.raises(TypeError):
sr.where(sr == 0, fill_value)
else:
expect = psr.where(psr == 0, fill_value)
got = sr.where(sr == 0, fill_value)
# Cast back to original dtype as pandas automatically upcasts
expect = psr.where(op(psr, 0), fill_value)
got = sr.where(op(sr, 0), fill_value)
# pandas returns 'float16' dtype, which is not supported in cudf
assert_eq(
expect,
Expand Down Expand Up @@ -985,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
sr = cudf.from_pandas(psr)

if sr.dtype.kind in "ui":
can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
int(replacement), sr.dtype
)
else:
can_replace = np.can_cast(replacement, sr.dtype)
numpy_replacement = np.array(replacement).astype(sr.dtype)[()]
can_replace = numpy_replacement == replacement

# Both Scalar
if not can_replace:
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
if asc:
expected = np.argsort(sr.to_numpy(), kind="mergesort")
else:
expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
# -1 multiply works around missing desc sort (may promote to float64)
expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
np.testing.assert_array_equal(expected, res.to_numpy())


Expand Down
25 changes: 13 additions & 12 deletions python/cudf/cudf/utils/hash_vocab_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

# Coefficients ranges for inner hash - This are important to set to be
# large so that we have randomness in the bottom bits when modding
A_SECOND_LEVEL_POW = np.uint8(48)
B_SECOND_LEVEL_POW = np.uint8(7)
A_SECOND_LEVEL_POW = np.uint64(48)
B_SECOND_LEVEL_POW = np.uint64(7)

A_LBOUND_SECOND_LEVEL_HASH = 2**16
A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
Expand All @@ -23,11 +23,11 @@


# Shifts for bit packing
A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
)
BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)

NOT_FOUND = -1

Expand Down Expand Up @@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):

while True:
a = np.random.randint(
A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
A_LBOUND_SECOND_LEVEL_HASH,
A_HBOUND_SECOND_LEVEL_HASH,
)
b = np.random.randint(
B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
Expand Down Expand Up @@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
bin_length = len(internal_table)
max_bin_length = max(bin_length, max_bin_length)
internal_table_coeffs[i] = (
coeff_a << A_SECOND_LEVEL_SHIFT_AMT
| coeff_b << B_SECOND_LEVEL_SHIFT_AMT
| bin_length
)
offset_into_flattened_table[i + 1] = (
offset_into_flattened_table[i] + bin_length
np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
| np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
| np.uint64(bin_length)
)
offset_into_flattened_table[i + 1] = offset_into_flattened_table[
i
] + np.uint64(bin_length)
flattened_bins.extend(internal_table)

print(
Expand Down

0 comments on commit 579a167

Please sign in to comment.