Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix pyarrow and numpy logical bug concerning bool and string #60529

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,8 @@ Conversion
- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
- Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)
- Bug in logical operations (``&``, ``|``, ``^``) between a boolean array and a NumPy-backed string array (``string[python]``), where the string values were not cast to boolean (:issue:`60234`)
- Bug in logical operations (``&``, ``|``, ``^``) between a boolean array and a PyArrow-backed string array (``string[pyarrow]``), where the string values were not cast to boolean (:issue:`60234`)

Strings
^^^^^^^
Expand Down
33 changes: 27 additions & 6 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@
}

# Arrow compute kernels for the logical operators, keyed by operator name.
# Both operands are passed through ``cast_for_logical`` before the kernel is
# called; the ``r``-prefixed (reflected) entries swap the operand order.
# Each key is listed exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, leaving dead code behind.
ARROW_LOGICAL_FUNCS = {
    "and_": lambda x, y: pc.and_kleene(*cast_for_logical(x, y)),
    "rand_": lambda x, y: pc.and_kleene(*cast_for_logical(y, x)),
    "or_": lambda x, y: pc.or_kleene(*cast_for_logical(x, y)),
    "ror_": lambda x, y: pc.or_kleene(*cast_for_logical(y, x)),
    "xor": lambda x, y: pc.xor(*cast_for_logical(x, y)),
    "rxor": lambda x, y: pc.xor(*cast_for_logical(y, x)),
}

ARROW_BIT_WISE_FUNCS = {
Expand All @@ -107,6 +107,20 @@
"rxor": lambda x, y: pc.bit_wise_xor(y, x),
}

def convert_string_to_boolean_array(arr):
    """
    Convert a pyarrow string array to booleans using Python truthiness.

    Parameters
    ----------
    arr : pyarrow array-like
        Any object exposing a ``.type`` attribute. Only ``string`` and
        ``large_string`` typed input is converted.

    Returns
    -------
    pyarrow array-like
        For string input, a boolean array that is ``True`` where the string
        is non-empty and ``False`` where it is empty or null. Any other
        input is returned unchanged.
    """
    if pa.types.is_string(arr.type) or pa.types.is_large_string(arr.type):
        # A string is truthy exactly when it holds at least one byte, so the
        # check can run as a vectorised kernel instead of materialising every
        # element as a Python object.
        non_empty = pc.greater(pc.binary_length(arr), 0)
        # ``bool(None)`` is False, so nulls are mapped to False as well.
        arr = pc.fill_null(non_empty, False)
    return arr

def cast_for_logical(x, y):
    """
    Prepare two operands for a logical Arrow kernel.

    When exactly one operand has a boolean type, both operands are passed
    through ``convert_string_to_boolean_array``; otherwise they are returned
    untouched.

    Parameters
    ----------
    x, y : pyarrow objects exposing a ``.type`` attribute

    Returns
    -------
    tuple
        The (possibly converted) operands, in the order ``(x, y)``.
    """
    x_is_bool = pa.types.is_boolean(x.type)
    y_is_bool = pa.types.is_boolean(y.type)

    # Both boolean or neither boolean: nothing to reconcile.
    if x_is_bool == y_is_bool:
        return x, y

    converted_x = convert_string_to_boolean_array(x)
    converted_y = convert_string_to_boolean_array(y)
    return converted_x, converted_y

def cast_for_truediv(
arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
Expand Down Expand Up @@ -822,6 +836,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
result = pc_func(self._pa_array, other)
except pa.ArrowNotImplementedError as err:
raise TypeError(self._op_method_error_message(other_original, op)) from err

if (op.__name__ in ARROW_LOGICAL_FUNCS
and (isinstance(self, pa.lib.BooleanArray) !=
isinstance(other, pa.lib.BooleanArray))
):
return pc.cast(result, pa.bool_())

return type(self)(result)

def _logical_method(self, other, op) -> Self:
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,13 @@ def fill_bool(x, left=None):
rvalues = right

if should_extension_dispatch(lvalues, rvalues):
# Must cast if logical op between a boolean array and numpy-backed string array
if ((lvalues.dtype == np.bool_ and rvalues.dtype == "string[python]")
or (lvalues.dtype == "string[python]" and rvalues.dtype == np.bool_)
):
lvalues = lvalues.astype(bool)
rvalues = rvalues.astype(bool)

# Call the method on lvalues
res_values = op(lvalues, rvalues)

Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,3 +740,54 @@ def test_tolist(dtype):
result = arr.tolist()
expected = vals
tm.assert_equal(result, expected)

@pytest.mark.parametrize("dtype", ["string[pyarrow]"])
def test_or_pyarrow_string(dtype):
    # GH#60234: ``bool | string[pyarrow]`` uses the truthiness of the strings
    # Skip rather than error when the optional pyarrow dependency is missing.
    pytest.importorskip("pyarrow")
    with pd.option_context("future.infer_string", True):
        ser1 = pd.Series([False, False])
        ser2 = pd.Series(["", "b"], dtype=dtype)
        result = ser1 | ser2
        expected = pd.Series([False, True], dtype=bool)
        tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dtype", ["string[pyarrow]"])
def test_and_pyarrow_string(dtype):
    # GH#60234: ``bool & string[pyarrow]`` uses the truthiness of the strings
    # Skip rather than error when the optional pyarrow dependency is missing.
    pytest.importorskip("pyarrow")
    with pd.option_context("future.infer_string", True):
        ser1 = pd.Series([False, False])
        ser2 = pd.Series(["", "b"], dtype=dtype)
        result = ser1 & ser2
        expected = pd.Series([False, False], dtype=bool)
        tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dtype", ["string[pyarrow]"])
def test_xor_pyarrow_string(dtype):
    # GH#60234: ``bool ^ string[pyarrow]`` uses the truthiness of the strings
    # Skip rather than error when the optional pyarrow dependency is missing.
    pytest.importorskip("pyarrow")
    with pd.option_context("future.infer_string", True):
        ser1 = pd.Series([False, False])
        ser2 = pd.Series(["", "b"], dtype=dtype)
        result = ser1 ^ ser2
        expected = pd.Series([False, True], dtype=bool)
        tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dtype", ["string[python]"])
def test_or_numpy_string(dtype):
    # GH#60234: ``bool | string[python]`` uses the truthiness of the strings
    bools = pd.Series([False, False])
    strings = pd.Series(["", "b"], dtype=dtype)
    tm.assert_series_equal(
        bools | strings,
        pd.Series([False, True], dtype=bool),
    )

@pytest.mark.parametrize("dtype", ["string[python]"])
def test_and_numpy_string(dtype):
    # GH#60234: ``bool & string[python]`` uses the truthiness of the strings
    bools = pd.Series([False, False])
    strings = pd.Series(["", "b"], dtype=dtype)
    tm.assert_series_equal(
        bools & strings,
        pd.Series([False, False], dtype=bool),
    )

@pytest.mark.parametrize("dtype", ["string[python]"])
def test_xor_numpy_string(dtype):
    # GH#60234: ``bool ^ string[python]`` uses the truthiness of the strings
    bools = pd.Series([False, False])
    strings = pd.Series(["", "b"], dtype=dtype)
    tm.assert_series_equal(
        bools ^ strings,
        pd.Series([False, True], dtype=bool),
    )
Loading