From 4d807497bf5180300385e0b83e1e9b76391858e8 Mon Sep 17 00:00:00 2001 From: Emma Tong <53305901+etong11@users.noreply.github.com> Date: Sat, 7 Dec 2024 16:37:44 -0500 Subject: [PATCH 1/6] Added test for pyarrow logical op bug --- pandas/tests/arrays/string_/test_string.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a32ac7db4656a..5a1001302b2d4 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -740,3 +740,12 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]", "string[pyarrow]"]) +def test_logical_operators_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 | ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) \ No newline at end of file From b63506e997eded2aeeff055aec37520cdb2082cc Mon Sep 17 00:00:00 2001 From: Emma Tong <53305901+etong11@users.noreply.github.com> Date: Sat, 7 Dec 2024 21:33:15 -0500 Subject: [PATCH 2/6] Added tests for xor and and operators --- pandas/tests/arrays/string_/test_string.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5a1001302b2d4..70d4a60606c30 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -741,11 +741,29 @@ def test_tolist(dtype): expected = vals tm.assert_equal(result, expected) -@pytest.mark.parametrize("dtype", ["string[python]", "string[pyarrow]"]) -def test_logical_operators_pyarrow_string(dtype): +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_or_pyarrow_string(dtype): with pd.option_context("future.infer_string", True): ser1 = pd.Series([False, False]) ser2 = pd.Series(["", "b"], dtype=dtype) result = ser1 | ser2 expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_and_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 & ser2 + expected = pd.Series([False, False], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_xor_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 ^ ser2 + expected = pd.Series([False, True], dtype=bool) tm.assert_series_equal(result, expected) \ No newline at end of file From d97c8378a4830fde48aad1e99e60eea7a2cea5ff Mon Sep 17 00:00:00 2001 From: Emma Tong <53305901+etong11@users.noreply.github.com> Date: Sat, 7 Dec 2024 21:55:37 -0500 Subject: [PATCH 3/6] Converted pyarrow stringarrays to booleanarray if using logical operators --- pandas/core/arrays/arrow/array.py | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..1a89ec28c2de1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -90,12 +90,12 @@ } ARROW_LOGICAL_FUNCS = { - "and_": pc.and_kleene, - "rand_": lambda x, y: pc.and_kleene(y, x), - "or_": pc.or_kleene, - "ror_": lambda x, y: pc.or_kleene(y, x), - "xor": pc.xor, - "rxor": lambda x, y: pc.xor(y, x), + "and_": lambda x, y: pc.and_kleene(*cast_for_logical(x, y)), + "rand_": lambda x, y: pc.and_kleene(*cast_for_logical(y, x)), + "or_": lambda x, y: pc.or_kleene(*cast_for_logical(x, y)), + "ror_": lambda x, y: pc.or_kleene(*cast_for_logical(y, x)), + "xor": lambda x, y: pc.xor(*cast_for_logical(x, y)), + "rxor": lambda x, y: pc.xor(*cast_for_logical(y, x)), } ARROW_BIT_WISE_FUNCS = { @@ -107,6 +107,20 @@ "rxor": lambda x, y: pc.bit_wise_xor(y, x), } + def convert_string_to_boolean_array(arr): + if pa.types.is_string(arr.type) or pa.types.is_large_string(arr.type): + string_to_bool = [bool(value.as_py()) for value in arr] + arr = pc.cast(string_to_bool, pa.bool_()) + return arr + + def cast_for_logical(x, y): + is_x_bool = pa.types.is_boolean(x.type) + is_y_bool = pa.types.is_boolean(y.type) + + if (is_x_bool ^ is_y_bool): + return convert_string_to_boolean_array(x), convert_string_to_boolean_array(y) + return x, y + def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: @@ -822,6 +836,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: result = pc_func(self._pa_array, other) except pa.ArrowNotImplementedError as err: raise TypeError(self._op_method_error_message(other_original, op)) from err + + if (op.__name__ in ARROW_LOGICAL_FUNCS + and (isinstance(self, pa.lib.BooleanArray) ^ + isinstance(other, pa.lib.BooleanArray)) + ): + return pc.cast(result, pa.bool_()) + return type(self)(result) def _logical_method(self, other, op) -> Self: From bb906bbae8ae781c68f815c62b26841744549adf Mon Sep 17 00:00:00 2001 From: Emma Tong <53305901+etong11@users.noreply.github.com> Date: Sat, 7 Dec 2024 23:19:51 -0500 Subject: [PATCH 4/6] Converting ^ to != for logical xor --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1a89ec28c2de1..62a8e449b2b05 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -117,7 +117,7 @@ def cast_for_logical(x, y): is_x_bool = pa.types.is_boolean(x.type) is_y_bool = pa.types.is_boolean(y.type) - if (is_x_bool ^ is_y_bool): + if (is_x_bool != is_y_bool): return convert_string_to_boolean_array(x), convert_string_to_boolean_array(y) return x, y @@ -838,7 +838,7 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: raise TypeError(self._op_method_error_message(other_original, op)) from err if (op.__name__ in ARROW_LOGICAL_FUNCS - and (isinstance(self, pa.lib.BooleanArray) ^ + and (isinstance(self, pa.lib.BooleanArray) != isinstance(other, pa.lib.BooleanArray)) ): return pc.cast(result, pa.bool_()) From 7188634652e87c8be8e7b983565c6ee0d9970d34 Mon Sep 17 00:00:00 2001 From: Emma Tong <53305901+etong11@users.noreply.github.com> Date: Sun, 8 Dec 2024 00:39:50 -0500 Subject: [PATCH 5/6] Fixed logical op bug for numpy string arrays --- pandas/core/ops/array_ops.py | 7 ++++++ pandas/tests/arrays/string_/test_string.py | 26 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..fd900726ecc47 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -435,6 +435,13 @@ def fill_bool(x, left=None): rvalues = right if should_extension_dispatch(lvalues, rvalues): + # Must cast if logical op between a boolean array and numpy-backed string array + if ((lvalues.dtype == np.bool_ and rvalues.dtype == "string[python]") + or (lvalues.dtype == "string[python]" and rvalues.dtype == np.bool_) + ): + lvalues = lvalues.astype(bool) + rvalues = rvalues.astype(bool) + # Call the method on lvalues res_values = op(lvalues, rvalues) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 70d4a60606c30..63b221d1fea24 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -766,4 +766,28 @@ def test_xor_pyarrow_string(dtype): ser2 = pd.Series(["", "b"], dtype=dtype) result = ser1 ^ ser2 expected = pd.Series([False, True], dtype=bool) - tm.assert_series_equal(result, expected) \ No newline at end of file + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_or_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 | ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_and_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 & ser2 + expected = pd.Series([False, False], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_xor_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 ^ ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) \ No newline at end of file From 2fb6d81bc3a36623e677e43fad44a5e8cb903988 Mon Sep 17 00:00:00 2001 From: Lucas Lin Date: Sun, 8 Dec 2024 22:45:06 -0500 Subject: [PATCH 6/6] Added entry into whatsnew v3.0.0 file --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1d55fc3ed7b84..addaf81e1a3e5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -651,6 +651,8 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in :meth:`Ops.logical_op` not correctly casting numpy-backed string arrays to boolean when used in logical operations with other boolean arrays (:issue:`60234`) +- Bug in :meth:`ArrowExtensionArray._evaluate_op_method` not correctly casting pyarrow-backed string arrays to boolean when used in logical operations with other boolean arrays (:issue:`60234`) Strings ^^^^^^^