From 7958d6c73402ebf995c337fba7cfeadb18ec79e9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:09:02 -0800 Subject: [PATCH] Backport PR #56013 on branch 2.3.x (BUG: get_indexer round-tripping through string dtype) (#60339) Backport PR #56013: BUG: get_indexer round-tripping through string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/indexes/base.py | 11 ++++++++++- pandas/tests/indexes/object/test_indexing.py | 9 +++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3e699e1a27b55..473d67acf6e74 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -119,7 +119,7 @@ Interval Indexing ^^^^^^^^ -- +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5da327a82c02b..4896fb0ad1cd2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6695,7 +6695,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. 
""" - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index d3df349027c00..42ef7e7a96f5e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture):