BUG: reindex not matching categoricals and new string dtypes (#56106)

phofl · web-flow · commit 726e8e8a61cb · 2023-12-08T17:08:03.000-08:00
* Fix string option tests in indexing

* Update v2.1.4.rst

* Fixup

* Update whatsnew
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -577,6 +577,7 @@ Strings
 ^^^^^^^
 - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`)
 - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
+- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`)
 - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
 - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
 - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -159,7 +159,10 @@
     ExtensionArray,
     TimedeltaArray,
 )
-from pandas.core.arrays.string_ import StringArray
+from pandas.core.arrays.string_ import (
+    StringArray,
+    StringDtype,
+)
 from pandas.core.base import (
     IndexOpsMixin,
     PandasObject,
@@ -5574,6 +5577,14 @@ def equals(self, other: Any) -> bool:
             # quickly return if the lengths are different
             return False
 
+        if (
+            isinstance(self.dtype, StringDtype)
+            and self.dtype.storage == "pyarrow_numpy"
+            and other.dtype != self.dtype
+        ):
+            # compare like object dtype: cast to object, defer to other.equals
+            return other.equals(self.astype(object))
+
         if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
             # if other is not object, use other's logic for coercion
             return other.equals(self)
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -14,6 +16,7 @@
     Series,
     Timedelta,
     Timestamp,
+    option_context,
 )
 import pandas._testing as tm
 
@@ -428,38 +431,42 @@ def test_ix_categorical_index(self):
         expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
         tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
 
-    def test_ix_categorical_index_non_unique(self):
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_ix_categorical_index_non_unique(self, infer_string):
         # non-unique
-        df = DataFrame(
-            np.random.default_rng(2).standard_normal((3, 3)),
-            index=list("ABA"),
-            columns=list("XYX"),
-        )
-        cdf = df.copy()
-        cdf.index = CategoricalIndex(df.index)
-        cdf.columns = CategoricalIndex(df.columns)
-
-        exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
-        expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
-        tm.assert_frame_equal(cdf.loc["A", :], expect)
-
-        exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
-        expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
-        tm.assert_frame_equal(cdf.loc[:, "X"], expect)
-
-        expect = DataFrame(
-            df.loc[["A", "B"], :],
-            columns=cdf.columns,
-            index=CategoricalIndex(list("AAB")),
-        )
-        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
-
-        expect = DataFrame(
-            df.loc[:, ["X", "Y"]],
-            index=cdf.index,
-            columns=CategoricalIndex(list("XXY")),
-        )
-        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
+        with option_context("future.infer_string", infer_string):
+            df = DataFrame(
+                np.random.default_rng(2).standard_normal((3, 3)),
+                index=list("ABA"),
+                columns=list("XYX"),
+            )
+            cdf = df.copy()
+            cdf.index = CategoricalIndex(df.index)
+            cdf.columns = CategoricalIndex(df.columns)
+
+            exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
+            expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
+            tm.assert_frame_equal(cdf.loc["A", :], expect)
+
+            exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
+            expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
+            tm.assert_frame_equal(cdf.loc[:, "X"], expect)
+
+            expect = DataFrame(
+                df.loc[["A", "B"], :],
+                columns=cdf.columns,
+                index=CategoricalIndex(list("AAB")),
+            )
+            tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
+
+            expect = DataFrame(
+                df.loc[:, ["X", "Y"]],
+                index=cdf.index,
+                columns=CategoricalIndex(list("XXY")),
+            )
+            tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
 
     def test_loc_slice(self, df):
         # GH9748