Skip to content

Commit 726e8e8

Browse files
authored
BUG: reindex not matching categoricals and new string dtypes (#56106)
* Fix string option tests in indexing
* Update v2.1.4.rst
* Fixup
* Update whatsnew
1 parent aa7b17e commit 726e8e8

File tree

3 files changed: 51 additions (+51) and 32 deletions (-32)

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ Strings
577577
^^^^^^^
578578
- Bug in :func:`pandas.api.types.is_string_dtype` when checking whether an object array with no elements is of the string dtype (:issue:`54661`)
579579
- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
580+
- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`)
580581
- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
581582
- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
582583
- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)

pandas/core/indexes/base.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,10 @@
159159
ExtensionArray,
160160
TimedeltaArray,
161161
)
162-
from pandas.core.arrays.string_ import StringArray
162+
from pandas.core.arrays.string_ import (
163+
StringArray,
164+
StringDtype,
165+
)
163166
from pandas.core.base import (
164167
IndexOpsMixin,
165168
PandasObject,
@@ -5574,6 +5577,14 @@ def equals(self, other: Any) -> bool:
55745577
# quickly return if the lengths are different
55755578
return False
55765579

5580+
if (
5581+
isinstance(self.dtype, StringDtype)
5582+
and self.dtype.storage == "pyarrow_numpy"
5583+
and other.dtype != self.dtype
5584+
):
5585+
# special case for object behavior
5586+
return other.equals(self.astype(object))
5587+
55775588
if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
55785589
# if other is not object, use other's logic for coercion
55795590
return other.equals(self)

pandas/tests/indexing/test_categorical.py

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
import pandas.util._test_decorators as td
7+
68
import pandas as pd
79
from pandas import (
810
Categorical,
@@ -14,6 +16,7 @@
1416
Series,
1517
Timedelta,
1618
Timestamp,
19+
option_context,
1720
)
1821
import pandas._testing as tm
1922

@@ -428,38 +431,42 @@ def test_ix_categorical_index(self):
428431
expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
429432
tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
430433

431-
def test_ix_categorical_index_non_unique(self):
434+
@pytest.mark.parametrize(
435+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
436+
)
437+
def test_ix_categorical_index_non_unique(self, infer_string):
432438
# non-unique
433-
df = DataFrame(
434-
np.random.default_rng(2).standard_normal((3, 3)),
435-
index=list("ABA"),
436-
columns=list("XYX"),
437-
)
438-
cdf = df.copy()
439-
cdf.index = CategoricalIndex(df.index)
440-
cdf.columns = CategoricalIndex(df.columns)
441-
442-
exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
443-
expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
444-
tm.assert_frame_equal(cdf.loc["A", :], expect)
445-
446-
exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
447-
expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
448-
tm.assert_frame_equal(cdf.loc[:, "X"], expect)
449-
450-
expect = DataFrame(
451-
df.loc[["A", "B"], :],
452-
columns=cdf.columns,
453-
index=CategoricalIndex(list("AAB")),
454-
)
455-
tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
456-
457-
expect = DataFrame(
458-
df.loc[:, ["X", "Y"]],
459-
index=cdf.index,
460-
columns=CategoricalIndex(list("XXY")),
461-
)
462-
tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
439+
with option_context("future.infer_string", infer_string):
440+
df = DataFrame(
441+
np.random.default_rng(2).standard_normal((3, 3)),
442+
index=list("ABA"),
443+
columns=list("XYX"),
444+
)
445+
cdf = df.copy()
446+
cdf.index = CategoricalIndex(df.index)
447+
cdf.columns = CategoricalIndex(df.columns)
448+
449+
exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
450+
expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
451+
tm.assert_frame_equal(cdf.loc["A", :], expect)
452+
453+
exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
454+
expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
455+
tm.assert_frame_equal(cdf.loc[:, "X"], expect)
456+
457+
expect = DataFrame(
458+
df.loc[["A", "B"], :],
459+
columns=cdf.columns,
460+
index=CategoricalIndex(list("AAB")),
461+
)
462+
tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
463+
464+
expect = DataFrame(
465+
df.loc[:, ["X", "Y"]],
466+
index=cdf.index,
467+
columns=CategoricalIndex(list("XXY")),
468+
)
469+
tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
463470

464471
def test_loc_slice(self, df):
465472
# GH9748

0 commit comments

Comments
 (0)