From 497dcbb34e9a1e9092b0cd4f31364907e354deea Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Fri, 8 Jan 2021 23:43:26 +0000 Subject: [PATCH 1/3] BUG: reindexing empty CategoricalIndex would fail if target had duplicates (#38906) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/indexes/base.py | 10 +++++++--- pandas/tests/frame/indexing/test_categorical.py | 2 +- pandas/tests/indexing/test_categorical.py | 16 ++++++++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1d76c9d296255..f5ac66ea2dcb8 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -192,6 +192,7 @@ Categorical - Bug in ``CategoricalIndex.reindex`` failed when ``Index`` passed with elements all in category (:issue:`28690`) - Bug where constructing a :class:`Categorical` from an object-dtype array of ``date`` objects did not round-trip correctly with ``astype`` (:issue:`38552`) - Bug in constructing a :class:`DataFrame` from an ``ndarray`` and a :class:`CategoricalDtype` (:issue:`38857`) +- Bug where :class:`DataFrame` axes with an empty :class:`CategoricalIndex` could not be reindexed if the target contained duplicates (:issue:`38906`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 92a4d0a125195..d52a524c7309b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3745,9 +3745,13 @@ def _reindex_non_unique(self, target): # need to retake to have the same size as the indexer indexer[~check] = -1 - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 + if len(self): + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + else: + # GH#38906 + new_indexer = np.arange(0) if isinstance(self, ABCMultiIndex): new_index = type(self).from_tuples(new_labels, names=self.names) diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 6137cadc93125..52dae82c983bc 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -385,4 +385,4 @@ def test_loc_indexing_preserves_index_category_dtype(self): tm.assert_index_equal(result, expected) result = df.loc[["a"]].index.levels[0] - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b9b6452b2e33..13d952d7ff37a 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -533,3 +533,19 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) + + def test_reindex_empty(self): + df = DataFrame(columns=CategoricalIndex([]), index=["K"], dtype="f8") + + # No duplicates + cat_idx = CategoricalIndex(["A", "B"]) + result = df.reindex(columns=cat_idx) + expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") + tm.assert_frame_equal(result, expected) + + # Duplicates + # GH#38906 + cat_idx = CategoricalIndex(["A", "A"]) + result = df.reindex(columns=cat_idx) + expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") + tm.assert_frame_equal(result, expected) \ No newline at end of file From 72671513dd651b210f5fd9e9eb9af2492c9c162a Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Sat, 9 Jan 2021 12:21:22 +0000 Subject: [PATCH 2/3] Update for review --- doc/source/whatsnew/v1.3.0.rst | 2 +- .../tests/frame/indexing/test_categorical.py | 2 +- pandas/tests/indexing/test_categorical.py | 20 +++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f5ac66ea2dcb8..d90ccd57be16a 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -192,7 +192,7 @@ Categorical - Bug in ``CategoricalIndex.reindex`` failed when ``Index`` passed with elements all in category (:issue:`28690`) - Bug where constructing a :class:`Categorical` from an object-dtype array of ``date`` objects did not round-trip correctly with ``astype`` (:issue:`38552`) - Bug in constructing a :class:`DataFrame` from an ``ndarray`` and a :class:`CategoricalDtype` (:issue:`38857`) -- Bug where :class:`DataFrame` axes with an empty :class:`CategoricalIndex` could not be reindexed if the target contained duplicates (:issue:`38906`) +- Bug in :meth:`DataFrame.reindex` was throwing ``IndexError`` when new index contained duplicates and old index was :class:`CategoricalIndex` (:issue:`38906`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 52dae82c983bc..6137cadc93125 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -385,4 +385,4 @@ def test_loc_indexing_preserves_index_category_dtype(self): tm.assert_index_equal(result, expected) result = df.loc[["a"]].index.levels[0] - tm.assert_index_equal(result, expected) \ No newline at end of file + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 13d952d7ff37a..90aa65d647171 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -534,18 +534,18 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - def test_reindex_empty(self): + @pytest.mark.parametrize( + "cat_idx", + [ + # No duplicates + CategoricalIndex(["A", "B"]), + # Duplicates: GH#38906 + CategoricalIndex(["A", "A"]), + ], + ) + def test_reindex_empty(self, cat_idx): df = DataFrame(columns=CategoricalIndex([]), index=["K"], dtype="f8") - # No duplicates - cat_idx = CategoricalIndex(["A", "B"]) result = df.reindex(columns=cat_idx) expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") tm.assert_frame_equal(result, expected) - - # Duplicates - # GH#38906 - cat_idx = CategoricalIndex(["A", "A"]) - result = df.reindex(columns=cat_idx) - expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") - tm.assert_frame_equal(result, expected) \ No newline at end of file From b6aece72bc9551d9a3c643bda922842716d9ecc4 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Sun, 10 Jan 2021 00:36:28 +0000 Subject: [PATCH 3/3] Reviewer comments --- pandas/core/indexes/base.py | 17 ++++++------ pandas/tests/frame/indexing/test_indexing.py | 28 ++++++++++++++++++++ pandas/tests/indexing/test_categorical.py | 16 ----------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d52a524c7309b..a2e9737f305ba 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3730,8 +3730,13 @@ def _reindex_non_unique(self, target): new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels + # GH#38906 + if not len(self): + + new_indexer = np.arange(0) + # a unique indexer - if target.is_unique: + elif target.is_unique: # see GH5553, make sure we use the right indexer new_indexer = np.arange(len(indexer)) @@ -3745,13 +3750,9 @@ def _reindex_non_unique(self, target): # need to retake to have the same size as the indexer indexer[~check] = -1 - if len(self): - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 - else: - # GH#38906 - new_indexer = np.arange(0) + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 if isinstance(self, ABCMultiIndex): new_index = type(self).from_tuples(new_labels, names=self.names) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 4cbdf61ff8dae..2b80e06861a00 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1717,6 +1717,34 @@ def test_setitem(self, uint64_frame): ) +@pytest.mark.parametrize( + "src_idx", + [ + Index([]), + pd.CategoricalIndex([]), + ], +) +@pytest.mark.parametrize( + "cat_idx", + [ + # No duplicates + Index([]), + pd.CategoricalIndex([]), + Index(["A", "B"]), + pd.CategoricalIndex(["A", "B"]), + # Duplicates: GH#38906 + Index(["A", "A"]), + pd.CategoricalIndex(["A", "A"]), + ], +) +def test_reindex_empty(src_idx, cat_idx): + df = DataFrame(columns=src_idx, index=["K"], dtype="f8") + + result = df.reindex(columns=cat_idx) + expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") + tm.assert_frame_equal(result, expected) + + def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack df = DataFrame( diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 90aa65d647171..1b9b6452b2e33 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -533,19 +533,3 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "cat_idx", - [ - # No duplicates - CategoricalIndex(["A", "B"]), - # Duplicates: GH#38906 - CategoricalIndex(["A", "A"]), - ], - ) - def test_reindex_empty(self, cat_idx): - df = DataFrame(columns=CategoricalIndex([]), index=["K"], dtype="f8") - - result = df.reindex(columns=cat_idx) - expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") - tm.assert_frame_equal(result, expected)