From 3637554bba73de2654b8f2c8b8c48ab9c6c88491 Mon Sep 17 00:00:00 2001 From: MBrouns Date: Sat, 20 Jun 2020 10:52:29 +0200 Subject: [PATCH 1/4] Add test to verify `align` behaviour on `CategoricalIndex` verify that aligning two dataframes with a `CategoricalIndex` does not change the type of the index. --- pandas/tests/indexing/test_categorical.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 98edb56260b01..fe6b55d9c94e9 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -806,3 +806,13 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) + + def test_align_keeps_categorical_index(self): + # GH-28397 + df_1, df_2 = self.df.copy(), self.df.copy() + + aligned_1, aligned_2 = df_1.align(df_2) + assert isinstance(aligned_1.index, CategoricalIndex) + assert isinstance(aligned_2.index, CategoricalIndex) + tm.assert_index_equal(aligned_1.index, df_1.index) + tm.assert_index_equal(aligned_2.index, df_2.index) From d9894adbc0cd7797ed48dff883ceb64a8a9072c6 Mon Sep 17 00:00:00 2001 From: MBrouns Date: Sat, 20 Jun 2020 17:32:21 +0200 Subject: [PATCH 2/4] use different dataframes in testing index alignment to ensure there's no shortcut path that skips aligning --- pandas/tests/indexing/test_categorical.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index fe6b55d9c94e9..64437b71e7244 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -807,12 +807,11 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - def test_align_keeps_categorical_index(self): + def test_align(self): # GH-28397 - df_1, df_2 = self.df.copy(), self.df.copy() + df_1, df_2 = self.df.copy(), self.df.copy().iloc[::-1] aligned_1, aligned_2 = df_1.align(df_2) assert isinstance(aligned_1.index, CategoricalIndex) assert isinstance(aligned_2.index, CategoricalIndex) - tm.assert_index_equal(aligned_1.index, df_1.index) - tm.assert_index_equal(aligned_2.index, df_2.index) + tm.assert_index_equal(aligned_1.index, aligned_2.index) From ef41dcddd606dc3ec63909a85831b4f84638180e Mon Sep 17 00:00:00 2001 From: MBrouns Date: Sat, 20 Jun 2020 17:46:29 +0200 Subject: [PATCH 3/4] move test to test_align.py --- pandas/tests/frame/methods/test_align.py | 20 ++++++++++++++++++++ pandas/tests/indexing/test_categorical.py | 9 --------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 5dae719283d17..ab6aee4ba9a9d 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -129,6 +129,26 @@ def test_align_mixed_int(self, mixed_int_frame): ) tm.assert_index_equal(bf.index, Index([])) + def test_align_categorical(self): + # GH-28397 + df_1 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(pd.CategoricalDtype(list("cab"))), + } + ).set_index("B") + df_2 = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": Series(list("babca")).astype(pd.CategoricalDtype(list("cab"))), + } + ).set_index("B") + + aligned_1, aligned_2 = df_1.align(df_2) + assert isinstance(aligned_1.index, pd.CategoricalIndex) + assert isinstance(aligned_2.index, pd.CategoricalIndex) + tm.assert_index_equal(aligned_1.index, aligned_2.index) + def test_align_multiindex(self): # GH#10665 # same test cases as test_align_multiindex in test_series.py diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 64437b71e7244..98edb56260b01 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -806,12 +806,3 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - - def test_align(self): - # GH-28397 - df_1, df_2 = self.df.copy(), self.df.copy().iloc[::-1] - - aligned_1, aligned_2 = df_1.align(df_2) - assert isinstance(aligned_1.index, CategoricalIndex) - assert isinstance(aligned_2.index, CategoricalIndex) - tm.assert_index_equal(aligned_1.index, aligned_2.index) From 7f4bf9f009281738d4a184595a1d94e0e8f3ed57 Mon Sep 17 00:00:00 2001 From: MBrouns Date: Fri, 26 Jun 2020 17:39:07 +0200 Subject: [PATCH 4/4] parametrize categorical alignment test on ordered / non-ordered --- pandas/tests/frame/methods/test_align.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index ab6aee4ba9a9d..d19b59debfdea 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -129,24 +129,37 @@ def test_align_mixed_int(self, mixed_int_frame): ) tm.assert_index_equal(bf.index, Index([])) - def test_align_categorical(self): + @pytest.mark.parametrize( + "l_ordered,r_ordered,expected", + [ + [True, True, pd.CategoricalIndex], + [True, False, pd.Index], + [False, True, pd.Index], + [False, False, pd.CategoricalIndex], + ], + ) + def test_align_categorical(self, l_ordered, r_ordered, expected): # GH-28397 df_1 = DataFrame( { "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(pd.CategoricalDtype(list("cab"))), + "B": Series(list("aabbca")).astype( + pd.CategoricalDtype(list("cab"), ordered=l_ordered) + ), } ).set_index("B") df_2 = DataFrame( { "A": np.arange(5, dtype="int64"), - "B": Series(list("babca")).astype(pd.CategoricalDtype(list("cab"))), + "B": Series(list("babca")).astype( + pd.CategoricalDtype(list("cab"), ordered=r_ordered) + ), } ).set_index("B") aligned_1, aligned_2 = df_1.align(df_2) - assert isinstance(aligned_1.index, pd.CategoricalIndex) - assert isinstance(aligned_2.index, pd.CategoricalIndex) + assert isinstance(aligned_1.index, expected) + assert isinstance(aligned_2.index, expected) tm.assert_index_equal(aligned_1.index, aligned_2.index) def test_align_multiindex(self):