From 1b9051911c9e56caa9cd53d37abc9023db652d81 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 28 Feb 2021 12:50:45 -0800 Subject: [PATCH 01/16] BUG: Set dtypes of new columns when stacking (#36991) --- pandas/core/reshape/reshape.py | 4 ++++ pandas/tests/frame/test_stack_unstack.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 271bb2ca8dd75..c196b0912dbe7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -645,6 +645,10 @@ def _convert_level_number(level_num, columns): unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) + new_columns = new_columns.set_levels([ + new_columns.levels[i].astype(this.columns.levels[i].dtype) + for i in range(0, len(new_columns.levels)) + ]) else: new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) unique_groups = new_columns diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9945b739f8a87..5d56e2eee951b 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1052,6 +1052,23 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize("labels", [list("yxz"), list("yzx")]) + @pytest.mark.parametrize("labels2", [list("uv"), list("vu")]) + def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) + sorted_cidx = pd.CategoricalIndex(list("xyz"), categories=list("xyz"), ordered=ordered) + sorted_cidx2 = pd.CategoricalIndex(list("uv"), categories=list("uv"), ordered=ordered) + + midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=['a', 'b', 'c']) + df = pd.DataFrame(np.random.randn(5, midx.size), columns=midx) + result = df.stack(['a', 'b']) + + expected = MultiIndex.from_product([df.index, sorted_cidx, sorted_cidx2], names=[None, 'a', 'b']) + + tm.assert_equal(result.index, expected) + def test_stack_preserve_categorical_dtype_values(self): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) From f7093303224eaf2d06abd70ea683d5f59257d97d Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 28 Feb 2021 12:56:59 -0800 Subject: [PATCH 02/16] Fix PEP8 issues --- pandas/tests/frame/test_stack_unstack.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 5d56e2eee951b..f42742a77d5ec 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1057,15 +1057,18 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): @pytest.mark.parametrize("labels2", [list("uv"), list("vu")]) def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) - cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) - sorted_cidx = pd.CategoricalIndex(list("xyz"), categories=list("xyz"), ordered=ordered) - sorted_cidx2 = pd.CategoricalIndex(list("uv"), categories=list("uv"), ordered=ordered) + cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) + sorted_cidx = pd.CategoricalIndex( + list("xyz"), categories=list("xyz"), ordered=ordered) + sorted_cidx2 = pd.CategoricalIndex( + list("uv"), categories=list("uv"), ordered=ordered) midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=['a', 'b', 'c']) df = pd.DataFrame(np.random.randn(5, midx.size), columns=midx) result = df.stack(['a', 'b']) - expected = MultiIndex.from_product([df.index, sorted_cidx, sorted_cidx2], names=[None, 'a', 'b']) + expected = MultiIndex.from_product( + [df.index, sorted_cidx, sorted_cidx2], names=[None, 'a', 'b']) tm.assert_equal(result.index, expected) From 23ff7ba262bfa3f4fbca3fda2e19765fd6e4be84 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 28 Feb 2021 13:05:13 -0800 Subject: [PATCH 03/16] Fix pre-commit issues and add GH comment --- pandas/tests/frame/test_stack_unstack.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index f42742a77d5ec..3c27b9098df76 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1056,6 +1056,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): @pytest.mark.parametrize("labels", [list("yxz"), list("yzx")]) @pytest.mark.parametrize("labels2", [list("uv"), list("vu")]) def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): + # GH-36991 cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) sorted_cidx = pd.CategoricalIndex( @@ -1063,12 +1064,12 @@ def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): sorted_cidx2 = pd.CategoricalIndex( list("uv"), categories=list("uv"), ordered=ordered) - midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=['a', 'b', 'c']) + midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=list("abc")) df = pd.DataFrame(np.random.randn(5, midx.size), columns=midx) - result = df.stack(['a', 'b']) + result = df.stack(["a", "b"]) expected = MultiIndex.from_product( - [df.index, sorted_cidx, sorted_cidx2], names=[None, 'a', 'b']) + [df.index, sorted_cidx, sorted_cidx2], names=[None, "a", "b"]) tm.assert_equal(result.index, expected) From 9cb205519ab40ca23702eb3e271b1a451542ca20 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 28 Feb 2021 13:42:50 -0800 Subject: [PATCH 04/16] Use MultiIndex.from_arrays --- pandas/core/reshape/reshape.py | 10 ++++------ pandas/tests/frame/test_stack_unstack.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c196b0912dbe7..a442f508e35fc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -643,12 +643,10 @@ def _convert_level_number(level_num, columns): levs.append(np.take(lev, level_codes)) tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)] - new_names = this.columns.names[:-1] - new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) - new_columns = new_columns.set_levels([ - new_columns.levels[i].astype(this.columns.levels[i].dtype) - for i in range(0, len(new_columns.levels)) - ]) + new_columns = MultiIndex.from_arrays([ + Index(new_level, dtype=level.dtype) if None not in new_level else new_level + for new_level, level in zip(zip(*unique_groups), this.columns.levels) + ], names=this.columns.names[:-1]) else: new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) unique_groups = new_columns diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 3c27b9098df76..9e3d593e52ed7 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1065,7 +1065,7 @@ def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): list("uv"), categories=list("uv"), ordered=ordered) midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=list("abc")) - df = pd.DataFrame(np.random.randn(5, midx.size), columns=midx) + df = DataFrame(np.random.randn(5, midx.size), columns=midx) result = df.stack(["a", "b"]) expected = MultiIndex.from_product( From 75cb56cb534a0be1eb56c0457eed2a8e301ee01b Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 28 Feb 2021 13:55:32 -0800 Subject: [PATCH 05/16] Reformat --- pandas/core/reshape/reshape.py | 13 +++++++++---- pandas/tests/frame/test_stack_unstack.py | 9 ++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a442f508e35fc..187dc3153b909 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -643,10 +643,15 @@ def _convert_level_number(level_num, columns): levs.append(np.take(lev, level_codes)) tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)] - new_columns = MultiIndex.from_arrays([ - Index(new_level, dtype=level.dtype) if None not in new_level else new_level - for new_level, level in zip(zip(*unique_groups), this.columns.levels) - ], names=this.columns.names[:-1]) + new_columns = MultiIndex.from_arrays( + [ + Index(new_level, dtype=level.dtype) + if None not in new_level + else new_level + for new_level, level in zip(zip(*unique_groups), this.columns.levels) + ], + names=this.columns.names[:-1], + ) else: new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) unique_groups = new_columns diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9e3d593e52ed7..d0fcc2930749f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1060,16 +1060,19 @@ def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) sorted_cidx = pd.CategoricalIndex( - list("xyz"), categories=list("xyz"), ordered=ordered) + list("xyz"), categories=list("xyz"), ordered=ordered + ) sorted_cidx2 = pd.CategoricalIndex( - list("uv"), categories=list("uv"), ordered=ordered) + list("uv"), categories=list("uv"), ordered=ordered + ) midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=list("abc")) df = DataFrame(np.random.randn(5, midx.size), columns=midx) result = df.stack(["a", "b"]) expected = MultiIndex.from_product( - [df.index, sorted_cidx, sorted_cidx2], names=[None, "a", "b"]) + [df.index, sorted_cidx, sorted_cidx2], names=[None, "a", "b"] + ) tm.assert_equal(result.index, expected) From 61164b2deadbff5d37a7126835955ad9d7d9b243 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 1 Mar 2021 17:15:15 -0800 Subject: [PATCH 06/16] Remove unnecessary list() --- pandas/core/reshape/reshape.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 187dc3153b909..9c7434579b4a3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -641,8 +641,7 @@ def _convert_level_number(level_num, columns): if -1 in level_codes: lev = np.append(lev, None) levs.append(np.take(lev, level_codes)) - tuples = list(zip(*levs)) - unique_groups = [key for key, _ in itertools.groupby(tuples)] + unique_groups = [key for key, _ in itertools.groupby(zip(*levs))] new_columns = MultiIndex.from_arrays( [ Index(new_level, dtype=level.dtype) From 611841a69df6fd463b2df4da646b83cf568ba40f Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 1 Mar 2021 17:27:10 -0800 Subject: [PATCH 07/16] Fix mypy error --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9c7434579b4a3..a439aa0395d60 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -653,7 +653,7 @@ def _convert_level_number(level_num, columns): ) else: new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) - unique_groups = new_columns + unique_groups = list(new_columns) # time to ravel the values new_data = {} From d8e7517785cdaec625aa665e38fb7bb369bd1325 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 1 Mar 2021 17:27:50 -0800 Subject: [PATCH 08/16] Add whatsnew entry --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6878227f6ae9c..5283a4fa2025d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -532,6 +532,7 @@ Reshaping - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) - Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on and empty ``DataFrame`` (:issue:`13483`) +- Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`) Sparse ^^^^^^ From 8016c7f7b73e65c995df2fa2e0beb12453b1301e Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Mar 2021 22:28:11 -0800 Subject: [PATCH 09/16] Rewrite test such that it compares the series --- pandas/tests/frame/test_stack_unstack.py | 32 ++++++++++-------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index d0fcc2930749f..e17b251db869a 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1053,28 +1053,22 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yzx")]) - @pytest.mark.parametrize("labels2", [list("uv"), list("vu")]) - def test_multi_stack_preserve_categorical_dtype(self, ordered, labels, labels2): + @pytest.mark.parametrize("labels,data", [ + (list("xyz"), [10, 11, 12, 13, 14, 15]), + (list("zyx"), [14, 15, 12, 13, 10, 11]), + ]) + def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): # GH-36991 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) - cidx2 = pd.CategoricalIndex(labels2, categories=list("uv"), ordered=ordered) - sorted_cidx = pd.CategoricalIndex( - list("xyz"), categories=list("xyz"), ordered=ordered - ) - sorted_cidx2 = pd.CategoricalIndex( - list("uv"), categories=list("uv"), ordered=ordered - ) + cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered) + cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered) + midx = MultiIndex.from_product([cidx, cidx2]) + df = DataFrame([sorted(data)], columns=midx) + result = df.stack([0, 1]) - midx = MultiIndex.from_product([cidx, cidx2, [1, 2, 3]], names=list("abc")) - df = DataFrame(np.random.randn(5, midx.size), columns=midx) - result = df.stack(["a", "b"]) + s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) + expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) - expected = MultiIndex.from_product( - [df.index, sorted_cidx, sorted_cidx2], names=[None, "a", "b"] - ) - - tm.assert_equal(result.index, expected) + tm.assert_series_equal(result, expected) def test_stack_preserve_categorical_dtype_values(self): # GH-23077 From ce161c774878b775d16e9c9f4d1922e42d27a670 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Mar 2021 22:38:33 -0800 Subject: [PATCH 10/16] Refactor such that unique_groups is no longer needed --- pandas/core/reshape/reshape.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a439aa0395d60..c529b74358d6a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -641,19 +641,18 @@ def _convert_level_number(level_num, columns): if -1 in level_codes: lev = np.append(lev, None) levs.append(np.take(lev, level_codes)) - unique_groups = [key for key, _ in itertools.groupby(zip(*levs))] + new_levels = zip(*(key for key, _ in itertools.groupby(zip(*levs)))) new_columns = MultiIndex.from_arrays( [ Index(new_level, dtype=level.dtype) if None not in new_level else new_level - for new_level, level in zip(zip(*unique_groups), this.columns.levels) + for new_level, level in zip(new_levels, this.columns.levels) ], names=this.columns.names[:-1], ) else: new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) - unique_groups = list(new_columns) # time to ravel the values new_data = {} @@ -664,7 +663,7 @@ def _convert_level_number(level_num, columns): level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] - for key in unique_groups: + for key in new_columns: try: loc = this.columns.get_loc(key) except KeyError: From bc69788b5e773d5a415b2c94a3eed1ce9fc310b8 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Mar 2021 23:07:21 -0800 Subject: [PATCH 11/16] Extract method for stacking the column index --- pandas/core/reshape/reshape.py | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c529b74358d6a..f7d1005127123 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -600,6 +600,24 @@ def stack_multiple(frame, level, dropna=True): return result +def _stack_multi_column_index(columns): + if len(columns.levels) <= 2: + return columns.levels[0]._rename(name=columns.names[0]) + levs = [] + for lev, level_codes in zip(columns.levels[:-1], columns.codes[:-1]): + if -1 in level_codes: + lev = np.append(lev, None) + levs.append(np.take(lev, level_codes)) + dedupe_levs = zip(*(key for key, _ in itertools.groupby(zip(*levs)))) + return MultiIndex.from_arrays( + [ + Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev + for new_lev, lev in zip(dedupe_levs, columns.levels) + ], + names=columns.names[:-1], + ) + + def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ @@ -634,25 +652,7 @@ def _convert_level_number(level_num, columns): level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) - # tuple list excluding level for grouping columns - if len(frame.columns.levels) > 2: - levs = [] - for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1]): - if -1 in level_codes: - lev = np.append(lev, None) - levs.append(np.take(lev, level_codes)) - new_levels = zip(*(key for key, _ in itertools.groupby(zip(*levs)))) - new_columns = MultiIndex.from_arrays( - [ - Index(new_level, dtype=level.dtype) - if None not in new_level - else new_level - for new_level, level in zip(new_levels, this.columns.levels) - ], - names=this.columns.names[:-1], - ) - else: - new_columns = this.columns.levels[0]._rename(name=this.columns.names[0]) + new_columns = _stack_multi_column_index(this.columns) # time to ravel the values new_data = {} From 379bde50a9c3dcc24794ee8d76205fe353c1da35 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Mar 2021 23:17:42 -0800 Subject: [PATCH 12/16] Reformat --- pandas/tests/frame/test_stack_unstack.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e17b251db869a..ab493076241fc 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1053,10 +1053,13 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels,data", [ - (list("xyz"), [10, 11, 12, 13, 14, 15]), - (list("zyx"), [14, 15, 12, 13, 10, 11]), - ]) + @pytest.mark.parametrize( + "labels,data", + [ + (list("xyz"), [10, 11, 12, 13, 14, 15]), + (list("zyx"), [14, 15, 12, 13, 10, 11]), + ], + ) def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): # GH-36991 cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered) From 17e841d40565f7be67153f8351e0b51f7fe3bb4a Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 3 Mar 2021 12:38:26 -0800 Subject: [PATCH 13/16] Separate complex expression and add a comment --- pandas/core/reshape/reshape.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f7d1005127123..11f0a0205559d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -608,11 +608,14 @@ def _stack_multi_column_index(columns): if -1 in level_codes: lev = np.append(lev, None) levs.append(np.take(lev, level_codes)) - dedupe_levs = zip(*(key for key, _ in itertools.groupby(zip(*levs)))) + # Remove duplicate tuples in the MultiIndex. + tuples = zip(*levs) + unique_tuples = (key for key, _ in itertools.groupby(tuples)) + new_levs = zip(*unique_tuples) return MultiIndex.from_arrays( [ Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev - for new_lev, lev in zip(dedupe_levs, columns.levels) + for new_lev, lev in zip(new_levs, columns.levels) ], names=columns.names[:-1], ) From 9a7b29eb08839f44e35cda68ce8922f9576ef83f Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 6 Mar 2021 12:23:04 -0800 Subject: [PATCH 14/16] Add function annotation and explanatory comments --- pandas/core/reshape/reshape.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 11f0a0205559d..b714ed10a6b01 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -601,6 +601,7 @@ def stack_multiple(frame, level, dropna=True): def _stack_multi_column_index(columns): + """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) levs = [] @@ -612,8 +613,11 @@ def _stack_multi_column_index(columns): tuples = zip(*levs) unique_tuples = (key for key, _ in itertools.groupby(tuples)) new_levs = zip(*unique_tuples) + # The dtype of each level must be explicitly set to avoid inferring the wrong type. + # See GH-36991. return MultiIndex.from_arrays( [ + # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev for new_lev, lev in zip(new_levs, columns.levels) ], From c09697b8fa1ac81b387c4d5902b2eb8fc02c462f Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 8 Mar 2021 20:27:05 -0800 Subject: [PATCH 15/16] Rewrite loop as list comprehension --- pandas/core/reshape/reshape.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b714ed10a6b01..622f7244228e4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -604,15 +604,17 @@ def _stack_multi_column_index(columns): """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) - levs = [] - for lev, level_codes in zip(columns.levels[:-1], columns.codes[:-1]): - if -1 in level_codes: - lev = np.append(lev, None) - levs.append(np.take(lev, level_codes)) + + levs = [ + [lev[c] if c >= 0 else None for c in codes] + for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) + ] + # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) unique_tuples = (key for key, _ in itertools.groupby(tuples)) new_levs = zip(*unique_tuples) + # The dtype of each level must be explicitly set to avoid inferring the wrong type. # See GH-36991. return MultiIndex.from_arrays( From c6ab29140d9471e9d3157b68badf71a2b5d9d9b1 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 9 Mar 2021 17:27:39 -0800 Subject: [PATCH 16/16] Add typing to _stack_multi_column_index --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 622f7244228e4..ff6ba3f8f4164 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -600,7 +600,7 @@ def stack_multiple(frame, level, dropna=True): return result -def _stack_multi_column_index(columns): +def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0])