From e1e99e751fc4f0a3f6d33925a764c2e22f04b9cd Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 Jan 2021 22:17:05 +0100 Subject: [PATCH 1/3] BUG: stack not recognizing NaNs in MultiIndex levels --- pandas/core/reshape/reshape.py | 17 +++++++---------- pandas/tests/frame/test_stack_unstack.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d389f19598d14..9cdf6a8b0eabf 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -629,16 +629,13 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list( - zip( - *[ - lev.take(level_codes) - for lev, level_codes in zip( - this.columns.levels[:-1], this.columns.codes[:-1] - ) - ] - ) - ) + levs = [] + for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1]): + if -1 in level_codes: + lev = lev.tolist() + lev.append(None) + levs.append(np.take(lev, level_codes)) + tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e8ae9f6584ad6..cd5b12a4b811a 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1931,3 +1931,25 @@ def test_unstack_with_level_has_nan(self): ) tm.assert_index_equal(result, expected) + + def test_stack_nan_in_multiindex_columns(self): + # GH#39481 + df = DataFrame( + np.zeros([1, 5]), + columns=MultiIndex.from_tuples( + [ + (0, None, None), + (0, 2, 0), + (0, 2, 1), + (0, 3, 0), + (0, 3, 1), + ], + ), + ) + result = df.stack(2) + expected = DataFrame( + [[0.0, 0.0, 0.0], [np.nan, 0.0, 0.0], [0.0, 0.0, 0.0]], + index=Index([(0, np.nan), (0, 0), (0, 1)]), + 
columns=Index([(0, np.nan), (0, 2), (0, 3)]), + ) + tm.assert_frame_equal(result, expected) From e1df91799b1977be75f63cb13db29083fc27ed8d Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 Jan 2021 21:05:02 +0100 Subject: [PATCH 2/3] BUG: DataFrame.stack not handling NaN in MultiIndex columns correctly --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/reshape/reshape.py | 6 ++++-- pandas/tests/frame/test_stack_unstack.py | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9b1819a7d4d9f..41403b40c295b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -410,7 +410,7 @@ Reshaping - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) -- +- Bug in :meth:`DataFrame.stack` not handling ``NaN`` in :class:`MultiIndex` columns correctly (:issue:`39481`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9cdf6a8b0eabf..c52a4b088a711 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -647,7 +647,9 @@ def _convert_level_number(level_num, columns): new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) - level_vals_used = level_vals[level_codes] + level_vals_nan = level_vals.insert(len(level_vals), None) + + level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] for key in unique_groups: 
@@ -668,7 +670,7 @@ def _convert_level_number(level_num, columns): if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.codes[-1]) + chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index cd5b12a4b811a..81e10d276e79c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1948,8 +1948,8 @@ def test_stack_nan_in_multiindex_columns(self): ) result = df.stack(2) expected = DataFrame( - [[0.0, 0.0, 0.0], [np.nan, 0.0, 0.0], [0.0, 0.0, 0.0]], - index=Index([(0, np.nan), (0, 0), (0, 1)]), - columns=Index([(0, np.nan), (0, 2), (0, 3)]), + [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], + index=Index([(0, None), (0, 0), (0, 1)]), + columns=Index([(0, None), (0, 2), (0, 3)]), ) tm.assert_frame_equal(result, expected) From 37702ae0e73185a107d10f64c23908ba77aba423 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 Jan 2021 21:13:37 +0100 Subject: [PATCH 3/3] Simplify --- pandas/core/reshape/reshape.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c52a4b088a711..abdc6ac9dfcbe 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -632,8 +632,7 @@ def _convert_level_number(level_num, columns): levs = [] for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1]): if -1 in level_codes: - lev = lev.tolist() - lev.append(None) + lev = np.append(lev, None) levs.append(np.take(lev, level_codes)) tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)]