diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6abe70e56b9b9..b946e4e93d7f4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -413,6 +413,7 @@ Reshaping - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) +- Bug in :meth:`DataFrame.stack` not handling ``NaN`` in :class:`MultiIndex` columns correct (:issue:`39481`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d389f19598d14..abdc6ac9dfcbe 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -629,16 +629,12 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list( - zip( - *[ - lev.take(level_codes) - for lev, level_codes in zip( - this.columns.levels[:-1], this.columns.codes[:-1] - ) - ] - ) - ) + levs = [] + for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1]): + if -1 in level_codes: + lev = np.append(lev, None) + levs.append(np.take(lev, level_codes)) + tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -650,7 +646,9 @@ def _convert_level_number(level_num, columns): new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) - level_vals_used = level_vals[level_codes] + level_vals_nan = level_vals.insert(len(level_vals), None) + + level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] for key in unique_groups: @@ -671,7 +669,7 @@ def _convert_level_number(level_num, columns): if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.codes[-1]) + chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e8ae9f6584ad6..81e10d276e79c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1931,3 +1931,25 @@ def test_unstack_with_level_has_nan(self): ) tm.assert_index_equal(result, expected) + + def test_stack_nan_in_multiindex_columns(self): + # GH#39481 + df = DataFrame( + np.zeros([1, 5]), + columns=MultiIndex.from_tuples( + [ + (0, None, None), + (0, 2, 0), + (0, 2, 1), + (0, 3, 0), + (0, 3, 1), + ], + ), + ) + result = df.stack(2) + expected = DataFrame( + [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], + index=Index([(0, None), (0, 0), (0, 1)]), + columns=Index([(0, None), (0, 2), (0, 3)]), + ) + tm.assert_frame_equal(result, expected)