From 01556c211a5e00053b46c90de85b0f2cf9f061db Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 20 Dec 2023 17:52:52 -0500 Subject: [PATCH 1/2] BUG: stack changes NA values in the index --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/reshape.py | 4 ++-- pandas/tests/frame/test_stack_unstack.py | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8c475791df64d..5382e56544907 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -686,6 +686,7 @@ Reshaping - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8c822ec58e011..ac1d57f76f0f2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -953,8 +953,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: index_levels = frame.index.levels index_codes = list(np.tile(frame.index.codes, (1, ratio))) else: - index_levels = [frame.index.unique()] - codes = factorize(frame.index)[0] + codes, index_levels = factorize(frame.index, use_na_sentinel=False) + index_levels = [index_levels] index_codes = list(np.tile(codes, (1, ratio))) if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 554a9d4ce2d5d..35250e227f807 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ 
b/pandas/tests/frame/test_stack_unstack.py @@ -2638,3 +2638,25 @@ def test_stack_tuple_columns(future_stack): ), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + ("float64", np.nan), + ("Float64", np.nan), + ("Float64", pd.NA), + ("Int64", pd.NA), + ], +) +def test_stack_preserves_na(dtype, na_value): + # GH#56573 + index = Index([na_value], dtype=dtype) + df = DataFrame({"a": [1]}, index=index) + result = df.stack(future_stack=True) + + expected = DataFrame( + {"a": Series([na_value], dtype=dtype), "b": ["a"], None: 1} + ).set_index(["a", "b"])[None] + expected.index.names = [None, None] + tm.assert_series_equal(result, expected) From 89fb04fce03ed82fa0de8be93b013b0328d25f36 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 20 Dec 2023 21:35:55 -0500 Subject: [PATCH 2/2] Add test for MultiIndex --- pandas/core/reshape/reshape.py | 4 ++-- pandas/tests/frame/test_stack_unstack.py | 28 +++++++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ac1d57f76f0f2..7a49682d7c57c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -953,8 +953,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: index_levels = frame.index.levels index_codes = list(np.tile(frame.index.codes, (1, ratio))) else: - codes, index_levels = factorize(frame.index, use_na_sentinel=False) - index_levels = [index_levels] + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] index_codes = list(np.tile(codes, (1, ratio))) if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 35250e227f807..6e1e743eb60de 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2649,14 +2649,30 @@ def 
test_stack_tuple_columns(future_stack): ("Int64", pd.NA), ], ) -def test_stack_preserves_na(dtype, na_value): +@pytest.mark.parametrize("test_multiindex", [True, False]) +def test_stack_preserves_na(dtype, na_value, test_multiindex): # GH#56573 - index = Index([na_value], dtype=dtype) + if test_multiindex: + index = MultiIndex.from_arrays(2 * [Index([na_value], dtype=dtype)]) + else: + index = Index([na_value], dtype=dtype) df = DataFrame({"a": [1]}, index=index) result = df.stack(future_stack=True) - expected = DataFrame( - {"a": Series([na_value], dtype=dtype), "b": ["a"], None: 1} - ).set_index(["a", "b"])[None] - expected.index.names = [None, None] + if test_multiindex: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + else: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + expected = Series(1, index=expected_index) tm.assert_series_equal(result, expected)