diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 25ba237e8caf3..a6cfcd4614984 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -438,7 +438,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: ) pd.melt(cheese, id_vars=["first", "last"]) - cheese.set_index(["first", "last"]).stack() # alternative way + cheese.set_index(["first", "last"]).stack(future_stack=True) # alternative way For more details and examples see :ref:`the reshaping documentation `. diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index cb3c4ab3de658..51168f74c2657 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -579,7 +579,7 @@ columns: .. ipython:: python - stacked = df2.stack() + stacked = df2.stack(future_stack=True) stacked With a "stacked" DataFrame or Series (having a :class:`MultiIndex` as the diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 041061f32db3f..66ee571d6b5a5 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -311,7 +311,7 @@ The :ref:`multindexing ` docs. df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns]) df # Now stack & Reset - df = df.stack(0).reset_index(1) + df = df.stack(0, future_stack=True).reset_index(1) df # And fix the labels (Notice the label 'level_1' got added automatically) df.columns = ["Sample", "All_X", "All_Y"] @@ -688,7 +688,7 @@ The :ref:`Pivot ` docs. 
aggfunc="sum", margins=True, ) - table.stack("City") + table.stack("City", future_stack=True) `Frequency table like plyr in R `__ diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 482e3fe91ca09..75c816f66d5e4 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1713,4 +1713,4 @@ column index name will be used as the name of the inserted column: result - result.stack() + result.stack(future_stack=True) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 4df6996c4f66b..1e73b7672782e 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -127,7 +127,7 @@ stacked level becomes the new lowest level in a :class:`MultiIndex` on the colum .. ipython:: python - stacked = df2.stack() + stacked = df2.stack(future_stack=True) stacked With a "stacked" :class:`DataFrame` or :class:`Series` (having a :class:`MultiIndex` as the @@ -163,7 +163,7 @@ will result in a **sorted** copy of the original :class:`DataFrame` or :class:`S index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]]) df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"]) df - all(df.unstack().stack() == df.sort_index()) + all(df.unstack().stack(future_stack=True) == df.sort_index()) The above code will raise a ``TypeError`` if the call to :meth:`~DataFrame.sort_index` is removed. @@ -191,16 +191,16 @@ processed individually. df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df - df.stack(level=["animal", "hair_length"]) + df.stack(level=["animal", "hair_length"], future_stack=True) The list of levels can contain either level names or level numbers (but not a mixture of the two). .. 
ipython:: python - # df.stack(level=['animal', 'hair_length']) + # df.stack(level=['animal', 'hair_length'], future_stack=True) # from above is equivalent to: - df.stack(level=[1, 2]) + df.stack(level=[1, 2], future_stack=True) Missing data ~~~~~~~~~~~~ @@ -233,8 +233,8 @@ which level in the columns to stack: .. ipython:: python - df2.stack("exp") - df2.stack("animal") + df2.stack("exp", future_stack=True) + df2.stack("animal", future_stack=True) Unstacking can result in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default @@ -345,12 +345,12 @@ some very expressive and fast data manipulations. .. ipython:: python df - df.stack().mean(1).unstack() + df.stack(future_stack=True).mean(1).unstack() # same result, another way df.T.groupby(level=1).mean() - df.stack().groupby(level=1).mean() + df.stack(future_stack=True).groupby(level=1).mean() df.mean().unstack(0) @@ -460,7 +460,7 @@ as having a multi-level index: .. ipython:: python - table.stack() + table.stack(future_stack=True) .. _reshaping.crosstabulations: diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c50e031c815a6..6b7554c694f80 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -78,7 +78,7 @@ Copy-on-Write improvements - DataFrame.fillna / Series.fillna - DataFrame.replace / Series.replace -.. _whatsnew_210.enhancements.enhancement2: +.. _whatsnew_210.enhancements.map_na_action: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -128,6 +128,45 @@ Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set This has been deprecated and will :meth:`Categorical.map` in the future change the default to ``na_action=None``, like for all the other array types. +.. 
_whatsnew_210.enhancements.new_stack: + +New implementation of :meth:`DataFrame.stack` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas has reimplemented :meth:`DataFrame.stack`. To use the new implementation, pass the argument ``future_stack=True``. This will become the only option in pandas 3.0. + +The previous implementation had two main behavioral downsides. + + 1. The previous implementation would unnecessarily introduce NA values into the result. The user could have NA values automatically removed by passing ``dropna=True`` (the default), but doing this could also remove NA values from the result that existed in the input. See the examples below. + 2. The previous implementation with ``sort=True`` (the default) would sometimes sort part of the resulting index, and sometimes not. If the input's columns are *not* a :class:`MultiIndex`, then the resulting index would never be sorted. If the columns are a :class:`MultiIndex`, then in most cases the level(s) in the resulting index that come from stacking the column level(s) would be sorted. In rare cases such level(s) would be sorted in a non-standard order, depending on how the columns were created. + +The new implementation (``future_stack=True``) will no longer unnecessarily introduce NA values when stacking multiple levels and will never sort. As such, the arguments ``dropna`` and ``sort`` are not utilized and must remain unspecified when using ``future_stack=True``. These arguments will be removed in the next major release. + +.. ipython:: python + + columns = pd.MultiIndex.from_tuples([("B", "d"), ("A", "c")]) + df = pd.DataFrame([[0, 2], [1, 3]], index=["z", "y"], columns=columns) + df + +In the previous version (``future_stack=False``), the default of ``dropna=True`` would remove unnecessarily introduced NA values but still coerce the dtype to ``float64`` in the process. In the new version, no NAs are introduced and so there is no coercion of the dtype. + +.. 
ipython:: python + :okwarning: + + df.stack([0, 1], future_stack=False, dropna=True) + df.stack([0, 1], future_stack=True) + +If the input contains NA values, the previous version would drop those as well with ``dropna=True`` or introduce new NA values with ``dropna=False``. The new version persists all values from the input. + +.. ipython:: python + :okwarning: + + df = pd.DataFrame([[0, 2], [np.nan, np.nan]], columns=columns) + df + df.stack([0, 1], future_stack=False, dropna=True) + df.stack([0, 1], future_stack=False, dropna=False) + df.stack([0, 1], future_stack=True) + .. _whatsnew_210.enhancements.other: Other enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 331b06b42e7dc..3b2fe1699e996 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9166,7 +9166,13 @@ def pivot_table( sort=sort, ) - def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): + def stack( + self, + level: IndexLabel = -1, + dropna: bool | lib.NoDefault = lib.no_default, + sort: bool | lib.NoDefault = lib.no_default, + future_stack: bool = False, + ): """ Stack the prescribed level(s) from columns to index. @@ -9194,6 +9200,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): section. sort : bool, default True Whether to sort the levels of the resulting MultiIndex. + future_stack : bool, default False + Whether to use the new implementation that will replace the current + implementation in pandas 3.0. When True, dropna and sort have no impact + on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release + notes <whatsnew_210.enhancements.new_stack>` for more details. 
Returns ------- @@ -9233,7 +9244,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): weight height cat 0 1 dog 2 3 - >>> df_single_level_cols.stack() + >>> df_single_level_cols.stack(future_stack=True) cat weight 0 height 1 dog weight 2 @@ -9255,7 +9266,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): kg pounds cat 1 2 dog 2 4 - >>> df_multi_level_cols1.stack() + >>> df_multi_level_cols1.stack(future_stack=True) weight cat kg 1 pounds 2 @@ -9280,7 +9291,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): kg m cat 1.0 2.0 dog 3.0 4.0 - >>> df_multi_level_cols2.stack() + >>> df_multi_level_cols2.stack(future_stack=True) weight height cat kg 1.0 NaN m NaN 2.0 @@ -9291,17 +9302,17 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): The first parameter controls which level or levels are stacked: - >>> df_multi_level_cols2.stack(0) + >>> df_multi_level_cols2.stack(0, future_stack=True) kg m - cat height NaN 2.0 - weight 1.0 NaN - dog height NaN 4.0 - weight 3.0 NaN - >>> df_multi_level_cols2.stack([0, 1]) - cat height m 2.0 - weight kg 1.0 - dog height m 4.0 - weight kg 3.0 + cat weight 1.0 NaN + height NaN 2.0 + dog weight 3.0 NaN + height NaN 4.0 + >>> df_multi_level_cols2.stack([0, 1], future_stack=True) + cat weight kg 1.0 + height m 2.0 + dog weight kg 3.0 + height m 4.0 dtype: float64 **Dropping missing values** @@ -9331,15 +9342,52 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg 2.0 NaN m NaN 3.0 """ - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) + if not future_stack: + from pandas.core.reshape.reshape import ( + stack, + stack_multiple, + ) + + if dropna is lib.no_default: + dropna = True + if sort is lib.no_default: + sort = True - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) + if isinstance(level, 
(tuple, list)): + result = stack_multiple(self, level, dropna=dropna, sort=sort) + else: + result = stack(self, level, dropna=dropna, sort=sort) else: - result = stack(self, level, dropna=dropna, sort=sort) + from pandas.core.reshape.reshape import stack_v3 + + if dropna is not lib.no_default: + raise ValueError( + "dropna must be unspecified with future_stack=True as the new " + "implementation does not introduce rows of NA values. This " + "argument will be removed in a future version of pandas." + ) + + if sort is not lib.no_default: + raise ValueError( + "Cannot specify sort with future_stack=True, this argument will be " + "removed in a future version of pandas. Sort the result using " + ".sort_index instead." + ) + + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." 
+ ) + + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v3(self, level) return result.__finalize__(self, method="stack") diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2ffdaa934e838..5c678adfe4970 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -416,7 +416,7 @@ def _wrap_applied_output( res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(dropna=self.observed) + res_ser = res_df.stack(future_stack=True) res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1961bd83d2fed..33eb411374e67 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2440,6 +2440,10 @@ def reorder_levels(self, order) -> MultiIndex: names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] + result = self._reorder_ilevels(order) + return result + + def _reorder_ilevels(self, order) -> MultiIndex: if len(order) != self.nlevels: raise AssertionError( f"Length of order must be same as number of levels ({self.nlevels}), " diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 53d587cdde182..9b8d1c870091d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1497,7 +1497,7 @@ def size(self): # If the result is a non-empty DataFrame we stack to get a Series # GH 46826 if isinstance(result, ABCDataFrame) and not result.empty: - result = result.stack() + result = result.stack(future_stack=True) if not len(self.ax): from pandas import Series diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 5c2e94735ddc5..71e3ea5b2588e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -418,7 +418,7 @@ def _all_key(key): if 
len(cols) > 0: row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) - row_margin = row_margin.stack() + row_margin = row_margin.stack(future_stack=True) # slight hack new_order = [len(cols)] + list(range(len(cols))) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6845530c5fa2a..fc8d827cd31bb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,14 +28,19 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.algorithms import unique +from pandas.core.algorithms import ( + factorize, + unique, +) from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, + RangeIndex, ) +from pandas.core.reshape.concat import concat from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -498,7 +503,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(dropna=False, sort=sort) + return obj.T.stack(future_stack=True) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose @@ -581,7 +586,7 @@ def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): stacked : Series or DataFrame """ - def factorize(index): + def stack_factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = factorize_from_iterable(index) @@ -600,7 +605,7 @@ def factorize(index): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] - clev, clab = factorize(frame.columns) + clev, clab = stack_factorize(frame.columns) new_levels.append(clev) 
new_codes.append(np.tile(clab, N).ravel()) @@ -610,7 +615,7 @@ def factorize(index): levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: - levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) + levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, @@ -875,3 +880,110 @@ def _reorder_for_extension_array_stack( # c0r1, c1r1, c2r1, ...] idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() return arr.take(idx) + + +def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: # New stack implementation (used for future_stack=True); `level` holds column level numbers + if frame.columns.nunique() != len(frame.columns): + raise ValueError("Columns with duplicate values are not supported in stack") + + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] + ) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. 
level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if not isinstance(idx, tuple): + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data.name = 0 + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + + result: Series | DataFrame + if len(buf) > 0 and not frame.empty: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns, dtype=frame._values.dtype) + ratio = 0 + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. 
+ index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + index_levels = [frame.index.unique()] + codes = factorize(frame.index)[0] + index_codes = list(np.tile(codes, (1, ratio))) + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # Reorder rows from the concatenated per-stacked-value blocks so that all rows for each original frame row are adjacent; faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed: when every column level was stacked the result becomes an unnamed Series (no NA values are dropped here) + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 9b150cf5054ee..3f89ef5395006 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -253,11 +253,12 @@ def test_merge_on_extension_array_duplicates(self, data): ), ], ) - def test_stack(self, data, columns): + 
@pytest.mark.parametrize("future_stack", [True, False]) + def test_stack(self, data, columns, future_stack): df = pd.DataFrame({"A": data[:5], "B": data[:5]}) df.columns = columns - result = df.stack() - 
expected = df.astype(object).stack() + result = df.stack(future_stack=future_stack) + expected = df.astype(object).stack(future_stack=future_stack) # we need a second astype(object), in case the constructor inferred # object -> specialized, as is done for period. expected = expected.astype(object) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a2ec1f2e6ce1..8a571d9295e1f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -139,7 +139,7 @@ class TestReshaping(BaseJSON, base.BaseReshapingTests): @pytest.mark.xfail(reason="Different definitions of NA") def test_stack(self): """ - The test does .astype(object).stack(). If we happen to have + The test does .astype(object).stack(future_stack=True). If we happen to have any missing values in `data`, then we'll end up with different rows since we consider `{}` NA, but `.astype(object)` doesn't. """ diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 748d5cc65de1a..851a630dbc1f2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -157,8 +157,9 @@ def test_concat_mixed_dtypes(self, data): ), ], ) - def test_stack(self, data, columns): - super().test_stack(data, columns) + @pytest.mark.parametrize("future_stack", [True, False]) + def test_stack(self, data, columns, future_stack): + super().test_stack(data, columns, future_stack) def test_concat_columns(self, data, na_value): self._check_unsupported(data) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index fa28ebc16e942..d99dd36f3a2e3 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -112,7 +112,7 @@ def test_reset_index_with_intervals(self): tm.assert_frame_equal(result2, original) def test_reset_index(self, float_frame): - stacked = 
float_frame.stack()[::2] + stacked = float_frame.stack(future_stack=True)[::2] stacked = DataFrame({"foo": stacked, "bar": stacked}) names = ["first", "second"] @@ -761,7 +761,7 @@ def test_reset_index_rename(float_frame): def test_reset_index_rename_multiindex(float_frame): # GH 6878 - stacked_df = float_frame.stack()[::2] + stacked_df = float_frame.stack(future_stack=True)[::2] stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df}) names = ["first", "second"] @@ -775,7 +775,7 @@ def test_reset_index_rename_multiindex(float_frame): def test_errorreset_index_rename(float_frame): # GH 6878 - stacked_df = float_frame.stack()[::2] + stacked_df = float_frame.stack(future_stack=True)[::2] stacked_df = DataFrame({"first": stacked_df, "second": stacked_df}) with pytest.raises( diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 3d11802694aef..cb8e8c5025e3b 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.errors import PerformanceWarning import pandas as pd @@ -22,12 +23,17 @@ from pandas.core.reshape import reshape as reshape_lib +@pytest.fixture(params=[True, False]) +def future_stack(request): + return request.param + + class TestDataFrameReshape: - def test_stack_unstack(self, float_frame): + def test_stack_unstack(self, float_frame, future_stack): df = float_frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) - stacked = df.stack() + stacked = df.stack(future_stack=future_stack) stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() @@ -41,26 +47,26 @@ def test_stack_unstack(self, float_frame): tm.assert_frame_equal(unstacked_cols.T, df) tm.assert_frame_equal(unstacked_cols_df["bar"].T, df) - def test_stack_mixed_level(self): + def test_stack_mixed_level(self, future_stack): # GH 18310 levels = [range(3), [3, "a", "b"], 
[1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) - result = df.stack() + result = df.stack(future_stack=future_stack) expected = Series(1, index=MultiIndex.from_product(levels[:2])) tm.assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) - result = df.stack(1) + result = df.stack(1, future_stack=future_stack) expected = DataFrame( 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] ) tm.assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type - result = df[["a", "b"]].stack(1) + result = df[["a", "b"]].stack(1, future_stack=future_stack) expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) @@ -76,7 +82,7 @@ def test_unstack_not_consolidated(self, using_array_manager): expected = df.unstack() tm.assert_series_equal(res, expected) - def test_unstack_fill(self): + def test_unstack_fill(self, future_stack): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack @@ -109,7 +115,7 @@ def test_unstack_fill(self): result = Series([0, 0, 2], index=unstacked.index, name=key) tm.assert_series_equal(result, expected) - stacked = unstacked.stack(["x", "y"]) + stacked = unstacked.stack(["x", "y"], future_stack=future_stack) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) @@ -382,15 +388,23 @@ def unstack_and_compare(df, column_name): s = df1["A"] unstack_and_compare(s, "index") - def test_stack_ints(self): + def test_stack_ints(self, future_stack): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame( np.random.default_rng(2).standard_normal((30, 27)), columns=columns ) - tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) tm.assert_frame_equal( - df.stack(level=[-2, -1]), 
df.stack(level=1).stack(level=1) + df.stack(level=[1, 2], future_stack=future_stack), + df.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), + ) + tm.assert_frame_equal( + df.stack(level=[-2, -1], future_stack=future_stack), + df.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), ) df_named = df.copy() @@ -398,10 +412,13 @@ def test_stack_ints(self): assert return_value is None tm.assert_frame_equal( - df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) + df_named.stack(level=[1, 2], future_stack=future_stack), + df_named.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), ) - def test_stack_mixed_levels(self): + def test_stack_mixed_levels(self, future_stack): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), @@ -415,8 +432,12 @@ def test_stack_mixed_levels(self): np.random.default_rng(2).standard_normal((4, 4)), columns=columns ) - animal_hair_stacked = df.stack(level=["animal", "hair_length"]) - exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + animal_hair_stacked = df.stack( + level=["animal", "hair_length"], future_stack=future_stack + ) + exp_hair_stacked = df.stack( + level=["exp", "hair_length"], future_stack=future_stack + ) # GH #8584: Need to check that stacking works when a number # is passed that is both a level name and in the range of @@ -424,10 +445,14 @@ def test_stack_mixed_levels(self): df2 = df.copy() df2.columns.names = ["exp", "animal", 1] tm.assert_frame_equal( - df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False + df2.stack(level=["animal", 1], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False + df2.stack(level=["exp", 1], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) # When mixed types are passed and the ints are not level @@ 
-437,17 +462,19 @@ def test_stack_mixed_levels(self): "a mixture of the two" ) with pytest.raises(ValueError, match=msg): - df2.stack(level=["animal", 0]) + df2.stack(level=["animal", 0], future_stack=future_stack) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth df3 = df.copy() df3.columns.names = ["exp", "animal", 0] tm.assert_frame_equal( - df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False + df3.stack(level=["animal", 0], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) - def test_stack_int_level_names(self): + def test_stack_int_level_names(self, future_stack): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), @@ -461,33 +488,51 @@ def test_stack_int_level_names(self): np.random.default_rng(2).standard_normal((4, 4)), columns=columns ) - exp_animal_stacked = df.stack(level=["exp", "animal"]) - animal_hair_stacked = df.stack(level=["animal", "hair_length"]) - exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + exp_animal_stacked = df.stack( + level=["exp", "animal"], future_stack=future_stack + ) + animal_hair_stacked = df.stack( + level=["animal", "hair_length"], future_stack=future_stack + ) + exp_hair_stacked = df.stack( + level=["exp", "hair_length"], future_stack=future_stack + ) df2 = df.copy() df2.columns.names = [0, 1, 2] tm.assert_frame_equal( - df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False + df2.stack(level=[1, 2], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False + df2.stack(level=[0, 1], future_stack=future_stack), + exp_animal_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False + df2.stack(level=[0, 2], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] 
tm.assert_frame_equal( - df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False + df3.stack(level=[0, 1], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False + df3.stack(level=[2, 0], future_stack=future_stack), + exp_animal_stacked, + check_names=False, ) tm.assert_frame_equal( - df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False + df3.stack(level=[2, 1], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) def test_unstack_bool(self): @@ -504,7 +549,7 @@ def test_unstack_bool(self): ) tm.assert_frame_equal(rs, xp) - def test_unstack_level_binding(self): + def test_unstack_level_binding(self, future_stack): # GH9856 mi = MultiIndex( levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], @@ -512,7 +557,7 @@ def test_unstack_level_binding(self): names=["first", "second", "third"], ) s = Series(0, index=mi) - result = s.unstack([1, 2]).stack(0) + result = s.unstack([1, 2]).stack(0, future_stack=future_stack) expected_mi = MultiIndex( levels=[["foo", "bar"], ["one", "two"]], @@ -631,7 +676,7 @@ def test_unstack_dtypes_mixed_date(self, c, d): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) - def test_unstack_non_unique_index_names(self): + def test_unstack_non_unique_index_names(self, future_stack): idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) msg = "The name c1 occurs multiple times, use a level number" @@ -639,7 +684,7 @@ def test_unstack_non_unique_index_names(self): df.unstack("c1") with pytest.raises(ValueError, match=msg): - df.T.stack("c1") + df.T.stack("c1", future_stack=future_stack) def test_unstack_unused_levels(self): # GH 17845: unused codes in index make unstack() cast int to float @@ -995,11 +1040,11 @@ def test_unstack_nan_index5(self): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key] - def 
test_stack_datetime_column_multiIndex(self): + def test_stack_datetime_column_multiIndex(self, future_stack): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) - result = df.stack() + result = df.stack(future_stack=future_stack) eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) @@ -1033,8 +1078,9 @@ def test_stack_datetime_column_multiIndex(self): ], ) @pytest.mark.parametrize("level", (-1, 0, 1, [0, 1], [1, 0])) - def test_stack_partial_multiIndex(self, multiindex_columns, level): + def test_stack_partial_multiIndex(self, multiindex_columns, level, future_stack): # GH 8844 + dropna = False if not future_stack else lib.no_default full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], @@ -1044,13 +1090,13 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) - result = df.stack(level=level, dropna=False) + result = df.stack(level=level, dropna=dropna, future_stack=future_stack) - if isinstance(level, int): + if isinstance(level, int) and not future_stack: # Stacking a single level should not make any all-NaN rows, # so df.stack(level=level, dropna=False) should be the same # as df.stack(level=level, dropna=True). 
- expected = df.stack(level=level, dropna=True) + expected = df.stack(level=level, dropna=True, future_stack=future_stack) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: @@ -1059,20 +1105,21 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): df.columns = MultiIndex.from_tuples( df.columns.to_numpy(), names=df.columns.names ) - expected = df.stack(level=level, dropna=False) + expected = df.stack(level=level, dropna=dropna, future_stack=future_stack) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) - def test_stack_full_multiIndex(self): + def test_stack_full_multiIndex(self, future_stack): # GH 8844 full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], ) df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) - result = df.stack(dropna=False) + dropna = False if not future_stack else lib.no_default + result = df.stack(dropna=dropna, future_stack=future_stack) expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( @@ -1086,12 +1133,11 @@ def test_stack_full_multiIndex(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) - def test_stack_preserve_categorical_dtype(self, ordered, labels): + def test_stack_preserve_categorical_dtype(self, ordered, future_stack): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) - result = df.stack() + result = df.stack(future_stack=future_stack) # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
@@ -1108,24 +1154,30 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): (list("zyx"), [14, 15, 12, 13, 10, 11]), ], ) - def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): + def test_stack_multi_preserve_categorical_dtype( + self, ordered, labels, data, future_stack + ): # GH-36991 cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered) cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered) midx = MultiIndex.from_product([cidx, cidx2]) df = DataFrame([sorted(data)], columns=midx) - result = df.stack([0, 1]) + result = df.stack([0, 1], future_stack=future_stack) - s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) - expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) + labels = labels if future_stack else sorted(labels) + s_cidx = pd.CategoricalIndex(labels, ordered=ordered) + expected_data = sorted(data) if future_stack else data + expected = Series( + expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2]) + ) tm.assert_series_equal(result, expected) - def test_stack_preserve_categorical_dtype_values(self): + def test_stack_preserve_categorical_dtype_values(self, future_stack): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) - result = df.stack() + result = df.stack(future_stack=future_stack) index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) expected = Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index @@ -1140,10 +1192,10 @@ def test_stack_preserve_categorical_dtype_values(self): ([0, 1, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])), ], ) - def test_stack_multi_columns_non_unique_index(self, index, columns): + def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): # GH-28301 df = DataFrame(index=index, columns=columns).fillna(1) - stacked = df.stack() + stacked = df.stack(future_stack=future_stack) new_index = 
MultiIndex.from_tuples(stacked.index.to_numpy()) expected = DataFrame( stacked.to_numpy(), index=new_index, columns=stacked.columns @@ -1161,7 +1213,7 @@ def test_stack_multi_columns_non_unique_index(self, index, columns): ], ) def test_stack_multi_columns_mixed_extension_types( - self, vals1, vals2, dtype1, dtype2, expected_dtype + self, vals1, vals2, dtype1, dtype2, expected_dtype, future_stack ): # GH45740 df = DataFrame( @@ -1170,8 +1222,10 @@ def test_stack_multi_columns_mixed_extension_types( ("A", 2): Series(vals2, dtype=dtype2), } ) - result = df.stack() - expected = df.astype(object).stack().astype(expected_dtype) + result = df.stack(future_stack=future_stack) + expected = ( + df.astype(object).stack(future_stack=future_stack).astype(expected_dtype) + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("level", [0, 1]) @@ -1301,11 +1355,11 @@ def test_unstack_timezone_aware_values(): tm.assert_frame_equal(result, expected) -def test_stack_timezone_aware_values(): +def test_stack_timezone_aware_values(future_stack): # GH 19420 ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York") df = DataFrame({"A": ts}, index=["a", "b", "c"]) - result = df.stack() + result = df.stack(future_stack=future_stack) expected = Series( ts, index=MultiIndex(levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]), @@ -1313,24 +1367,38 @@ def test_stack_timezone_aware_values(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) -def test_stack_empty_frame(dropna): +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_frame(dropna, future_stack): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) - result = DataFrame(dtype=np.float64).stack(dropna=dropna) - tm.assert_series_equal(result, expected) + if future_stack and dropna is not 
lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack) + else: + result = DataFrame(dtype=np.float64).stack( + dropna=dropna, future_stack=future_stack + ) + tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) -def test_stack_unstack_empty_frame(dropna, fill_value): +def test_stack_unstack_empty_frame(dropna, fill_value, future_stack): # GH 36113 - result = ( - DataFrame(dtype=np.int64).stack(dropna=dropna).unstack(fill_value=fill_value) - ) - expected = DataFrame(dtype=np.int64) - tm.assert_frame_equal(result, expected) + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack( + dropna=dropna, future_stack=future_stack + ).unstack(fill_value=fill_value) + else: + result = ( + DataFrame(dtype=np.int64) + .stack(dropna=dropna, future_stack=future_stack) + .unstack(fill_value=fill_value) + ) + expected = DataFrame(dtype=np.int64) + tm.assert_frame_equal(result, expected) def test_unstack_single_index_series(): @@ -1371,11 +1439,11 @@ def test_unstacking_multi_index_df(): tm.assert_frame_equal(result, expected) -def test_stack_positional_level_duplicate_column_names(): +def test_stack_positional_level_duplicate_column_names(future_stack): # https://github.com/pandas-dev/pandas/issues/36353 columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"]) df = DataFrame([[1, 1, 1, 1]], columns=columns) - result = df.stack(0) + result = df.stack(0, future_stack=future_stack) new_columns = Index(["y", "z"], name="a") new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) @@ -1406,7 +1474,7 @@ def test_unstack_non_slice_like_blocks(using_array_manager): tm.assert_frame_equal(res, 
expected) -def test_stack_sort_false(): +def test_stack_sort_false(future_stack): # GH 15105 data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]] df = DataFrame( @@ -1415,11 +1483,23 @@ def test_stack_sort_false(): levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] ), ) - result = df.stack(level=0, sort=False) - expected = DataFrame( - {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, - index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), - ) + kwargs = {} if future_stack else {"sort": False} + result = df.stack(level=0, future_stack=future_stack, **kwargs) + if future_stack: + expected = DataFrame( + { + "x": [1.0, 3.0, 2.0, 4.0, 3.0, np.nan], + "y": [2.0, 4.0, 3.0, 5.0, 4.0, np.nan], + }, + index=MultiIndex.from_arrays( + [[0, 0, 1, 1, 2, 2], ["B", "A", "B", "A", "B", "A"]] + ), + ) + else: + expected = DataFrame( + {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, + index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), + ) tm.assert_frame_equal(result, expected) # Codes sorted in this call @@ -1427,15 +1507,17 @@ def test_stack_sort_false(): data, columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]), ) - result = df.stack(level=0, sort=False) + kwargs = {} if future_stack else {"sort": False} + result = df.stack(level=0, future_stack=future_stack, **kwargs) tm.assert_frame_equal(result, expected) -def test_stack_sort_false_multi_level(): +def test_stack_sort_false_multi_level(future_stack): # GH 15105 idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx) - result = df.stack([0, 1], sort=False) + kwargs = {} if future_stack else {"sort": False} + result = df.stack([0, 1], future_stack=future_stack, **kwargs) expected_index = MultiIndex.from_tuples( [ ("cat", "weight", "kg"), @@ -1516,75 +1598,85 @@ def 
test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - def test_stack(self, multiindex_year_month_day_dataframe_random_data): + def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_stack): ymd = multiindex_year_month_day_dataframe_random_data # regular roundtrip unstacked = ymd.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked, ymd) unlexsorted = ymd.sort_index(level=2) unstacked = unlexsorted.unstack(2) - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) - restacked = unstacked.stack().swaplevel(1, 2) + restacked = unstacked.stack(future_stack=future_stack).swaplevel(1, 2) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) - restacked = unstacked.stack(0).swaplevel(1, 2) + restacked = unstacked.stack(0, future_stack=future_stack).swaplevel(1, 2) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) # columns unsorted unstacked = ymd.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = 
restacked.dropna(how="all") tm.assert_frame_equal(restacked, ymd) # more than 2 levels in the columns unstacked = ymd.unstack(1).unstack(1) - result = unstacked.stack(1) + result = unstacked.stack(1, future_stack=future_stack) expected = ymd.unstack() tm.assert_frame_equal(result, expected) - result = unstacked.stack(2) + result = unstacked.stack(2, future_stack=future_stack) expected = ymd.unstack(1) tm.assert_frame_equal(result, expected) - result = unstacked.stack(0) - expected = ymd.stack().unstack(1).unstack(1) + result = unstacked.stack(0, future_stack=future_stack) + expected = ymd.stack(future_stack=future_stack).unstack(1).unstack(1) tm.assert_frame_equal(result, expected) # not all levels present in each echelon unstacked = ymd.unstack(2).loc[:, ::3] - stacked = unstacked.stack().stack() - ymd_stacked = ymd.stack() + stacked = unstacked.stack(future_stack=future_stack).stack( + future_stack=future_stack + ) + ymd_stacked = ymd.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + stacked = stacked.dropna(how="all") + ymd_stacked = ymd_stacked.dropna(how="all") tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) # stack with negative number - result = ymd.unstack(0).stack(-2) - expected = ymd.unstack(0).stack(0) + result = ymd.unstack(0).stack(-2, future_stack=future_stack) + expected = ymd.unstack(0).stack(0, future_stack=future_stack) tm.assert_equal(result, expected) @pytest.mark.parametrize( "idx, columns, exp_idx", [ - [ - list("abab"), - ["1st", "2nd", "3rd"], - MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.tile(np.arange(3), 4), - ], - ), - ], [ list("abab"), ["1st", "2nd", "1st"], @@ -1607,21 +1699,26 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): ], ], ) - def test_stack_duplicate_index(self, idx, columns, exp_idx): + def test_stack_duplicate_index(self, idx, columns, exp_idx, 
future_stack): # GH10417 df = DataFrame( np.arange(12).reshape(4, 3), index=idx, columns=columns, ) - result = df.stack() - expected = Series(np.arange(12), index=exp_idx) - tm.assert_series_equal(result, expected) - assert result.index.is_unique is False - li, ri = result.index, expected.index - tm.assert_index_equal(li, ri) + if future_stack: + msg = "Columns with duplicate values are not supported in stack" + with pytest.raises(ValueError, match=msg): + df.stack(future_stack=future_stack) + else: + result = df.stack(future_stack=future_stack) + expected = Series(np.arange(12), index=exp_idx) + tm.assert_series_equal(result, expected) + assert result.index.is_unique is False + li, ri = result.index, expected.index + tm.assert_index_equal(li, ri) - def test_unstack_odd_failure(self): + def test_unstack_odd_failure(self, future_stack): data = """day,time,smoker,sum,len Fri,Dinner,No,8.25,3. Fri,Dinner,Yes,27.03,9 @@ -1640,23 +1737,26 @@ def test_unstack_odd_failure(self): # it works, #2100 result = df.unstack(2) - recons = result.stack() + recons = result.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + recons = recons.dropna(how="all") tm.assert_frame_equal(recons, df) - def test_stack_mixed_dtype(self, multiindex_dataframe_random_data): + def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data df = frame.T df["foo", "four"] = "foo" df = df.sort_index(level=1, axis=1) - stacked = df.stack() - result = df["foo"].stack().sort_index() + stacked = df.stack(future_stack=future_stack) + result = df["foo"].stack(future_stack=future_stack).sort_index() tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None assert stacked["bar"].dtype == np.float_ - def test_unstack_bug(self): + def test_unstack_bug(self, future_stack): df = DataFrame( { "state": ["naive", "naive", "naive", "active", "active", "active"], 
@@ -1670,22 +1770,24 @@ def test_unstack_bug(self): result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) - def test_stack_unstack_preserve_names(self, multiindex_dataframe_random_data): + def test_stack_unstack_preserve_names( + self, multiindex_dataframe_random_data, future_stack + ): frame = multiindex_dataframe_random_data unstacked = frame.unstack() assert unstacked.index.name == "first" assert unstacked.columns.names == ["exp", "second"] - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) assert restacked.index.names == frame.index.names @pytest.mark.parametrize("method", ["stack", "unstack"]) def test_stack_unstack_wrong_level_name( - self, method, multiindex_dataframe_random_data + self, method, multiindex_dataframe_random_data, future_stack ): # GH 18303 - wrong level name should raise frame = multiindex_dataframe_random_data @@ -1693,14 +1795,15 @@ def test_stack_unstack_wrong_level_name( # A DataFrame with flat axes: df = frame.loc["foo"] + kwargs = {"future_stack": future_stack} if method == "stack" else {} with pytest.raises(KeyError, match="does not match index name"): - getattr(df, method)("mistake") + getattr(df, method)("mistake", **kwargs) if method == "unstack": # Same on a Series: s = df.iloc[:, 0] with pytest.raises(KeyError, match="does not match index name"): - getattr(s, method)("mistake") + getattr(s, method)("mistake", **kwargs) def test_unstack_level_name(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -1709,20 +1812,20 @@ def test_unstack_level_name(self, multiindex_dataframe_random_data): expected = frame.unstack(level=1) tm.assert_frame_equal(result, expected) - def test_stack_level_name(self, multiindex_dataframe_random_data): + def 
test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data unstacked = frame.unstack("second") - result = unstacked.stack("exp") - expected = frame.unstack().stack(0) + result = unstacked.stack("exp", future_stack=future_stack) + expected = frame.unstack().stack(0, future_stack=future_stack) tm.assert_frame_equal(result, expected) - result = frame.stack("exp") - expected = frame.stack() + result = frame.stack("exp", future_stack=future_stack) + expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) def test_stack_unstack_multiple( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): ymd = multiindex_year_month_day_dataframe_random_data @@ -1736,7 +1839,10 @@ def test_stack_unstack_multiple( s_unstacked = s.unstack(["year", "month"]) tm.assert_frame_equal(s_unstacked, expected["A"]) - restacked = unstacked.stack(["year", "month"]) + restacked = unstacked.stack(["year", "month"], future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sort_index(level=0) @@ -1753,7 +1859,7 @@ def test_stack_unstack_multiple( tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): ymd = multiindex_year_month_day_dataframe_random_data @@ -1761,10 +1867,10 @@ def test_stack_names_and_numbers( # Can't use mixture of names and numbers to stack with pytest.raises(ValueError, match="level should contain"): - unstacked.stack([0, "month"]) + unstacked.stack([0, "month"], future_stack=future_stack) def test_stack_multiple_out_of_bounds( - self, 
multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): # nlevels == 3 ymd = multiindex_year_month_day_dataframe_random_data @@ -1772,9 +1878,9 @@ def test_stack_multiple_out_of_bounds( unstacked = ymd.unstack(["year", "month"]) with pytest.raises(IndexError, match="Too many levels"): - unstacked.stack([2, 3]) + unstacked.stack([2, 3], future_stack=future_stack) with pytest.raises(IndexError, match="not a valid level number"): - unstacked.stack([-4, -3]) + unstacked.stack([-4, -3], future_stack=future_stack) def test_unstack_period_series(self): # GH4342 @@ -1892,7 +1998,7 @@ def test_unstack_period_frame(self): tm.assert_frame_equal(result3, expected) - def test_stack_multiple_bug(self): + def test_stack_multiple_bug(self, future_stack): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -1907,23 +2013,33 @@ def test_stack_multiple_bug(self): with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) - rs = down.stack("ID") - xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") + rs = down.stack("ID", future_stack=future_stack) + xp = ( + unst.loc[:, ["VAR1"]] + .resample("W-THU") + .mean() + .stack("ID", future_stack=future_stack) + ) xp.columns.name = "Params" tm.assert_frame_equal(rs, xp) - def test_stack_dropna(self): + def test_stack_dropna(self, future_stack): # GH#3997 df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) df = df.set_index(["A", "B"]) - stacked = df.unstack().stack(dropna=False) + dropna = False if not future_stack else lib.no_default + stacked = df.unstack().stack(dropna=dropna, future_stack=future_stack) assert len(stacked) > len(stacked.dropna()) - stacked = df.unstack().stack(dropna=True) - tm.assert_frame_equal(stacked, stacked.dropna()) + if future_stack: + with pytest.raises(ValueError, match="dropna must 
be unspecified"): + df.unstack().stack(dropna=True, future_stack=future_stack) + else: + stacked = df.unstack().stack(dropna=True, future_stack=future_stack) + tm.assert_frame_equal(stacked, stacked.dropna()) - def test_unstack_multiple_hierarchical(self): + def test_unstack_multiple_hierarchical(self, future_stack): df = DataFrame( index=[ [0, 0, 0, 0, 1, 1, 1, 1], @@ -1960,7 +2076,7 @@ def test_unstack_sparse_keyspace(self): # it works! is sufficient idf.unstack("E") - def test_unstack_unobserved_keys(self): + def test_unstack_unobserved_keys(self, future_stack): # related to GH#2278 refactoring levels = [[0, 1], [0, 1, 2, 3]] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] @@ -1972,7 +2088,7 @@ def test_unstack_unobserved_keys(self): result = df.unstack() assert len(result.columns) == 4 - recons = result.stack() + recons = result.stack(future_stack=future_stack) tm.assert_frame_equal(recons, df) @pytest.mark.slow @@ -2006,12 +2122,15 @@ def __init__(self, *args, **kwargs) -> None: ) @pytest.mark.parametrize("stack_lev", range(2)) @pytest.mark.parametrize("sort", [True, False]) - def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): + def test_stack_order_with_unsorted_levels( + self, levels, stack_lev, sort, future_stack + ): # GH#16323 # deep check for 1-row case columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) - df_stacked = df.stack(stack_lev, sort=sort) + kwargs = {} if future_stack else {"sort": sort} + df_stacked = df.stack(stack_lev, future_stack=future_stack, **kwargs) for row in df.index: for col in df.columns: expected = df.loc[row, col] @@ -2020,7 +2139,7 @@ def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): result = df_stacked.loc[result_row, result_col] assert result == expected - def test_stack_order_with_unsorted_levels_multi_row(self): + def test_stack_order_with_unsorted_levels_multi_row(self, future_stack): # GH#16323 # check multi-row case 
@@ -2032,18 +2151,20 @@ def test_stack_order_with_unsorted_levels_multi_row(self): columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1) ) assert all( - df.loc[row, col] == df.stack(0).loc[(row, col[0]), col[1]] + df.loc[row, col] + == df.stack(0, future_stack=future_stack).loc[(row, col[0]), col[1]] for row in df.index for col in df.columns ) - def test_stack_order_with_unsorted_levels_multi_row_2(self): + def test_stack_order_with_unsorted_levels_multi_row_2(self, future_stack): # GH#53636 levels = ((0, 1), (1, 0)) stack_lev = 1 columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3]) - result = df.stack(stack_lev, sort=True) + kwargs = {} if future_stack else {"sort": True} + result = df.stack(stack_lev, future_stack=future_stack, **kwargs) expected_index = MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]], @@ -2057,7 +2178,7 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self): ) tm.assert_frame_equal(result, expected) - def test_stack_unstack_unordered_multiindex(self): + def test_stack_unstack_unordered_multiindex(self, future_stack): # GH# 18265 values = np.arange(5) data = np.vstack( @@ -2072,7 +2193,9 @@ def test_stack_unstack_unordered_multiindex(self): multi_level_df = pd.concat(second_level_dict, axis=1) multi_level_df.columns.names = ["second", "first"] df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1) - result = df.stack(["first", "second"]).unstack(["first", "second"]) + result = df.stack(["first", "second"], future_stack=future_stack).unstack( + ["first", "second"] + ) expected = DataFrame( [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], index=[0, 1, 2, 3, 4], @@ -2095,7 +2218,7 @@ def test_unstack_preserve_types( assert unstacked["E", 1].dtype == np.object_ assert unstacked["F", 1].dtype == np.float64 - def test_unstack_group_index_overflow(self): 
+ def test_unstack_group_index_overflow(self, future_stack): codes = np.tile(np.arange(500), 2) level = np.arange(500) @@ -2109,7 +2232,7 @@ def test_unstack_group_index_overflow(self): assert result.shape == (500, 2) # test roundtrip - stacked = result.stack() + stacked = result.stack(future_stack=future_stack) tm.assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning @@ -2188,7 +2311,7 @@ def test_unstack_with_level_has_nan(self): tm.assert_index_equal(result, expected) - def test_stack_nan_in_multiindex_columns(self): + def test_stack_nan_in_multiindex_columns(self, future_stack): # GH#39481 df = DataFrame( np.zeros([1, 5]), @@ -2202,15 +2325,21 @@ def test_stack_nan_in_multiindex_columns(self): ], ), ) - result = df.stack(2) + result = df.stack(2, future_stack=future_stack) + if future_stack: + index = MultiIndex(levels=[[0], [0.0, 1.0]], codes=[[0, 0, 0], [-1, 0, 1]]) + columns = MultiIndex(levels=[[0], [2, 3]], codes=[[0, 0, 0], [-1, 0, 1]]) + else: + index = Index([(0, None), (0, 0), (0, 1)]) + columns = Index([(0, None), (0, 2), (0, 3)]) expected = DataFrame( [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], - index=Index([(0, None), (0, 0), (0, 1)]), - columns=Index([(0, None), (0, 2), (0, 3)]), + index=index, + columns=columns, ) tm.assert_frame_equal(result, expected) - def test_multi_level_stack_categorical(self): + def test_multi_level_stack_categorical(self, future_stack): # GH 15239 midx = MultiIndex.from_arrays( [ @@ -2220,30 +2349,52 @@ def test_multi_level_stack_categorical(self): ] ) df = DataFrame(np.arange(8).reshape(2, 4), columns=midx) - result = df.stack([1, 2]) - expected = DataFrame( - [ - [0, np.nan], - [np.nan, 2], - [1, np.nan], - [np.nan, 3], - [4, np.nan], - [np.nan, 6], - [5, np.nan], - [np.nan, 7], - ], - columns=["A", "B"], - index=MultiIndex.from_arrays( + result = df.stack([1, 2], future_stack=future_stack) + if future_stack: + expected = DataFrame( [ - [0] * 4 + [1] * 4, - 
pd.Categorical(list("aabbaabb")), - pd.Categorical(list("cdcdcdcd")), - ] - ), - ) + [0, np.nan], + [1, np.nan], + [np.nan, 2], + [np.nan, 3], + [4, np.nan], + [5, np.nan], + [np.nan, 6], + [np.nan, 7], + ], + columns=["A", "B"], + index=MultiIndex.from_arrays( + [ + [0] * 4 + [1] * 4, + pd.Categorical(list("abababab")), + pd.Categorical(list("ccddccdd")), + ] + ), + ) + else: + expected = DataFrame( + [ + [0, np.nan], + [np.nan, 2], + [1, np.nan], + [np.nan, 3], + [4, np.nan], + [np.nan, 6], + [5, np.nan], + [np.nan, 7], + ], + columns=["A", "B"], + index=MultiIndex.from_arrays( + [ + [0] * 4 + [1] * 4, + pd.Categorical(list("aabbaabb")), + pd.Categorical(list("cdcdcdcd")), + ] + ), + ) tm.assert_frame_equal(result, expected) - def test_stack_nan_level(self): + def test_stack_nan_level(self, future_stack): # GH 9406 df_nan = DataFrame( np.arange(4).reshape(2, 2), @@ -2253,13 +2404,21 @@ def test_stack_nan_level(self): index=Index([0, 1], name="Num"), dtype=np.float64, ) - result = df_nan.stack() + result = df_nan.stack(future_stack=future_stack) + if future_stack: + index = MultiIndex( + levels=[[0, 1], [np.nan, "b"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["Num", "Lower"], + ) + else: + index = MultiIndex.from_tuples( + [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] + ) expected = DataFrame( [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], columns=Index(["A", "B"], name="Upper"), - index=MultiIndex.from_tuples( - [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] - ), + index=index, ) tm.assert_frame_equal(result, expected) @@ -2278,7 +2437,7 @@ def test_unstack_categorical_columns(self): expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)]) tm.assert_frame_equal(result, expected) - def test_stack_unsorted(self): + def test_stack_unsorted(self, future_stack): # GH 16925 PAE = ["ITA", "FRA"] VAR = ["A1", "A2"] @@ -2292,11 +2451,15 @@ def test_stack_unsorted(self): DF.columns = 
DF.columns.droplevel(0) DF.loc[:, ("A0", "NET")] = 9999 - result = DF.stack(["VAR", "TYP"]).sort_index() - expected = DF.sort_index(axis=1).stack(["VAR", "TYP"]).sort_index() + result = DF.stack(["VAR", "TYP"], future_stack=future_stack).sort_index() + expected = ( + DF.sort_index(axis=1) + .stack(["VAR", "TYP"], future_stack=future_stack) + .sort_index() + ) tm.assert_series_equal(result, expected) - def test_stack_nullable_dtype(self): + def test_stack_nullable_dtype(self, future_stack): # GH#43561 columns = MultiIndex.from_product( [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"] @@ -2306,14 +2469,18 @@ def test_stack_nullable_dtype(self): arr = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]]) df = DataFrame(arr, columns=columns, index=index, dtype=pd.Int64Dtype()) - result = df.stack("station") + result = df.stack("station", future_stack=future_stack) - expected = df.astype(np.int64).stack("station").astype(pd.Int64Dtype()) + expected = ( + df.astype(np.int64) + .stack("station", future_stack=future_stack) + .astype(pd.Int64Dtype()) + ) tm.assert_frame_equal(result, expected) # non-homogeneous case df[df.columns[0]] = df[df.columns[0]].astype(pd.Float64Dtype()) - result = df.stack("station") + result = df.stack("station", future_stack=future_stack) expected = DataFrame( { diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 9bc790cbed8e8..3ef012183ef26 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -216,7 +216,7 @@ def test_subclass_stack(self): columns=["X", "Y", "Z"], ) - res = df.stack() + res = df.stack(future_stack=True) exp = tm.SubclassedSeries( [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] ) @@ -253,10 +253,10 @@ def test_subclass_stack_multi(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack() + res = df.stack(future_stack=True) tm.assert_frame_equal(res, exp) - res = df.stack("yyy") 
+ res = df.stack("yyy", future_stack=True) tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -277,7 +277,7 @@ def test_subclass_stack_multi(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www") + res = df.stack("www", future_stack=True) tm.assert_frame_equal(res, exp) def test_subclass_stack_multi_mixed(self): @@ -315,10 +315,10 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack() + res = df.stack(future_stack=True) tm.assert_frame_equal(res, exp) - res = df.stack("yyy") + res = df.stack("yyy", future_stack=True) tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -339,7 +339,7 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www") + res = df.stack("www", future_stack=True) tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index f5fe4d7d9831a..f917f567e1ce3 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -68,7 +68,7 @@ def test_cythonized_aggers(op_name): expd = {} for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group["C"]) - exp = DataFrame(expd).T.stack(dropna=False) + exp = DataFrame(expd).T.stack(future_stack=True) exp.index.names = ["A", "B"] exp.name = "C" diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 11ce290896073..d0ae9eeed394f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -237,9 +237,13 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(0)), 
exp + ) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(1)), exp + ) def test_level_get_group(observed): @@ -673,9 +677,13 @@ def test_datetime(): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(0)), exp + ) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(1)), exp + ) def test_categorical_index(): @@ -713,8 +721,10 @@ def test_describe_categorical_columns(): df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() - tm.assert_index_equal(result.stack().columns, cats) - tm.assert_categorical_equal(result.stack().columns.values, cats.values) + tm.assert_index_equal(result.stack(future_stack=True).columns, cats) + tm.assert_categorical_equal( + result.stack(future_stack=True).columns.values, cats.values + ) def test_unstack_categorical(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ffedafa91ce50..78e9f6111a230 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1118,7 +1118,7 @@ def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() + expected = grouped.describe().stack(future_stack=True) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 33d7570a07d73..7978e596e6ee5 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -363,7 +363,7 @@ def test_partial_slicing_with_multiindex_series(self): ser = DataFrame( np.random.default_rng(2).random((1000, 1000)), index=date_range("2000-1-1", periods=1000), - ).stack() + ).stack(future_stack=True) s2 = ser[:-1].copy() expected = s2["2000-1-4"] diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 2b75efd130aa2..78b2c493ec116 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -37,12 +37,12 @@ def test_slice_locs_partial(self, idx): def test_slice_locs(self): df = tm.makeTimeDataFrame() - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index slob = slice(*idx.slice_locs(df.index[5], df.index[15])) sliced = stacked[slob] - expected = df[5:16].stack() + expected = df[5:16].stack(future_stack=True) tm.assert_almost_equal(sliced.values, expected.values) slob = slice( @@ -52,19 +52,19 @@ def test_slice_locs(self): ) ) sliced = stacked[slob] - expected = df[6:15].stack() + expected = df[6:15].stack(future_stack=True) tm.assert_almost_equal(sliced.values, expected.values) def test_slice_locs_with_type_mismatch(self): df = tm.makeTimeDataFrame() - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) df = tm.makeCustomDataframe(5, 5) - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): 
idx.slice_locs(timedelta(seconds=30)) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index 72b6754542fa6..45dd484eff4c6 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -235,7 +235,10 @@ def test_rangeindex_fallback_coercion_bug(): # GH 12893 df1 = pd.DataFrame(np.arange(100).reshape((10, 10))) df2 = pd.DataFrame(np.arange(100).reshape((10, 10))) - df = pd.concat({"df1": df1.stack(), "df2": df2.stack()}, axis=1) + df = pd.concat( + {"df1": df1.stack(future_stack=True), "df2": df2.stack(future_stack=True)}, + axis=1, + ) df.index.names = ["fizz", "buzz"] str(df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4b5fe5ff13c14..f49c8b6d53723 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1875,7 +1875,7 @@ def test_frame_int_overflow(self): ], ) def test_json_multiindex(self, dataframe, expected): - series = dataframe.stack() + series = dataframe.stack(future_stack=True) result = series.to_json(orient="index") assert result == expected @@ -1914,7 +1914,7 @@ def test_to_json_multiindex_escape(self): True, index=pd.date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], - ).stack() + ).stack(future_stack=True) result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 1fc2877e70c65..a447601f3d8c4 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -134,7 +134,7 @@ def test_append_series(setup_path): mi["C"] = "foo" mi.loc[3:5, "C"] = "bar" mi.set_index(["C", "B"], inplace=True) - s = mi.stack() + s = mi.stack(future_stack=True) s.index = s.index.droplevel(2) store.append("mi", s) tm.assert_series_equal(store["mi"], s, check_index_type=True) diff --git 
a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 39578212d4af0..db36221d8f510 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -34,7 +34,7 @@ def test_reset_index_dti_round_trip(self): def test_reset_index(self): df = tm.makeDataFrame()[:5] - ser = df.stack() + ser = df.stack(future_stack=True) ser.index.names = ["hash", "category"] ser.name = "value" diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index e37f955a91cd3..b294e2fcce9d8 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -133,7 +133,9 @@ def test_unstack_mixed_type_name_in_multiindex( def test_unstack_multi_index_categorical_values(): - mi = tm.makeTimeDataFrame().stack().index.rename(["major", "minor"]) + mi = ( + tm.makeTimeDataFrame().stack(future_stack=True).index.rename(["major", "minor"]) + ) ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") result = ser.unstack()