From 793200b8992918a3461e9c475d5805d402ea8aa7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Jun 2023 20:26:04 -0400 Subject: [PATCH 01/10] BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/reshape/reshape.py | 11 ++++++- pandas/tests/frame/test_stack_unstack.py | 38 +++++++++++++++++++----- pyproject.toml | 1 + 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index dd67d1f158c47..d656867965764 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -470,6 +470,7 @@ Reshaping - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) +- Bug in :meth:`DataFrame.stack` would incorrectly order results when ``sort=True`` and the input had :class:`MultiIndex` levels that were not sorted (:issue:`53636`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3866d30e9c757..f6ce9955bc2bc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -756,7 +756,16 @@ def _convert_level_number(level_num: int, columns: Index): level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) if sort: + _, index, inverse = np.unique( + level_vals, return_index=True, return_inverse=True + ) + sorted_level_vals = np.take(level_vals, index) level_codes = np.sort(level_codes) + # Take level_codes according to where level_vals get sorted to, while + # also allowing for NA (-1) values + level_codes = np.where(level_codes == -1, -1, np.take(inverse, level_codes)) + else: + sorted_level_vals = level_vals level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) @@ -818,7 +827,7 @@ def _convert_level_number(level_num: int, columns: Index): new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? - new_levels.append(level_vals) + new_levels.append(sorted_level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2818df721db34..9900db7910c46 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1998,18 +1998,20 @@ def __init__(self, *args, **kwargs) -> None: ), ) @pytest.mark.parametrize("stack_lev", range(2)) - def test_stack_order_with_unsorted_levels(self, levels, stack_lev): + @pytest.mark.parametrize("sort", [True, False]) + def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): # GH#16323 # deep check for 1-row case columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) - df_stacked = df.stack(stack_lev) - assert all( - df.loc[row, col] - == df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]] - for row in df.index - for col in df.columns - ) + df_stacked = df.stack(stack_lev, sort=sort) + for row in df.index: + for col in df.columns: + expected = df.loc[row, col] + result_row = row, col[stack_lev] + result_col = col[1 - stack_lev] + result = df_stacked.loc[result_row, result_col] + assert result == expected def test_stack_order_with_unsorted_levels_multi_row(self): # GH#16323 @@ -2028,6 +2030,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self): for col in df.columns ) + def test_stack_order_with_unsorted_levels_multi_row_2(self): + # GH#53636 + levels = ((0, 1), (1, 0)) + stack_lev = 1 + columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3]) + result = df.stack(stack_lev, sort=True) + expected_index = MultiIndex( + levels=[[0, 1, 2, 3], [0, 1]], + codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]], + ) + expected = DataFrame( + { + 0: [0, 1, 0, 1, 0, 1, 0, 1], + 1: [2, 3, 2, 3, 2, 3, 2, 3], + }, + index=expected_index, + ) + tm.assert_frame_equal(result, expected) + def test_stack_unstack_unordered_multiindex(self): # GH# 18265 values = np.arange(5) diff --git a/pyproject.toml b/pyproject.toml index 6f91aa2360406..3021a6c94b8af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -486,6 +486,7 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", + "ignore:indexing past lexsort depth", ] junit_family = "xunit2" markers = [ From d524f1ddf4fd48575a24fb17be6966522d853bf9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Jun 2023 22:24:57 -0400 Subject: [PATCH 02/10] revert ignoring warnings --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3021a6c94b8af..6f91aa2360406 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -486,7 +486,6 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", - "ignore:indexing past lexsort depth", ] junit_family = "xunit2" markers = [ From 24426fa015e486d8a0590fb3b942d41f28829123 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 21 Jun 2023 22:44:20 -0400 Subject: [PATCH 03/10] BUG: DataFrame.stack sorting columns --- pandas/core/reshape/reshape.py | 5 +++++ pandas/tests/frame/test_stack_unstack.py | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3866d30e9c757..6b452f7cdaecf 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -828,6 +828,11 @@ def _convert_level_number(level_num: int, columns: Index): result = frame._constructor(new_data, index=new_index, columns=new_columns) + if frame.columns.nlevels > 1: + desired_columns = frame.columns._drop_level_numbers([level_num]).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6cc6534da0b87..c9caef51a432f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -516,10 +516,10 @@ def test_unstack_level_binding(self): expected = DataFrame( np.array( - [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64 ), index=expected_mi, - columns=Index(["a", "b"], name="third"), + columns=Index(["b", "a"], name="third"), ) tm.assert_frame_equal(result, expected) @@ -1536,7 +1536,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): # columns unsorted unstacked = ymd.unstack() - unstacked = unstacked.sort_index(axis=1, ascending=False) restacked = unstacked.stack() tm.assert_frame_equal(restacked, ymd) From c333189512387a65b28e121ecaca35050a9b59c3 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 21 Jun 2023 22:51:13 -0400 Subject: [PATCH 04/10] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 511e5793608bc..c2f27bcf8df1d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -494,9 +494,9 @@ Reshaping - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) +- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) -- Sparse ^^^^^^ From 739af16489a0063cd9f5f8138c4033066d94d01d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 22 Jun 2023 17:02:05 -0400 Subject: [PATCH 05/10] Docstring fixup --- pandas/core/frame.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 635073efe9357..53b99126da77f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9108,11 +9108,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): cat 1.0 2.0 dog 3.0 4.0 >>> df_multi_level_cols2.stack() - height weight - cat kg NaN 1.0 - m 2.0 NaN - dog kg NaN 3.0 - m 4.0 NaN + weight height + cat kg 1.0 NaN + m NaN 2.0 + dog kg 3.0 NaN + m NaN 4.0 **Prescribing the level(s) to be stacked** @@ -9147,16 +9147,16 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): cat NaN 1.0 dog 2.0 3.0 >>> df_multi_level_cols3.stack(dropna=False) - height weight + weight height cat kg NaN NaN - m 1.0 NaN - dog kg NaN 2.0 - m 3.0 NaN + m NaN 1.0 + dog kg 2.0 NaN + m NaN 3.0 >>> df_multi_level_cols3.stack(dropna=True) - height weight - cat m 1.0 NaN - dog kg NaN 2.0 - m 3.0 NaN + weight height + cat m NaN 1.0 + dog kg 2.0 NaN + m NaN 3.0 """ from pandas.core.reshape.reshape import ( stack, From 9be5486eba20129cd0e7d205dc420f8db048b506 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 22 Jun 2023 17:03:16 -0400 Subject: [PATCH 06/10] Merge cleanup --- pandas/core/reshape/reshape.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fc3a3768d7b56..6b452f7cdaecf 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -756,16 +756,7 @@ def _convert_level_number(level_num: int, columns: Index): level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) if sort: - _, index, inverse = np.unique( - level_vals, return_index=True, return_inverse=True - ) - sorted_level_vals = np.take(level_vals, index) level_codes = np.sort(level_codes) - # Take level_codes according to where level_vals get sorted to, while - # also allowing for NA (-1) values - level_codes = np.where(level_codes == -1, -1, np.take(inverse, level_codes)) - else: - sorted_level_vals = level_vals level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) @@ -827,7 +818,7 @@ def _convert_level_number(level_num: int, columns: Index): new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? - new_levels.append(sorted_level_vals) + new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) From 2ace984114124e2653fa0b2361c2f01ef58e84d5 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 23 Jun 2023 15:32:13 -0400 Subject: [PATCH 07/10] WIP --- pandas/core/reshape/reshape.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6b452f7cdaecf..ffd84bd42efac 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -740,13 +740,13 @@ def _convert_level_number(level_num: int, columns: Index): roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = mi_cols = roll_columns - if not mi_cols._is_lexsorted() and sort: - # Workaround the edge case where 0 is one of the column names, - # which interferes with trying to sort based on the first - # level - level_to_sort = _convert_level_number(0, mi_cols) - this = this.sort_index(level=level_to_sort, axis=1) - mi_cols = this.columns + # if not mi_cols._is_lexsorted() and sort: + # # Workaround the edge case where 0 is one of the column names, + # # which interferes with trying to sort based on the first + # # level + # level_to_sort = _convert_level_number(0, mi_cols) + # this = this.sort_index(level=level_to_sort, axis=1) + # mi_cols = this.columns mi_cols = cast(MultiIndex, mi_cols) new_columns = _stack_multi_column_index(mi_cols) @@ -755,8 +755,8 @@ def _convert_level_number(level_num: int, columns: Index): new_data = {} level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) - if sort: - level_codes = np.sort(level_codes) + # if sort: + # level_codes = np.sort(level_codes) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) @@ -764,7 +764,9 @@ def _convert_level_number(level_num: int, columns: Index): drop_cols = [] for key in new_columns: try: - loc = this.columns.get_loc(key) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PerformanceWarning) + loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue @@ -784,6 +786,8 @@ def _convert_level_number(level_num: int, columns: Index): value_slice = chunk.reindex(columns=level_vals_used).values else: subset = this.iloc[:, loc] + subset = this.loc[:, this.columns[loc]] + subset.columns = level_vals_nan.take(subset.columns.codes[-1]) dtype = find_common_type(subset.dtypes.tolist()) if isinstance(dtype, ExtensionDtype): # TODO(EA2D): won't need special case, can go through .values @@ -795,7 +799,7 @@ def _convert_level_number(level_num: int, columns: Index): idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) else: - value_slice = subset.values + value_slice = subset.reindex(columns=level_vals_used).values if value_slice.ndim > 1: # i.e. not extension From 3ffb3789e446b70e9e69bf31f56247c98fd159a4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 23 Jun 2023 19:38:53 +0200 Subject: [PATCH 08/10] BUG: DataFrame.stack sometimes sorting the resulting index --- doc/source/whatsnew/v2.1.0.rst | 3 ++- pandas/core/frame.py | 16 ++++++++-------- pandas/core/reshape/reshape.py | 23 +++++++++-------------- pandas/tests/frame/test_stack_unstack.py | 6 +++--- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7b9efd7f593dd..ef95ca9baacd3 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -498,7 +498,8 @@ Reshaping - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) -- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`) +- Bug in :meth:`DataFrame.stack` sorting columns lexicographically in rare cases (:issue:`53786`) +- Bug in :meth:`DataFrame.stack` sorting index lexicographically in rare cases (:issue:`53824`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4088736dd4150..702cb3e39cb83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9120,15 +9120,15 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): >>> df_multi_level_cols2.stack(0) kg m - cat height NaN 2.0 - weight 1.0 NaN - dog height NaN 4.0 - weight 3.0 NaN + cat weight 1.0 NaN + height NaN 2.0 + dog weight 3.0 NaN + height NaN 4.0 >>> df_multi_level_cols2.stack([0, 1]) - cat height m 2.0 - weight kg 1.0 - dog height m 4.0 - weight kg 3.0 + cat weight kg 1.0 + height m 2.0 + dog weight kg 3.0 + height m 4.0 dtype: float64 **Dropping missing values** diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6b452f7cdaecf..22f5c3949a4d7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,5 @@ from __future__ import annotations -import itertools from typing import ( TYPE_CHECKING, cast, @@ -694,7 +693,8 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) - unique_tuples = (key for key, _ in itertools.groupby(tuples)) + seen = set() + unique_tuples = (key for key in tuples if not (key in seen or seen.add(key))) new_levs = zip(*unique_tuples) # The dtype of each level must be explicitly set to avoid inferring the wrong type. @@ -740,14 +740,6 @@ def _convert_level_number(level_num: int, columns: Index): roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = mi_cols = roll_columns - if not mi_cols._is_lexsorted() and sort: - # Workaround the edge case where 0 is one of the column names, - # which interferes with trying to sort based on the first - # level - level_to_sort = _convert_level_number(0, mi_cols) - this = this.sort_index(level=level_to_sort, axis=1) - mi_cols = this.columns - mi_cols = cast(MultiIndex, mi_cols) new_columns = _stack_multi_column_index(mi_cols) @@ -755,8 +747,6 @@ def _convert_level_number(level_num: int, columns: Index): new_data = {} level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) - if sort: - level_codes = np.sort(level_codes) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) @@ -764,7 +754,9 @@ def _convert_level_number(level_num: int, columns: Index): drop_cols = [] for key in new_columns: try: - loc = this.columns.get_loc(key) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PerformanceWarning) + loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue @@ -774,9 +766,12 @@ def _convert_level_number(level_num: int, columns: Index): # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): - slice_len = len(loc) + slice_len = loc.sum() else: slice_len = loc.stop - loc.start + if loc.step is not None: + # Integer division using ceiling instead of floor + slice_len = -(slice_len // -loc.step) if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index a48728a778877..ee31313f0868e 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1099,7 +1099,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): "labels,data", [ (list("xyz"), [10, 11, 12, 13, 14, 15]), - (list("zyx"), [14, 15, 12, 13, 10, 11]), + (list("zyx"), [10, 11, 12, 13, 14, 15]), ], ) def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): @@ -1107,10 +1107,10 @@ def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered) cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered) midx = MultiIndex.from_product([cidx, cidx2]) - df = DataFrame([sorted(data)], columns=midx) + df = DataFrame([data], columns=midx) result = df.stack([0, 1]) - s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) + s_cidx = pd.CategoricalIndex(labels, ordered=ordered) expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) tm.assert_series_equal(result, expected) From 1f7dc616d218aadb02f0736839c5648095c9588a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 23 Jun 2023 18:09:14 -0400 Subject: [PATCH 09/10] mypy fixups --- pandas/core/reshape/reshape.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 22f5c3949a4d7..a9a45995da6be 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -694,7 +694,13 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) seen = set() - unique_tuples = (key for key in tuples if not (key in seen or seen.add(key))) + # mypy doesn't like our trickery to get `set.add` to work in a comprehension + # error: "add" of "set" does not return a value + unique_tuples = ( + key + for key in tuples + if not (key in seen or seen.add(key)) # type: ignore[func-returns-value] + ) new_levs = zip(*unique_tuples) # The dtype of each level must be explicitly set to avoid inferring the wrong type. @@ -740,7 +746,6 @@ def _convert_level_number(level_num: int, columns: Index): roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = mi_cols = roll_columns - mi_cols = cast(MultiIndex, mi_cols) new_columns = _stack_multi_column_index(mi_cols) # time to ravel the values From 26a68de5bc83afdb73fa8f82579f3f70beab929e Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 26 Jun 2023 22:49:41 -0400 Subject: [PATCH 10/10] Remove sort argument from DataFrame.stack --- doc/source/whatsnew/v2.1.0.rst | 1 - pandas/core/frame.py | 8 +++----- pandas/core/reshape/reshape.py | 16 +++++++--------- pandas/tests/frame/test_stack_unstack.py | 21 ++++++++++----------- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 46bb395dd973b..13a4a4531e6d0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -104,7 +104,6 @@ Other enhancements - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) -- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) - :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c41065959fc05..d12c07b3caca4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9010,7 +9010,7 @@ def pivot_table( sort=sort, ) - def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): + def stack(self, level: IndexLabel = -1, dropna: bool = True): """ Stack the prescribed level(s) from columns to index. @@ -9036,8 +9036,6 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): axis can create combinations of index and column values that are missing from the original dataframe. See Examples section. - sort : bool, default True - Whether to sort the levels of the resulting MultiIndex. Returns ------- @@ -9181,9 +9179,9 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): ) if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) + result = stack_multiple(self, level, dropna=dropna) else: - result = stack(self, level, dropna=dropna, sort=sort) + result = stack(self, level, dropna=dropna) return result.__finalize__(self, method="stack") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3f08ec15de909..b0c74745511c4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -498,7 +498,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(dropna=False, sort=sort) + return obj.T.stack(dropna=False) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose @@ -571,7 +571,7 @@ def _unstack_extension_series( return result -def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): +def stack(frame: DataFrame, level=-1, dropna: bool = True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index @@ -593,9 +593,7 @@ def factorize(index): level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): - return _stack_multi_columns( - frame, level_num=level_num, dropna=dropna, sort=sort - ) + return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] @@ -648,13 +646,13 @@ def factorize(index): return frame._constructor_sliced(new_values, index=new_index) -def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True): +def stack_multiple(frame: DataFrame, level, dropna: bool = True): # If all passed levels match up to column names, no # ambiguity about what to do if all(lev in frame.columns.names for lev in level): result = frame for lev in level: - result = stack(result, lev, dropna=dropna, sort=sort) + result = stack(result, lev, dropna=dropna) # Otherwise, level numbers may change as each successive level is stacked elif all(isinstance(lev, int) for lev in level): @@ -667,7 +665,7 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = Tr while level: lev = level.pop(0) - result = stack(result, lev, dropna=dropna, sort=sort) + result = stack(result, lev, dropna=dropna) # Decrement all level numbers greater than current, as these # have now shifted down by one level = [v if v <= lev else v - 1 for v in level] @@ -716,7 +714,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: def _stack_multi_columns( - frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True + frame: DataFrame, level_num: int = -1, dropna: bool = True ) -> DataFrame: def _convert_level_number(level_num: int, columns: Index): """ diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index ee31313f0868e..ffdcb06ee2847 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1400,8 +1400,8 @@ def test_unstack_non_slice_like_blocks(using_array_manager): tm.assert_frame_equal(res, expected) -def test_stack_sort_false(): - # GH 15105 +def test_stack_nosort(): + # GH 15105, GH 53825 data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]] df = DataFrame( data, @@ -1409,7 +1409,7 @@ def test_stack_sort_false(): levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] ), ) - result = df.stack(level=0, sort=False) + result = df.stack(level=0) expected = DataFrame( {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), @@ -1421,15 +1421,15 @@ def test_stack_sort_false(): data, columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]), ) - result = df.stack(level=0, sort=False) + result = df.stack(level=0) tm.assert_frame_equal(result, expected) -def test_stack_sort_false_multi_level(): - # GH 15105 +def test_stack_nosort_multi_level(): + # GH 15105, GH 53825 idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx) - result = df.stack([0, 1], sort=False) + result = df.stack([0, 1]) expected_index = MultiIndex.from_tuples( [ ("cat", "weight", "kg"), @@ -1999,13 +1999,12 @@ def __init__(self, *args, **kwargs) -> None: ), ) @pytest.mark.parametrize("stack_lev", range(2)) - @pytest.mark.parametrize("sort", [True, False]) - def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): + def test_stack_order_with_unsorted_levels(self, levels, stack_lev): # GH#16323 # deep check for 1-row case columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) - df_stacked = df.stack(stack_lev, sort=sort) + df_stacked = df.stack(stack_lev) for row in df.index: for col in df.columns: expected = df.loc[row, col] @@ -2037,7 +2036,7 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self): stack_lev = 1 columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3]) - result = df.stack(stack_lev, sort=True) + result = df.stack(stack_lev) expected_index = MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],