Skip to content

REF: Break up stack_v3 #58792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 78 additions & 51 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,27 +925,99 @@ def _reorder_for_extension_array_stack(
def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    """Stack the given column levels of ``frame`` into its index.

    The data is reshaped by ``stack_reshape``; this function then builds the
    resulting MultiIndex (the frame's index levels followed by the stacked
    column levels) and reorders the rows.

    Parameters
    ----------
    frame : DataFrame
        DataFrame to be stacked. Its columns must be unique.
    level : list of int
        Level numbers of the columns to stack, in the order they should
        appear in the resulting index.

    Returns
    -------
    Series or DataFrame
        A Series (with ``name`` set to None) when every column level is
        stacked, otherwise a DataFrame.

    Raises
    ------
    ValueError
        If ``frame.columns`` contains duplicate values.
    """
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    set_levels = set(level)
    # Keep only the levels being stacked by dropping every other level;
    # the level numbers to drop are generated in descending order.
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
    )

    result = stack_reshape(frame, level, set_levels, stack_cols)

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    ratio = 0 if frame.empty else len(result) // len(frame)

    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))

    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        assert isinstance(stack_cols, MultiIndex)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    ordered_stack_cols_unique = ordered_stack_cols.unique()
    if isinstance(ordered_stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols_unique]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]

    # Each stacked label is repeated once per row of the original frame.
    # error: Incompatible types in assignment (expression has type "list[ndarray[Any,
    # dtype[Any]]]", variable has type "FrozenList")
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]  # type: ignore[assignment]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    # NOTE(review): this assumes the reshaped data holds one block of
    # len(frame) rows per unique stacked label -- confirm against stack_reshape.
    # Position j * len_df + i is taken for each row i (outer) and label j (inner).
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        # Every column level was stacked, so the result collapses to a Series.
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result


def stack_reshape(
frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index
) -> Series | DataFrame:
"""Reshape the data of a frame for stack.

This function takes care of most of the work that stack needs to do. Caller
will sort the result once the appropriate index is set.

Parameters
----------
frame: DataFrame
DataFrame that is to be stacked.
level: list of ints.
Levels of the columns to stack.
set_levels: set of ints.
Same as level, but as a set.
stack_cols: Index.
Columns of the result when the DataFrame is stacked.

Returns
-------
The data behind the stacked DataFrame.
"""
# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level, reverse=True)

# Grab data for each unique index to be stacked
buf = []
for idx in stack_cols_unique:
for idx in stack_cols.unique():
if len(frame.columns) == 1:
data = frame.copy()
else:
Expand All @@ -972,10 +1044,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
data.columns = RangeIndex(len(data.columns))
buf.append(data)

result: Series | DataFrame
if len(buf) > 0 and not frame.empty:
result = concat(buf, ignore_index=True)
ratio = len(result) // len(frame)
else:
# input is empty
if len(level) < frame.columns.nlevels:
Expand All @@ -984,54 +1054,11 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
else:
new_columns = [0]
result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
ratio = 0

if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]

# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
index_levels: list | FrozenList
if isinstance(frame.index, MultiIndex):
index_levels = frame.index.levels
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
else:
codes, uniques = factorize(frame.index, use_na_sentinel=False)
index_levels = [uniques]
index_codes = list(np.tile(codes, (1, ratio)))
if isinstance(ordered_stack_cols, MultiIndex):
column_levels = ordered_stack_cols.levels
column_codes = ordered_stack_cols.drop_duplicates().codes
else:
column_levels = [ordered_stack_cols.unique()]
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
# error: Incompatible types in assignment (expression has type "list[ndarray[Any,
# dtype[Any]]]", variable has type "FrozenList")
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment]
result.index = MultiIndex(
levels=index_levels + column_levels,
codes=index_codes + column_codes,
names=frame.index.names + list(ordered_stack_cols.names),
verify_integrity=False,
)

# sort result, but faster than calling sort_index since we know the order we need
len_df = len(frame)
n_uniques = len(ordered_stack_cols_unique)
indexer = np.arange(n_uniques)
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
result = result.take(idxs)

# Reshape/rename if needed and dropna
if result.ndim == 2 and frame.columns.nlevels == len(level):
if len(result.columns) == 0:
result = Series(index=result.index)
else:
result = result.iloc[:, 0]
if result.ndim == 1:
result.name = None

return result