From 8249ceba848e13ef53c7d91cbd58dca00bd9ee0b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 14 Nov 2023 16:32:53 -0800 Subject: [PATCH 1/4] Refactor lreshape --- pandas/core/reshape/melt.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index e333d263a6b7a..ae55ac19b3ff0 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -139,7 +139,7 @@ def melt( return result -def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: +def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. @@ -192,30 +192,20 @@ def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: 2 Red Sox 2008 545 3 Yankees 2008 526 """ - if isinstance(groups, dict): - keys = list(groups.keys()) - values = list(groups.values()) - else: - keys, values = zip(*groups) - - all_cols = list(set.union(*(set(x) for x in values))) - id_cols = list(data.columns.difference(all_cols)) - - K = len(values[0]) - - for seq in values: - if len(seq) != K: - raise ValueError("All column lists must be same length") - mdata = {} pivot_cols = [] - - for target, names in zip(keys, values): + all_cols = set() + K = len(next(iter(groups.values()))) + for target, names in groups.items(): + if len(names) != K: + raise ValueError("All column lists must be same length") to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) + all_cols = all_cols.union(names) + id_cols = list(data.columns.difference(all_cols)) for col in id_cols: mdata[col] = np.tile(data[col]._values, K) From 49ab895cc5516a36ef1238289c5a8b7460104314 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Nov 2023 10:08:34 -0800 Subject: [PATCH 2/4] Refactor wide_to_long validation --- pandas/core/reshape/melt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index ae55ac19b3ff0..3ee9740e99cbc 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -457,10 +457,10 @@ def wide_to_long( two 2.9 """ - def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]: + def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" pattern = re.compile(regex) - return [col for col in df.columns if pattern.match(col)] + return df.columns[df.columns.str.match(pattern)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( @@ -487,7 +487,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): else: stubnames = list(stubnames) - if any(col in stubnames for col in df.columns): + if df.columns.isin(stubnames).any(): raise ValueError("stubname can't be identical to a column name") if not is_list_like(i): From e8b7134e58a786e421e5e89fdbc70965e5e66045 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:06:01 -0800 Subject: [PATCH 3/4] Refactor wide_to_long --- pandas/core/reshape/melt.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 3ee9740e99cbc..7d587a7a8f5fe 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -12,7 +12,6 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat from pandas.core.reshape.util import tile_compat @@ -470,7 +469,6 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): value_name=stub.rstrip(sep), var_name=j, ) - newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float @@ -498,18 +496,18 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): if df[i].duplicated().any(): raise ValueError("the id variables need to uniquely identify each row") - value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames] + _melted = [] + value_vars_flattened = [] + for stub in stubnames: + value_var = get_var_names(df, stub, sep, suffix) + value_vars_flattened.extend(value_var) + _melted.append(melt_stub(df, stub, i, j, value_var, sep)) - value_vars_flattened = [e for sublist in value_vars for e in sublist] - id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - - _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = _melted[0].join(_melted[1:], how="outer") + melted = concat(_melted, axis=1) + id_vars = df.columns.difference(value_vars_flattened) + new = df[id_vars] if len(i) == 1: - new = df[id_vars].set_index(i).join(melted) - return new - - new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) - - return new + return new.set_index(i).join(melted) + else: + return new.merge(melted.reset_index(), on=i).set_index(i + [j]) From fff31744da3dc33c63d03bf638a9dd468a6d5750 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:50:25 -0800 Subject: [PATCH 4/4] Annotation --- pandas/core/reshape/melt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 7d587a7a8f5fe..bb1cd0d738dac 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -193,7 +193,7 @@ def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: """ mdata = {} pivot_cols = [] - all_cols = set() + all_cols: set[Hashable] = set() K = len(next(iter(groups.values()))) for target, names in groups.items(): if len(names) != K: