Skip to content

Commit cfd5fa4

Browse files
committed
groupby WIP
1 parent 7e006d4 commit cfd5fa4

File tree

10 files changed: 131 additions and 111 deletions.

pandas/core/apply.py

Lines changed: 8 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -434,22 +434,13 @@ def new_list_like(self, method: str) -> DataFrame | Series:
434434
obj = self.obj
435435
arg = cast(List[AggFuncTypeBase], self.f)
436436

437-
if not isinstance(obj, SelectionMixin):
438-
# i.e. obj is Series or DataFrame
439-
selected_obj = obj
440-
elif obj._selected_obj.ndim == 1:
441-
# For SeriesGroupBy this matches _obj_with_exclusions
442-
selected_obj = obj._selected_obj
443-
else:
444-
selected_obj = obj._obj_with_exclusions
445-
446437
results = []
447438
keys = []
448439
result_dim = None
449440

450441
for a in arg:
451442
try:
452-
new_res = getattr(selected_obj, method)(a)
443+
new_res = getattr(obj, method)(a)
453444
if result_dim is None:
454445
result_dim = getattr(new_res, "ndim", 0)
455446
elif getattr(new_res, "ndim", 0) != result_dim:
@@ -470,6 +461,7 @@ def new_list_like(self, method: str) -> DataFrame | Series:
470461
raise ValueError("no results")
471462

472463
try:
464+
473465
concatenated = concat(results, keys=keys, axis=1, sort=False)
474466
except TypeError:
475467
# we are concatting non-NDFrame objects,
@@ -597,7 +589,8 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
597589
else:
598590
# key used for column selection and output
599591
results = [
600-
getattr(obj._gotitem([key], ndim=1), method)(how)
592+
# ndim = 2 for groupby; act like we always have multiple columns
593+
getattr(obj._gotitem(key, ndim=2), method)(how)
601594
for key, how in arg.items()
602595
]
603596
if self.renamer is not None:
@@ -612,8 +605,9 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
612605

613606
# combine results
614607
if all(is_ndframe):
615-
axis = 1 if isinstance(obj, ABCSeries) else 0
616-
result = concat(results, axis=axis)
608+
result = concat(results, axis=1)
609+
if result.ndim == 1:
610+
result = result.to_frame()
617611
elif any(is_ndframe):
618612
# There is a mix of NDFrames and scalars
619613
raise ValueError(
@@ -632,7 +626,7 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
632626
else:
633627
name = None
634628

635-
result = Series(results, name=name)
629+
result = Series(results, index=arg.keys(), name=name)
636630

637631
return result
638632

pandas/core/frame.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8624,11 +8624,9 @@ def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
86248624
# False, columns and order will be None.
86258625
assert columns is not None
86268626
assert order is not None
8627-
if get_option("mode.new_udf_methods"):
8628-
result = result[list(columns)]
8629-
else:
8630-
result_in_dict = relabel_result(result, func, columns, order)
8631-
result = DataFrame(result_in_dict, index=columns)
8627+
8628+
result_in_dict = relabel_result(result, func, columns, order)
8629+
result = DataFrame(result_in_dict, index=columns)
86328630

86338631
return result
86348632

pandas/core/groupby/generic.py

Lines changed: 62 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626

2727
import numpy as np
2828

29+
from pandas._config import get_option
30+
2931
from pandas._libs import reduction as libreduction
3032
from pandas._typing import (
3133
ArrayLike,
@@ -897,49 +899,72 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
897899
result.columns = columns
898900

899901
if result is None:
900-
901-
# grouper specific aggregations
902-
if self.grouper.nkeys > 1:
903-
# test_groupby_as_index_series_scalar gets here with 'not self.as_index'
904-
return self._python_agg_general(func, *args, **kwargs)
905-
elif args or kwargs:
906-
# test_pass_args_kwargs gets here (with and without as_index)
907-
# can't return early
908-
result = self._aggregate_frame(func, *args, **kwargs)
909-
910-
elif self.axis == 1:
911-
# _aggregate_multiple_funcs does not allow self.axis == 1
912-
# Note: axis == 1 precludes 'not self.as_index', see __init__
913-
result = self._aggregate_frame(func)
914-
return result
915-
902+
if get_option("new_udf_methods"):
903+
if args or kwargs:
904+
# test_pass_args_kwargs gets here (with and without as_index)
905+
# can't return early
906+
result = self._aggregate_frame(func, *args, **kwargs)
907+
908+
elif self.axis == 1:
909+
# _aggregate_multiple_funcs does not allow self.axis == 1
910+
# Note: axis == 1 precludes 'not self.as_index', see __init__
911+
result = self._aggregate_frame(func)
912+
return result
913+
else:
914+
# test_groupby_as_index_series_scalar gets here
915+
# with 'not self.as_index'
916+
return self._python_agg_general(func, *args, **kwargs)
916917
else:
917-
918-
# try to treat as if we are passing a list
919-
gba = GroupByApply(self, [func], args=(), kwargs={})
920-
try:
921-
result = gba.agg()
922-
923-
except ValueError as err:
924-
if "no results" not in str(err):
925-
# raised directly by _aggregate_multiple_funcs
926-
raise
918+
# grouper specific aggregations
919+
if self.grouper.nkeys > 1:
920+
# test_groupby_as_index_series_scalar gets here with
921+
# 'not self.as_index'
922+
return self._python_agg_general(func, *args, **kwargs)
923+
elif args or kwargs:
924+
# test_pass_args_kwargs gets here (with and without as_index)
925+
# can't return early
926+
result = self._aggregate_frame(func, *args, **kwargs)
927+
928+
elif self.axis == 1:
929+
# _aggregate_multiple_funcs does not allow self.axis == 1
930+
# Note: axis == 1 precludes 'not self.as_index', see __init__
927931
result = self._aggregate_frame(func)
932+
return result
928933

929934
else:
930-
sobj = self._selected_obj
931935

932-
if isinstance(sobj, Series):
933-
# GH#35246 test_groupby_as_index_select_column_sum_empty_df
934-
result.columns = self._obj_with_exclusions.columns.copy()
936+
# try to treat as if we are passing a list
937+
if get_option("new_udf_methods"):
938+
gba = GroupByApply(self, func, args=(), kwargs={})
939+
else:
940+
gba = GroupByApply(self, [func], args=(), kwargs={})
941+
try:
942+
result = gba.agg()
943+
if get_option("new_udf_methods") and result is None:
944+
result = self._aggregate_frame(func)
945+
946+
except ValueError as err:
947+
if "no results" not in str(err):
948+
# raised directly by _aggregate_multiple_funcs
949+
raise
950+
result = self._aggregate_frame(func)
951+
935952
else:
936-
# Retain our column names
937-
result.columns._set_names(
938-
sobj.columns.names, level=list(range(sobj.columns.nlevels))
939-
)
940-
# select everything except for the last level, which is the one
941-
# containing the name of the function(s), see GH#32040
942-
# result.columns = result.columns.droplevel(-1)
953+
sobj = self._selected_obj
954+
955+
if isinstance(sobj, Series):
956+
# GH#35246 test_groupby_as_index_select_column_sum_empty_df
957+
result.columns = self._obj_with_exclusions.columns.copy()
958+
else:
959+
# Retain our column names
960+
result.columns._set_names(
961+
sobj.columns.names,
962+
level=list(range(sobj.columns.nlevels)),
963+
)
964+
# select everything except for the last level, which is the
965+
# one containing the name of the function(s), see GH#32040
966+
if not get_option("new_udf_methods"):
967+
result.columns = result.columns.droplevel(-1)
943968

944969
if not self.as_index:
945970
self._insert_inaxis_grouper_inplace(result)

pandas/tests/apply/test_frame_apply.py

Lines changed: 14 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1069,20 +1069,11 @@ def test_demo():
10691069
tm.assert_frame_equal(result, expected)
10701070

10711071
result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]})
1072-
if get_option("mode.new_udf_methods"):
1073-
expected = DataFrame(
1074-
{"min": [0.0, np.nan], "max": [4, 5], "sum": [np.nan, 25.0]},
1075-
columns=["max", "min", "sum"],
1076-
index=["A", "B"],
1077-
)
1078-
else:
1079-
expected = DataFrame(
1080-
{"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
1081-
columns=["A", "B"],
1082-
index=["max", "min", "sum"],
1083-
)
1084-
print(result)
1085-
print(expected)
1072+
expected = DataFrame(
1073+
{"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
1074+
columns=["A", "B"],
1075+
index=["max", "min", "sum"],
1076+
)
10861077
tm.assert_frame_equal(result.reindex_like(expected), expected)
10871078

10881079

@@ -1196,8 +1187,6 @@ def test_agg_reduce(axis, float_frame):
11961187
}
11971188
)
11981189
expected = expected.T if axis in {1, "columns"} else expected
1199-
if get_option("mode.new_udf_methods"):
1200-
expected = expected.T
12011190
tm.assert_frame_equal(result, expected)
12021191

12031192
# dict input with lists with multiple
@@ -1223,8 +1212,6 @@ def test_agg_reduce(axis, float_frame):
12231212
axis=1,
12241213
)
12251214
expected = expected.T if axis in {1, "columns"} else expected
1226-
if get_option("mode.new_udf_methods"):
1227-
expected = expected.T
12281215
tm.assert_frame_equal(result, expected)
12291216

12301217

@@ -1305,11 +1292,16 @@ def test_non_callable_aggregates(how):
13051292
"C": {"count": 2, "size": 3},
13061293
}
13071294
)
1308-
if get_option("mode.new_udf_methods"):
1309-
expected = expected.T
1295+
# if get_option("mode.new_udf_methods"):
1296+
# expected = expected.T
13101297

1311-
tm.assert_frame_equal(result1, result2, check_like=True)
1312-
tm.assert_frame_equal(result2, expected, check_like=True)
1298+
if get_option("new_udf_methods"):
1299+
tm.assert_frame_equal(result2, expected)
1300+
expected1 = expected.T
1301+
tm.assert_frame_equal(result1, expected1)
1302+
else:
1303+
tm.assert_frame_equal(result1, result2, check_like=True)
1304+
tm.assert_frame_equal(result2, expected, check_like=True)
13131305

13141306
# Just functional string arg is same as calling df.arg()
13151307
result = getattr(df, how)("count")

pandas/tests/apply/test_frame_apply_relabeling.py

Lines changed: 3 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -11,16 +11,12 @@ def test_agg_relabel():
1111
# simplest case with one column, one func
1212
result = df.agg(foo=("B", "sum"))
1313
expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"]))
14-
if pd.get_option("new_udf_methods"):
15-
expected = expected.T
1614

1715
tm.assert_frame_equal(result, expected)
1816

1917
# test on same column with different methods
2018
result = df.agg(foo=("B", "sum"), bar=("B", "min"))
2119
expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))
22-
if pd.get_option("new_udf_methods"):
23-
expected = expected.T
2420

2521
tm.assert_frame_equal(result, expected)
2622

@@ -44,23 +40,16 @@ def test_agg_relabel_multi_columns_multi_methods():
4440
},
4541
index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]),
4642
)
47-
if pd.get_option("new_udf_methods"):
48-
expected = expected.T
4943
tm.assert_frame_equal(result, expected)
5044

5145

5246
def test_agg_relabel_partial_functions():
5347
# GH 26513, test on partial, functools or more complex cases
5448
df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
5549
result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
56-
if pd.get_option("new_udf_methods"):
57-
expected = pd.DataFrame(
58-
[[1.5, 1.5, 1]], index=["A"], columns=pd.Index(["foo", "bar", "cat"])
59-
)
60-
else:
61-
expected = pd.DataFrame(
62-
{"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
63-
)
50+
expected = pd.DataFrame(
51+
{"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
52+
)
6453
tm.assert_frame_equal(result, expected)
6554

6655
result = df.agg(
@@ -79,8 +68,6 @@ def test_agg_relabel_partial_functions():
7968
},
8069
index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
8170
)
82-
if pd.get_option("new_udf_methods"):
83-
expected = expected.T
8471
tm.assert_frame_equal(result, expected)
8572

8673

@@ -97,8 +84,6 @@ def test_agg_namedtuple():
9784
expected = pd.DataFrame(
9885
{"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])
9986
)
100-
if pd.get_option("new_udf_methods"):
101-
expected = expected.T
10287
tm.assert_frame_equal(result, expected)
10388

10489
result = df.agg(
@@ -110,6 +95,4 @@ def test_agg_namedtuple():
11095
{"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
11196
index=pd.Index(["foo", "bar", "cat"]),
11297
)
113-
if pd.get_option("new_udf_methods"):
114-
expected = expected.T
11598
tm.assert_frame_equal(result, expected)

pandas/tests/apply/test_series_apply.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -854,8 +854,6 @@ def test_apply_dictlike_reducer(string_series, ops, how):
854854
{name: op(string_series) for name, op in ops.items()}, name="series"
855855
)
856856
result = getattr(string_series, how)(ops)
857-
print(result)
858-
print(expected)
859857
tm.assert_equal(result, expected)
860858
else:
861859
expected = Series({name: op(string_series) for name, op in ops.items()})

pandas/tests/groupby/aggregate/test_aggregate.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
MultiIndex,
2121
Series,
2222
concat,
23+
get_option,
2324
to_datetime,
2425
)
2526
import pandas._testing as tm
@@ -845,11 +846,19 @@ def test_groupby_aggregate_empty_key(kwargs):
845846
# GH: 32580
846847
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
847848
result = df.groupby("a").agg(kwargs)
848-
expected = DataFrame(
849-
[1, 4],
850-
index=Index([1, 2], dtype="int64", name="a"),
851-
columns=MultiIndex.from_tuples([["c", "min"]]),
852-
)
849+
print(result)
850+
if get_option("new_udf_methods"):
851+
expected = DataFrame(
852+
[1, 4],
853+
index=Index([1, 2], dtype="int64", name="a"),
854+
columns=MultiIndex.from_tuples([["min", "c"]]),
855+
)
856+
else:
857+
expected = DataFrame(
858+
[1, 4],
859+
index=Index([1, 2], dtype="int64", name="a"),
860+
columns=MultiIndex.from_tuples([["c", "min"]]),
861+
)
853862
tm.assert_frame_equal(result, expected)
854863

855864

@@ -916,6 +925,9 @@ def test_multiindex_custom_func(func):
916925
}
917926
expected = DataFrame(expected_dict)
918927
expected.columns = df.columns
928+
print(df)
929+
print(result)
930+
print(expected)
919931
tm.assert_frame_equal(result, expected)
920932

921933

pandas/tests/groupby/test_categorical.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1477,6 +1477,9 @@ def test_groupby_agg_categorical_columns(func, expected_values):
14771477
expected = DataFrame(
14781478
{"value": expected_values}, index=Index([0, 1, 2], name="groups")
14791479
)
1480+
print(df)
1481+
print(result)
1482+
print(expected)
14801483
tm.assert_frame_equal(result, expected)
14811484

14821485

0 commit comments

Comments
 (0)