groupby WIP

rhshadrach · rhshadrach · commit cfd5fa40d293 · 2021-09-19T10:40:56.000-04:00
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -434,22 +434,13 @@ def new_list_like(self, method: str) -> DataFrame | Series:
         obj = self.obj
         arg = cast(List[AggFuncTypeBase], self.f)
 
-        if not isinstance(obj, SelectionMixin):
-            # i.e. obj is Series or DataFrame
-            selected_obj = obj
-        elif obj._selected_obj.ndim == 1:
-            # For SeriesGroupBy this matches _obj_with_exclusions
-            selected_obj = obj._selected_obj
-        else:
-            selected_obj = obj._obj_with_exclusions
-
         results = []
         keys = []
         result_dim = None
 
         for a in arg:
             try:
-                new_res = getattr(selected_obj, method)(a)
+                new_res = getattr(obj, method)(a)
                 if result_dim is None:
                     result_dim = getattr(new_res, "ndim", 0)
                 elif getattr(new_res, "ndim", 0) != result_dim:
@@ -470,6 +461,7 @@ def new_list_like(self, method: str) -> DataFrame | Series:
             raise ValueError("no results")
 
         try:
+
             concatenated = concat(results, keys=keys, axis=1, sort=False)
         except TypeError:
             # we are concatting non-NDFrame objects,
@@ -597,7 +589,8 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
         else:
             # key used for column selection and output
             results = [
-                getattr(obj._gotitem([key], ndim=1), method)(how)
+                # ndim=2 for groupby: treat every key as a multi-column selection
+                getattr(obj._gotitem(key, ndim=2), method)(how)
                 for key, how in arg.items()
             ]
             if self.renamer is not None:
@@ -612,8 +605,9 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
 
         # combine results
         if all(is_ndframe):
-            axis = 1 if isinstance(obj, ABCSeries) else 0
-            result = concat(results, axis=axis)
+            result = concat(results, axis=1)
+            if result.ndim == 1:
+                result = result.to_frame()
         elif any(is_ndframe):
             # There is a mix of NDFrames and scalars
             raise ValueError(
@@ -632,7 +626,7 @@ def new_dict_like(self, method: str) -> DataFrame | Series:
             else:
                 name = None
 
-            result = Series(results, name=name)
+            result = Series(results, index=arg.keys(), name=name)
 
         return result
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -8624,11 +8624,9 @@ def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
             # False, columns and order will be None.
             assert columns is not None
             assert order is not None
-            if get_option("mode.new_udf_methods"):
-                result = result[list(columns)]
-            else:
-                result_in_dict = relabel_result(result, func, columns, order)
-                result = DataFrame(result_in_dict, index=columns)
+
+            result_in_dict = relabel_result(result, func, columns, order)
+            result = DataFrame(result_in_dict, index=columns)
 
         return result
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -26,6 +26,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import reduction as libreduction
 from pandas._typing import (
     ArrayLike,
@@ -897,49 +899,72 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
             result.columns = columns
 
         if result is None:
-
-            # grouper specific aggregations
-            if self.grouper.nkeys > 1:
-                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
-                return self._python_agg_general(func, *args, **kwargs)
-            elif args or kwargs:
-                # test_pass_args_kwargs gets here (with and without as_index)
-                # can't return early
-                result = self._aggregate_frame(func, *args, **kwargs)
-
-            elif self.axis == 1:
-                # _aggregate_multiple_funcs does not allow self.axis == 1
-                # Note: axis == 1 precludes 'not self.as_index', see __init__
-                result = self._aggregate_frame(func)
-                return result
-
+            if get_option("new_udf_methods"):
+                if args or kwargs:
+                    # test_pass_args_kwargs gets here (with and without as_index)
+                    # can't return early
+                    result = self._aggregate_frame(func, *args, **kwargs)
+
+                elif self.axis == 1:
+                    # _aggregate_multiple_funcs does not allow self.axis == 1
+                    # Note: axis == 1 precludes 'not self.as_index', see __init__
+                    result = self._aggregate_frame(func)
+                    return result
+                else:
+                    # test_groupby_as_index_series_scalar gets here
+                    # with 'not self.as_index'
+                    return self._python_agg_general(func, *args, **kwargs)
             else:
-
-                # try to treat as if we are passing a list
-                gba = GroupByApply(self, [func], args=(), kwargs={})
-                try:
-                    result = gba.agg()
-
-                except ValueError as err:
-                    if "no results" not in str(err):
-                        # raised directly by _aggregate_multiple_funcs
-                        raise
+                # grouper specific aggregations
+                if self.grouper.nkeys > 1:
+                    # test_groupby_as_index_series_scalar gets here with
+                    # 'not self.as_index'
+                    return self._python_agg_general(func, *args, **kwargs)
+                elif args or kwargs:
+                    # test_pass_args_kwargs gets here (with and without as_index)
+                    # can't return early
+                    result = self._aggregate_frame(func, *args, **kwargs)
+
+                elif self.axis == 1:
+                    # _aggregate_multiple_funcs does not allow self.axis == 1
+                    # Note: axis == 1 precludes 'not self.as_index', see __init__
                     result = self._aggregate_frame(func)
+                    return result
 
                 else:
-                    sobj = self._selected_obj
 
-                    if isinstance(sobj, Series):
-                        # GH#35246 test_groupby_as_index_select_column_sum_empty_df
-                        result.columns = self._obj_with_exclusions.columns.copy()
+                    # try to treat as if we are passing a list
+                    if get_option("new_udf_methods"):
+                        gba = GroupByApply(self, func, args=(), kwargs={})
+                    else:
+                        gba = GroupByApply(self, [func], args=(), kwargs={})
+                    try:
+                        result = gba.agg()
+                        if get_option("new_udf_methods") and result is None:
+                            result = self._aggregate_frame(func)
+
+                    except ValueError as err:
+                        if "no results" not in str(err):
+                            # raised directly by _aggregate_multiple_funcs
+                            raise
+                        result = self._aggregate_frame(func)
+
                     else:
-                        # Retain our column names
-                        result.columns._set_names(
-                            sobj.columns.names, level=list(range(sobj.columns.nlevels))
-                        )
-                        # select everything except for the last level, which is the one
-                        # containing the name of the function(s), see GH#32040
-                        # result.columns = result.columns.droplevel(-1)
+                        sobj = self._selected_obj
+
+                        if isinstance(sobj, Series):
+                            # GH#35246 test_groupby_as_index_select_column_sum_empty_df
+                            result.columns = self._obj_with_exclusions.columns.copy()
+                        else:
+                            # Retain our column names
+                            result.columns._set_names(
+                                sobj.columns.names,
+                                level=list(range(sobj.columns.nlevels)),
+                            )
+                            # select everything except for the last level, which is the
+                            # one containing the name of the function(s), see GH#32040
+                            if not get_option("new_udf_methods"):
+                                result.columns = result.columns.droplevel(-1)
 
         if not self.as_index:
             self._insert_inaxis_grouper_inplace(result)
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -1069,20 +1069,11 @@ def test_demo():
     tm.assert_frame_equal(result, expected)
 
     result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]})
-    if get_option("mode.new_udf_methods"):
-        expected = DataFrame(
-            {"min": [0.0, np.nan], "max": [4, 5], "sum": [np.nan, 25.0]},
-            columns=["max", "min", "sum"],
-            index=["A", "B"],
-        )
-    else:
-        expected = DataFrame(
-            {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
-            columns=["A", "B"],
-            index=["max", "min", "sum"],
-        )
-    print(result)
-    print(expected)
+    expected = DataFrame(
+        {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
+        columns=["A", "B"],
+        index=["max", "min", "sum"],
+    )
     tm.assert_frame_equal(result.reindex_like(expected), expected)
 
 
@@ -1196,8 +1187,6 @@ def test_agg_reduce(axis, float_frame):
         }
     )
     expected = expected.T if axis in {1, "columns"} else expected
-    if get_option("mode.new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
 
     # dict input with lists with multiple
@@ -1223,8 +1212,6 @@ def test_agg_reduce(axis, float_frame):
         axis=1,
     )
     expected = expected.T if axis in {1, "columns"} else expected
-    if get_option("mode.new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
 
 
@@ -1305,11 +1292,16 @@ def test_non_callable_aggregates(how):
             "C": {"count": 2, "size": 3},
         }
     )
-    if get_option("mode.new_udf_methods"):
-        expected = expected.T
+    # if get_option("mode.new_udf_methods"):
+    #     expected = expected.T
 
-    tm.assert_frame_equal(result1, result2, check_like=True)
-    tm.assert_frame_equal(result2, expected, check_like=True)
+    if get_option("new_udf_methods"):
+        tm.assert_frame_equal(result2, expected)
+        expected1 = expected.T
+        tm.assert_frame_equal(result1, expected1)
+    else:
+        tm.assert_frame_equal(result1, result2, check_like=True)
+        tm.assert_frame_equal(result2, expected, check_like=True)
 
     # Just functional string arg is same as calling df.arg()
     result = getattr(df, how)("count")
diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py
@@ -11,16 +11,12 @@ def test_agg_relabel():
     # simplest case with one column, one func
     result = df.agg(foo=("B", "sum"))
     expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"]))
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
 
     tm.assert_frame_equal(result, expected)
 
     # test on same column with different methods
     result = df.agg(foo=("B", "sum"), bar=("B", "min"))
     expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
 
     tm.assert_frame_equal(result, expected)
 
@@ -44,23 +40,16 @@ def test_agg_relabel_multi_columns_multi_methods():
         },
         index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]),
     )
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
 
 
 def test_agg_relabel_partial_functions():
     # GH 26513, test on partial, functools or more complex cases
     df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
     result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
-    if pd.get_option("new_udf_methods"):
-        expected = pd.DataFrame(
-            [[1.5, 1.5, 1]], index=["A"], columns=pd.Index(["foo", "bar", "cat"])
-        )
-    else:
-        expected = pd.DataFrame(
-            {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
-        )
+    expected = pd.DataFrame(
+        {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
+    )
     tm.assert_frame_equal(result, expected)
 
     result = df.agg(
@@ -79,8 +68,6 @@ def test_agg_relabel_partial_functions():
         },
         index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
     )
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
 
 
@@ -97,8 +84,6 @@ def test_agg_namedtuple():
     expected = pd.DataFrame(
         {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])
     )
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
 
     result = df.agg(
@@ -110,6 +95,4 @@ def test_agg_namedtuple():
         {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
         index=pd.Index(["foo", "bar", "cat"]),
     )
-    if pd.get_option("new_udf_methods"):
-        expected = expected.T
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
@@ -854,8 +854,6 @@ def test_apply_dictlike_reducer(string_series, ops, how):
                 {name: op(string_series) for name, op in ops.items()}, name="series"
             )
         result = getattr(string_series, how)(ops)
-        print(result)
-        print(expected)
         tm.assert_equal(result, expected)
     else:
         expected = Series({name: op(string_series) for name, op in ops.items()})
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -20,6 +20,7 @@
     MultiIndex,
     Series,
     concat,
+    get_option,
     to_datetime,
 )
 import pandas._testing as tm
@@ -845,11 +846,19 @@ def test_groupby_aggregate_empty_key(kwargs):
     # GH: 32580
     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
     result = df.groupby("a").agg(kwargs)
-    expected = DataFrame(
-        [1, 4],
-        index=Index([1, 2], dtype="int64", name="a"),
-        columns=MultiIndex.from_tuples([["c", "min"]]),
-    )
+    print(result)
+    if get_option("new_udf_methods"):
+        expected = DataFrame(
+            [1, 4],
+            index=Index([1, 2], dtype="int64", name="a"),
+            columns=MultiIndex.from_tuples([["min", "c"]]),
+        )
+    else:
+        expected = DataFrame(
+            [1, 4],
+            index=Index([1, 2], dtype="int64", name="a"),
+            columns=MultiIndex.from_tuples([["c", "min"]]),
+        )
     tm.assert_frame_equal(result, expected)
 
 
@@ -916,6 +925,9 @@ def test_multiindex_custom_func(func):
     }
     expected = DataFrame(expected_dict)
     expected.columns = df.columns
+    print(df)
+    print(result)
+    print(expected)
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1477,6 +1477,9 @@ def test_groupby_agg_categorical_columns(func, expected_values):
     expected = DataFrame(
         {"value": expected_values}, index=Index([0, 1, 2], name="groups")
     )
+    print(df)
+    print(result)
+    print(expected)
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py

Original file line number	Diff line number	Diff line change
`@@ -854,8 +854,6 @@ def test_apply_dictlike_reducer(string_series, ops, how):`
`854`	`854`	`{name: op(string_series) for name, op in ops.items()}, name="series"`
`855`	`855`	`)`
`856`	`856`	`result = getattr(string_series, how)(ops)`
`857`		`- print(result)`
`858`		`- print(expected)`
`859`	`857`	`tm.assert_equal(result, expected)`
`860`	`858`	`else:`
`861`	`859`	`expected = Series({name: op(string_series) for name, op in ops.items()})`
Original file line number	Diff line number	Diff line change
`@@ -1477,6 +1477,9 @@ def test_groupby_agg_categorical_columns(func, expected_values):`
`1477`	`1477`	`expected = DataFrame(`
`1478`	`1478`	`{"value": expected_values}, index=Index([0, 1, 2], name="groups")`
`1479`	`1479`	`)`
	`1480`	`+ print(df)`
	`1481`	`+ print(result)`
	`1482`	`+ print(expected)`
`1480`	`1483`	`tm.assert_frame_equal(result, expected)`
`1481`	`1484`
`1482`	`1485`