From 65abb6bf72e61b1ece857a1687f134d1f56d56bb Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 14 May 2013 07:47:21 -0400 Subject: [PATCH] BUG: Add reduce_if_possible keyword to ``groupby`` to allow reduction from DataFrame -> Series if groups are unique. Regression from 0.10.1, partial revert on (GH2893_) with (GH3596_) CLN: renamed reduce_if_possible -> squeeze DOC: added v0.11.1 example --- RELEASE.rst | 4 ++++ doc/source/v0.11.1.txt | 22 ++++++++++++++++++++++ pandas/core/generic.py | 8 ++++++-- pandas/core/groupby.py | 26 +++++++++++++++++--------- pandas/tests/test_groupby.py | 11 ++++++++--- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 006da5f8e76af..1f5bd2591470b 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -65,6 +65,9 @@ pandas 0.11.1 ``timedelta64[ns]`` to ``object/int`` (GH3425_) - Do not allow datetimelike/timedeltalike creation except with valid types (e.g. cannot pass ``datetime64[ms]``) (GH3423_) + - Add ``squeeze`` keyword to ``groupby`` to allow reduction from + DataFrame -> Series if groups are unique. Regression from 0.10.1, + partial revert on (GH2893_) with (GH3596_) **Bug Fixes** @@ -161,6 +164,7 @@ pandas 0.11.1 .. _GH3594: https://github.com/pydata/pandas/issues/3594 .. _GH3590: https://github.com/pydata/pandas/issues/3590 .. _GH3610: https://github.com/pydata/pandas/issues/3610 +.. _GH3596: https://github.com/pydata/pandas/issues/3596 .. _GH3435: https://github.com/pydata/pandas/issues/3435 diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 3719d9eb09dee..c89118298a675 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -21,6 +21,26 @@ API changes p / p p / 0 + - Add ``squeeze`` keyword to ``groupby`` to allow reduction from + DataFrame -> Series if groups are unique. This is a Regression from 0.10.1. + We are reverting back to the prior behavior. This means groupby will return the + same shaped objects whether the groups are unique or not. 
This is a partial revert on (GH2893_) + with (GH3596_). + + .. ipython:: python + + df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, + {"val1":1, "val2": 27}, {"val1":1, "val2": 12}]) + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + # squeezing the result frame to a series (because we have unique groups) + df2.groupby("val1", squeeze=True).apply(func) + + # no squeezing (the default, and behavior in 0.10.1) + df2.groupby("val1").apply(func) + + Enhancements ~~~~~~~~~~~~ - ``pd.read_html()`` can now parse HTML string, files or urls and return dataframes @@ -44,5 +64,7 @@ on GitHub for a complete list. .. _GH3477: https://github.com/pydata/pandas/issues/3477 .. _GH3492: https://github.com/pydata/pandas/issues/3492 .. _GH3499: https://github.com/pydata/pandas/issues/3499 +.. _GH2893: https://github.com/pydata/pandas/issues/2893 +.. _GH3596: https://github.com/pydata/pandas/issues/3596 .. _GH3590: https://github.com/pydata/pandas/issues/3590 .. _GH3435: https://github.com/pydata/pandas/issues/3435 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ed90aab715cfd..4a80e2f65fd71 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -107,7 +107,7 @@ def get(self, key, default=None): return default def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True): + group_keys=True, squeeze=False): """ Group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns @@ -131,6 +131,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Sort group keys. 
Get better performance by turning this off group_keys : boolean, default True When calling apply, add group keys to index to identify pieces + squeeze : boolean, default False + reduce the dimensionality of the return type if possible, otherwise + return a consistent type Examples -------- @@ -150,7 +153,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, from pandas.core.groupby import groupby axis = self._get_axis_number(axis) return groupby(self, by, axis=axis, level=level, as_index=as_index, - sort=sort, group_keys=group_keys) + sort=sort, group_keys=group_keys, + squeeze=squeeze) def asfreq(self, freq, method=None, how=None, normalize=False): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 093c61ba5af5c..122355581956d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -169,7 +169,7 @@ class GroupBy(object): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True): + sort=True, group_keys=True, squeeze=False): self._selection = selection if isinstance(obj, NDFrame): @@ -189,6 +189,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.keys = keys self.sort = sort self.group_keys = group_keys + self.squeeze = squeeze if grouper is None: grouper, exclusions = _get_grouper(obj, keys, axis=axis, @@ -1841,15 +1842,22 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): all_indexed_same = _all_indexes_same([x.index for x in values]) singular_series = len(values) == 1 and applied_index.nlevels == 1 - # assign the name to this series - if singular_series: - values[0].name = keys[0] + # GH3596 + # provide a reduction (Frame -> Series) if groups are unique + if self.squeeze: - # GH2893 - # we have series in the values array, we want to produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a single values - if 
singular_series or not all_indexed_same: + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a single value + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + + if not all_indexed_same: return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c1c4217cb6f62..c56fca49cce48 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -263,14 +263,14 @@ def test_groupby_nonobject_dtype(self): def test_groupby_return_type(self): - # GH2893 + # GH2893, return a reduced type df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, {"val1":2, "val2": 27}, {"val1":2, "val2": 12}]) def func(dataf): return dataf["val2"] - dataf["val2"].mean() - result = df1.groupby("val1").apply(func) + result = df1.groupby("val1", squeeze=True).apply(func) self.assert_(isinstance(result,Series)) df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, @@ -278,9 +278,14 @@ def func(dataf): def func(dataf): return dataf["val2"] - dataf["val2"].mean() - result = df2.groupby("val1").apply(func) + result = df2.groupby("val1", squeeze=True).apply(func) self.assert_(isinstance(result,Series)) + # GH3596, return a consistent type (regression in 0.11 from 0.10.1) + df = DataFrame([[1,1],[1,1]],columns=['X','Y']) + result = df.groupby('X',squeeze=False).count() + self.assert_(isinstance(result,DataFrame)) + def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean)