diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 7b769eeccbe68..e666bad2317df 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -705,3 +705,16 @@ can be used as group keys. If so, the order of the levels will be preserved: factor = qcut(data, [0, .25, .5, .75, 1.]) data.groupby(factor).mean() + +Enumerate group items +~~~~~~~~~~~~~~~~~~~~~ + +To see the order in which each row appears within its group, use the +``cumcount`` method: + +.. ipython:: python + + df = pd.DataFrame(list('aaabba'), columns=['A']) + df + + df.groupby('A').cumcount() \ No newline at end of file diff --git a/doc/source/release.rst b/doc/source/release.rst index 59ff48887269e..36bc02da3e68d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -64,6 +64,7 @@ New features - ``to_csv()`` now outputs datetime objects according to a specified format string via the ``date_format`` keyword (:issue:`4313`) - Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`) + - Added ``cumcount`` groupby method (:issue:`4646`) - Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`) - Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the statistical mode(s) of a column/series. (:issue:`5367`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 668c665613c0d..f37b94cd7f689 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -468,6 +468,7 @@ def ohlc(self): Compute sum of values, excluding missing values For multiple groupings, the result index will be a MultiIndex + """ return self._cython_agg_general('ohlc') @@ -480,9 +481,49 @@ def picker(arr): return np.nan return self.agg(picker) + def cumcount(self): + ''' + Number each item in each group from 0 to the length of that group - 1. + + Essentially this is equivalent to + + >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
+ + Examples + -------- + + >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], columns=['A']) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').cumcount() + 0 0 + 1 1 + 2 2 + 3 0 + 4 1 + 5 3 + dtype: int64 + + ''' + index = self.obj.index + cumcounts = np.zeros(len(index), dtype='int64') + for v in self.indices.values(): + cumcounts[v] = np.arange(len(v), dtype='int64') + return Series(cumcounts, index) + + def _try_cast(self, result, obj): - """ try to cast the result to our obj original type, - we may have roundtripped thru object in the mean-time """ + """ + try to cast the result to our obj original type, + we may have roundtripped thru object in the mean-time + + """ if obj.ndim > 1: dtype = obj.values.dtype else: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ca74f46122d88..9df5541615cee 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2560,6 +2560,57 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby() + se = Series().groupby() + + e = Series(dtype='int') # edge case, as this is usually considered float + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 
0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_filter_series(self): import pandas as pd s = pd.Series([1, 3, 20, 5, 22, 24, 7]) @@ -3180,7 +3231,7 @@ def test_tab_completion(self): 'min','name','ngroups','nth','ohlc','plot', 'prod', 'size','std','sum','transform','var', 'count', 'head', 'describe', 'cummax', 'dtype', 'quantile', 'rank', 'cumprod', 'tail', - 'resample', 'cummin', 'fillna', 'cumsum']) + 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount']) self.assertEqual(results, expected) def assert_fp_equal(a, b):