Skip to content

Commit e8e7735

Browse files
committed
PERF faster head, tail and size groupby methods
1 parent d250d64 commit e8e7735

File tree

2 files changed

+145
-14
lines changed

2 files changed

+145
-14
lines changed

pandas/core/groupby.py

Lines changed: 110 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252

5353
_apply_whitelist = frozenset(['last', 'first',
5454
'mean', 'sum', 'min', 'max',
55-
'head', 'tail',
5655
'cumsum', 'cumprod', 'cummin', 'cummax',
5756
'resample',
5857
'describe',
@@ -482,8 +481,9 @@ def picker(arr):
482481
return np.nan
483482
return self.agg(picker)
484483

485-
def cumcount(self):
486-
"""Number each item in each group from 0 to the length of that group.
484+
def cumcount(self, **kwargs):
485+
'''
486+
Number each item in each group from 0 to the length of that group.
487487
488488
Essentially this is equivalent to
489489
@@ -511,13 +511,101 @@ def cumcount(self):
511511
5 3
512512
dtype: int64
513513
514-
"""
514+
'''
515+
ascending = kwargs.pop('ascending', True)
516+
515517
index = self.obj.index
516-
cumcounts = np.zeros(len(index), dtype='int64')
517-
for v in self.indices.values():
518-
cumcounts[v] = np.arange(len(v), dtype='int64')
518+
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
519+
cumcounts = self._cumcount_array(rng, ascending=ascending)
519520
return Series(cumcounts, index)
520521

522+
def head(self, n=5):
    """
    Return the first n rows of each group.

    Essentially equivalent to ``.apply(lambda x: x.head(n))``. Rows are
    picked out of the grouped object with a boolean mask rather than by
    applying a function to every group.

    Parameters
    ----------
    n : int, default 5
        Maximum number of rows to keep from the start of each group.

    Returns
    -------
    The rows of the grouped object whose within-group position is
    less than ``n``. When ``as_index`` is true the result's index is
    replaced by the one built by ``_index_with_as_index``.

    Example
    -------

    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                        columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).head(1)
       A  B
    0  1  2
    2  5  6
    >>> df.groupby('A').head(1)
         A  B
    A
    1 0  1  2
    5 2  5  6

    """
    # 0, 1, 2, ... long enough for the largest group; each group receives
    # a prefix of it, i.e. the position of every row inside its group.
    rng = np.arange(self.grouper._max_groupsize, dtype='int64')
    in_head = self._cumcount_array(rng) < n
    head = self.obj[in_head]
    if self.as_index:
        head.index = self._index_with_as_index(in_head)
    return head
550+
551+
def tail(self, n=5):
    """
    Return the last n rows of each group.

    Essentially equivalent to ``.apply(lambda x: x.tail(n))``. Rows are
    picked out of the grouped object with a boolean mask rather than by
    applying a function to every group.

    Parameters
    ----------
    n : int, default 5
        Maximum number of rows to keep from the end of each group.

    Returns
    -------
    The rows of the grouped object that are among the last ``n`` of
    their group. When ``as_index`` is true the result's index is
    replaced by the one built by ``_index_with_as_index``.

    Example
    -------

    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                        columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).tail(1)
       A  B
    1  1  4
    2  5  6
    >>> df.groupby('A').tail(1)
         A  B
    A
    1 1  1  4
    5 2  5  6

    """
    # 0, -1, -2, ... long enough for the largest group. Requesting the
    # descending layout reverses each group's prefix, so the last row of
    # a group gets 0, the one before it -1, and so on.
    rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
    # Keeps the values 0, -1, ..., -(n - 1): the last n rows of each group.
    in_tail = self._cumcount_array(rng, ascending=False) > -n
    tail = self.obj[in_tail]
    if self.as_index:
        tail.index = self._index_with_as_index(in_tail)
    return tail
578+
579+
def _cumcount_array(self, arr, **kwargs):
    """
    Lay a prefix of ``arr`` over the row positions of every group.

    Parameters
    ----------
    arr : array
        Values to hand out; a group of k rows receives ``arr[:k]``.
        Callers in this file pass an array as long as the largest group.
    ascending : bool, default True (keyword only, read from ``kwargs``)
        If true the first row of a group receives ``arr[0]``; if false
        the prefix is reversed so the last row receives ``arr[0]``.

    Returns
    -------
    int64 ndarray with one entry per row of ``self.obj.index``.
    """
    ascending = kwargs.pop('ascending', True)

    cumcounts = np.zeros(len(self.obj.index), dtype='int64')
    for positions in self.indices.values():
        prefix = arr[:len(positions)]
        # Reverse the already-cut prefix instead of slicing with
        # ``arr[len(positions)-1::-1]``: for an empty group that slice
        # starts at -1 and yields the whole array reversed, which cannot
        # be assigned to zero positions.
        cumcounts[positions] = prefix if ascending else prefix[::-1]
    return cumcounts
591+
592+
def _index_with_as_index(self, b):
    """
    Build the index for a filtered result when ``as_index=True``.

    Parameters
    ----------
    b : boolean mask
        Selects, by position in the grouped object, the rows that are
        being returned.

    Returns
    -------
    MultiIndex whose leading levels hold the group key of every selected
    row (one level per grouping) and whose trailing levels are the
    selected entries of the original index. Names are the grouper's
    names followed by the original index names.
    """
    # TODO perf, it feels like this should already be somewhere...
    original = self.obj.index
    gp = self.grouper
    # labels[i] are looked up in levels[i] to recover the key values
    # for the selected rows.
    group_keys = [gp.levels[i][gp.labels[i][b]]
                  for i in range(len(gp.groupings))]
    original_levels = [original.get_level_values(i)[b]
                       for i in range(original.nlevels)]
    new = MultiIndex.from_arrays(group_keys + original_levels)
    new.names = gp.names + original.names
    return new
608+
521609
def _try_cast(self, result, obj):
522610
"""
523611
try to cast the result to our obj original type,
@@ -758,14 +846,28 @@ def names(self):
758846
def size(self):
759847
"""
760848
Compute group sizes
849+
761850
"""
762851
# TODO: better impl
763852
labels, _, ngroups = self.group_info
764-
bin_counts = Series(labels).value_counts()
853+
bin_counts = algos.value_counts(labels, sort=False)
765854
bin_counts = bin_counts.reindex(np.arange(ngroups))
766855
bin_counts.index = self.result_index
767856
return bin_counts
768857

858+
@cache_readonly
def _max_groupsize(self):
    """
    Compute size of largest group

    Returns 0 when there are no groups.
    """
    # For many items in each group this is much faster than
    # self.size().max(), in worst case marginally slower
    indices = self.indices
    if not indices:
        return 0
    return max(len(v) for v in indices.values())
870+
769871
@cache_readonly
770872
def groups(self):
771873
if len(self.groupings) == 1:

pandas/tests/test_groupby.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,24 +1203,53 @@ def test_groupby_as_index_apply(self):
12031203
g_not_as = df.groupby('user_id', as_index=False)
12041204

12051205
res_as = g_as.head(2).index
1206-
exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
1206+
exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
12071207
assert_index_equal(res_as, exp_as)
12081208

12091209
res_not_as = g_not_as.head(2).index
1210-
exp_not_as = Index([0, 2, 1, 4])
1210+
exp_not_as = Index([0, 1, 2, 4])
12111211
assert_index_equal(res_not_as, exp_not_as)
12121212

1213-
res_as = g_as.apply(lambda x: x.head(2)).index
1214-
assert_index_equal(res_not_as, exp_not_as)
1213+
res_as_apply = g_as.apply(lambda x: x.head(2)).index
1214+
res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
12151215

1216-
res_not_as = g_not_as.apply(lambda x: x.head(2)).index
1217-
assert_index_equal(res_not_as, exp_not_as)
1216+
# apply doesn't maintain the original ordering
1217+
exp_not_as_apply = Index([0, 2, 1, 4])
1218+
exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
1219+
1220+
assert_index_equal(res_as_apply, exp_as_apply)
1221+
assert_index_equal(res_not_as_apply, exp_not_as_apply)
12181222

12191223
ind = Index(list('abcde'))
12201224
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
12211225
res = df.groupby(0, as_index=False).apply(lambda x: x).index
12221226
assert_index_equal(res, ind)
12231227

1228+
def test_groupby_head_tail(self):
    """Check groupby head/tail against hand-picked rows for both as_index settings."""
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    g_as = df.groupby('A', as_index=True)
    g_not_as = df.groupby('A', as_index=False)

    # as_index= False much easier
    exp_head_not_as = df.loc[[0, 2]]
    res_head_not_as = g_not_as.head(1)
    assert_frame_equal(exp_head_not_as, res_head_not_as)
    exp_tail_not_as = df.loc[[1, 2]]
    res_tail_not_as = g_not_as.tail(1)
    assert_frame_equal(exp_tail_not_as, res_tail_not_as)

    # as_index=True, yuck
    res_head_as = g_as.head(1)
    res_tail_as = g_as.tail(1)

    # prepend the A column as an index, in a roundabout way
    df.index = df.set_index('A', append=True, drop=False).index.swaplevel(0, 1)
    # The frame now carries an (A, original) MultiIndex, so 0/1/2 are no
    # longer first-level labels; select the expected rows by position.
    exp_head_as = df.iloc[[0, 2]]
    exp_tail_as = df.iloc[[1, 2]]

    assert_frame_equal(exp_head_as, res_head_as)
    assert_frame_equal(exp_tail_as, res_tail_as)
1252+
12241253
def test_groupby_multiple_key(self):
12251254
df = tm.makeTimeDataFrame()
12261255
grouped = df.groupby([lambda x: x.year,

0 commit comments

Comments
 (0)