|
52 | 52 |
|
53 | 53 | _apply_whitelist = frozenset(['last', 'first',
|
54 | 54 | 'mean', 'sum', 'min', 'max',
|
55 |
| - 'head', 'tail', |
56 | 55 | 'cumsum', 'cumprod', 'cummin', 'cummax',
|
57 | 56 | 'resample',
|
58 | 57 | 'describe',
|
@@ -482,8 +481,9 @@ def picker(arr):
|
482 | 481 | return np.nan
|
483 | 482 | return self.agg(picker)
|
484 | 483 |
|
485 |
| - def cumcount(self): |
486 |
| - """Number each item in each group from 0 to the length of that group. |
| 484 | + def cumcount(self, **kwargs): |
| 485 | + ''' |
| 486 | + Number each item in each group from 0 to the length of that group - 1. |
487 | 487 |
|
488 | 488 | Essentially this is equivalent to
|
489 | 489 |
|
@@ -511,13 +511,101 @@ def cumcount(self):
|
511 | 511 | 5 3
|
512 | 512 | dtype: int64
|
513 | 513 |
|
514 |
| - """ |
| 514 | + ''' |
| 515 | + ascending = kwargs.pop('ascending', True) |
| 516 | + |
515 | 517 | index = self.obj.index
|
516 |
| - cumcounts = np.zeros(len(index), dtype='int64') |
517 |
| - for v in self.indices.values(): |
518 |
| - cumcounts[v] = np.arange(len(v), dtype='int64') |
| 518 | + rng = np.arange(self.grouper._max_groupsize, dtype='int64') |
| 519 | + cumcounts = self._cumcount_array(rng, ascending=ascending) |
519 | 520 | return Series(cumcounts, index)
|
520 | 521 |
|
| 522 | + def head(self, n=5): |
| 523 | + ''' |
| 524 | + Returns first n rows of each group. |
| 525 | +
|
| 526 | + Essentially equivalent to .apply(lambda x: x.head(n)) |
| 527 | +
|
| 528 | + Example |
| 529 | + ------- |
| 530 | +
|
| 531 | + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], |
| 532 | + columns=['A', 'B']) |
| 533 | + >>> df.groupby('A', as_index=False).head(1) |
| 534 | + A B |
| 535 | + 0 1 2 |
| 536 | + 2 5 6 |
| 537 | + >>> df.groupby('A').head(1) |
| 538 | + A B |
| 539 | + A |
| 540 | + 1 0 1 2 |
| 541 | + 5 2 5 6 |
| 542 | +
|
| 543 | + ''' |
| 544 | + rng = np.arange(self.grouper._max_groupsize, dtype='int64') |
| 545 | + in_head = self._cumcount_array(rng) < n |
| 546 | + head = self.obj[in_head] |
| 547 | + if self.as_index: |
| 548 | + head.index = self._index_with_as_index(in_head) |
| 549 | + return head |
| 550 | + |
| 551 | + def tail(self, n=5): |
| 552 | + ''' |
| 553 | + Returns last n rows of each group. |
| 554 | +
|
| 555 | + Essentially equivalent to .apply(lambda x: x.tail(n)) |
| 556 | +
|
| 557 | + Example |
| 558 | + ------- |
| 559 | +
|
| 560 | + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], |
| 561 | + columns=['A', 'B']) |
| 562 | + >>> df.groupby('A', as_index=False).tail(1) |
| 563 | + A B |
| 564 | + 1 1 4 |
| 565 | + 2 5 6 |
| 566 | + >>> df.groupby('A').tail(1) |
| 567 | + A B |
| 568 | + A |
| 569 | + 1 1 1 4 |
| 570 | + 5 2 5 6 |
| 571 | + ''' |
| 572 | + rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') |
| 573 | + in_tail = self._cumcount_array(rng, ascending=False) > -n |
| 574 | + tail = self.obj[in_tail] |
| 575 | + if self.as_index: |
| 576 | + tail.index = self._index_with_as_index(in_tail) |
| 577 | + return tail |
| 578 | + |
| 579 | + def _cumcount_array(self, arr, **kwargs): |
| 580 | + ascending = kwargs.pop('ascending', True) |
| 581 | + |
| 582 | + len_index = len(self.obj.index) |
| 583 | + cumcounts = np.zeros(len_index, dtype='int64') |
| 584 | + if ascending: |
| 585 | + for v in self.indices.values(): |
| 586 | + cumcounts[v] = arr[:len(v)] |
| 587 | + else: |
| 588 | + for v in self.indices.values(): |
| 589 | + cumcounts[v] = arr[len(v)-1::-1] |
| 590 | + return cumcounts |
| 591 | + |
| 592 | + def _index_with_as_index(self, b): |
| 593 | + ''' |
| 594 | + Take a boolean mask over the original index and return the MultiIndex (group keys followed by the original index levels) to be returned from apply, if as_index=True |
| 595 | +
|
| 596 | + ''' |
| 597 | + # TODO perf, it feels like this should already be somewhere... |
| 598 | + from itertools import chain |
| 599 | + original = self.obj.index |
| 600 | + gp = self.grouper |
| 601 | + levels = chain((gp.levels[i][gp.labels[i][b]] |
| 602 | + for i in range(len(gp.groupings))), |
| 603 | + (original.get_level_values(i)[b] |
| 604 | + for i in range(original.nlevels))) |
| 605 | + new = MultiIndex.from_arrays(list(levels)) |
| 606 | + new.names = gp.names + original.names |
| 607 | + return new |
| 608 | + |
521 | 609 | def _try_cast(self, result, obj):
|
522 | 610 | """
|
523 | 611 | try to cast the result to our obj original type,
|
@@ -758,14 +846,28 @@ def names(self):
|
758 | 846 | def size(self):
|
759 | 847 | """
|
760 | 848 | Compute group sizes
|
| 849 | +
|
761 | 850 | """
|
762 | 851 | # TODO: better impl
|
763 | 852 | labels, _, ngroups = self.group_info
|
764 |
| - bin_counts = Series(labels).value_counts() |
| 853 | + bin_counts = algos.value_counts(labels, sort=False) |
765 | 854 | bin_counts = bin_counts.reindex(np.arange(ngroups))
|
766 | 855 | bin_counts.index = self.result_index
|
767 | 856 | return bin_counts
|
768 | 857 |
|
| 858 | + @cache_readonly |
| 859 | + def _max_groupsize(self): |
| 860 | + ''' |
| 861 | + Compute size of largest group |
| 862 | +
|
| 863 | + ''' |
| 864 | + # For many items in each group this is much faster than |
| 865 | + # self.size().max(), in worst case marginally slower |
| 866 | + if self.indices: |
| 867 | + return max(len(v) for v in self.indices.values()) |
| 868 | + else: |
| 869 | + return 0 |
| 870 | + |
769 | 871 | @cache_readonly
|
770 | 872 | def groups(self):
|
771 | 873 | if len(self.groupings) == 1:
|
|
0 commit comments