|
7 | 7 | """
|
8 | 8 | from collections import abc, namedtuple
|
9 | 9 | import copy
|
10 |
| -from functools import partial |
11 | 10 | from textwrap import dedent
|
12 | 11 | import typing
|
13 | 12 | from typing import (
|
|
41 | 40 | maybe_downcast_to_dtype,
|
42 | 41 | )
|
43 | 42 | from pandas.core.dtypes.common import (
|
44 |
| - ensure_int64, |
45 | 43 | ensure_platform_int,
|
46 | 44 | is_bool,
|
47 |
| - is_integer_dtype, |
48 |
| - is_interval_dtype, |
49 | 45 | is_numeric_dtype,
|
50 | 46 | is_object_dtype,
|
51 | 47 | is_scalar,
|
@@ -671,129 +667,14 @@ def describe(self, **kwargs):
|
def value_counts(
    self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
):
    """
    Run ``Series.value_counts`` through ``self.apply``.

    This method does no counting of its own: it hands
    ``Series.value_counts`` to ``self.apply`` and forwards every argument
    it received, unchanged, as a keyword argument.

    Parameters
    ----------
    normalize, sort, ascending, bins, dropna
        Forwarded as-is to ``Series.value_counts``.

    Returns
    -------
    object
        Whatever ``self.apply`` returns for ``Series.value_counts``.
    """
    # Collect the options once so the delegation below stays a single call.
    forwarded = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "bins": bins,
        "dropna": dropna,
    }
    return self.apply(Series.value_counts, **forwarded)
797 | 678 |
|
798 | 679 | def count(self) -> Series:
|
799 | 680 | """
|
|
0 commit comments