
Commit 5f8eb1d

updated value_count docstrings
1 parent 9c1c269 commit 5f8eb1d


3 files changed: +38 additions, -139 deletions


pandas/core/algorithms.py

Lines changed: 9 additions & 5 deletions
@@ -663,12 +663,16 @@ def value_counts(
     ascending : bool, default False
         Sort in ascending order
     normalize: bool, default False
-        If True then compute a relative histogram
-    bins : integer, optional
-        Rather than count values, group them into half-open bins,
-        convenience for pd.cut, only works with numeric data
+        If True, then compute a relative histogram that outputs the
+        proportion of each value.
+    bins : integer or iterable of numeric, optional
+        Rather than count values, group them into half-open bins.
+        Only works with numeric data.
+        If int, interpreted as number of bins and will use pd.cut.
+        If iterable of numeric, will use provided numbers as bin endpoints.
     dropna : bool, default True
-        Don't include counts of NaN
+        Don't include counts of NaN.
+        If False and NaNs are present, NaN will be a key in the output.
 
     Returns
     -------
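For reference, a minimal usage sketch of the behaviour this docstring now describes (the example series below is an assumption for illustration, not part of this commit):

    import pandas as pd

    s = pd.Series([3, 1, 2, 3, 4])

    # normalize=True returns the proportion of each value instead of raw counts
    s.value_counts(normalize=True)

    # an int `bins` splits the data range into that many half-open intervals via pd.cut
    s.value_counts(bins=3)

    # an iterable of numerics is used directly as the interval endpoints
    s.value_counts(bins=[0, 2, 4, 9])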

pandas/core/base.py

Lines changed: 21 additions & 7 deletions
@@ -1176,17 +1176,20 @@ def value_counts(
         Parameters
         ----------
         normalize : bool, default False
-            If True then the object returned will contain the relative
-            frequencies of the unique values.
+            If True, outputs the relative frequencies of the unique values.
         sort : bool, default True
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
-        bins : int, optional
-            Rather than count values, group them into half-open bins,
-            a convenience for ``pd.cut``, only works with numeric data.
+        bins : integer or iterable of numeric, optional
+            Rather than count individual values, group them into half-open bins.
+            Only works with numeric data.
+            If int, interpreted as number of bins and will use ``pd.cut``.
+            If iterable of numeric, will use provided numbers as bin endpoints.
+
         dropna : bool, default True
             Don't include counts of NaN.
+            If False and NaNs are present, NaN will be a key in the output.
 
         Returns
         -------
@@ -1223,15 +1226,26 @@ def value_counts(
 
         Bins can be useful for going from a continuous variable to a
         categorical variable; instead of counting unique
-        apparitions of values, divide the index in the specified
-        number of half-open bins.
+        instances of values, count the number of values that fall
+        into half-open intervals.
+
+        Bins can be an int.
 
         >>> s.value_counts(bins=3)
         (2.0, 3.0]      2
         (0.996, 2.0]    2
         (3.0, 4.0]      1
         dtype: int64
 
+        Bins can also be an iterable of numbers. These numbers are treated
+        as endpoints for the intervals.
+
+        >>> s.value_counts(bins=[0,2,4,9])
+        (2.0, 4.0]       3
+        (-0.001, 2.0]    2
+        (4.0, 9.0]       0
+        dtype: int64
+
         **dropna**
 
         With `dropna` set to `False` we can also see NaN index values.
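A brief sketch of the `dropna` note added above (the series here is assumed for illustration and is not one of the commit's doctests):

    import numpy as np
    import pandas as pd

    s = pd.Series([3, 1, 2, 3, 4, np.nan])

    # with dropna=False, NaN appears as its own key in the resulting index
    s.value_counts(dropna=False)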

pandas/core/groupby/generic.py

Lines changed: 8 additions & 127 deletions
@@ -7,7 +7,6 @@
 """
 from collections import abc, namedtuple
 import copy
-from functools import partial
 from textwrap import dedent
 import typing
 from typing import (
@@ -41,11 +40,8 @@
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
-    ensure_int64,
     ensure_platform_int,
     is_bool,
-    is_integer_dtype,
-    is_interval_dtype,
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
@@ -671,129 +667,14 @@ def describe(self, **kwargs):
     def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
-
-        from pandas.core.reshape.tile import cut
-        from pandas.core.reshape.merge import _get_join_indexers
-
-        if bins is not None:  # and not np.iterable(bins):
-            # scalar bins cannot be done at top level
-            # in a backward compatible way
-            return self.apply(
-                Series.value_counts,
-                normalize=normalize,
-                sort=sort,
-                ascending=ascending,
-                bins=bins,
-                dropna=dropna
-            )
-
-        ids, _, _ = self.grouper.group_info
-        val = self.obj._values
-
-        # groupby removes null keys from groupings
-        mask = ids != -1
-        ids, val = ids[mask], val[mask]
-
-        if bins is None:
-            lab, lev = algorithms.factorize(val, sort=True)
-            llab = lambda lab, inc: lab[inc]
-        else:
-
-            # lab is a Categorical with categories an IntervalIndex
-            lab = cut(Series(val), bins, include_lowest=True)
-            lev = lab.cat.categories
-            lab = lev.take(lab.cat.codes)
-            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
-
-        if is_interval_dtype(lab):
-            # TODO: should we do this inside II?
-            sorter = np.lexsort((lab.left, lab.right, ids))
-        else:
-            sorter = np.lexsort((lab, ids))
-
-        ids, lab = ids[sorter], lab[sorter]
-
-        # group boundaries are where group ids change
-        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
-
-        # new values are where sorted labels change
-        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
-        inc = np.r_[True, lchanges]
-        inc[idx] = True  # group boundaries are also new values
-        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
-
-        # num. of times each group should be repeated
-        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
-
-        # multi-index components
-        codes = self.grouper.reconstructed_codes
-        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
-        names = self.grouper.names + [self._selection_name]
-
-        if dropna:
-            mask = codes[-1] != -1
-            if mask.all():
-                dropna = False
-            else:
-                out, codes = out[mask], [level_codes[mask] for level_codes in codes]
-
-        if normalize:
-            out = out.astype("float")
-            d = np.diff(np.r_[idx, len(ids)])
-            if dropna:
-                m = ids[lab == -1]
-                np.add.at(d, m, -1)
-                acc = rep(d)[mask]
-            else:
-                acc = rep(d)
-            out /= acc
-
-        if sort and bins is None:
-            cat = ids[inc][mask] if dropna else ids[inc]
-            sorter = np.lexsort((out if ascending else -out, cat))
-            out, codes[-1] = out[sorter], codes[-1][sorter]
-
-        if bins is None:
-            mi = MultiIndex(
-                levels=levels, codes=codes, names=names, verify_integrity=False
-            )
-
-            if is_integer_dtype(out):
-                out = ensure_int64(out)
-            return Series(out, index=mi, name=self._selection_name)
-
-        # for compat. with libgroupby.value_counts need to ensure every
-        # bin is present at every index level, null filled with zeros
-        diff = np.zeros(len(out), dtype="bool")
-        for level_codes in codes[:-1]:
-            diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
-
-        ncat, nbin = diff.sum(), len(levels[-1])
-
-        left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
-
-        right = [diff.cumsum() - 1, codes[-1]]
-
-        _, idx = _get_join_indexers(left, right, sort=False, how="left")
-        out = np.where(idx != -1, out[idx], 0)
-
-        if sort:
-            sorter = np.lexsort((out if ascending else -out, left[0]))
-            out, left[-1] = out[sorter], left[-1][sorter]
-
-        # build the multi-index w/ full levels
-        def build_codes(lev_codes: np.ndarray) -> np.ndarray:
-            return np.repeat(lev_codes[diff], nbin)
-
-        codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
-        codes.append(left[-1])
-
-        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
-
-        if is_integer_dtype(out):
-            out = ensure_int64(out)
-        return Series(out, index=mi, name=self._selection_name)
+        return self.apply(
+            Series.value_counts,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            bins=bins,
+            dropna=dropna,
+        )
 
     def count(self) -> Series:
         """
