Skip to content

Commit a3d490f

Browse files
committed
ENH: Add value_counts() to DataFrame
Also abstracts bin generation out of cut() so that it can be reused elsewhere. Panel converts to ndarray on apply, so supporting Panel is a future TODO. Conflicts: pandas/core/frame.py pandas/core/series.py
1 parent 072e40b commit a3d490f

File tree

2 files changed

+106
-32
lines changed

2 files changed

+106
-32
lines changed

pandas/core/frame.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4297,6 +4297,55 @@ def mode(self, axis=0, numeric_only=False):
42974297
f = lambda s: s.mode()
42984298
return data.apply(f, axis=axis)
42994299

4300+
def value_counts(self, axis=0, normalize=False, sort=True,
                 ascending=False, bins=None, numeric_only=False):
    """
    Returns DataFrame containing counts of unique values. The resulting
    DataFrame will be in descending order so that the first element is the
    most frequently-occurring element among *all* columns. Excludes NA
    values. Maintains order along axis (i.e., column/row)

    Parameters
    ----------
    axis : {0, 1, 'index', 'columns'} (default 0)
        0/'index' : get value_counts by column
        1/'columns' : get value_counts by row
    normalize : boolean, default False
        If True then the DataFrame returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by sum of counts across columns (if False, DataFrame will be
        sorted by union of all the unique values found)
    ascending : boolean, default False
        Sort in ascending order
    bins : integer or sequence of scalars, optional
        Rather than count values, group them into half-open bins, a
        convenience for pd.cut, only works with numeric data. If integer,
        then creates bins based upon overall max and overall min of the
        numeric data. If passed, implies numeric_only.
    numeric_only : bool, default False
        only apply to numeric columns.

    Returns
    -------
    counts : DataFrame
    """
    # Binning only works on numeric data, so passing ``bins`` implies
    # ``numeric_only`` (as the parameter documentation promises).
    if numeric_only or bins is not None:
        data = self._get_numeric_data()
    else:
        data = self

    if bins is not None and not com._is_sequence(bins):
        # Imported lazily: only needed when an integer bin count is given.
        from pandas.tools.tile import _generate_bins
        # Derive the overall range from the data actually being counted,
        # not from ``self``, which may still hold non-numeric columns.
        overall_min = data.min().min()
        overall_max = data.max().max()
        bins = _generate_bins(bins=bins, min_val=overall_min,
                              max_val=overall_max)

    def count_one(s):
        return s.value_counts(normalize=normalize, bins=bins)

    res = data.apply(count_one, axis=axis)

    if sort:
        # Order rows by their total count across all columns.
        order = res.sum(1).order(ascending=ascending).index
        res = res.reindex(order)

    return res
4348+
43004349
def quantile(self, q=0.5, axis=0, numeric_only=True):
43014350
"""
43024351
Return values at the given quantile over requested axis, a la

pandas/tools/tile.py

Lines changed: 57 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,60 @@
1212

1313
import numpy as np
1414

15+
def _generate_bins(x=None, bins=None, min_val=None, max_val=None, right=True):
    """
    Generate bin edges for cut.

    Must either pass x (an array-like) or both a min and a max value. If
    min_val and max_val are passed, x is ignored when computing the range.

    When ``bins`` is an integer, equal-width edges are built over the range
    and the range is widened by 0.1% so the extreme values fall inside a bin.

    Parameters
    ----------
    x : array-like, optional
        Data whose (NaN-ignoring) min and max define the range.
    bins : int or sequence of scalars
        Number of equal-width bins, or the explicit bin edges.
    min_val, max_val : scalar, optional
        Explicit range; must be passed together.
    right : bool, default True
        If True the lowest edge is lowered by 0.1% of the range, otherwise
        the highest edge is raised by 0.1% of the range.

    Returns
    -------
    bins : ndarray
        The bin edges.

    Raises
    ------
    ValueError
        If ``bins`` is None, is a scalar smaller than 1, is a sequence that
        decreases anywhere, or if ``x`` is empty.
    AssertionError
        If only one of min_val/max_val is passed, or if neither x nor
        min_val/max_val is passed.
    """
    if bins is None:
        raise ValueError("bins cannot be None.")

    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O`` while keeping the exception type unchanged.
    if min_val is not None or max_val is not None:
        # ignore x if min and max are passed
        if min_val is None or max_val is None:
            raise AssertionError("Must pass *both* min_val and max_val")
    elif x is None:
        raise AssertionError("Must pass either min/max vals or array-like")

    if np.iterable(bins):
        # Explicit edges: only validate that they never decrease.
        bins = np.asarray(bins)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
        return bins

    # NOTE: this binning code is changed a bit from histogram for var(x) == 0
    if np.isscalar(bins) and bins < 1:
        raise ValueError("`bins` should be a positive integer.")

    if min_val is not None:
        mn, mx = min_val + 0.0, max_val + 0.0
    else:
        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size
        if sz == 0:
            # Can't determine a range from an empty array.
            raise ValueError('Cannot cut empty array')
        rng = (nanops.nanmin(x), nanops.nanmax(x))
        mn, mx = [mi + 0.0 for mi in rng]

    if mn == mx:  # adjust end points before binning
        # Widen by 0.1% of the magnitude. Use abs() so negative values are
        # widened outwards (edges stay increasing), and a fixed .001 when the
        # value is 0 so the edges do not all collapse onto the same point.
        mn -= .001 * abs(mn) if mn != 0 else .001
        mx += .001 * abs(mx) if mx != 0 else .001
        bins = np.linspace(mn, mx, bins + 1, endpoint=True)
    else:  # adjust end points after binning
        bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        adj = (mx - mn) * 0.001  # 0.1% of the range
        if right:
            bins[0] -= adj
        else:
            bins[-1] += adj

    return bins
68+
1569

1670
def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
1771
include_lowest=False):
@@ -75,39 +129,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
75129
>>> pd.cut(np.ones(5), 4, labels=False)
76130
array([1, 1, 1, 1, 1], dtype=int64)
77131
"""
78-
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
79-
if not np.iterable(bins):
80-
if np.isscalar(bins) and bins < 1:
81-
raise ValueError("`bins` should be a positive integer.")
82-
try: # for array-like
83-
sz = x.size
84-
except AttributeError:
85-
x = np.asarray(x)
86-
sz = x.size
87-
if sz == 0:
88-
raise ValueError('Cannot cut empty array')
89-
# handle empty arrays. Can't determine range, so use 0-1.
90-
# rng = (0, 1)
91-
else:
92-
rng = (nanops.nanmin(x), nanops.nanmax(x))
93-
mn, mx = [mi + 0.0 for mi in rng]
132+
if x is None:
133+
raise TypeError("Must pass array-like as first argument, not None")
94134

95-
if mn == mx: # adjust end points before binning
96-
mn -= .001 * mn
97-
mx += .001 * mx
98-
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
99-
else: # adjust end points after binning
100-
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
101-
adj = (mx - mn) * 0.001 # 0.1% of the range
102-
if right:
103-
bins[0] -= adj
104-
else:
105-
bins[-1] += adj
106-
107-
else:
108-
bins = np.asarray(bins)
109-
if (np.diff(bins) < 0).any():
110-
raise ValueError('bins must increase monotonically.')
135+
bins = _generate_bins(x, bins, right=right)
111136

112137
return _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision,
113138
include_lowest=include_lowest)

0 commit comments

Comments
 (0)