Skip to content

Commit 79b7145

Browse files
blalterman authored and gfyoung committed
ENH: Accept string for bins in pd.cut
Passing a string to the `pd.cut` bins kwarg dispatches bin calculation to `np.histogram_bin_edges`. Closes gh-14627.
1 parent b4b945a commit 79b7145

File tree

3 files changed

+64
-7
lines changed

3 files changed

+64
-7
lines changed

doc/source/whatsnew/v0.24.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ Other Enhancements
233233
- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
234234
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
235235
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
236+
- :func:`cut` ``bins`` kwarg now accepts a string, which is dispatched to :func:`numpy.histogram_bin_edges` (:issue:`14627`)
236237
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
237238
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
238239
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,

pandas/core/reshape/tile.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
is_scalar, is_timedelta64_dtype)
1414
from pandas.core.dtypes.missing import isna
1515

16+
from pandas.compat import string_types
17+
1618
from pandas import (
1719
Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp,
1820
to_datetime, to_timedelta)
@@ -35,12 +37,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
3537
----------
3638
x : array-like
3739
The input array to be binned. Must be 1-dimensional.
38-
bins : int, sequence of scalars, or pandas.IntervalIndex
40+
bins : int, str, sequence of scalars, or pandas.IntervalIndex
3941
The criteria to bin by.
4042
4143
* int : Defines the number of equal-width bins in the range of `x`. The
4244
range of `x` is extended by .1% on each side to include the minimum
4345
and maximum values of `x`.
46+
* str : Bin calculation dispatched to `np.histogram_bin_edges`. See that
47+
documentation for details. (versionadded:: 0.24.0)
4448
* sequence of scalars : Defines the bin edges allowing for non-uniform
4549
width. No extension of the range of `x` is done.
4650
* IntervalIndex : Defines the exact bins to be used.
@@ -83,11 +87,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
8387
8488
* False : returns an ndarray of integers.
8589
86-
bins : numpy.ndarray or IntervalIndex.
90+
bins : numpy.ndarray or IntervalIndex
8791
The computed or specified bins. Only returned when `retbins=True`.
88-
For scalar or sequence `bins`, this is an ndarray with the computed
89-
bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
90-
an IntervalIndex `bins`, this is equal to `bins`.
92+
For scalar, str, or sequence `bins`, this is an ndarray with the
93+
computed bins. If `duplicates='drop'` is set, non-unique bin edges are
94+
dropped. For an IntervalIndex `bins`, this is equal to `bins`.
9195
9296
See Also
9397
--------
@@ -98,6 +102,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
98102
Series : One-dimensional array with axis labels (including time series).
99103
pandas.IntervalIndex : Immutable Index implementing an ordered,
100104
sliceable set.
105+
numpy.histogram_bin_edges : Bin calculation dispatched to this method when
106+
`bins` is a string.
101107
102108
Notes
103109
-----
@@ -181,14 +187,38 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
181187
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
182188
[NaN, (0, 1], NaN, (2, 3], (4, 5]]
183189
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
190+
191+
Passing a string for `bins` dispatches the bin calculation to numpy's
192+
`histogram_bin_edges`. (Starting in version 0.24.)
193+
>>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]),
194+
... bins="auto")
195+
... # doctest: +ELLIPSIS
196+
[(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55],
197+
(0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
198+
Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] <
199+
(0.55, 0.775] < (0.775, 1.0]]
184200
"""
185201
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
186202

187203
# for handling the cut for datetime and timedelta objects
188204
x_is_series, series_index, name, x = _preprocess_for_cut(x)
189205
x, dtype = _coerce_to_type(x)
190206

191-
if not np.iterable(bins):
207+
if isinstance(bins, string_types):
208+
# GH 14627
209+
bins = np.histogram_bin_edges(x, bins)
210+
mn, mx = bins[0], bins[-1]
211+
adj = (mx - mn)
212+
if adj:
213+
adj *= 0.001 # 0.1% of the range
214+
else:
215+
adj = 0.001
216+
if right:
217+
bins[0] -= adj
218+
else:
219+
bins[-1] += adj
220+
221+
elif not np.iterable(bins):
192222
if is_scalar(bins) and bins < 1:
193223
raise ValueError("`bins` should be a positive integer.")
194224

pandas/tests/reshape/test_tile.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from pandas.core.algorithms import quantile
1717
import pandas.core.reshape.tile as tmod
1818

19-
2019
class TestCut(object):
2120

2221
def test_simple(self):
@@ -37,6 +36,33 @@ def test_bins(self):
3736
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
3837
6.53333333, 9.7]))
3938

39+
def test_str_bins(self):
40+
# GH 14627
41+
data = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0])
42+
result, bins_cut = cut(data, bins="auto",
43+
retbins=True)
44+
45+
bins_np = np.histogram_bin_edges(data, "auto")
46+
adj = (bins_np[-1] - bins_np[0]) * 0.001
47+
bins_np[0] -= adj
48+
tm.assert_almost_equal(bins_cut, bins_np)
49+
tm.assert_almost_equal(np.round(bins_cut, 4),
50+
np.array([0.0991, 0.325, 0.55, 0.775, 1.0]))
51+
52+
intervals = IntervalIndex.from_breaks(np.round(bins_np, 4),
53+
closed="right")
54+
expected = Categorical(intervals, ordered=True)
55+
tm.assert_index_equal(result.categories,
56+
expected.categories)
57+
58+
59+
# Test that a `bin` string not present in `np.histogram_bin_edges`
60+
# throws a ValueError.
61+
with pytest.raises(ValueError,
62+
match="'*' is not a valid estimator for `bins`",
63+
message="Verify acceptable bins in `np.histogram_bin_edges`."):
64+
cut(data, bins="bad bins")
65+
4066
def test_right(self):
4167
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
4268
result, bins = cut(data, 4, right=True, retbins=True)

0 commit comments

Comments
 (0)