Skip to content

Commit 4be1ffa

Browse files
committed
ENH: numpy histogram bin edges in cut (GH 14627)
Passing a string to the `bins` kwarg of `pd.cut` dispatches bin calculation to `np.histogram_bin_edges`.
1 parent 6a5c34c commit 4be1ffa

File tree

2 files changed

+47
-3
lines changed

2 files changed

+47
-3
lines changed

pandas/core/reshape/tile.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
3535
----------
3636
x : array-like
3737
The input array to be binned. Must be 1-dimensional.
38-
bins : int, sequence of scalars, or pandas.IntervalIndex
38+
bins : int, str, sequence of scalars, or pandas.IntervalIndex
3939
The criteria to bin by.
4040
4141
* int : Defines the number of equal-width bins in the range of `x`. The
4242
range of `x` is extended by .1% on each side to include the minimum
4343
and maximum values of `x`.
44+
* str : Bin calculation dispatched to `np.histogram_bin_edges`. See that
45+
documentation for details.
4446
* sequence of scalars : Defines the bin edges allowing for non-uniform
4547
width. No extension of the range of `x` is done.
4648
* IntervalIndex : Defines the exact bins to be used.
@@ -83,7 +85,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
8385
8486
* False : returns an ndarray of integers.
8587
86-
bins : numpy.ndarray or IntervalIndex.
88+
bins : numpy.ndarray or IntervalIndex
8789
The computed or specified bins. Only returned when `retbins=True`.
8890
For scalar or sequence `bins`, this is an ndarray with the computed
8991
bins. If `duplicates='drop'` is set, non-unique bin edges are dropped. For
@@ -181,14 +183,37 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
181183
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
182184
[NaN, (0, 1], NaN, (2, 3], (4, 5]]
183185
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
186+
187+
Passing a string for `bins` dispatches the bin calculation to numpy's
188+
`histogram_bin_edges`.
189+
>>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]),
190+
... bins="auto")
191+
... # doctest: +ELLIPSIS
192+
[(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55],
193+
(0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
194+
Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] <
195+
(0.55, 0.775] < (0.775, 1.0]]
184196
"""
185197
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
186198

187199
# for handling the cut for datetime and timedelta objects
188200
x_is_series, series_index, name, x = _preprocess_for_cut(x)
189201
x, dtype = _coerce_to_type(x)
190202

191-
if not np.iterable(bins):
203+
if isinstance(bins, str):
204+
bins = np.histogram_bin_edges(x, bins)
205+
mn, mx = bins[0], bins[-1]
206+
adj = (mx - mn)
207+
if adj:
208+
adj *= 0.001 # 0.1% of the range
209+
else:
210+
adj = 0.001
211+
if right:
212+
bins[0] -= adj
213+
else:
214+
bins[-1] += adj
215+
216+
elif not np.iterable(bins):
192217
if is_scalar(bins) and bins < 1:
193218
raise ValueError("`bins` should be a positive integer.")
194219

pandas/tests/reshape/test_tile.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas.core.algorithms import quantile
1717
import pandas.core.reshape.tile as tmod
1818

19+
import pdb
1920

2021
class TestCut(object):
2122

@@ -37,6 +38,24 @@ def test_bins(self):
3738
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
3839
6.53333333, 9.7]))
3940

41+
def test_str_bins(self):
42+
data = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0])
43+
result, bins_cut = cut(data, bins="auto",
44+
retbins=True)
45+
46+
bins_np = np.histogram_bin_edges(data, "auto")
47+
adj = (bins_np[-1] - bins_np[0]) * 0.001
48+
bins_np[0] -= adj
49+
tm.assert_almost_equal(bins_cut, bins_np)
50+
tm.assert_almost_equal(np.round(bins_cut, 4),
51+
np.array([0.0991, 0.325, 0.55, 0.775, 1.0]))
52+
53+
intervals = IntervalIndex.from_breaks(np.round(bins_np, 4),
54+
closed="right")
55+
expected = Categorical(intervals, ordered=True)
56+
tm.assert_index_equal(result.categories,
57+
expected.categories)
58+
4059
def test_right(self):
4160
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
4261
result, bins = cut(data, 4, right=True, retbins=True)

0 commit comments

Comments
 (0)