Skip to content

ENH: numpy histogram bin edges in cut (GH 14627) #23567

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

31 changes: 28 additions & 3 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
----------
x : array-like
The input array to be binned. Must be 1-dimensional.
bins : int, sequence of scalars, or pandas.IntervalIndex
bins : int, str, sequence of scalars, or pandas.IntervalIndex
The criteria to bin by.

* int : Defines the number of equal-width bins in the range of `x`. The
range of `x` is extended by .1% on each side to include the minimum
and maximum values of `x`.
* str : Bin calculation is dispatched to `np.histogram_bin_edges`. See
that function's documentation for details.
* sequence of scalars : Defines the bin edges allowing for non-uniform
width. No extension of the range of `x` is done.
* IntervalIndex : Defines the exact bins to be used.
Expand Down Expand Up @@ -83,7 +85,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,

* False : returns an ndarray of integers.

bins : numpy.ndarray or IntervalIndex.
bins : numpy.ndarray or IntervalIndex
The computed or specified bins. Only returned when `retbins=True`.
For scalar or sequence `bins`, this is an ndarray with the computed
bins. If `duplicates='drop'` is set, non-unique bin edges are dropped. For
Expand Down Expand Up @@ -181,14 +183,37 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
[NaN, (0, 1], NaN, (2, 3], (4, 5]]
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]

Passing a string for `bins` dispatches the bin calculation to numpy's
`histogram_bin_edges`.

>>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leave a blank line before this line

... bins="auto")  # doctest: +ELLIPSIS
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should go in the previous line, after the code

[(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55],
(0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] <
(0.55, 0.775] < (0.775, 1.0]]
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0

# for handling the cut for datetime and timedelta objects
x_is_series, series_index, name, x = _preprocess_for_cut(x)
x, dtype = _coerce_to_type(x)

if not np.iterable(bins):
if isinstance(bins, str):
bins = np.histogram_bin_edges(x, bins)
mn, mx = bins[0], bins[-1]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this equivalent to doing pd.cut(np.histogram_bin_edges(array, bins))? Why do we do the additional processing / adjustment starting here?

adj = (mx - mn)
if adj:
adj *= 0.001 # 0.1% of the range
else:
adj = 0.001
if right:
bins[0] -= adj
else:
bins[-1] += adj

elif not np.iterable(bins):
if is_scalar(bins) and bins < 1:
raise ValueError("`bins` should be a positive integer.")

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/reshape/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas.core.algorithms import quantile
import pandas.core.reshape.tile as tmod

import pdb

class TestCut(object):

Expand All @@ -37,6 +38,24 @@ def test_bins(self):
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
6.53333333, 9.7]))

def test_str_bins(self):
    """Check `cut` with a string `bins` argument against numpy's edges.

    The reference edges come from `np.histogram_bin_edges(..., "auto")`
    with the lowest edge moved down by 0.1% of the total range; the
    edges returned by `cut` and the resulting categories must match.
    """
    values = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0])
    binned, edges_cut = cut(values, bins="auto", retbins=True)

    # Build the reference edges and lower the first one by 0.1% of
    # the range spanned by the edges.
    edges_ref = np.histogram_bin_edges(values, "auto")
    edges_ref[0] -= (edges_ref[-1] - edges_ref[0]) * 0.001

    # Edges reported by `cut` agree with the adjusted numpy edges,
    # both exactly (to tolerance) and after rounding to 4 decimals.
    tm.assert_almost_equal(edges_cut, edges_ref)
    rounded_expected = np.array([0.0991, 0.325, 0.55, 0.775, 1.0])
    tm.assert_almost_equal(np.round(edges_cut, 4), rounded_expected)

    # Categories are right-closed intervals over the rounded edges.
    breaks = np.round(edges_ref, 4)
    expected = Categorical(
        IntervalIndex.from_breaks(breaks, closed="right"),
        ordered=True,
    )
    tm.assert_index_equal(binned.categories, expected.categories)


def test_right(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, retbins=True)
Expand Down