diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 44c467795d1ed..86ee1cd226b13 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -233,6 +233,7 @@ Other Enhancements - :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`) - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) +- :func:`~cut` `bins` kwarg now accepts a string, which is dispatched to `numpy.histogram_bin_edges`. (:issue:`14627`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 4a863372eea13..bcea4360af4ea 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -13,6 +13,8 @@ is_scalar, is_timedelta64_dtype) from pandas.core.dtypes.missing import isna +from pandas.compat import string_types + from pandas import ( Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp, to_datetime, to_timedelta) @@ -35,7 +37,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, ---------- x : array-like The input array to be binned. Must be 1-dimensional. 
- bins : int, sequence of scalars, or pandas.IntervalIndex + bins : int, str, sequence of scalars, or pandas.IntervalIndex The criteria to bin by. * int : Defines the number of equal-width bins in the range of `x`. The @@ -44,6 +46,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of `x` is done. * IntervalIndex : Defines the exact bins to be used. + * str : Bin calculation dispatched to :func:`np.histogram_bin_edges`. See that + documentation for details. + + .. versionadded:: 0.24.0 right : bool, default True Indicates whether `bins` includes the rightmost edge or not. If @@ -83,11 +89,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, * False : returns an ndarray of integers. - bins : numpy.ndarray or IntervalIndex. + bins : numpy.ndarray or IntervalIndex The computed or specified bins. Only returned when `retbins=True`. - For scalar or sequence `bins`, this is an ndarray with the computed - bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For - an IntervalIndex `bins`, this is equal to `bins`. + For scalar, str, or sequence `bins`, this is an ndarray with the + computed bins. If set `duplicates=drop`, `bins` will drop non-unique + bin. For an IntervalIndex `bins`, this is equal to `bins`. See Also -------- @@ -98,6 +104,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Series : One-dimensional array with axis labels (including time series). pandas.IntervalIndex : Immutable Index implementing an ordered, sliceable set. + numpy.histogram_bin_edges : Bin calculation dispatched to this method when + `bins` is a string. 
Notes ----- @@ -181,6 +189,16 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0, 1], NaN, (2, 3], (4, 5]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + + Passing a string for `bins` dispatches the bin calculation to numpy's + `histogram_bin_edges`. (Starting in version 0.24.) + + >>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]), + ... bins="auto") # doctest: +ELLIPSIS + [(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55], + (0.325, 0.55], (0.775, 1.0], (0.775, 1.0]] + Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] < + (0.55, 0.775] < (0.775, 1.0]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -188,7 +206,28 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, x_is_series, series_index, name, x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) - if not np.iterable(bins): + if isinstance(bins, string_types): + # GH 14627 + # NOTE: when the minimum numpy requirement is + # increased to 1.15, the try-except statement + # can be removed. 
+ try: + bins = np.histogram_bin_edges(x, bins) + except AttributeError: + cnt, bins = np.histogram(x, bins) + + mn, mx = bins[0], bins[-1] + adj = (mx - mn) + if adj: + adj *= 0.001 # 0.1% of the range + else: + adj = 0.001 + if right: + bins[0] -= adj + else: + bins[-1] += adj + + elif not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index f04e9a55a6c8d..1c520b6b20c2d 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -16,7 +16,6 @@ from pandas.core.algorithms import quantile import pandas.core.reshape.tile as tmod - class TestCut(object): def test_simple(self): @@ -37,6 +36,32 @@ def test_bins(self): tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) + def test_str_bins(self): + # GH 14627 + data = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]) + result, bins_cut = cut(data, bins="auto", + retbins=True) + + bins_np = np.histogram_bin_edges(data, "auto") + adj = (bins_np[-1] - bins_np[0]) * 0.001 + bins_np[0] -= adj + tm.assert_almost_equal(bins_cut, bins_np) + tm.assert_almost_equal(np.round(bins_cut, 4), + np.array([0.0991, 0.325, 0.55, 0.775, 1.0])) + + intervals = IntervalIndex.from_breaks(np.round(bins_np, 4), + closed="right") + expected = Categorical(intervals, ordered=True) + tm.assert_index_equal(result.categories, + expected.categories) + + + # Test that a `bin` string not present in `np.histogram_bin_edges` + # throws a ValueError. + with pytest.raises(ValueError, + message="Verify acceptable bins in `np.histogram_bin_edges`."): + cut(data, bins="bad bins") + def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True)