ENH: Accept string for bins in pd.cut

blalterman · gfyoung · commit 79b71455e685 · 2018-11-14T21:20:27.000-08:00
Passing a string to the `pd.cut` bins kwarg dispatches bin calculation to `np.histogram_bin_edges`. Closes gh-14627.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -233,6 +233,7 @@ Other Enhancements
 - :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
 - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
 - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
+- :func:`cut` now accepts a string for the ``bins`` keyword, in which case the bin calculation is dispatched to :func:`numpy.histogram_bin_edges` (:issue:`14627`)
 - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
 - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
 - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -13,6 +13,8 @@
     is_scalar, is_timedelta64_dtype)
 from pandas.core.dtypes.missing import isna
 
+from pandas.compat import string_types
+
 from pandas import (
     Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp,
     to_datetime, to_timedelta)
@@ -35,12 +37,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     ----------
     x : array-like
         The input array to be binned. Must be 1-dimensional.
-    bins : int, sequence of scalars, or pandas.IntervalIndex
+    bins : int, str, sequence of scalars, or pandas.IntervalIndex
         The criteria to bin by.
 
         * int : Defines the number of equal-width bins in the range of `x`. The
           range of `x` is extended by .1% on each side to include the minimum
           and maximum values of `x`.
+        * str : Bin calculation dispatched to `np.histogram_bin_edges`. See
+          that documentation for details. (versionadded:: 0.24.0)
         * sequence of scalars : Defines the bin edges allowing for non-uniform
           width. No extension of the range of `x` is done.
         * IntervalIndex : Defines the exact bins to be used.
@@ -83,11 +87,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
         * False : returns an ndarray of integers.
 
-    bins : numpy.ndarray or IntervalIndex.
+    bins : numpy.ndarray or IntervalIndex
         The computed or specified bins. Only returned when `retbins=True`.
-        For scalar or sequence `bins`, this is an ndarray with the computed
-        bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
-        an IntervalIndex `bins`, this is equal to `bins`.
+        For scalar, str, or sequence `bins`, this is an ndarray with the
+        computed bins. If `duplicates='drop'` is set, non-unique bin edges
+        are dropped. For an IntervalIndex `bins`, this is equal to `bins`.
 
     See Also
     --------
@@ -98,6 +102,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     Series : One-dimensional array with axis labels (including time series).
     pandas.IntervalIndex : Immutable Index implementing an ordered,
         sliceable set.
+    numpy.histogram_bin_edges : Bin calculation dispatched to this method when
+        `bins` is a string.
 
     Notes
     -----
@@ -181,14 +187,38 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
     [NaN, (0, 1], NaN, (2, 3], (4, 5]]
     Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
+
+    Passing a string for `bins` dispatches the bin calculation to
+    :func:`numpy.histogram_bin_edges` (new in version 0.24.0).
+
+    >>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]), bins="auto")
+    ... # doctest: +ELLIPSIS
+    [(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55],
+    (0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
+    Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] <
+                                        (0.55, 0.775] < (0.775, 1.0]]
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
     # for handling the cut for datetime and timedelta objects
     x_is_series, series_index, name, x = _preprocess_for_cut(x)
     x, dtype = _coerce_to_type(x)
 
-    if not np.iterable(bins):
+    if isinstance(bins, string_types):
+        # GH 14627
+        bins = np.histogram_bin_edges(x, bins)
+        mn, mx = bins[0], bins[-1]
+        adj = (mx - mn)
+        if adj:
+            adj *= 0.001  # 0.1% of the range
+        else:
+            adj = 0.001
+        if right:
+            bins[0] -= adj
+        else:
+            bins[-1] += adj
+
+    elif not np.iterable(bins):
         if is_scalar(bins) and bins < 1:
             raise ValueError("`bins` should be a positive integer.")
 
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -16,7 +16,6 @@
 from pandas.core.algorithms import quantile
 import pandas.core.reshape.tile as tmod
 
-
 class TestCut(object):
 
     def test_simple(self):
@@ -37,6 +36,33 @@ def test_bins(self):
         tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
                                                6.53333333, 9.7]))
 
+    def test_str_bins(self):
+        # GH 14627
+        data = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0])
+        result, bins_cut = cut(data, bins="auto",
+                               retbins=True)
+
+        bins_np = np.histogram_bin_edges(data, "auto")
+        adj = (bins_np[-1] - bins_np[0]) * 0.001
+        bins_np[0] -= adj
+        tm.assert_almost_equal(bins_cut, bins_np)
+        tm.assert_almost_equal(np.round(bins_cut, 4),
+                               np.array([0.0991, 0.325, 0.55, 0.775, 1.0]))
+
+        intervals = IntervalIndex.from_breaks(np.round(bins_np, 4),
+                                              closed="right")
+        expected = Categorical(intervals, ordered=True)
+        tm.assert_index_equal(result.categories,
+                              expected.categories)
+
+        # A string that `np.histogram_bin_edges` does not recognise must
+        # surface numpy's ValueError unchanged. (`message=` is not passed to
+        # `pytest.raises`: it never checked the error text and was removed
+        # in pytest 5.)
+        msg = r"'bad bins' is not a valid estimator for `bins`"
+        with pytest.raises(ValueError, match=msg):
+            cut(data, bins="bad bins")
+
     def test_right(self):
         data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
         result, bins = cut(data, 4, right=True, retbins=True)