ENH: numpy histogram bin edges in cut (GH 14627)

blalterman · blalterman · commit 4be1ffae17cd · 2018-11-08T09:36:50.000-05:00
Passing a string to `pd.cut` bins kwarg dispatches bin calculation
    to `np.histogram_bin_edges`.
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -35,12 +35,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     ----------
     x : array-like
         The input array to be binned. Must be 1-dimensional.
-    bins : int, sequence of scalars, or pandas.IntervalIndex
+    bins : int, str, sequence of scalars, or pandas.IntervalIndex
         The criteria to bin by.
 
         * int : Defines the number of equal-width bins in the range of `x`. The
           range of `x` is extended by .1% on each side to include the minimum
           and maximum values of `x`.
+        * str : Bin calculation is dispatched to `np.histogram_bin_edges`.
+          See that function's documentation for details.
         * sequence of scalars : Defines the bin edges allowing for non-uniform
           width. No extension of the range of `x` is done.
         * IntervalIndex : Defines the exact bins to be used.
@@ -83,7 +85,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
         * False : returns an ndarray of integers.
 
-    bins : numpy.ndarray or IntervalIndex.
+    bins : numpy.ndarray or IntervalIndex
         The computed or specified bins. Only returned when `retbins=True`.
         For scalar or sequence `bins`, this is an ndarray with the computed
         bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
@@ -181,14 +183,37 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
     [NaN, (0, 1], NaN, (2, 3], (4, 5]]
     Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
+
+    Passing a string for `bins` dispatches the bin calculation to numpy's
+    `histogram_bin_edges`.
+    >>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]),
+    ...        bins="auto")
+    ... # doctest: +ELLIPSIS
+    [(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55], 
+    (0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
+    Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] < 
+                                        (0.55, 0.775] < (0.775, 1.0]]
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
     # for handling the cut for datetime and timedelta objects
     x_is_series, series_index, name, x = _preprocess_for_cut(x)
     x, dtype = _coerce_to_type(x)
 
-    if not np.iterable(bins):
+    if isinstance(bins, str):
+        bins = np.histogram_bin_edges(x, bins)
+        mn, mx = bins[0], bins[-1]
+        adj = (mx - mn)
+        if adj:
+            adj *= 0.001  # 0.1% of the range
+        else:
+            adj  = 0.001
+        if right:
+            bins[0] -= adj
+        else:
+            bins[-1] += adj
+
+    elif not np.iterable(bins):
         if is_scalar(bins) and bins < 1:
             raise ValueError("`bins` should be a positive integer.")
 
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -16,6 +16,7 @@
 from pandas.core.algorithms import quantile
 import pandas.core.reshape.tile as tmod
 
+import pdb
 
 class TestCut(object):
 
@@ -37,6 +38,24 @@ def test_bins(self):
         tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
                                                6.53333333, 9.7]))
 
+    def test_str_bins(self):  # GH 14627: str bins dispatch to numpy
+        data = np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0])
+        result, bins_cut = cut(data, bins="auto", 
+                                     retbins=True)
+        
+        bins_np = np.histogram_bin_edges(data, "auto")  # numpy's own edges
+        adj = (bins_np[-1] - bins_np[0]) * 0.001  # 0.1% of the range
+        bins_np[0] -= adj  # right=True here, so cut widens the lowest edge
+        tm.assert_almost_equal(bins_cut, bins_np)
+        tm.assert_almost_equal(np.round(bins_cut, 4), 
+                               np.array([0.0991, 0.325, 0.55, 0.775, 1.0]))
+        
+        intervals = IntervalIndex.from_breaks(np.round(bins_np, 4), 
+                                              closed="right")
+        expected  = Categorical(intervals, ordered=True)
+        tm.assert_index_equal(result.categories, 
+                              expected.categories)
+
     def test_right(self):
         data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
         result, bins = cut(data, 4, right=True, retbins=True)