pandas-dev · chaarvii · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -40,6 +40,7 @@ Other enhancements
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`)
+- :func:`cut` now supports a string for the ``bins`` kwarg by dispatching to ``numpy.histogram_bin_edges`` (:issue:`59165`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -75,7 +75,7 @@ def cut(
     ----------
     x : array-like
         The input array to be binned. Must be 1-dimensional.
-    bins : int, sequence of scalars, or IntervalIndex
+    bins : int, str, sequence of scalars or IntervalIndex
         The criteria to bin by.
 
         * int : Defines the number of equal-width bins in the range of `x`. The
@@ -85,6 +85,14 @@ def cut(
           width. No extension of the range of `x` is done.
         * IntervalIndex : Defines the exact bins to be used. Note that
           IntervalIndex for `bins` must be non-overlapping.
+        * str : If `bins` is one of the accepted strings, the bin edges are
+          computed by np.histogram_bin_edges, which uses the chosen method
+          to calculate the optimal bin width and, consequently, the number
+          of bins from the data in `x`.
+          Supported strings = ["auto", "fd", "doane", "scott",
+                               "stone", "rice", "sturges", "sqrt"]
+          Any other string raises a ValueError.
+          Please check np.histogram_bin_edges documentation for more details.
 
     right : bool, default True
         Indicates whether `bins` includes the rightmost edge or not. If
@@ -130,7 +138,7 @@ def cut(
 
     bins : numpy.ndarray or IntervalIndex.
         The computed or specified bins. Only returned when `retbins=True`.
-        For scalar or sequence `bins`, this is an ndarray with the computed
+        For scalar, str or sequence `bins`, this is an ndarray with the computed
         bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
         an IntervalIndex `bins`, this is equal to `bins`.
 
@@ -142,6 +150,8 @@ def cut(
         fixed set of values.
     Series : One-dimensional array with axis labels (including time series).
     IntervalIndex : Immutable Index implementing an ordered, sliceable set.
+    numpy.histogram_bin_edges : Function used to compute the bin edges when
+        `bins` is a string.
 
     Notes
     -----
@@ -239,6 +249,12 @@ def cut(
     >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
     [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
     Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
+
+    Passing a str for `bins` dispatches the bin calculation to np.histogram_bin_edges.
+
+    >>> pd.cut(np.array([1, 7, 5, 4]), "auto")
+    [NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0]]
+    Categories (3, interval[float64, right]): [(1.0, 3.0] < (3.0, 5.0] < (5.0, 7.0]]
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
@@ -253,6 +269,11 @@ def cut(
         if bins.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
 
+    elif isinstance(bins, str):
+        # GH 59165
+        # np.histogram_bin_edges raises ValueError if the string is not supported
+        bins = np.histogram_bin_edges(x, bins)
+
     else:
         bins = Index(bins)
         if not bins.is_monotonic_increasing:

diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -114,6 +114,37 @@ def test_bins_not_monotonic():
         cut(data, [0.1, 1.5, 1, 10])
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
+        np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]),
+        np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]),
+        range(5),
+    ],
+)
+@pytest.mark.parametrize(
+    "bin_str", ["auto", "fd", "doane", "scott", "stone", "rice", "sturges", "sqrt"]
+)
+def test_bins_from_string(data, bin_str):
+    # GH 59165: cut(data, bin_str) must be equivalent to
+    # cut(data, bins=np.histogram_bin_edges(data, bins=bin_str))
+    expected = cut(data, bins=np.histogram_bin_edges(data, bins=bin_str))
+    result = cut(data, bin_str)
+    tm.assert_categorical_equal(result, expected, check_dtype=False)
+
+
+def test_bins_from_invalid_string():
+    # GH 59165: cut(data, bin_str) with an unsupported string
+    # raises a ValueError
+    bin_str = "INVALID_STR"
+    msg = f"{bin_str!r} is not a valid estimator for `bins`"
+    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+
+    with pytest.raises(ValueError, match=msg):
+        cut(data, bin_str)
+
+
 @pytest.mark.parametrize(
     "x, bins, expected",
     [