diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 639655ab28199..92568ad728c4d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -40,6 +40,7 @@ Other enhancements - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) +- :func:`cut` now supports a string for ``bins`` kwarg by dispatching to ``numpy.histogram_bin_edges``. (:issue:`59165`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 0052bcfe09147..85d86c083260e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -75,7 +75,7 @@ def cut( ---------- x : array-like The input array to be binned. Must be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex + bins : int, str, sequence of scalars or IntervalIndex The criteria to bin by. * int : Defines the number of equal-width bins in the range of `x`. The @@ -85,6 +85,14 @@ def cut( width. No extension of the range of `x` is done. * IntervalIndex : Defines the exact bins to be used. Note that IntervalIndex for `bins` must be non-overlapping. + * str : If bins is a string from a list of accepted strings, bin + calculation is dispatched to np.histogram_bin_edges. 
That function then + uses the chosen method to calculate the optimal bin width and + consequently the number of bins from the data that falls within the + requested range. + Supported strings = ["auto", "fd", "doane", "scott", + "stone", "rice", "sturges", "sqrt"] + Please check np.histogram_bin_edges documentation for more details. right : bool, default True Indicates whether `bins` includes the rightmost edge or not. If @@ -130,7 +138,7 @@ def cut( bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. - For scalar or sequence `bins`, this is an ndarray with the computed + For scalar, str or sequence `bins`, this is an ndarray with the computed bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For an IntervalIndex `bins`, this is equal to `bins`. @@ -142,6 +150,8 @@ def cut( fixed set of values. Series : One-dimensional array with axis labels (including time series). IntervalIndex : Immutable Index implementing an ordered, sliceable set. + np.histogram_bin_edges : Bin calculation dispatched to this method when + `bins` is a string. 
Notes ----- @@ -239,6 +249,12 @@ def cut( >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] + + Passing a str for `bins` dispatches the bin calculation to np.histogram_bin_edges. + + >>> pd.cut(np.array([1, 7, 5, 4]), "auto") + [NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0]] + Categories (3, interval[float64, right]): [(1.0, 3.0] < (3.0, 5.0] < (5.0, 7.0]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -253,6 +269,11 @@ def cut( if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") + elif isinstance(bins, str): + # GH 59165 + # Raises ValueError if string is not supported + bins = np.histogram_bin_edges(x, bins) + else: bins = Index(bins) if not bins.is_monotonic_increasing: diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index d8bb4fba1e1fe..cfbc4a49e9cf6 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -114,6 +114,37 @@ def test_bins_not_monotonic(): cut(data, [0.1, 1.5, 1, 10]) +@pytest.mark.parametrize( + "data", + [ + [0.2, 1.4, 2.5, 6.2, 9.7, 2.1], + np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]), + np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]), + range(5), + ], +) +@pytest.mark.parametrize( + "bin_str", ["auto", "fd", "doane", "scott", "stone", "rice", "sturges", "sqrt"] +) +def test_bins_from_string(data, bin_str): + # we make sure calling cut(df, str) is equivalent + # to calling cut(df, bins=np.histogram_bin_edges(df,str)) + expected = cut(data, bins=np.histogram_bin_edges(data, bins=bin_str)) + result = cut(data, bin_str) + tm.assert_categorical_equal(result, expected, check_dtype=False) + + +def test_bins_from_invalid_string(): + # we make sure calling cut(df, str) with invalid string + # throws an error + bin_str = "INVALID_STR" + msg = f"{bin_str!r} is not a valid estimator for `bins`" + data = [0.2, 1.4, 2.5, 6.2, 
9.7, 2.1] + + with pytest.raises(ValueError, match=msg): + cut(data, bin_str) + + @pytest.mark.parametrize( "x, bins, expected", [