From d54b26c2d7605b5d3cb2808cf23b390ad6632f32 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Aug 2023 19:51:52 -0700 Subject: [PATCH 1/3] REF: cast x and bins to Index early in cut, qcut --- pandas/core/reshape/tile.py | 93 +++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 43eea7c669ce7..dfaceec261b7f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -243,43 +243,18 @@ def cut( # NOTE: this binning code is changed a bit from histogram for var(x) == 0 original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) if not np.iterable(bins): - if is_scalar(bins) and bins < 1: - raise ValueError("`bins` should be a positive integer.") - - sz = x.size - - if sz == 0: - raise ValueError("Cannot cut empty array") - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = (mi + 0.0 for mi in rng) - - if np.isinf(mn) or np.isinf(mx): - # GH 24314 - raise ValueError( - "cannot specify integer `bins` when input data contains infinity" - ) - if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - else: # adjust end points after binning - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 # 0.1% of the range - if right: - bins[0] -= adj - else: - bins[-1] += adj + bins = _nbins_to_bins(x_idx, bins, right) elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: + bins = Index(bins) if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): bins = np.asarray(bins, dtype=DT64NS_DTYPE) else: @@ -289,9 +264,10 @@ def cut( # GH 26045: cast to float64 to avoid an overflow if (np.diff(bins.astype("float64")) < 0).any(): raise ValueError("bins must increase monotonically.") + bins = Index(bins) fac, bins = _bins_to_cuts( - x, + x_idx, bins, right=right, labels=labels, @@ -367,18 +343,18 @@ def qcut( array([0, 0, 1, 2, 3]) """ original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x) + x_np = np.asarray(x_idx) x_np = x_np[~np.isnan(x_np)] bins = np.quantile(x_np, quantiles) fac, bins = _bins_to_cuts( - x, - bins, + x_idx, + Index(bins), labels=labels, precision=precision, include_lowest=True, @@ -389,9 +365,44 @@ def qcut( return _postprocess_for_cut(fac, bins, retbins, dtype, original) +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (nanops.nanmin(x_idx), nanops.nanmax(x_idx)) + mn, mx = rng + + if np.isinf(mn) or np.isinf(mx): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) + + def _bins_to_cuts( - x, - bins: np.ndarray, + x: Index, + bins: Index, right: bool = True, labels=None, precision: int = 3, @@ -474,7 +485,7 @@ def _bins_to_cuts( return result, bins -def _coerce_to_type(x): +def _coerce_to_type(x: Index): """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -597,7 +608,7 @@ def _format_labels( return IntervalIndex.from_breaks(breaks, closed=closed) -def _preprocess_for_cut(x): +def _preprocess_for_cut(x) -> Index: """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it @@ -611,7 +622,7 @@ def _preprocess_for_cut(x): if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x + return Index(x) def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): @@ -627,6 +638,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi return fac bins = _convert_bin_to_datelike_type(bins, dtype) + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values return fac, bins From 55e6c986301d3dd2df3fb50496fed9a146fc5bdb Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Aug 2023 21:02:06 -0700 Subject: [PATCH 2/3] mypy fixup --- pandas/core/reshape/tile.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index dfaceec261b7f..1aade1c149bea 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -419,6 +419,8 @@ def _bins_to_cuts( "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) + result: Categorical | np.ndarray + if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) @@ -485,7 +487,7 @@ def _bins_to_cuts( return result, bins -def _coerce_to_type(x: Index): +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -509,11 +511,13 @@ def _coerce_to_type(x: Index): # https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): - x = x.to_numpy(dtype=np.float64, na_value=np.nan) + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) if dtype is not None: # GH 19768: force NaT to NaN during integer conversion - x = np.where(x.notna(), x.view(np.int64), np.nan) + x_arr = np.where(x.notna(), x.view(np.int64), np.nan) + x = Index(x_arr) return x, dtype @@ -575,7 +579,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): def _format_labels( - bins, + bins: Index, precision: int, right: bool = True, include_lowest: bool = False, @@ -659,7 +663,7 @@ def _round_frac(x, precision: int): return np.around(x, digits) -def _infer_precision(base_precision: int, bins) -> int: +def _infer_precision(base_precision: int, bins: Index) -> int: """ Infer an appropriate precision for _round_frac """ From 60d5f5e1dc35db4e2af13efa8b93f2c88fb517a4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Sep 2023 08:03:46 -0700 Subject: [PATCH 3/3] troubleshoot builds --- pandas/core/reshape/tile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1aade1c149bea..126f589f5df71 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,7 +43,6 @@ to_datetime, to_timedelta, ) -from pandas.core import nanops import pandas.core.algorithms as algos if TYPE_CHECKING: @@ -376,7 +375,7 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: if x_idx.size == 0: raise ValueError("Cannot cut empty array") - rng = (nanops.nanmin(x_idx), nanops.nanmax(x_idx)) + rng = (x_idx.min(), x_idx.max()) mn, mx = rng if np.isinf(mn) or np.isinf(mx):