From f9d79dbfd74ea63e76619ff5e56012aa6fb14bdc Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 19:58:56 +0530 Subject: [PATCH 1/9] BUG: Issue with pd.cut on Series with duplicate index --- doc/source/whatsnew/v1.3.1.rst | 11 ++++++++ doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/reshape/tile.py | 2 +- pandas/tests/reshape/test_cut.py | 45 ++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index a57995eb0db9a..ff9fdfea320c5 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -40,6 +40,17 @@ Bug fixes - Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`) - Fixed bug in :meth:`.Styler.set_sticky` not handling index names correctly for single index columns case (:issue:`42537`) - Fixed bug in :meth:`DataFrame.copy` failing to consolidate blocks in the result (:issue:`42579`) +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_131.other: + +Other +~~~~~ +- +- +>>>>>>> BUG: Issue with pd.cut on Series with duplicate index .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2211722ad006d..ea337b2cac660 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -265,7 +265,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) -- +-Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7db30dc1ba9b9..bf0664e233499 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -421,7 +421,7 @@ def _bins_to_cuts( ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + ids[np.asarray(x) == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 944205c66c3e6..127be504e82d5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -691,3 +691,48 @@ def test_cut_no_warnings(): labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) + + +def test_cut_with_duplicated_index_lowest_included(): + # GH 42185 + expected = Series( + [Interval(-0.001, 2, closed="right")] * 3 + + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], + index=[0, 1, 2, 3, 0], + dtype="category", + ).cat.as_ordered() + + s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(s, bins=[0, 2, 4], include_lowest=True) + tm.assert_series_equal(result, expected) + + +def test_cut_with_nonexact_categorical_indices(): + # GH 42424 + + ser = Series(range(0, 100)) + ser1 = cut(ser, 10).value_counts().head(5) + ser2 = cut(ser, 10).value_counts().tail(5) + result = DataFrame({"1": ser1, "2": ser2}) + + index = pd.CategoricalIndex( + [ + Interval(-0.099, 9.9, closed="right"), + Interval(9.9, 19.8, closed="right"), + Interval(19.8, 29.7, closed="right"), + Interval(29.7, 39.6, closed="right"), + Interval(39.6, 49.5, closed="right"), + Interval(49.5, 59.4, closed="right"), + Interval(59.4, 69.3, closed="right"), + Interval(69.3, 79.2, closed="right"), + Interval(79.2, 89.1, closed="right"), + Interval(89.1, 99, closed="right"), + ], + ordered=True, + ) + + expected = DataFrame( + {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index + ) + + tm.assert_frame_equal(expected, result) From d5e6e7afbbcb2cee64f667dbbd9b4d95d5aadfeb Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 14 Jul 2021 00:38:10 +0530 Subject: [PATCH 2/9] suggested edits --- pandas/core/reshape/tile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index bf0664e233499..aee83f7612fef 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -13,6 +13,7 @@ Timestamp, ) from pandas._libs.lib import infer_dtype +from pandas._typing import AnyArrayLike from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -44,7 +45,7 @@ def cut( - x, + x: AnyArrayLike, bins, right: bool = True, labels=None, @@ -383,8 +384,8 @@ def qcut( def _bins_to_cuts( - x, - bins, + x: AnyArrayLike, + bins: np.ndarray, right: bool = True, labels=None, precision: int = 3, @@ -393,6 +394,9 @@ def _bins_to_cuts( duplicates: str = "raise", ordered: bool = True, ): + """ + Allots bins to each item in `x` + """ if not ordered and labels is None: raise ValueError("'labels' must be provided if 'ordered = False'") From 72c9c6e0cffa2180a09894d05d3d522732afc566 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 17 Jul 2021 00:06:33 +0530 Subject: [PATCH 3/9] made suggested changes --- doc/source/whatsnew/v1.4.0.rst | 3 +-- pandas/core/reshape/tile.py | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ea337b2cac660..f4286c6458c2f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -43,8 +43,7 @@ Other enhancements Notable bug fixes ~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. +- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) .. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index aee83f7612fef..c5d06bcef72a4 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -13,7 +13,6 @@ Timestamp, ) from pandas._libs.lib import infer_dtype -from pandas._typing import AnyArrayLike from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -45,7 +44,7 @@ def cut( - x: AnyArrayLike, + x, bins, right: bool = True, labels=None, @@ -384,7 +383,7 @@ def qcut( def _bins_to_cuts( - x: AnyArrayLike, + x, bins: np.ndarray, right: bool = True, labels=None, @@ -394,9 +393,6 @@ def _bins_to_cuts( duplicates: str = "raise", ordered: bool = True, ): - """ - Allots bins to each item in `x` - """ if not ordered and labels is None: raise ValueError("'labels' must be provided if 'ordered = False'") From e9937c0280036509bccb2263ca718a70ad87d389 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 25 Jul 2021 14:28:28 +0530 Subject: [PATCH 4/9] resolved mypy errors --- pandas/core/reshape/tile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index c5d06bcef72a4..1fd6e65cb7892 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -4,6 +4,8 @@ from typing import ( Any, Callable, + Literal, + Union, ) import numpy as np @@ -417,7 +419,7 @@ def _bins_to_cuts( else: bins = unique_bins - side = "left" if right else "right" + side: Union[Literal["left"], Literal["right"]] = "left" if right else "right" ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: From 19e4350f39580a001faacf3597510f42780783be Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 25 Jul 2021 20:04:19 +0530 Subject: [PATCH 5/9] corrected whatsnew --- doc/source/whatsnew/v1.4.0.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f4286c6458c2f..6c863822b3077 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -43,7 +43,8 @@ Other enhancements Notable bug fixes ~~~~~~~~~~~~~~~~~ -- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- +- These are bug fixes that might have notable behavior changes. .. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: @@ -263,8 +264,13 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +<<<<<<< HEAD - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) -Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +======= +- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- +>>>>>>> corrected whatsnew Sparse ^^^^^^ From cec81cf98cbb5c60a99a5488d841c8ea059bc6b4 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Sun, 25 Jul 2021 20:07:26 +0530 Subject: [PATCH 6/9] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6c863822b3077..13ce5a65bae67 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -43,8 +43,8 @@ Other enhancements Notable bug fixes ~~~~~~~~~~~~~~~~~ -- -- These are bug fixes that might have notable behavior changes. + +These are bug fixes that might have notable behavior changes. .. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: From 7640bc75ce6e0eb94242612a7261650442485c66 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 26 Jul 2021 22:58:27 +0530 Subject: [PATCH 7/9] Updated whatsnew --- doc/source/whatsnew/v1.3.1.rst | 11 ----------- doc/source/whatsnew/v1.4.0.rst | 4 ---- 2 files changed, 15 deletions(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index ff9fdfea320c5..a57995eb0db9a 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -40,17 +40,6 @@ Bug fixes - Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`) - Fixed bug in :meth:`.Styler.set_sticky` not handling index names correctly for single index columns case (:issue:`42537`) - Fixed bug in :meth:`DataFrame.copy` failing to consolidate blocks in the result (:issue:`42579`) -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_131.other: - -Other -~~~~~ -- -- ->>>>>>> BUG: Issue with pd.cut on Series with duplicate index .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 13ce5a65bae67..973da8c8ee4b3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -264,13 +264,9 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -<<<<<<< HEAD - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) -Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) -======= -- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) - ->>>>>>> corrected whatsnew Sparse ^^^^^^ From 82fccf5712af1ccb9ce915f6f76953264bcaa785 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 26 Jul 2021 23:04:01 +0530 Subject: [PATCH 8/9] removed typing --- pandas/core/reshape/tile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1fd6e65cb7892..c5d06bcef72a4 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -4,8 +4,6 @@ from typing import ( Any, Callable, - Literal, - Union, ) import numpy as np @@ -419,7 +417,7 @@ def _bins_to_cuts( else: bins = unique_bins - side: Union[Literal["left"], Literal["right"]] = "left" if right else "right" + side = "left" if right else "right" ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: From 69a50e1f276e946af1dea8a1ff5f301fa993df24 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Tue, 27 Jul 2021 18:03:20 +0530 Subject: [PATCH 9/9] updated whatsnew format --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 973da8c8ee4b3..bd22fdea6b0e1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -265,7 +265,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) --Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) - Sparse