Skip to content

Commit a0a53fb

Browse files
committed
BUG: Issue with pd.cut on Series with duplicate index
1 parent 4653b6a commit a0a53fb

File tree

3 files changed

+47
-2
lines changed

3 files changed

+47
-2
lines changed

doc/source/whatsnew/v1.3.1.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Fixed regressions
2525

2626
Bug fixes
2727
~~~~~~~~~
28-
-
28+
- Bug in :func:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :class:`pandas.CategoricalIndex` (:issue:`42425`)
2929
-
3030

3131
.. ---------------------------------------------------------------------------

pandas/core/reshape/tile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ def _bins_to_cuts(
421421
ids = ensure_platform_int(bins.searchsorted(x, side=side))
422422

423423
if include_lowest:
424-
ids[x == bins[0]] = 1
424+
ids[np.asarray(x) == bins[0]] = 1
425425

426426
na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
427427
has_nas = na_mask.any()

pandas/tests/reshape/test_cut.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,3 +691,48 @@ def test_cut_no_warnings():
691691
labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
692692
with tm.assert_produces_warning(False):
693693
df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
694+
695+
696+
def test_cut_with_duplicated_index_lowest_included():
    """Check ``cut(..., include_lowest=True)`` on a Series with a repeated index label.

    Regression test for GH 42185: the first and last rows share index
    label ``0`` and both hold the lowest bin edge.
    """
    dup_index = [0, 1, 2, 3, 0]
    ser = Series([0, 1, 2, 3, 0], index=dup_index)
    result = cut(ser, bins=[0, 2, 4], include_lowest=True)

    # With include_lowest=True the first interval is widened slightly on the
    # left so that the lowest edge (0) falls inside it.
    lowest = Interval(-0.001, 2, closed="right")
    upper = Interval(2, 4, closed="right")
    expected = Series(
        [lowest, lowest, lowest, upper, lowest],
        index=dup_index,
        dtype="category",
    ).cat.as_ordered()

    tm.assert_series_equal(result, expected)
708+
709+
710+
def test_cut_with_nonexact_categorical_indices():
    """Check alignment of two partial bin-count Series into one DataFrame.

    The first five and last five bin counts from ``cut(ser, 10)`` are
    combined as separate columns. The resulting frame is expected to be
    indexed by all ten intervals, with NaN where a column has no entry
    for that interval.
    """
    # GH 42424

    ser = Series(range(0, 100))
    # Bin once and slice both halves from the same counts.
    counts = cut(ser, 10).value_counts()
    result = DataFrame({"1": counts.head(5), "2": counts.tail(5)})

    index = pd.CategoricalIndex(
        [
            Interval(-0.099, 9.9, closed="right"),
            Interval(9.9, 19.8, closed="right"),
            Interval(19.8, 29.7, closed="right"),
            Interval(29.7, 39.6, closed="right"),
            Interval(39.6, 49.5, closed="right"),
            Interval(49.5, 59.4, closed="right"),
            Interval(59.4, 69.3, closed="right"),
            Interval(69.3, 79.2, closed="right"),
            Interval(79.2, 89.1, closed="right"),
            Interval(89.1, 99, closed="right"),
        ],
        ordered=True,
    )

    expected = DataFrame(
        {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
    )

    # (result, expected) order keeps the left/right labels in a failure
    # message pointing at the right object.
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)