From 7c1620ee67e0fe4e550ed5e4a6e58a8ef0b2a0b9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 31 Aug 2023 23:27:13 +0200 Subject: [PATCH] REGR: drop_duplicates raising for arrow strings --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/algorithms.py | 2 +- pandas/tests/series/methods/test_drop_duplicates.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index f5aa0968ae362..ebb15f267fccb 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) .. --------------------------------------------------------------------------- .. _whatsnew_211.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0a9c1aad46f89..9411b67387cfd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -998,7 +998,7 @@ def duplicated( duplicated : ndarray[bool] """ if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype): + if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": values = values._to_masked() # type: ignore[union-attr] if isinstance(values.dtype, BaseMaskedDtype): diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 324ab1204e16e..10b2e98586365 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, Series, @@ -256,3 +257,11 @@ def test_duplicated_arrow_dtype(self): result = ser.drop_duplicates() expected = Series([True, False, None], dtype="bool[pyarrow]") tm.assert_series_equal(result, expected) + + def test_drop_duplicates_arrow_strings(self): + # GH#54904 + pa = pytest.importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expecetd = Series(["a"], dtype=pd.ArrowDtype(pa.string())) + tm.assert_series_equal(result, expecetd)