From 5301c1c92b01299deddcf7f6c04115b62b1d7699 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 28 Jan 2024 15:27:56 +0800 Subject: [PATCH 1/3] Add test --- pandas/tests/arrays/string_/test_string.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f67268616a021..cb324d29258c0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -584,6 +584,19 @@ def test_value_counts_with_normalize(dtype): tm.assert_series_equal(result, expected) +def test_value_counts_sort_false(dtype): + if getattr(dtype, "storage", "") == "pyarrow": + exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" + else: + exp_dtype = "Int64" + ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) + result = ser.value_counts(sort=False) + expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "values, expected", [ From 84efa3c638aaa88c0b08b00889e172a21fe138f8 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 28 Jan 2024 15:55:20 +0800 Subject: [PATCH 2/3] Add sort to StringArray.value_counts --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/algorithms.py | 9 +++++++-- pandas/core/arrays/string_.py | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 73201fa93a8aa..ceaf3a953f046 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -152,7 +152,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) - Interval diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 128477dac562e..0e50f393d982a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -908,8 +908,13 @@ def value_counts_internal( else: if is_extension_array_dtype(values): - # handle Categorical and sparse, - result = Series(values, copy=False)._values.value_counts(dropna=dropna) + if values.dtype == "string[python]": + result = Series(values, copy=False)._values.value_counts( + sort=sort, dropna=dropna + ) + else: + # handle Categorical and sparse, + result = Series(values, copy=False)._values.value_counts(dropna=dropna) result.name = name result.index.name = index_name counts = result._values diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b73b49eca3e18..d49ab6de118a1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -539,10 +539,10 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) - def value_counts(self, dropna: bool = True) -> Series: + def value_counts(self, sort: bool = True, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=sort, dropna=dropna).astype("Int64") result.index = result.index.astype(self.dtype) return result From 561ce2fd2d1be6637c6d4369bef041123348e317 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 28 Jan 2024 21:06:00 +0800 Subject: [PATCH 3/3] Do not change signature of NDArrayBackedExtensionArray --- pandas/core/algorithms.py | 9 ++------- pandas/core/arrays/string_.py | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0e50f393d982a..128477dac562e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -908,13 +908,8 @@ def value_counts_internal( else: if is_extension_array_dtype(values): - if values.dtype == "string[python]": - result = Series(values, copy=False)._values.value_counts( - sort=sort, dropna=dropna - ) - else: - # handle Categorical and sparse, - result = Series(values, copy=False)._values.value_counts(dropna=dropna) + # handle Categorical and sparse, + result = Series(values, copy=False)._values.value_counts(dropna=dropna) result.name = name result.index.name = index_name counts = result._values diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d49ab6de118a1..6c4413052c12d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -539,10 +539,10 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) - def value_counts(self, sort: bool = True, dropna: bool = True) -> Series: + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, sort=sort, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64") result.index = result.index.astype(self.dtype) return result