From 6815bbdaf19ce554741310ec8f065f8ce9228ec9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 21:42:07 +0200 Subject: [PATCH 1/6] BUG: mode not sorting values for arrow backed strings --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 1 + pandas/tests/groupby/test_groupby.py | 15 +++++++++++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..66dfa1565eac5 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -30,6 +30,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) +- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a2d8694b5b19b..1612976a11063 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1921,6 +1921,7 @@ def _mode(self, dropna: bool = True) -> Self: if pa.types.is_temporal(pa_type): most_common = most_common.cast(pa_type) + most_common = most_common.take(pc.array_sort_indices(most_common)) return type(self)(most_common) def _maybe_convert_setitem_value(self, value): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7297d049587e6..bccd0c7ed8e8c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2763,13 +2764,23 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -def test_by_column_values_with_same_starting_value(): +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param( + "string[pyarrow_numpy]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) +def test_by_column_values_with_same_starting_value(dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": ["sad", "happy", "happy"], + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} From 699b0bde99d6e2541a345b3499e7ae035aea0da0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 23:06:07 +0200 Subject: [PATCH 2/6] Fix tests --- pandas/tests/extension/test_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b674ffd03eba4..d44fb7b3858e2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1416,7 +1416,7 @@ def test_quantile(data, interpolation, quantile, request): @pytest.mark.parametrize( "take_idx, exp_idx", - [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]], + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], ids=["multi_mode", "single_mode"], ) def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx): @@ -1434,7 +1434,7 @@ def test_mode_dropna_false_mode_na(data): expected = pd.Series([None], dtype=data.dtype) tm.assert_series_equal(result, expected) - expected = pd.Series([None, data[0]], dtype=data.dtype) + expected = pd.Series([data[0], None], dtype=data.dtype) result = expected.mode(dropna=False) tm.assert_series_equal(result, expected) From b0475581651c68aa0f045822c8ad9fac0f01caa7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 23:43:43 +0200 Subject: [PATCH 3/6] Change to pa_installed variable --- pandas/compat/pyarrow.py | 2 ++ pandas/tests/groupby/test_groupby.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 12f58be109d98..81ed21f7d347f 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -7,6 +7,7 @@ try: import pyarrow as pa + pa_installed = True _palv = Version(Version(pa.__version__).base_version) pa_version_under7p0 = _palv < Version("7.0.0") pa_version_under8p0 = _palv < Version("8.0.0") @@ -17,6 +18,7 @@ pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: + pa_installed = False pa_version_under7p0 = True pa_version_under8p0 = True pa_version_under9p0 = True diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bccd0c7ed8e8c..2ce4cbc855169 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 +from pandas.compat.pyarrow import pa_installed from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2770,7 +2770,7 @@ def test_rolling_wrong_param_min_period(): object, pytest.param( "string[pyarrow_numpy]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + marks=pytest.mark.skipif(pa_installed, reason="arrow not installed"), ), ], ) From 78ada026007f475c8a378b25158f2fc207b73aba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 24 Oct 2023 22:13:05 +0200 Subject: [PATCH 4/6] Update pyarrow.py --- pandas/compat/pyarrow.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index fc7af6b21328e..975391a92f725 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,10 +16,6 @@ pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: pa_installed = False - pa_version_under7p0 = True - pa_version_under8p0 = True - pa_version_under9p0 = True - pa_version_under10p0 = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True From 024e9b053871512d4490763cefa17ce2091b6e5f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 22:27:45 +0200 Subject: [PATCH 5/6] Fix --- pandas/tests/groupby/test_groupby.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 89f67a604fddb..5992731643d31 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,11 +5,11 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_installed from pandas.errors import ( PerformanceWarning, SpecificationError, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2542,10 +2542,7 @@ def test_groupby_column_index_name_lost(func): "infer_string", [ False, - pytest.param( - True, - marks=pytest.mark.skipif(not pa_installed, reason="arrow not installed"), - ), + pytest.param(True, marks=td.skip_if_no("pyarrow")), ], ) def test_groupby_duplicate_columns(infer_string): @@ -2781,10 +2778,7 @@ def test_rolling_wrong_param_min_period(): "dtype", [ object, - pytest.param( - "string[pyarrow_numpy]", - marks=pytest.mark.skipif(not pa_installed, reason="arrow not installed"), - ), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ], ) def test_by_column_values_with_same_starting_value(dtype): From fe1bc37026f351bea7a70d2bb6cc0a5d3018315c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 22:27:53 +0200 Subject: [PATCH 6/6] Fix --- pandas/compat/pyarrow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 975391a92f725..d125904ba83f8 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -7,7 +7,6 @@ try: import pyarrow as pa - pa_installed = True _palv = Version(Version(pa.__version__).base_version) pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") @@ -15,7 +14,6 @@ pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: - pa_installed = False pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True