diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index de85ed67e7e8c..641eb7b01f0b6 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -653,6 +653,15 @@ def factorize(
         use_na_sentinel: bool = True,
     ) -> tuple[np.ndarray, ExtensionArray]:
         null_encoding = "mask" if use_na_sentinel else "encode"
+
+        pa_type = self._data.type
+        if pa.types.is_duration(pa_type):
+            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
+            arr = cast(ArrowExtensionArray, self.astype("int64[pyarrow]"))
+            indices, uniques = arr.factorize(use_na_sentinel=use_na_sentinel)
+            uniques = uniques.astype(self.dtype)
+            return indices, uniques
+
         encoded = self._data.dictionary_encode(null_encoding=null_encoding)
         if encoded.length() == 0:
             indices = np.array([], dtype=np.intp)
@@ -849,6 +858,12 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         -------
         ArrowExtensionArray
         """
+        if pa.types.is_duration(self._data.type):
+            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
+            arr = cast(ArrowExtensionArrayT, self.astype("int64[pyarrow]"))
+            result = arr.unique()
+            return cast(ArrowExtensionArrayT, result.astype(self.dtype))
+
         return type(self)(pc.unique(self._data))
 
     def value_counts(self, dropna: bool = True) -> Series:
@@ -868,6 +883,14 @@ def value_counts(self, dropna: bool = True) -> Series:
         --------
         Series.value_counts
         """
+        if pa.types.is_duration(self._data.type):
+            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
+            arr = cast(ArrowExtensionArray, self.astype("int64[pyarrow]"))
+            # Forward dropna; a bare call would always use the default dropna=True.
+            result = arr.value_counts(dropna=dropna)
+            result.index = result.index.astype(self.dtype)
+            return result
+
         from pandas import (
             Index,
             Series,
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c02fa0aecdacc..d0cab0efa97b3 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -502,13 +502,6 @@ def
test_groupby_extension_no_sort(self, data_for_grouping, request): reason=f"{pa_dtype} only has 2 unique possible values", ) ) - elif pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support factorizing {pa_dtype}", - ) - ) super().test_groupby_extension_no_sort(data_for_grouping) def test_groupby_extension_transform(self, data_for_grouping, request): @@ -519,13 +512,6 @@ def test_groupby_extension_transform(self, data_for_grouping, request): reason=f"{pa_dtype} only has 2 unique possible values", ) ) - elif pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support factorizing {pa_dtype}", - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -534,14 +520,6 @@ def test_groupby_extension_transform(self, data_for_grouping, request): def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support factorizing {pa_dtype}", - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -557,13 +535,6 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): reason=f"{pa_dtype} only has 2 unique possible values", ) ) - elif pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support factorizing {pa_dtype}", - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -784,25 +755,9 @@ def test_diff(self, data, periods, request): @pytest.mark.filterwarnings("ignore:Falling 
back:pandas.errors.PerformanceWarning") @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna, request): - pa_dtype = all_data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"value_count has no kernel for {pa_dtype}", - ) - ) super().test_value_counts(all_data, dropna) def test_value_counts_with_normalize(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"value_count has no pyarrow kernel for {pa_dtype}", - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -882,17 +837,6 @@ def test_nargsort(self, data_missing_for_sorting, na_position, expected): @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending, sort_by_key, request): - pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype) and not ascending: - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=( - f"unique has no pyarrow kernel " - f"for {pa_dtype} when ascending={ascending}" - ), - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): @@ -911,45 +855,14 @@ def test_sort_values_missing( @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_frame(self, data_for_sorting, ascending, request): - pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=( - f"dictionary_encode has no pyarrow kernel " - f"for {pa_dtype} when ascending={ascending}" - ), - ) - ) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0, check_stacklevel=False ): 
super().test_sort_values_frame(data_for_sorting, ascending) - @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) - @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) - def test_unique(self, data, box, method, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"unique has no pyarrow kernel for {pa_dtype}.", - ) - ) - super().test_unique(data, box, method) - def test_factorize(self, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", - ) - ) - elif pa.types.is_boolean(pa_dtype): + if pa.types.is_boolean(pa_dtype): request.node.add_marker( pytest.mark.xfail( reason=f"{pa_dtype} only has 2 unique possible values", @@ -957,28 +870,6 @@ def test_factorize(self, data_for_grouping, request): ) super().test_factorize(data_for_grouping) - def test_factorize_equivalence(self, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", - ) - ) - super().test_factorize_equivalence(data_for_grouping) - - def test_factorize_empty(self, data, request): - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", - ) - ) - super().test_factorize_empty(data) - @pytest.mark.xfail( reason="result dtype pyarrow[bool] better than expected dtype object" )