From aa19bad51dfca467e12d6544a704269c69fce1aa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 May 2023 21:25:49 +0200 Subject: [PATCH 1/3] Use _values_for_factorize by default for hashing ExtensionArrays --- pandas/core/arrays/base.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 27eb7994d3ccb..4ebc312985aed 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -993,7 +993,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: Returns ------- values : ndarray - An array suitable for factorization. This should maintain order and be a supported dtype (Float64, Int64, UInt64, String, Object). By default, the extension array is cast to object dtype. @@ -1002,6 +1001,12 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: as NA in the factorization routines, so it will be coded as `-1` and not included in `uniques`. By default, ``np.nan`` is used. + + Notes + ----- + The values returned by this method are also used in + :func:`pandas.util.hash_pandas_object`. If needed, this can be + overridden in the ``self._hash_pandas_object()`` method. """ return self.astype(object), np.nan @@ -1449,7 +1454,7 @@ def _hash_pandas_object( """ Hook for hash_pandas_object. - Default is likely non-performant. + Default is to use the values returned by _values_for_factorize. Parameters ---------- @@ -1463,7 +1468,7 @@ def _hash_pandas_object( """ from pandas.core.util.hashing import hash_array - values = self.to_numpy(copy=False) + values, _ = self._values_for_factorize() return hash_array( values, encoding=encoding, hash_key=hash_key, categorize=categorize ) From 9ecd30330099bd9036b97f270f9b08258d86f35c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 15 Jun 2023 17:45:49 +0200 Subject: [PATCH 2/3] hashing json works --- pandas/tests/extension/json/test_json.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7fda870e6f721..0920e70142446 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -240,10 +240,6 @@ class TestReduce(base.BaseNoReduceTests): class TestMethods(BaseJSON, base.BaseMethodsTests): - @pytest.mark.xfail(reason="ValueError: setting an array element with a sequence") - def test_hash_pandas_object(self, data): - super().test_hash_pandas_object(data) - @unhashable def test_value_counts(self, all_data, dropna): super().test_value_counts(all_data, dropna) @@ -286,10 +282,6 @@ def test_combine_add(self, data_repeated): def test_combine_first(self, data): super().test_combine_first(data) - @unhashable - def test_hash_pandas_object_works(self, data, kind): - super().test_hash_pandas_object_works(data, kind) - @pytest.mark.xfail(reason="broadcasting error") def test_where_series(self, data, na_value): # Fails with From cf618270c3cc21379f32459c684ad0fa3c1dc7f7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 19 Jun 2023 14:13:12 +0200 Subject: [PATCH 3/3] add whatsnew --- doc/source/whatsnew/v2.0.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index a854906c6abdd..d8d152f2bd779 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed performance regression in merging on datetime-like columns (:issue:`53231`) +- For external ExtensionArray implementations, restored the default use of ``_values_for_factorize`` for hashing arrays (:issue:`53475`) - .. ---------------------------------------------------------------------------