From 4c6779b9e96c5101bdb155334839f1bd9ab84082 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Mar 2022 09:44:01 -0800 Subject: [PATCH 1/2] BUG: PandasArray[uint].factorize --- pandas/core/arrays/_mixins.py | 3 +++ pandas/core/arrays/categorical.py | 3 --- pandas/core/arrays/datetimelike.py | 3 --- pandas/core/arrays/numpy_.py | 6 +++++- pandas/tests/arrays/numpy_/test_numpy.py | 13 +++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d86c60d78195b..1827337b1b19a 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -190,6 +190,9 @@ def _from_factorized(cls, values, original): def _values_for_argsort(self) -> np.ndarray: return self._ndarray + def _values_for_factorize(self): + return self._ndarray, self._internal_fill_value + # Signature of "argmin" incompatible with supertype "ExtensionArray" def argmin(self, axis: int = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9d649c533619e..eca7a205983ef 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2298,9 +2298,6 @@ def unique(self): unique_codes = unique1d(self.codes) return self._from_backing_data(unique_codes) - def _values_for_factorize(self): - return self._ndarray, -1 - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: # make sure we have correct itemsize for resulting codes res_values = coerce_indexer_dtype(res_values, self.dtype.categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 93d32217ef322..ed19c77f3cf4b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -549,9 +549,6 @@ def copy(self: DatetimeLikeArrayT, order="C") -> DatetimeLikeArrayT: new_obj._freq = self.freq return new_obj - def _values_for_factorize(self): - return self._ndarray, self._internal_fill_value - # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 3d6e721ec273b..474d2f4ebb933 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -190,7 +190,11 @@ def _validate_scalar(self, fill_value): return fill_value def _values_for_factorize(self) -> tuple[np.ndarray, int]: - return self._ndarray, -1 + if self.dtype.kind in ["i", "u", "b"]: + fv = None + else: + fv = np.nan + return self._ndarray, fv # ------------------------------------------------------------------------ # Reductions diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py index beef5522ed1ab..c748d487a2f9c 100644 --- a/pandas/tests/arrays/numpy_/test_numpy.py +++ b/pandas/tests/arrays/numpy_/test_numpy.py @@ -309,3 +309,16 @@ def test_quantile_empty(dtype): result = arr._quantile(idx, interpolation="linear") expected = PandasArray(np.array([np.nan, np.nan])) tm.assert_extension_array_equal(result, expected) + + +def test_factorize_unsigned(): + # don't raise when calling factorize on unsigned int PandasArray + arr = np.array([1, 2, 3], dtype=np.uint64) + obj = PandasArray(arr) + + res_codes, res_unique = obj.factorize() + exp_codes, exp_unique = pd.factorize(arr) + + tm.assert_numpy_array_equal(res_codes, exp_codes) + + tm.assert_extension_array_equal(res_unique, PandasArray(exp_unique)) From 69d29296b9e31a092f4a381838d28c1a134b9315 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Mar 2022 15:07:08 -0800 Subject: [PATCH 2/2] mypy fixup --- pandas/core/arrays/numpy_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 474d2f4ebb933..be7dc5e0ebdc6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -189,7 +189,7 @@ def _validate_scalar(self, fill_value): fill_value = self.dtype.na_value return fill_value - def _values_for_factorize(self) -> tuple[np.ndarray, int]: + def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: if self.dtype.kind in ["i", "u", "b"]: fv = None else: