diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0e567972e7823..c40ed43504d89 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -705,6 +705,7 @@ Conversion - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`) +- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype narrower than ``int64``, ``uint64`` or ``float64``, the unique values did not keep their original dtype (:issue:`41132`) - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) - diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6f906cf8879ff..6a17174c6c0a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -143,11 +143,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: # until our algos support uint8 directly (see TODO) return np.asarray(values).astype("uint64"), np.dtype("bool") elif is_signed_integer_dtype(values): - return ensure_int64(values), np.dtype("int64") + return ensure_int64(values), values.dtype elif is_unsigned_integer_dtype(values): - return ensure_uint64(values), np.dtype("uint64") + return ensure_uint64(values), values.dtype elif is_float_dtype(values): - return ensure_float64(values), np.dtype("float64") + return ensure_float64(values), values.dtype elif is_complex_dtype(values): # ignore the fact that we are casting to float diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 75fc7a782772a..c7af104f62770 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -95,29 
+95,32 @@ def test_basic(self): exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(range(5)))) + arr = np.arange(5, dtype=np.intp)[::-1] + + codes, uniques = algos.factorize(arr) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) + exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) - + codes, uniques = algos.factorize(arr, sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) + arr = np.arange(5.0)[::-1] + + codes, uniques = algos.factorize(arr) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) + codes, uniques = algos.factorize(arr, sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) + exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) def test_mixed(self): @@ -246,6 +249,17 @@ def test_complex_sorting(self): with pytest.raises(TypeError, match=msg): algos.factorize(x17[::-1], sort=True) + def test_numeric_dtype_factorize(self, any_real_dtype): + # GH41132 + dtype = any_real_dtype + data = np.array([1, 2, 2, 1], dtype=dtype) + expected_codes = np.array([0, 1, 1, 0], dtype=np.intp) + expected_uniques = 
np.array([1, 2], dtype=dtype) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) data.setflags(write=writable)