diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 7aaeee043c72b..4f86f63718f2a 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -90,7 +90,7 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = ObjectVector() def factorize( - self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None + self, ndarray[object] values, na_sentinel=-1, na_value=None ) -> np.ndarray: """ @@ -115,14 +115,6 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value) - mask = (labels == na_sentinel) - # sort on - if sort: - sorter = self.uniques.to_array().argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.intp) - reverse_indexer.put(sorter, np.arange(len(sorter))) - labels = reverse_indexer.take(labels, mode='clip') - labels[mask] = na_sentinel self.count = len(self.uniques) return labels @@ -136,7 +128,7 @@ cdef class Int64Factorizer(Factorizer): self.table = Int64HashTable(size_hint) self.uniques = Int64Vector() - def factorize(self, const int64_t[:] values, sort=False, + def factorize(self, const int64_t[:] values, na_sentinel=-1, na_value=None) -> np.ndarray: """ Returns @@ -161,14 +153,5 @@ cdef class Int64Factorizer(Factorizer): labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value=na_value) - - # sort on - if sort: - sorter = self.uniques.to_array().argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.intp) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - labels = reverse_indexer.take(labels) - self.count = len(self.uniques) return labels diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a6b765117f616..c6aefd5bb73b9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -230,21 +230,11 @@ def test_factorize_nan(self): key = np.array([1, 2, 1, np.nan], dtype="O") rizer = ht.ObjectFactorizer(len(key)) for na_sentinel in (-1, 20): - ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) - expected = np.array([0, 1, 0, na_sentinel], dtype="int32") + ids = rizer.factorize(key, na_sentinel=na_sentinel) + expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp) assert len(set(key)) == len(set(expected)) tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) - - # nan still maps to na_sentinel when sort=False - key = np.array([0, np.nan, 1], dtype="O") - na_sentinel = -1 - - # TODO(wesm): unused? - ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa - - expected = np.array([2, -1, 0], dtype="int32") - assert len(set(key)) == len(set(expected)) - tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) + tm.assert_numpy_array_equal(ids, expected) @pytest.mark.parametrize( "data, expected_codes, expected_uniques",