
Commit c1627dd

WIP
1 parent 0e0e41b commit c1627dd

5 files changed: 16 additions, 12 deletions


pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 4 additions & 5 deletions
@@ -623,14 +623,13 @@ cdef class {{name}}HashTable(HashTable):
                     labels[i] = count
                     na_code = count
                     count += 1
+                    seen_na = True
                 if return_inverse:
                     idx = na_code
                     labels[i] = idx
                 continue
 
-
             k = kh_get_{{dtype}}(self.table, val)
-
             if k == self.table.n_buckets:
                 # k hasn't been seen yet
                 k = kh_put_{{dtype}}(self.table, val, &ret)
@@ -720,7 +719,7 @@ cdef class {{name}}HashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
@@ -1097,7 +1096,7 @@ cdef class StringHashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
@@ -1363,7 +1362,7 @@ cdef class PyObjectHashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
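
These hunks have `_unique` return a third value (hence `_, _, labels = ...`) and track missing values with the `seen_na` / `na_code` bookkeeping, so an NA can receive its own code instead of always falling back to `na_sentinel`. A minimal pure-Python sketch of that labeling idea, not the Cython template above; names and structure are illustrative only:

# Illustrative only: a plain-Python analogue of the seen_na / na_code
# bookkeeping, not the pandas hash table implementation.
import math


def factorize_with_na(values, ignore_na=True, na_sentinel=-1):
    """Return (uniques, labels). With ignore_na, NaN maps to na_sentinel;
    otherwise the first NaN is appended to uniques and gets its own code."""
    table = {}        # value -> code
    uniques = []
    labels = []
    seen_na = False
    na_code = -1
    count = 0
    for val in values:
        if isinstance(val, float) and math.isnan(val):
            if ignore_na:
                labels.append(na_sentinel)
                continue
            if not seen_na:
                uniques.append(val)
                na_code = count
                count += 1
                seen_na = True
            labels.append(na_code)
            continue
        if val not in table:
            table[val] = count
            uniques.append(val)
            count += 1
        labels.append(table[val])
    return uniques, labels


print(factorize_with_na([3.0, float("nan"), 3.0, 5.0], ignore_na=False))
# ([3.0, nan, 5.0], [0, 1, 0, 2])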

pandas/core/algorithms.py

Lines changed: 7 additions & 2 deletions
@@ -497,13 +497,13 @@ def f(c, v):
 
 
 def factorize_array(
-    values: np.ndarray,
+    arr: ArrayLike,
     na_sentinel: int = -1,
     size_hint: int | None = None,
     na_value=None,
     mask: npt.NDArray[np.bool_] | None = None,
     ignore_na: bool = True,
-) -> tuple[npt.NDArray[np.intp], np.ndarray]:
+) -> tuple[npt.NDArray[np.intp], ArrayLike]:
     """
     Factorize a numpy array to codes and uniques.
 
@@ -530,6 +530,11 @@ def factorize_array(
     codes : ndarray[np.intp]
     uniques : ndarray
     """
+    if is_extension_array_dtype(arr):
+        values, na_value = arr._values_for_factorize()
+    else:
+        values = arr
+
     original = values
     if values.dtype.kind in ["m", "M"]:
         # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
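
This change widens `factorize_array` from `np.ndarray` to `ArrayLike` and, for extension arrays, pulls the values and NA marker out of `_values_for_factorize()`. A hedged illustration using the public entry point; the exact contents returned by `_values_for_factorize()` vary by array type:

import pandas as pd

arr = pd.array([1, None, 1, 3], dtype="Int64")   # masked extension array

# What _values_for_factorize() supplies: an array of values plus the
# sentinel used to mark missing entries (exact contents vary by EA type).
values, na_value = arr._values_for_factorize()

# Public factorize path for comparison.
codes, uniques = pd.factorize(arr)
print(codes)    # [ 0 -1  0  1]
print(uniques)  # <IntegerArray> [1, 3]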

pandas/core/arrays/arrow/array.py

Lines changed: 3 additions & 1 deletion
@@ -105,7 +105,9 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         return type(self)(self._data)
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self, na_sentinel: int = -1, ignore_na=True
+    ) -> tuple[np.ndarray, ExtensionArray]:
         encoded = self._data.dictionary_encode()
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type
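
For context on the `dictionary_encode()` route used here: the dictionary indices play the role of factorize codes and the dictionary itself plays the role of uniques. A small stand-alone pyarrow sketch (plain `pyarrow.Array`, not the chunked handling above):

import pyarrow as pa

arr = pa.array(["a", "b", "a", "c", "a"])
encoded = arr.dictionary_encode()           # DictionaryArray

print(encoded.indices.to_pylist())          # [0, 1, 0, 2, 0]  -> codes
print(encoded.dictionary.to_pylist())       # ['a', 'b', 'c']  -> uniques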

pandas/core/arrays/sparse/array.py

Lines changed: 1 addition & 3 deletions
@@ -855,9 +855,7 @@ def factorize(
         # ExtensionArray.factorize -> Tuple[EA, EA]
         # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        codes, uniques = algos.factorize(
-            np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
-        )
+        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
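
With `ignore_na` dropped from the call, the sparse path simply factorizes the densified values and re-wraps the uniques in a `SparseArray`. Roughly what that looks like through the public API; the commented outputs are indicative:

import numpy as np
import pandas as pd

sp = pd.arrays.SparseArray([1.0, np.nan, 1.0, 2.0])
codes, uniques = sp.factorize()

print(codes)    # [ 0 -1  0  1]   dense ndarray of codes
print(uniques)  # uniques re-wrapped as SparseArray([1.0, 2.0])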

pandas/core/reshape/merge.py

Lines changed: 1 addition & 1 deletion
@@ -2243,7 +2243,7 @@ def _factorize_keys(
     # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
     # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
     # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
-    llab, _, _ = rizer.factorize(lk)  # type: ignore[arg-type]
+    llab = rizer.factorize(lk)  # type: ignore[arg-type]
     # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
     # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
     # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
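
The merge change reflects `Factorizer.factorize` returning just the label array on this branch. For reference, the shared-factorizer pattern behind `_factorize_keys` looks roughly like the sketch below; this uses an internal pandas API whose signature can differ across versions:

import numpy as np
from pandas._libs.hashtable import ObjectFactorizer

lk = np.array(["a", "b", "c"], dtype=object)
rk = np.array(["b", "d"], dtype=object)

# One factorizer for both sides, so equal keys receive equal codes.
rizer = ObjectFactorizer(max(len(lk), len(rk)))
llab = rizer.factorize(lk)          # e.g. [0, 1, 2]
rlab = rizer.factorize(rk)          # e.g. [1, 3] -- "b" reuses the left code
print(llab, rlab, rizer.get_count())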
