Skip to content

Commit 6e28b43

Browse files
committed
WIP
1 parent 8e66efb commit 6e28b43

File tree

9 files changed

+104
-49
lines changed

9 files changed

+104
-49
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -560,13 +560,14 @@ cdef class {{name}}HashTable(HashTable):
560560
The labels from values to uniques
561561
"""
562562
cdef:
563-
Py_ssize_t i, idx, count = count_prior, n = len(values)
563+
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1, na_code = -1
564564
intp_t[::1] labels
565565
int ret = 0
566566
{{c_type}} val, na_value2
567567
khiter_t k
568568
{{name}}VectorData *ud
569-
bint use_na_value, use_mask
569+
bint use_na_value, use_mask, seen_na = False
570+
uint8_t is_na
570571
uint8_t[:] mask_values
571572

572573
if return_inverse:
@@ -592,20 +593,42 @@ cdef class {{name}}HashTable(HashTable):
592593
for i in range(n):
593594
val = {{to_c_type}}(values[i])
594595

595-
if ignore_na and use_mask:
596-
if mask_values[i]:
596+
if use_mask:
597+
is_na = mask_values[i]
598+
else:
599+
is_na = (
600+
is_nan_{{c_type}}(val) or
601+
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602+
)
603+
604+
if is_na:
605+
if ignore_na:
606+
# if missing values do not count as unique values (i.e. if
607+
# ignore_na is True), skip the hashtable entry for them,
608+
# and replace the corresponding label with na_sentinel
597609
labels[i] = na_sentinel
598610
continue
599-
elif ignore_na and (
600-
is_nan_{{c_type}}(val) or
601-
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602-
):
603-
# if missing values do not count as unique values (i.e. if
604-
# ignore_na is True), skip the hashtable entry for them,
605-
# and replace the corresponding label with na_sentinel
606-
labels[i] = na_sentinel
611+
612+
if not seen_na:
613+
if needs_resize(ud):
614+
with gil:
615+
if uniques.external_view_exists:
616+
raise ValueError("external reference to "
617+
"uniques held, but "
618+
"Vector.resize() needed")
619+
uniques.resize()
620+
append_data_{{dtype}}(ud, val)
621+
na_index = i
622+
if return_inverse:
623+
labels[i] = count
624+
na_code = count
625+
count += 1
626+
if return_inverse:
627+
idx = na_code
628+
labels[i] = idx
607629
continue
608630

631+
609632
k = kh_get_{{dtype}}(self.table, val)
610633

611634
if k == self.table.n_buckets:
@@ -620,6 +643,7 @@ cdef class {{name}}HashTable(HashTable):
620643
"Vector.resize() needed")
621644
uniques.resize()
622645
append_data_{{dtype}}(ud, val)
646+
623647
if return_inverse:
624648
self.table.vals[k] = count
625649
labels[i] = count
@@ -631,8 +655,8 @@ cdef class {{name}}HashTable(HashTable):
631655
labels[i] = idx
632656

633657
if return_inverse:
634-
return uniques.to_array(), labels.base # .base -> underlying ndarray
635-
return uniques.to_array()
658+
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
659+
return uniques.to_array(), na_index
636660

637661
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
638662
"""
@@ -1010,8 +1034,8 @@ cdef class StringHashTable(HashTable):
10101034
uniques.append(values[uindexer[i]])
10111035

10121036
if return_inverse:
1013-
return uniques.to_array(), labels.base # .base -> underlying ndarray
1014-
return uniques.to_array()
1037+
return uniques.to_array(), -1, labels.base # .base -> underlying ndarray
1038+
return uniques.to_array(), -1
10151039

10161040
def unique(self, ndarray[object] values, bint return_inverse=False):
10171041
"""
@@ -1222,12 +1246,13 @@ cdef class PyObjectHashTable(HashTable):
12221246
The labels from values to uniques
12231247
"""
12241248
cdef:
1225-
Py_ssize_t i, idx, count = count_prior, n = len(values)
1249+
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1
12261250
intp_t[::1] labels
12271251
int ret = 0
12281252
object val
12291253
khiter_t k
1230-
bint use_na_value
1254+
bint use_na_value, seen_na, is_na
1255+
intp_t na_code
12311256

12321257
if return_inverse:
12331258
labels = np.empty(n, dtype=np.intp)
@@ -1237,14 +1262,26 @@ cdef class PyObjectHashTable(HashTable):
12371262
val = values[i]
12381263
hash(val)
12391264

1240-
if ignore_na and (
1241-
checknull(val)
1242-
or (use_na_value and val == na_value)
1243-
):
1244-
# if missing values do not count as unique values (i.e. if
1245-
# ignore_na is True), skip the hashtable entry for them, and
1246-
# replace the corresponding label with na_sentinel
1247-
labels[i] = na_sentinel
1265+
is_na = checknull(val) or (use_na_value and val == na_value)
1266+
1267+
if is_na:
1268+
if ignore_na:
1269+
# if missing values do not count as unique values (i.e. if
1270+
# ignore_na is True), skip the hashtable entry for them, and
1271+
# replace the corresponding label with na_sentinel
1272+
labels[i] = na_sentinel
1273+
continue
1274+
1275+
if not seen_na:
1276+
seen_na = True
1277+
uniques.append(val)
1278+
na_index = i
1279+
if return_inverse:
1280+
labels[i] = count
1281+
na_code = count
1282+
count += 1
1283+
elif return_inverse:
1284+
labels[i] = na_code
12481285
continue
12491286

12501287
k = kh_get_pymap(self.table, <PyObject*>val)
@@ -1263,8 +1300,8 @@ cdef class PyObjectHashTable(HashTable):
12631300
labels[i] = idx
12641301

12651302
if return_inverse:
1266-
return uniques.to_array(), labels.base # .base -> underlying ndarray
1267-
return uniques.to_array()
1303+
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
1304+
return uniques.to_array(), na_index
12681305

12691306
def unique(self, ndarray[object] values, bint return_inverse=False):
12701307
"""

pandas/core/algorithms.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def unique(values):
412412
htable, values = _get_hashtable_algo(values)
413413

414414
table = htable(len(values))
415-
uniques = table.unique(values)
415+
uniques, _ = table.unique(values)
416416
uniques = _reconstruct_data(uniques, original.dtype, original)
417417
return uniques
418418

@@ -541,7 +541,7 @@ def factorize_array(
541541
hash_klass, values = _get_hashtable_algo(values)
542542

543543
table = hash_klass(size_hint or len(values))
544-
uniques, codes = table.factorize(
544+
uniques, na_index, codes = table.factorize(
545545
values,
546546
na_sentinel=na_sentinel,
547547
na_value=na_value,
@@ -553,7 +553,7 @@ def factorize_array(
553553
uniques = _reconstruct_data(uniques, original.dtype, original)
554554

555555
codes = ensure_platform_int(codes)
556-
return codes, uniques
556+
return codes, uniques, na_index
557557

558558

559559
@doc(
@@ -733,11 +733,13 @@ def factorize(
733733

734734
if not isinstance(values.dtype, np.dtype):
735735
# i.e. ExtensionDtype
736-
assert dropna or sort
737-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
736+
# assert dropna or sort
737+
codes, uniques = values.factorize(
738+
na_sentinel=na_sentinel, ignore_na=dropna or sort
739+
)
738740
else:
739741
values = np.asarray(values) # convert DTA/TDA/MultiIndex
740-
codes, uniques = factorize_array(
742+
codes, uniques, _ = factorize_array(
741743
values,
742744
na_sentinel=na_sentinel,
743745
size_hint=size_hint,
@@ -749,8 +751,7 @@ def factorize(
749751
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
750752
)
751753

752-
# TODO: Fix
753-
if not dropna and (sort or not isinstance(values.dtype, np.dtype)):
754+
if not dropna and sort:
754755
code_is_na = codes == na_sentinel
755756
if code_is_na.any():
756757
# na_value is set based on the dtype of uniques, and compat set to False is

pandas/core/arrays/_mixins.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,10 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
605605
return type(self)(self._data)
606606

607607
@doc(ExtensionArray.factorize)
608-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
608+
# TODO: ignore_na?
609+
def factorize(
610+
self, na_sentinel: int = -1, ignore_na: bool = True
611+
) -> tuple[np.ndarray, ExtensionArray]:
609612
encoded = self._data.dictionary_encode()
610613
indices = pa.chunked_array(
611614
[c.indices for c in encoded.chunks], type=encoded.type.index_type

pandas/core/arrays/base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,7 +1002,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021002
"""
10031003
return self.astype(object), np.nan
10041004

1005-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
1005+
def factorize(
1006+
self, na_sentinel: int = -1, ignore_na: bool = True
1007+
) -> tuple[np.ndarray, ExtensionArray]:
10061008
"""
10071009
Encode the extension array as an enumerated type.
10081010
@@ -1043,8 +1045,8 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10431045
# Complete control over factorization.
10441046
arr, na_value = self._values_for_factorize()
10451047

1046-
codes, uniques = factorize_array(
1047-
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True
1048+
codes, uniques, _ = factorize_array(
1049+
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na
10481050
)
10491051

10501052
uniques_ea = self._from_factorized(uniques, self)

pandas/core/arrays/datetimelike.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,8 @@ def _with_freq(self, freq):
18781878

18791879
# --------------------------------------------------------------
18801880

1881-
def factorize(self, na_sentinel=-1, sort: bool = False):
1881+
# TODO: Fix?
1882+
def factorize(self, na_sentinel=-1, sort: bool = False, ignore_na: bool = True):
18821883
if self.freq is not None:
18831884
# We must be unique, so can short-circuit (and retain freq)
18841885
codes = np.arange(len(self), dtype=np.intp)
@@ -1888,7 +1889,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
18881889
uniques = uniques[::-1]
18891890
return codes, uniques
18901891
# FIXME: shouldn't get here; we are ignoring sort
1891-
return super().factorize(na_sentinel=na_sentinel)
1892+
return super().factorize(na_sentinel=na_sentinel, ignore_na=ignore_na)[0]
18921893

18931894

18941895
# -------------------------------------------------------------------

pandas/core/arrays/masked.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -868,16 +868,23 @@ def searchsorted(
868868
return self._data.searchsorted(value, side=side, sorter=sorter)
869869

870870
@doc(ExtensionArray.factorize)
871-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
871+
def factorize(
872+
self, na_sentinel: int = -1, ignore_na: bool = True
873+
) -> tuple[np.ndarray, ExtensionArray]:
872874
arr = self._data
873875
mask = self._mask
874876

875-
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
877+
codes, uniques, na_index = factorize_array(
878+
arr, na_sentinel=na_sentinel, mask=mask, ignore_na=ignore_na
879+
)
876880

877881
# check that factorize_array correctly preserves dtype.
878882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
879883

880-
uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
884+
mask = np.zeros(len(uniques), dtype=bool)
885+
if na_index >= 0:
886+
mask[na_index] = True
887+
uniques_ea = type(self)(uniques, mask)
881888
return codes, uniques_ea
882889

883890
@doc(ExtensionArray._values_for_argsort)

pandas/core/arrays/sparse/array.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -847,13 +847,17 @@ def _values_for_factorize(self):
847847
# Still override this for hash_pandas_object
848848
return np.asarray(self), self.fill_value
849849

850-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
850+
def factorize(
851+
self, na_sentinel: int = -1, ignore_na: bool = True
852+
) -> tuple[np.ndarray, SparseArray]:
851853
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
852854
# The sparsity on this is backwards from what Sparse would want. Want
853855
# ExtensionArray.factorize -> Tuple[EA, EA]
854856
# Given that we have to return a dense array of codes, why bother
855857
# implementing an efficient factorize?
856-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
858+
codes, uniques = algos.factorize(
859+
np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
860+
)
857861
uniques_sp = SparseArray(uniques, dtype=self.dtype)
858862
return codes, uniques_sp
859863

pandas/core/groupby/grouper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
687687
na_sentinel = None
688688
else:
689689
na_sentinel = -1
690-
codes, uniques = algorithms.factorize(
690+
codes, uniques, _ = algorithms.factorize(
691691
self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
692692
)
693693
return codes, uniques

pandas/core/reshape/merge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2243,7 +2243,7 @@ def _factorize_keys(
22432243
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
22442244
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
22452245
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2246-
llab = rizer.factorize(lk) # type: ignore[arg-type]
2246+
llab, _ = rizer.factorize(lk) # type: ignore[arg-type]
22472247
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
22482248
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
22492249
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"

0 commit comments

Comments (0)