Skip to content

Commit aba2551

Browse files
committed
WIP
1 parent c1627dd commit aba2551

File tree

6 files changed: 64 additions and 87 deletions

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 32 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -560,14 +560,13 @@ cdef class {{name}}HashTable(HashTable):
560560
The labels from values to uniques
561561
"""
562562
cdef:
563-
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1, na_code = -1
563+
Py_ssize_t i, idx, count = count_prior, n = len(values)
564564
intp_t[::1] labels
565565
int ret = 0
566566
{{c_type}} val, na_value2
567567
khiter_t k
568568
{{name}}VectorData *ud
569-
bint use_na_value, use_mask, seen_na = False
570-
uint8_t is_na
569+
bint use_na_value, use_mask
571570
uint8_t[:] mask_values
572571

573572
if return_inverse:
@@ -593,43 +592,22 @@ cdef class {{name}}HashTable(HashTable):
593592
for i in range(n):
594593
val = {{to_c_type}}(values[i])
595594

596-
if use_mask:
597-
is_na = mask_values[i]
598-
else:
599-
is_na = (
600-
is_nan_{{c_type}}(val) or
601-
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602-
)
603-
604-
if is_na:
605-
if ignore_na:
606-
# if missing values do not count as unique values (i.e. if
607-
# ignore_na is True), skip the hashtable entry for them,
608-
# and replace the corresponding label with na_sentinel
595+
if ignore_na and use_mask:
596+
if mask_values[i]:
609597
labels[i] = na_sentinel
610598
continue
611-
612-
if not seen_na:
613-
if needs_resize(ud):
614-
with gil:
615-
if uniques.external_view_exists:
616-
raise ValueError("external reference to "
617-
"uniques held, but "
618-
"Vector.resize() needed")
619-
uniques.resize()
620-
append_data_{{dtype}}(ud, val)
621-
na_index = i
622-
if return_inverse:
623-
labels[i] = count
624-
na_code = count
625-
count += 1
626-
seen_na = True
627-
if return_inverse:
628-
idx = na_code
629-
labels[i] = idx
599+
elif ignore_na and (
600+
is_nan_{{c_type}}(val) or
601+
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602+
):
603+
# if missing values do not count as unique values (i.e. if
604+
# ignore_na is True), skip the hashtable entry for them,
605+
# and replace the corresponding label with na_sentinel
606+
labels[i] = na_sentinel
630607
continue
631608

632609
k = kh_get_{{dtype}}(self.table, val)
610+
633611
if k == self.table.n_buckets:
634612
# k hasn't been seen yet
635613
k = kh_put_{{dtype}}(self.table, val, &ret)
@@ -653,8 +631,8 @@ cdef class {{name}}HashTable(HashTable):
653631
labels[i] = idx
654632

655633
if return_inverse:
656-
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
657-
return uniques.to_array(), na_index
634+
return uniques.to_array(), labels.base # .base -> underlying ndarray
635+
return uniques.to_array()
658636

659637
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
660638
"""
@@ -719,7 +697,7 @@ cdef class {{name}}HashTable(HashTable):
719697
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
720698
object na_value=None):
721699
# -> np.ndarray[np.intp]
722-
_, _, labels = self._unique(values, uniques, count_prior=count_prior,
700+
_, labels = self._unique(values, uniques, count_prior=count_prior,
723701
na_sentinel=na_sentinel, na_value=na_value,
724702
ignore_na=True, return_inverse=True)
725703
return labels
@@ -1032,8 +1010,8 @@ cdef class StringHashTable(HashTable):
10321010
uniques.append(values[uindexer[i]])
10331011

10341012
if return_inverse:
1035-
return uniques.to_array(), -1, labels.base # .base -> underlying ndarray
1036-
return uniques.to_array(), -1
1013+
return uniques.to_array(), labels.base # .base -> underlying ndarray
1014+
return uniques.to_array()
10371015

10381016
def unique(self, ndarray[object] values, bint return_inverse=False):
10391017
"""
@@ -1096,7 +1074,7 @@ cdef class StringHashTable(HashTable):
10961074
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
10971075
object na_value=None):
10981076
# -> np.ndarray[np.intp]
1099-
_, _, labels = self._unique(values, uniques, count_prior=count_prior,
1077+
_, labels = self._unique(values, uniques, count_prior=count_prior,
11001078
na_sentinel=na_sentinel, na_value=na_value,
11011079
ignore_na=True, return_inverse=True)
11021080
return labels
@@ -1244,13 +1222,12 @@ cdef class PyObjectHashTable(HashTable):
12441222
The labels from values to uniques
12451223
"""
12461224
cdef:
1247-
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1
1225+
Py_ssize_t i, idx, count = count_prior, n = len(values)
12481226
intp_t[::1] labels
12491227
int ret = 0
12501228
object val
12511229
khiter_t k
1252-
bint use_na_value, seen_na = False, is_na
1253-
intp_t na_code
1230+
bint use_na_value
12541231

12551232
if return_inverse:
12561233
labels = np.empty(n, dtype=np.intp)
@@ -1260,26 +1237,14 @@ cdef class PyObjectHashTable(HashTable):
12601237
val = values[i]
12611238
hash(val)
12621239

1263-
is_na = checknull(val) or (use_na_value and val == na_value)
1264-
1265-
if is_na:
1266-
if ignore_na:
1267-
# if missing values do not count as unique values (i.e. if
1268-
# ignore_na is True), skip the hashtable entry for them, and
1269-
# replace the corresponding label with na_sentinel
1270-
labels[i] = na_sentinel
1271-
continue
1272-
1273-
if not seen_na:
1274-
seen_na = True
1275-
uniques.append(val)
1276-
na_index = i
1277-
if return_inverse:
1278-
labels[i] = count
1279-
na_code = count
1280-
count += 1
1281-
elif return_inverse:
1282-
labels[i] = na_code
1240+
if ignore_na and (
1241+
checknull(val)
1242+
or (use_na_value and val == na_value)
1243+
):
1244+
# if missing values do not count as unique values (i.e. if
1245+
# ignore_na is True), skip the hashtable entry for them, and
1246+
# replace the corresponding label with na_sentinel
1247+
labels[i] = na_sentinel
12831248
continue
12841249

12851250
k = kh_get_pymap(self.table, <PyObject*>val)
@@ -1298,8 +1263,8 @@ cdef class PyObjectHashTable(HashTable):
12981263
labels[i] = idx
12991264

13001265
if return_inverse:
1301-
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
1302-
return uniques.to_array(), na_index
1266+
return uniques.to_array(), labels.base # .base -> underlying ndarray
1267+
return uniques.to_array()
13031268

13041269
def unique(self, ndarray[object] values, bint return_inverse=False):
13051270
"""
@@ -1362,7 +1327,7 @@ cdef class PyObjectHashTable(HashTable):
13621327
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
13631328
object na_value=None):
13641329
# -> np.ndarray[np.intp]
1365-
_, _, labels = self._unique(values, uniques, count_prior=count_prior,
1330+
_, labels = self._unique(values, uniques, count_prior=count_prior,
13661331
na_sentinel=na_sentinel, na_value=na_value,
13671332
ignore_na=True, return_inverse=True)
13681333
return labels

pandas/core/algorithms.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def unique(values):
412412
htable, values = _get_hashtable_algo(values)
413413

414414
table = htable(len(values))
415-
uniques, _ = table.unique(values)
415+
uniques = table.unique(values)
416416
uniques = _reconstruct_data(uniques, original.dtype, original)
417417
return uniques
418418

@@ -497,13 +497,13 @@ def f(c, v):
497497

498498

499499
def factorize_array(
500-
arr: ArrayLike,
500+
values: np.ndarray,
501501
na_sentinel: int = -1,
502502
size_hint: int | None = None,
503503
na_value=None,
504504
mask: npt.NDArray[np.bool_] | None = None,
505505
ignore_na: bool = True,
506-
) -> tuple[npt.NDArray[np.intp], ArrayLike]:
506+
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
507507
"""
508508
Factorize a numpy array to codes and uniques.
509509
@@ -530,11 +530,6 @@ def factorize_array(
530530
codes : ndarray[np.intp]
531531
uniques : ndarray
532532
"""
533-
if is_extension_array_dtype(arr):
534-
values, na_value = arr._values_for_factorize()
535-
else:
536-
values = arr
537-
538533
original = values
539534
if values.dtype.kind in ["m", "M"]:
540535
# _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -546,7 +541,7 @@ def factorize_array(
546541
hash_klass, values = _get_hashtable_algo(values)
547542

548543
table = hash_klass(size_hint or len(values))
549-
uniques, na_index, codes = table.factorize(
544+
uniques, codes = table.factorize(
550545
values,
551546
na_sentinel=na_sentinel,
552547
na_value=na_value,
@@ -558,7 +553,7 @@ def factorize_array(
558553
uniques = _reconstruct_data(uniques, original.dtype, original)
559554

560555
codes = ensure_platform_int(codes)
561-
return codes, uniques, na_index
556+
return codes, uniques
562557

563558

564559
@doc(
@@ -744,7 +739,7 @@ def factorize(
744739
)
745740
else:
746741
values = np.asarray(values) # convert DTA/TDA/MultiIndex
747-
codes, uniques, _ = factorize_array(
742+
codes, uniques = factorize_array(
748743
values,
749744
na_sentinel=na_sentinel,
750745
size_hint=size_hint,

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
108108
def factorize(
109109
self, na_sentinel: int = -1, ignore_na=True
110110
) -> tuple[np.ndarray, ExtensionArray]:
111-
encoded = self._data.dictionary_encode()
111+
null_encoding = "mask" if ignore_na else "encode"
112+
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
112113
indices = pa.chunked_array(
113114
[c.indices for c in encoded.chunks], type=encoded.type.index_type
114115
).to_pandas()

pandas/core/arrays/base.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,12 +1045,11 @@ def factorize(
10451045
# Complete control over factorization.
10461046
arr, na_value = self._values_for_factorize()
10471047

1048-
codes, uniques, _ = factorize_array(
1048+
codes, uniques = factorize_array(
10491049
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na
10501050
)
10511051

10521052
uniques_ea = self._from_factorized(uniques, self)
1053-
# TODO: Use na_index here?
10541053
return codes, uniques_ea
10551054

10561055
_extension_array_shared_docs[

pandas/core/arrays/masked.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -874,17 +874,28 @@ def factorize(
874874
arr = self._data
875875
mask = self._mask
876876

877-
codes, uniques, na_index = factorize_array(
878-
arr, na_sentinel=na_sentinel, mask=mask, ignore_na=ignore_na
877+
codes, uniques = factorize_array(
878+
arr, na_sentinel=na_sentinel, mask=mask, ignore_na=True
879879
)
880880

881881
# check that factorize_array correctly preserves dtype.
882882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
883883

884-
mask = np.zeros(len(uniques), dtype=bool)
885-
if na_index >= 0:
886-
mask[na_index] = True
887-
uniques_ea = type(self)(uniques, mask)
884+
size = len(uniques) if ignore_na else len(uniques) + 1
885+
uniques_mask = np.zeros(size, dtype=bool)
886+
if not ignore_na:
887+
na_index = mask.argmax()
888+
if mask[na_index]:
889+
# Insert na with the proper code
890+
# TODO: This only works with na_sentinel being -1
891+
na_code = codes[:na_index].argmax() + 1
892+
codes[codes >= na_code] += 1
893+
codes[codes == na_sentinel] = na_code
894+
# dummy value for uniques
895+
uniques = np.insert(uniques, na_code, 0)
896+
uniques_mask[na_code] = True
897+
uniques_ea = type(self)(uniques, uniques_mask)
898+
888899
return codes, uniques_ea
889900

890901
@doc(ExtensionArray._values_for_argsort)

pandas/core/arrays/string_.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,12 @@ def _values_for_factorize(self):
382382
arr[mask] = -1
383383
return arr, -1
384384

385+
@classmethod
386+
def _from_factorized(cls, values, original):
387+
assert values.dtype == original._ndarray.dtype
388+
values[values == -1] = None
389+
return original._from_backing_data(values)
390+
385391
def __setitem__(self, key, value):
386392
value = extract_array(value, extract_numpy=True)
387393
if isinstance(value, type(self)):

0 commit comments

Comments
 (0)