@@ -560,14 +560,13 @@ cdef class {{name}}HashTable(HashTable):
560
560
The labels from values to uniques
561
561
"""
562
562
cdef:
563
- Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1, na_code = -1
563
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
564
564
intp_t[::1] labels
565
565
int ret = 0
566
566
{{c_type}} val, na_value2
567
567
khiter_t k
568
568
{{name}}VectorData *ud
569
- bint use_na_value, use_mask, seen_na = False
570
- uint8_t is_na
569
+ bint use_na_value, use_mask
571
570
uint8_t[:] mask_values
572
571
573
572
if return_inverse:
@@ -593,43 +592,22 @@ cdef class {{name}}HashTable(HashTable):
593
592
for i in range(n):
594
593
val = {{to_c_type}}(values[i])
595
594
596
- if use_mask:
597
- is_na = mask_values[i]
598
- else:
599
- is_na = (
600
- is_nan_{{c_type}}(val) or
601
- (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602
- )
603
-
604
- if is_na:
605
- if ignore_na:
606
- # if missing values do not count as unique values (i.e. if
607
- # ignore_na is True), skip the hashtable entry for them,
608
- # and replace the corresponding label with na_sentinel
595
+ if ignore_na and use_mask:
596
+ if mask_values[i]:
609
597
labels[i] = na_sentinel
610
598
continue
611
-
612
- if not seen_na:
613
- if needs_resize(ud):
614
- with gil:
615
- if uniques.external_view_exists:
616
- raise ValueError("external reference to "
617
- "uniques held, but "
618
- "Vector.resize() needed")
619
- uniques.resize()
620
- append_data_{{dtype}}(ud, val)
621
- na_index = i
622
- if return_inverse:
623
- labels[i] = count
624
- na_code = count
625
- count += 1
626
- seen_na = True
627
- if return_inverse:
628
- idx = na_code
629
- labels[i] = idx
599
+ elif ignore_na and (
600
+ is_nan_{{c_type}}(val) or
601
+ (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602
+ ):
603
+ # if missing values do not count as unique values (i.e. if
604
+ # ignore_na is True), skip the hashtable entry for them,
605
+ # and replace the corresponding label with na_sentinel
606
+ labels[i] = na_sentinel
630
607
continue
631
608
632
609
k = kh_get_{{dtype}}(self.table, val)
610
+
633
611
if k == self.table.n_buckets:
634
612
# k hasn't been seen yet
635
613
k = kh_put_{{dtype}}(self.table, val, &ret)
@@ -653,8 +631,8 @@ cdef class {{name}}HashTable(HashTable):
653
631
labels[i] = idx
654
632
655
633
if return_inverse:
656
- return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
657
- return uniques.to_array(), na_index
634
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
635
+ return uniques.to_array()
658
636
659
637
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
660
638
"""
@@ -719,7 +697,7 @@ cdef class {{name}}HashTable(HashTable):
719
697
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
720
698
object na_value=None):
721
699
# -> np.ndarray[np.intp]
722
- _, _, labels = self._unique(values, uniques, count_prior=count_prior,
700
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
723
701
na_sentinel=na_sentinel, na_value=na_value,
724
702
ignore_na=True, return_inverse=True)
725
703
return labels
@@ -1032,8 +1010,8 @@ cdef class StringHashTable(HashTable):
1032
1010
uniques.append(values[uindexer[i]])
1033
1011
1034
1012
if return_inverse:
1035
- return uniques.to_array(), -1, labels.base # .base -> underlying ndarray
1036
- return uniques.to_array(), -1
1013
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
1014
+ return uniques.to_array()
1037
1015
1038
1016
def unique(self, ndarray[object] values, bint return_inverse=False):
1039
1017
"""
@@ -1096,7 +1074,7 @@ cdef class StringHashTable(HashTable):
1096
1074
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1097
1075
object na_value=None):
1098
1076
# -> np.ndarray[np.intp]
1099
- _, _, labels = self._unique(values, uniques, count_prior=count_prior,
1077
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
1100
1078
na_sentinel=na_sentinel, na_value=na_value,
1101
1079
ignore_na=True, return_inverse=True)
1102
1080
return labels
@@ -1244,13 +1222,12 @@ cdef class PyObjectHashTable(HashTable):
1244
1222
The labels from values to uniques
1245
1223
"""
1246
1224
cdef:
1247
- Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1
1225
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
1248
1226
intp_t[::1] labels
1249
1227
int ret = 0
1250
1228
object val
1251
1229
khiter_t k
1252
- bint use_na_value, seen_na = False, is_na
1253
- intp_t na_code
1230
+ bint use_na_value
1254
1231
1255
1232
if return_inverse:
1256
1233
labels = np.empty(n, dtype=np.intp)
@@ -1260,26 +1237,14 @@ cdef class PyObjectHashTable(HashTable):
1260
1237
val = values[i]
1261
1238
hash(val)
1262
1239
1263
- is_na = checknull(val) or (use_na_value and val == na_value)
1264
-
1265
- if is_na:
1266
- if ignore_na:
1267
- # if missing values do not count as unique values (i.e. if
1268
- # ignore_na is True), skip the hashtable entry for them, and
1269
- # replace the corresponding label with na_sentinel
1270
- labels[i] = na_sentinel
1271
- continue
1272
-
1273
- if not seen_na:
1274
- seen_na = True
1275
- uniques.append(val)
1276
- na_index = i
1277
- if return_inverse:
1278
- labels[i] = count
1279
- na_code = count
1280
- count += 1
1281
- elif return_inverse:
1282
- labels[i] = na_code
1240
+ if ignore_na and (
1241
+ checknull(val)
1242
+ or (use_na_value and val == na_value)
1243
+ ):
1244
+ # if missing values do not count as unique values (i.e. if
1245
+ # ignore_na is True), skip the hashtable entry for them, and
1246
+ # replace the corresponding label with na_sentinel
1247
+ labels[i] = na_sentinel
1283
1248
continue
1284
1249
1285
1250
k = kh_get_pymap(self.table, <PyObject*>val)
@@ -1298,8 +1263,8 @@ cdef class PyObjectHashTable(HashTable):
1298
1263
labels[i] = idx
1299
1264
1300
1265
if return_inverse:
1301
- return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
1302
- return uniques.to_array(), na_index
1266
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
1267
+ return uniques.to_array()
1303
1268
1304
1269
def unique(self, ndarray[object] values, bint return_inverse=False):
1305
1270
"""
@@ -1362,7 +1327,7 @@ cdef class PyObjectHashTable(HashTable):
1362
1327
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1363
1328
object na_value=None):
1364
1329
# -> np.ndarray[np.intp]
1365
- _, _, labels = self._unique(values, uniques, count_prior=count_prior,
1330
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
1366
1331
na_sentinel=na_sentinel, na_value=na_value,
1367
1332
ignore_na=True, return_inverse=True)
1368
1333
return labels
0 commit comments