@@ -22,11 +22,13 @@ namespace nb = nanobind;
22
22
/// and supports arbitrary types
23
23
///
24
24
/// Generic hash functor that forwards to std::hash<T>.
///
/// The float and double explicit specializations of operator() that follow
/// this struct in the file return a hash of 0 for NaN inputs; every other
/// type goes through the primary template below.
template <typename T> struct PandasHashFunction {
  /// @param value key to hash
  /// @return the result of std::hash<T> applied to `value`
  constexpr auto operator()(const T &value) const {
    return std::hash<T>()(value);
  }
};
27
29
28
30
template <>
29
- auto PandasHashFunction<float >::operator ()(const float &value) const {
31
+ constexpr auto PandasHashFunction<float >::operator ()(const float &value) const {
30
32
if (std::isnan (value)) {
31
33
return static_cast <decltype (std::hash<float >()(value))>(0 );
32
34
}
@@ -35,7 +37,8 @@ auto PandasHashFunction<float>::operator()(const float &value) const {
35
37
}
36
38
37
39
template <>
38
- auto PandasHashFunction<double >::operator ()(const double &value) const {
40
+ constexpr auto
41
+ PandasHashFunction<double >::operator ()(const double &value) const {
39
42
if (std::isnan (value)) {
40
43
return static_cast <decltype (std::hash<double >()(value))>(0 );
41
44
}
@@ -67,6 +70,15 @@ auto PandasHashEquality<double>::operator()(const double &lhs,
67
70
return lhs == rhs;
68
71
}
69
72
73
/// Decide whether a single element counts as missing.
///
/// @tparam T            element type of the value being inspected
/// @param mask_value    mask entry for the element; true flags it as missing
/// @param scalar_value  the element itself; only inspected when T is a
///                      floating-point type
/// @return true when the mask flags the element, or when T is floating-point
///         and the element is NaN
///
/// The value is taken by const reference so that temporaries and literals can
/// be passed, and so that T is deduced without cv-qualifiers.
template <typename T>
auto PandasIsNA(bool mask_value, const T &scalar_value) -> bool {
  // TODO: should NaN / pd.NA always be treated the same?
  if constexpr (std::is_floating_point_v<T>) {
    return mask_value || std::isnan(scalar_value);
  } else {
    return mask_value;
  }
}
81
+
70
82
template <typename T> class PandasVector {
71
83
public:
72
84
explicit PandasVector<T>() : external_view_exists_(false ) {}
@@ -119,8 +131,14 @@ template <typename T, bool IsMasked> class PandasHashTable {
119
131
uint64_t >::type;
120
132
explicit PandasHashTable<T, IsMasked>() = default ;
121
133
explicit PandasHashTable<T, IsMasked>(HashValueT new_size) {
134
+ #if __APPLE__
135
+ // macOS cannot resolve size_t to uint32_t or uint64_t that khash needs
136
+ hash_map_.resize (static_cast <uint64_t >(new_size));
137
+ hash_set_.resize (static_cast < uint64_t < (new_size));
138
+ #else
122
139
hash_map_.resize (new_size);
123
140
hash_set_.resize (new_size);
141
+ #endif
124
142
}
125
143
126
144
// Returns hash_map_.size(), i.e. the number of entries held by the map.
// NOTE(review): presumably bound as Python's len() for this type — confirm
// against the nanobind module definition.
auto __len__ () const noexcept { return hash_map_.size (); }
@@ -309,15 +327,18 @@ template <typename T, bool IsMasked> class PandasHashTable {
309
327
mask_vector = UniqueWithResultMask (values, uniques, mask);
310
328
311
329
return nb::make_tuple (uniques.ToNdArray (), mask_vector.ToNdArray ());
330
+ } else {
331
+ UniquesOnly (values, uniques);
332
+ const auto out_array = uniques.ToNdArray ();
333
+ return nb::cast (out_array);
312
334
}
313
- UniquesOnly (values, uniques, mask);
314
- const auto out_array = uniques.ToNdArray ();
315
- return nb::cast (out_array);
335
+
336
+ throw std::runtime_error (" Should not hit this" );
316
337
}
317
338
318
339
auto Factorize (const nb::ndarray<const T, nb::ndim<1 >> &values,
319
340
Py_ssize_t na_sentinel = -1 , nb::object na_value = nb::none(),
320
- nb::object mask = nb::none(), bool ignore_na = true )
341
+ nb::object mask = nb::none(), bool ignore_na = false )
321
342
-> nb::object {
322
343
PandasVector<T> uniques;
323
344
@@ -444,14 +465,14 @@ template <typename T, bool IsMasked> class PandasHashTable {
444
465
const auto mask_v = mask.view ();
445
466
446
467
for (auto i = decltype (n){0 }; i < n; i++) {
468
+ const auto val = values_v (i);
447
469
if constexpr (IgnoreNA) {
448
- if (mask_v (i)) {
470
+ if (PandasIsNA ( mask_v (i), val )) {
449
471
labels[i] = na_sentinel;
450
472
continue ;
451
473
}
452
474
}
453
475
454
- const auto val = values_v (i);
455
476
auto k = hash_map_.get (val);
456
477
if (k == hash_map_.end ()) {
457
478
int dummy;
@@ -472,7 +493,7 @@ template <typename T, bool IsMasked> class PandasHashTable {
472
493
const auto val = values_v (i);
473
494
474
495
if constexpr (IgnoreNA) {
475
- if constexpr (std::is_same_v<T, float > || std::is_same_v<T, double >) {
496
+ if constexpr (std::is_floating_point_v<T >) {
476
497
if (std::isnan (val)) {
477
498
labels[i] = na_sentinel;
478
499
continue ;
@@ -534,17 +555,12 @@ template <typename T, bool IsMasked> class PandasHashTable {
534
555
for (auto i = decltype (n){0 }; i < n; i++) {
535
556
const auto val = values_v (i);
536
557
537
- bool should_append_na;
538
- // NaN / pd.NA are treated the same? hmmm
539
- if constexpr (std::is_floating_point_v<T>) {
540
- should_append_na = !seen_na && (mask_v (i) || std::isnan (val));
541
- } else {
542
- should_append_na = !seen_na && mask_v (i);
543
- }
544
- if (should_append_na) {
545
- seen_na = true ;
546
- uniques.Append (val);
547
- result.Append (1 );
558
+ if (PandasIsNA (mask_v (i), val)) {
559
+ if (!seen_na) {
560
+ uniques.Append (val);
561
+ result.Append (1 );
562
+ seen_na = true ;
563
+ }
548
564
continue ;
549
565
}
550
566
@@ -572,44 +588,19 @@ template <typename T, bool IsMasked> class PandasHashTable {
572
588
}
573
589
574
590
auto UniquesOnly (const nb::ndarray<const T, nb::ndim<1 >> &values,
575
- PandasVector<T> &uniques,
576
- [[maybe_unused]] nb::object mask_obj = nb::none()) -> void {
577
- if constexpr (IsMasked) {
578
- if (mask_obj.is_none ()) {
579
- throw std::invalid_argument (" mask must not be None!" );
580
- }
581
- }
591
+ PandasVector<T> &uniques) -> void {
582
592
583
593
const auto values_v = values.view ();
584
594
const auto n = values.shape (0 );
585
595
586
- if constexpr (IsMasked) {
587
- using MaskT = nb::ndarray<const uint8_t , nb::ndim<1 >>;
588
- MaskT mask;
589
- if (!nb::try_cast<MaskT>(mask_obj, mask, false )) {
590
- throw std::invalid_argument (" Could not convert mask to uint8_t array!" );
591
- }
592
- nb::call_guard<nb::gil_scoped_release>();
593
-
594
- for (auto i = decltype (n){0 }; i < n; i++) {
595
- const auto val = values_v (i);
596
- auto k = hash_map_.get (val);
597
- if (k == hash_map_.end ()) {
598
- int dummy;
599
- k = hash_map_.put (val, &dummy);
600
- uniques.Append (val);
601
- }
602
- }
603
- } else {
604
- nb::call_guard<nb::gil_scoped_release>();
605
- for (auto i = decltype (n){0 }; i < n; i++) {
606
- const auto val = values_v (i);
607
- auto k = hash_map_.get (val);
608
- if (k == hash_map_.end ()) {
609
- int dummy;
610
- k = hash_map_.put (val, &dummy);
611
- uniques.Append (val);
612
- }
596
+ nb::call_guard<nb::gil_scoped_release>();
597
+ for (auto i = decltype (n){0 }; i < n; i++) {
598
+ const auto val = values_v (i);
599
+ auto k = hash_map_.get (val);
600
+ if (k == hash_map_.end ()) {
601
+ int dummy;
602
+ k = hash_map_.put (val, &dummy);
603
+ uniques.Append (val);
613
604
}
614
605
}
615
606
0 commit comments