@@ -22,11 +22,13 @@ namespace nb = nanobind;
/// and support arbitrary types
///
template <typename T> struct PandasHashFunction {
  // Generic case: defer entirely to std::hash<T>. The call operator is
  // constexpr so that it can take part in constant evaluation whenever
  // std::hash<T> itself allows it.
  //
  // @param value key to hash
  // @return the value produced by std::hash<T>()(value)
  constexpr auto operator()(const T &value) const {
    return std::hash<T>()(value);
  }
};
2729
2830template <>
29- auto PandasHashFunction<float >::operator ()(const float &value) const {
31+ constexpr auto PandasHashFunction<float >::operator ()(const float &value) const {
3032 if (std::isnan (value)) {
3133 return static_cast <decltype (std::hash<float >()(value))>(0 );
3234 }
@@ -35,7 +37,8 @@ auto PandasHashFunction<float>::operator()(const float &value) const {
3537}
3638
3739template <>
38- auto PandasHashFunction<double >::operator ()(const double &value) const {
40+ constexpr auto
41+ PandasHashFunction<double >::operator ()(const double &value) const {
3942 if (std::isnan (value)) {
4043 return static_cast <decltype (std::hash<double >()(value))>(0 );
4144 }
@@ -67,6 +70,15 @@ auto PandasHashEquality<double>::operator()(const double &lhs,
6770 return lhs == rhs;
6871}
6972
/// Reports whether an element should be treated as missing (NA).
///
/// @tparam T element type.
/// @param mask_value   mask entry for the element; true marks it as missing.
/// @param scalar_value the element itself; only inspected when T is a
///                     floating-point type. Taken by const reference so that
///                     const values and temporaries can be passed.
/// @return true when mask_value is set, or when T is floating-point and
///         scalar_value is NaN; otherwise false.
template <typename T>
auto PandasIsNA(bool mask_value, [[maybe_unused]] const T &scalar_value) {
  // TODO: should NaN / pd.NA always be treated the same?
  if constexpr (std::is_floating_point_v<T>) {
    return mask_value || std::isnan(scalar_value);
  } else {
    return mask_value;
  }
}
81+
7082template <typename T> class PandasVector {
7183public:
7284 explicit PandasVector<T>() : external_view_exists_(false ) {}
@@ -119,8 +131,14 @@ template <typename T, bool IsMasked> class PandasHashTable {
119131 uint64_t >::type;
120132 explicit PandasHashTable<T, IsMasked>() = default ;
121133 explicit PandasHashTable<T, IsMasked>(HashValueT new_size) {
134+ #if __APPLE__
135+ // macOS cannot resolve size_t to uint32_t or uint64_t that khash needs
136+ hash_map_.resize (static_cast <uint64_t >(new_size));
137+ hash_set_.resize (static_cast < uint64_t < (new_size));
138+ #else
122139 hash_map_.resize (new_size);
123140 hash_set_.resize (new_size);
141+ #endif
124142 }
125143
126144 auto __len__ () const noexcept { return hash_map_.size (); }
@@ -309,15 +327,18 @@ template <typename T, bool IsMasked> class PandasHashTable {
309327 mask_vector = UniqueWithResultMask (values, uniques, mask);
310328
311329 return nb::make_tuple (uniques.ToNdArray (), mask_vector.ToNdArray ());
330+ } else {
331+ UniquesOnly (values, uniques);
332+ const auto out_array = uniques.ToNdArray ();
333+ return nb::cast (out_array);
312334 }
313- UniquesOnly (values, uniques, mask);
314- const auto out_array = uniques.ToNdArray ();
315- return nb::cast (out_array);
335+
336+ throw std::runtime_error (" Should not hit this" );
316337 }
317338
318339 auto Factorize (const nb::ndarray<const T, nb::ndim<1 >> &values,
319340 Py_ssize_t na_sentinel = -1 , nb::object na_value = nb::none(),
320- nb::object mask = nb::none(), bool ignore_na = true )
341+ nb::object mask = nb::none(), bool ignore_na = false )
321342 -> nb::object {
322343 PandasVector<T> uniques;
323344
@@ -444,14 +465,14 @@ template <typename T, bool IsMasked> class PandasHashTable {
444465 const auto mask_v = mask.view ();
445466
446467 for (auto i = decltype (n){0 }; i < n; i++) {
468+ const auto val = values_v (i);
447469 if constexpr (IgnoreNA) {
448- if (mask_v (i)) {
470+ if (PandasIsNA ( mask_v (i), val )) {
449471 labels[i] = na_sentinel;
450472 continue ;
451473 }
452474 }
453475
454- const auto val = values_v (i);
455476 auto k = hash_map_.get (val);
456477 if (k == hash_map_.end ()) {
457478 int dummy;
@@ -472,7 +493,7 @@ template <typename T, bool IsMasked> class PandasHashTable {
472493 const auto val = values_v (i);
473494
474495 if constexpr (IgnoreNA) {
475- if constexpr (std::is_same_v<T, float > || std::is_same_v<T, double >) {
496+ if constexpr (std::is_floating_point_v<T >) {
476497 if (std::isnan (val)) {
477498 labels[i] = na_sentinel;
478499 continue ;
@@ -534,17 +555,12 @@ template <typename T, bool IsMasked> class PandasHashTable {
534555 for (auto i = decltype (n){0 }; i < n; i++) {
535556 const auto val = values_v (i);
536557
537- bool should_append_na;
538- // NaN / pd.NA are treated the same? hmmm
539- if constexpr (std::is_floating_point_v<T>) {
540- should_append_na = !seen_na && (mask_v (i) || std::isnan (val));
541- } else {
542- should_append_na = !seen_na && mask_v (i);
543- }
544- if (should_append_na) {
545- seen_na = true ;
546- uniques.Append (val);
547- result.Append (1 );
558+ if (PandasIsNA (mask_v (i), val)) {
559+ if (!seen_na) {
560+ uniques.Append (val);
561+ result.Append (1 );
562+ seen_na = true ;
563+ }
548564 continue ;
549565 }
550566
@@ -572,44 +588,19 @@ template <typename T, bool IsMasked> class PandasHashTable {
572588 }
573589
574590 auto UniquesOnly (const nb::ndarray<const T, nb::ndim<1 >> &values,
575- PandasVector<T> &uniques,
576- [[maybe_unused]] nb::object mask_obj = nb::none()) -> void {
577- if constexpr (IsMasked) {
578- if (mask_obj.is_none ()) {
579- throw std::invalid_argument (" mask must not be None!" );
580- }
581- }
591+ PandasVector<T> &uniques) -> void {
582592
583593 const auto values_v = values.view ();
584594 const auto n = values.shape (0 );
585595
586- if constexpr (IsMasked) {
587- using MaskT = nb::ndarray<const uint8_t , nb::ndim<1 >>;
588- MaskT mask;
589- if (!nb::try_cast<MaskT>(mask_obj, mask, false )) {
590- throw std::invalid_argument (" Could not convert mask to uint8_t array!" );
591- }
592- nb::call_guard<nb::gil_scoped_release>();
593-
594- for (auto i = decltype (n){0 }; i < n; i++) {
595- const auto val = values_v (i);
596- auto k = hash_map_.get (val);
597- if (k == hash_map_.end ()) {
598- int dummy;
599- k = hash_map_.put (val, &dummy);
600- uniques.Append (val);
601- }
602- }
603- } else {
604- nb::call_guard<nb::gil_scoped_release>();
605- for (auto i = decltype (n){0 }; i < n; i++) {
606- const auto val = values_v (i);
607- auto k = hash_map_.get (val);
608- if (k == hash_map_.end ()) {
609- int dummy;
610- k = hash_map_.put (val, &dummy);
611- uniques.Append (val);
612- }
596+ nb::call_guard<nb::gil_scoped_release>();
597+ for (auto i = decltype (n){0 }; i < n; i++) {
598+ const auto val = values_v (i);
599+ auto k = hash_map_.get (val);
600+ if (k == hash_map_.end ()) {
601+ int dummy;
602+ k = hash_map_.put (val, &dummy);
603+ uniques.Append (val);
613604 }
614605 }
615606
0 commit comments