@@ -76,16 +76,16 @@ template <typename T> auto PandasIsNA(bool mask_value, T &scalar_value) {
7676 }
7777}
7878
79- template <typename T> auto MaybeResizeKlibContainer (T &container) {
80- const auto current_size = container.size ();
81- if (container.n_buckets () == current_size) {
82- container.resize (current_size * 4 );
83- }
84- }
85-
8679template <typename T> class PandasVector {
8780public:
88- explicit PandasVector<T>() : external_view_exists_(false ) {}
81+ static constexpr size_t INIT_VEC_CAP = 128 ;
82+
83+ explicit PandasVector<T>() : external_view_exists_(false ) {
84+ vec_.reserve (INIT_VEC_CAP);
85+ }
86+ explicit PandasVector<T>(std::vector<T>&& vec) : vec_(vec), external_view_exists_(false ) {
87+ vec_.reserve (INIT_VEC_CAP);
88+ }
8989 ~PandasVector<T>() = default ;
9090 PandasVector<T>(PandasVector<T> const &) = delete ;
9191 void operator =(PandasVector<T> const &) = delete ;
@@ -137,8 +137,9 @@ template <typename T, bool IsMasked> class PandasHashTable {
137137 explicit PandasHashTable<T, IsMasked>(HashValueT new_size) {
138138#if __APPLE__
139139 // macOS cannot resolve size_t to uint32_t or uint64_t that khash needs
140- hash_map_.resize (static_cast <uint64_t >(new_size));
141- hash_set_.resize (static_cast <uint64_t >(new_size));
140+ const auto ns = static_cast <uint64_t >(new_size);
141+ hash_map_.resize (ns);
142+ hash_set_.resize (ns);
142143#else
143144 hash_map_.resize (new_size);
144145 hash_set_.resize (new_size);
@@ -226,7 +227,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
226227 const auto n = values_v.shape (0 );
227228 for (auto i = decltype (n){0 }; i < n; i++) {
228229 hash_map_[keys_v (i)] = values_v (i);
229- MaybeResizeKlibContainer (hash_map_);
230230 }
231231 }
232232
@@ -251,15 +251,13 @@ template <typename T, bool IsMasked> class PandasHashTable {
251251 na_position = i;
252252 } else {
253253 hash_map_[values_v (i)] = i;
254- MaybeResizeKlibContainer (hash_map_);
255254 }
256255 }
257256 na_position_ = na_position;
258257 } else {
259258 for (auto i = decltype (n){0 }; i < n; i++) {
260259 const auto key = values_v (i);
261260 hash_map_[key] = i;
262- MaybeResizeKlibContainer (hash_map_);
263261 }
264262 }
265263 }
@@ -428,7 +426,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
428426 int dummy;
429427 k = hash_map_.put (val, &dummy);
430428 hash_map_.value (k) = count;
431- MaybeResizeKlibContainer (hash_map_);
432429 uniques.Append (val);
433430 labels[i] = count;
434431 count++;
@@ -487,7 +484,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
487484 k = hash_map_.put (val, &dummy);
488485 uniques.Append (val);
489486 hash_map_.value (k) = count_prior;
490- MaybeResizeKlibContainer (hash_map_);
491487 labels[i] = count_prior;
492488 count_prior++;
493489 } else {
@@ -521,7 +517,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
521517 k = hash_map_.put (val, &dummy);
522518 uniques.Append (val);
523519 hash_map_.value (k) = count_prior;
524- MaybeResizeKlibContainer (hash_map_);
525520 labels[i] = count_prior;
526521 count_prior++;
527522 } else {
@@ -550,8 +545,10 @@ template <typename T, bool IsMasked> class PandasHashTable {
550545
551546 const auto values_v = values.view ();
552547 const auto n = values.shape (0 );
553- PandasVector<uint8_t > result;
548+ bool seen_na = false ;
549+ auto na_pos = decltype (n){0 };
554550
551+ std::vector<uint8_t > missing_vec;
555552 if constexpr (IsMasked) {
556553 using MaskT = nb::ndarray<const uint8_t , nb::ndim<1 >>;
557554 MaskT mask;
@@ -560,43 +557,45 @@ template <typename T, bool IsMasked> class PandasHashTable {
560557 }
561558 nb::call_guard<nb::gil_scoped_release>();
562559 const auto mask_v = mask.view ();
563-
564- bool seen_na = false ;
565560 for (auto i = decltype (n){0 }; i < n; i++) {
566561 const auto val = values_v (i);
567562
568563 if (PandasIsNA (mask_v (i), val)) {
569564 if (!seen_na) {
570565 uniques.Append (val);
571- result. Append ( 1 ) ;
566+ na_pos = i ;
572567 seen_na = true ;
573568 }
574569 continue ;
575570 }
576571
577572 int absent;
578573 hash_set_.put (val, &absent);
579- MaybeResizeKlibContainer (hash_set_);
580574 if (absent) {
581575 uniques.Append (val);
582- result.Append (0 );
583576 }
584577 }
585578 } else {
579+ // TODO: why do we even have this branch?
586580 nb::call_guard<nb::gil_scoped_release>();
587581 for (auto i = decltype (n){0 }; i < n; i++) {
588582 const auto val = values_v (i);
589583 int absent;
590584 hash_set_.put (val, &absent);
591- MaybeResizeKlibContainer (hash_set_);
592585 if (absent) {
593586 uniques.Append (val);
594- result.Append (0 );
595587 }
596588 }
597589 }
598590
599- return result;
591+
592+ std::vector<uint8_t > tmp;
593+ tmp.resize (hash_set_.n_buckets (), 0 );
594+ if (seen_na) {
595+ tmp[na_pos] = 1 ;
596+ }
597+
598+ return PandasVector (std::move (tmp));
600599 }
601600
602601 auto UniquesOnly (const nb::ndarray<const T, nb::ndim<1 >> &values,
@@ -612,7 +611,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
612611 if (k == hash_map_.end ()) {
613612 int dummy;
614613 k = hash_map_.put (val, &dummy);
615- MaybeResizeKlibContainer (hash_map_);
616614 uniques.Append (val);
617615 }
618616 }
0 commit comments