@@ -735,6 +735,26 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
735735 return offset , size
736736
737737
def fast_isin(ar1, ar2, invert=False):
    """Sort-based membership test of ``ar1`` against ``ar2``.

    ``ar1`` is first reduced to its unique values with ``pd.factorize`` so
    that only the uniques take part in the sort; the per-unique answer is
    then broadcast back to every element of ``ar1`` through the factorize
    codes.

    Parameters
    ----------
    ar1 : array-like
        1-D values to test. Passed directly to ``pd.factorize``.
    ar2 : array-like
        1-D values to test against.
    invert : bool, default False
        If True, return True where an element of ``ar1`` is *not* in
        ``ar2`` (the negation of the non-inverted result).

    Returns
    -------
    np.ndarray
        Boolean array with one entry per element of ``ar1``.

    Notes
    -----
    ``pd.factorize`` assigns the code ``-1`` to null entries and leaves them
    out of the uniques, so nulls in ``ar1`` never match anything in ``ar2``:
    their result is ``False`` (or ``True`` when ``invert`` is set).
    """
    invert = bool(invert)
    codes, uniques = pd.factorize(ar1, sort=False)
    n_unique = len(uniques)

    combined = np.concatenate((uniques, ar2))
    # We need this to be a stable sort, so always use 'mergesort'
    # here. The values from the first array should always come before
    # the values from the second array.
    order = combined.argsort(kind="mergesort")
    sorted_vals = combined[order]

    # Because the uniques of ar1 sort ahead of equal values from ar2, a
    # unique is present in ar2 exactly when its successor in sorted order
    # compares equal. The final element has no successor, hence "absent".
    flag = np.full(combined.shape, invert, dtype=bool)
    if invert:
        flag[:-1] = sorted_vals[1:] != sorted_vals[:-1]
    else:
        flag[:-1] = sorted_vals[1:] == sorted_vals[:-1]

    ret = np.empty(combined.shape, dtype=bool)
    ret[order] = flag

    # Keep only the slots belonging to ar1's uniques and add one trailing
    # sentinel slot so that the null code (-1) deliberately resolves to
    # "absent" instead of accidentally indexing into ar2's part of ``ret``.
    lookup = np.append(ret[:n_unique], invert)
    return lookup[codes]
756+
757+
738758@overload
739759def factorize_ (
740760 by : T_Bys ,
@@ -830,8 +850,18 @@ def factorize_(
830850 if expect is not None and reindex :
831851 sorter = np .argsort (expect )
832852 groups = expect [(sorter ,)] if sort else expect
853+
833854 idx = np .searchsorted (expect , flat , sorter = sorter )
834- mask = ~ np .isin (flat , expect ) | isnull (flat ) | (idx == len (expect ))
855+ mask = fast_isin (flat , expect , invert = True )
856+ if not np .issubdtype (flat .dtype , np .integer ):
857+ mask |= isnull (flat )
858+ mask |= idx == len (expect )
859+
860+ # idx = np.full(flat.shape, -1)
861+ # result = np.searchsorted(expect.values, flat[~mask], sorter=sorter)
862+ # idx[~mask] = result
863+ # idx = np.searchsorted(expect.values, flat, sorter=sorter)
864+ # idx[mask] = -1
835865 if not sort :
836866 # idx is the index in to the sorted array.
837867 # if we didn't want sorting, unsort it back
0 commit comments