factorize early as much as possible

dcherian · dcherian · commit d1a397b09a87 · 2023-05-01T15:38:05.000-06:00
diff --git a/flox/core.py b/flox/core.py
@@ -1598,7 +1598,11 @@ def _lazy_factorize_wrapper(*by: T_By, **kwargs) -> np.ndarray:
 
 
 def _factorize_multiple(
-    by: T_Bys, expected_groups: T_ExpectIndexTuple, any_by_dask: bool, reindex: bool
+    by: T_Bys,
+    expected_groups: T_ExpectIndexTuple,
+    any_by_dask: bool,
+    reindex: bool,
+    sort: bool = True,
 ) -> tuple[tuple[np.ndarray], tuple[np.ndarray, ...], tuple[int, ...]]:
     if any_by_dask:
         import dask.array
@@ -1617,6 +1621,7 @@ def _factorize_multiple(
             expected_groups=expected_groups,
             fastpath=True,
             reindex=reindex,
+            sort=sort,
         )
 
         fg, gs = [], []
@@ -1643,6 +1648,7 @@ def _factorize_multiple(
             expected_groups=expected_groups,
             fastpath=True,
             reindex=reindex,
+            sort=sort,
         )
 
     return (group_idx,), found_groups, grp_shape
@@ -1833,21 +1839,28 @@ def groupby_reduce(
     # (pd.IntervalIndex or not)
     expected_groups = _convert_expected_groups_to_index(expected_groups, isbins, sort)
 
-    is_binning = any([isinstance(e, pd.IntervalIndex) for e in expected_groups])
-
-    # TODO: could restrict this to dask-only
-    factorize_early = (nby > 1) or (
-        is_binning and method == "cohorts" and is_duck_dask_array(array)
+    # Factorize early in every case except one:
+    # grouping by a dask array with no expected_groups given.
+    factorize_early = not (
+        # can't do it if we are grouping by dask array but don't have expected_groups
+        any(is_dask and ex_ is None for is_dask, ex_ in zip(by_is_dask, expected_groups))
     )
     if factorize_early:
         bys, final_groups, grp_shape = _factorize_multiple(
-            bys, expected_groups, any_by_dask=any_by_dask, reindex=reindex
+            bys,
+            expected_groups,
+            any_by_dask=any_by_dask,
+            # This is the only way it makes sense I think.
+            # reindex controls what's actually allocated in chunk_reduce
+            # At this point, we care about an accurate conversion to codes.
+            reindex=True,
+            sort=sort,
         )
         expected_groups = (pd.RangeIndex(math.prod(grp_shape)),)
 
     assert len(bys) == 1
-    by_ = bys[0]
-    expected_groups = expected_groups[0]
+    (by_,) = bys
+    (expected_groups,) = expected_groups
 
     if axis is None:
         axis_ = tuple(array.ndim + np.arange(-by_.ndim, 0))
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-@pytest.fixture(scope="module", params=["flox", "numpy", "numba"])
+@pytest.fixture(scope="module", params=["flox"])
 def engine(request):
     if request.param == "numba":
         try:
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -653,7 +653,7 @@ def test_groupby_bins(chunk_labels, kwargs, chunks, engine, method) -> None:
     array = [1, 1, 1, 1, 1, 1]
     labels = [0.2, 1.5, 1.9, 2, 3, 20]
 
-    if method in ["split-reduce", "cohorts"] and chunk_labels:
+    if method == "cohorts" and chunk_labels:
         pytest.xfail()
 
     if chunks:
@@ -836,10 +836,7 @@ def test_cohorts_nd_by(func, method, axis, engine):
     assert_equal(actual, expected)
 
     actual, groups = groupby_reduce(array, by, sort=False, **kwargs)
-    if method == "map-reduce":
-        assert_equal(groups, np.array([1, 30, 2, 31, 3, 4, 40], dtype=np.int64))
-    else:
-        assert_equal(groups, np.array([1, 30, 2, 31, 3, 40, 4], dtype=np.int64))
+    assert_equal(groups, np.array([1, 30, 2, 31, 3, 4, 40], dtype=np.int64))
     reindexed = reindex_(actual, groups, pd.Index(sorted_groups))
     assert_equal(reindexed, expected)