[Data] Support List Types for Unique Aggregator and encode_lists flag (#58916)

kyuds · web-flow · commit 1180868dd447 · 2025-12-08T19:16:12.000-08:00
## Description Basically the same idea as #58659. The `Unique` aggregator uses the `pyarrow.compute.unique` function internally, which doesn't work with non-hashable types like lists. Similar to what I did for `ApproximateTopK`, we now use pickle to serialize and deserialize elements. Other improvements: - The `ignore_nulls` flag didn't work at all; it now works properly. - Had to force `ignore_nulls=False` for the datasets `unique` API for backwards compatibility (`ignore_nulls` defaults to `True`, so the behavior of the datasets `unique` API would otherwise change now that `ignore_nulls` actually works). ## Related issues This PR replaces #58538. ## Additional information [Design doc on my notion](https://www.notion.so/kyuds/Unique-Aggregator-Improvements-2b67a80e48eb80de9820edf9d4996e0a?source=copy_link) --------- Signed-off-by: Daniel Shin <kyuseung1016@gmail.com> Signed-off-by: kyuds <kyuseung1016@gmail.com>
diff --git a/python/ray/data/aggregate.py b/python/ray/data/aggregate.py
@@ -17,6 +17,7 @@
 )
 
 import numpy as np
+import pyarrow as pa
 import pyarrow.compute as pc
 
 from ray.data._internal.util import is_null
@@ -935,35 +936,60 @@ class Unique(AggregateFnV2[Set[Any], List[Any]]):
         ignore_nulls: Whether to ignore null values when collecting unique items.
                       Default is True (nulls are excluded).
         alias_name: Optional name for the resulting column.
+        encode_lists: If `True`, flatten list columns and compute unique values
+            over the individual list elements. If `False` (the default), treat
+            each whole list as a single value. Note that flattening is top-level
+            only (not recursive).
     """
 
     def __init__(
         self,
         on: Optional[str] = None,
         ignore_nulls: bool = True,
         alias_name: Optional[str] = None,
+        encode_lists: bool = False,
     ):
         super().__init__(
             alias_name if alias_name else f"unique({str(on)})",
             on=on,
             ignore_nulls=ignore_nulls,
             zero_factory=set,
         )
+        self._encode_lists = encode_lists
 
     def combine(self, current_accumulator: Set[Any], new: Set[Any]) -> Set[Any]:
         return self._to_set(current_accumulator) | self._to_set(new)
 
     def aggregate_block(self, block: Block) -> List[Any]:
-        import pyarrow.compute as pac
-
         col = BlockAccessor.for_block(block).to_arrow().column(self._target_col_name)
-        return pac.unique(col).to_pylist()
+        if pa.types.is_list(col.type):
+            if self._encode_lists:
+                col = pc.list_flatten(col)
+            else:
+                # pyarrow doesn't natively support calculating unique over
+                # list-like objects (i.e., lists, tuples). Using pandas seems to be
+                # much more efficient than doing something like json dump/load or
+                # pickle dump/load.
+                series = BlockAccessor.for_block(block).to_pandas()[
+                    self._target_col_name
+                ]
+                series = series.map(lambda x: None if x is None else tuple(x))
+                if self._ignore_nulls:
+                    series = series.dropna()
+                return list(series.unique())
+        if self._ignore_nulls:
+            col = pc.drop_null(col)
+        return pc.unique(col).to_pylist()
 
     @staticmethod
     def _to_set(x):
         if isinstance(x, set):
             return x
         elif isinstance(x, list):
+            if len(x) > 0 and isinstance(x[0], list):
+                # Necessary because pyarrow converts all tuples to
+                # lists internally.
+                x = map(lambda v: None if v is None else tuple(v), x)
             return set(x)
         else:
             return {x}
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
@@ -2963,7 +2963,7 @@ def unique(self, column: str) -> List[Any]:
 
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
-            >>> ds.unique("target")
+            >>> sorted(ds.unique("target"))
             [0, 1, 2]
 
             One common use case is to convert the class labels
@@ -2986,7 +2986,7 @@ def unique(self, column: str) -> List[Any]:
         Returns:
             A list with unique elements in the given column.
         """  # noqa: E501
-        ret = self._aggregate_on(Unique, column)
+        ret = self._aggregate_on(Unique, column, ignore_nulls=False)
         return self._aggregate_result(ret)
 
     @AllToAllAPI
diff --git a/python/ray/data/tests/test_custom_agg.py b/python/ray/data/tests/test_custom_agg.py
@@ -1,3 +1,5 @@
+from collections import Counter
+
 import numpy as np
 import pytest
 
@@ -6,6 +8,7 @@
     ApproximateQuantile,
     ApproximateTopK,
     MissingValuePercentage,
+    Unique,
     ZeroPercentage,
 )
 from ray.data.tests.conftest import *  # noqa
@@ -496,6 +499,68 @@ def test_approximate_topk_encode_lists(self, ray_start_regular_shared_2_cpus):
         assert result["approx_topk(id)"][2] == {"id": 3, "count": 1}
 
 
+class TestUnique:
+    """Test cases for Unique aggregation."""
+
+    def test_unique_basic(self, ray_start_regular_shared_2_cpus):
+        """Test basic Unique aggregation."""
+        data = [{"id": "a"}, {"id": "b"}, {"id": "b"}, {"id": None}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", ignore_nulls=False))
+
+        answer = ["a", "b", None]
+
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+    def test_unique_ignores_nulls(self, ray_start_regular_shared_2_cpus):
+        """Test Unique properly ignores nulls."""
+        data = [{"id": "a"}, {"id": None}, {"id": "b"}, {"id": "b"}, {"id": None}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id"))
+
+        assert sorted(result["unique(id)"]) == ["a", "b"]
+
+    def test_unique_custom_alias(self, ray_start_regular_shared_2_cpus):
+        """Test Unique with custom alias."""
+        data = [{"id": "a"}, {"id": "b"}, {"id": "b"}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", alias_name="custom"))
+
+        assert sorted(result["custom"]) == ["a", "b"]
+
+    def test_unique_list_datatype(self, ray_start_regular_shared_2_cpus):
+        """Test Unique works with non-hashable types like list."""
+        data = [
+            {"id": ["a", "b", "c"]},
+            {"id": ["a", "b", "c"]},
+            {"id": ["a", "b", "c"]},
+        ]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id"))
+
+        assert result["unique(id)"][0] == ["a", "b", "c"]
+
+    def test_unique_encode_lists(self, ray_start_regular_shared_2_cpus):
+        """Test Unique works when encode_lists is True."""
+        data = [{"id": ["a", "b", "c"]}, {"id": ["a", "a", "a", "b", None]}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", encode_lists=True, ignore_nulls=False))
+
+        answer = ["a", "b", "c", None]
+
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+    def test_unique_encode_lists_ignores_nulls(self, ray_start_regular_shared_2_cpus):
+        """Test Unique will drop null values when encode_lists is True."""
+        data = [{"id": ["a", "b", "c"]}, {"id": ["a", "a", "a", "b", None]}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", encode_lists=True))
+
+        answer = ["a", "b", "c"]
+
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+
 if __name__ == "__main__":
     import sys
 
diff --git a/python/ray/data/tests/test_groupby_e2e.py b/python/ray/data/tests/test_groupby_e2e.py
@@ -644,7 +644,7 @@ def test_groupby_multi_agg_with_nans(
             Mean("B", alias_name="mean_b", ignore_nulls=ignore_nulls),
             Std("B", alias_name="std_b", ignore_nulls=ignore_nulls),
             Quantile("B", alias_name="quantile_b", ignore_nulls=ignore_nulls),
-            Unique("B", alias_name="unique_b"),
+            Unique("B", alias_name="unique_b", ignore_nulls=False),
         )
     )
 
@@ -751,7 +751,7 @@ def test_groupby_aggregations_are_associative(
         Mean("B", alias_name="mean_b", ignore_nulls=ignore_nulls),
         Std("B", alias_name="std_b", ignore_nulls=ignore_nulls),
         Quantile("B", alias_name="quantile_b", ignore_nulls=ignore_nulls),
-        Unique("B", alias_name="unique_b"),
+        Unique("B", alias_name="unique_b", ignore_nulls=False),
     ]
 
     # Step 0: Prepare expected output (using Pandas)

Original file line number	Diff line number	Diff line change
`@@ -644,7 +644,7 @@ def test_groupby_multi_agg_with_nans(`
`644`	`644`	`Mean("B", alias_name="mean_b", ignore_nulls=ignore_nulls),`
`645`	`645`	`Std("B", alias_name="std_b", ignore_nulls=ignore_nulls),`
`646`	`646`	`Quantile("B", alias_name="quantile_b", ignore_nulls=ignore_nulls),`
`647`		`- Unique("B", alias_name="unique_b"),`
	`647`	`+ Unique("B", alias_name="unique_b", ignore_nulls=False),`
`648`	`648`	`)`
`649`	`649`	`)`
`650`	`650`
`@@ -751,7 +751,7 @@ def test_groupby_aggregations_are_associative(`
`751`	`751`	`Mean("B", alias_name="mean_b", ignore_nulls=ignore_nulls),`
`752`	`752`	`Std("B", alias_name="std_b", ignore_nulls=ignore_nulls),`
`753`	`753`	`Quantile("B", alias_name="quantile_b", ignore_nulls=ignore_nulls),`
`754`		`- Unique("B", alias_name="unique_b"),`
	`754`	`+ Unique("B", alias_name="unique_b", ignore_nulls=False),`
`755`	`755`	`]`
`756`	`756`
`757`	`757`	`# Step 0: Prepare expected output (using Pandas)`