Skip to content

Commit f978d01

Browse files
committed
Move build_balanced_tree to Or constructor
1 parent e57b252 commit f978d01

File tree

5 files changed

+61
-52
lines changed

5 files changed

+61
-52
lines changed

pyiceberg/expressions/__init__.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
from functools import cached_property, reduce
2222
from typing import (
2323
Any,
24+
Callable,
2425
Generic,
2526
Iterable,
27+
Sequence,
2628
Set,
2729
Tuple,
2830
Type,
@@ -79,6 +81,45 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression:
7981
return Or(self, other)
8082

8183

84+
def _build_balanced_tree(
85+
operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression]
86+
) -> BooleanExpression:
87+
"""
88+
Recursively constructs a balanced binary tree of BooleanExpressions using the provided binary operator.
89+
90+
This function is a safer and more scalable alternative to:
91+
reduce(operator_, items)
92+
93+
Using `reduce` creates a deeply nested, unbalanced tree (e.g., operator_(a, operator_(b, operator_(c, ...)))),
94+
which grows linearly with the number of items. This can lead to RecursionError exceptions in Python
95+
when the number of expressions is large (e.g., >1000).
96+
97+
In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)),
98+
helping avoid recursion issues and ensuring that expression trees remain stable, predictable,
99+
and safe to traverse — especially in tools like PyIceberg that operate on large logical trees.
100+
101+
Parameters:
102+
operator_ (Callable): A binary operator function (e.g., pyiceberg.expressions.Or, And) that takes two
103+
BooleanExpressions and returns a combined BooleanExpression.
104+
items (Sequence[BooleanExpression]): A sequence of BooleanExpression objects to combine.
105+
106+
Returns:
107+
BooleanExpression: The balanced combination of all input BooleanExpressions.
108+
109+
Raises:
110+
ValueError: If the input sequence is empty.
111+
"""
112+
if not items:
113+
raise ValueError("No expressions to combine")
114+
if len(items) == 1:
115+
return items[0]
116+
mid = len(items) // 2
117+
118+
left = _build_balanced_tree(operator_, items[:mid])
119+
right = _build_balanced_tree(operator_, items[mid:])
120+
return operator_(left, right)
121+
122+
82123
class Term(Generic[L], ABC):
83124
"""A simple expression that evaluates to a value."""
84125

@@ -257,7 +298,7 @@ class Or(BooleanExpression):
257298

258299
def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore
259300
if rest:
260-
return reduce(Or, (left, right, *rest))
301+
return _build_balanced_tree(Or, (left, right, *rest))
261302
if left is AlwaysTrue() or right is AlwaysTrue():
262303
return AlwaysTrue()
263304
elif left is AlwaysFalse():

pyiceberg/table/upsert_util.py

Lines changed: 7 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
# under the License.
1717
import functools
1818
import operator
19-
from typing import Callable, List, TypeVar
2019

2120
import pyarrow as pa
2221
from pyarrow import Table as pyarrow_table
@@ -27,45 +26,9 @@
2726
BooleanExpression,
2827
EqualTo,
2928
In,
29+
Or,
3030
)
3131

32-
T = TypeVar("T")
33-
34-
35-
def build_balanced_tree(operator_: Callable[[T, T], T], items: List[T]) -> T:
36-
"""
37-
Recursively constructs a balanced binary tree of expressions using the provided binary operator.
38-
39-
This function is a safer and more scalable alternative to:
40-
reduce(operator_, items)
41-
42-
Using reduce creates a deeply nested, unbalanced tree (e.g., operator_(a, operator_(b, operator_(c, ...)))),
43-
which grows linearly with the number of items. This can lead to RecursionError exceptions in Python
44-
when the number of expressions is large (e.g., >1000).
45-
46-
In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)),
47-
helping avoid recursion issues and ensuring that expression trees remain stable, predictable,
48-
and safe to traverse — especially in tools like PyIceberg that operate on large logical trees.
49-
50-
Parameters:
51-
operator_ (Callable[[T, T], T]): A binary operator function (e.g., pyiceberg.expressions.Or, And).
52-
items (List[T]): A list of expression objects to combine.
53-
54-
Returns:
55-
T: An expression object representing the balanced combination of all input expressions.
56-
57-
Raises:
58-
ValueError: If the input list is empty.
59-
"""
60-
if not items:
61-
raise ValueError("No expressions to combine")
62-
if len(items) == 1:
63-
return items[0]
64-
mid = len(items) // 2
65-
left = build_balanced_tree(operator_, items[:mid])
66-
right = build_balanced_tree(operator_, items[mid:])
67-
return operator_(left, right)
68-
6932

7033
def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
7134
unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
@@ -77,7 +40,12 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpre
7740
functools.reduce(operator.and_, [EqualTo(col, row[col]) for col in join_cols]) for row in unique_keys.to_pylist()
7841
]
7942

80-
return AlwaysFalse() if len(filters) == 0 else build_balanced_tree(operator.or_, filters)
43+
if len(filters) == 0:
44+
return AlwaysFalse()
45+
elif len(filters) == 1:
46+
return filters[0]
47+
else:
48+
return Or(*filters)
8149

8250

8351
def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:

tests/expressions/test_expressions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ def test_negate(lhs: BooleanExpression, rhs: BooleanExpression) -> None:
578578
),
579579
(
580580
Or(ExpressionA(), ExpressionB(), ExpressionA()),
581-
Or(Or(ExpressionA(), ExpressionB()), ExpressionA()),
581+
Or(ExpressionA(), Or(ExpressionB(), ExpressionA())),
582582
),
583583
(Not(Not(ExpressionA())), ExpressionA()),
584584
],

tests/expressions/test_visitors.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -230,9 +230,9 @@ def test_boolean_expression_visitor() -> None:
230230
"NOT",
231231
"OR",
232232
"EQUALTO",
233-
"OR",
234233
"NOTEQUALTO",
235234
"OR",
235+
"OR",
236236
"EQUALTO",
237237
"NOT",
238238
"AND",
@@ -408,28 +408,28 @@ def test_and_expression_binding(
408408
),
409409
),
410410
Or(
411+
BoundIn(
412+
BoundReference(
413+
field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
414+
accessor=Accessor(position=0, inner=None),
415+
),
416+
{literal("bar"), literal("baz")},
417+
),
411418
Or(
412419
BoundIn(
413420
BoundReference(
414421
field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
415422
accessor=Accessor(position=0, inner=None),
416423
),
417-
{literal("bar"), literal("baz")},
424+
{literal("bar")},
418425
),
419426
BoundIn(
420427
BoundReference(
421428
field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
422429
accessor=Accessor(position=0, inner=None),
423430
),
424-
{literal("bar")},
425-
),
426-
),
427-
BoundIn(
428-
BoundReference(
429-
field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
430-
accessor=Accessor(position=0, inner=None),
431+
{literal("baz")},
431432
),
432-
{literal("baz")},
433433
),
434434
),
435435
),

tests/io/test_pyarrow_visitor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -836,5 +836,5 @@ def test_expression_to_complementary_pyarrow(
836836
# Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr.
837837
assert (
838838
repr(result)
839-
== """<pyarrow.compute.Expression (((invert((((((string_field == "hello") and (float_field > 100)) or (is_nan(float_field) and (double_field == 0))) or (float_field > 100)) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
839+
== """<pyarrow.compute.Expression (((invert(((((string_field == "hello") and (float_field > 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
840840
)

0 commit comments

Comments
 (0)