|
21 | 21 | from functools import cached_property, reduce |
22 | 22 | from typing import ( |
23 | 23 | Any, |
| 24 | + Callable, |
24 | 25 | Generic, |
25 | 26 | Iterable, |
| 27 | + Sequence, |
26 | 28 | Set, |
27 | 29 | Tuple, |
28 | 30 | Type, |
@@ -79,6 +81,45 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression: |
79 | 81 | return Or(self, other) |
80 | 82 |
|
81 | 83 |
|
| 84 | +def _build_balanced_tree( |
| 85 | + operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression] |
| 86 | +) -> BooleanExpression: |
| 87 | + """ |
| 88 | + Recursively constructs a balanced binary tree of BooleanExpressions using the provided binary operator. |
| 89 | +
|
| 90 | + This function is a safer and more scalable alternative to: |
| 91 | + reduce(operator_, items) |
| 92 | +
|
| 93 | + Using `reduce` creates a deeply nested, unbalanced tree (e.g., operator_(a, operator_(b, operator_(c, ...)))), |
| 94 | + which grows linearly with the number of items. This can lead to RecursionError exceptions in Python |
| 95 | + when the number of expressions is large (e.g., >1000). |
| 96 | +
|
| 97 | + In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)), |
| 98 | + helping avoid recursion issues and ensuring that expression trees remain stable, predictable, |
| 99 | + and safe to traverse — especially in tools like PyIceberg that operate on large logical trees. |
| 100 | +
|
| 101 | + Parameters: |
| 102 | + operator_ (Callable): A binary operator function (e.g., pyiceberg.expressions.Or, And) that takes two |
| 103 | + BooleanExpressions and returns a combined BooleanExpression. |
| 104 | + items (Sequence[BooleanExpression]): A sequence of BooleanExpression objects to combine. |
| 105 | +
|
| 106 | + Returns: |
| 107 | + BooleanExpression: The balanced combination of all input BooleanExpressions. |
| 108 | +
|
| 109 | + Raises: |
| 110 | + ValueError: If the input sequence is empty. |
| 111 | + """ |
| 112 | + if not items: |
| 113 | + raise ValueError("No expressions to combine") |
| 114 | + if len(items) == 1: |
| 115 | + return items[0] |
| 116 | + mid = len(items) // 2 |
| 117 | + |
| 118 | + left = _build_balanced_tree(operator_, items[:mid]) |
| 119 | + right = _build_balanced_tree(operator_, items[mid:]) |
| 120 | + return operator_(left, right) |
| 121 | + |
| 122 | + |
82 | 123 | class Term(Generic[L], ABC): |
83 | 124 | """A simple expression that evaluates to a value.""" |
84 | 125 |
|
@@ -257,7 +298,7 @@ class Or(BooleanExpression): |
257 | 298 |
|
258 | 299 | def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore |
259 | 300 | if rest: |
260 | | - return reduce(Or, (left, right, *rest)) |
| 301 | + return _build_balanced_tree(Or, (left, right, *rest)) |
261 | 302 | if left is AlwaysTrue() or right is AlwaysTrue(): |
262 | 303 | return AlwaysTrue() |
263 | 304 | elif left is AlwaysFalse(): |
|
0 commit comments