Skip to content

Commit 35b00b5

Browse files
authored
feat: extend join condition dedup to anti/semi joins (#19097)
* feat: extend join condition dedup to anti/semi joins * rm log * make lint * cache ReadSettings * fix anti join * make more conservative
1 parent 5594436 commit 35b00b5

File tree

5 files changed

+340
-12
lines changed

5 files changed

+340
-12
lines changed

src/query/service/tests/it/sql/planner/optimizer/optimizers/operator/filter/deduplicate_join_condition_test.rs

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1471,3 +1471,283 @@ Join [t2.id = t3.id, t1.id = t3.id]
14711471

14721472
Ok(())
14731473
}
1474+
1475+
// Deduplicate redundant equi-conditions on a semi join
1476+
// Inner child builds t1.id = t2.id; upper LEFT SEMI has both t2.id = t3.id
1477+
// and t1.id = t3.id. The latter is redundant and should be removed.
1478+
#[test]
1479+
fn test_left_semi_deduplication() -> Result<()> {
1480+
let mut builder = ExprBuilder::new();
1481+
1482+
let t1_id = builder.column(
1483+
"t1.id",
1484+
0,
1485+
"id",
1486+
DataType::Number(NumberDataType::Int64),
1487+
"t1",
1488+
0,
1489+
);
1490+
let t2_id = builder.column(
1491+
"t2.id",
1492+
1,
1493+
"id",
1494+
DataType::Number(NumberDataType::Int64),
1495+
"t2",
1496+
1,
1497+
);
1498+
let t3_id = builder.column(
1499+
"t3.id",
1500+
2,
1501+
"id",
1502+
DataType::Number(NumberDataType::Int64),
1503+
"t3",
1504+
2,
1505+
);
1506+
1507+
let t1 = builder.table_scan(0, "t1");
1508+
let t2 = builder.table_scan(1, "t2");
1509+
let t3 = builder.table_scan(2, "t3");
1510+
1511+
let cond_t1_t2 = builder.join_condition(t1_id.clone(), t2_id.clone(), false);
1512+
let cond_t2_t3 = builder.join_condition(t2_id.clone(), t3_id.clone(), false);
1513+
let cond_t1_t3 = builder.join_condition(t1_id.clone(), t3_id.clone(), false); // redundant
1514+
1515+
let join_t1_t2 = builder.join(t1, t2, vec![cond_t1_t2], JoinType::Inner);
1516+
let join_tree = builder.join(
1517+
join_t1_t2,
1518+
t3,
1519+
vec![cond_t2_t3.clone(), cond_t1_t3],
1520+
JoinType::LeftSemi,
1521+
);
1522+
1523+
let before_patterns = [r#"
1524+
Join [t2.id = t3.id, t1.id = t3.id]
1525+
Join [t1.id = t2.id]
1526+
Table t0
1527+
Table t1
1528+
Table t2
1529+
"#];
1530+
1531+
let after_patterns = [r#"
1532+
Join [t2.id = t3.id]
1533+
Join [t1.id = t2.id]
1534+
Table t0
1535+
Table t1
1536+
Table t2
1537+
"#];
1538+
1539+
let optimized = run_optimizer(join_tree.clone())?;
1540+
compare_trees(&join_tree, &optimized, &before_patterns, &after_patterns)?;
1541+
1542+
Ok(())
1543+
}
1544+
1545+
// Deduplicate redundant equi-conditions on an anti join
1546+
// Inner child builds t1.id = t2.id; upper RIGHT ANTI has both t3.id = t1.id
1547+
// and t3.id = t2.id. The latter is redundant and should be removed.
1548+
#[test]
1549+
fn test_right_anti_deduplication() -> Result<()> {
1550+
let mut builder = ExprBuilder::new();
1551+
1552+
let t1_id = builder.column(
1553+
"t1.id",
1554+
0,
1555+
"id",
1556+
DataType::Number(NumberDataType::Int64),
1557+
"t1",
1558+
0,
1559+
);
1560+
let t2_id = builder.column(
1561+
"t2.id",
1562+
1,
1563+
"id",
1564+
DataType::Number(NumberDataType::Int64),
1565+
"t2",
1566+
1,
1567+
);
1568+
let t3_id = builder.column(
1569+
"t3.id",
1570+
2,
1571+
"id",
1572+
DataType::Number(NumberDataType::Int64),
1573+
"t3",
1574+
2,
1575+
);
1576+
1577+
let t1 = builder.table_scan(0, "t1");
1578+
let t2 = builder.table_scan(1, "t2");
1579+
let t3 = builder.table_scan(2, "t3");
1580+
1581+
let cond_t1_t2 = builder.join_condition(t1_id.clone(), t2_id.clone(), false);
1582+
let cond_t3_t1 = builder.join_condition(t3_id.clone(), t1_id.clone(), false);
1583+
let cond_t3_t2 = builder.join_condition(t3_id.clone(), t2_id.clone(), false); // redundant
1584+
1585+
let join_t1_t2 = builder.join(t1, t2, vec![cond_t1_t2], JoinType::Inner);
1586+
let join_tree = builder.join(
1587+
t3,
1588+
join_t1_t2,
1589+
vec![cond_t3_t1.clone(), cond_t3_t2],
1590+
JoinType::RightAnti,
1591+
);
1592+
1593+
let before_patterns = [r#"
1594+
Join [t3.id = t1.id, t3.id = t2.id]
1595+
Table t2
1596+
Join [t1.id = t2.id]
1597+
Table t0
1598+
Table t1
1599+
"#];
1600+
1601+
let after_patterns = [r#"
1602+
Join [t3.id = t1.id]
1603+
Table t2
1604+
Join [t1.id = t2.id]
1605+
Table t0
1606+
Table t1
1607+
"#];
1608+
1609+
let optimized = run_optimizer(join_tree.clone())?;
1610+
compare_trees(&join_tree, &optimized, &before_patterns, &after_patterns)?;
1611+
1612+
Ok(())
1613+
}
1614+
1615+
// Ensure anti join equivalence does not leak upward to remove parent join predicates.
1616+
// Child LEFT ANTI has t1.id = t2.id, parent INNER joins on both t1.id = t3.id and t2.id = t3.id.
1617+
// These two predicates must not be deduplicated using the child's equality.
1618+
#[test]
1619+
fn test_anti_equivalence_not_leaking() -> Result<()> {
1620+
let mut builder = ExprBuilder::new();
1621+
1622+
let t1_id = builder.column(
1623+
"t1.id",
1624+
0,
1625+
"id",
1626+
DataType::Number(NumberDataType::Int64),
1627+
"t1",
1628+
0,
1629+
);
1630+
let t2_id = builder.column(
1631+
"t2.id",
1632+
1,
1633+
"id",
1634+
DataType::Number(NumberDataType::Int64),
1635+
"t2",
1636+
1,
1637+
);
1638+
let t3_id = builder.column(
1639+
"t3.id",
1640+
2,
1641+
"id",
1642+
DataType::Number(NumberDataType::Int64),
1643+
"t3",
1644+
2,
1645+
);
1646+
1647+
let t1 = builder.table_scan(0, "t1");
1648+
let t2 = builder.table_scan(1, "t2");
1649+
let t3 = builder.table_scan(2, "t3");
1650+
1651+
let cond_t1_t2 = builder.join_condition(t1_id.clone(), t2_id.clone(), false);
1652+
let cond_t1_t3 = builder.join_condition(t1_id.clone(), t3_id.clone(), false);
1653+
let cond_t2_t3 = builder.join_condition(t2_id.clone(), t3_id.clone(), false);
1654+
1655+
let anti = builder.join(t1, t2, vec![cond_t1_t2], JoinType::LeftAnti);
1656+
let join_tree = builder.join(
1657+
anti,
1658+
t3,
1659+
vec![cond_t1_t3.clone(), cond_t2_t3.clone()],
1660+
JoinType::Inner,
1661+
);
1662+
1663+
let before_patterns = [r#"
1664+
Join [t1.id = t3.id, t2.id = t3.id]
1665+
Join [t1.id = t2.id]
1666+
Table t0
1667+
Table t1
1668+
Table t2
1669+
"#];
1670+
1671+
let after_patterns = [r#"
1672+
Join [t1.id = t3.id, t2.id = t3.id]
1673+
Join [t1.id = t2.id]
1674+
Table t0
1675+
Table t1
1676+
Table t2
1677+
"#];
1678+
1679+
let optimized = run_optimizer(join_tree.clone())?;
1680+
compare_trees(&join_tree, &optimized, &before_patterns, &after_patterns)?;
1681+
1682+
Ok(())
1683+
}
1684+
1685+
// Ensure semi join equivalence does not leak upward to remove parent join predicates.
1686+
// Child LEFT SEMI has t1.id = t2.id, parent INNER joins on both t1.id = t3.id and t2.id = t3.id.
1687+
// These two predicates must not be deduplicated using the child's equality.
1688+
#[test]
1689+
fn test_semi_equivalence_not_leaking() -> Result<()> {
1690+
let mut builder = ExprBuilder::new();
1691+
1692+
let t1_id = builder.column(
1693+
"t1.id",
1694+
0,
1695+
"id",
1696+
DataType::Number(NumberDataType::Int64),
1697+
"t1",
1698+
0,
1699+
);
1700+
let t2_id = builder.column(
1701+
"t2.id",
1702+
1,
1703+
"id",
1704+
DataType::Number(NumberDataType::Int64),
1705+
"t2",
1706+
1,
1707+
);
1708+
let t3_id = builder.column(
1709+
"t3.id",
1710+
2,
1711+
"id",
1712+
DataType::Number(NumberDataType::Int64),
1713+
"t3",
1714+
2,
1715+
);
1716+
1717+
let t1 = builder.table_scan(0, "t1");
1718+
let t2 = builder.table_scan(1, "t2");
1719+
let t3 = builder.table_scan(2, "t3");
1720+
1721+
let cond_t1_t2 = builder.join_condition(t1_id.clone(), t2_id.clone(), false);
1722+
let cond_t1_t3 = builder.join_condition(t1_id.clone(), t3_id.clone(), false);
1723+
let cond_t2_t3 = builder.join_condition(t2_id.clone(), t3_id.clone(), false);
1724+
1725+
let semi = builder.join(t1, t2, vec![cond_t1_t2], JoinType::LeftSemi);
1726+
let join_tree = builder.join(
1727+
semi,
1728+
t3,
1729+
vec![cond_t1_t3.clone(), cond_t2_t3.clone()],
1730+
JoinType::Inner,
1731+
);
1732+
1733+
let before_patterns = [r#"
1734+
Join [t1.id = t3.id, t2.id = t3.id]
1735+
Join [t1.id = t2.id]
1736+
Table t0
1737+
Table t1
1738+
Table t2
1739+
"#];
1740+
1741+
let after_patterns = [r#"
1742+
Join [t1.id = t3.id, t2.id = t3.id]
1743+
Join [t1.id = t2.id]
1744+
Table t0
1745+
Table t1
1746+
Table t2
1747+
"#];
1748+
1749+
let optimized = run_optimizer(join_tree.clone())?;
1750+
compare_trees(&join_tree, &optimized, &before_patterns, &after_patterns)?;
1751+
1752+
Ok(())
1753+
}

src/query/sql/src/planner/optimizer/optimizers/operator/filter/deduplicate_join_condition.rs

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -106,25 +106,34 @@ impl DeduplicateJoinConditionOptimizer {
106106
#[recursive::recursive]
107107
pub fn deduplicate(&mut self, s_expr: &SExpr) -> Result<SExpr> {
108108
match s_expr.plan.as_ref() {
109-
// Only optimize inner joins
110-
RelOperator::Join(join) if matches!(join.join_type, JoinType::Inner) => {
111-
self.optimize_inner_join(s_expr, join)
109+
// Only optimize filtering joins that don't preserve nulls
110+
RelOperator::Join(join) if join.join_type.is_filtering_join() => {
111+
self.optimize_filtering_join(s_expr, join)
112112
}
113113
// Recursively process other nodes
114114
_ => self.deduplicate_children(s_expr),
115115
}
116116
}
117117

118-
/// Optimize inner join by removing redundant conditions
119-
fn optimize_inner_join(&mut self, s_expr: &SExpr, join: &Join) -> Result<SExpr> {
120-
debug_assert!(matches!(join.join_type, JoinType::Inner));
118+
/// Optimize filtering joins (inner/semi/anti) by removing redundant equi-conditions
119+
fn optimize_filtering_join(&mut self, s_expr: &SExpr, join: &Join) -> Result<SExpr> {
120+
debug_assert!(join.join_type.is_filtering_join());
121121

122122
// Recursively optimize left and right subtrees
123123
let left = self.deduplicate(s_expr.child(0)?)?;
124124
let right = self.deduplicate(s_expr.child(1)?)?;
125125

126126
let mut join = join.clone();
127127
let mut non_redundant_conditions = Vec::new();
128+
// Anti / Semi joins should not contribute new equivalence to ancestor nodes.
129+
let snapshot = if matches!(
130+
join.join_type,
131+
JoinType::LeftAnti | JoinType::RightAnti | JoinType::LeftSemi | JoinType::RightSemi
132+
) {
133+
Some(self.snapshot())
134+
} else {
135+
None
136+
};
128137

129138
// Check each equi-join condition
130139
for condition in &join.equi_conditions {
@@ -149,6 +158,11 @@ impl DeduplicateJoinConditionOptimizer {
149158
join.equi_conditions = non_redundant_conditions;
150159
}
151160

161+
// Restore union-find state for semi/anti joins to avoid leaking equivalence upward.
162+
if let Some(snapshot) = snapshot {
163+
self.restore(snapshot);
164+
}
165+
152166
// Create new expression
153167
let new_plan = Arc::new(RelOperator::Join(join));
154168
let new_children = vec![Arc::new(left), Arc::new(right)];
@@ -233,6 +247,28 @@ impl DeduplicateJoinConditionOptimizer {
233247
fn union(&mut self, idx1: usize, idx2: usize) {
234248
self.column_group.insert(idx2, idx1);
235249
}
250+
251+
/// Snapshot the current union-find state so we can roll back after
252+
/// optimizing a semi or anti join.
253+
fn snapshot(&self) -> UfSnapshot {
254+
UfSnapshot {
255+
expr_to_index: self.expr_to_index.clone(),
256+
column_group: self.column_group.clone(),
257+
next_index: self.next_index,
258+
}
259+
}
260+
261+
fn restore(&mut self, snapshot: UfSnapshot) {
262+
self.expr_to_index = snapshot.expr_to_index;
263+
self.column_group = snapshot.column_group;
264+
self.next_index = snapshot.next_index;
265+
}
266+
}
267+
268+
struct UfSnapshot {
269+
expr_to_index: HashMap<ScalarExpr, usize>,
270+
column_group: HashMap<usize, usize>,
271+
next_index: usize,
236272
}
237273

238274
#[async_trait::async_trait]

src/query/sql/src/planner/plans/join.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,20 @@ impl JoinType {
125125
JoinType::Asof | JoinType::LeftAsof | JoinType::RightAsof
126126
)
127127
}
128+
129+
/// Joins that behave like filters (no null-preserving side) so
130+
/// equi-join conditions can be deduplicated safely.
131+
pub fn is_filtering_join(&self) -> bool {
132+
matches!(
133+
self,
134+
JoinType::Inner
135+
| JoinType::InnerAny
136+
| JoinType::LeftSemi
137+
| JoinType::RightSemi
138+
| JoinType::LeftAnti
139+
| JoinType::RightAnti
140+
)
141+
}
128142
}
129143

130144
impl Display for JoinType {

0 commit comments

Comments
 (0)