From 82ef74424d927be60035e161dd47a0030781237f Mon Sep 17 00:00:00 2001 From: John Gemignani Date: Mon, 22 Dec 2025 19:22:29 -0800 Subject: [PATCH] Optimize join conditions to use direct column references NOTE: This PR was created with AI tools and a human. When matching patterns like (u)-[e]->(v), join conditions previously rebuilt entire vertex/edge agtype values just to extract IDs: age_id(_agtype_build_vertex(r.id, ...))::graphid Added optimize_qual_expr_mutator() to replace these patterns with direct column access: age_id(_agtype_build_vertex(id, ...)) -> graphid_to_agtype(id) age_start_id(_agtype_build_edge(...)) -> graphid_to_agtype(start) age_end_id(_agtype_build_edge(...)) -> graphid_to_agtype(end) age_properties(...) -> direct properties column Join conditions now use efficient comparisons like (e.start_id = u.id) enabling PostgreSQL to leverage index scans on edge tables. Added regression tests. All regression tests passed. modified: regress/expected/unified_vertex_table.out modified: regress/sql/unified_vertex_table.sql modified: src/backend/parser/cypher_clause.c --- regress/expected/unified_vertex_table.out | 117 ++++++++++++++- regress/sql/unified_vertex_table.sql | 58 ++++++++ src/backend/parser/cypher_clause.c | 164 ++++++++++++++++++++++ 3 files changed, 338 insertions(+), 1 deletion(-) diff --git a/regress/expected/unified_vertex_table.out b/regress/expected/unified_vertex_table.out index 118f4a837..81655bcaa 100644 --- a/regress/expected/unified_vertex_table.out +++ b/regress/expected/unified_vertex_table.out @@ -1309,11 +1309,124 @@ $$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); 11540474045136897 | {"weight": 10} | 11258999068426241 | 11821949021847553 (1 row) +-- +-- Test 29: Verify join condition optimization with EXPLAIN +-- +-- When vertices/edges from previous clauses are joined, the optimization +-- should replace patterns like: +-- age_id(_agtype_build_vertex(r.id, ...))::graphid +-- with direct column access: +-- r.id +-- +-- This avoids expensive vertex reconstruction in join conditions. +-- +-- Create test data: Users following each other +SELECT * FROM cypher('unified_test', $$ + CREATE (:JoinOptUser {name: 'Alice'}), + (:JoinOptUser {name: 'Bob'}), + (:JoinOptUser {name: 'Carol'}) +$$) AS (v agtype); + v +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser {name: 'Alice'}), (b:JoinOptUser {name: 'Bob'}) + CREATE (a)-[:JOPT_FOLLOWS]->(b) +$$) AS (e agtype); + e +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (b:JoinOptUser {name: 'Bob'}), (c:JoinOptUser {name: 'Carol'}) + CREATE (b)-[:JOPT_FOLLOWS]->(c) +$$) AS (e agtype); + e +--- +(0 rows) + +-- EXPLAIN showing join conditions use direct column access +-- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...)) +-- And: direct id comparisons instead of age_id(...)::graphid +EXPLAIN (COSTS OFF) +SELECT * FROM cypher('unified_test', $$ + MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) + RETURN u.name, v.name +$$) AS (u_name agtype, v_name agtype); + QUERY PLAN +------------------------------------------------------------ + Nested Loop + Join Filter: (e.start_id = u.id) + -> Nested Loop + -> Seq Scan on _ag_label_vertex u + Filter: (labels = '23814'::oid) + -> Seq Scan on _ag_label_vertex v + Filter: (labels = '23814'::oid) + -> Bitmap Heap Scan on "JOPT_FOLLOWS" e + Recheck Cond: (end_id = v.id) + -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" + Index Cond: (end_id = v.id) +(11 rows) + +-- Verify the query still returns correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) + RETURN u.name, v.name + ORDER BY u.name +$$) AS (u_name agtype, v_name agtype); + u_name | v_name +---------+--------- + "Alice" | "Bob" + "Bob" | "Carol" +(2 rows) + +-- Multi-hop pattern showing optimization across multiple joins +EXPLAIN (COSTS OFF) +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) + RETURN a.name, b.name, c.name +$$) AS (a_name agtype, b_name agtype, c_name agtype); + QUERY PLAN +------------------------------------------------------------------------ + Nested Loop + Join Filter: (e1.start_id = a.id) + -> Nested Loop + Join Filter: _ag_enforce_edge_uniqueness2(e1.id, e2.id) + -> Nested Loop + Join Filter: (e2.start_id = b.id) + -> Nested Loop + -> Seq Scan on _ag_label_vertex b + Filter: (labels = '23814'::oid) + -> Seq Scan on _ag_label_vertex c + Filter: (labels = '23814'::oid) + -> Bitmap Heap Scan on "JOPT_FOLLOWS" e2 + Recheck Cond: (end_id = c.id) + -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" + Index Cond: (end_id = c.id) + -> Bitmap Heap Scan on "JOPT_FOLLOWS" e1 + Recheck Cond: (end_id = b.id) + -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" + Index Cond: (end_id = b.id) + -> Seq Scan on _ag_label_vertex a + Filter: (labels = '23814'::oid) +(21 rows) + +-- Verify multi-hop query results +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) + RETURN a.name, b.name, c.name +$$) AS (a_name agtype, b_name agtype, c_name agtype); + a_name | b_name | c_name +---------+--------+--------- + "Alice" | "Bob" | "Carol" +(1 row) + -- -- Cleanup -- SELECT drop_graph('unified_test', true); -NOTICE: drop cascades to 42 other objects +NOTICE: drop cascades to 44 other objects DETAIL: drop cascades to table unified_test._ag_label_vertex drop cascades to table unified_test._ag_label_edge drop cascades to table unified_test."Person" @@ -1356,6 +1469,8 @@ drop cascades to table unified_test."OptimizeTest" drop cascades to table unified_test."OptStart" drop cascades to table unified_test."OPT_EDGE" drop cascades to table unified_test."OptEnd" +drop cascades to table unified_test."JoinOptUser" +drop cascades to table unified_test."JOPT_FOLLOWS" NOTICE: graph "unified_test" has been dropped drop_graph ------------ diff --git a/regress/sql/unified_vertex_table.sql b/regress/sql/unified_vertex_table.sql index aebf6cd5a..605fd176d 100644 --- a/regress/sql/unified_vertex_table.sql +++ b/regress/sql/unified_vertex_table.sql @@ -809,6 +809,64 @@ SELECT * FROM cypher('unified_test', $$ RETURN id(e), properties(e), start_id(e), end_id(e) $$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); +-- +-- Test 29: Verify join condition optimization with EXPLAIN +-- +-- When vertices/edges from previous clauses are joined, the optimization +-- should replace patterns like: +-- age_id(_agtype_build_vertex(r.id, ...))::graphid +-- with direct column access: +-- r.id +-- +-- This avoids expensive vertex reconstruction in join conditions. +-- + +-- Create test data: Users following each other +SELECT * FROM cypher('unified_test', $$ + CREATE (:JoinOptUser {name: 'Alice'}), + (:JoinOptUser {name: 'Bob'}), + (:JoinOptUser {name: 'Carol'}) +$$) AS (v agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser {name: 'Alice'}), (b:JoinOptUser {name: 'Bob'}) + CREATE (a)-[:JOPT_FOLLOWS]->(b) +$$) AS (e agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (b:JoinOptUser {name: 'Bob'}), (c:JoinOptUser {name: 'Carol'}) + CREATE (b)-[:JOPT_FOLLOWS]->(c) +$$) AS (e agtype); + +-- EXPLAIN showing join conditions use direct column access +-- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...)) +-- And: direct id comparisons instead of age_id(...)::graphid +EXPLAIN (COSTS OFF) +SELECT * FROM cypher('unified_test', $$ + MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) + RETURN u.name, v.name +$$) AS (u_name agtype, v_name agtype); + +-- Verify the query still returns correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) + RETURN u.name, v.name + ORDER BY u.name +$$) AS (u_name agtype, v_name agtype); + +-- Multi-hop pattern showing optimization across multiple joins +EXPLAIN (COSTS OFF) +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) + RETURN a.name, b.name, c.name +$$) AS (a_name agtype, b_name agtype, c_name agtype); + +-- Verify multi-hop query results +SELECT * FROM cypher('unified_test', $$ + MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) + RETURN a.name, b.name, c.name +$$) AS (a_name agtype, b_name agtype, c_name agtype); + -- -- Cleanup -- diff --git a/src/backend/parser/cypher_clause.c b/src/backend/parser/cypher_clause.c index eb7dccf01..e56ac8acb 100644 --- a/src/backend/parser/cypher_clause.c +++ b/src/backend/parser/cypher_clause.c @@ -38,6 +38,7 @@ #include "parser/parsetree.h" #include "parser/parse_relation.h" #include "rewrite/rewriteHandler.h" +#include "utils/lsyscache.h" #include "catalog/ag_graph.h" #include "catalog/ag_label.h" @@ -136,6 +137,7 @@ static Node *make_edge_expr(cypher_parsestate *cpstate, ParseNamespaceItem *pnsi); static Node *make_qual(cypher_parsestate *cpstate, transform_entity *entity, char *name); +static Node *optimize_qual_expr_mutator(Node *node, void *context); static TargetEntry * transform_match_create_path_variable(cypher_parsestate *cpstate, cypher_path *path, List *entities); @@ -3375,11 +3377,173 @@ static void transform_match_pattern(cypher_parsestate *cpstate, Query *query, expr = (Expr *)coerce_to_boolean(pstate, (Node *)expr, "WHERE"); } + /* + * Apply optimization to the transformed expression tree. This looks for + * patterns like age_id(_agtype_build_vertex(...)) and replaces them with + * direct column references. + */ + if (expr != NULL) + { + expr = (Expr *)optimize_qual_expr_mutator((Node *)expr, NULL); + } + query->rtable = cpstate->pstate.p_rtable; query->rteperminfos = cpstate->pstate.p_rteperminfos; query->jointree = makeFromExpr(cpstate->pstate.p_joinlist, (Node *)expr); } +/* + * optimize_qual_expr_mutator - Walk expression tree and optimize vertex/edge + * accessor patterns. + * + * This mutator looks for patterns like: + * age_id(_agtype_build_vertex(id, label, props)) + * and transforms them to: + * graphid_to_agtype(id) + * + * This avoids the expensive reconstruction of vertex/edge agtype values + * just to immediately extract a single field from them. This is particularly + * important for join conditions where the vertex/edge comes from a previous + * clause. + */ +static Node *optimize_qual_expr_mutator(Node *node, void *context) +{ + if (node == NULL) + { + return NULL; + } + + /* + * Look for FuncExpr nodes that wrap accessor functions around + * _agtype_build_vertex or _agtype_build_edge calls. + */ + if (IsA(node, FuncExpr)) + { + FuncExpr *outer_func = (FuncExpr *)node; + char *outer_func_name; + Node *arg; + FuncExpr *inner_func; + char *inner_func_name; + List *inner_args; + int arg_index = -1; + + /* Must have exactly one argument */ + if (list_length(outer_func->args) != 1) + { + goto recurse; + } + + outer_func_name = get_func_name(outer_func->funcid); + if (outer_func_name == NULL) + { + goto recurse; + } + + /* Check if this is an accessor function we can optimize */ + if (strcmp(outer_func_name, "age_id") != 0 && + strcmp(outer_func_name, "age_start_id") != 0 && + strcmp(outer_func_name, "age_end_id") != 0 && + strcmp(outer_func_name, "age_properties") != 0) + { + goto recurse; + } + + arg = (Node *)linitial(outer_func->args); + + /* The argument must be a FuncExpr (the build function) */ + if (!IsA(arg, FuncExpr)) + { + goto recurse; + } + + inner_func = (FuncExpr *)arg; + inner_func_name = get_func_name(inner_func->funcid); + if (inner_func_name == NULL) + { + goto recurse; + } + + inner_args = inner_func->args; + + /* + * Check for _agtype_build_vertex(id, label_name, properties) + * Arguments: 0=id (graphid), 1=label_name (cstring), 2=properties (agtype) + */ + if (strcmp(inner_func_name, "_agtype_build_vertex") == 0 && + list_length(inner_args) == 3) + { + if (strcmp(outer_func_name, "age_id") == 0) + { + arg_index = 0; /* id */ + } + else if (strcmp(outer_func_name, "age_properties") == 0) + { + arg_index = 2; /* properties */ + } + } + /* + * Check for _agtype_build_edge(id, startid, endid, label_name, properties) + * Arguments: 0=id (graphid), 1=start_id (graphid), 2=end_id (graphid), + * 3=label_name (cstring), 4=properties (agtype) + */ + else if (strcmp(inner_func_name, "_agtype_build_edge") == 0 && + list_length(inner_args) == 5) + { + if (strcmp(outer_func_name, "age_id") == 0) + { + arg_index = 0; /* id */ + } + else if (strcmp(outer_func_name, "age_start_id") == 0) + { + arg_index = 1; /* start_id */ + } + else if (strcmp(outer_func_name, "age_end_id") == 0) + { + arg_index = 2; /* end_id */ + } + else if (strcmp(outer_func_name, "age_properties") == 0) + { + arg_index = 4; /* properties */ + } + } + + /* If we found a pattern to optimize */ + if (arg_index >= 0) + { + Node *extracted_arg = (Node *)list_nth(inner_args, arg_index); + + /* For properties, return directly (already agtype) */ + if (strcmp(outer_func_name, "age_properties") == 0) + { + return extracted_arg; + } + else + { + /* + * For graphid fields (id, start_id, end_id), we need to wrap + * in graphid_to_agtype to match the original return type. + */ + Oid cast_func_oid; + FuncExpr *cast_expr; + + cast_func_oid = get_ag_func_oid("graphid_to_agtype", 1, + GRAPHIDOID); + + cast_expr = makeFuncExpr(cast_func_oid, AGTYPEOID, + list_make1(extracted_arg), + InvalidOid, InvalidOid, + COERCE_EXPLICIT_CALL); + cast_expr->location = outer_func->location; + + return (Node *)cast_expr; + } + } + } + +recurse: + return expression_tree_mutator(node, optimize_qual_expr_mutator, context); +} + /* * Creates a FuncCall node that will prevent an edge from being joined * to twice.