From 96ee4b6e50080c7835aba5031d7d1a58c10b9fcf Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Fri, 30 Jan 2026 15:16:43 -0500 Subject: [PATCH 1/8] Create initial comment removal function --- pgdog/src/frontend/router/parser/comment.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index a87883ad..bcbfa63e 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -1,6 +1,7 @@ use once_cell::sync::Lazy; +use pg_query::protobuf::ScanResult; use pg_query::scan_raw; -use pg_query::{protobuf::Token, scan}; +use pg_query::{protobuf::ScanToken, protobuf::Token, scan}; use pgdog_config::QueryParserEngine; use regex::Regex; @@ -86,6 +87,20 @@ pub fn comment( Ok((None, role)) } +pub fn remove_comments(query: &str, engine: QueryParserEngine) -> Result { + let mut result = match engine { + QueryParserEngine::PgQueryProtobuf => scan(query), + QueryParserEngine::PgQueryRaw => scan_raw(query), + } + .map_err(Error::PgQuery)?; + + result + .tokens + .retain(|st| st.token != Token::CComment as i32 && st.token != Token::SqlComment as i32); + + Ok(result) +} + #[cfg(test)] mod tests { use pgdog_config::SystemCatalogsBehavior; From f58258c9d193c298dcde4784299d654f8fbade00 Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Sat, 31 Jan 2026 14:59:44 -0500 Subject: [PATCH 2/8] Change the implementation for remove_comments The previous approach is incorrect because we need a str representing the query instead of just a ScanResult. This approach reconstructs the query with string slicing by using the indices where comments are present as returned by the scanner. --- pgdog/src/frontend/router/parser/comment.rs | 44 +++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index bcbfa63e..0bd2d759 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -1,7 +1,6 @@ use once_cell::sync::Lazy; -use pg_query::protobuf::ScanResult; use pg_query::scan_raw; -use pg_query::{protobuf::ScanToken, protobuf::Token, scan}; +use pg_query::{protobuf::Token, scan}; use pgdog_config::QueryParserEngine; use regex::Regex; @@ -87,18 +86,49 @@ pub fn comment( Ok((None, role)) } -pub fn remove_comments(query: &str, engine: QueryParserEngine) -> Result { - let mut result = match engine { +pub fn has_comments(query: &str, engine: QueryParserEngine) -> Result { + let result = match engine { QueryParserEngine::PgQueryProtobuf => scan(query), QueryParserEngine::PgQueryRaw => scan_raw(query), } .map_err(Error::PgQuery)?; - result + Ok(result .tokens - .retain(|st| st.token != Token::CComment as i32 && st.token != Token::SqlComment as i32); + .iter() + .any(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32)) +} + +pub fn remove_comments(query: &str, engine: QueryParserEngine) -> Result { + let result = match engine { + QueryParserEngine::PgQueryProtobuf => scan(query), + QueryParserEngine::PgQueryRaw => scan_raw(query), + } + .map_err(Error::PgQuery)?; + + let comment_ranges: Vec<(usize, usize)> = result + .tokens + .iter() + .filter(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32) + .map(|t| (t.start as usize, t.end as usize)) + .collect(); + + let mut filtered_query = String::with_capacity(query.len()); + let mut cursor = 0usize; + + for (start, end) in comment_ranges { + if cursor < start { + filtered_query.push_str(&query[cursor..start]); + } + + cursor = end; + } + + if cursor < query.len() { + filtered_query.push_str(&query[cursor..]); + } - Ok(result) + Ok(filtered_query.trim().to_string()) } #[cfg(test)] From 9b5a0bcca6f46213a68ca9fadba851136260f58b Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Sat, 31 Jan 2026 15:03:57 -0500 Subject: [PATCH 3/8] Rename variable for clarity --- pgdog/src/frontend/router/parser/comment.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index 0bd2d759..104c06b6 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -110,7 +110,7 @@ pub fn remove_comments(query: &str, engine: QueryParserEngine) -> Result Date: Tue, 3 Feb 2026 12:07:20 -0500 Subject: [PATCH 4/8] Upload untested except version This version will accept a list of regexs to keep --- pgdog/src/frontend/router/parser/comment.rs | 26 +++++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index 104c06b6..b45d5e3d 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -11,11 +11,12 @@ use crate::frontend::router::sharding::ContextBuilder; use super::super::parser::Shard; use super::Error; -static SHARD: Lazy = Lazy::new(|| Regex::new(r#"pgdog_shard: *([0-9]+)"#).unwrap()); -static SHARDING_KEY: Lazy = Lazy::new(|| { +pub static SHARD: Lazy = Lazy::new(|| Regex::new(r#"pgdog_shard: *([0-9]+)"#).unwrap()); +pub static SHARDING_KEY: Lazy = Lazy::new(|| { Regex::new(r#"pgdog_sharding_key: *(?:"([^"]*)"|'([^']*)'|([0-9a-zA-Z-]+))"#).unwrap() }); -static ROLE: Lazy = Lazy::new(|| Regex::new(r#"pgdog_role: *(primary|replica)"#).unwrap()); +pub static ROLE: Lazy = + Lazy::new(|| Regex::new(r#"pgdog_role: *(primary|replica)"#).unwrap()); fn get_matched_value<'a>(caps: &'a regex::Captures<'a>) -> Option<&'a str> { caps.get(1) @@ -99,17 +100,32 @@ pub fn has_comments(query: &str, engine: QueryParserEngine) -> Result Result { +pub fn remove_comments( + query: &str, + engine: QueryParserEngine, + except: Option<&[&Regex]>, +) -> Result { let result = match engine { QueryParserEngine::PgQueryProtobuf => scan(query), QueryParserEngine::PgQueryRaw => scan_raw(query), } .map_err(Error::PgQuery)?; - let comment_ranges: Vec<(usize, usize)> = result + let tokens_to_remove = result .tokens .iter() .filter(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32) + .filter(|st| { + if let Some(except) = except { + let text = &query[st.start as usize..st.end as usize]; + + !except.iter().any(|re| re.is_match(text)) + } else { + true + } + }); + + let comment_ranges: Vec<(usize, usize)> = tokens_to_remove .map(|st| (st.start as usize, st.end as usize)) .collect(); From 0319387cdff37bf37fe77bf365b18edd30c86fa5 Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Tue, 3 Feb 2026 22:26:12 -0500 Subject: [PATCH 5/8] Simplify removal code Add ability to keep pgdog metadata comments anywhere in the string --- pgdog/src/frontend/router/parser/comment.rs | 53 +++++++++++---------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index b45d5e3d..09164c98 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -111,40 +111,41 @@ pub fn remove_comments( } .map_err(Error::PgQuery)?; - let tokens_to_remove = result - .tokens - .iter() - .filter(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32) - .filter(|st| { - if let Some(except) = except { - let text = &query[st.start as usize..st.end as usize]; - - !except.iter().any(|re| re.is_match(text)) - } else { - true - } - }); + let mut out = String::with_capacity(query.len()); - let comment_ranges: Vec<(usize, usize)> = tokens_to_remove - .map(|st| (st.start as usize, st.end as usize)) - .collect(); + for st in &result.tokens { + let start = st.start as usize; + let end = st.end as usize; - let mut filtered_query = String::with_capacity(query.len()); - let mut cursor = 0usize; + match st.token { + t if t == Token::CComment as i32 => { + let comment = &query[start..end]; - for (start, end) in comment_ranges { - if cursor < start { - filtered_query.push_str(&query[cursor..start]); - } + if let Some(except) = except { + let rewritten = keep_only_matching(comment, except); - cursor = end; + out.push_str(&rewritten); + } + } + _ => { + out.push_str(&query[start..end]); + } + } } - if cursor < query.len() { - filtered_query.push_str(&query[cursor..]); + Ok(out) +} + +fn keep_only_matching(comment: &str, regs: &[&Regex]) -> String { + let mut out = String::new(); + + for reg in regs { + for m in reg.find_iter(comment) { + out.push_str(m.as_str()); + } } - Ok(filtered_query.trim().to_string()) + out } #[cfg(test)] From 212e1434482d4077bdab354b0569502e2deeee54 Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Tue, 3 Feb 2026 22:38:46 -0500 Subject: [PATCH 6/8] Preserve non token characters Not preserving these would mean queries with comments would never match their commentless counterparts --- pgdog/src/frontend/router/parser/comment.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index 09164c98..a1a8acb8 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -111,12 +111,15 @@ pub fn remove_comments( } .map_err(Error::PgQuery)?; + let mut cursor = 0; let mut out = String::with_capacity(query.len()); for st in &result.tokens { let start = st.start as usize; let end = st.end as usize; + out.push_str(&query[cursor..start]); + match st.token { t if t == Token::CComment as i32 => { let comment = &query[start..end]; @@ -131,6 +134,12 @@ pub fn remove_comments( out.push_str(&query[start..end]); } } + + cursor = end; + } + + if cursor < query.len() { + out.push_str(&query[cursor..]); } Ok(out) From f80d1e7390787a4acde3137c271259240ea5a50e Mon Sep 17 00:00:00 2001 From: dev-lew <> Date: Wed, 4 Feb 2026 16:59:50 -0500 Subject: [PATCH 7/8] Add retry logic in cache_impl.rs --- .../router/parser/cache/cache_impl.rs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pgdog/src/frontend/router/parser/cache/cache_impl.rs b/pgdog/src/frontend/router/parser/cache/cache_impl.rs index 4c50e61f..c6cfe112 100644 --- a/pgdog/src/frontend/router/parser/cache/cache_impl.rs +++ b/pgdog/src/frontend/router/parser/cache/cache_impl.rs @@ -11,6 +11,7 @@ use tracing::debug; use super::super::{Error, Route}; use super::{Ast, AstContext}; +use crate::frontend::router::parser::comment; use crate::frontend::{BufferedQuery, PreparedStatements}; static CACHE: Lazy = Lazy::new(Cache::new); @@ -118,6 +119,27 @@ impl Cache { } } + if comment::has_comments(query.query(), ctx.sharding_schema.query_parser_engine)? { + // Check cache again after removing comments. + let filtered_query = comment::remove_comments( + query.query(), + ctx.sharding_schema.query_parser_engine, + Some(&[&*comment::SHARD, &*comment::SHARDING_KEY, &*comment::ROLE]), + )?; + + let mut guard = self.inner.lock(); + let ast = guard.queries.get_mut(&filtered_query).map(|entry| { + entry.stats.lock().hits += 1; + entry.clone() + }); + + if let Some(ast) = ast { + guard.stats.hits += 1; + + return Ok(ast); + } + } + // Parse query without holding lock. let entry = Ast::with_context(query, ctx, prepared_statements)?; let parse_time = entry.stats.lock().parse_time; From fe745e115cb73f25a9f4646c245c7182b831d6a4 Mon Sep 17 00:00:00 2001 From: dev-lew <55928726+dev-lew@users.noreply.github.com> Date: Sat, 7 Feb 2026 12:53:49 -0500 Subject: [PATCH 8/8] Refactor according to comments Logic for comment parsing happens before Ast creation resulting in fewer calls to scan. --- pgdog/src/frontend/router/parser/cache/ast.rs | 11 ++- .../router/parser/cache/cache_impl.rs | 38 +++++++--- pgdog/src/frontend/router/parser/comment.rs | 76 +++++++++---------- 3 files changed, 69 insertions(+), 56 deletions(-) diff --git a/pgdog/src/frontend/router/parser/cache/ast.rs b/pgdog/src/frontend/router/parser/cache/ast.rs index 2be0e0fb..c80c560e 100644 --- a/pgdog/src/frontend/router/parser/cache/ast.rs +++ b/pgdog/src/frontend/router/parser/cache/ast.rs @@ -7,9 +7,7 @@ use std::{collections::HashSet, ops::Deref}; use parking_lot::Mutex; use std::sync::Arc; -use super::super::{ - comment::comment, Error, Route, Shard, StatementRewrite, StatementRewriteContext, Table, -}; +use super::super::{Error, Route, Shard, StatementRewrite, StatementRewriteContext, Table}; use super::{Fingerprint, Stats}; use crate::backend::schema::Schema; use crate::frontend::router::parser::rewrite::statement::RewritePlan; @@ -72,6 +70,8 @@ impl Ast { schema: &ShardingSchema, db_schema: &Schema, prepared_statements: &mut PreparedStatements, + comment_shard: Option, + comment_role: Option, user: &str, search_path: Option<&ParameterValue>, ) -> Result { @@ -81,7 +81,6 @@ impl Ast { QueryParserEngine::PgQueryRaw => parse_raw(query), } .map_err(Error::PgQuery)?; - let (comment_shard, comment_role) = comment(query, schema)?; let fingerprint = Fingerprint::new(query, schema.query_parser_engine).map_err(Error::PgQuery)?; @@ -125,12 +124,16 @@ impl Ast { query: &BufferedQuery, ctx: &super::AstContext<'_>, prepared_statements: &mut PreparedStatements, + shard: Option, + role: Option, ) -> Result { Self::new( query, &ctx.sharding_schema, &ctx.db_schema, prepared_statements, + shard, + role, ctx.user, ctx.search_path, ) diff --git a/pgdog/src/frontend/router/parser/cache/cache_impl.rs b/pgdog/src/frontend/router/parser/cache/cache_impl.rs index c6cfe112..cd8aac8b 100644 --- a/pgdog/src/frontend/router/parser/cache/cache_impl.rs +++ b/pgdog/src/frontend/router/parser/cache/cache_impl.rs @@ -2,8 +2,10 @@ use lru::LruCache; use once_cell::sync::Lazy; use pg_query::normalize; use pgdog_config::QueryParserEngine; +use std::borrow::Cow; use std::collections::HashMap; use std::time::Duration; +use tracing_subscriber::filter::Filtered; use parking_lot::Mutex; use std::sync::Arc; @@ -98,6 +100,10 @@ impl Cache { /// Parse a statement by either getting it from cache /// or using pg_query parser. /// + /// In the event of cache miss, we retry after removing all comments except + /// for pgdog metadata. We retain it for correctness, since a query with + /// that metadata must not map to an identical query without it. + /// /// N.B. There is a race here that allows multiple threads to /// parse the same query. That's better imo than locking the data structure /// while we parse the query. @@ -119,33 +125,38 @@ impl Cache { } } - if comment::has_comments(query.query(), ctx.sharding_schema.query_parser_engine)? { - // Check cache again after removing comments. - let filtered_query = comment::remove_comments( - query.query(), - ctx.sharding_schema.query_parser_engine, - Some(&[&*comment::SHARD, &*comment::SHARDING_KEY, &*comment::ROLE]), - )?; + let (maybe_shard, maybe_role, maybe_filtered_query) = + comment::parse_comment(&query, &ctx.sharding_schema)?; + + let query_to_cache: Cow<'_, str>; + if let Some(filtered_query) = maybe_filtered_query { + query_to_cache = Cow::Owned(filtered_query); + + // Check cache again after removing comments from query let mut guard = self.inner.lock(); - let ast = guard.queries.get_mut(&filtered_query).map(|entry| { + + let ast = guard.queries.get_mut(&*query_to_cache).map(|entry| { entry.stats.lock().hits += 1; entry.clone() }); if let Some(ast) = ast { guard.stats.hits += 1; - return Ok(ast); } + } else { + query_to_cache = Cow::Borrowed(query.query()); } // Parse query without holding lock. - let entry = Ast::with_context(query, ctx, prepared_statements)?; + let entry = Ast::with_context(query, ctx, prepared_statements, maybe_shard, maybe_role)?; let parse_time = entry.stats.lock().parse_time; let mut guard = self.inner.lock(); - guard.queries.put(query.query().to_string(), entry.clone()); + guard + .queries + .put(query_to_cache.into_owned(), entry.clone()); guard.stats.misses += 1; guard.stats.parse_time += parse_time; @@ -160,7 +171,10 @@ impl Cache { ctx: &AstContext<'_>, prepared_statements: &mut PreparedStatements, ) -> Result { - let mut entry = Ast::with_context(query, ctx, prepared_statements)?; + let (maybe_shard, maybe_role, _) = comment::parse_comment(&query, &ctx.sharding_schema)?; + + let mut entry = + Ast::with_context(query, ctx, prepared_statements, maybe_shard, maybe_role)?; entry.cached = false; let parse_time = entry.stats.lock().parse_time; diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs index a1a8acb8..b65968d8 100644 --- a/pgdog/src/frontend/router/parser/comment.rs +++ b/pgdog/src/frontend/router/parser/comment.rs @@ -1,4 +1,5 @@ use once_cell::sync::Lazy; +use pg_query::protobuf::ScanToken; use pg_query::scan_raw; use pg_query::{protobuf::Token, scan}; use pgdog_config::QueryParserEngine; @@ -25,23 +26,26 @@ fn get_matched_value<'a>(caps: &'a regex::Captures<'a>) -> Option<&'a str> { .map(|m| m.as_str()) } -/// Extract shard number from a comment. +/// Extract shard number from a comment. Additionally returns the entire +/// comment string if it exists. /// -/// Comment style uses the C-style comments (not SQL comments!) +/// Comment style for the shard metadata uses the C-style comments (not SQL comments!) /// as to allow the comment to appear anywhere in the query. /// /// See [`SHARD`] and [`SHARDING_KEY`] for the style of comment we expect. /// -pub fn comment( +pub fn parse_comment( query: &str, schema: &ShardingSchema, -) -> Result<(Option, Option), Error> { +) -> Result<(Option, Option, Option), Error> { let tokens = match schema.query_parser_engine { QueryParserEngine::PgQueryProtobuf => scan(query), QueryParserEngine::PgQueryRaw => scan_raw(query), } .map_err(Error::PgQuery)?; + let mut shard = None; let mut role = None; + let mut filtered_query = None; for token in tokens.tokens.iter() { if token.token == Token::CComment as i32 { @@ -58,63 +62,55 @@ pub fn comment( if let Some(cap) = SHARDING_KEY.captures(comment) { if let Some(sharding_key) = get_matched_value(&cap) { if let Some(schema) = schema.schemas.get(Some(sharding_key.into())) { - return Ok((Some(schema.shard().into()), role)); + shard = Some(schema.shard().into()); + } else { + let ctx = ContextBuilder::infer_from_from_and_config(sharding_key, schema)? + .shards(schema.shards) + .build()?; + shard = Some(ctx.apply()?); } - let ctx = ContextBuilder::infer_from_from_and_config(sharding_key, schema)? - .shards(schema.shards) - .build()?; - return Ok((Some(ctx.apply()?), role)); } } if let Some(cap) = SHARD.captures(comment) { - if let Some(shard) = cap.get(1) { - return Ok(( - Some( - shard - .as_str() - .parse::() - .ok() - .map(Shard::Direct) - .unwrap_or(Shard::All), - ), - role, - )); + if let Some(s) = cap.get(1) { + shard = Some( + s.as_str() + .parse::() + .ok() + .map(Shard::Direct) + .unwrap_or(Shard::All), + ); } } } } - Ok((None, role)) -} - -pub fn has_comments(query: &str, engine: QueryParserEngine) -> Result { - let result = match engine { - QueryParserEngine::PgQueryProtobuf => scan(query), - QueryParserEngine::PgQueryRaw => scan_raw(query), + if has_comments(&tokens.tokens) { + filtered_query = Some(remove_comments( + query, + &tokens.tokens, + Some(&[&SHARD, &*SHARDING_KEY, &ROLE]), + )?); } - .map_err(Error::PgQuery)?; - Ok(result - .tokens + Ok((shard, role, filtered_query)) +} + +pub fn has_comments(tokenized_query: &Vec) -> bool { + tokenized_query .iter() - .any(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32)) + .any(|st| st.token == Token::CComment as i32 || st.token == Token::SqlComment as i32) } pub fn remove_comments( query: &str, - engine: QueryParserEngine, + tokenized_query: &Vec, except: Option<&[&Regex]>, ) -> Result { - let result = match engine { - QueryParserEngine::PgQueryProtobuf => scan(query), - QueryParserEngine::PgQueryRaw => scan_raw(query), - } - .map_err(Error::PgQuery)?; - let mut cursor = 0; let mut out = String::with_capacity(query.len()); - for st in &result.tokens { + for st in tokenized_query { let start = st.start as usize; let end = st.end as usize;