Skip to content

Commit c75f11b

Browse files
Extended placeholder syntax test and moved check in tokenizer
1 parent 52338d6 commit c75f11b

File tree

3 files changed: +29 additions, −4 deletions

src/parser/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -18475,7 +18475,7 @@ mod tests {
 
     #[test]
     fn test_placeholder_invalid_whitespace() {
-        for w in [" ", "/*invalid*/"] {
+        for w in [" ", "  ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] {
             let sql = format!("\nSELECT\n :{w}fooBar");
             assert!(Parser::parse_sql(&GenericDialect, &sql).is_err());
         }

src/tokenizer.rs

Lines changed: 24 additions & 2 deletions
@@ -908,6 +908,22 @@ impl<'a> Tokenizer<'a> {
         Ok(Some(Token::make_word(&word, None)))
     }
 
+    /// Returns a standardized error if the previous token is a `:` and
+    /// the method is expected to be called when a space is found after it.
+    fn handle_colon_space_error(
+        &self,
+        chars: &State,
+        prev_token: Option<&Token>,
+    ) -> Result<Option<Token>, TokenizerError> {
+        if let Some(Token::Colon) = prev_token {
+            return Err(TokenizerError {
+                message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?".to_string(),
+                location: chars.location(),
+            });
+        }
+        Ok(None)
+    }
+
     /// Get the next token or return None
     fn next_token(
         &self,
@@ -919,6 +935,7 @@ impl<'a> Tokenizer<'a> {
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' | '\t' | '\n' | '\r' => {
+                    self.handle_colon_space_error(chars, prev_token)?;
                     chars.next(); // consume
                     *location = chars.location();
                     self.next_token(location, chars, prev_token, true)
@@ -1166,7 +1183,7 @@ impl<'a> Tokenizer<'a> {
                 // if the prev token is not a word, then this is not a valid sql
                 // word or number.
                 if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
-                    if let Some(Token::Word(_)) = prev_token {
+                    if !preceded_by_whitespace {
                         chars.next();
                         return Ok(Some(Token::Period));
                     }
@@ -1210,7 +1227,7 @@ impl<'a> Tokenizer<'a> {
                 // we should yield the dot as a dedicated token so compound identifiers
                 // starting with digits can be parsed correctly.
                 if s == "." && self.dialect.supports_numeric_prefix() {
-                    if let Some(Token::Word(_)) = prev_token {
+                    if !preceded_by_whitespace {
                         return Ok(Some(Token::Period));
                     }
                 }
@@ -1300,6 +1317,7 @@ impl<'a> Tokenizer<'a> {
                     }
 
                     if is_comment {
+                        self.handle_colon_space_error(chars, prev_token)?;
                         chars.next(); // consume second '-'
                         // Consume the rest of the line as comment
                         let _comment = self.tokenize_single_line_comment(chars);
@@ -1324,12 +1342,14 @@ impl<'a> Tokenizer<'a> {
                     chars.next(); // consume the '/'
                     match chars.peek() {
                         Some('*') => {
+                            self.handle_colon_space_error(chars, prev_token)?;
                             chars.next(); // consume the '*', starting a multi-line comment
                             let _comment = self.consume_multiline_comment(chars)?;
                             *location = chars.location();
                             self.next_token(location, chars, prev_token, true)
                         }
                         Some('/') if dialect_of!(self is SnowflakeDialect) => {
+                            self.handle_colon_space_error(chars, prev_token)?;
                             chars.next(); // consume the second '/', starting a snowflake single-line comment
                             // Consume the rest of the line as comment
                             let _comment = self.tokenize_single_line_comment(chars);
@@ -1534,6 +1554,7 @@ impl<'a> Tokenizer<'a> {
             '}' => self.consume_and_return(chars, Token::RBrace),
             '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
             {
+                self.handle_colon_space_error(chars, prev_token)?;
                 chars.next(); // consume the '#', starting a snowflake single-line comment
                 // Consume the rest of the line as comment
                 let _comment = self.tokenize_single_line_comment(chars);
@@ -1668,6 +1689,7 @@ impl<'a> Tokenizer<'a> {
 
             // whitespace check (including unicode chars) should be last as it covers some of the chars above
             ch if ch.is_whitespace() => {
+                self.handle_colon_space_error(chars, prev_token)?;
                 chars.next(); // consume
                 *location = chars.location();
                 self.next_token(location, chars, prev_token, true)

tests/sqlparser_bigquery.rs

Lines changed: 4 additions & 1 deletion
@@ -1567,7 +1567,10 @@ fn parse_table_identifiers() {
 
     fn test_table_ident_err(ident: &str) {
         let sql = format!("SELECT 1 FROM {ident}");
-        assert!(bigquery().parse_sql_statements(&sql).is_err());
+        assert!(
+            bigquery().parse_sql_statements(&sql).is_err(),
+            "Expected error parsing identifier: `{ident}`, within SQL: `{sql}`"
+        );
     }
 
     test_table_ident("`spa ce`", None, vec![Ident::with_quote('`', "spa ce")]);

0 commit comments

Comments (0)