From 4d712e3563d076980c849d6104b6079aac471be0 Mon Sep 17 00:00:00 2001 From: Michael Victor Zink Date: Thu, 20 Feb 2025 14:55:54 -0800 Subject: [PATCH] Ignore escaped LIKE wildcards in MySQL MySQL has a special case for escaped LIKE wildcards (%, _) appearing in string literals: the escaping is ignored, whereas normally for any other (non-special) character, the backslash would be stripped. This is to allow them to be used in LIKE patterns without double-escaping as is needed in other escaping dialects (e.g. Snowflake), like so: MySQL matching a literal _ character: ```sql SELECT * FROM users WHERE name LIKE '%\_%'; ``` Snowflake doing the same thing: ```sql SELECT * FROM users WHERE name LIKE '%\\_%'; ``` Note that in MySQL, this escaping rule does not just affect LIKE patterns, but all string literals: ``` mysql> select '\_', hex('\\'), hex('_'), hex('\_'); +----+-----------+----------+-----------+ | \_ | hex('\\') | hex('_') | hex('\_') | +----+-----------+----------+-----------+ | \_ | 5C | 5F | 5C5F | +----+-----------+----------+-----------+ 1 row in set (0.00 sec) ``` This is implemented with a new flag on the dialect which gets passed into the tokenizer, because I don't know if any other dialects have similar behavior and wanted to make it easy to add them if they do. I can't currently test Snowflake, BigQuery, or Clickhouse on this point, so I'm just going off my best guess based on docs and examples. 
[MySQL docs](https://dev.mysql.com/doc/refman/8.4/en/string-literals.html) --- src/dialect/mod.rs | 27 ++++++++++++++++++++++++++ src/dialect/mysql.rs | 4 ++++ src/tokenizer.rs | 22 +++++++++++++++++++-- tests/sqlparser_common.rs | 40 ++++++++++++++++++++++++++------------- tests/sqlparser_mysql.rs | 11 +++++++++++ 5 files changed, 89 insertions(+), 15 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 1c32bc513..1cea6bc2b 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -201,6 +201,33 @@ pub trait Dialect: Debug + Any { false } + /// Determine whether the dialect keeps the backslash when escaping LIKE wildcards (%, _). + /// + /// [MySQL] has a special case when escaping single quoted strings which leaves these unescaped + /// so they can be used in LIKE patterns without double-escaping (as is necessary in other + /// escaping dialects, such as [Snowflake]). Generally, special characters have escaping rules + /// causing them to be replaced with a different byte sequence (e.g. `'\0'` becoming the zero + /// byte), and the default if an escaped character does not have a specific escaping rule is to + /// strip the backslash (e.g. there is no rule for `h`, so `'\h' = 'h'`). MySQL's special case + /// for ignoring LIKE wildcard escapes is to *not* strip the backslash, so that `'\%' = '\\%'`. + /// This applies to all string literals though, not just those used in LIKE patterns. 
+ /// + /// ```text + /// mysql> select '\_', hex('\\'), hex('_'), hex('\_'); + /// +----+-----------+----------+-----------+ + /// | \_ | hex('\\') | hex('_') | hex('\_') | + /// +----+-----------+----------+-----------+ + /// | \_ | 5C | 5F | 5C5F | + /// +----+-----------+----------+-----------+ + /// 1 row in set (0.00 sec) + /// ``` + /// + /// [MySQL]: https://dev.mysql.com/doc/refman/8.4/en/string-literals.html + /// [Snowflake]: https://docs.snowflake.com/en/sql-reference/functions/like#usage-notes + fn ignores_wildcard_escapes(&self) -> bool { + false + } + /// Determine if the dialect supports string literals with `U&` prefix. /// This is used to specify Unicode code points in string literals. /// For example, in PostgreSQL, the following is a valid string literal: diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index 8a0da87e4..cb86f2b47 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -62,6 +62,10 @@ impl Dialect for MySqlDialect { true } + fn ignores_wildcard_escapes(&self) -> bool { + true + } + fn supports_numeric_prefix(&self) -> bool { true } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index bc0f0efeb..d33a7d8af 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2011,8 +2011,13 @@ impl<'a> Tokenizer<'a> { num_consecutive_quotes = 0; if let Some(next) = chars.peek() { - if !self.unescape { - // In no-escape mode, the given query has to be saved completely including backslashes. + if !self.unescape + || (self.dialect.ignores_wildcard_escapes() + && (*next == '%' || *next == '_')) + { + // In no-escape mode, the given query has to be saved completely + // including backslashes. Similarly, with ignores_wildcard_escapes, + // the backslash is not stripped. 
s.push(ch); s.push(*next); chars.next(); // consume next @@ -3585,6 +3590,9 @@ mod tests { (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#), (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#), (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#), + (r#"'\q'"#, r#"\q"#, r#"q"#), + (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#), + (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#), ] { let tokens = Tokenizer::new(&dialect, sql) .with_unescape(false) @@ -3618,6 +3626,16 @@ mod tests { compare(expected, tokens); } + + // MySQL special case for LIKE escapes + for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] { + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); + + let expected = vec![Token::SingleQuotedString(expected.to_string())]; + + compare(expected, tokens); + } } #[test] diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 0a68d31e8..3c43ed61f 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -10387,15 +10387,8 @@ fn parse_with_recursion_limit() { #[test] fn parse_escaped_string_with_unescape() { - fn assert_mysql_query_value(sql: &str, quoted: &str) { - let stmt = TestedDialects::new(vec![ - Box::new(MySqlDialect {}), - Box::new(BigQueryDialect {}), - Box::new(SnowflakeDialect {}), - ]) - .one_statement_parses_to(sql, ""); - - match stmt { + fn assert_mysql_query_value(dialects: &TestedDialects, sql: &str, quoted: &str) { + match dialects.one_statement_parses_to(sql, "") { Statement::Query(query) => match *query.body { SetExpr::Select(value) => { let expr = expr_from_projection(only(&value.projection)); @@ -10411,17 +10404,38 @@ fn parse_escaped_string_with_unescape() { _ => unreachable!(), }; } + + let escaping_dialects = + &all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape()); + let no_wildcard_exception = &all_dialects_where(|dialect| { + dialect.supports_string_literal_backslash_escape() && !dialect.ignores_wildcard_escapes() + }); + let with_wildcard_exception = 
&all_dialects_where(|dialect| { + dialect.supports_string_literal_backslash_escape() && dialect.ignores_wildcard_escapes() + }); + let sql = r"SELECT 'I\'m fine'"; - assert_mysql_query_value(sql, "I'm fine"); + assert_mysql_query_value(escaping_dialects, sql, "I'm fine"); let sql = r#"SELECT 'I''m fine'"#; - assert_mysql_query_value(sql, "I'm fine"); + assert_mysql_query_value(escaping_dialects, sql, "I'm fine"); let sql = r#"SELECT 'I\"m fine'"#; - assert_mysql_query_value(sql, "I\"m fine"); + assert_mysql_query_value(escaping_dialects, sql, "I\"m fine"); let sql = r"SELECT 'Testing: \0 \\ \% \_ \b \n \r \t \Z \a \h \ '"; - assert_mysql_query_value(sql, "Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h "); + assert_mysql_query_value( + no_wildcard_exception, + sql, + "Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h ", + ); + + // check MySQL doesn't remove backslash from escaped LIKE wildcards + assert_mysql_query_value( + with_wildcard_exception, + sql, + "Testing: \0 \\ \\% \\_ \u{8} \n \r \t \u{1a} \u{7} h ", + ); } #[test] diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 15f79b4c2..f0774fcf5 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -2627,6 +2627,17 @@ fn parse_rlike_and_regexp() { } } +#[test] +fn parse_like_with_escape() { + // verify backslash is not stripped for escaped wildcards + mysql().verified_only_select(r#"SELECT 'a\%c' LIKE 'a\%c'"#); + mysql().verified_only_select(r#"SELECT 'a\_c' LIKE 'a\_c'"#); + mysql().verified_only_select(r#"SELECT '%\_\%' LIKE '%\_\%'"#); + mysql().verified_only_select(r#"SELECT '\_\%' LIKE CONCAT('\_', '\%')"#); + mysql().verified_only_select(r#"SELECT 'a%c' LIKE 'a$%c' ESCAPE '$'"#); + mysql().verified_only_select(r#"SELECT 'a_c' LIKE 'a#_c' ESCAPE '#'"#); +} + #[test] fn parse_kill() { let stmt = mysql_and_generic().verified_stmt("KILL CONNECTION 5");