Skip to content

Commit 50ac50b

Browse files
committed
Ignore escaped LIKE wildcards in MySQL
MySQL has a special case for escaped LIKE wildcards (%, _) appearing in string literals: the escaping is ignored, whereas normally for any other (non-special) character, the backslash would be stripped. This is to allow them to be used in LIKE patterns without double-escaping as is needed in other escaping dialects (e.g. Snowflake), like so: MySQL matching a literal _ character: ```sql SELECT * FROM users WHERE name LIKE '%\_%'; ``` Snowflake doing the same thing: ```sql SELECT * FROM users WHERE name LIKE '%\\_%'; ``` Note that in MySQL, this escaping rule does not just affect LIKE patterns, but all string literals: ``` mysql> select '\_', hex('\\'), hex('_'), hex('\_'); +----+-----------+----------+-----------+ | \_ | hex('\\') | hex('_') | hex('\_') | +----+-----------+----------+-----------+ | \_ | 5C | 5F | 5C5F | +----+-----------+----------+-----------+ 1 row in set (0.00 sec) ``` This is implemented with a new flag onq dialect which gets passed into the tokenizer, because I don't know if any other dialects have similar behavior and wanted to make it easy to add them if they do. I can't currently test Snowflake, BigQuery, or Clickhouse on this point, so I'm just going off my best guess based on docs and examples. [MySQL docs](https://dev.mysql.com/doc/refman/8.4/en/string-literals.html)
1 parent 648efd7 commit 50ac50b

File tree

5 files changed

+101
-15
lines changed

5 files changed

+101
-15
lines changed

src/dialect/mod.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,33 @@ pub trait Dialect: Debug + Any {
201201
false
202202
}
203203

204+
/// Determine whether the dialect strips the backslash when escaping LIKE wildcards (%, _).
205+
///
206+
/// [MySQL] has a special case when escaping single quoted strings which leaves these unescaped
207+
/// so they can be used in LIKE patterns without double-escaping (as is necessary in other
208+
/// escaping dialects, such as [Snowflake]). Generally, special characters have escaping rules
209+
/// causing them to be replaced with a different byte sequences (e.g. `'\0'` becoming the zero
210+
/// byte), and the default if an escaped character does not have a specific escaping rule is to
211+
/// strip the backslash (e.g. there is no rule for `h`, so `'\h' = 'h'`). MySQL's special case
212+
/// for ignoring LIKE wildcard escapes is to *not* strip the backslash, so that `'\%' = '\\%'`.
213+
/// This applies to all string literals though, not just those used in LIKE patterns.
214+
///
215+
/// ```text
216+
/// mysql> select '\_', hex('\\'), hex('_'), hex('\_');
217+
/// +----+-----------+----------+-----------+
218+
/// | \_ | hex('\\') | hex('_') | hex('\_') |
219+
/// +----+-----------+----------+-----------+
220+
/// | \_ | 5C | 5F | 5C5F |
221+
/// +----+-----------+----------+-----------+
222+
/// 1 row in set (0.00 sec)
223+
/// ```
224+
///
225+
/// [MySQL]: https://dev.mysql.com/doc/refman/8.4/en/string-literals.html
226+
/// [Snowflake]: https://docs.snowflake.com/en/sql-reference/functions/like#usage-notes
227+
fn ignores_like_wildcard_escapes(&self) -> bool {
228+
false
229+
}
230+
204231
/// Determine if the dialect supports string literals with `U&` prefix.
205232
/// This is used to specify Unicode code points in string literals.
206233
/// For example, in PostgreSQL, the following is a valid string literal:

src/dialect/mysql.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ impl Dialect for MySqlDialect {
6262
true
6363
}
6464

65+
fn ignores_like_wildcard_escapes(&self) -> bool {
66+
true
67+
}
68+
6569
fn supports_numeric_prefix(&self) -> bool {
6670
true
6771
}

src/tokenizer.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,9 @@ pub struct Tokenizer<'a> {
807807
/// If true (the default), the tokenizer will un-escape literal
808808
/// SQL strings See [`Tokenizer::with_unescape`] for more details.
809809
unescape: bool,
810+
/// If true, the tokenizer will not escape % and _, for use in in LIKE patterns. See
811+
/// [`Dialect::ignores_like_wildcard_escapes`] for more details.
812+
ignore_like_wildcard_escapes: bool,
810813
}
811814

812815
impl<'a> Tokenizer<'a> {
@@ -831,6 +834,7 @@ impl<'a> Tokenizer<'a> {
831834
dialect,
832835
query,
833836
unescape: true,
837+
ignore_like_wildcard_escapes: dialect.ignores_like_wildcard_escapes(),
834838
}
835839
}
836840

@@ -869,6 +873,13 @@ impl<'a> Tokenizer<'a> {
869873
self
870874
}
871875

876+
/// If true, the tokenizer will ignore escapes of LIKE wildcards. See
877+
/// [`Dialect::ignores_like_wildcard_escapes`] for more details.
878+
pub fn with_ignore_like_wildcard_escapes(mut self, ignore_like_wildcard_escapes: bool) -> Self {
879+
self.ignore_like_wildcard_escapes = ignore_like_wildcard_escapes;
880+
self
881+
}
882+
872883
/// Tokenize the statement and produce a vector of tokens
873884
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
874885
let twl = self.tokenize_with_location()?;
@@ -2011,8 +2022,12 @@ impl<'a> Tokenizer<'a> {
20112022
num_consecutive_quotes = 0;
20122023

20132024
if let Some(next) = chars.peek() {
2014-
if !self.unescape {
2015-
// In no-escape mode, the given query has to be saved completely including backslashes.
2025+
if !self.unescape
2026+
|| (self.ignore_like_wildcard_escapes && (*next == '%' || *next == '_'))
2027+
{
2028+
// In no-escape mode, the given query has to be saved completely
2029+
// including backslashes. Similarly, with ignore_like_wildcard_escapes,
2030+
// the backslash is not stripped.
20162031
s.push(ch);
20172032
s.push(*next);
20182033
chars.next(); // consume next
@@ -3585,6 +3600,9 @@ mod tests {
35853600
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
35863601
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
35873602
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3603+
(r#"'\q'"#, r#"\q"#, r#"q"#),
3604+
(r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3605+
(r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
35883606
] {
35893607
let tokens = Tokenizer::new(&dialect, sql)
35903608
.with_unescape(false)
@@ -3618,6 +3636,16 @@ mod tests {
36183636

36193637
compare(expected, tokens);
36203638
}
3639+
3640+
// MySQL special case for LIKE escapes
3641+
for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3642+
let dialect = MySqlDialect {};
3643+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3644+
3645+
let expected = vec![Token::SingleQuotedString(expected.to_string())];
3646+
3647+
compare(expected, tokens);
3648+
}
36213649
}
36223650

36233651
#[test]

tests/sqlparser_common.rs

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10387,15 +10387,8 @@ fn parse_with_recursion_limit() {
1038710387

1038810388
#[test]
1038910389
fn parse_escaped_string_with_unescape() {
10390-
fn assert_mysql_query_value(sql: &str, quoted: &str) {
10391-
let stmt = TestedDialects::new(vec![
10392-
Box::new(MySqlDialect {}),
10393-
Box::new(BigQueryDialect {}),
10394-
Box::new(SnowflakeDialect {}),
10395-
])
10396-
.one_statement_parses_to(sql, "");
10397-
10398-
match stmt {
10390+
fn assert_mysql_query_value(dialects: &TestedDialects, sql: &str, quoted: &str) {
10391+
match dialects.one_statement_parses_to(sql, "") {
1039910392
Statement::Query(query) => match *query.body {
1040010393
SetExpr::Select(value) => {
1040110394
let expr = expr_from_projection(only(&value.projection));
@@ -10411,17 +10404,40 @@ fn parse_escaped_string_with_unescape() {
1041110404
_ => unreachable!(),
1041210405
};
1041310406
}
10407+
10408+
let escaping_dialects =
10409+
&all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape());
10410+
let no_wildcard_exception = &all_dialects_where(|dialect| {
10411+
dialect.supports_string_literal_backslash_escape()
10412+
&& !dialect.ignores_like_wildcard_escapes()
10413+
});
10414+
let with_wildcard_exception = &all_dialects_where(|dialect| {
10415+
dialect.supports_string_literal_backslash_escape()
10416+
&& dialect.ignores_like_wildcard_escapes()
10417+
});
10418+
1041410419
let sql = r"SELECT 'I\'m fine'";
10415-
assert_mysql_query_value(sql, "I'm fine");
10420+
assert_mysql_query_value(escaping_dialects, sql, "I'm fine");
1041610421

1041710422
let sql = r#"SELECT 'I''m fine'"#;
10418-
assert_mysql_query_value(sql, "I'm fine");
10423+
assert_mysql_query_value(escaping_dialects, sql, "I'm fine");
1041910424

1042010425
let sql = r#"SELECT 'I\"m fine'"#;
10421-
assert_mysql_query_value(sql, "I\"m fine");
10426+
assert_mysql_query_value(escaping_dialects, sql, "I\"m fine");
1042210427

1042310428
let sql = r"SELECT 'Testing: \0 \\ \% \_ \b \n \r \t \Z \a \h \ '";
10424-
assert_mysql_query_value(sql, "Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h ");
10429+
assert_mysql_query_value(
10430+
no_wildcard_exception,
10431+
sql,
10432+
"Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h ",
10433+
);
10434+
10435+
// check MySQL doesn't remove backslash from escaped LIKE wildcards
10436+
assert_mysql_query_value(
10437+
with_wildcard_exception,
10438+
sql,
10439+
"Testing: \0 \\ \\% \\_ \u{8} \n \r \t \u{1a} \u{7} h ",
10440+
);
1042510441
}
1042610442

1042710443
#[test]

tests/sqlparser_mysql.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2566,6 +2566,17 @@ fn parse_rlike_and_regexp() {
25662566
}
25672567
}
25682568

2569+
#[test]
2570+
fn parse_like_with_escape() {
2571+
// verify backslash is not stripped for escaped wildcards
2572+
mysql().verified_only_select(r#"SELECT 'a\%c' LIKE 'a\%c'"#);
2573+
mysql().verified_only_select(r#"SELECT 'a\_c' LIKE 'a\_c'"#);
2574+
mysql().verified_only_select(r#"SELECT '%\_\%' LIKE '%\_\%'"#);
2575+
mysql().verified_only_select(r#"SELECT '\_\%' LIKE CONCAT('\_', '\%')"#);
2576+
mysql().verified_only_select(r#"SELECT 'a%c' LIKE 'a$%c' ESCAPE '$'"#);
2577+
mysql().verified_only_select(r#"SELECT 'a_c' LIKE 'a#_c' ESCAPE '#'"#);
2578+
}
2579+
25692580
#[test]
25702581
fn parse_kill() {
25712582
let stmt = mysql_and_generic().verified_stmt("KILL CONNECTION 5");

0 commit comments

Comments
 (0)