Skip to content

Commit 488e8a8

Browse files
mskrzypkows (Maciej Skrzypkowski) and alamb
authored
Support MySQL Character Set Introducers (#788)
* MySQL Character Set Introducers
* Documentation fix
* Parsing string introducer from Token::word
* Fixed lint
* fix clippy

---------

Co-authored-by: Maciej Skrzypkowski <maciej.skrzypkowski@satoricyber.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent b31ede7 commit 488e8a8

File tree

4 files changed

+77
-5
lines changed

4 files changed

+77
-5
lines changed

src/ast/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,8 @@ pub enum Expr {
437437
Nested(Box<Expr>),
438438
/// A literal value, such as string, number, date or NULL
439439
Value(Value),
440+
/// <https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html>
441+
IntroducedString { introducer: String, value: Value },
440442
/// A constant of form `<data_type> 'value'`.
441443
/// This can represent ANSI SQL `DATE`, `TIME`, and `TIMESTAMP` literals (such as `DATE '2020-01-01'`),
442444
/// as well as constants of other types (a non-standard PostgreSQL extension).
@@ -696,6 +698,7 @@ impl fmt::Display for Expr {
696698
Expr::Collate { expr, collation } => write!(f, "{expr} COLLATE {collation}"),
697699
Expr::Nested(ast) => write!(f, "({ast})"),
698700
Expr::Value(v) => write!(f, "{v}"),
701+
Expr::IntroducedString { introducer, value } => write!(f, "{introducer} {value}"),
699702
Expr::TypedString { data_type, value } => {
700703
write!(f, "{data_type}")?;
701704
write!(f, " '{}'", &value::escape_single_quote_string(value))

src/parser.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,17 @@ impl<'a> Parser<'a> {
734734
Ok(Expr::CompoundIdentifier(id_parts))
735735
}
736736
}
737+
// string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
738+
Token::SingleQuotedString(_)
739+
| Token::DoubleQuotedString(_)
740+
| Token::HexStringLiteral(_)
741+
if w.value.starts_with('_') =>
742+
{
743+
Ok(Expr::IntroducedString {
744+
introducer: w.value,
745+
value: self.parse_introduced_string_value()?,
746+
})
747+
}
737748
_ => Ok(Expr::Identifier(w.to_ident())),
738749
},
739750
}, // End of Token::Word
@@ -784,7 +795,6 @@ impl<'a> Parser<'a> {
784795
self.prev_token();
785796
Ok(Expr::Value(self.parse_value()?))
786797
}
787-
788798
Token::LParen => {
789799
let expr =
790800
if self.parse_keyword(Keyword::SELECT) || self.parse_keyword(Keyword::WITH) {
@@ -4142,6 +4152,23 @@ impl<'a> Parser<'a> {
41424152
}
41434153
}
41444154

4155+
/// Parse the string literal that follows a MySQL character set introducer
/// (for example the `'abc'` in `_utf8 'abc'`).
///
/// Consumes the next token and converts it into the matching [`Value`]:
/// a single-quoted string, a double-quoted string, or a hex string literal.
/// Any other token is handed to `self.expected` together with its original
/// location (presumably producing a `ParserError` — confirm against
/// `Parser::expected`).
fn parse_introduced_string_value(&mut self) -> Result<Value, ParserError> {
4156+
let next_token = self.next_token();
4157+
// Keep the location so the token can be rebuilt for the error path below.
let location = next_token.location;
4158+
match next_token.token {
4159+
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
4160+
Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())),
4161+
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
4162+
// Not a string-like token: report it along with where it was found.
unexpected => self.expected(
4163+
"a string value",
4164+
TokenWithLocation {
4165+
token: unexpected,
4166+
location,
4167+
},
4168+
),
4169+
}
4170+
}
4171+
41454172
/// Parse an unsigned literal integer/long
41464173
pub fn parse_literal_uint(&mut self) -> Result<u64, ParserError> {
41474174
let next_token = self.next_token();

src/tokenizer.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -546,12 +546,12 @@ impl<'a> Tokenizer<'a> {
546546
// identifier or keyword
547547
ch if self.dialect.is_identifier_start(ch) => {
548548
chars.next(); // consume the first char
549-
let s = self.tokenize_word(ch, chars);
549+
let word = self.tokenize_word(ch, chars);
550550

551551
// TODO: implement parsing of exponent here
552-
if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
552+
if word.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
553553
let mut inner_state = State {
554-
peekable: s.chars().peekable(),
554+
peekable: word.chars().peekable(),
555555
line: 0,
556556
col: 0,
557557
};
@@ -562,7 +562,8 @@ impl<'a> Tokenizer<'a> {
562562
s += s2.as_str();
563563
return Ok(Some(Token::Number(s, false)));
564564
}
565-
Ok(Some(Token::make_word(&s, None)))
565+
566+
Ok(Some(Token::make_word(&word, None)))
566567
}
567568
// single quoted string
568569
'\'' => {

tests/sqlparser_mysql.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,3 +1264,44 @@ fn parse_values() {
12641264
mysql().verified_stmt("VALUES ROW(1, true, 'a')");
12651265
mysql().verified_stmt("SELECT a, c FROM (VALUES ROW(1, true, 'a'), ROW(2, false, 'b'), ROW(3, false, 'c')) AS t (a, b, c)");
12661266
}
1267+
1268+
// Checks that a hex string literal preceded by the `_latin1` introducer
// parses into `Expr::IntroducedString`, carrying the introducer text and a
// `Value::HexStringLiteral`, as the only projection of a plain `SELECT`.
#[test]
1269+
fn parse_hex_string_introducer() {
1270+
assert_eq!(
1271+
// NOTE(review): `verified_stmt` presumably also checks that the SQL
// round-trips unchanged — confirm against the test utilities.
mysql().verified_stmt("SELECT _latin1 X'4D7953514C'"),
1272+
// Expected AST: every other query/select field is empty or `None`.
Statement::Query(Box::new(Query {
1273+
with: None,
1274+
body: Box::new(SetExpr::Select(Box::new(Select {
1275+
distinct: false,
1276+
top: None,
1277+
projection: vec![SelectItem::UnnamedExpr(Expr::IntroducedString {
1278+
introducer: "_latin1".to_string(),
1279+
value: Value::HexStringLiteral("4D7953514C".to_string())
1280+
})],
1281+
from: vec![],
1282+
lateral_views: vec![],
1283+
selection: None,
1284+
group_by: vec![],
1285+
cluster_by: vec![],
1286+
distribute_by: vec![],
1287+
sort_by: vec![],
1288+
having: None,
1289+
qualify: None,
1290+
into: None
1291+
}))),
1292+
order_by: vec![],
1293+
limit: None,
1294+
offset: None,
1295+
fetch: None,
1296+
locks: vec![],
1297+
}))
1298+
)
1299+
}
1300+
1301+
#[test]
1302+
fn parse_string_introducers() {
1303+
mysql().verified_stmt("SELECT _binary 'abc'");
1304+
mysql().one_statement_parses_to("SELECT _utf8'abc'", "SELECT _utf8 'abc'");
1305+
mysql().one_statement_parses_to("SELECT _utf8mb4'abc'", "SELECT _utf8mb4 'abc'");
1306+
mysql().verified_stmt("SELECT _binary 'abc', _utf8mb4 'abc'");
1307+
}

0 commit comments

Comments
 (0)