Skip to content

Commit 1c5c90a

Browse files
committed
Code review comments
1 parent 73120f7 commit 1c5c90a

File tree

1 file changed

+254
-14
lines changed

1 file changed

+254
-14
lines changed

src/parser/mod.rs

Lines changed: 254 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,8 @@ impl<'a> Parser<'a> {
10091009
Ok(Statement::NOTIFY { channel, payload })
10101010
}
10111011

1012+
// Tries to parse an expression by matching the specified word to known keywords that have a special meaning in the dialect.
1013+
// Returns `Ok(None)` if no match is found.
10121014
fn parse_expr_prefix_by_reserved_word(
10131015
&mut self,
10141016
w: &Word,
@@ -1115,7 +1117,8 @@ impl<'a> Parser<'a> {
11151117
}
11161118
}
11171119

1118-
fn parse_expr_prefix_by_nonreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
1120+
// Tries to parse an expression by a word that is not known to have a special meaning in the dialect.
1121+
fn parse_expr_prefix_by_unnreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
11191122
match self.peek_token().token {
11201123
Token::LParen | Token::Period => {
11211124
let mut id_parts: Vec<Ident> = vec![w.to_ident()];
@@ -1229,27 +1232,252 @@ impl<'a> Parser<'a> {
12291232
return Ok(expr);
12301233
}
12311234

1235+
let next_token = self.next_token();
1236+
let expr = match next_token.token {
1237+
Token::Word(w) => {
1238+
// The word we consumed may fall into one of two cases: it has a special meaning, or not.
1239+
// For example, in Snowflake, the word `interval` may have two meanings depending on the context:
1240+
// `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM tbl;`
1241+
// ^^^^^^^^^^^^^^^^ ^^^^^^^^
1242+
// interval expression identifier
1243+
//
1244+
// We first try to parse the word and following tokens as a special expression, and if that fails,
1245+
// we rollback and try to parse it as an identifier.
1246+
match self
1247+
.maybe_parse_internal(|parser| parser.parse_expr_prefix_by_reserved_word(&w))
1248+
{
1249+
// This word indicated an expression prefix and parsing was successful
1250+
Ok(Some(expr)) => Ok(expr),
1251+
1252+
// No expression prefix associated with this word
1253+
Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
1254+
1255+
// If parsing of the word as a special expression failed, we are facing two options:
1256+
// 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
1257+
// 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
1258+
// We first try to parse the word as an identifier and if that fails
1259+
// we rollback and return the parsing error we got from trying to parse a
1260+
// special expression (to maintain backwards compatibility of parsing errors).
1261+
Err(e) => {
1262+
if !self.dialect.is_reserved_for_identifier(w.keyword) {
1263+
if let Ok(expr) = self.maybe_parse_internal(|parser| {
1264+
parser.parse_expr_prefix_by_unnreserved_word(&w)
1265+
}) {
1266+
return Ok(expr);
1267+
}
1268+
}
1269+
return Err(e);
1270+
}
1271+
}
1272+
} // End of Token::Word
1273+
// array `[1, 2, 3]`
1274+
Token::LBracket => self.parse_array_expr(false),
1275+
tok @ Token::Minus | tok @ Token::Plus => {
1276+
let op = if tok == Token::Plus {
1277+
UnaryOperator::Plus
1278+
} else {
1279+
UnaryOperator::Minus
1280+
};
1281+
Ok(Expr::UnaryOp {
1282+
op,
1283+
expr: Box::new(
1284+
self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?,
1285+
),
1286+
})
1287+
}
1288+
Token::ExclamationMark if self.dialect.supports_bang_not_operator() => {
1289+
Ok(Expr::UnaryOp {
1290+
op: UnaryOperator::BangNot,
1291+
expr: Box::new(
1292+
self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?,
1293+
),
1294+
})
1295+
}
1296+
tok @ Token::DoubleExclamationMark
1297+
| tok @ Token::PGSquareRoot
1298+
| tok @ Token::PGCubeRoot
1299+
| tok @ Token::AtSign
1300+
| tok @ Token::Tilde
1301+
if dialect_of!(self is PostgreSqlDialect) =>
1302+
{
1303+
let op = match tok {
1304+
Token::DoubleExclamationMark => UnaryOperator::PGPrefixFactorial,
1305+
Token::PGSquareRoot => UnaryOperator::PGSquareRoot,
1306+
Token::PGCubeRoot => UnaryOperator::PGCubeRoot,
1307+
Token::AtSign => UnaryOperator::PGAbs,
1308+
Token::Tilde => UnaryOperator::PGBitwiseNot,
1309+
_ => unreachable!(),
1310+
};
1311+
Ok(Expr::UnaryOp {
1312+
op,
1313+
expr: Box::new(
1314+
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?,
1315+
),
1316+
})
1317+
}
1318+
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
1319+
{
1320+
self.prev_token();
1321+
Ok(Expr::Value(self.parse_value()?))
1322+
}
1323+
Token::UnicodeStringLiteral(_) => {
1324+
self.prev_token();
1325+
Ok(Expr::Value(self.parse_value()?))
1326+
}
1327+
Token::Number(_, _)
1328+
| Token::SingleQuotedString(_)
1329+
| Token::DoubleQuotedString(_)
1330+
| Token::TripleSingleQuotedString(_)
1331+
| Token::TripleDoubleQuotedString(_)
1332+
| Token::DollarQuotedString(_)
1333+
| Token::SingleQuotedByteStringLiteral(_)
1334+
| Token::DoubleQuotedByteStringLiteral(_)
1335+
| Token::TripleSingleQuotedByteStringLiteral(_)
1336+
| Token::TripleDoubleQuotedByteStringLiteral(_)
1337+
| Token::SingleQuotedRawStringLiteral(_)
1338+
| Token::DoubleQuotedRawStringLiteral(_)
1339+
| Token::TripleSingleQuotedRawStringLiteral(_)
1340+
| Token::TripleDoubleQuotedRawStringLiteral(_)
1341+
| Token::NationalStringLiteral(_)
1342+
| Token::HexStringLiteral(_) => {
1343+
self.prev_token();
1344+
Ok(Expr::Value(self.parse_value()?))
1345+
}
1346+
Token::LParen => {
1347+
let expr = if let Some(expr) = self.try_parse_expr_sub_query()? {
1348+
expr
1349+
} else if let Some(lambda) = self.try_parse_lambda()? {
1350+
return Ok(lambda);
1351+
} else {
1352+
let exprs = self.parse_comma_separated(Parser::parse_expr)?;
1353+
match exprs.len() {
1354+
0 => unreachable!(), // parse_comma_separated ensures 1 or more
1355+
1 => Expr::Nested(Box::new(exprs.into_iter().next().unwrap())),
1356+
_ => Expr::Tuple(exprs),
1357+
}
1358+
};
1359+
self.expect_token(&Token::RParen)?;
1360+
let expr = self.try_parse_method(expr)?;
1361+
if !self.consume_token(&Token::Period) {
1362+
Ok(expr)
1363+
} else {
1364+
let tok = self.next_token();
1365+
let key = match tok.token {
1366+
Token::Word(word) => word.to_ident(),
1367+
_ => {
1368+
return parser_err!(
1369+
format!("Expected identifier, found: {tok}"),
1370+
tok.location
1371+
)
1372+
}
1373+
};
1374+
Ok(Expr::CompositeAccess {
1375+
expr: Box::new(expr),
1376+
key,
1377+
})
1378+
}
1379+
}
1380+
Token::Placeholder(_) | Token::Colon | Token::AtSign => {
1381+
self.prev_token();
1382+
Ok(Expr::Value(self.parse_value()?))
1383+
}
1384+
Token::LBrace if self.dialect.supports_dictionary_syntax() => {
1385+
self.prev_token();
1386+
self.parse_duckdb_struct_literal()
1387+
}
1388+
_ => self.expected("an expression", next_token),
1389+
}?;
1390+
1391+
let expr = self.try_parse_method(expr)?;
1392+
1393+
if self.parse_keyword(Keyword::COLLATE) {
1394+
Ok(Expr::Collate {
1395+
expr: Box::new(expr),
1396+
collation: self.parse_object_name(false)?,
1397+
})
1398+
} else {
1399+
Ok(expr)
1400+
}
1401+
}
1402+
1403+
/// Parse an expression prefix.
1404+
pub fn parse_prefix2(&mut self) -> Result<Expr, ParserError> {
1405+
// allow the dialect to override prefix parsing
1406+
if let Some(prefix) = self.dialect.parse_prefix(self) {
1407+
return prefix;
1408+
}
1409+
1410+
// PostgreSQL allows any string literal to be preceded by a type name, indicating that the
1411+
// string literal represents a literal of that type. Some examples:
1412+
//
1413+
// DATE '2020-05-20'
1414+
// TIMESTAMP WITH TIME ZONE '2020-05-20 7:43:54'
1415+
// BOOL 'true'
1416+
//
1417+
// The first two are standard SQL, while the latter is a PostgreSQL extension. Complicating
1418+
// matters is the fact that INTERVAL string literals may optionally be followed by special
1419+
// keywords, e.g.:
1420+
//
1421+
// INTERVAL '7' DAY
1422+
//
1423+
// Note also that naively `SELECT date` looks like a syntax error because the `date` type
1424+
// name is not followed by a string literal, but in fact in PostgreSQL it is a valid
1425+
// expression that should parse as the column name "date".
1426+
let loc = self.peek_token().location;
1427+
let opt_expr = self.maybe_parse(|parser| {
1428+
match parser.parse_data_type()? {
1429+
DataType::Interval => parser.parse_interval(),
1430+
// PostgreSQL allows almost any identifier to be used as custom data type name,
1431+
// and we support that in `parse_data_type()`. But unlike Postgres we don't
1432+
// have a list of globally reserved keywords (since they vary across dialects),
1433+
// so given `NOT 'a' LIKE 'b'`, we'd accept `NOT` as a possible custom data type
1434+
// name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of
1435+
// an unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the
1436+
// `type 'string'` syntax for the custom data types at all.
1437+
DataType::Custom(..) => parser_err!("dummy", loc),
1438+
data_type => Ok(Expr::TypedString {
1439+
data_type,
1440+
value: parser.parse_literal_string()?,
1441+
}),
1442+
}
1443+
})?;
1444+
1445+
if let Some(expr) = opt_expr {
1446+
return Ok(expr);
1447+
}
1448+
12321449
let next_token = self.next_token();
12331450
let expr = match next_token.token {
12341451
Token::Word(w) => {
12351452
// Save the parser index so we can rollback
12361453
let index_before = self.index;
1237-
// We first try to parse the word as the prefix of an expression.
1238-
// For example, the word INTERVAL in: SELECT INTERVAL '7' DAY
1454+
// The word we consumed may fall into one of two cases: it's a reserved word in the dialect
1455+
// and has a special meaning, or not. For example, in Snowflake, the word `interval` may have
1456+
// two meanings depending on the context:
1457+
// `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM test;`
1458+
// In its first occurrence it's part of an interval expression and in the second it's an identifier.
1459+
1460+
// We first try to parse the word and following tokens as a special expression, and if that fails,
1461+
// we rollback and try to parse it as an identifier.
12391462
match self.parse_expr_prefix_by_reserved_word(&w) {
12401463
// No expression prefix associated with this word
1241-
Ok(None) => Ok(self.parse_expr_prefix_by_nonreserved_word(&w)?),
1464+
Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
12421465
// This word indicated an expression prefix and parsing was successful
12431466
Ok(Some(expr)) => Ok(expr),
1244-
// This word indicated an expression prefix but parsing failed. Two options:
1245-
// 1. Malformed statement
1246-
// 2. The dialect may allow this word as identifier as well as indicating an expression
1467+
// If parsing of the word as a special expression failed, we are facing two options:
1468+
// 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
1469+
// 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
1470+
1471+
// We first try to parse the word as an identifier and if that fails
1472+
// we roll back to the original position in the token stream and return the parsing error
1473+
// we got from trying to parse a special expression (to maintain backwards
1474+
// compatibility of parsing errors).
12471475
Err(e) => {
12481476
let index_after_error = self.index;
12491477
if !self.dialect.is_reserved_for_identifier(w.keyword) {
12501478
// Rollback before trying to parse using a different approach
12511479
self.index = index_before;
1252-
if let Ok(expr) = self.parse_expr_prefix_by_nonreserved_word(&w) {
1480+
if let Ok(expr) = self.parse_expr_prefix_by_unnreserved_word(&w) {
12531481
return Ok(expr);
12541482
}
12551483
}
@@ -3672,18 +3900,30 @@ impl<'a> Parser<'a> {
36723900
}
36733901

36743902
/// Run a parser method `f`, reverting back to the current position if unsuccessful.
3675-
pub fn maybe_parse<T, F>(&mut self, mut f: F) -> Result<Option<T>, ParserError>
3903+
/// Returns `Ok(None)` if `f` returns an error, except for `ParserError::RecursionLimitExceeded`, which is propagated to the caller.
3904+
pub fn maybe_parse<T, F>(&mut self, f: F) -> Result<Option<T>, ParserError>
36763905
where
36773906
F: FnMut(&mut Parser) -> Result<T, ParserError>,
36783907
{
3679-
let index = self.index;
3680-
match f(self) {
3908+
match self.maybe_parse_internal(f) {
36813909
Ok(t) => Ok(Some(t)),
3682-
// Unwind stack if limit exceeded
36833910
Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
3684-
Err(_) => {
3911+
_ => Ok(None),
3912+
}
3913+
}
3914+
3915+
/// Run a parser method `f`, reverting back to the current position if unsuccessful.
3916+
pub fn maybe_parse_internal<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
3917+
where
3918+
F: FnMut(&mut Parser) -> Result<T, ParserError>,
3919+
{
3920+
let index = self.index;
3921+
match f(self) {
3922+
Ok(t) => Ok(t),
3923+
Err(e) => {
3924+
// Restore the position saved before running `f`, then hand the error back to the caller
36853925
self.index = index;
3686-
Ok(None)
3926+
Err(e)
36873927
}
36883928
}
36893929
}

0 commit comments

Comments
 (0)