Skip to content

Commit 02e9724

Browse files
committed
Code review comments
1 parent 36dd588 commit 02e9724

File tree

1 file changed

+254
-14
lines changed

1 file changed

+254
-14
lines changed

src/parser/mod.rs

Lines changed: 254 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,8 @@ impl<'a> Parser<'a> {
10251025
Ok(Statement::NOTIFY { channel, payload })
10261026
}
10271027

1028+
// Tries to parse an expression by matching the specified word to known keywords that have a special meaning in the dialect.
1029+
// Returns `None` if no match is found.
10281030
fn parse_expr_prefix_by_reserved_word(
10291031
&mut self,
10301032
w: &Word,
@@ -1131,7 +1133,8 @@ impl<'a> Parser<'a> {
11311133
}
11321134
}
11331135

1134-
fn parse_expr_prefix_by_nonreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
1136+
// Tries to parse an expression by a word that is not known to have a special meaning in the dialect.
1137+
fn parse_expr_prefix_by_unnreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
11351138
match self.peek_token().token {
11361139
Token::LParen | Token::Period => {
11371140
let mut id_parts: Vec<Ident> = vec![w.to_ident()];
@@ -1245,27 +1248,252 @@ impl<'a> Parser<'a> {
12451248
return Ok(expr);
12461249
}
12471250

1251+
let next_token = self.next_token();
1252+
let expr = match next_token.token {
1253+
Token::Word(w) => {
1254+
// The word we consumed may fall into one of two cases: it has a special meaning, or not.
1255+
// For example, in Snowflake, the word `interval` may have two meanings depending on the context:
1256+
// `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM tbl;`
1257+
// ^^^^^^^^^^^^^^^^ ^^^^^^^^
1258+
// interval expression identifier
1259+
//
1260+
// We first try to parse the word and following tokens as a special expression, and if that fails,
1261+
// we rollback and try to parse it as an identifier.
1262+
match self
1263+
.maybe_parse_internal(|parser| parser.parse_expr_prefix_by_reserved_word(&w))
1264+
{
1265+
// This word indicated an expression prefix and parsing was successful
1266+
Ok(Some(expr)) => Ok(expr),
1267+
1268+
// No expression prefix associated with this word
1269+
Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
1270+
1271+
// If parsing of the word as a special expression failed, we are facing two options:
1272+
// 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
1273+
// 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
1274+
// We first try to parse the word as an identifier and if that fails
1275+
// we rollback and return the parsing error we got from trying to parse a
1276+
// special expression (to maintain backwards compatibility of parsing errors).
1277+
Err(e) => {
1278+
if !self.dialect.is_reserved_for_identifier(w.keyword) {
1279+
if let Ok(expr) = self.maybe_parse_internal(|parser| {
1280+
parser.parse_expr_prefix_by_unnreserved_word(&w)
1281+
}) {
1282+
return Ok(expr);
1283+
}
1284+
}
1285+
return Err(e);
1286+
}
1287+
}
1288+
} // End of Token::Word
1289+
// array `[1, 2, 3]`
1290+
Token::LBracket => self.parse_array_expr(false),
1291+
tok @ Token::Minus | tok @ Token::Plus => {
1292+
let op = if tok == Token::Plus {
1293+
UnaryOperator::Plus
1294+
} else {
1295+
UnaryOperator::Minus
1296+
};
1297+
Ok(Expr::UnaryOp {
1298+
op,
1299+
expr: Box::new(
1300+
self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?,
1301+
),
1302+
})
1303+
}
1304+
Token::ExclamationMark if self.dialect.supports_bang_not_operator() => {
1305+
Ok(Expr::UnaryOp {
1306+
op: UnaryOperator::BangNot,
1307+
expr: Box::new(
1308+
self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?,
1309+
),
1310+
})
1311+
}
1312+
tok @ Token::DoubleExclamationMark
1313+
| tok @ Token::PGSquareRoot
1314+
| tok @ Token::PGCubeRoot
1315+
| tok @ Token::AtSign
1316+
| tok @ Token::Tilde
1317+
if dialect_of!(self is PostgreSqlDialect) =>
1318+
{
1319+
let op = match tok {
1320+
Token::DoubleExclamationMark => UnaryOperator::PGPrefixFactorial,
1321+
Token::PGSquareRoot => UnaryOperator::PGSquareRoot,
1322+
Token::PGCubeRoot => UnaryOperator::PGCubeRoot,
1323+
Token::AtSign => UnaryOperator::PGAbs,
1324+
Token::Tilde => UnaryOperator::PGBitwiseNot,
1325+
_ => unreachable!(),
1326+
};
1327+
Ok(Expr::UnaryOp {
1328+
op,
1329+
expr: Box::new(
1330+
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?,
1331+
),
1332+
})
1333+
}
1334+
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
1335+
{
1336+
self.prev_token();
1337+
Ok(Expr::Value(self.parse_value()?))
1338+
}
1339+
Token::UnicodeStringLiteral(_) => {
1340+
self.prev_token();
1341+
Ok(Expr::Value(self.parse_value()?))
1342+
}
1343+
Token::Number(_, _)
1344+
| Token::SingleQuotedString(_)
1345+
| Token::DoubleQuotedString(_)
1346+
| Token::TripleSingleQuotedString(_)
1347+
| Token::TripleDoubleQuotedString(_)
1348+
| Token::DollarQuotedString(_)
1349+
| Token::SingleQuotedByteStringLiteral(_)
1350+
| Token::DoubleQuotedByteStringLiteral(_)
1351+
| Token::TripleSingleQuotedByteStringLiteral(_)
1352+
| Token::TripleDoubleQuotedByteStringLiteral(_)
1353+
| Token::SingleQuotedRawStringLiteral(_)
1354+
| Token::DoubleQuotedRawStringLiteral(_)
1355+
| Token::TripleSingleQuotedRawStringLiteral(_)
1356+
| Token::TripleDoubleQuotedRawStringLiteral(_)
1357+
| Token::NationalStringLiteral(_)
1358+
| Token::HexStringLiteral(_) => {
1359+
self.prev_token();
1360+
Ok(Expr::Value(self.parse_value()?))
1361+
}
1362+
Token::LParen => {
1363+
let expr = if let Some(expr) = self.try_parse_expr_sub_query()? {
1364+
expr
1365+
} else if let Some(lambda) = self.try_parse_lambda()? {
1366+
return Ok(lambda);
1367+
} else {
1368+
let exprs = self.parse_comma_separated(Parser::parse_expr)?;
1369+
match exprs.len() {
1370+
0 => unreachable!(), // parse_comma_separated ensures 1 or more
1371+
1 => Expr::Nested(Box::new(exprs.into_iter().next().unwrap())),
1372+
_ => Expr::Tuple(exprs),
1373+
}
1374+
};
1375+
self.expect_token(&Token::RParen)?;
1376+
let expr = self.try_parse_method(expr)?;
1377+
if !self.consume_token(&Token::Period) {
1378+
Ok(expr)
1379+
} else {
1380+
let tok = self.next_token();
1381+
let key = match tok.token {
1382+
Token::Word(word) => word.to_ident(),
1383+
_ => {
1384+
return parser_err!(
1385+
format!("Expected identifier, found: {tok}"),
1386+
tok.location
1387+
)
1388+
}
1389+
};
1390+
Ok(Expr::CompositeAccess {
1391+
expr: Box::new(expr),
1392+
key,
1393+
})
1394+
}
1395+
}
1396+
Token::Placeholder(_) | Token::Colon | Token::AtSign => {
1397+
self.prev_token();
1398+
Ok(Expr::Value(self.parse_value()?))
1399+
}
1400+
Token::LBrace if self.dialect.supports_dictionary_syntax() => {
1401+
self.prev_token();
1402+
self.parse_duckdb_struct_literal()
1403+
}
1404+
_ => self.expected("an expression", next_token),
1405+
}?;
1406+
1407+
let expr = self.try_parse_method(expr)?;
1408+
1409+
if self.parse_keyword(Keyword::COLLATE) {
1410+
Ok(Expr::Collate {
1411+
expr: Box::new(expr),
1412+
collation: self.parse_object_name(false)?,
1413+
})
1414+
} else {
1415+
Ok(expr)
1416+
}
1417+
}
1418+
1419+
/// Parse an expression prefix.
1420+
pub fn parse_prefix2(&mut self) -> Result<Expr, ParserError> {
1421+
// allow the dialect to override prefix parsing
1422+
if let Some(prefix) = self.dialect.parse_prefix(self) {
1423+
return prefix;
1424+
}
1425+
1426+
// PostgreSQL allows any string literal to be preceded by a type name, indicating that the
1427+
// string literal represents a literal of that type. Some examples:
1428+
//
1429+
// DATE '2020-05-20'
1430+
// TIMESTAMP WITH TIME ZONE '2020-05-20 7:43:54'
1431+
// BOOL 'true'
1432+
//
1433+
// The first two are standard SQL, while the latter is a PostgreSQL extension. Complicating
1434+
// matters is the fact that INTERVAL string literals may optionally be followed by special
1435+
// keywords, e.g.:
1436+
//
1437+
// INTERVAL '7' DAY
1438+
//
1439+
// Note also that naively `SELECT date` looks like a syntax error because the `date` type
1440+
// name is not followed by a string literal, but in fact in PostgreSQL it is a valid
1441+
// expression that should parse as the column name "date".
1442+
let loc = self.peek_token().location;
1443+
let opt_expr = self.maybe_parse(|parser| {
1444+
match parser.parse_data_type()? {
1445+
DataType::Interval => parser.parse_interval(),
1446+
// PostgreSQL allows almost any identifier to be used as custom data type name,
1447+
// and we support that in `parse_data_type()`. But unlike Postgres we don't
1448+
// have a list of globally reserved keywords (since they vary across dialects),
1449+
// so given `NOT 'a' LIKE 'b'`, we'd accept `NOT` as a possible custom data type
1450+
// name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of
1451+
// an unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the
1452+
// `type 'string'` syntax for the custom data types at all.
1453+
DataType::Custom(..) => parser_err!("dummy", loc),
1454+
data_type => Ok(Expr::TypedString {
1455+
data_type,
1456+
value: parser.parse_literal_string()?,
1457+
}),
1458+
}
1459+
})?;
1460+
1461+
if let Some(expr) = opt_expr {
1462+
return Ok(expr);
1463+
}
1464+
12481465
let next_token = self.next_token();
12491466
let expr = match next_token.token {
12501467
Token::Word(w) => {
12511468
// Save the parser index so we can rollback
12521469
let index_before = self.index;
1253-
// We first try to parse the word as the prefix of an expression.
1254-
// For example, the word INTERVAL in: SELECT INTERVAL '7' DAY
1470+
// The word we consumed may fall into one of two cases: it's a reserved word in the dialect
1471+
// and has a special meaning, or not. For example, in Snowflake, the word `interval` may have
1472+
// two meanings depending on the context:
1473+
// `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM test;`
1474+
// In its first occurrence it's part of an interval expression and in the second it's an identifier.
1475+
1476+
// We first try to parse the word and following tokens as a special expression, and if that fails,
1477+
// we rollback and try to parse it as an identifier.
12551478
match self.parse_expr_prefix_by_reserved_word(&w) {
12561479
// No expression prefix associated with this word
1257-
Ok(None) => Ok(self.parse_expr_prefix_by_nonreserved_word(&w)?),
1480+
Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
12581481
// This word indicated an expression prefix and parsing was successful
12591482
Ok(Some(expr)) => Ok(expr),
1260-
// This word indicated an expression prefix but parsing failed. Two options:
1261-
// 1. Malformed statement
1262-
// 2. The dialect may allow this word as identifier as well as indicating an expression
1483+
// If parsing of the word as a special expression failed, we are facing two options:
1484+
// 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
1485+
// 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
1486+
1487+
// We first try to parse the word as an identifier and if that fails
1488+
// we roll back to the original position in the token stream and return the parsing error
1489+
// we got from trying to parse a special expression (to maintain backwards
1490+
// compatibility of parsing errors).
12631491
Err(e) => {
12641492
let index_after_error = self.index;
12651493
if !self.dialect.is_reserved_for_identifier(w.keyword) {
12661494
// Rollback before trying to parse using a different approach
12671495
self.index = index_before;
1268-
if let Ok(expr) = self.parse_expr_prefix_by_nonreserved_word(&w) {
1496+
if let Ok(expr) = self.parse_expr_prefix_by_unnreserved_word(&w) {
12691497
return Ok(expr);
12701498
}
12711499
}
@@ -3688,18 +3916,30 @@ impl<'a> Parser<'a> {
36883916
}
36893917

36903918
/// Run a parser method `f`, reverting back to the current position if unsuccessful.
3691-
pub fn maybe_parse<T, F>(&mut self, mut f: F) -> Result<Option<T>, ParserError>
3919+
/// Returns `Ok(None)` if `f` returns an error, except `ParserError::RecursionLimitExceeded`, which is propagated to the caller.
3920+
pub fn maybe_parse<T, F>(&mut self, f: F) -> Result<Option<T>, ParserError>
36923921
where
36933922
F: FnMut(&mut Parser) -> Result<T, ParserError>,
36943923
{
3695-
let index = self.index;
3696-
match f(self) {
3924+
match self.maybe_parse_internal(f) {
36973925
Ok(t) => Ok(Some(t)),
3698-
// Unwind stack if limit exceeded
36993926
Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
3700-
Err(_) => {
3927+
_ => Ok(None),
3928+
}
3929+
}
3930+
3931+
/// Run a parser method `f`, reverting back to the current position if unsuccessful and returning the error from `f` unchanged.
3932+
pub fn maybe_parse_internal<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
3933+
where
3934+
F: FnMut(&mut Parser) -> Result<T, ParserError>,
3935+
{
3936+
let index = self.index;
3937+
match f(self) {
3938+
Ok(t) => Ok(t),
3939+
Err(e) => {
3940+
// Rewind to the position saved before running `f`, then surface the error
37013941
self.index = index;
3702-
Ok(None)
3942+
Err(e)
37033943
}
37043944
}
37053945
}

0 commit comments

Comments
 (0)