Skip to content

Commit f3214ac

Browse files
Fix: parsing ident starting with underscore in certain dialects
Dialects that support underscores as separators in numeric literals used to parse `._123` as a number, which meant that an identifier such as `._abc` was tokenized as the Number `._` followed by the word `abc`, which is obviously wrong. This PR splits the tokenizer branch that handled both numbers and periods into two separate branches to simplify the logic, fixes the issue described above, and adds tests.
1 parent 7703fd0 commit f3214ac

File tree

1 file changed

+106
-35
lines changed

1 file changed

+106
-35
lines changed

src/tokenizer.rs

Lines changed: 106 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> {
11861186

11871187
Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
11881188
}
1189-
// numbers and period
1190-
'0'..='9' | '.' => {
1189+
// Numbers
1190+
'0'..='9' => {
11911191
// Some dialects support underscore as number separator
11921192
// There can only be one at a time and it must be followed by another digit
11931193
let is_number_separator = |ch: char, next_char: Option<char>| {
@@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> {
11961196
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
11971197
};
11981198

1199+
// Start with number or potential separator
11991200
let mut s = peeking_next_take_while(chars, |ch, next_ch| {
12001201
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12011202
});
12021203

1203-
// match binary literal that starts with 0x
1204+
// Match binary literal that starts with 0x
12041205
if s == "0" && chars.peek() == Some(&'x') {
12051206
chars.next();
12061207
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
@@ -1209,60 +1210,41 @@ impl<'a> Tokenizer<'a> {
12091210
return Ok(Some(Token::HexStringLiteral(s2)));
12101211
}
12111212

1212-
// match one period
1213+
// Match fractional part after a dot
12131214
if let Some('.') = chars.peek() {
12141215
s.push('.');
12151216
chars.next();
12161217
}
12171218

1218-
// If the dialect supports identifiers that start with a numeric prefix
1219-
// and we have now consumed a dot, check if the previous token was a Word.
1220-
// If so, what follows is definitely not part of a decimal number and
1221-
// we should yield the dot as a dedicated token so compound identifiers
1222-
// starting with digits can be parsed correctly.
1223-
if s == "." && self.dialect.supports_numeric_prefix() {
1224-
if let Some(Token::Word(_)) = prev_token {
1225-
return Ok(Some(Token::Period));
1226-
}
1227-
}
1228-
1229-
// Consume fractional digits.
1219+
// Consume fractional digits
12301220
s += &peeking_next_take_while(chars, |ch, next_ch| {
12311221
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12321222
});
12331223

1234-
// No fraction -> Token::Period
1235-
if s == "." {
1236-
return Ok(Some(Token::Period));
1237-
}
1238-
1239-
// Parse exponent as number
1224+
// Parse exponent part (e.g., e+10 or E-5)
12401225
let mut exponent_part = String::new();
12411226
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
12421227
let mut char_clone = chars.peekable.clone();
1243-
exponent_part.push(char_clone.next().unwrap());
1228+
exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E'
12441229

12451230
// Optional sign
1246-
match char_clone.peek() {
1247-
Some(&c) if matches!(c, '+' | '-') => {
1231+
if let Some(&c) = char_clone.peek() {
1232+
if c == '+' || c == '-' {
12481233
exponent_part.push(c);
12491234
char_clone.next();
12501235
}
1251-
_ => (),
12521236
}
12531237

1254-
match char_clone.peek() {
1255-
// Definitely an exponent, get original iterator up to speed and use it
1256-
Some(&c) if c.is_ascii_digit() => {
1238+
// Parse digits after the exponent
1239+
if let Some(&c) = char_clone.peek() {
1240+
if c.is_ascii_digit() {
12571241
for _ in 0..exponent_part.len() {
12581242
chars.next();
12591243
}
12601244
exponent_part +=
12611245
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
12621246
s += exponent_part.as_str();
12631247
}
1264-
// Not an exponent, discard the work done
1265-
_ => (),
12661248
}
12671249
}
12681250

@@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> {
12711253
// be tokenized as a word.
12721254
if self.dialect.supports_numeric_prefix() {
12731255
if exponent_part.is_empty() {
1274-
// If it is not a number with an exponent, it may be
1275-
// an identifier starting with digits.
1256+
// Handle as potential word if no exponent part
12761257
let word =
12771258
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
12781259

@@ -1281,20 +1262,84 @@ impl<'a> Tokenizer<'a> {
12811262
return Ok(Some(Token::make_word(s.as_str(), None)));
12821263
}
12831264
} else if prev_token == Some(&Token::Period) {
1284-
// If the previous token was a period, thus not belonging to a number,
1285-
// the value we have is part of an identifier.
1265+
// Handle as word if it follows a period
12861266
return Ok(Some(Token::make_word(s.as_str(), None)));
12871267
}
12881268
}
12891269

1270+
// Handle "L" suffix for long numbers
12901271
let long = if chars.peek() == Some(&'L') {
12911272
chars.next();
12921273
true
12931274
} else {
12941275
false
12951276
};
1277+
1278+
// Return the final token for the number
12961279
Ok(Some(Token::Number(s, long)))
12971280
}
1281+
1282+
// Period (`.`) handling
1283+
'.' => {
1284+
chars.next(); // consume the dot
1285+
1286+
match chars.peek() {
1287+
Some('_') => {
1288+
// Handle "._" case as a period (special token) followed by identifier
1289+
Ok(Some(Token::Period))
1290+
}
1291+
Some(ch)
1292+
// Hive and MySQL dialects allow numeric prefixes for identifiers
1293+
if ch.is_ascii_digit()
1294+
&& self.dialect.supports_numeric_prefix()
1295+
&& matches!(prev_token, Some(Token::Word(_))) =>
1296+
{
1297+
Ok(Some(Token::Period))
1298+
}
1299+
Some(ch) if ch.is_ascii_digit() => {
1300+
// Handle numbers starting with a dot (e.g., ".123")
1301+
let mut s = String::from(".");
1302+
let is_number_separator = |ch: char, next_char: Option<char>| {
1303+
self.dialect.supports_numeric_literal_underscores()
1304+
&& ch == '_'
1305+
&& next_char.is_some_and(|c| c.is_ascii_digit())
1306+
};
1307+
1308+
s += &peeking_next_take_while(chars, |ch, next_ch| {
1309+
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1310+
});
1311+
1312+
// Handle exponent part
1313+
if matches!(chars.peek(), Some('e' | 'E')) {
1314+
let mut exp = String::new();
1315+
exp.push(chars.next().unwrap());
1316+
1317+
if matches!(chars.peek(), Some('+' | '-')) {
1318+
exp.push(chars.next().unwrap());
1319+
}
1320+
1321+
if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) {
1322+
exp += &peeking_take_while(chars, |c| c.is_ascii_digit());
1323+
s += &exp;
1324+
}
1325+
}
1326+
1327+
// Handle "L" suffix for long numbers
1328+
let long = if chars.peek() == Some(&'L') {
1329+
chars.next();
1330+
true
1331+
} else {
1332+
false
1333+
};
1334+
1335+
Ok(Some(Token::Number(s, long)))
1336+
}
1337+
_ => {
1338+
// Just a plain period
1339+
Ok(Some(Token::Period))
1340+
}
1341+
}
1342+
}
12981343
// punctuation
12991344
'(' => self.consume_and_return(chars, Token::LParen),
13001345
')' => self.consume_and_return(chars, Token::RParen),
@@ -2429,6 +2474,32 @@ mod tests {
24292474
compare(expected, tokens);
24302475
}
24312476

2477+
#[test]
2478+
fn tokenize_period_underscore() {
2479+
let sql = String::from("SELECT table._col");
2480+
// a dialect that supports underscores in numeric literals
2481+
let dialect = PostgreSqlDialect {};
2482+
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2483+
2484+
let expected = vec![
2485+
Token::make_keyword("SELECT"),
2486+
Token::Whitespace(Whitespace::Space),
2487+
Token::Word(Word {
2488+
value: "table".to_string(),
2489+
quote_style: None,
2490+
keyword: Keyword::TABLE,
2491+
}),
2492+
Token::Period,
2493+
Token::Word(Word {
2494+
value: "_col".to_string(),
2495+
quote_style: None,
2496+
keyword: Keyword::NoKeyword,
2497+
}),
2498+
];
2499+
2500+
compare(expected, tokens);
2501+
}
2502+
24322503
#[test]
24332504
fn tokenize_select_float() {
24342505
let sql = String::from("SELECT .1");

0 commit comments

Comments
 (0)