@@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> {
1186
1186
1187
1187
Ok ( Some ( Token :: make_word ( & word. concat ( ) , Some ( quote_start) ) ) )
1188
1188
}
1189
- // numbers and period
1190
- '0' ..='9' | '.' => {
1189
+ // Numbers
1190
+ '0' ..='9' => {
1191
1191
// Some dialects support underscore as number separator
1192
1192
// There can only be one at a time and it must be followed by another digit
1193
1193
let is_number_separator = |ch : char , next_char : Option < char > | {
@@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> {
1196
1196
&& next_char. is_some_and ( |next_ch| next_ch. is_ascii_hexdigit ( ) )
1197
1197
} ;
1198
1198
1199
+ // Start with number or potential separator
1199
1200
let mut s = peeking_next_take_while ( chars, |ch, next_ch| {
1200
1201
ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1201
1202
} ) ;
1202
1203
1203
- // match binary literal that starts with 0x
1204
+ // Match hex literal that starts with 0x (returned as Token::HexStringLiteral)
1204
1205
if s == "0" && chars. peek ( ) == Some ( & 'x' ) {
1205
1206
chars. next ( ) ;
1206
1207
let s2 = peeking_next_take_while ( chars, |ch, next_ch| {
@@ -1209,60 +1210,41 @@ impl<'a> Tokenizer<'a> {
1209
1210
return Ok ( Some ( Token :: HexStringLiteral ( s2) ) ) ;
1210
1211
}
1211
1212
1212
- // match one period
1213
+ // Match fractional part after a dot
1213
1214
if let Some ( '.' ) = chars. peek ( ) {
1214
1215
s. push ( '.' ) ;
1215
1216
chars. next ( ) ;
1216
1217
}
1217
1218
1218
- // If the dialect supports identifiers that start with a numeric prefix
1219
- // and we have now consumed a dot, check if the previous token was a Word.
1220
- // If so, what follows is definitely not part of a decimal number and
1221
- // we should yield the dot as a dedicated token so compound identifiers
1222
- // starting with digits can be parsed correctly.
1223
- if s == "." && self . dialect . supports_numeric_prefix ( ) {
1224
- if let Some ( Token :: Word ( _) ) = prev_token {
1225
- return Ok ( Some ( Token :: Period ) ) ;
1226
- }
1227
- }
1228
-
1229
- // Consume fractional digits.
1219
+ // Consume fractional digits
1230
1220
s += & peeking_next_take_while ( chars, |ch, next_ch| {
1231
1221
ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1232
1222
} ) ;
1233
1223
1234
- // No fraction -> Token::Period
1235
- if s == "." {
1236
- return Ok ( Some ( Token :: Period ) ) ;
1237
- }
1238
-
1239
- // Parse exponent as number
1224
+ // Parse exponent part (e.g., e+10 or E-5)
1240
1225
let mut exponent_part = String :: new ( ) ;
1241
1226
if chars. peek ( ) == Some ( & 'e' ) || chars. peek ( ) == Some ( & 'E' ) {
1242
1227
let mut char_clone = chars. peekable . clone ( ) ;
1243
- exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ;
1228
+ exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ; // consume 'e' or 'E'
1244
1229
1245
1230
// Optional sign
1246
- match char_clone. peek ( ) {
1247
- Some ( & c ) if matches ! ( c , '+' | '-' ) => {
1231
+ if let Some ( & c ) = char_clone. peek ( ) {
1232
+ if c == '+' || c == '-' {
1248
1233
exponent_part. push ( c) ;
1249
1234
char_clone. next ( ) ;
1250
1235
}
1251
- _ => ( ) ,
1252
1236
}
1253
1237
1254
- match char_clone . peek ( ) {
1255
- // Definitely an exponent, get original iterator up to speed and use it
1256
- Some ( & c ) if c. is_ascii_digit ( ) => {
1238
+ // Parse digits after the exponent
1239
+ if let Some ( & c ) = char_clone . peek ( ) {
1240
+ if c. is_ascii_digit ( ) {
1257
1241
for _ in 0 ..exponent_part. len ( ) {
1258
1242
chars. next ( ) ;
1259
1243
}
1260
1244
exponent_part +=
1261
1245
& peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
1262
1246
s += exponent_part. as_str ( ) ;
1263
1247
}
1264
- // Not an exponent, discard the work done
1265
- _ => ( ) ,
1266
1248
}
1267
1249
}
1268
1250
@@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> {
1271
1253
// be tokenized as a word.
1272
1254
if self . dialect . supports_numeric_prefix ( ) {
1273
1255
if exponent_part. is_empty ( ) {
1274
- // If it is not a number with an exponent, it may be
1275
- // an identifier starting with digits.
1256
+ // Handle as potential word if no exponent part
1276
1257
let word =
1277
1258
peeking_take_while ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1278
1259
@@ -1281,20 +1262,84 @@ impl<'a> Tokenizer<'a> {
1281
1262
return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
1282
1263
}
1283
1264
} else if prev_token == Some ( & Token :: Period ) {
1284
- // If the previous token was a period, thus not belonging to a number,
1285
- // the value we have is part of an identifier.
1265
+ // Handle as word if it follows a period
1286
1266
return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
1287
1267
}
1288
1268
}
1289
1269
1270
+ // Handle "L" suffix for long numbers
1290
1271
let long = if chars. peek ( ) == Some ( & 'L' ) {
1291
1272
chars. next ( ) ;
1292
1273
true
1293
1274
} else {
1294
1275
false
1295
1276
} ;
1277
+
1278
+ // Return the final token for the number
1296
1279
Ok ( Some ( Token :: Number ( s, long) ) )
1297
1280
}
1281
+
1282
+ // Period (`.`) handling
1283
+ '.' => {
1284
+ chars. next ( ) ; // consume the dot
1285
+
1286
+ match chars. peek ( ) {
1287
+ Some ( '_' ) => {
1288
+ // A dot followed by '_' is a plain period; the '_' is left unconsumed so it starts the next identifier token
1289
+ Ok ( Some ( Token :: Period ) )
1290
+ }
1291
+ Some ( ch)
1292
+ // Hive and MySQL dialects allow numeric prefixes for identifiers
1293
+ if ch. is_ascii_digit ( )
1294
+ && self . dialect . supports_numeric_prefix ( )
1295
+ && matches ! ( prev_token, Some ( Token :: Word ( _) ) ) =>
1296
+ {
1297
+ Ok ( Some ( Token :: Period ) )
1298
+ }
1299
+ Some ( ch) if ch. is_ascii_digit ( ) => {
1300
+ // Handle numbers starting with a dot (e.g., ".123")
1301
+ let mut s = String :: from ( "." ) ;
1302
+ let is_number_separator = |ch : char , next_char : Option < char > | {
1303
+ self . dialect . supports_numeric_literal_underscores ( )
1304
+ && ch == '_'
1305
+ && next_char. is_some_and ( |c| c. is_ascii_digit ( ) )
1306
+ } ;
1307
+
1308
+ s += & peeking_next_take_while ( chars, |ch, next_ch| {
1309
+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1310
+ } ) ;
1311
+
1312
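+ // Handle exponent part. NOTE(review): unlike the digit arm above, which looks ahead on a clone, 'e'/'E' and the sign are consumed from `chars` directly here and are dropped if no digit follows — confirm this is intended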
1313
+ if matches ! ( chars. peek( ) , Some ( 'e' | 'E' ) ) {
1314
+ let mut exp = String :: new ( ) ;
1315
+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1316
+
1317
+ if matches ! ( chars. peek( ) , Some ( '+' | '-' ) ) {
1318
+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1319
+ }
1320
+
1321
+ if matches ! ( chars. peek( ) , Some ( c) if c. is_ascii_digit( ) ) {
1322
+ exp += & peeking_take_while ( chars, |c| c. is_ascii_digit ( ) ) ;
1323
+ s += & exp;
1324
+ }
1325
+ }
1326
+
1327
+ // Handle "L" suffix for long numbers
1328
+ let long = if chars. peek ( ) == Some ( & 'L' ) {
1329
+ chars. next ( ) ;
1330
+ true
1331
+ } else {
1332
+ false
1333
+ } ;
1334
+
1335
+ Ok ( Some ( Token :: Number ( s, long) ) )
1336
+ }
1337
+ _ => {
1338
+ // Just a plain period
1339
+ Ok ( Some ( Token :: Period ) )
1340
+ }
1341
+ }
1342
+ }
1298
1343
// punctuation
1299
1344
'(' => self . consume_and_return ( chars, Token :: LParen ) ,
1300
1345
')' => self . consume_and_return ( chars, Token :: RParen ) ,
@@ -2429,6 +2474,32 @@ mod tests {
2429
2474
compare ( expected, tokens) ;
2430
2475
}
2431
2476
2477
+ #[ test]
2478
+ fn tokenize_period_underscore ( ) {
2479
+ let sql = String :: from ( "SELECT table._col" ) ;
2480
+ // Use a dialect that supports underscores in numeric literals: "._col" must still tokenize as a period followed by the word "_col"
2481
+ let dialect = PostgreSqlDialect { } ;
2482
+ let tokens = Tokenizer :: new ( & dialect, & sql) . tokenize ( ) . unwrap ( ) ;
2483
+
2484
+ let expected = vec ! [
2485
+ Token :: make_keyword( "SELECT" ) ,
2486
+ Token :: Whitespace ( Whitespace :: Space ) ,
2487
+ Token :: Word ( Word {
2488
+ value: "table" . to_string( ) ,
2489
+ quote_style: None ,
2490
+ keyword: Keyword :: TABLE ,
2491
+ } ) ,
2492
+ Token :: Period ,
2493
+ Token :: Word ( Word {
2494
+ value: "_col" . to_string( ) ,
2495
+ quote_style: None ,
2496
+ keyword: Keyword :: NoKeyword ,
2497
+ } ) ,
2498
+ ] ;
2499
+
2500
+ compare ( expected, tokens) ;
2501
+ }
2502
+
2432
2503
#[ test]
2433
2504
fn tokenize_select_float ( ) {
2434
2505
let sql = String :: from ( "SELECT .1" ) ;
0 commit comments