Skip to content

Commit 4551933

Browse files
committed
Slightly faster keyword lookups
It's a micro-optimization, but it seemed to give a bit of a boost to search only the words starting with the correct letter.
1 parent e9d8304 commit 4551933

File tree

2 files changed

+60
-4
lines changed

2 files changed

+60
-4
lines changed

src/keywords.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,3 +973,61 @@ pub const RESERVED_FOR_IDENTIFIER: &[Keyword] = &[
973973
Keyword::STRUCT,
974974
Keyword::TRIM,
975975
];
976+
977+
// Sentinel index value ("not available").
// NOTE(review): not referenced anywhere in this chunk — presumably used by
// callers elsewhere as a "no entry" marker; confirm before removing.
pub const NA: usize = usize::MAX;

// For each ASCII letter 'A'..='Z', the index into `ALL_KEYWORDS` of the
// first keyword starting with that letter (e.g. entry 0 is for 'A',
// entry 1 for 'B', ...). Used by `lookup` to restrict the binary search
// to the bucket of keywords sharing the word's first letter. The values
// are verified against `ALL_KEYWORDS` by the `check_keyword_index_roots`
// test below.
#[rustfmt::skip]
pub const KEYWORD_LOOKUP_INDEX_ROOT: &[usize; 26] = &[
    0, 42, 67, 148, 198, 241, 281, 294, 305, 350, 357, 360, 390,
    430, 465, 497, 539, 543, 605, 683, 728, 761, 780, 793, 795, 796,
];
984+
985+
pub fn lookup(word: &str) -> Keyword {
986+
if word.len() < 2 {
987+
return Keyword::NoKeyword;
988+
}
989+
990+
let word = word.to_uppercase();
991+
let byte1 = word.as_bytes()[0];
992+
if !byte1.is_ascii_uppercase() {
993+
return Keyword::NoKeyword;
994+
}
995+
996+
let start = KEYWORD_LOOKUP_INDEX_ROOT[(byte1 - b'A') as usize];
997+
998+
let end = if (byte1 + 1) <= b'Z' {
999+
KEYWORD_LOOKUP_INDEX_ROOT[(byte1 - b'A' + 1) as usize]
1000+
} else {
1001+
ALL_KEYWORDS.len()
1002+
};
1003+
1004+
let keyword = ALL_KEYWORDS[start..end].binary_search(&word.as_str());
1005+
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x + start])
1006+
}
1007+
1008+
#[cfg(test)]
mod tests {
    use super::*;

    /// `KEYWORD_LOOKUP_INDEX_ROOT` must mark exactly the positions in the
    /// (strictly sorted) `ALL_KEYWORDS` table where the first letter changes.
    #[test]
    fn check_keyword_index_roots() {
        let mut expected = vec![0];
        for (idx, pair) in ALL_KEYWORDS.windows(2).enumerate() {
            // Strict ordering is required for the binary search in `lookup`.
            assert!(pair[0] < pair[1]);
            if pair[1].as_bytes()[0] != pair[0].as_bytes()[0] {
                expected.push(idx + 1);
            }
        }
        assert_eq!(&expected, KEYWORD_LOOKUP_INDEX_ROOT);
    }

    /// Every keyword in the table must round-trip through `lookup`.
    #[test]
    fn check_keyword_lookup() {
        for (idx, word) in ALL_KEYWORDS.iter().enumerate() {
            assert_eq!(lookup(word), ALL_KEYWORDS_INDEX[idx]);
        }
    }
}

src/tokenizer.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use crate::dialect::{
4646
BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
4747
SnowflakeDialect,
4848
};
49-
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49+
use crate::keywords::{self, Keyword};
5050

5151
/// SQL Token enumeration
5252
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
@@ -344,13 +344,11 @@ impl Token {
344344
}
345345

346346
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
347-
let word_uppercase = word.to_uppercase();
348347
Token::Word(Word {
349348
value: word.to_string(),
350349
quote_style,
351350
keyword: if quote_style.is_none() {
352-
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
353-
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
351+
keywords::lookup(word)
354352
} else {
355353
Keyword::NoKeyword
356354
},

0 commit comments

Comments
 (0)