diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs index 73d305c6d4fe2..ed0911379c4b3 100644 --- a/src/librustc_lexer/src/cursor.rs +++ b/src/librustc_lexer/src/cursor.rs @@ -41,10 +41,20 @@ impl<'a> Cursor<'a> { /// If requested position doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - pub(crate) fn nth_char(&self, n: usize) -> char { + fn nth_char(&self, n: usize) -> char { self.chars().nth(n).unwrap_or(EOF_CHAR) } + /// Peeks the next symbol from the input stream without consuming it. + pub(crate) fn first(&self) -> char { + self.nth_char(0) + } + + /// Peeks the second symbol from the input stream without consuming it. + pub(crate) fn second(&self) -> char { + self.nth_char(1) + } + /// Checks if there is nothing more to consume. pub(crate) fn is_eof(&self) -> bool { self.chars.as_str().is_empty() diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index d55ef46d7506e..c50808adec1dd 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -18,6 +18,8 @@ mod cursor; pub mod unescape; use crate::cursor::{Cursor, EOF_CHAR}; +use self::TokenKind::*; +use self::LiteralKind::*; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -116,7 +118,6 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, } -use self::TokenKind::*; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum LiteralKind { @@ -137,7 +138,6 @@ pub enum LiteralKind { /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" RawByteStr { n_hashes: usize, started: bool, terminated: bool }, } -use self::LiteralKind::*; /// Base of numeric literal encoding according to its prefix. 
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -241,14 +241,13 @@ pub fn is_id_continue(c: char) -> bool { || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) } - impl Cursor<'_> { /// Parses a token from the input string. fn advance_token(&mut self) -> Token { let first_char = self.bump().unwrap(); let token_kind = match first_char { // Slash, comment or block comment. - '/' => match self.nth_char(0) { + '/' => match self.first() { '/' => self.line_comment(), '*' => self.block_comment(), _ => Slash, @@ -257,8 +256,8 @@ impl Cursor<'_> { // Whitespace sequence. c if is_whitespace(c) => self.whitespace(), - // Raw string literal or identifier. - 'r' => match (self.nth_char(0), self.nth_char(1)) { + // Raw identifier, raw string literal or identifier. + 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let (n_hashes, started, terminated) = self.raw_double_quoted_string(); @@ -273,7 +272,7 @@ impl Cursor<'_> { }, // Byte literal, byte string literal, raw byte string literal or identifier. 
- 'b' => match (self.nth_char(0), self.nth_char(1)) { + 'b' => match (self.first(), self.second()) { ('\'', _) => { self.bump(); let terminated = self.single_quoted_string(); @@ -366,31 +365,23 @@ impl Cursor<'_> { } fn line_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.nth_char(0) == '/'); + debug_assert!(self.prev() == '/' && self.first() == '/'); self.bump(); - loop { - match self.nth_char(0) { - '\n' => break, - EOF_CHAR if self.is_eof() => break, - _ => { - self.bump(); - } - } - } + self.eat_while(|c| c != '\n'); LineComment } fn block_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.nth_char(0) == '*'); + debug_assert!(self.prev() == '/' && self.first() == '*'); self.bump(); let mut depth = 1usize; while let Some(c) = self.bump() { match c { - '/' if self.nth_char(0) == '*' => { + '/' if self.first() == '*' => { self.bump(); depth += 1; } - '*' if self.nth_char(0) == '/' => { + '*' if self.first() == '/' => { self.bump(); depth -= 1; if depth == 0 { @@ -409,31 +400,27 @@ impl Cursor<'_> { fn whitespace(&mut self) -> TokenKind { debug_assert!(is_whitespace(self.prev())); - while is_whitespace(self.nth_char(0)) { - self.bump(); - } + self.eat_while(is_whitespace); Whitespace } fn raw_ident(&mut self) -> TokenKind { debug_assert!( self.prev() == 'r' - && self.nth_char(0) == '#' - && is_id_start(self.nth_char(1)) + && self.first() == '#' + && is_id_start(self.second()) ); + // Eat "#" symbol. self.bump(); - self.bump(); - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + // Eat the identifier part of RawIdent. + self.eat_identifier(); RawIdent } fn ident(&mut self) -> TokenKind { debug_assert!(is_id_start(self.prev())); - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + // Start is already eaten, eat the rest of identifier. 
+ self.eat_while(is_id_continue); Ident } @@ -442,7 +429,7 @@ impl Cursor<'_> { let mut base = Base::Decimal; if first_digit == '0' { // Attempt to parse encoding base. - let has_digits = match self.nth_char(0) { + let has_digits = match self.first() { 'b' => { base = Base::Binary; self.bump(); @@ -476,23 +463,23 @@ impl Cursor<'_> { self.eat_decimal_digits(); }; - match self.nth_char(0) { + match self.first() { // Don't be greedy if this is actually an // integer literal followed by field/method access or a range pattern // (`0..2` and `12.foo()`) - '.' if self.nth_char(1) != '.' - && !is_id_start(self.nth_char(1)) => + '.' if self.second() != '.' + && !is_id_start(self.second()) => { // might have stuff after the ., and if it does, it needs to start // with a number self.bump(); let mut empty_exponent = false; - if self.nth_char(0).is_digit(10) { + if self.first().is_digit(10) { self.eat_decimal_digits(); - match self.nth_char(0) { + match self.first() { 'e' | 'E' => { self.bump(); - empty_exponent = self.float_exponent().is_err() + empty_exponent = !self.eat_float_exponent(); } _ => (), } @@ -501,7 +488,7 @@ impl Cursor<'_> { } 'e' | 'E' => { self.bump(); - let empty_exponent = self.float_exponent().is_err(); + let empty_exponent = !self.eat_float_exponent(); Float { base, empty_exponent } } _ => Int { base, empty_int: false }, @@ -510,65 +497,76 @@ impl Cursor<'_> { fn lifetime_or_char(&mut self) -> TokenKind { debug_assert!(self.prev() == '\''); - let mut starts_with_number = false; - - // Check if the first symbol after '\'' is a valid identifier - // character or a number (not a digit followed by '\''). - if (is_id_start(self.nth_char(0)) - || self.nth_char(0).is_digit(10) && { - starts_with_number = true; - true - }) - && self.nth_char(1) != '\'' - { - self.bump(); - // Skip the identifier. - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + let can_be_a_lifetime = if self.second() == '\'' { + // It's surely not a lifetime. 
+ false + } else { + // If the first symbol is valid for identifier, it can be a lifetime. + // Also check if it's a number for a better error reporting (so '0 will + // be reported as invalid lifetime and not as unterminated char literal). + is_id_start(self.first()) || self.first().is_digit(10) + }; - return if self.nth_char(0) == '\'' { - self.bump(); - let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } - } else { - Lifetime { starts_with_number } - }; + if !can_be_a_lifetime { + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Char { terminated }; + return Literal { kind, suffix_start }; } - // This is not a lifetime (checked above), parse a char literal. - let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); + // Either a lifetime or a character literal with + // length greater than 1. + + let starts_with_number = self.first().is_digit(10); + + // Skip the literal contents. + // First symbol can be a number (which isn't a valid identifier start), + // so skip it without any checks. + self.bump(); + self.eat_while(is_id_continue); + + // Check if after skipping literal contents we've met a closing + // single quote (which means that user attempted to create a + // string with single quotes). + if self.first() == '\'' { + self.bump(); + let kind = Char { terminated: true }; + return Literal { kind, suffix_start: self.len_consumed() }; } - let kind = Char { terminated }; - return Literal { kind, suffix_start }; + + return Lifetime { starts_with_number }; } fn single_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '\''); - // Parse `'''` as a single char literal. - if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { + // Check if it's a one-symbol literal. 
+ if self.second() == '\'' && self.first() != '\\' { + self.bump(); self.bump(); + return true; } + + // Literal has more than one symbol. + // Parse until either quotes are terminated or error is detected. - let mut first = true; loop { - match self.nth_char(0) { - // Probably beginning of the comment, which we don't want to include - // to the error report. - '/' if !first => break, - // Newline without following '\'' means unclosed quote, stop parsing. - '\n' if self.nth_char(1) != '\'' => break, - // End of file, stop parsing. - EOF_CHAR if self.is_eof() => break, + match self.first() { // Quotes are terminated, finish parsing. '\'' => { self.bump(); return true; } + // Probably beginning of the comment, which we don't want to include + // to the error report. + '/' => break, + // Newline without following '\'' means unclosed quote, stop parsing. + '\n' if self.second() != '\'' => break, + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, // Escaped slash is considered one character, so bump twice. '\\' => { self.bump(); @@ -579,8 +577,8 @@ impl Cursor<'_> { self.bump(); } } - first = false; } + // String was not terminated. false } @@ -588,62 +586,71 @@ impl Cursor<'_> { /// if string is terminated. fn double_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '"'); - loop { - match self.nth_char(0) { + while let Some(c) = self.bump() { + match c { '"' => { - self.bump(); return true; } - EOF_CHAR if self.is_eof() => return false, - '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => { + '\\' if self.first() == '\\' || self.first() == '"' => { + // Bump again to skip escaped character. self.bump(); } _ => (), } - self.bump(); } + // End of file reached. 
+ false } /// Eats the double-quoted string and returns a tuple of /// (amount of the '#' symbols, raw string started, raw string terminated) fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { debug_assert!(self.prev() == 'r'); + let mut started: bool = false; + let mut finished: bool = false; + // Count opening '#' symbols. - let n_hashes = { - let mut acc: usize = 0; - loop { - match self.bump() { - Some('#') => acc += 1, - Some('"') => break acc, - None | Some(_) => return (acc, false, false), - } + let n_hashes = self.eat_while(|c| c == '#'); + + // Check that string is started. + match self.bump() { + Some('"') => started = true, + _ => return (n_hashes, started, finished), + } + + // Skip the string contents and on each '"' character met, check if this is + // a raw string termination. + while !finished { + self.eat_while(|c| c != '"'); + + if self.is_eof() { + return (n_hashes, started, finished); } - }; - // Skip the string itself and check that amount of closing '#' - // symbols is equal to the amount of opening ones. - loop { - match self.bump() { - Some('"') => { - let mut acc = n_hashes; - while self.nth_char(0) == '#' && acc > 0 { - self.bump(); - acc -= 1; - } - if acc == 0 { - return (n_hashes, true, true); - } + // Eat closing double quote. + self.bump(); + + // Check that amount of closing '#' symbols + // is equal to the amount of opening ones.
+ let mut hashes_left = n_hashes; + let is_closing_hash = |c| { + if c == '#' && hashes_left != 0 { + hashes_left -= 1; + true + } else { + false } - Some(_) => (), - None => return (n_hashes, true, false), - } + }; + finished = self.eat_while(is_closing_hash) == n_hashes; } + + (n_hashes, started, finished) } fn eat_decimal_digits(&mut self) -> bool { let mut has_digits = false; loop { - match self.nth_char(0) { + match self.first() { '_' => { self.bump(); } @@ -660,7 +667,7 @@ impl Cursor<'_> { fn eat_hexadecimal_digits(&mut self) -> bool { let mut has_digits = false; loop { - match self.nth_char(0) { + match self.first() { '_' => { self.bump(); } @@ -674,23 +681,43 @@ impl Cursor<'_> { has_digits } - fn float_exponent(&mut self) -> Result<(), ()> { + /// Eats the float exponent. Returns true if at least one digit was met, + /// and returns false otherwise. + fn eat_float_exponent(&mut self) -> bool { debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - if self.nth_char(0) == '-' || self.nth_char(0) == '+' { + if self.first() == '-' || self.first() == '+' { self.bump(); } - if self.eat_decimal_digits() { Ok(()) } else { Err(()) } + self.eat_decimal_digits() } - // Eats the suffix if it's an identifier. + // Eats the suffix of the literal, e.g. "_u8". fn eat_literal_suffix(&mut self) { - if !is_id_start(self.nth_char(0)) { + self.eat_identifier(); + } + + // Eats the identifier. + fn eat_identifier(&mut self) { + if !is_id_start(self.first()) { return; } self.bump(); - while is_id_continue(self.nth_char(0)) { + self.eat_while(is_id_continue); + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + /// Returns amount of eaten symbols. + fn eat_while<F>(&mut self, mut predicate: F) -> usize + where + F: FnMut(char) -> bool + { + let mut eaten: usize = 0; + while predicate(self.first()) && !self.is_eof() { + eaten += 1; self.bump(); } + + eaten } }