Skip to content

Commit aa6bfaf

Browse files
committed
Make rustc_lexer::cursor::Cursor public.
`Cursor` is currently hidden, and the main tokenization path uses `rustc_lexer::first_token` which involves constructing a new `Cursor` for every single token, which is weird. Also, `first_token` can't handle empty input, so callers have to check for that first. This commit makes `Cursor` public, so `StringReader` can contain a `Cursor`, which results in a simpler structure. The commit also changes `StringReader::advance_token` so it returns an `Option<Token>`, simplifying the empty input case.
1 parent 33516ac commit aa6bfaf

File tree

4 files changed

+26
-36
lines changed

4 files changed

+26
-36
lines changed

compiler/rustc_lexer/src/cursor.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::str::Chars;
44
///
55
/// Next characters can be peeked via `first` method,
66
/// and position can be shifted forward via `bump` method.
7-
pub(crate) struct Cursor<'a> {
7+
pub struct Cursor<'a> {
88
initial_len: usize,
99
/// Iterator over chars. Slightly faster than a &str.
1010
chars: Chars<'a>,
@@ -15,7 +15,7 @@ pub(crate) struct Cursor<'a> {
1515
pub(crate) const EOF_CHAR: char = '\0';
1616

1717
impl<'a> Cursor<'a> {
18-
pub(crate) fn new(input: &'a str) -> Cursor<'a> {
18+
pub fn new(input: &'a str) -> Cursor<'a> {
1919
Cursor {
2020
initial_len: input.len(),
2121
chars: input.chars(),

compiler/rustc_lexer/src/lib.rs

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
// We want to be able to build this crate with a stable compiler, so no
2424
// `#![feature]` attributes should be added.
2525

26-
mod cursor;
26+
pub mod cursor;
2727
pub mod unescape;
2828

2929
#[cfg(test)]
@@ -219,13 +219,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
219219
None
220220
}
221221

222-
/// Parses the first token from the provided input string.
223-
#[inline]
224-
pub fn first_token(input: &str) -> Token {
225-
debug_assert!(!input.is_empty());
226-
Cursor::new(input).advance_token()
227-
}
228-
229222
/// Validates a raw string literal. Used for getting more information about a
230223
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
231224
#[inline]
@@ -242,14 +235,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
242235
/// Creates an iterator that produces tokens from the input string.
243236
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
244237
let mut cursor = Cursor::new(input);
245-
std::iter::from_fn(move || {
246-
if cursor.is_eof() {
247-
None
248-
} else {
249-
cursor.reset_len_consumed();
250-
Some(cursor.advance_token())
251-
}
252-
})
238+
std::iter::from_fn(move || cursor.advance_token())
253239
}
254240

255241
/// True if `c` is considered a whitespace according to Rust language definition.
@@ -311,8 +297,8 @@ pub fn is_ident(string: &str) -> bool {
311297

312298
impl Cursor<'_> {
313299
/// Parses a token from the input string.
314-
fn advance_token(&mut self) -> Token {
315-
let first_char = self.bump().unwrap();
300+
pub fn advance_token(&mut self) -> Option<Token> {
301+
let first_char = self.bump()?;
316302
let token_kind = match first_char {
317303
// Slash, comment or block comment.
318304
'/' => match self.first() {
@@ -433,7 +419,9 @@ impl Cursor<'_> {
433419
}
434420
_ => Unknown,
435421
};
436-
Token::new(token_kind, self.len_consumed())
422+
let res = Some(Token::new(token_kind, self.len_consumed()));
423+
self.reset_len_consumed();
424+
res
437425
}
438426

439427
fn line_comment(&mut self) -> TokenKind {

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
44
use rustc_ast::tokenstream::TokenStream;
55
use rustc_ast::util::unicode::contains_text_flow_control_chars;
66
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult};
7+
use rustc_lexer::cursor::Cursor;
78
use rustc_lexer::unescape::{self, Mode};
89
use rustc_lexer::{Base, DocStyle, RawStrError};
910
use rustc_session::lint::builtin::{
@@ -48,7 +49,9 @@ pub(crate) fn parse_token_trees<'a>(
4849
start_pos = start_pos + BytePos::from_usize(shebang_len);
4950
}
5051

51-
let string_reader = StringReader { sess, start_pos, pos: start_pos, src, override_span };
52+
let cursor = Cursor::new(src);
53+
let string_reader =
54+
StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
5255
tokentrees::TokenTreesReader::parse_token_trees(string_reader)
5356
}
5457

@@ -60,6 +63,8 @@ struct StringReader<'a> {
6063
pos: BytePos,
6164
/// Source text to tokenize.
6265
src: &'a str,
66+
/// Cursor for getting lexer tokens.
67+
cursor: Cursor<'a>,
6368
override_span: Option<Span>,
6469
}
6570

@@ -75,15 +80,13 @@ impl<'a> StringReader<'a> {
7580

7681
// Skip trivial (whitespace & comments) tokens
7782
loop {
78-
let start_src_index = self.src_index(self.pos);
79-
let text: &str = &self.src[start_src_index..];
80-
81-
if text.is_empty() {
82-
let span = self.mk_sp(self.pos, self.pos);
83-
return (Token::new(token::Eof, span), preceded_by_whitespace);
84-
}
85-
86-
let token = rustc_lexer::first_token(text);
83+
let token = match self.cursor.advance_token() {
84+
Some(token) => token,
85+
None => {
86+
let span = self.mk_sp(self.pos, self.pos);
87+
return (Token::new(token::Eof, span), preceded_by_whitespace);
88+
}
89+
};
8790

8891
let start = self.pos;
8992
self.pos = self.pos + BytePos(token.len);

src/librustdoc/html/highlight.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use std::collections::VecDeque;
1313
use std::fmt::{Display, Write};
1414

1515
use rustc_data_structures::fx::FxHashMap;
16+
use rustc_lexer::cursor::Cursor;
1617
use rustc_lexer::{LiteralKind, TokenKind};
1718
use rustc_span::edition::Edition;
1819
use rustc_span::symbol::Symbol;
@@ -408,15 +409,13 @@ enum Highlight<'a> {
408409

409410
struct TokenIter<'a> {
410411
src: &'a str,
412+
cursor: Cursor<'a>,
411413
}
412414

413415
impl<'a> Iterator for TokenIter<'a> {
414416
type Item = (TokenKind, &'a str);
415417
fn next(&mut self) -> Option<(TokenKind, &'a str)> {
416-
if self.src.is_empty() {
417-
return None;
418-
}
419-
let token = rustc_lexer::first_token(self.src);
418+
let token = self.cursor.advance_token()?;
420419
let (text, rest) = self.src.split_at(token.len as usize);
421420
self.src = rest;
422421
Some((token.kind, text))
@@ -525,7 +524,7 @@ impl<'a> Classifier<'a> {
525524
/// Takes as argument the source code to HTML-ify, the rust edition to use and the source code
526525
/// file span which will be used later on by the `span_correspondance_map`.
527526
fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> {
528-
let tokens = PeekIter::new(TokenIter { src });
527+
let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
529528
let decorations = decoration_info.map(Decorations::new);
530529
Classifier {
531530
tokens,

0 commit comments

Comments
 (0)