Skip to content

Commit fb4dba0

Browse files
committed
Inline and remove cook_lexer_token.
This is a small performance win, alas.
1 parent da84f0f commit fb4dba0

File tree

1 file changed

+174
-171
lines changed
  • compiler/rustc_parse/src/lexer

1 file changed

+174
-171
lines changed

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 174 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,182 @@ impl<'a> StringReader<'a> {
8686

8787
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
8888

89-
match self.cook_lexer_token(token.kind, start) {
90-
Some(kind) => {
89+
// Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
90+
// rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
91+
// additional validation.
92+
let kind = match token.kind {
93+
rustc_lexer::TokenKind::LineComment { doc_style } => {
94+
// Skip non-doc comments
95+
let Some(doc_style) = doc_style else {
96+
self.lint_unicode_text_flow(start);
97+
preceded_by_whitespace = true;
98+
continue;
99+
};
100+
101+
// Opening delimiter of the length 3 is not included into the symbol.
102+
let content_start = start + BytePos(3);
103+
let content = self.str_from(content_start);
104+
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
105+
}
106+
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
107+
if !terminated {
108+
self.report_unterminated_block_comment(start, doc_style);
109+
}
110+
111+
// Skip non-doc comments
112+
let Some(doc_style) = doc_style else {
113+
self.lint_unicode_text_flow(start);
114+
preceded_by_whitespace = true;
115+
continue;
116+
};
117+
118+
// Opening delimiter of the length 3 and closing delimiter of the length 2
119+
// are not included into the symbol.
120+
let content_start = start + BytePos(3);
121+
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
122+
let content = self.str_from_to(content_start, content_end);
123+
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
124+
}
125+
rustc_lexer::TokenKind::Whitespace => {
126+
preceded_by_whitespace = true;
127+
continue;
128+
}
129+
rustc_lexer::TokenKind::Ident => {
130+
let sym = nfc_normalize(self.str_from(start));
91131
let span = self.mk_sp(start, self.pos);
92-
return (Token::new(kind, span), preceded_by_whitespace);
132+
self.sess.symbol_gallery.insert(sym, span);
133+
token::Ident(sym, false)
93134
}
94-
None => preceded_by_whitespace = true,
95-
}
135+
rustc_lexer::TokenKind::RawIdent => {
136+
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
137+
let span = self.mk_sp(start, self.pos);
138+
self.sess.symbol_gallery.insert(sym, span);
139+
if !sym.can_be_raw() {
140+
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
141+
}
142+
self.sess.raw_identifier_spans.borrow_mut().push(span);
143+
token::Ident(sym, true)
144+
}
145+
rustc_lexer::TokenKind::UnknownPrefix => {
146+
self.report_unknown_prefix(start);
147+
let sym = nfc_normalize(self.str_from(start));
148+
let span = self.mk_sp(start, self.pos);
149+
self.sess.symbol_gallery.insert(sym, span);
150+
token::Ident(sym, false)
151+
}
152+
rustc_lexer::TokenKind::InvalidIdent
153+
// Do not recover an identifier with emoji if the codepoint is a confusable
154+
// with a recoverable substitution token, like `➖`.
155+
if !UNICODE_ARRAY
156+
.iter()
157+
.any(|&(c, _, _)| {
158+
let sym = self.str_from(start);
159+
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
160+
}) =>
161+
{
162+
let sym = nfc_normalize(self.str_from(start));
163+
let span = self.mk_sp(start, self.pos);
164+
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
165+
.push(span);
166+
token::Ident(sym, false)
167+
}
168+
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
169+
let suffix_start = start + BytePos(suffix_start);
170+
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
171+
let suffix = if suffix_start < self.pos {
172+
let string = self.str_from(suffix_start);
173+
if string == "_" {
174+
self.sess
175+
.span_diagnostic
176+
.struct_span_warn(
177+
self.mk_sp(suffix_start, self.pos),
178+
"underscore literal suffix is not allowed",
179+
)
180+
.warn(
181+
"this was previously accepted by the compiler but is \
182+
being phased out; it will become a hard error in \
183+
a future release!",
184+
)
185+
.note(
186+
"see issue #42326 \
187+
<https://github.com/rust-lang/rust/issues/42326> \
188+
for more information",
189+
)
190+
.emit();
191+
None
192+
} else {
193+
Some(Symbol::intern(string))
194+
}
195+
} else {
196+
None
197+
};
198+
token::Literal(token::Lit { kind, symbol, suffix })
199+
}
200+
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
201+
// Include the leading `'` in the real identifier, for macro
202+
// expansion purposes. See #12512 for the gory details of why
203+
// this is necessary.
204+
let lifetime_name = self.str_from(start);
205+
if starts_with_number {
206+
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
207+
}
208+
let ident = Symbol::intern(lifetime_name);
209+
token::Lifetime(ident)
210+
}
211+
rustc_lexer::TokenKind::Semi => token::Semi,
212+
rustc_lexer::TokenKind::Comma => token::Comma,
213+
rustc_lexer::TokenKind::Dot => token::Dot,
214+
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
215+
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
216+
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
217+
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
218+
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
219+
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
220+
rustc_lexer::TokenKind::At => token::At,
221+
rustc_lexer::TokenKind::Pound => token::Pound,
222+
rustc_lexer::TokenKind::Tilde => token::Tilde,
223+
rustc_lexer::TokenKind::Question => token::Question,
224+
rustc_lexer::TokenKind::Colon => token::Colon,
225+
rustc_lexer::TokenKind::Dollar => token::Dollar,
226+
rustc_lexer::TokenKind::Eq => token::Eq,
227+
rustc_lexer::TokenKind::Bang => token::Not,
228+
rustc_lexer::TokenKind::Lt => token::Lt,
229+
rustc_lexer::TokenKind::Gt => token::Gt,
230+
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
231+
rustc_lexer::TokenKind::And => token::BinOp(token::And),
232+
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
233+
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
234+
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
235+
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
236+
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
237+
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
238+
239+
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
240+
let c = self.str_from(start).chars().next().unwrap();
241+
let mut err =
242+
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
243+
// FIXME: the lexer could be used to turn the ASCII version of unicode
244+
// homoglyphs, instead of keeping a table in `check_for_substitution` into the
245+
// token. Ideally, this should be inside `rustc_lexer`. However, we should
246+
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
247+
// fancier error recovery to it, as there will be less overall work to do this
248+
// way.
249+
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
250+
if c == '\x00' {
251+
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
252+
}
253+
err.emit();
254+
if let Some(token) = token {
255+
token
256+
} else {
257+
preceded_by_whitespace = true;
258+
continue;
259+
}
260+
}
261+
rustc_lexer::TokenKind::Eof => token::Eof,
262+
};
263+
let span = self.mk_sp(start, self.pos);
264+
return (Token::new(kind, span), preceded_by_whitespace);
96265
}
97266
}
98267

@@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
158327
}
159328
}
160329

161-
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
162-
/// `rustc_ast::TokenKind`. This turns strings into interned
163-
/// symbols and runs additional validation.
164-
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
165-
Some(match token {
166-
rustc_lexer::TokenKind::LineComment { doc_style } => {
167-
// Skip non-doc comments
168-
let Some(doc_style) = doc_style else {
169-
self.lint_unicode_text_flow(start);
170-
return None;
171-
};
172-
173-
// Opening delimiter of the length 3 is not included into the symbol.
174-
let content_start = start + BytePos(3);
175-
let content = self.str_from(content_start);
176-
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
177-
}
178-
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
179-
if !terminated {
180-
self.report_unterminated_block_comment(start, doc_style);
181-
}
182-
183-
// Skip non-doc comments
184-
let Some(doc_style) = doc_style else {
185-
self.lint_unicode_text_flow(start);
186-
return None;
187-
};
188-
189-
// Opening delimiter of the length 3 and closing delimiter of the length 2
190-
// are not included into the symbol.
191-
let content_start = start + BytePos(3);
192-
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
193-
let content = self.str_from_to(content_start, content_end);
194-
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
195-
}
196-
rustc_lexer::TokenKind::Whitespace => return None,
197-
rustc_lexer::TokenKind::Ident => {
198-
let sym = nfc_normalize(self.str_from(start));
199-
let span = self.mk_sp(start, self.pos);
200-
self.sess.symbol_gallery.insert(sym, span);
201-
token::Ident(sym, false)
202-
}
203-
rustc_lexer::TokenKind::RawIdent => {
204-
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
205-
let span = self.mk_sp(start, self.pos);
206-
self.sess.symbol_gallery.insert(sym, span);
207-
if !sym.can_be_raw() {
208-
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
209-
}
210-
self.sess.raw_identifier_spans.borrow_mut().push(span);
211-
token::Ident(sym, true)
212-
}
213-
rustc_lexer::TokenKind::UnknownPrefix => {
214-
self.report_unknown_prefix(start);
215-
let sym = nfc_normalize(self.str_from(start));
216-
let span = self.mk_sp(start, self.pos);
217-
self.sess.symbol_gallery.insert(sym, span);
218-
token::Ident(sym, false)
219-
}
220-
rustc_lexer::TokenKind::InvalidIdent
221-
// Do not recover an identifier with emoji if the codepoint is a confusable
222-
// with a recoverable substitution token, like `➖`.
223-
if !UNICODE_ARRAY
224-
.iter()
225-
.any(|&(c, _, _)| {
226-
let sym = self.str_from(start);
227-
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
228-
})
229-
=>
230-
{
231-
let sym = nfc_normalize(self.str_from(start));
232-
let span = self.mk_sp(start, self.pos);
233-
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
234-
token::Ident(sym, false)
235-
}
236-
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
237-
let suffix_start = start + BytePos(suffix_start);
238-
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
239-
let suffix = if suffix_start < self.pos {
240-
let string = self.str_from(suffix_start);
241-
if string == "_" {
242-
self.sess
243-
.span_diagnostic
244-
.struct_span_warn(
245-
self.mk_sp(suffix_start, self.pos),
246-
"underscore literal suffix is not allowed",
247-
)
248-
.warn(
249-
"this was previously accepted by the compiler but is \
250-
being phased out; it will become a hard error in \
251-
a future release!",
252-
)
253-
.note(
254-
"see issue #42326 \
255-
<https://github.com/rust-lang/rust/issues/42326> \
256-
for more information",
257-
)
258-
.emit();
259-
None
260-
} else {
261-
Some(Symbol::intern(string))
262-
}
263-
} else {
264-
None
265-
};
266-
token::Literal(token::Lit { kind, symbol, suffix })
267-
}
268-
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
269-
// Include the leading `'` in the real identifier, for macro
270-
// expansion purposes. See #12512 for the gory details of why
271-
// this is necessary.
272-
let lifetime_name = self.str_from(start);
273-
if starts_with_number {
274-
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
275-
}
276-
let ident = Symbol::intern(lifetime_name);
277-
token::Lifetime(ident)
278-
}
279-
rustc_lexer::TokenKind::Semi => token::Semi,
280-
rustc_lexer::TokenKind::Comma => token::Comma,
281-
rustc_lexer::TokenKind::Dot => token::Dot,
282-
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
283-
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
284-
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
285-
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
286-
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
287-
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
288-
rustc_lexer::TokenKind::At => token::At,
289-
rustc_lexer::TokenKind::Pound => token::Pound,
290-
rustc_lexer::TokenKind::Tilde => token::Tilde,
291-
rustc_lexer::TokenKind::Question => token::Question,
292-
rustc_lexer::TokenKind::Colon => token::Colon,
293-
rustc_lexer::TokenKind::Dollar => token::Dollar,
294-
rustc_lexer::TokenKind::Eq => token::Eq,
295-
rustc_lexer::TokenKind::Bang => token::Not,
296-
rustc_lexer::TokenKind::Lt => token::Lt,
297-
rustc_lexer::TokenKind::Gt => token::Gt,
298-
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
299-
rustc_lexer::TokenKind::And => token::BinOp(token::And),
300-
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
301-
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
302-
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
303-
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
304-
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
305-
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
306-
307-
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
308-
let c = self.str_from(start).chars().next().unwrap();
309-
let mut err =
310-
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
311-
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
312-
// instead of keeping a table in `check_for_substitution` into the token. Ideally,
313-
// this should be inside `rustc_lexer`. However, we should first remove compound
314-
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
315-
// as there will be less overall work to do this way.
316-
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
317-
if c == '\x00' {
318-
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
319-
}
320-
err.emit();
321-
token?
322-
}
323-
rustc_lexer::TokenKind::Eof => token::Eof,
324-
})
325-
}
326-
327330
fn cook_doc_comment(
328331
&self,
329332
content_start: BytePos,

0 commit comments

Comments
 (0)