@@ -86,13 +86,182 @@ impl<'a> StringReader<'a> {

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

-            match self.cook_lexer_token(token.kind, start) {
-                Some(kind) => {
+            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
+            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
+            // additional validation.
+            let kind = match token.kind {
+                rustc_lexer::TokenKind::LineComment { doc_style } => {
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 is not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content = self.str_from(content_start);
+                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                }
+                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
+                    if !terminated {
+                        self.report_unterminated_block_comment(start, doc_style);
+                    }
+
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 and closing delimiter of the length 2
+                    // are not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                    let content = self.str_from_to(content_start, content_end);
+                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                }
+                rustc_lexer::TokenKind::Whitespace => {
+                    preceded_by_whitespace = true;
+                    continue;
+                }
+                rustc_lexer::TokenKind::Ident => {
+                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
-                    return (Token::new(kind, span), preceded_by_whitespace);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
                }
-                None => preceded_by_whitespace = true,
-            }
+                rustc_lexer::TokenKind::RawIdent => {
+                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    if !sym.can_be_raw() {
+                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
+                    }
+                    self.sess.raw_identifier_spans.borrow_mut().push(span);
+                    token::Ident(sym, true)
+                }
+                rustc_lexer::TokenKind::UnknownPrefix => {
+                    self.report_unknown_prefix(start);
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::InvalidIdent
+                    // Do not recover an identifier with emoji if the codepoint is a confusable
+                    // with a recoverable substitution token, like `➖`.
+                    if !UNICODE_ARRAY
+                        .iter()
+                        .any(|&(c, _, _)| {
+                            let sym = self.str_from(start);
+                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                        }) =>
+                {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
+                        .push(span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
+                    let suffix_start = start + BytePos(suffix_start);
+                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
+                    let suffix = if suffix_start < self.pos {
+                        let string = self.str_from(suffix_start);
+                        if string == "_" {
+                            self.sess
+                                .span_diagnostic
+                                .struct_span_warn(
+                                    self.mk_sp(suffix_start, self.pos),
+                                    "underscore literal suffix is not allowed",
+                                )
+                                .warn(
+                                    "this was previously accepted by the compiler but is \
+                                       being phased out; it will become a hard error in \
+                                       a future release!",
+                                )
+                                .note(
+                                    "see issue #42326 \
+                                     <https://github.com/rust-lang/rust/issues/42326> \
+                                     for more information",
+                                )
+                                .emit();
+                            None
+                        } else {
+                            Some(Symbol::intern(string))
+                        }
+                    } else {
+                        None
+                    };
+                    token::Literal(token::Lit { kind, symbol, suffix })
+                }
+                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                    // Include the leading `'` in the real identifier, for macro
+                    // expansion purposes. See #12512 for the gory details of why
+                    // this is necessary.
+                    let lifetime_name = self.str_from(start);
+                    if starts_with_number {
+                        self.err_span_(start, self.pos, "lifetimes cannot start with a number");
+                    }
+                    let ident = Symbol::intern(lifetime_name);
+                    token::Lifetime(ident)
+                }
+                rustc_lexer::TokenKind::Semi => token::Semi,
+                rustc_lexer::TokenKind::Comma => token::Comma,
+                rustc_lexer::TokenKind::Dot => token::Dot,
+                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::At => token::At,
+                rustc_lexer::TokenKind::Pound => token::Pound,
+                rustc_lexer::TokenKind::Tilde => token::Tilde,
+                rustc_lexer::TokenKind::Question => token::Question,
+                rustc_lexer::TokenKind::Colon => token::Colon,
+                rustc_lexer::TokenKind::Dollar => token::Dollar,
+                rustc_lexer::TokenKind::Eq => token::Eq,
+                rustc_lexer::TokenKind::Bang => token::Not,
+                rustc_lexer::TokenKind::Lt => token::Lt,
+                rustc_lexer::TokenKind::Gt => token::Gt,
+                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
+                rustc_lexer::TokenKind::And => token::BinOp(token::And),
+                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
+                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
+                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
+                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
+                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
+                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
+
+                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
+                    let c = self.str_from(start).chars().next().unwrap();
+                    let mut err =
+                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                    // FIXME: the lexer could be used to turn the ASCII version of unicode
+                    // homoglyphs, instead of keeping a table in `check_for_substitution` into the
+                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
+                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
+                    // fancier error recovery to it, as there will be less overall work to do this
+                    // way.
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    if c == '\x00' {
+                        err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
+                    }
+                    err.emit();
+                    if let Some(token) = token {
+                        token
+                    } else {
+                        preceded_by_whitespace = true;
+                        continue;
+                    }
+                }
+                rustc_lexer::TokenKind::Eof => token::Eof,
+            };
+            let span = self.mk_sp(start, self.pos);
+            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }

@@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
        }
    }

-    /// Turns simple `rustc_lexer::TokenKind` enum into a rich
-    /// `rustc_ast::TokenKind`. This turns strings into interned
-    /// symbols and runs additional validation.
-    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
-        Some(match token {
-            rustc_lexer::TokenKind::LineComment { doc_style } => {
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 is not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content = self.str_from(content_start);
-                self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
-            }
-            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
-                if !terminated {
-                    self.report_unterminated_block_comment(start, doc_style);
-                }
-
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 and closing delimiter of the length 2
-                // are not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
-                let content = self.str_from_to(content_start, content_end);
-                self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
-            }
-            rustc_lexer::TokenKind::Whitespace => return None,
-            rustc_lexer::TokenKind::Ident => {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::RawIdent => {
-                let sym = nfc_normalize(self.str_from(start + BytePos(2)));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                if !sym.can_be_raw() {
-                    self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
-                }
-                self.sess.raw_identifier_spans.borrow_mut().push(span);
-                token::Ident(sym, true)
-            }
-            rustc_lexer::TokenKind::UnknownPrefix => {
-                self.report_unknown_prefix(start);
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::InvalidIdent
-                // Do not recover an identifier with emoji if the codepoint is a confusable
-                // with a recoverable substitution token, like `➖`.
-                if !UNICODE_ARRAY
-                    .iter()
-                    .any(|&(c, _, _)| {
-                        let sym = self.str_from(start);
-                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
-                    })
-                =>
-            {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
-                let suffix_start = start + BytePos(suffix_start);
-                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
-                let suffix = if suffix_start < self.pos {
-                    let string = self.str_from(suffix_start);
-                    if string == "_" {
-                        self.sess
-                            .span_diagnostic
-                            .struct_span_warn(
-                                self.mk_sp(suffix_start, self.pos),
-                                "underscore literal suffix is not allowed",
-                            )
-                            .warn(
-                                "this was previously accepted by the compiler but is \
-                                   being phased out; it will become a hard error in \
-                                   a future release!",
-                            )
-                            .note(
-                                "see issue #42326 \
-                                 <https://github.com/rust-lang/rust/issues/42326> \
-                                 for more information",
-                            )
-                            .emit();
-                        None
-                    } else {
-                        Some(Symbol::intern(string))
-                    }
-                } else {
-                    None
-                };
-                token::Literal(token::Lit { kind, symbol, suffix })
-            }
-            rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
-                // Include the leading `'` in the real identifier, for macro
-                // expansion purposes. See #12512 for the gory details of why
-                // this is necessary.
-                let lifetime_name = self.str_from(start);
-                if starts_with_number {
-                    self.err_span_(start, self.pos, "lifetimes cannot start with a number");
-                }
-                let ident = Symbol::intern(lifetime_name);
-                token::Lifetime(ident)
-            }
-            rustc_lexer::TokenKind::Semi => token::Semi,
-            rustc_lexer::TokenKind::Comma => token::Comma,
-            rustc_lexer::TokenKind::Dot => token::Dot,
-            rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::At => token::At,
-            rustc_lexer::TokenKind::Pound => token::Pound,
-            rustc_lexer::TokenKind::Tilde => token::Tilde,
-            rustc_lexer::TokenKind::Question => token::Question,
-            rustc_lexer::TokenKind::Colon => token::Colon,
-            rustc_lexer::TokenKind::Dollar => token::Dollar,
-            rustc_lexer::TokenKind::Eq => token::Eq,
-            rustc_lexer::TokenKind::Bang => token::Not,
-            rustc_lexer::TokenKind::Lt => token::Lt,
-            rustc_lexer::TokenKind::Gt => token::Gt,
-            rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
-            rustc_lexer::TokenKind::And => token::BinOp(token::And),
-            rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
-            rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
-            rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
-            rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
-            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
-            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
-
-            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                let c = self.str_from(start).chars().next().unwrap();
-                let mut err =
-                    self.struct_err_span_char(start, self.pos, "unknown start of token", c);
-                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
-                // instead of keeping a table in `check_for_substitution` into the token. Ideally,
-                // this should be inside `rustc_lexer`. However, we should first remove compound
-                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
-                // as there will be less overall work to do this way.
-                let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
-                if c == '\x00' {
-                    err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
-                }
-                err.emit();
-                token?
-            }
-            rustc_lexer::TokenKind::Eof => token::Eof,
-        })
-    }
-
    fn cook_doc_comment(
        &self,
        content_start: BytePos,
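The shape of this change is easier to see in isolation. The sketch below is a simplified, hypothetical reduction, not the rustc code: the `RawKind`, `Cooked`, and `next_token` names are invented for illustration. It shows the control-flow pattern the diff adopts: instead of an `Option`-returning "cook" helper whose caller must match on `None` and loop again, the cooking `match` is inlined into the loop, so each arm either yields a rich token or sets `preceded_by_whitespace` and `continue`s directly.

```rust
// Standalone sketch of the refactoring pattern above (all names hypothetical).
#[derive(Debug)]
enum RawKind {
    Whitespace,
    Ident(String),
    Semi,
}

#[derive(Debug)]
enum Cooked {
    Ident(String),
    Semi,
}

fn next_token(raw: &mut impl Iterator<Item = RawKind>) -> (Option<Cooked>, bool) {
    let mut preceded_by_whitespace = false;
    loop {
        let Some(token) = raw.next() else { return (None, preceded_by_whitespace) };
        // Inlined "cooking": each arm either produces a rich token or skips the
        // raw token and continues the loop, rather than returning Option to a
        // caller that has to handle None and re-loop.
        let kind = match token {
            RawKind::Whitespace => {
                preceded_by_whitespace = true;
                continue;
            }
            RawKind::Ident(name) => Cooked::Ident(name),
            RawKind::Semi => Cooked::Semi,
        };
        return (Some(kind), preceded_by_whitespace);
    }
}

fn main() {
    let mut raw =
        vec![RawKind::Whitespace, RawKind::Ident("x".to_string()), RawKind::Semi].into_iter();
    println!("{:?}", next_token(&mut raw)); // (Some(Ident("x")), true)
    println!("{:?}", next_token(&mut raw)); // (Some(Semi), false)
}
```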