Skip to content

Commit 36c55b2

Browse files
committed
Add support in lexer for utf8 identifiers. No NFKC logic in char yet.
1 parent 5fd0a3b commit 36c55b2

File tree

4 files changed

+48
-5
lines changed

4 files changed

+48
-5
lines changed

doc/rust.texi

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,10 @@ otherwise defined as keywords or reserved
595595
tokens. @xref{Ref.Lex.Key}. @xref{Ref.Lex.Res}.
596596

597597
That is: an identifier starts with any character having derived property
598-
@code{XID_Start} and continues with zero or more characters having derived
599-
property @code{XID_Continue}; and such an identifier is NFKC-normalized during
600-
lexing, such that all subsequent comparison of identifiers is performed on the
598+
@code{XID_Start}, or the character U+005F (underscore, @code{_}), and
599+
continues with zero or more characters having derived property
600+
@code{XID_Continue}. An identifier is NFKC-normalized during lexing, such
601+
that all subsequent comparison of identifiers is performed on the
601602
NFKC-normalized forms.
602603

603604
@emph{TODO: define relationship between Unicode and Rust versions}.

src/comp/syntax/parse/lexer.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,16 @@ fn next_token(rdr: reader) -> {tok: token::token, chpos: uint, bpos: uint} {
309309
fn next_token_inner(rdr: reader) -> token::token {
310310
let accum_str = "";
311311
let c = rdr.curr();
312-
if is_alpha(c) || c == '_' {
313-
while is_alnum(c) || c == '_' {
312+
if char::is_XID_start(c) || c == '_' {
313+
while char::is_XID_continue(c) {
314314
str::push_char(accum_str, c);
315315
rdr.bump();
316316
c = rdr.curr();
317317
}
318318
if str::eq(accum_str, "_") { ret token::UNDERSCORE; }
319319
let is_mod_name = c == ':' && rdr.next() == ':';
320+
321+
// FIXME: perform NFKC normalization here.
320322
ret token::IDENT(interner::intern::<str>(*rdr.get_interner(),
321323
accum_str), is_mod_name);
322324
}

src/libcore/char.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ Utilities for manipulating the char type
3737
Cn Unassigned a reserved unassigned code point or a noncharacter
3838
*/
3939

40+
export is_alphabetic,
41+
is_XID_start, is_XID_continue,
42+
is_lowercase, is_uppercase,
43+
is_whitespace, is_alphanumeric,
44+
to_digit, maybe_digit, cmp;
45+
4046
import is_alphabetic = unicode::derived_property::Alphabetic;
4147
import is_XID_start = unicode::derived_property::XID_Start;
4248
import is_XID_continue = unicode::derived_property::XID_Continue;

src/test/run-pass/utf8_idents.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Run-pass test entry point for non-ASCII (UTF-8) identifiers.
// Binds two float locals whose names are written in non-Latin scripts, checks
// a small arithmetic identity on them, and then calls a function whose name
// is itself non-ASCII, expecting it to return 0.
fn main() {
2+
let Π = 3.14;
3+
let लंच = Π * Π + 1.54;
4+
// NOTE(review): this is an exact floating-point equality check; it relies on
// adding and then subtracting 1.54 round-tripping without rounding error.
// Consider comparing against a small tolerance instead -- TODO confirm.
assert लंच - 1.54 == Π * Π;
5+
// The callee returns (10 + 10) >> 10, which is 0.
assert საჭმელად_გემრიელი_სადილი() == 0;
6+
}
7+
8+
// Exercises identifiers written in a variety of scripts as local variable
// names. Every local is bound to the integer 10, so each assertion below is a
// fixed arithmetic fact about 10s; the point of the test is that the names
// lex and resolve, not the arithmetic itself.
//
// Returns (10 + 10) >> 10, i.e. 0.
fn საჭმელად_გემრიელი_სადილი() -> int {
9+
10+
// Lunch in several languages.
11+
12+
let ランチ = 10;
13+
let 午餐 = 10;
14+
15+
// This name and the next are written in right-to-left scripts; the first also
// contains an ASCII underscore in the middle of the identifier.
let ארוחת_צהריי = 10;
16+
let غداء = 10;
17+
let լանչ = 10;
18+
let обед = 10;
19+
let абед = 10;
20+
let μεσημεριανό = 10;
21+
// Latin letters with diacritics.
let hádegismatur = 10;
22+
let ручек = 10;
23+
24+
let ăn_trưa = 10;
25+
let อาหารกลางวัน = 10;
26+
27+
// Lunchy arithmetic, mm.
28+
29+
// 10 * 10 * 10
assert hádegismatur * ручек * обед == 1000;
30+
assert 10 == ארוחת_צהריי;
31+
// 10 + 10 + 10
assert ランチ + 午餐 + μεσημεριανό == 30;
32+
// 10 + 10
assert ăn_trưa + อาหารกลางวัน == 20;
33+
// (10 + 10) >> 10 == 20 >> 10 == 0, the value main() asserts on.
ret (абед + լանչ) >> غداء;
34+
}

0 commit comments

Comments
 (0)