Skip to content

Commit 8812e8a

Browse files
huonw authored and alexcrichton committed
syntax: calculate positions of multibyte characters more correctly.
They are still not completely correct, since this does not handle graphemes at all, just codepoints, but at least it handles the common case correctly. The calculation was previously very wrong (rather than just a little bit wrong): it wasn't accounting for the fact that every character is at least 1 byte, and so multibyte characters were pretending to be zero width. cc #8706
1 parent ff79a44 commit 8812e8a

File tree

4 files changed

+83
-4
lines changed

4 files changed

+83
-4
lines changed

src/libsyntax/codemap.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -460,11 +460,12 @@ impl CodeMap {
460460
for mbc in multibyte_chars.get().iter() {
461461
debug!("codemap: {:?}-byte char at {:?}", mbc.bytes, mbc.pos);
462462
if mbc.pos < bpos {
463-
total_extra_bytes += mbc.bytes;
463+
// every character is at least one byte, so we only
464+
// count the actual extra bytes.
465+
total_extra_bytes += mbc.bytes - 1;
464466
// We should never see a byte position in the middle of a
465467
// character
466-
assert!(bpos == mbc.pos ||
467-
bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
468+
assert!(bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
468469
} else {
469470
break;
470471
}

src/test/run-make/unicode-input/Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,8 @@ all:
44
# check that we don't ICE on unicode input, issue #11178
55
$(RUSTC) multiple_files.rs
66
$(call RUN,multiple_files) "$(RUSTC)" "$(TMPDIR)"
7+
8+
# check that our multibyte-ident spans are (approximately) the
9+
# correct length. issue #8706
10+
$(RUSTC) span_length.rs
11+
$(call RUN,span_length) "$(RUSTC)" "$(TMPDIR)"

src/test/run-make/unicode-input/multiple_files.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
111
use std::{char, os, run, str};
212
use std::rand::{task_rng, Rng};
313
use std::io::File;
@@ -36,7 +46,8 @@ fn main() {
3646

3747
for _ in range(0, 100) {
3848
{
39-
let mut w = File::create(&tmpdir.join("unicode_input_multiple_files_chars.rs")).unwrap();
49+
let randoms = tmpdir.join("unicode_input_multiple_files_chars.rs");
50+
let mut w = File::create(&randoms).unwrap();
4051
for _ in range(0, 30) {
4152
let _ = w.write_char(random_char());
4253
}

src/test/run-make/unicode-input/span_length.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use std::{char, os, run, str};
12+
use std::rand::{task_rng, Rng};
13+
use std::io::File;
14+
15+
// creates a file with `fn main() { <random ident> }` and checks the
16+
// compiler emits a span of the appropriate length (for the
17+
// "unresolved name" message); currently just using the number of code
18+
// points, but should be the number of graphemes (FIXME #7043)
19+
20+
fn random_char() -> char {
21+
let mut rng = task_rng();
22+
// a subset of the XID_start unicode table (ensuring that the
23+
// compiler doesn't fail with an "unrecognised token" error)
24+
let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
25+
1 => (0x41, 0x5a),
26+
2 => (0xf8, 0x1ba),
27+
3 => (0x1401, 0x166c),
28+
_ => (0x10400, 0x1044f)
29+
};
30+
31+
char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
32+
}
33+
34+
fn main() {
35+
let args = os::args();
36+
let rustc = args[1].as_slice();
37+
let tmpdir = Path::new(args[2].as_slice());
38+
39+
let main_file = tmpdir.join("span_main.rs");
40+
let main_file_str = main_file.as_str().unwrap();
41+
42+
for _ in range(0, 100) {
43+
let n = task_rng().gen_range(3u, 20);
44+
45+
{
46+
let _ = write!(&mut File::create(&main_file).unwrap(),
47+
r"\#[feature(non_ascii_idents)]; fn main() \{ {} \}",
48+
// random string of length n
49+
range(0, n).map(|_| random_char()).collect::<~str>());
50+
}
51+
52+
// rustc is passed to us with --out-dir and -L etc., so we
53+
// can't exec it directly
54+
let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
55+
56+
let err = str::from_utf8_lossy(result.error);
57+
58+
// the span should end the line (e.g. no extra ~'s)
59+
let expected_span = "^" + "~".repeat(n - 1) + "\n";
60+
assert!(err.as_slice().contains(expected_span));
61+
}
62+
}

0 commit comments

Comments
 (0)