Skip to content

Commit 0c6cc11

Browse files
committed
auto merge of #8445 : Florob/rust/unicode, r=graydon
This adds support for normalizing strings to Unicode Normalization Forms D and KD (NFD and NFKD). To enable this, the decomposition and canonical combining class properties are added to std::unicode. On my system this increases libstd's size by ~250KiB.
2 parents e664781 + 3d720c6 commit 0c6cc11

File tree

4 files changed

+2556
-34
lines changed

4 files changed

+2556
-34
lines changed

src/etc/unicode.py

Lines changed: 150 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,15 @@ def fetch(f):
2626
def load_unicode_data(f):
2727
fetch(f)
2828
gencats = {}
29+
combines = []
2930
canon_decomp = {}
3031
compat_decomp = {}
3132
curr_cat = ""
33+
curr_combine = ""
3234
c_lo = 0
3335
c_hi = 0
36+
com_lo = 0
37+
com_hi = 0
3438
for line in fileinput.input(f):
3539
fields = line.split(";")
3640
if len(fields) != 15:
@@ -69,7 +73,21 @@ def load_unicode_data(f):
6973
c_lo = code
7074
c_hi = code
7175

72-
return (canon_decomp, compat_decomp, gencats)
76+
if curr_combine == "":
77+
curr_combine = combine
78+
com_lo = code
79+
com_hi = code
80+
81+
if curr_combine == combine:
82+
com_hi = code
83+
else:
84+
if curr_combine != "0":
85+
combines.append((com_lo, com_hi, curr_combine))
86+
curr_combine = combine
87+
com_lo = code
88+
com_hi = code
89+
90+
return (canon_decomp, compat_decomp, gencats, combines)
7391

7492

7593
def load_derived_core_properties(f):
@@ -178,50 +196,149 @@ def emit_property_module_old(f, mod, tbl):
178196
f.write(" }\n\n")
179197
f.write("}\n")
180198

181-
def emit_decomp_module(f, canon, compat):
199+
def format_table_content(f, content, indent):
200+
line = " "*indent
201+
first = True
202+
for chunk in content.split(","):
203+
if len(line) + len(chunk) < 98:
204+
if first:
205+
line += chunk
206+
else:
207+
line += ", " + chunk
208+
first = False
209+
else:
210+
f.write(line + ",\n")
211+
line = " "*indent + chunk
212+
f.write(line)
213+
214+
def emit_decomp_module(f, canon, compat, combine):
182215
canon_keys = canon.keys()
183216
canon_keys.sort()
184217

185218
compat_keys = compat.keys()
186219
compat_keys.sort()
187-
f.write("mod decompose {\n\n");
188-
f.write(" export canonical, compatibility;\n\n")
189-
f.write(" fn canonical(c: char, i: block(char)) "
190-
+ "{ d(c, i, false); }\n\n")
191-
f.write(" fn compatibility(c: char, i: block(char)) "
192-
+"{ d(c, i, true); }\n\n")
193-
f.write(" fn d(c: char, i: block(char), k: bool) {\n")
220+
f.write("pub mod decompose {\n");
221+
f.write(" use option::Option;\n");
222+
f.write(" use option::{Some, None};\n");
223+
f.write(" use vec::ImmutableVector;\n");
224+
f.write("""
225+
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
226+
use cmp::{Equal, Less, Greater};
227+
match r.bsearch(|&(val, _)| {
228+
if c == val { Equal }
229+
else if val < c { Less }
230+
else { Greater }
231+
}) {
232+
Some(idx) => {
233+
let (_, result) = r[idx];
234+
Some(result)
235+
}
236+
None => None
237+
}
238+
}\n
239+
""")
194240

195-
f.write(" if c <= '\\x7f' { i(c); ret; }\n")
241+
f.write("""
242+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
243+
use cmp::{Equal, Less, Greater};
244+
match r.bsearch(|&(lo, hi, _)| {
245+
if lo <= c && c <= hi { Equal }
246+
else if hi < c { Less }
247+
else { Greater }
248+
}) {
249+
Some(idx) => {
250+
let (_, _, result) = r[idx];
251+
result
252+
}
253+
None => 0
254+
}
255+
}\n\n
256+
""")
196257

197-
# First check the canonical decompositions
198-
f.write(" // Canonical decomposition\n")
199-
f.write(" alt c {\n")
258+
f.write(" // Canonical decompositions\n")
259+
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
260+
data = ""
261+
first = True
200262
for char in canon_keys:
201-
f.write(" %s {\n" % escape_char(char))
263+
if not first:
264+
data += ","
265+
first = False
266+
data += "(%s,&[" % escape_char(char)
267+
first2 = True
202268
for d in canon[char]:
203-
f.write(" d(%s, i, k);\n"
204-
% escape_char(d))
205-
f.write(" }\n")
269+
if not first2:
270+
data += ","
271+
first2 = False
272+
data += escape_char(d)
273+
data += "])"
274+
format_table_content(f, data, 8)
275+
f.write("\n ];\n\n")
276+
277+
f.write(" // Compatibility decompositions\n")
278+
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
279+
data = ""
280+
first = True
281+
for char in compat_keys:
282+
if not first:
283+
data += ","
284+
first = False
285+
data += "(%s,&[" % escape_char(char)
286+
first2 = True
287+
for d in compat[char]:
288+
if not first2:
289+
data += ","
290+
first2 = False
291+
data += escape_char(d)
292+
data += "])"
293+
format_table_content(f, data, 8)
294+
f.write("\n ];\n\n")
295+
296+
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
297+
ix = 0
298+
for pair in combine:
299+
f.write(ch_prefix(ix))
300+
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
301+
ix += 1
302+
f.write("\n ];\n")
303+
304+
f.write(" pub fn canonical(c: char, i: &fn(char)) "
305+
+ "{ d(c, i, false); }\n\n")
306+
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
307+
+"{ d(c, i, true); }\n\n")
308+
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
309+
+ " bsearch_range_value_table(c, combining_class_table)\n"
310+
+ " }\n\n")
311+
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
312+
f.write(" use iterator::Iterator;\n");
206313

207-
f.write(" _ { }\n")
208-
f.write(" }\n\n")
314+
f.write(" if c <= '\\x7f' { i(c); return; }\n")
315+
316+
# First check the canonical decompositions
317+
f.write("""
318+
match bsearch_table(c, canonical_table) {
319+
Some(canon) => {
320+
for x in canon.iter() {
321+
d(*x, |b| i(b), k);
322+
}
323+
return;
324+
}
325+
None => ()
326+
}\n\n""")
209327

210328
# Bottom out if we're not doing compat.
211-
f.write(" if !k { i(c); ret; }\n\n ")
329+
f.write(" if !k { i(c); return; }\n")
212330

213331
# Then check the compatibility decompositions
214-
f.write(" // Compatibility decomposition\n")
215-
f.write(" alt c {\n")
216-
for char in compat_keys:
217-
f.write(" %s {\n" % escape_char(char))
218-
for d in compat[char]:
219-
f.write(" d(%s, i, k);\n"
220-
% escape_char(d))
221-
f.write(" }\n")
222-
223-
f.write(" _ { }\n")
224-
f.write(" }\n\n")
332+
f.write("""
333+
match bsearch_table(c, compatibility_table) {
334+
Some(compat) => {
335+
for x in compat.iter() {
336+
d(*x, |b| i(b), k);
337+
}
338+
return;
339+
}
340+
None => ()
341+
}\n\n""")
225342

226343
# Finally bottom out.
227344
f.write(" i(c);\n")
@@ -234,7 +351,7 @@ def emit_decomp_module(f, canon, compat):
234351
os.remove(i);
235352
rf = open(r, "w")
236353

237-
(canon_decomp, compat_decomp, gencats) = load_unicode_data("UnicodeData.txt")
354+
(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt")
238355

239356
# Preamble
240357
rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
@@ -256,7 +373,7 @@ def emit_decomp_module(f, canon, compat):
256373

257374
emit_property_module(rf, "general_category", gencats)
258375

259-
#emit_decomp_module(rf, canon_decomp, compat_decomp)
376+
emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
260377

261378
derived = load_derived_core_properties("DerivedCoreProperties.txt")
262379
emit_property_module(rf, "derived_property", derived)

src/libstd/char.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
use option::{None, Option, Some};
1414
use int;
1515
use str::StrSlice;
16-
use unicode::{derived_property, general_category};
16+
use unicode::{derived_property, general_category, decompose};
1717

1818
#[cfg(test)] use str::OwnedStr;
1919

@@ -202,6 +202,51 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
202202
}
203203
}
204204

205+
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
206+
static S_BASE: uint = 0xAC00;
207+
static L_BASE: uint = 0x1100;
208+
static V_BASE: uint = 0x1161;
209+
static T_BASE: uint = 0x11A7;
210+
static L_COUNT: uint = 19;
211+
static V_COUNT: uint = 21;
212+
static T_COUNT: uint = 28;
213+
static N_COUNT: uint = (V_COUNT * T_COUNT);
214+
static S_COUNT: uint = (L_COUNT * N_COUNT);
215+
216+
// Decompose a precomposed Hangul syllable
217+
fn decompose_hangul(s: char, f: &fn(char)) {
218+
let si = s as uint - S_BASE;
219+
220+
let li = si / N_COUNT;
221+
f((L_BASE + li) as char);
222+
223+
let vi = (si % N_COUNT) / T_COUNT;
224+
f((V_BASE + vi) as char);
225+
226+
let ti = si % T_COUNT;
227+
if ti > 0 {
228+
f((T_BASE + ti) as char);
229+
}
230+
}
231+
232+
/// Returns the canonical decomposition of a character
233+
pub fn decompose_canonical(c: char, f: &fn(char)) {
234+
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
235+
decompose::canonical(c, f);
236+
} else {
237+
decompose_hangul(c, f);
238+
}
239+
}
240+
241+
/// Returns the compatibility decomposition of a character
242+
pub fn decompose_compatible(c: char, f: &fn(char)) {
243+
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
244+
decompose::compatibility(c, f);
245+
} else {
246+
decompose_hangul(c, f);
247+
}
248+
}
249+
205250
///
206251
/// Return the hexadecimal unicode escape of a char.
207252
///

0 commit comments

Comments
 (0)