Skip to content

Commit aaa28c4

Browse files
committed
Merge pull request #73 from kwantam/master
optimize generated tables ; clean up unicode.py
2 parents b38fe7b + 3fc4d2c commit aaa28c4

File tree

2 files changed

+1081
-1868
lines changed

2 files changed

+1081
-1868
lines changed

scripts/unicode.py

Lines changed: 13 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222

2323
import fileinput, re, os, sys, operator
2424

25-
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
25+
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2626
// file at the top-level directory of this distribution and at
2727
// http://rust-lang.org/COPYRIGHT.
2828
//
@@ -67,7 +67,6 @@ def is_surrogate(n):
6767
def load_unicode_data(f):
6868
fetch(f)
6969
gencats = {}
70-
combines = {}
7170

7271
udict = {};
7372
range_start = -1;
@@ -98,22 +97,15 @@ def load_unicode_data(f):
9897
gencats[cat] = []
9998
gencats[cat].append(code)
10099

101-
# record combining class, if any
102-
if combine != "0":
103-
if combine not in combines:
104-
combines[combine] = []
105-
combines[combine].append(code)
106-
107100
# generate Not_Assigned from Assigned
108101
gencats["Cn"] = gen_unassigned(gencats["Assigned"])
109102
# Assigned is not a real category
110103
del(gencats["Assigned"])
111104
# Other contains Not_Assigned
112105
gencats["C"].extend(gencats["Cn"])
113106
gencats = group_cats(gencats)
114-
combines = to_combines(group_cats(combines))
115107

116-
return (gencats, combines)
108+
return gencats
117109

118110
def group_cats(cats):
119111
cats_out = {}
@@ -150,14 +142,6 @@ def gen_unassigned(assigned):
150142
return ([i for i in range(0, 0xd800) if i not in assigned] +
151143
[i for i in range(0xe000, 0x110000) if i not in assigned])
152144

153-
def to_combines(combs):
154-
combs_out = []
155-
for comb in combs:
156-
for (lo, hi) in combs[comb]:
157-
combs_out.append((lo, hi, comb))
158-
combs_out.sort(key=lambda comb: comb[0])
159-
return combs_out
160-
161145
def format_table_content(f, content, indent):
162146
line = " "*indent
163147
first = True
@@ -203,44 +187,12 @@ def load_properties(f, interestingprops):
203187
if prop not in props:
204188
props[prop] = []
205189
props[prop].append((d_lo, d_hi))
206-
return props
207190

208-
# load all widths of want_widths, except those in except_cats
209-
def load_east_asian_width(want_widths, except_cats):
210-
f = "EastAsianWidth.txt"
211-
fetch(f)
212-
widths = {}
213-
re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
214-
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
191+
# optimize props if possible
192+
for prop in props:
193+
props[prop] = group_cat(ungroup_cat(props[prop]))
215194

216-
for line in fileinput.input(f):
217-
width = None
218-
d_lo = 0
219-
d_hi = 0
220-
cat = None
221-
m = re1.match(line)
222-
if m:
223-
d_lo = m.group(1)
224-
d_hi = m.group(1)
225-
width = m.group(2)
226-
cat = m.group(3)
227-
else:
228-
m = re2.match(line)
229-
if m:
230-
d_lo = m.group(1)
231-
d_hi = m.group(2)
232-
width = m.group(3)
233-
cat = m.group(4)
234-
else:
235-
continue
236-
if cat in except_cats or width not in want_widths:
237-
continue
238-
d_lo = int(d_lo, 16)
239-
d_hi = int(d_hi, 16)
240-
if width not in widths:
241-
widths[width] = []
242-
widths[width].append((d_lo, d_hi))
243-
return widths
195+
return props
244196

245197
def escape_char(c):
246198
return "'\\u{%x}'" % c
@@ -261,7 +213,7 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
261213
format_table_content(f, data, 8)
262214
f.write("\n ];\n\n")
263215

264-
def emit_property_module(f, mod, tbl, emit_fn):
216+
def emit_property_module(f, mod, tbl):
265217
f.write("pub mod %s {\n" % mod)
266218
keys = tbl.keys()
267219
keys.sort()
@@ -286,43 +238,6 @@ def emit_regex_module(f, cats, w_data):
286238

287239
f.write("}\n\n")
288240

289-
def remove_from_wtable(wtable, val):
290-
wtable_out = []
291-
while wtable:
292-
if wtable[0][1] < val:
293-
wtable_out.append(wtable.pop(0))
294-
elif wtable[0][0] > val:
295-
break
296-
else:
297-
(wt_lo, wt_hi, width, width_cjk) = wtable.pop(0)
298-
if wt_lo == wt_hi == val:
299-
continue
300-
elif wt_lo == val:
301-
wtable_out.append((wt_lo+1, wt_hi, width, width_cjk))
302-
elif wt_hi == val:
303-
wtable_out.append((wt_lo, wt_hi-1, width, width_cjk))
304-
else:
305-
wtable_out.append((wt_lo, val-1, width, width_cjk))
306-
wtable_out.append((val+1, wt_hi, width, width_cjk))
307-
if wtable:
308-
wtable_out.extend(wtable)
309-
return wtable_out
310-
311-
312-
313-
def optimize_width_table(wtable):
314-
wtable_out = []
315-
w_this = wtable.pop(0)
316-
while wtable:
317-
if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]:
318-
w_tmp = wtable.pop(0)
319-
w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3])
320-
else:
321-
wtable_out.append(w_this)
322-
w_this = wtable.pop(0)
323-
wtable_out.append(w_this)
324-
return wtable_out
325-
326241
if __name__ == "__main__":
327242
r = "unicode.rs"
328243
if os.path.exists(r):
@@ -336,7 +251,7 @@ def optimize_width_table(wtable):
336251
with open("ReadMe.txt") as readme:
337252
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
338253
unicode_version = re.search(pattern, readme.read()).groups()
339-
(gencats, combines) = load_unicode_data("UnicodeData.txt")
254+
gencats = load_unicode_data("UnicodeData.txt")
340255
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
341256
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
342257
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
@@ -346,11 +261,11 @@ def optimize_width_table(wtable):
346261

347262
# all of these categories will also be available as \p{} in libregex
348263
allcats = []
349-
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
350-
("derived_property", derived, want_derived), \
351-
("script", scripts, []), \
352-
("property", props, ["White_Space"]):
353-
emit_property_module(rf, name, cat, pfuns)
264+
for (name, cat) in ("general_category", gencats), \
265+
("derived_property", derived), \
266+
("script", scripts), \
267+
("property", props):
268+
emit_property_module(rf, name, cat)
354269
allcats.extend(map(lambda x: (x, name), cat))
355270
allcats.sort(key=lambda c: c[0])
356271

0 commit comments

Comments
 (0)