From 713af75c221a26909d541772a3d6592d633e82a9 Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Fri, 28 Feb 2014 20:10:42 +0100 Subject: [PATCH 1/5] Update unicode.py to actually generate unicode.rs --- src/etc/unicode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 71c5c8f8a9527..cce302ed972b4 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -376,6 +376,7 @@ def emit_decomp_module(f, canon, compat, combine): #[allow(missing_doc)]; #[allow(non_uppercase_statics)]; +#[allow(dead_code)]; ''') From ab05d1ad6ab19650d8b1f24a756e38d36b50bca5 Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Sat, 1 Mar 2014 22:36:30 +0100 Subject: [PATCH 2/5] std: Use appropriately sized integers for codepoints and bytes --- src/libstd/char.rs | 72 ++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/src/libstd/char.rs b/src/libstd/char.rs index 4c0f77586db44..05a503ecffcc9 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -36,13 +36,14 @@ use unicode::{derived_property, property, general_category, decompose}; #[cfg(not(test))] use default::Default; // UTF-8 ranges and tags for encoding characters -static TAG_CONT: uint = 128u; -static MAX_ONE_B: uint = 128u; -static TAG_TWO_B: uint = 192u; -static MAX_TWO_B: uint = 2048u; -static TAG_THREE_B: uint = 224u; -static MAX_THREE_B: uint = 65536u; -static TAG_FOUR_B: uint = 240u; +static TAG_CONT: u8 = 128u8; +static MAX_ONE_B: u32 = 128u32; +static TAG_TWO_B: u8 = 192u8; +static MAX_TWO_B: u32 = 2048u32; +static TAG_THREE_B: u8 = 224u8; +static MAX_THREE_B: u32 = 65536u32; +static TAG_FOUR_B: u8 = 240u8; +static MAX_FOUR_B: u32 = 2097152u32; /* Lu Uppercase_Letter an uppercase letter @@ -256,37 +257,37 @@ pub fn from_digit(num: uint, radix: uint) -> Option { } // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: uint = 0xAC00; -static L_BASE: uint = 0x1100; -static V_BASE: uint = 
0x1161; -static T_BASE: uint = 0x11A7; -static L_COUNT: uint = 19; -static V_COUNT: uint = 21; -static T_COUNT: uint = 28; -static N_COUNT: uint = (V_COUNT * T_COUNT); -static S_COUNT: uint = (L_COUNT * N_COUNT); +static S_BASE: u32 = 0xAC00; +static L_BASE: u32 = 0x1100; +static V_BASE: u32 = 0x1161; +static T_BASE: u32 = 0x11A7; +static L_COUNT: u32 = 19; +static V_COUNT: u32 = 21; +static T_COUNT: u32 = 28; +static N_COUNT: u32 = (V_COUNT * T_COUNT); +static S_COUNT: u32 = (L_COUNT * N_COUNT); // Decompose a precomposed Hangul syllable fn decompose_hangul(s: char, f: |char|) { - let si = s as uint - S_BASE; + let si = s as u32 - S_BASE; let li = si / N_COUNT; unsafe { - f(transmute((L_BASE + li) as u32)); + f(transmute(L_BASE + li)); let vi = (si % N_COUNT) / T_COUNT; - f(transmute((V_BASE + vi) as u32)); + f(transmute(V_BASE + vi)); let ti = si % T_COUNT; if ti > 0 { - f(transmute((T_BASE + ti) as u32)); + f(transmute(T_BASE + ti)); } } } /// Returns the canonical decomposition of a character pub fn decompose_canonical(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { + if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { decompose::canonical(c, f); } else { decompose_hangul(c, f); @@ -295,7 +296,7 @@ pub fn decompose_canonical(c: char, f: |char|) { /// Returns the compatibility decomposition of a character pub fn decompose_compatible(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { + if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { decompose::compatibility(c, f); } else { decompose_hangul(c, f); @@ -357,12 +358,7 @@ pub fn escape_default(c: char, f: |char|) { /// Returns the amount of bytes this `char` would need if encoded in UTF-8 pub fn len_utf8_bytes(c: char) -> uint { - static MAX_ONE_B: uint = 128u; - static MAX_TWO_B: uint = 2048u; - static MAX_THREE_B: uint = 65536u; - static MAX_FOUR_B: uint = 2097152u; - - let code = c as uint; + let code = c as u32; match 
() { _ if code < MAX_ONE_B => 1u, _ if code < MAX_TWO_B => 2u, @@ -430,24 +426,24 @@ impl Char for char { fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) } fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint { - let code = *self as uint; + let code = *self as u32; if code < MAX_ONE_B { dst[0] = code as u8; return 1; } else if code < MAX_TWO_B { - dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8; - dst[1] = (code & 63u | TAG_CONT) as u8; + dst[0] = (code >> 6u & 31u32) as u8 | TAG_TWO_B; + dst[1] = (code & 63u32) as u8 | TAG_CONT; return 2; } else if code < MAX_THREE_B { - dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8; - dst[1] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[2] = (code & 63u | TAG_CONT) as u8; + dst[0] = (code >> 12u & 15u32) as u8 | TAG_THREE_B; + dst[1] = (code >> 6u & 63u32) as u8 | TAG_CONT; + dst[2] = (code & 63u32) as u8 | TAG_CONT; return 3; } else { - dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8; - dst[1] = (code >> 12u & 63u | TAG_CONT) as u8; - dst[2] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[3] = (code & 63u | TAG_CONT) as u8; + dst[0] = (code >> 18u & 7u32) as u8 | TAG_FOUR_B; + dst[1] = (code >> 12u & 63u32) as u8 | TAG_CONT; + dst[2] = (code >> 6u & 63u32) as u8 | TAG_CONT; + dst[3] = (code & 63u32) as u8 | TAG_CONT; return 4; } } From bb587d9791133645eccb118341dee0798996fb50 Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Thu, 6 Mar 2014 04:41:43 +0100 Subject: [PATCH 3/5] std: Move Hangul decomposition into unicode.rs --- src/etc/unicode.py | 86 +++++++++++++++++++++++++++++++------------ src/libstd/char.rs | 54 +++------------------------ src/libstd/unicode.rs | 48 +++++++++++++++++++++++- 3 files changed, 115 insertions(+), 73 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index cce302ed972b4..7b93e10b26549 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -309,20 +309,28 @@ def emit_decomp_module(f, canon, compat, combine): ix += 1 f.write("\n ];\n") - f.write(" pub fn canonical(c: char, 
i: |char|) " - + "{ d(c, i, false); }\n\n") - f.write(" pub fn compatibility(c: char, i: |char|) " - +"{ d(c, i, true); }\n\n") - f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n" - + " bsearch_range_value_table(c, combining_class_table)\n" - + " }\n\n") - f.write(" fn d(c: char, i: |char|, k: bool) {\n") - f.write(" use iter::Iterator;\n"); - - f.write(" if c <= '\\x7f' { i(c); return; }\n") - - # First check the canonical decompositions f.write(""" + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } + + pub fn canonical_combining_class(c: char) -> u8 { + bsearch_range_value_table(c, combining_class_table) + } + + fn d(c: char, i: |char|, k: bool) { + use iter::Iterator; + + // 7-bit ASCII never decomposes + if c <= '\\x7f' { i(c); return; } + + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -331,13 +339,12 @@ def emit_decomp_module(f, canon, compat, combine): return; } None => () - }\n\n""") + } - # Bottom out if we're not doing compat. - f.write(" if !k { i(c); return; }\n") + // Bottom out if we're not doing compat. + if !k { i(c); return; } - # Then check the compatibility decompositions - f.write(""" + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -346,12 +353,45 @@ def emit_decomp_module(f, canon, compat, combine): return; } None => () - }\n\n""") + } - # Finally bottom out. - f.write(" i(c);\n") - f.write(" }\n") - f.write("}\n\n") + // Finally bottom out. 
+ i(c); + } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use cast::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } +} + +""") r = "unicode.rs" for i in [r]: diff --git a/src/libstd/char.rs b/src/libstd/char.rs index 05a503ecffcc9..c731b922eb4eb 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -28,7 +28,12 @@ use cast::transmute; use option::{None, Option, Some}; use iter::{Iterator, range_step}; use str::StrSlice; -use unicode::{derived_property, property, general_category, decompose}; +use unicode::{derived_property, property, general_category}; + +/// Returns the canonical decomposition of a character. +pub use unicode::normalization::decompose_canonical; +/// Returns the compatibility decomposition of a character. 
+pub use unicode::normalization::decompose_compatible; #[cfg(test)] use str::OwnedStr; @@ -256,53 +261,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option { } } -// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: u32 = 0xAC00; -static L_BASE: u32 = 0x1100; -static V_BASE: u32 = 0x1161; -static T_BASE: u32 = 0x11A7; -static L_COUNT: u32 = 19; -static V_COUNT: u32 = 21; -static T_COUNT: u32 = 28; -static N_COUNT: u32 = (V_COUNT * T_COUNT); -static S_COUNT: u32 = (L_COUNT * N_COUNT); - -// Decompose a precomposed Hangul syllable -fn decompose_hangul(s: char, f: |char|) { - let si = s as u32 - S_BASE; - - let li = si / N_COUNT; - unsafe { - f(transmute(L_BASE + li)); - - let vi = (si % N_COUNT) / T_COUNT; - f(transmute(V_BASE + vi)); - - let ti = si % T_COUNT; - if ti > 0 { - f(transmute(T_BASE + ti)); - } - } -} - -/// Returns the canonical decomposition of a character -pub fn decompose_canonical(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::canonical(c, f); - } else { - decompose_hangul(c, f); - } -} - -/// Returns the compatibility decomposition of a character -pub fn decompose_compatible(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::compatibility(c, f); - } else { - decompose_hangul(c, f); - } -} - /// /// Returns the hexadecimal Unicode escape of a `char` /// diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index 144500fac5d96..866580ce9c986 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -3624,9 +3624,10 @@ pub mod decompose { ('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220), ('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230) ]; - pub fn canonical(c: char, i: |char|) { d(c, i, false); } - pub fn compatibility(c: char, i: |char|) { d(c, i, true); } + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, 
i: |char|) { d(c, i, true); } pub fn canonical_combining_class(c: char) -> u8 { bsearch_range_value_table(c, combining_class_table) @@ -3634,8 +3635,17 @@ pub mod decompose { fn d(c: char, i: |char|, k: bool) { use iter::Iterator; + + // 7-bit ASCII never decomposes if c <= '\x7f' { i(c); return; } + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -3646,8 +3656,10 @@ pub mod decompose { None => () } + // Bottom out if we're not doing compat. if !k { i(c); return; } + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -3658,8 +3670,40 @@ pub mod decompose { None => () } + // Finally bottom out. i(c); } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use cast::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } } pub mod derived_property { From f41ecef1500dd15a88e7995617951c8b74b0ab5a Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Fri, 7 Mar 2014 03:39:06 +0100 Subject: [PATCH 4/5] std: Rename str::Normalizations to str::Decompositions --- src/etc/unicode.py | 2 +- src/libstd/str.rs | 36 ++++++++++++++++++------------------ 
src/libstd/unicode.rs | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 7b93e10b26549..6e058c37a01a1 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -225,7 +225,7 @@ def emit_decomp_module(f, canon, compat, combine): compat_keys = compat.keys() compat_keys.sort() - f.write("pub mod decompose {\n"); + f.write("pub mod normalization {\n"); f.write(" use option::Option;\n"); f.write(" use option::{Some, None};\n"); f.write(" use vec::ImmutableVector;\n"); diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 3464c4a1128e3..ce2c7e8af88f9 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -587,25 +587,25 @@ fn canonical_sort(comb: &mut [(char, u8)]) { } #[deriving(Clone)] -enum NormalizationForm { - NFD, - NFKD +enum DecompositionType { + Canonical, + Compatible } -/// External iterator for a string's normalization's characters. +/// External iterator for a string's decomposition's characters. /// Use with the `std::iter` module. #[deriving(Clone)] -pub struct Normalizations<'a> { - priv kind: NormalizationForm, +pub struct Decompositions<'a> { + priv kind: DecompositionType, priv iter: Chars<'a>, priv buffer: ~[(char, u8)], priv sorted: bool } -impl<'a> Iterator for Normalizations<'a> { +impl<'a> Iterator for Decompositions<'a> { #[inline] fn next(&mut self) -> Option { - use unicode::decompose::canonical_combining_class; + use unicode::normalization::canonical_combining_class; match self.buffer.head() { Some(&(c, 0)) => { @@ -621,8 +621,8 @@ impl<'a> Iterator for Normalizations<'a> { } let decomposer = match self.kind { - NFD => char::decompose_canonical, - NFKD => char::decompose_compatible + Canonical => char::decompose_canonical, + Compatible => char::decompose_compatible }; if !self.sorted { @@ -1858,11 +1858,11 @@ pub trait StrSlice<'a> { /// An Iterator over the string in Unicode Normalization Form D /// (canonical decomposition). 
- fn nfd_chars(&self) -> Normalizations<'a>; + fn nfd_chars(&self) -> Decompositions<'a>; /// An Iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). - fn nfkd_chars(&self) -> Normalizations<'a>; + fn nfkd_chars(&self) -> Decompositions<'a>; /// Returns true if the string contains only whitespace. /// @@ -2444,22 +2444,22 @@ impl<'a> StrSlice<'a> for &'a str { } #[inline] - fn nfd_chars(&self) -> Normalizations<'a> { - Normalizations { + fn nfd_chars(&self) -> Decompositions<'a> { + Decompositions { iter: self.chars(), buffer: ~[], sorted: false, - kind: NFD + kind: Canonical } } #[inline] - fn nfkd_chars(&self) -> Normalizations<'a> { - Normalizations { + fn nfkd_chars(&self) -> Decompositions<'a> { + Decompositions { iter: self.chars(), buffer: ~[], sorted: false, - kind: NFKD + kind: Compatible } } diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index 866580ce9c986..1edc26e21e207 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -1445,7 +1445,7 @@ pub mod general_category { } } -pub mod decompose { +pub mod normalization { use option::Option; use option::{Some, None}; use vec::ImmutableVector; From 8e444f289c5c71f7c891f8e01f74e06e8a89e557 Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Sun, 9 Mar 2014 22:20:24 +0100 Subject: [PATCH 5/5] std: Add support for NFC and NFKC --- src/etc/unicode.py | 82 +++++++++- src/libstd/str.rs | 169 +++++++++++++++++++- src/libstd/unicode.rs | 357 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 600 insertions(+), 8 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 6e058c37a01a1..eac56192b6e46 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -219,7 +219,7 @@ def format_table_content(f, content, indent): line = " "*indent + chunk f.write(line) -def emit_decomp_module(f, canon, compat, combine): +def emit_norm_module(f, canon, compat, combine, norm_props): canon_keys = canon.keys() canon_keys.sort() @@ -230,7 +230,7 @@ 
def emit_decomp_module(f, canon, compat, combine): f.write(" use option::{Some, None};\n"); f.write(" use vec::ImmutableVector;\n"); f.write(""" - fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> { + fn bsearch_table(c: char, r: &'static [(char, &'static [T])]) -> Option<&'static [T]> { use cmp::{Equal, Less, Greater}; match r.bsearch(|&(val, _)| { if c == val { Equal } @@ -301,6 +301,39 @@ def emit_decomp_module(f, canon, compat, combine): format_table_content(f, data, 8) f.write("\n ];\n\n") + + canon_comp = {} + comp_exclusions = norm_props["Full_Composition_Exclusion"] + for char in canon_keys: + if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions): + continue + decomp = canon[char] + if len(decomp) == 2: + if not canon_comp.has_key(decomp[0]): + canon_comp[decomp[0]] = [] + canon_comp[decomp[0]].append( (decomp[1], char) ) + canon_comp_keys = canon_comp.keys() + canon_comp_keys.sort() + f.write(" static composition_table : &'static [(char, &'static [(char, char)])] = &[\n") + data = "" + first = True + for char in canon_comp_keys: + if not first: + data += "," + first = False + data += "(%s, &[" % escape_char(char) + canon_comp[char].sort(lambda x, y: x[0] - y[0]) + first2 = True + for pair in canon_comp[char]: + if not first2: + data += "," + first2 = False + data += "(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])) + data += "])" + format_table_content(f, data, 8) + f.write("\n ];\n\n") + + f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n") ix = 0 for pair in combine: @@ -314,6 +347,28 @@ def emit_decomp_module(f, canon, compat, combine): pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } + pub fn compose(a: char, b: char) -> Option { + use cmp::{Equal, Less, Greater}; + compose_hangul(a, b).or_else(|| { + match bsearch_table(a, composition_table) { + None => None, + Some(candidates) => { + match candidates.bsearch(|&(val, _)| { + if b == val { Equal } 
+ else if val < b { Less } + else { Greater } + }) { + Some(idx) => { + let (_, result) = candidates[idx]; + Some(result) + } + None => None + } + } + } + }) + } + pub fn canonical_combining_class(c: char) -> u8 { bsearch_range_value_table(c, combining_class_table) } @@ -371,6 +426,7 @@ def emit_decomp_module(f, canon, compat, combine): static S_COUNT: u32 = (L_COUNT * N_COUNT); // Decompose a precomposed Hangul syllable + #[inline(always)] fn decompose_hangul(s: char, f: |char|) { use cast::transmute; @@ -389,6 +445,25 @@ def emit_decomp_module(f, canon, compat, combine): } } } + + // Compose a pair of Hangul Jamo + #[inline(always)] + fn compose_hangul(a: char, b: char) -> Option { + use cast::transmute; + let l = a as u32; + let v = b as u32; + // Compose an LPart and a VPart + if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) { + let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT; + unsafe { return Some(transmute(r)); } + } + // Compose an LVPart and a TPart + if S_BASE <= l && l <= (S_BASE+S_COUNT-T_COUNT) && T_BASE <= v && v < (T_BASE+T_COUNT) { + let r = l + (v - T_BASE); + unsafe { return Some(transmute(r)); } + } + None + } } """) @@ -422,7 +497,8 @@ def emit_decomp_module(f, canon, compat, combine): emit_property_module(rf, "general_category", gencats) -emit_decomp_module(rf, canon_decomp, compat_decomp, combines) +norm_props = load_properties("DerivedNormalizationProps.txt", ["Full_Composition_Exclusion"]) +emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props) derived = load_properties("DerivedCoreProperties.txt", ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index ce2c7e8af88f9..8abc8e90936aa 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -99,6 +99,7 @@ use option::{None, Option, Some}; use ptr; use ptr::RawPtr; use from_str::FromStr; +use unicode::normalization::{canonical_combining_class, compose}; use 
vec; use vec::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector}; use vec_ng::Vec; @@ -575,8 +576,8 @@ fn canonical_sort(comb: &mut [(char, u8)]) { for i in range(0, len) { let mut swapped = false; for j in range(1, len-i) { - let class_a = *comb[j-1].ref1(); - let class_b = *comb[j].ref1(); + let class_a = comb[j-1].val1(); + let class_b = comb[j].val1(); if class_a != 0 && class_b != 0 && class_a > class_b { comb.swap(j-1, j); swapped = true; @@ -605,8 +606,6 @@ pub struct Decompositions<'a> { impl<'a> Iterator for Decompositions<'a> { #[inline] fn next(&mut self) -> Option { - use unicode::normalization::canonical_combining_class; - match self.buffer.head() { Some(&(c, 0)) => { self.sorted = false; @@ -662,6 +661,106 @@ impl<'a> Iterator for Decompositions<'a> { } } +#[deriving(Clone)] +enum RecompositionState { + Composing, + Purging, + Finished +} + +/// External iterator for a string's recomposition's characters. +/// Use with the `std::iter` module. +#[deriving(Clone)] +pub struct Recompositions<'a> { + priv iter: Decompositions<'a>, + priv state: RecompositionState, + priv buffer: ~[char], + priv composee: Option, + priv last_ccc: Option +} + +impl<'a> Iterator for Recompositions<'a> { + #[inline] + fn next(&mut self) -> Option { + loop { + match self.state { + Composing => { + for ch in self.iter { + let ch_class = canonical_combining_class(ch); + if self.composee.is_none() { + if ch_class != 0 { + return Some(ch); + } + self.composee = Some(ch); + continue; + } + let k = self.composee.clone().unwrap(); + + match self.last_ccc { + None => { + match compose(k, ch) { + Some(r) => { + self.composee = Some(r); + continue; + } + None => { + if ch_class == 0 { + self.composee = Some(ch); + return Some(k); + } + self.buffer.push(ch); + self.last_ccc = Some(ch_class); + } + } + } + Some(l_class) => { + if l_class >= ch_class { + // `ch` is blocked from `composee` + if ch_class == 0 { + self.composee = Some(ch); + self.last_ccc = None; + self.state 
= Purging; + return Some(k); + } + self.buffer.push(ch); + self.last_ccc = Some(ch_class); + continue; + } + match compose(k, ch) { + Some(r) => { + self.composee = Some(r); + continue; + } + None => { + self.buffer.push(ch); + self.last_ccc = Some(ch_class); + } + } + } + } + } + self.state = Finished; + if self.composee.is_some() { + return self.composee.take(); + } + } + Purging => { + match self.buffer.shift() { + None => self.state = Composing, + s => return s + } + } + Finished => { + match self.buffer.shift() { + None => return self.composee.take(), + s => return s + } + } + } + } + } +} + /// Replace all occurrences of one string with another /// /// # Arguments @@ -1864,6 +1963,14 @@ pub trait StrSlice<'a> { /// (compatibility decomposition). fn nfkd_chars(&self) -> Decompositions<'a>; + /// An Iterator over the string in Unicode Normalization Form C + /// (canonical decomposition followed by canonical composition). + fn nfc_chars(&self) -> Recompositions<'a>; + + /// An Iterator over the string in Unicode Normalization Form KC + /// (compatibility decomposition followed by canonical composition). + fn nfkc_chars(&self) -> Recompositions<'a>; + /// Returns true if the string contains only whitespace. /// /// Whitespace characters are determined by `char::is_whitespace`. 
@@ -2463,6 +2570,28 @@ impl<'a> StrSlice<'a> for &'a str { } } + #[inline] + fn nfc_chars(&self) -> Recompositions<'a> { + Recompositions { + iter: self.nfd_chars(), + state: Composing, + buffer: ~[], + composee: None, + last_ccc: None + } + } + + #[inline] + fn nfkc_chars(&self) -> Recompositions<'a> { + Recompositions { + iter: self.nfkd_chars(), + state: Composing, + buffer: ~[], + composee: None, + last_ccc: None + } + } + #[inline] fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) } @@ -4257,6 +4386,38 @@ mod tests { assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162"); } + #[test] + fn test_nfc_chars() { + assert_eq!("abc".nfc_chars().collect::<~str>(), ~"abc"); + assert_eq!("\u1e0b\u01c4".nfc_chars().collect::<~str>(), ~"\u1e0b\u01c4"); + assert_eq!("\u2026".nfc_chars().collect::<~str>(), ~"\u2026"); + assert_eq!("\u2126".nfc_chars().collect::<~str>(), ~"\u03a9"); + assert_eq!("\u1e0b\u0323".nfc_chars().collect::<~str>(), ~"\u1e0d\u0307"); + assert_eq!("\u1e0d\u0307".nfc_chars().collect::<~str>(), ~"\u1e0d\u0307"); + assert_eq!("a\u0301".nfc_chars().collect::<~str>(), ~"\xe1"); + assert_eq!("\u0301a".nfc_chars().collect::<~str>(), ~"\u0301a"); + assert_eq!("\ud4db".nfc_chars().collect::<~str>(), ~"\ud4db"); + assert_eq!("\uac1c".nfc_chars().collect::<~str>(), ~"\uac1c"); + assert_eq!("a\u0300\u0305\u0315\u05aeb".nfc_chars().collect::<~str>(), + ~"\xe0\u05ae\u0305\u0315b"); + } + + #[test] + fn test_nfkc_chars() { + assert_eq!("abc".nfkc_chars().collect::<~str>(), ~"abc"); + assert_eq!("\u1e0b\u01c4".nfkc_chars().collect::<~str>(), ~"\u1e0bD\u017d"); + assert_eq!("\u2026".nfkc_chars().collect::<~str>(), ~"..."); + assert_eq!("\u2126".nfkc_chars().collect::<~str>(), ~"\u03a9"); + assert_eq!("\u1e0b\u0323".nfkc_chars().collect::<~str>(), ~"\u1e0d\u0307"); + assert_eq!("\u1e0d\u0307".nfkc_chars().collect::<~str>(), ~"\u1e0d\u0307"); + assert_eq!("a\u0301".nfkc_chars().collect::<~str>(), ~"\xe1"); + 
assert_eq!("\u0301a".nfkc_chars().collect::<~str>(), ~"\u0301a"); + assert_eq!("\ud4db".nfkc_chars().collect::<~str>(), ~"\ud4db"); + assert_eq!("\uac1c".nfkc_chars().collect::<~str>(), ~"\uac1c"); + assert_eq!("a\u0300\u0305\u0315\u05aeb".nfkc_chars().collect::<~str>(), + ~"\xe0\u05ae\u0305\u0315b"); + } + #[test] fn test_lines() { let data = "\nMäry häd ä little lämb\n\nLittle lämb\n"; diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index 1edc26e21e207..4e34ca59f5c88 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -1450,7 +1450,7 @@ pub mod normalization { use option::{Some, None}; use vec::ImmutableVector; - fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> { + fn bsearch_table(c: char, r: &'static [(char, &'static [T])]) -> Option<&'static [T]> { use cmp::{Equal, Less, Greater}; match r.bsearch(|&(val, _)| { if c == val { Equal } @@ -3478,6 +3478,319 @@ pub mod normalization { &['\u53ef']) ]; + static composition_table : &'static [(char, &'static [(char, char)])] = &[ + ('\x3c', &[('\u0338', '\u226e')]), ('\x3d', &[('\u0338', '\u2260')]), ('\x3e', + &[('\u0338', '\u226f')]), ('\x41', &[('\u0300', '\xc0'), ('\u0301', '\xc1'), ('\u0302', + '\xc2'), ('\u0303', '\xc3'), ('\u0304', '\u0100'), ('\u0306', '\u0102'), ('\u0307', + '\u0226'), ('\u0308', '\xc4'), ('\u0309', '\u1ea2'), ('\u030a', '\xc5'), ('\u030c', + '\u01cd'), ('\u030f', '\u0200'), ('\u0311', '\u0202'), ('\u0323', '\u1ea0'), ('\u0325', + '\u1e00'), ('\u0328', '\u0104')]), ('\x42', &[('\u0307', '\u1e02'), ('\u0323', + '\u1e04'), ('\u0331', '\u1e06')]), ('\x43', &[('\u0301', '\u0106'), ('\u0302', + '\u0108'), ('\u0307', '\u010a'), ('\u030c', '\u010c'), ('\u0327', '\xc7')]), ('\x44', + &[('\u0307', '\u1e0a'), ('\u030c', '\u010e'), ('\u0323', '\u1e0c'), ('\u0327', + '\u1e10'), ('\u032d', '\u1e12'), ('\u0331', '\u1e0e')]), ('\x45', &[('\u0300', '\xc8'), + ('\u0301', '\xc9'), ('\u0302', '\xca'), ('\u0303', '\u1ebc'), ('\u0304', '\u0112'), + 
('\u0306', '\u0114'), ('\u0307', '\u0116'), ('\u0308', '\xcb'), ('\u0309', '\u1eba'), + ('\u030c', '\u011a'), ('\u030f', '\u0204'), ('\u0311', '\u0206'), ('\u0323', '\u1eb8'), + ('\u0327', '\u0228'), ('\u0328', '\u0118'), ('\u032d', '\u1e18'), ('\u0330', + '\u1e1a')]), ('\x46', &[('\u0307', '\u1e1e')]), ('\x47', &[('\u0301', '\u01f4'), + ('\u0302', '\u011c'), ('\u0304', '\u1e20'), ('\u0306', '\u011e'), ('\u0307', '\u0120'), + ('\u030c', '\u01e6'), ('\u0327', '\u0122')]), ('\x48', &[('\u0302', '\u0124'), + ('\u0307', '\u1e22'), ('\u0308', '\u1e26'), ('\u030c', '\u021e'), ('\u0323', '\u1e24'), + ('\u0327', '\u1e28'), ('\u032e', '\u1e2a')]), ('\x49', &[('\u0300', '\xcc'), ('\u0301', + '\xcd'), ('\u0302', '\xce'), ('\u0303', '\u0128'), ('\u0304', '\u012a'), ('\u0306', + '\u012c'), ('\u0307', '\u0130'), ('\u0308', '\xcf'), ('\u0309', '\u1ec8'), ('\u030c', + '\u01cf'), ('\u030f', '\u0208'), ('\u0311', '\u020a'), ('\u0323', '\u1eca'), ('\u0328', + '\u012e'), ('\u0330', '\u1e2c')]), ('\x4a', &[('\u0302', '\u0134')]), ('\x4b', + &[('\u0301', '\u1e30'), ('\u030c', '\u01e8'), ('\u0323', '\u1e32'), ('\u0327', + '\u0136'), ('\u0331', '\u1e34')]), ('\x4c', &[('\u0301', '\u0139'), ('\u030c', + '\u013d'), ('\u0323', '\u1e36'), ('\u0327', '\u013b'), ('\u032d', '\u1e3c'), ('\u0331', + '\u1e3a')]), ('\x4d', &[('\u0301', '\u1e3e'), ('\u0307', '\u1e40'), ('\u0323', + '\u1e42')]), ('\x4e', &[('\u0300', '\u01f8'), ('\u0301', '\u0143'), ('\u0303', '\xd1'), + ('\u0307', '\u1e44'), ('\u030c', '\u0147'), ('\u0323', '\u1e46'), ('\u0327', '\u0145'), + ('\u032d', '\u1e4a'), ('\u0331', '\u1e48')]), ('\x4f', &[('\u0300', '\xd2'), ('\u0301', + '\xd3'), ('\u0302', '\xd4'), ('\u0303', '\xd5'), ('\u0304', '\u014c'), ('\u0306', + '\u014e'), ('\u0307', '\u022e'), ('\u0308', '\xd6'), ('\u0309', '\u1ece'), ('\u030b', + '\u0150'), ('\u030c', '\u01d1'), ('\u030f', '\u020c'), ('\u0311', '\u020e'), ('\u031b', + '\u01a0'), ('\u0323', '\u1ecc'), ('\u0328', '\u01ea')]), ('\x50', &[('\u0301', + '\u1e54'), 
('\u0307', '\u1e56')]), ('\x52', &[('\u0301', '\u0154'), ('\u0307', + '\u1e58'), ('\u030c', '\u0158'), ('\u030f', '\u0210'), ('\u0311', '\u0212'), ('\u0323', + '\u1e5a'), ('\u0327', '\u0156'), ('\u0331', '\u1e5e')]), ('\x53', &[('\u0301', + '\u015a'), ('\u0302', '\u015c'), ('\u0307', '\u1e60'), ('\u030c', '\u0160'), ('\u0323', + '\u1e62'), ('\u0326', '\u0218'), ('\u0327', '\u015e')]), ('\x54', &[('\u0307', + '\u1e6a'), ('\u030c', '\u0164'), ('\u0323', '\u1e6c'), ('\u0326', '\u021a'), ('\u0327', + '\u0162'), ('\u032d', '\u1e70'), ('\u0331', '\u1e6e')]), ('\x55', &[('\u0300', '\xd9'), + ('\u0301', '\xda'), ('\u0302', '\xdb'), ('\u0303', '\u0168'), ('\u0304', '\u016a'), + ('\u0306', '\u016c'), ('\u0308', '\xdc'), ('\u0309', '\u1ee6'), ('\u030a', '\u016e'), + ('\u030b', '\u0170'), ('\u030c', '\u01d3'), ('\u030f', '\u0214'), ('\u0311', '\u0216'), + ('\u031b', '\u01af'), ('\u0323', '\u1ee4'), ('\u0324', '\u1e72'), ('\u0328', '\u0172'), + ('\u032d', '\u1e76'), ('\u0330', '\u1e74')]), ('\x56', &[('\u0303', '\u1e7c'), + ('\u0323', '\u1e7e')]), ('\x57', &[('\u0300', '\u1e80'), ('\u0301', '\u1e82'), + ('\u0302', '\u0174'), ('\u0307', '\u1e86'), ('\u0308', '\u1e84'), ('\u0323', + '\u1e88')]), ('\x58', &[('\u0307', '\u1e8a'), ('\u0308', '\u1e8c')]), ('\x59', + &[('\u0300', '\u1ef2'), ('\u0301', '\xdd'), ('\u0302', '\u0176'), ('\u0303', '\u1ef8'), + ('\u0304', '\u0232'), ('\u0307', '\u1e8e'), ('\u0308', '\u0178'), ('\u0309', '\u1ef6'), + ('\u0323', '\u1ef4')]), ('\x5a', &[('\u0301', '\u0179'), ('\u0302', '\u1e90'), + ('\u0307', '\u017b'), ('\u030c', '\u017d'), ('\u0323', '\u1e92'), ('\u0331', + '\u1e94')]), ('\x61', &[('\u0300', '\xe0'), ('\u0301', '\xe1'), ('\u0302', '\xe2'), + ('\u0303', '\xe3'), ('\u0304', '\u0101'), ('\u0306', '\u0103'), ('\u0307', '\u0227'), + ('\u0308', '\xe4'), ('\u0309', '\u1ea3'), ('\u030a', '\xe5'), ('\u030c', '\u01ce'), + ('\u030f', '\u0201'), ('\u0311', '\u0203'), ('\u0323', '\u1ea1'), ('\u0325', '\u1e01'), + ('\u0328', '\u0105')]), ('\x62', 
&[('\u0307', '\u1e03'), ('\u0323', '\u1e05'), + ('\u0331', '\u1e07')]), ('\x63', &[('\u0301', '\u0107'), ('\u0302', '\u0109'), + ('\u0307', '\u010b'), ('\u030c', '\u010d'), ('\u0327', '\xe7')]), ('\x64', &[('\u0307', + '\u1e0b'), ('\u030c', '\u010f'), ('\u0323', '\u1e0d'), ('\u0327', '\u1e11'), ('\u032d', + '\u1e13'), ('\u0331', '\u1e0f')]), ('\x65', &[('\u0300', '\xe8'), ('\u0301', '\xe9'), + ('\u0302', '\xea'), ('\u0303', '\u1ebd'), ('\u0304', '\u0113'), ('\u0306', '\u0115'), + ('\u0307', '\u0117'), ('\u0308', '\xeb'), ('\u0309', '\u1ebb'), ('\u030c', '\u011b'), + ('\u030f', '\u0205'), ('\u0311', '\u0207'), ('\u0323', '\u1eb9'), ('\u0327', '\u0229'), + ('\u0328', '\u0119'), ('\u032d', '\u1e19'), ('\u0330', '\u1e1b')]), ('\x66', + &[('\u0307', '\u1e1f')]), ('\x67', &[('\u0301', '\u01f5'), ('\u0302', '\u011d'), + ('\u0304', '\u1e21'), ('\u0306', '\u011f'), ('\u0307', '\u0121'), ('\u030c', '\u01e7'), + ('\u0327', '\u0123')]), ('\x68', &[('\u0302', '\u0125'), ('\u0307', '\u1e23'), + ('\u0308', '\u1e27'), ('\u030c', '\u021f'), ('\u0323', '\u1e25'), ('\u0327', '\u1e29'), + ('\u032e', '\u1e2b'), ('\u0331', '\u1e96')]), ('\x69', &[('\u0300', '\xec'), ('\u0301', + '\xed'), ('\u0302', '\xee'), ('\u0303', '\u0129'), ('\u0304', '\u012b'), ('\u0306', + '\u012d'), ('\u0308', '\xef'), ('\u0309', '\u1ec9'), ('\u030c', '\u01d0'), ('\u030f', + '\u0209'), ('\u0311', '\u020b'), ('\u0323', '\u1ecb'), ('\u0328', '\u012f'), ('\u0330', + '\u1e2d')]), ('\x6a', &[('\u0302', '\u0135'), ('\u030c', '\u01f0')]), ('\x6b', + &[('\u0301', '\u1e31'), ('\u030c', '\u01e9'), ('\u0323', '\u1e33'), ('\u0327', + '\u0137'), ('\u0331', '\u1e35')]), ('\x6c', &[('\u0301', '\u013a'), ('\u030c', + '\u013e'), ('\u0323', '\u1e37'), ('\u0327', '\u013c'), ('\u032d', '\u1e3d'), ('\u0331', + '\u1e3b')]), ('\x6d', &[('\u0301', '\u1e3f'), ('\u0307', '\u1e41'), ('\u0323', + '\u1e43')]), ('\x6e', &[('\u0300', '\u01f9'), ('\u0301', '\u0144'), ('\u0303', '\xf1'), + ('\u0307', '\u1e45'), ('\u030c', '\u0148'), ('\u0323', 
'\u1e47'), ('\u0327', '\u0146'), + ('\u032d', '\u1e4b'), ('\u0331', '\u1e49')]), ('\x6f', &[('\u0300', '\xf2'), ('\u0301', + '\xf3'), ('\u0302', '\xf4'), ('\u0303', '\xf5'), ('\u0304', '\u014d'), ('\u0306', + '\u014f'), ('\u0307', '\u022f'), ('\u0308', '\xf6'), ('\u0309', '\u1ecf'), ('\u030b', + '\u0151'), ('\u030c', '\u01d2'), ('\u030f', '\u020d'), ('\u0311', '\u020f'), ('\u031b', + '\u01a1'), ('\u0323', '\u1ecd'), ('\u0328', '\u01eb')]), ('\x70', &[('\u0301', + '\u1e55'), ('\u0307', '\u1e57')]), ('\x72', &[('\u0301', '\u0155'), ('\u0307', + '\u1e59'), ('\u030c', '\u0159'), ('\u030f', '\u0211'), ('\u0311', '\u0213'), ('\u0323', + '\u1e5b'), ('\u0327', '\u0157'), ('\u0331', '\u1e5f')]), ('\x73', &[('\u0301', + '\u015b'), ('\u0302', '\u015d'), ('\u0307', '\u1e61'), ('\u030c', '\u0161'), ('\u0323', + '\u1e63'), ('\u0326', '\u0219'), ('\u0327', '\u015f')]), ('\x74', &[('\u0307', + '\u1e6b'), ('\u0308', '\u1e97'), ('\u030c', '\u0165'), ('\u0323', '\u1e6d'), ('\u0326', + '\u021b'), ('\u0327', '\u0163'), ('\u032d', '\u1e71'), ('\u0331', '\u1e6f')]), ('\x75', + &[('\u0300', '\xf9'), ('\u0301', '\xfa'), ('\u0302', '\xfb'), ('\u0303', '\u0169'), + ('\u0304', '\u016b'), ('\u0306', '\u016d'), ('\u0308', '\xfc'), ('\u0309', '\u1ee7'), + ('\u030a', '\u016f'), ('\u030b', '\u0171'), ('\u030c', '\u01d4'), ('\u030f', '\u0215'), + ('\u0311', '\u0217'), ('\u031b', '\u01b0'), ('\u0323', '\u1ee5'), ('\u0324', '\u1e73'), + ('\u0328', '\u0173'), ('\u032d', '\u1e77'), ('\u0330', '\u1e75')]), ('\x76', + &[('\u0303', '\u1e7d'), ('\u0323', '\u1e7f')]), ('\x77', &[('\u0300', '\u1e81'), + ('\u0301', '\u1e83'), ('\u0302', '\u0175'), ('\u0307', '\u1e87'), ('\u0308', '\u1e85'), + ('\u030a', '\u1e98'), ('\u0323', '\u1e89')]), ('\x78', &[('\u0307', '\u1e8b'), + ('\u0308', '\u1e8d')]), ('\x79', &[('\u0300', '\u1ef3'), ('\u0301', '\xfd'), ('\u0302', + '\u0177'), ('\u0303', '\u1ef9'), ('\u0304', '\u0233'), ('\u0307', '\u1e8f'), ('\u0308', + '\xff'), ('\u0309', '\u1ef7'), ('\u030a', '\u1e99'), 
('\u0323', '\u1ef5')]), ('\x7a', + &[('\u0301', '\u017a'), ('\u0302', '\u1e91'), ('\u0307', '\u017c'), ('\u030c', + '\u017e'), ('\u0323', '\u1e93'), ('\u0331', '\u1e95')]), ('\xa8', &[('\u0300', + '\u1fed'), ('\u0301', '\u0385'), ('\u0342', '\u1fc1')]), ('\xc2', &[('\u0300', + '\u1ea6'), ('\u0301', '\u1ea4'), ('\u0303', '\u1eaa'), ('\u0309', '\u1ea8')]), ('\xc4', + &[('\u0304', '\u01de')]), ('\xc5', &[('\u0301', '\u01fa')]), ('\xc6', &[('\u0301', + '\u01fc'), ('\u0304', '\u01e2')]), ('\xc7', &[('\u0301', '\u1e08')]), ('\xca', + &[('\u0300', '\u1ec0'), ('\u0301', '\u1ebe'), ('\u0303', '\u1ec4'), ('\u0309', + '\u1ec2')]), ('\xcf', &[('\u0301', '\u1e2e')]), ('\xd4', &[('\u0300', '\u1ed2'), + ('\u0301', '\u1ed0'), ('\u0303', '\u1ed6'), ('\u0309', '\u1ed4')]), ('\xd5', + &[('\u0301', '\u1e4c'), ('\u0304', '\u022c'), ('\u0308', '\u1e4e')]), ('\xd6', + &[('\u0304', '\u022a')]), ('\xd8', &[('\u0301', '\u01fe')]), ('\xdc', &[('\u0300', + '\u01db'), ('\u0301', '\u01d7'), ('\u0304', '\u01d5'), ('\u030c', '\u01d9')]), ('\xe2', + &[('\u0300', '\u1ea7'), ('\u0301', '\u1ea5'), ('\u0303', '\u1eab'), ('\u0309', + '\u1ea9')]), ('\xe4', &[('\u0304', '\u01df')]), ('\xe5', &[('\u0301', '\u01fb')]), + ('\xe6', &[('\u0301', '\u01fd'), ('\u0304', '\u01e3')]), ('\xe7', &[('\u0301', + '\u1e09')]), ('\xea', &[('\u0300', '\u1ec1'), ('\u0301', '\u1ebf'), ('\u0303', + '\u1ec5'), ('\u0309', '\u1ec3')]), ('\xef', &[('\u0301', '\u1e2f')]), ('\xf4', + &[('\u0300', '\u1ed3'), ('\u0301', '\u1ed1'), ('\u0303', '\u1ed7'), ('\u0309', + '\u1ed5')]), ('\xf5', &[('\u0301', '\u1e4d'), ('\u0304', '\u022d'), ('\u0308', + '\u1e4f')]), ('\xf6', &[('\u0304', '\u022b')]), ('\xf8', &[('\u0301', '\u01ff')]), + ('\xfc', &[('\u0300', '\u01dc'), ('\u0301', '\u01d8'), ('\u0304', '\u01d6'), ('\u030c', + '\u01da')]), ('\u0102', &[('\u0300', '\u1eb0'), ('\u0301', '\u1eae'), ('\u0303', + '\u1eb4'), ('\u0309', '\u1eb2')]), ('\u0103', &[('\u0300', '\u1eb1'), ('\u0301', + '\u1eaf'), ('\u0303', '\u1eb5'), ('\u0309', 
'\u1eb3')]), ('\u0112', &[('\u0300', + '\u1e14'), ('\u0301', '\u1e16')]), ('\u0113', &[('\u0300', '\u1e15'), ('\u0301', + '\u1e17')]), ('\u014c', &[('\u0300', '\u1e50'), ('\u0301', '\u1e52')]), ('\u014d', + &[('\u0300', '\u1e51'), ('\u0301', '\u1e53')]), ('\u015a', &[('\u0307', '\u1e64')]), + ('\u015b', &[('\u0307', '\u1e65')]), ('\u0160', &[('\u0307', '\u1e66')]), ('\u0161', + &[('\u0307', '\u1e67')]), ('\u0168', &[('\u0301', '\u1e78')]), ('\u0169', &[('\u0301', + '\u1e79')]), ('\u016a', &[('\u0308', '\u1e7a')]), ('\u016b', &[('\u0308', '\u1e7b')]), + ('\u017f', &[('\u0307', '\u1e9b')]), ('\u01a0', &[('\u0300', '\u1edc'), ('\u0301', + '\u1eda'), ('\u0303', '\u1ee0'), ('\u0309', '\u1ede'), ('\u0323', '\u1ee2')]), + ('\u01a1', &[('\u0300', '\u1edd'), ('\u0301', '\u1edb'), ('\u0303', '\u1ee1'), + ('\u0309', '\u1edf'), ('\u0323', '\u1ee3')]), ('\u01af', &[('\u0300', '\u1eea'), + ('\u0301', '\u1ee8'), ('\u0303', '\u1eee'), ('\u0309', '\u1eec'), ('\u0323', + '\u1ef0')]), ('\u01b0', &[('\u0300', '\u1eeb'), ('\u0301', '\u1ee9'), ('\u0303', + '\u1eef'), ('\u0309', '\u1eed'), ('\u0323', '\u1ef1')]), ('\u01b7', &[('\u030c', + '\u01ee')]), ('\u01ea', &[('\u0304', '\u01ec')]), ('\u01eb', &[('\u0304', '\u01ed')]), + ('\u0226', &[('\u0304', '\u01e0')]), ('\u0227', &[('\u0304', '\u01e1')]), ('\u0228', + &[('\u0306', '\u1e1c')]), ('\u0229', &[('\u0306', '\u1e1d')]), ('\u022e', &[('\u0304', + '\u0230')]), ('\u022f', &[('\u0304', '\u0231')]), ('\u0292', &[('\u030c', '\u01ef')]), + ('\u0391', &[('\u0300', '\u1fba'), ('\u0301', '\u0386'), ('\u0304', '\u1fb9'), + ('\u0306', '\u1fb8'), ('\u0313', '\u1f08'), ('\u0314', '\u1f09'), ('\u0345', + '\u1fbc')]), ('\u0395', &[('\u0300', '\u1fc8'), ('\u0301', '\u0388'), ('\u0313', + '\u1f18'), ('\u0314', '\u1f19')]), ('\u0397', &[('\u0300', '\u1fca'), ('\u0301', + '\u0389'), ('\u0313', '\u1f28'), ('\u0314', '\u1f29'), ('\u0345', '\u1fcc')]), + ('\u0399', &[('\u0300', '\u1fda'), ('\u0301', '\u038a'), ('\u0304', '\u1fd9'), + ('\u0306', '\u1fd8'), 
('\u0308', '\u03aa'), ('\u0313', '\u1f38'), ('\u0314', + '\u1f39')]), ('\u039f', &[('\u0300', '\u1ff8'), ('\u0301', '\u038c'), ('\u0313', + '\u1f48'), ('\u0314', '\u1f49')]), ('\u03a1', &[('\u0314', '\u1fec')]), ('\u03a5', + &[('\u0300', '\u1fea'), ('\u0301', '\u038e'), ('\u0304', '\u1fe9'), ('\u0306', + '\u1fe8'), ('\u0308', '\u03ab'), ('\u0314', '\u1f59')]), ('\u03a9', &[('\u0300', + '\u1ffa'), ('\u0301', '\u038f'), ('\u0313', '\u1f68'), ('\u0314', '\u1f69'), ('\u0345', + '\u1ffc')]), ('\u03ac', &[('\u0345', '\u1fb4')]), ('\u03ae', &[('\u0345', '\u1fc4')]), + ('\u03b1', &[('\u0300', '\u1f70'), ('\u0301', '\u03ac'), ('\u0304', '\u1fb1'), + ('\u0306', '\u1fb0'), ('\u0313', '\u1f00'), ('\u0314', '\u1f01'), ('\u0342', '\u1fb6'), + ('\u0345', '\u1fb3')]), ('\u03b5', &[('\u0300', '\u1f72'), ('\u0301', '\u03ad'), + ('\u0313', '\u1f10'), ('\u0314', '\u1f11')]), ('\u03b7', &[('\u0300', '\u1f74'), + ('\u0301', '\u03ae'), ('\u0313', '\u1f20'), ('\u0314', '\u1f21'), ('\u0342', '\u1fc6'), + ('\u0345', '\u1fc3')]), ('\u03b9', &[('\u0300', '\u1f76'), ('\u0301', '\u03af'), + ('\u0304', '\u1fd1'), ('\u0306', '\u1fd0'), ('\u0308', '\u03ca'), ('\u0313', '\u1f30'), + ('\u0314', '\u1f31'), ('\u0342', '\u1fd6')]), ('\u03bf', &[('\u0300', '\u1f78'), + ('\u0301', '\u03cc'), ('\u0313', '\u1f40'), ('\u0314', '\u1f41')]), ('\u03c1', + &[('\u0313', '\u1fe4'), ('\u0314', '\u1fe5')]), ('\u03c5', &[('\u0300', '\u1f7a'), + ('\u0301', '\u03cd'), ('\u0304', '\u1fe1'), ('\u0306', '\u1fe0'), ('\u0308', '\u03cb'), + ('\u0313', '\u1f50'), ('\u0314', '\u1f51'), ('\u0342', '\u1fe6')]), ('\u03c9', + &[('\u0300', '\u1f7c'), ('\u0301', '\u03ce'), ('\u0313', '\u1f60'), ('\u0314', + '\u1f61'), ('\u0342', '\u1ff6'), ('\u0345', '\u1ff3')]), ('\u03ca', &[('\u0300', + '\u1fd2'), ('\u0301', '\u0390'), ('\u0342', '\u1fd7')]), ('\u03cb', &[('\u0300', + '\u1fe2'), ('\u0301', '\u03b0'), ('\u0342', '\u1fe7')]), ('\u03ce', &[('\u0345', + '\u1ff4')]), ('\u03d2', &[('\u0301', '\u03d3'), ('\u0308', '\u03d4')]), 
('\u0406', + &[('\u0308', '\u0407')]), ('\u0410', &[('\u0306', '\u04d0'), ('\u0308', '\u04d2')]), + ('\u0413', &[('\u0301', '\u0403')]), ('\u0415', &[('\u0300', '\u0400'), ('\u0306', + '\u04d6'), ('\u0308', '\u0401')]), ('\u0416', &[('\u0306', '\u04c1'), ('\u0308', + '\u04dc')]), ('\u0417', &[('\u0308', '\u04de')]), ('\u0418', &[('\u0300', '\u040d'), + ('\u0304', '\u04e2'), ('\u0306', '\u0419'), ('\u0308', '\u04e4')]), ('\u041a', + &[('\u0301', '\u040c')]), ('\u041e', &[('\u0308', '\u04e6')]), ('\u0423', &[('\u0304', + '\u04ee'), ('\u0306', '\u040e'), ('\u0308', '\u04f0'), ('\u030b', '\u04f2')]), + ('\u0427', &[('\u0308', '\u04f4')]), ('\u042b', &[('\u0308', '\u04f8')]), ('\u042d', + &[('\u0308', '\u04ec')]), ('\u0430', &[('\u0306', '\u04d1'), ('\u0308', '\u04d3')]), + ('\u0433', &[('\u0301', '\u0453')]), ('\u0435', &[('\u0300', '\u0450'), ('\u0306', + '\u04d7'), ('\u0308', '\u0451')]), ('\u0436', &[('\u0306', '\u04c2'), ('\u0308', + '\u04dd')]), ('\u0437', &[('\u0308', '\u04df')]), ('\u0438', &[('\u0300', '\u045d'), + ('\u0304', '\u04e3'), ('\u0306', '\u0439'), ('\u0308', '\u04e5')]), ('\u043a', + &[('\u0301', '\u045c')]), ('\u043e', &[('\u0308', '\u04e7')]), ('\u0443', &[('\u0304', + '\u04ef'), ('\u0306', '\u045e'), ('\u0308', '\u04f1'), ('\u030b', '\u04f3')]), + ('\u0447', &[('\u0308', '\u04f5')]), ('\u044b', &[('\u0308', '\u04f9')]), ('\u044d', + &[('\u0308', '\u04ed')]), ('\u0456', &[('\u0308', '\u0457')]), ('\u0474', &[('\u030f', + '\u0476')]), ('\u0475', &[('\u030f', '\u0477')]), ('\u04d8', &[('\u0308', '\u04da')]), + ('\u04d9', &[('\u0308', '\u04db')]), ('\u04e8', &[('\u0308', '\u04ea')]), ('\u04e9', + &[('\u0308', '\u04eb')]), ('\u0627', &[('\u0653', '\u0622'), ('\u0654', '\u0623'), + ('\u0655', '\u0625')]), ('\u0648', &[('\u0654', '\u0624')]), ('\u064a', &[('\u0654', + '\u0626')]), ('\u06c1', &[('\u0654', '\u06c2')]), ('\u06d2', &[('\u0654', '\u06d3')]), + ('\u06d5', &[('\u0654', '\u06c0')]), ('\u0928', &[('\u093c', '\u0929')]), ('\u0930', + &[('\u093c', 
'\u0931')]), ('\u0933', &[('\u093c', '\u0934')]), ('\u09c7', &[('\u09be', + '\u09cb'), ('\u09d7', '\u09cc')]), ('\u0b47', &[('\u0b3e', '\u0b4b'), ('\u0b56', + '\u0b48'), ('\u0b57', '\u0b4c')]), ('\u0b92', &[('\u0bd7', '\u0b94')]), ('\u0bc6', + &[('\u0bbe', '\u0bca'), ('\u0bd7', '\u0bcc')]), ('\u0bc7', &[('\u0bbe', '\u0bcb')]), + ('\u0c46', &[('\u0c56', '\u0c48')]), ('\u0cbf', &[('\u0cd5', '\u0cc0')]), ('\u0cc6', + &[('\u0cc2', '\u0cca'), ('\u0cd5', '\u0cc7'), ('\u0cd6', '\u0cc8')]), ('\u0cca', + &[('\u0cd5', '\u0ccb')]), ('\u0d46', &[('\u0d3e', '\u0d4a'), ('\u0d57', '\u0d4c')]), + ('\u0d47', &[('\u0d3e', '\u0d4b')]), ('\u0dd9', &[('\u0dca', '\u0dda'), ('\u0dcf', + '\u0ddc'), ('\u0ddf', '\u0dde')]), ('\u0ddc', &[('\u0dca', '\u0ddd')]), ('\u1025', + &[('\u102e', '\u1026')]), ('\u1b05', &[('\u1b35', '\u1b06')]), ('\u1b07', &[('\u1b35', + '\u1b08')]), ('\u1b09', &[('\u1b35', '\u1b0a')]), ('\u1b0b', &[('\u1b35', '\u1b0c')]), + ('\u1b0d', &[('\u1b35', '\u1b0e')]), ('\u1b11', &[('\u1b35', '\u1b12')]), ('\u1b3a', + &[('\u1b35', '\u1b3b')]), ('\u1b3c', &[('\u1b35', '\u1b3d')]), ('\u1b3e', &[('\u1b35', + '\u1b40')]), ('\u1b3f', &[('\u1b35', '\u1b41')]), ('\u1b42', &[('\u1b35', '\u1b43')]), + ('\u1e36', &[('\u0304', '\u1e38')]), ('\u1e37', &[('\u0304', '\u1e39')]), ('\u1e5a', + &[('\u0304', '\u1e5c')]), ('\u1e5b', &[('\u0304', '\u1e5d')]), ('\u1e62', &[('\u0307', + '\u1e68')]), ('\u1e63', &[('\u0307', '\u1e69')]), ('\u1ea0', &[('\u0302', '\u1eac'), + ('\u0306', '\u1eb6')]), ('\u1ea1', &[('\u0302', '\u1ead'), ('\u0306', '\u1eb7')]), + ('\u1eb8', &[('\u0302', '\u1ec6')]), ('\u1eb9', &[('\u0302', '\u1ec7')]), ('\u1ecc', + &[('\u0302', '\u1ed8')]), ('\u1ecd', &[('\u0302', '\u1ed9')]), ('\u1f00', &[('\u0300', + '\u1f02'), ('\u0301', '\u1f04'), ('\u0342', '\u1f06'), ('\u0345', '\u1f80')]), + ('\u1f01', &[('\u0300', '\u1f03'), ('\u0301', '\u1f05'), ('\u0342', '\u1f07'), + ('\u0345', '\u1f81')]), ('\u1f02', &[('\u0345', '\u1f82')]), ('\u1f03', &[('\u0345', + '\u1f83')]), ('\u1f04', 
&[('\u0345', '\u1f84')]), ('\u1f05', &[('\u0345', '\u1f85')]), + ('\u1f06', &[('\u0345', '\u1f86')]), ('\u1f07', &[('\u0345', '\u1f87')]), ('\u1f08', + &[('\u0300', '\u1f0a'), ('\u0301', '\u1f0c'), ('\u0342', '\u1f0e'), ('\u0345', + '\u1f88')]), ('\u1f09', &[('\u0300', '\u1f0b'), ('\u0301', '\u1f0d'), ('\u0342', + '\u1f0f'), ('\u0345', '\u1f89')]), ('\u1f0a', &[('\u0345', '\u1f8a')]), ('\u1f0b', + &[('\u0345', '\u1f8b')]), ('\u1f0c', &[('\u0345', '\u1f8c')]), ('\u1f0d', &[('\u0345', + '\u1f8d')]), ('\u1f0e', &[('\u0345', '\u1f8e')]), ('\u1f0f', &[('\u0345', '\u1f8f')]), + ('\u1f10', &[('\u0300', '\u1f12'), ('\u0301', '\u1f14')]), ('\u1f11', &[('\u0300', + '\u1f13'), ('\u0301', '\u1f15')]), ('\u1f18', &[('\u0300', '\u1f1a'), ('\u0301', + '\u1f1c')]), ('\u1f19', &[('\u0300', '\u1f1b'), ('\u0301', '\u1f1d')]), ('\u1f20', + &[('\u0300', '\u1f22'), ('\u0301', '\u1f24'), ('\u0342', '\u1f26'), ('\u0345', + '\u1f90')]), ('\u1f21', &[('\u0300', '\u1f23'), ('\u0301', '\u1f25'), ('\u0342', + '\u1f27'), ('\u0345', '\u1f91')]), ('\u1f22', &[('\u0345', '\u1f92')]), ('\u1f23', + &[('\u0345', '\u1f93')]), ('\u1f24', &[('\u0345', '\u1f94')]), ('\u1f25', &[('\u0345', + '\u1f95')]), ('\u1f26', &[('\u0345', '\u1f96')]), ('\u1f27', &[('\u0345', '\u1f97')]), + ('\u1f28', &[('\u0300', '\u1f2a'), ('\u0301', '\u1f2c'), ('\u0342', '\u1f2e'), + ('\u0345', '\u1f98')]), ('\u1f29', &[('\u0300', '\u1f2b'), ('\u0301', '\u1f2d'), + ('\u0342', '\u1f2f'), ('\u0345', '\u1f99')]), ('\u1f2a', &[('\u0345', '\u1f9a')]), + ('\u1f2b', &[('\u0345', '\u1f9b')]), ('\u1f2c', &[('\u0345', '\u1f9c')]), ('\u1f2d', + &[('\u0345', '\u1f9d')]), ('\u1f2e', &[('\u0345', '\u1f9e')]), ('\u1f2f', &[('\u0345', + '\u1f9f')]), ('\u1f30', &[('\u0300', '\u1f32'), ('\u0301', '\u1f34'), ('\u0342', + '\u1f36')]), ('\u1f31', &[('\u0300', '\u1f33'), ('\u0301', '\u1f35'), ('\u0342', + '\u1f37')]), ('\u1f38', &[('\u0300', '\u1f3a'), ('\u0301', '\u1f3c'), ('\u0342', + '\u1f3e')]), ('\u1f39', &[('\u0300', '\u1f3b'), ('\u0301', 
'\u1f3d'), ('\u0342', + '\u1f3f')]), ('\u1f40', &[('\u0300', '\u1f42'), ('\u0301', '\u1f44')]), ('\u1f41', + &[('\u0300', '\u1f43'), ('\u0301', '\u1f45')]), ('\u1f48', &[('\u0300', '\u1f4a'), + ('\u0301', '\u1f4c')]), ('\u1f49', &[('\u0300', '\u1f4b'), ('\u0301', '\u1f4d')]), + ('\u1f50', &[('\u0300', '\u1f52'), ('\u0301', '\u1f54'), ('\u0342', '\u1f56')]), + ('\u1f51', &[('\u0300', '\u1f53'), ('\u0301', '\u1f55'), ('\u0342', '\u1f57')]), + ('\u1f59', &[('\u0300', '\u1f5b'), ('\u0301', '\u1f5d'), ('\u0342', '\u1f5f')]), + ('\u1f60', &[('\u0300', '\u1f62'), ('\u0301', '\u1f64'), ('\u0342', '\u1f66'), + ('\u0345', '\u1fa0')]), ('\u1f61', &[('\u0300', '\u1f63'), ('\u0301', '\u1f65'), + ('\u0342', '\u1f67'), ('\u0345', '\u1fa1')]), ('\u1f62', &[('\u0345', '\u1fa2')]), + ('\u1f63', &[('\u0345', '\u1fa3')]), ('\u1f64', &[('\u0345', '\u1fa4')]), ('\u1f65', + &[('\u0345', '\u1fa5')]), ('\u1f66', &[('\u0345', '\u1fa6')]), ('\u1f67', &[('\u0345', + '\u1fa7')]), ('\u1f68', &[('\u0300', '\u1f6a'), ('\u0301', '\u1f6c'), ('\u0342', + '\u1f6e'), ('\u0345', '\u1fa8')]), ('\u1f69', &[('\u0300', '\u1f6b'), ('\u0301', + '\u1f6d'), ('\u0342', '\u1f6f'), ('\u0345', '\u1fa9')]), ('\u1f6a', &[('\u0345', + '\u1faa')]), ('\u1f6b', &[('\u0345', '\u1fab')]), ('\u1f6c', &[('\u0345', '\u1fac')]), + ('\u1f6d', &[('\u0345', '\u1fad')]), ('\u1f6e', &[('\u0345', '\u1fae')]), ('\u1f6f', + &[('\u0345', '\u1faf')]), ('\u1f70', &[('\u0345', '\u1fb2')]), ('\u1f74', &[('\u0345', + '\u1fc2')]), ('\u1f7c', &[('\u0345', '\u1ff2')]), ('\u1fb6', &[('\u0345', '\u1fb7')]), + ('\u1fbf', &[('\u0300', '\u1fcd'), ('\u0301', '\u1fce'), ('\u0342', '\u1fcf')]), + ('\u1fc6', &[('\u0345', '\u1fc7')]), ('\u1ff6', &[('\u0345', '\u1ff7')]), ('\u1ffe', + &[('\u0300', '\u1fdd'), ('\u0301', '\u1fde'), ('\u0342', '\u1fdf')]), ('\u2190', + &[('\u0338', '\u219a')]), ('\u2192', &[('\u0338', '\u219b')]), ('\u2194', &[('\u0338', + '\u21ae')]), ('\u21d0', &[('\u0338', '\u21cd')]), ('\u21d2', &[('\u0338', '\u21cf')]), + ('\u21d4', 
&[('\u0338', '\u21ce')]), ('\u2203', &[('\u0338', '\u2204')]), ('\u2208', + &[('\u0338', '\u2209')]), ('\u220b', &[('\u0338', '\u220c')]), ('\u2223', &[('\u0338', + '\u2224')]), ('\u2225', &[('\u0338', '\u2226')]), ('\u223c', &[('\u0338', '\u2241')]), + ('\u2243', &[('\u0338', '\u2244')]), ('\u2245', &[('\u0338', '\u2247')]), ('\u2248', + &[('\u0338', '\u2249')]), ('\u224d', &[('\u0338', '\u226d')]), ('\u2261', &[('\u0338', + '\u2262')]), ('\u2264', &[('\u0338', '\u2270')]), ('\u2265', &[('\u0338', '\u2271')]), + ('\u2272', &[('\u0338', '\u2274')]), ('\u2273', &[('\u0338', '\u2275')]), ('\u2276', + &[('\u0338', '\u2278')]), ('\u2277', &[('\u0338', '\u2279')]), ('\u227a', &[('\u0338', + '\u2280')]), ('\u227b', &[('\u0338', '\u2281')]), ('\u227c', &[('\u0338', '\u22e0')]), + ('\u227d', &[('\u0338', '\u22e1')]), ('\u2282', &[('\u0338', '\u2284')]), ('\u2283', + &[('\u0338', '\u2285')]), ('\u2286', &[('\u0338', '\u2288')]), ('\u2287', &[('\u0338', + '\u2289')]), ('\u2291', &[('\u0338', '\u22e2')]), ('\u2292', &[('\u0338', '\u22e3')]), + ('\u22a2', &[('\u0338', '\u22ac')]), ('\u22a8', &[('\u0338', '\u22ad')]), ('\u22a9', + &[('\u0338', '\u22ae')]), ('\u22ab', &[('\u0338', '\u22af')]), ('\u22b2', &[('\u0338', + '\u22ea')]), ('\u22b3', &[('\u0338', '\u22eb')]), ('\u22b4', &[('\u0338', '\u22ec')]), + ('\u22b5', &[('\u0338', '\u22ed')]), ('\u3046', &[('\u3099', '\u3094')]), ('\u304b', + &[('\u3099', '\u304c')]), ('\u304d', &[('\u3099', '\u304e')]), ('\u304f', &[('\u3099', + '\u3050')]), ('\u3051', &[('\u3099', '\u3052')]), ('\u3053', &[('\u3099', '\u3054')]), + ('\u3055', &[('\u3099', '\u3056')]), ('\u3057', &[('\u3099', '\u3058')]), ('\u3059', + &[('\u3099', '\u305a')]), ('\u305b', &[('\u3099', '\u305c')]), ('\u305d', &[('\u3099', + '\u305e')]), ('\u305f', &[('\u3099', '\u3060')]), ('\u3061', &[('\u3099', '\u3062')]), + ('\u3064', &[('\u3099', '\u3065')]), ('\u3066', &[('\u3099', '\u3067')]), ('\u3068', + &[('\u3099', '\u3069')]), ('\u306f', &[('\u3099', '\u3070'), 
('\u309a', '\u3071')]), + ('\u3072', &[('\u3099', '\u3073'), ('\u309a', '\u3074')]), ('\u3075', &[('\u3099', + '\u3076'), ('\u309a', '\u3077')]), ('\u3078', &[('\u3099', '\u3079'), ('\u309a', + '\u307a')]), ('\u307b', &[('\u3099', '\u307c'), ('\u309a', '\u307d')]), ('\u309d', + &[('\u3099', '\u309e')]), ('\u30a6', &[('\u3099', '\u30f4')]), ('\u30ab', &[('\u3099', + '\u30ac')]), ('\u30ad', &[('\u3099', '\u30ae')]), ('\u30af', &[('\u3099', '\u30b0')]), + ('\u30b1', &[('\u3099', '\u30b2')]), ('\u30b3', &[('\u3099', '\u30b4')]), ('\u30b5', + &[('\u3099', '\u30b6')]), ('\u30b7', &[('\u3099', '\u30b8')]), ('\u30b9', &[('\u3099', + '\u30ba')]), ('\u30bb', &[('\u3099', '\u30bc')]), ('\u30bd', &[('\u3099', '\u30be')]), + ('\u30bf', &[('\u3099', '\u30c0')]), ('\u30c1', &[('\u3099', '\u30c2')]), ('\u30c4', + &[('\u3099', '\u30c5')]), ('\u30c6', &[('\u3099', '\u30c7')]), ('\u30c8', &[('\u3099', + '\u30c9')]), ('\u30cf', &[('\u3099', '\u30d0'), ('\u309a', '\u30d1')]), ('\u30d2', + &[('\u3099', '\u30d3'), ('\u309a', '\u30d4')]), ('\u30d5', &[('\u3099', '\u30d6'), + ('\u309a', '\u30d7')]), ('\u30d8', &[('\u3099', '\u30d9'), ('\u309a', '\u30da')]), + ('\u30db', &[('\u3099', '\u30dc'), ('\u309a', '\u30dd')]), ('\u30ef', &[('\u3099', + '\u30f7')]), ('\u30f0', &[('\u3099', '\u30f8')]), ('\u30f1', &[('\u3099', '\u30f9')]), + ('\u30f2', &[('\u3099', '\u30fa')]), ('\u30fd', &[('\u3099', '\u30fe')]), ('\U00011099', + &[('\U000110ba', '\U0001109a')]), ('\U0001109b', &[('\U000110ba', '\U0001109c')]), + ('\U000110a5', &[('\U000110ba', '\U000110ab')]), ('\U00011131', &[('\U00011127', + '\U0001112e')]), ('\U00011132', &[('\U00011127', '\U0001112f')]) + ]; + static combining_class_table : &'static [(char, char, u8)] = &[ ('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232), ('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232), @@ -3629,6 +3942,28 @@ pub mod normalization { pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } + pub fn compose(a: char, b: char) -> Option { + use 
cmp::{Equal, Less, Greater};
+        compose_hangul(a, b).or_else(|| {
+            match bsearch_table(a, composition_table) {
+                None => None,
+                Some(candidates) => {
+                    match candidates.bsearch(|&(val, _)| {
+                        if b == val { Equal }
+                        else if val < b { Less }
+                        else { Greater }
+                    }) {
+                        Some(idx) => {
+                            let (_, result) = candidates[idx];
+                            Some(result)
+                        }
+                        None => None
+                    }
+                }
+            }
+        })
+    }
+
     pub fn canonical_combining_class(c: char) -> u8 {
         bsearch_range_value_table(c, combining_class_table)
     }
@@ -3686,6 +4021,7 @@ pub mod normalization {
     static S_COUNT: u32 = (L_COUNT * N_COUNT);
 
     // Decompose a precomposed Hangul syllable
+    #[inline(always)]
     fn decompose_hangul(s: char, f: |char|) {
         use cast::transmute;
 
@@ -3704,6 +4040,37 @@ pub mod normalization {
             }
         }
     }
+
+    // Compose a pair of Hangul Jamo into a precomposed syllable, following the
+    // composition rules of Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior.
+    //
+    // `a` is the first character of the pair and `b` the one that follows it.
+    // Returns `None` when the pair does not form a Hangul syllable.
+    #[inline(always)]
+    fn compose_hangul(a: char, b: char) -> Option<char> {
+        use cast::transmute;
+        let l = a as u32;
+        let v = b as u32;
+        // Compose an LPart and a VPart
+        if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
+            let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
+            // r lies in [S_BASE, S_BASE + S_COUNT), so it is a valid char
+            unsafe { return Some(transmute(r)); }
+        }
+        // Compose an LVPart and a TPart.
+        // Only an LV syllable may take a trailing consonant; those are the syllables at
+        // a multiple of T_COUNT above S_BASE. Without the modulo test an LVT syllable
+        // followed by a TPart would be merged into an unrelated syllable.
+        // T_BASE itself is excluded: index 0 stands for "no trailing consonant", so
+        // accepting it would swallow `b` and return `a` unchanged.
+        if S_BASE <= l && l < (S_BASE + S_COUNT) && (l - S_BASE) % T_COUNT == 0
+           && T_BASE < v && v < (T_BASE + T_COUNT) {
+            let r = l + (v - T_BASE);
+            // r stays inside the T_COUNT-wide slot of the LV syllable, so it is a valid char
+            unsafe { return Some(transmute(r)); }
+        }
+        None
+    }
 }
 
 pub mod derived_property {