Skip to content

Move static code outside of unicode.py. #47088

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions src/libstd_unicode/bool_trie.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
/// implemented with postfix compression (sharing of identical child nodes),
/// which gives both compact size and fast lookup.
///
/// The space of Unicode codepoints is divided into 3 subareas, each
/// represented by a trie with different depth. In the first (0..0x800), there
/// is no trie structure at all; each u64 entry corresponds to a bitvector
/// effectively holding 64 bool values.
///
/// In the second (0x800..0x10000), each child of the root node represents a
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
/// the trie stores an 8-bit index into a shared table of leaf values. This
/// exploits the fact that in reasonable sets, many such leaves can be shared.
///
/// In the third (0x10000..0x110000), each child of the root node represents a
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
/// of a child tree. Each of these 64 bytes represents an index into the table
/// of shared 64-bit leaf values. This exploits the sparse structure in the
/// non-BMP range of most Unicode sets.
///
/// In every subarea the final step of `lookup` is the same: bit `c & 63` of
/// the selected 64-bit leaf tells whether codepoint `c` is in the set.
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
/// Leaf bitmaps, indexed directly by `c >> 6` (32 entries * 64 bits = 0x800).
pub r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
/// Indexed by `(c >> 6) - 0x20`; each entry is an index into `r3`.
/// 992 entries * 64 codepoints cover exactly 0x800..0x10000.
pub r2: [u8; 992], // first level
/// Shared leaf bitmaps selected through `r2`.
pub r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
/// Indexed by `(c >> 12) - 0x10`; each entry selects a 64-entry slice of `r5`.
/// 256 entries * 4096 codepoints cover exactly 0x10000..0x110000.
pub r4: [u8; 256], // first level
/// Indexed by `(child << 6) + ((c >> 6) & 0x3f)` where `child` comes from
/// `r4`; each entry is an index into `r6`.
pub r5: &'static [u8], // second level
/// Shared leaf bitmaps selected through `r5`.
pub r6: &'static [u64], // leaves
}
impl BoolTrie {
    /// Reports whether the codepoint `c` is a member of the set encoded by
    /// this trie.
    ///
    /// The 64-bit leaf covering `c` is located first (directly in `r1`, via
    /// `r2` -> `r3`, or via `r4` -> `r5` -> `r6`, depending on the magnitude
    /// of `c`), and the bit for `c` is then tested in that leaf.
    ///
    /// # Panics
    ///
    /// Panics if an index stored in the tables points outside the slice it
    /// refers to.
    pub fn lookup(&self, c: char) -> bool {
        let cp = c as usize;
        // Index of the 64-codepoint chunk that contains `cp`.
        let chunk_idx = cp >> 6;

        let leaf = if cp >= 0x10000 {
            // Two levels of indirection: 4096-wide block, then 64-wide chunk.
            let block = self.r4[(cp >> 12) - 0x10] as usize;
            let slot = self.r5[(block << 6) + (chunk_idx & 0x3f)] as usize;
            self.r6[slot]
        } else if cp >= 0x800 {
            // One level of indirection into the shared leaf table.
            let slot = self.r2[chunk_idx - 0x20] as usize;
            self.r3[slot]
        } else {
            // Leaves are stored inline for the lowest range.
            self.r1[chunk_idx]
        };

        trie_range_leaf(cp, leaf)
    }
}

/// A two-level bitset over codepoints, used for small sets.
///
/// `lookup` maps a codepoint `c` to chunk `c >> 6`, reads that chunk's entry
/// in `r1` to obtain an index into `r2`, and tests bit `c & 63` of the
/// selected 64-bit leaf. Codepoints whose chunk lies past the end of `r1` are
/// reported as not in the set.
pub struct SmallBoolTrie {
/// One entry per 64-codepoint chunk; each value is an index into `r2`.
pub(crate) r1: &'static [u8], // first level
/// Shared 64-bit leaf bitmaps selected through `r1`.
pub(crate) r2: &'static [u64], // leaves
}

impl SmallBoolTrie {
    /// Reports whether the codepoint `c` is a member of the set encoded by
    /// this trie.
    ///
    /// Codepoints beyond the range covered by `r1` are never members.
    ///
    /// # Panics
    ///
    /// Panics if an entry of `r1` is not a valid index into `r2`.
    pub fn lookup(&self, c: char) -> bool {
        let cp = c as usize;
        if let Some(&slot) = self.r1.get(cp >> 6) {
            trie_range_leaf(cp, self.r2[slot as usize])
        } else {
            // The chunk is not covered by the first-level table.
            false
        }
    }
}

/// Tests the bit for codepoint `c` inside a 64-bit leaf bitmap.
///
/// Only the low six bits of `c` are used, so the caller is responsible for
/// having chosen the leaf that covers `c`.
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
    let mask = 1u64 << (c & 63);
    (bitmap_chunk & mask) != 0
}
4 changes: 3 additions & 1 deletion src/libstd_unicode/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ pub use core::char::CharTryFromError;
#[unstable(feature = "decode_utf8", issue = "33906")]
pub use core::char::{DecodeUtf8, decode_utf8};
#[unstable(feature = "unicode", issue = "27783")]
pub use tables::{UnicodeVersion, UNICODE_VERSION};
pub use tables::{UNICODE_VERSION};
#[unstable(feature = "unicode", issue = "27783")]
pub use version::UnicodeVersion;

/// Returns an iterator that yields the lowercase equivalent of a `char`.
///
Expand Down
5 changes: 4 additions & 1 deletion src/libstd_unicode/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,18 @@
#![feature(core_char_ext)]
#![feature(str_internals)]
#![feature(decode_utf8)]
#![feature(fused)]
#![feature(fn_traits)]
#![feature(fused)]
#![feature(lang_items)]
#![feature(non_exhaustive)]
#![feature(staged_api)]
#![feature(try_from)]
#![feature(unboxed_closures)]

mod bool_trie;
mod tables;
mod u_str;
mod version;
pub mod char;
pub mod lossy;

Expand Down
131 changes: 24 additions & 107 deletions src/libstd_unicode/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,8 @@

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]

/// Represents a Unicode Version.
///
/// Values order by `major`, then `minor`, then `micro`, because the derived
/// comparison traits compare fields in declaration order.
///
/// See also: <http://www.unicode.org/versions/>
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct UnicodeVersion {
/// Major version.
pub major: u32,

/// Minor version.
pub minor: u32,

/// Micro (or Update) version.
pub micro: u32,

// Private field to keep struct expandable: because it is not `pub`, code
// outside this module cannot build the struct with a literal, so further
// fields can be added later without breaking such code.
_priv: (),
}
use version::UnicodeVersion;
use bool_trie::{BoolTrie, SmallBoolTrie};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[00:04:03] error: unused import: `BoolTrie`
[00:04:03]   --> /checkout/src/libstd_unicode/tables.rs:16:17
[00:04:03]    |
[00:04:03] 16 | use bool_trie::{BoolTrie, SmallBoolTrie};
[00:04:03]    |                 ^^^^^^^^
[00:04:03]    |


/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
Expand All @@ -38,76 +23,8 @@ pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
micro: 0,
_priv: (),
};


/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
/// implemented with postfix compression (sharing of identical child nodes),
/// which gives both compact size and fast lookup.
///
/// The space of Unicode codepoints is divided into 3 subareas, each
/// represented by a trie with different depth. In the first (0..0x800), there
/// is no trie structure at all; each u64 entry corresponds to a bitvector
/// effectively holding 64 bool values.
///
/// In the second (0x800..0x10000), each child of the root node represents a
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
/// the trie stores an 8-bit index into a shared table of leaf values. This
/// exploits the fact that in reasonable sets, many such leaves can be shared.
///
/// In the third (0x10000..0x110000), each child of the root node represents a
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
/// of a child tree. Each of these 64 bytes represents an index into the table
/// of shared 64-bit leaf values. This exploits the sparse structure in the
/// non-BMP range of most Unicode sets.
///
/// Queries go through `trie_lookup_range_table`, which walks the tables below
/// and finally tests bit `c & 63` of the selected leaf.
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
// Indexed directly by `c >> 6`.
r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
// `r2` is indexed by `(c >> 6) - 0x20` and yields an index into `r3`.
r2: [u8; 992], // first level
r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
// `r4` is indexed by `(c >> 12) - 0x10` and yields `child`; `r5` is indexed
// by `(child << 6) + ((c >> 6) & 0x3f)` and yields an index into `r6`.
r4: [u8; 256], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}

// Returns whether the bit belonging to codepoint `c` is set in the 64-bit
// leaf `bitmap_chunk`. Only `c` modulo 64 matters here; picking the right
// leaf is the caller's job.
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
    let bit_pos = c % 64;
    ((bitmap_chunk >> bit_pos) & 1) == 1
}

// Reports whether codepoint `c` is in the set encoded by the trie `r`.
//
// The range of `c` decides how many table levels are walked before the leaf
// bitmap is reached: none below 0x800, one below 0x10000, two otherwise.
// Panics if a stored index points outside the table it refers to.
fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
    let cp = c as usize;

    if cp < 0x800 {
        return trie_range_leaf(cp, r.r1[cp >> 6]);
    }

    if cp < 0x10000 {
        let leaf_idx = r.r2[(cp >> 6) - 0x20] as usize;
        return trie_range_leaf(cp, r.r3[leaf_idx]);
    }

    // Supplementary planes: 4096-wide block first, then 64-wide chunk.
    let block = r.r4[(cp >> 12) - 0x10] as usize;
    let leaf_idx = r.r5[(block << 6) + ((cp >> 6) & 0x3f)] as usize;
    trie_range_leaf(cp, r.r6[leaf_idx])
}

// A two-level bitset over codepoints. `lookup` reads the entry of `r1` for
// chunk `c >> 6`, uses it as an index into `r2`, and tests bit `c & 63` of
// that leaf; chunks past the end of `r1` are treated as empty.
pub struct SmallBoolTrie {
r1: &'static [u8], // first level
r2: &'static [u64], // leaves
}

impl SmallBoolTrie {
    // Reports whether codepoint `c` is in the set. Chunks that fall outside
    // `r1` yield `false`; an `r1` entry that is not a valid `r2` index panics.
    fn lookup(&self, c: char) -> bool {
        let cp = c as usize;
        self.r1
            .get(cp >> 6)
            .map_or(false, |&slot| trie_range_leaf(cp, self.r2[slot as usize]))
    }
}

pub mod general_category {
pub const Cc_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
pub const Cc_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 0
],
Expand All @@ -120,7 +37,7 @@ pub mod general_category {
Cc_table.lookup(c)
}

pub const N_table: &'static super::BoolTrie = &super::BoolTrie {
pub const N_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x03ff000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
Expand Down Expand Up @@ -212,13 +129,13 @@ pub mod general_category {
};

pub fn N(c: char) -> bool {
super::trie_lookup_range_table(c, N_table)
N_table.lookup(c)
}

}

pub mod derived_property {
pub const Alphabetic_table: &'static super::BoolTrie = &super::BoolTrie {
pub const Alphabetic_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
Expand Down Expand Up @@ -397,10 +314,10 @@ pub mod derived_property {
};

pub fn Alphabetic(c: char) -> bool {
super::trie_lookup_range_table(c, Alphabetic_table)
Alphabetic_table.lookup(c)
}

pub const Case_Ignorable_table: &'static super::BoolTrie = &super::BoolTrie {
pub const Case_Ignorable_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0400408000000000, 0x0000000140000000, 0x0190a10000000000, 0x0000000000000000,
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
Expand Down Expand Up @@ -529,10 +446,10 @@ pub mod derived_property {
};

pub fn Case_Ignorable(c: char) -> bool {
super::trie_lookup_range_table(c, Case_Ignorable_table)
Case_Ignorable_table.lookup(c)
}

pub const Cased_table: &'static super::BoolTrie = &super::BoolTrie {
pub const Cased_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xf7ffffffffffffff, 0xfffffffffffffff0,
Expand Down Expand Up @@ -628,10 +545,10 @@ pub mod derived_property {
};

pub fn Cased(c: char) -> bool {
super::trie_lookup_range_table(c, Cased_table)
Cased_table.lookup(c)
}

pub const Lowercase_table: &'static super::BoolTrie = &super::BoolTrie {
pub const Lowercase_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe00000000, 0x0420040000000000, 0xff7fffff80000000,
0x55aaaaaaaaaaaaaa, 0xd4aaaaaaaaaaab55, 0xe6512d2a4e243129, 0xaa29aaaab5555240,
Expand Down Expand Up @@ -725,10 +642,10 @@ pub mod derived_property {
};

pub fn Lowercase(c: char) -> bool {
super::trie_lookup_range_table(c, Lowercase_table)
Lowercase_table.lookup(c)
}

pub const Uppercase_table: &'static super::BoolTrie = &super::BoolTrie {
pub const Uppercase_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x0000000007fffffe, 0x0000000000000000, 0x000000007f7fffff,
0xaa55555555555555, 0x2b555555555554aa, 0x11aed2d5b1dbced6, 0x55d255554aaaa490,
Expand Down Expand Up @@ -823,10 +740,10 @@ pub mod derived_property {
};

pub fn Uppercase(c: char) -> bool {
super::trie_lookup_range_table(c, Uppercase_table)
Uppercase_table.lookup(c)
}

pub const XID_Continue_table: &'static super::BoolTrie = &super::BoolTrie {
pub const XID_Continue_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x03ff000000000000, 0x07fffffe87fffffe, 0x04a0040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
Expand Down Expand Up @@ -998,10 +915,10 @@ pub mod derived_property {
};

pub fn XID_Continue(c: char) -> bool {
super::trie_lookup_range_table(c, XID_Continue_table)
XID_Continue_table.lookup(c)
}

pub const XID_Start_table: &'static super::BoolTrie = &super::BoolTrie {
pub const XID_Start_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
Expand Down Expand Up @@ -1175,13 +1092,13 @@ pub mod derived_property {
};

pub fn XID_Start(c: char) -> bool {
super::trie_lookup_range_table(c, XID_Start_table)
XID_Start_table.lookup(c)
}

}

pub mod property {
pub const Pattern_White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
pub const Pattern_White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand All @@ -1198,7 +1115,7 @@ pub mod property {
Pattern_White_Space_table.lookup(c)
}

pub const White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
pub const White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand Down Expand Up @@ -1238,11 +1155,11 @@ pub mod conversions {
}
}

fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
}

const to_lowercase_table: &'static [(char, [char; 3])] = &[
const to_lowercase_table: &[(char, [char; 3])] = &[
('\u{41}', ['\u{61}', '\0', '\0']), ('\u{42}', ['\u{62}', '\0', '\0']), ('\u{43}',
['\u{63}', '\0', '\0']), ('\u{44}', ['\u{64}', '\0', '\0']), ('\u{45}', ['\u{65}', '\0',
'\0']), ('\u{46}', ['\u{66}', '\0', '\0']), ('\u{47}', ['\u{67}', '\0', '\0']), ('\u{48}',
Expand Down Expand Up @@ -1826,7 +1743,7 @@ pub mod conversions {
('\u{1e920}', ['\u{1e942}', '\0', '\0']), ('\u{1e921}', ['\u{1e943}', '\0', '\0'])
];

const to_uppercase_table: &'static [(char, [char; 3])] = &[
const to_uppercase_table: &[(char, [char; 3])] = &[
('\u{61}', ['\u{41}', '\0', '\0']), ('\u{62}', ['\u{42}', '\0', '\0']), ('\u{63}',
['\u{43}', '\0', '\0']), ('\u{64}', ['\u{44}', '\0', '\0']), ('\u{65}', ['\u{45}', '\0',
'\0']), ('\u{66}', ['\u{46}', '\0', '\0']), ('\u{67}', ['\u{47}', '\0', '\0']), ('\u{68}',
Expand Down
Loading