8
8
// option. This file may not be copied, modified, or distributed
9
9
// except according to those terms.
10
10
11
- //! Unicode characters manipulation (`char` type)
11
+ //! Character manipulation (`char` type, Unicode Scalar Value)
12
+ //!
13
+ //! This module provides the `Char` trait, as well as its implementation
14
+ //! for the primitive `char` type, in order to allow basic character manipulation.
15
+ //!
16
+ //! A `char` actually represents a
17
+ //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
18
+ //! as it can contain any Unicode code point except high-surrogate and
19
+ //! low-surrogate code points.
20
+ //!
21
+ //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
22
+ //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
23
+ //! however the converse is not always true due to the above range limits
24
+ //! and, as such, should be performed via the `from_u32` function.
25
+
12
26
13
27
use cast:: transmute;
14
28
use option:: { None , Option , Some } ;
@@ -66,7 +80,7 @@ static TAG_FOUR_B: uint = 240u;
66
80
/// The highest valid code point
67
81
pub static MAX : char = ' \U 0010 ffff' ;
68
82
69
- /// Convert from `u32` to a character .
83
+ /// Convert from `u32` to a `char`.
70
84
#[ inline]
71
85
pub fn from_u32 ( i : u32 ) -> Option < char > {
72
86
// catch out-of-bounds and surrogates
@@ -77,30 +91,30 @@ pub fn from_u32(i: u32) -> Option<char> {
77
91
}
78
92
}
79
93
80
- /// Returns whether the specified character is considered a unicode alphabetic
81
- /// character
94
+ /// Returns whether the specified `char` is considered a Unicode alphabetic
95
+ /// scalar value
82
96
pub fn is_alphabetic(c: char) -> bool {
    // Pure delegation to the derived-property lookup for 'Alphabetic'.
    derived_property::Alphabetic(c)
}
83
97
#[ allow( missing_doc) ]
84
98
/// Indicates whether `c` is accepted by `derived_property::XID_Start`
/// (presumably the Unicode derived property 'XID_Start' -- confirm against
/// the property tables).
pub fn is_XID_start(c: char) -> bool {
    derived_property::XID_Start(c)
}
85
99
#[ allow( missing_doc) ]
86
100
/// Indicates whether `c` is accepted by `derived_property::XID_Continue`
/// (presumably the Unicode derived property 'XID_Continue' -- confirm
/// against the property tables).
pub fn is_XID_continue(c: char) -> bool {
    derived_property::XID_Continue(c)
}
87
101
88
102
///
89
- /// Indicates whether a character is in lower case, defined
103
+ /// Indicates whether a `char` is in lower case, defined
90
104
/// in terms of the Unicode Derived Core Property 'Lowercase'.
91
105
///
92
106
#[ inline]
93
107
pub fn is_lowercase(c: char) -> bool {
    // Pure delegation to the derived-property lookup for 'Lowercase'.
    derived_property::Lowercase(c)
}
94
108
95
109
///
96
- /// Indicates whether a character is in upper case, defined
110
+ /// Indicates whether a `char` is in upper case, defined
97
111
/// in terms of the Unicode Derived Core Property 'Uppercase'.
98
112
///
99
113
#[ inline]
100
114
pub fn is_uppercase(c: char) -> bool {
    // Pure delegation to the derived-property lookup for 'Uppercase'.
    derived_property::Uppercase(c)
}
101
115
102
116
///
103
- /// Indicates whether a character is whitespace. Whitespace is defined in
117
+ /// Indicates whether a `char` is whitespace. Whitespace is defined in
104
118
/// terms of the Unicode Property 'White_Space'.
105
119
///
106
120
#[ inline]
@@ -112,7 +126,7 @@ pub fn is_whitespace(c: char) -> bool {
112
126
}
113
127
114
128
///
115
- /// Indicates whether a character is alphanumeric. Alphanumericness is
129
+ /// Indicates whether a `char` is alphanumeric. Alphanumericness is
116
130
/// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
117
131
/// and the Derived Core Property 'Alphabetic'.
118
132
///
@@ -125,14 +139,14 @@ pub fn is_alphanumeric(c: char) -> bool {
125
139
}
126
140
127
141
///
128
- /// Indicates whether a character is a control character . Control
129
- /// characters are defined in terms of the Unicode General Category
142
+ /// Indicates whether a `char` is a control code point. Control
143
+ /// code points are defined in terms of the Unicode General Category
130
144
/// 'Cc'.
131
145
///
132
146
#[ inline]
133
147
pub fn is_control(c: char) -> bool {
    // Pure delegation to the general-category lookup for 'Cc'.
    general_category::Cc(c)
}
134
148
135
- /// Indicates whether the character is numeric (Nd, Nl, or No)
149
+ /// Indicates whether the `char` is numeric (Nd, Nl, or No)
136
150
#[ inline]
137
151
pub fn is_digit ( c : char ) -> bool {
138
152
general_category:: Nd ( c)
@@ -141,7 +155,7 @@ pub fn is_digit(c: char) -> bool {
141
155
}
142
156
143
157
///
144
- /// Checks if a character parses as a numeric digit in the given radix.
158
+ /// Checks if a `char` parses as a numeric digit in the given radix.
145
159
/// Compared to `is_digit()`, this function only recognizes the
146
160
/// characters `0-9`, `a-z` and `A-Z`.
147
161
///
@@ -167,13 +181,13 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool {
167
181
}
168
182
169
183
///
170
- /// Convert a char to the corresponding digit.
184
+ /// Convert a `char` to the corresponding digit.
171
185
///
172
186
/// # Return value
173
187
///
174
188
/// If `c` is between '0' and '9', the corresponding value
175
189
/// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
176
- /// 'b' or 'B', 11, etc. Returns none if the char does not
190
+ /// 'b' or 'B', 11, etc. Returns `None` if the `char` does not
177
191
/// refer to a digit in the given radix.
178
192
///
179
193
/// # Failure
@@ -273,7 +287,7 @@ pub fn decompose_compatible(c: char, f: |char|) {
273
287
}
274
288
275
289
///
276
- /// Return the hexadecimal unicode escape of a char.
290
+ /// Return the hexadecimal Unicode escape of a `char`.
277
291
///
278
292
/// The rules are as follows:
279
293
///
@@ -301,7 +315,7 @@ pub fn escape_unicode(c: char, f: |char|) {
301
315
}
302
316
303
317
///
304
- /// Return a 'default' ASCII and C++11-like char- literal escape of a char.
318
+ /// Return a 'default' ASCII and C++11-like literal escape of a `char`.
305
319
///
306
320
/// The default is chosen with a bias toward producing literals that are
307
321
/// legal in a variety of languages, including C++11 and similar C-family
@@ -325,7 +339,7 @@ pub fn escape_default(c: char, f: |char|) {
325
339
}
326
340
}
327
341
328
- /// Returns the amount of bytes this character would need if encoded in utf8
342
+ /// Returns the number of bytes this `char` would need if encoded in UTF-8
329
343
pub fn len_utf8_bytes ( c : char ) -> uint {
330
344
static MAX_ONE_B : uint = 128 u;
331
345
static MAX_TWO_B : uint = 2048 u;
@@ -360,7 +374,7 @@ pub trait Char {
360
374
fn escape_default ( & self , f: |char|) ;
361
375
fn len_utf8_bytes ( & self ) -> uint ;
362
376
363
- /// Encodes this character as utf-8 into the provided byte-buffer. The
377
+ /// Encodes this `char` as UTF-8 into the provided byte-buffer. The
364
378
/// buffer must be at least 4 bytes long or a runtime failure will occur.
365
379
///
366
380
/// This will then return the number of bytes written to the slice.
0 commit comments