Skip to content

Commit af7c313

Browse files
committed
doc: don't refer to 'char' as characters
This seems to be causing some confusion among users. Rust's `char`s are not 8-bit characters, but 32-bit UCS-4 code points without surrogates (Unicode Scalar Values, as per the Unicode glossary). Make the doc more explicit about it. Signed-off-by: Luca Bruno <lucab@debian.org>
1 parent 33768c4 commit af7c313

File tree

1 file changed

+32
-18
lines changed

1 file changed

+32
-18
lines changed

src/libstd/char.rs

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,21 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
//! Unicode characters manipulation (`char` type)
11+
//! Character manipulation (`char` type, Unicode Scalar Value)
12+
//!
13+
//! This module provides the `Char` trait, as well as its implementation
14+
//! for the primitive `char` type, in order to allow basic character manipulation.
15+
//!
16+
//! A `char` actually represents a
17+
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
18+
//! as it can contain any Unicode code point except high-surrogate and
19+
//! low-surrogate code points.
20+
//!
21+
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
22+
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
23+
//! however the converse is not always true due to the above range limits
24+
//! and, as such, should be performed via the `from_u32` function.
25+
1226

1327
use cast::transmute;
1428
use option::{None, Option, Some};
@@ -66,7 +80,7 @@ static TAG_FOUR_B: uint = 240u;
6680
/// The highest valid code point
6781
pub static MAX: char = '\U0010ffff';
6882

69-
/// Convert from `u32` to a character.
83+
/// Convert from `u32` to a `char`.
7084
#[inline]
7185
pub fn from_u32(i: u32) -> Option<char> {
7286
// catch out-of-bounds and surrogates
@@ -77,30 +91,30 @@ pub fn from_u32(i: u32) -> Option<char> {
7791
}
7892
}
7993

80-
/// Returns whether the specified character is considered a unicode alphabetic
81-
/// character
94+
/// Returns whether the specified `char` is considered a Unicode alphabetic
95+
/// scalar value
8296
pub fn is_alphabetic(c: char) -> bool {
    // The answer comes straight from `derived_property::Alphabetic`.
    derived_property::Alphabetic(c)
}
8397
#[allow(missing_doc)]
8498
pub fn is_XID_start(c: char) -> bool {
    // The answer comes straight from `derived_property::XID_Start`.
    derived_property::XID_Start(c)
}
8599
#[allow(missing_doc)]
86100
pub fn is_XID_continue(c: char) -> bool {
    // The answer comes straight from `derived_property::XID_Continue`.
    derived_property::XID_Continue(c)
}
87101

88102
///
89-
/// Indicates whether a character is in lower case, defined
103+
/// Indicates whether a `char` is in lower case, defined
90104
/// in terms of the Unicode Derived Core Property 'Lowercase'.
91105
///
92106
#[inline]
93107
pub fn is_lowercase(c: char) -> bool {
    // The answer comes straight from `derived_property::Lowercase`.
    derived_property::Lowercase(c)
}
94108

95109
///
96-
/// Indicates whether a character is in upper case, defined
110+
/// Indicates whether a `char` is in upper case, defined
97111
/// in terms of the Unicode Derived Core Property 'Uppercase'.
98112
///
99113
#[inline]
100114
pub fn is_uppercase(c: char) -> bool {
    // The answer comes straight from `derived_property::Uppercase`.
    derived_property::Uppercase(c)
}
101115

102116
///
103-
/// Indicates whether a character is whitespace. Whitespace is defined in
117+
/// Indicates whether a `char` is whitespace. Whitespace is defined in
104118
/// terms of the Unicode Property 'White_Space'.
105119
///
106120
#[inline]
@@ -112,7 +126,7 @@ pub fn is_whitespace(c: char) -> bool {
112126
}
113127

114128
///
115-
/// Indicates whether a character is alphanumeric. Alphanumericness is
129+
/// Indicates whether a `char` is alphanumeric. Alphanumericness is
116130
/// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
117131
/// and the Derived Core Property 'Alphabetic'.
118132
///
@@ -125,14 +139,14 @@ pub fn is_alphanumeric(c: char) -> bool {
125139
}
126140

127141
///
128-
/// Indicates whether a character is a control character. Control
129-
/// characters are defined in terms of the Unicode General Category
142+
/// Indicates whether a `char` is a control code point. Control
143+
/// code points are defined in terms of the Unicode General Category
130144
/// 'Cc'.
131145
///
132146
#[inline]
133147
pub fn is_control(c: char) -> bool {
    // The answer comes straight from `general_category::Cc`.
    general_category::Cc(c)
}
134148

135-
/// Indicates whether the character is numeric (Nd, Nl, or No)
149+
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
136150
#[inline]
137151
pub fn is_digit(c: char) -> bool {
138152
general_category::Nd(c)
@@ -141,7 +155,7 @@ pub fn is_digit(c: char) -> bool {
141155
}
142156

143157
///
144-
/// Checks if a character parses as a numeric digit in the given radix.
158+
/// Checks if a `char` parses as a numeric digit in the given radix.
145159
/// Compared to `is_digit()`, this function only recognizes the
146160
/// characters `0-9`, `a-z` and `A-Z`.
147161
///
@@ -167,13 +181,13 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool {
167181
}
168182

169183
///
170-
/// Convert a char to the corresponding digit.
184+
/// Convert a `char` to the corresponding digit.
171185
///
172186
/// # Return value
173187
///
174188
/// If `c` is between '0' and '9', the corresponding value
175189
/// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
176-
/// 'b' or 'B', 11, etc. Returns none if the char does not
190+
/// 'b' or 'B', 11, etc. Returns `None` if the `char` does not
177191
/// refer to a digit in the given radix.
178192
///
179193
/// # Failure
@@ -273,7 +287,7 @@ pub fn decompose_compatible(c: char, f: |char|) {
273287
}
274288

275289
///
276-
/// Return the hexadecimal unicode escape of a char.
290+
/// Return the hexadecimal unicode escape of a `char`.
277291
///
278292
/// The rules are as follows:
279293
///
@@ -301,7 +315,7 @@ pub fn escape_unicode(c: char, f: |char|) {
301315
}
302316

303317
///
304-
/// Return a 'default' ASCII and C++11-like char-literal escape of a char.
318+
/// Return a 'default' ASCII and C++11-like literal escape of a `char`.
305319
///
306320
/// The default is chosen with a bias toward producing literals that are
307321
/// legal in a variety of languages, including C++11 and similar C-family
@@ -325,7 +339,7 @@ pub fn escape_default(c: char, f: |char|) {
325339
}
326340
}
327341

328-
/// Returns the amount of bytes this character would need if encoded in utf8
342+
/// Returns the amount of bytes this `char` would need if encoded in UTF-8
329343
pub fn len_utf8_bytes(c: char) -> uint {
330344
static MAX_ONE_B: uint = 128u;
331345
static MAX_TWO_B: uint = 2048u;
@@ -360,7 +374,7 @@ pub trait Char {
360374
fn escape_default(&self, f: |char|);
361375
fn len_utf8_bytes(&self) -> uint;
362376

363-
/// Encodes this character as utf-8 into the provided byte-buffer. The
377+
/// Encodes this `char` as UTF-8 into the provided byte-buffer. The
364378
/// buffer must be at least 4 bytes long or a runtime failure will occur.
365379
///
366380
/// This will then return the number of bytes written to the slice.

0 commit comments

Comments
 (0)