Skip to content

Commit a96cea4

Browse files
committed
str: provide lossy UTF-16 support.
This replaces the iterator with one that handles lone surrogates gracefully, and uses it to implement `from_utf16_lossy`, which replaces invalid `u16`s with U+FFFD.
1 parent b7656d0 commit a96cea4

File tree

1 file changed

+133
-23
lines changed

1 file changed

+133
-23
lines changed

src/libstd/str.rs

Lines changed: 133 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -826,50 +826,142 @@ pub fn is_utf16(v: &[u16]) -> bool {
826826

827827
/// An iterator that decodes UTF-16 encoded codepoints from a vector
828828
/// of `u16`s.
829-
///
830-
/// Fails when it encounters invalid UTF-16 data.
831-
pub struct UTF16Chars<'a> {
829+
#[deriving(Clone)]
830+
pub struct UTF16Items<'a> {
832831
priv iter: vec::Items<'a, u16>
833832
}
834-
impl<'a> Iterator<char> for UTF16Chars<'a> {
835-
fn next(&mut self) -> Option<char> {
833+
/// The possibilities for values decoded from a `u16` stream.
834+
#[deriving(Eq, TotalEq, Clone)]
835+
pub enum UTF16Item {
836+
/// A valid codepoint.
837+
ScalarValue(char),
838+
/// An invalid surrogate without its pair.
839+
LoneSurrogate(u16)
840+
}
841+
842+
impl UTF16Item {
843+
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
844+
/// replacement character (U+FFFD).
845+
#[inline]
846+
pub fn to_char_lossy(&self) -> char {
847+
match *self {
848+
ScalarValue(c) => c,
849+
LoneSurrogate(_) => '\uFFFD'
850+
}
851+
}
852+
}
853+
854+
impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
855+
fn next(&mut self) -> Option<UTF16Item> {
836856
let u = match self.iter.next() {
837857
Some(u) => *u,
838858
None => return None
839859
};
840-
match char::from_u32(u as u32) {
841-
Some(c) => Some(c),
842-
None => {
843-
let u2 = *self.iter.next().expect("UTF16Chars: unmatched lead surrogate");
844-
if u < 0xD7FF || u > 0xDBFF ||
845-
u2 < 0xDC00 || u2 > 0xDFFF {
846-
fail!("UTF16Chars: invalid surrogate pair")
847-
}
848860

849-
let mut c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
850-
char::from_u32(c)
861+
if u < 0xD800 || 0xDFFF < u {
862+
// not a surrogate
863+
Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
864+
} else if u >= 0xDC00 {
865+
// a trailing surrogate
866+
Some(LoneSurrogate(u))
867+
} else {
868+
// preserve state for rewinding.
869+
let old = self.iter;
870+
871+
let u2 = match self.iter.next() {
872+
Some(u2) => *u2,
873+
// eof
874+
None => return Some(LoneSurrogate(u))
875+
};
876+
if u2 < 0xDC00 || u2 > 0xDFFF {
877+
// not a trailing surrogate so we're not a valid
878+
// surrogate pair, so rewind to redecode u2 next time.
879+
self.iter = old;
880+
return Some(LoneSurrogate(u))
851881
}
882+
883+
// all ok, so let's decode it.
884+
let c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
885+
Some(ScalarValue(unsafe {cast::transmute(c)}))
852886
}
853887
}
854888

889+
#[inline]
855890
fn size_hint(&self) -> (uint, Option<uint>) {
856891
let (low, high) = self.iter.size_hint();
857-
// we could be entirely surrogates (2 elements per char), or
858-
// entirely non-surrogates (1 element per char)
892+
// we could be entirely valid surrogate pairs (2 elements per
893+
// char), or entirely non-surrogates (1 element per char)
859894
(low / 2, high)
860895
}
861896
}
862897

863-
/// Create an iterator over the UTF-16 encoded codepoints in `v`.
898+
/// Create an iterator over the UTF-16 encoded codepoints in `v`,
899+
/// returning invalid surrogates as `LoneSurrogate`s.
864900
///
865-
/// The iterator fails if it attempts to decode invalid UTF-16 data.
866-
pub fn utf16_chars<'a>(v: &'a [u16]) -> UTF16Chars<'a> {
867-
UTF16Chars { iter : v.iter() }
901+
/// # Example
902+
///
903+
/// ```rust
904+
/// use std::str;
905+
/// use std::str::{ScalarValue, LoneSurrogate};
906+
///
907+
/// // 𝄞mus<invalid>ic<invalid>
908+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
909+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
910+
/// 0xD834];
911+
///
912+
/// assert_eq!(str::utf16_items(v).to_owned_vec(),
913+
/// ~[ScalarValue('𝄞'),
914+
/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
915+
/// LoneSurrogate(0xDD1E),
916+
/// ScalarValue('i'), ScalarValue('c'),
917+
/// LoneSurrogate(0xD834)]);
918+
/// ```
919+
pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
920+
UTF16Items { iter : v.iter() }
868921
}
869922

870-
/// Allocates a new string from the utf-16 slice provided
923+
/// Decode a UTF-16 encoded vector `v` into a string.
924+
///
925+
/// # Failure
926+
///
927+
/// Fails on invalid UTF-16 data.
928+
///
929+
/// # Example
930+
///
931+
/// ```rust
932+
/// use std::str;
933+
///
934+
/// // 𝄞music
935+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
936+
/// 0x0073, 0x0069, 0x0063];
937+
/// assert_eq!(str::from_utf16(v), ~"𝄞music");
938+
/// ```
871939
pub fn from_utf16(v: &[u16]) -> ~str {
872-
utf16_chars(v).collect()
940+
utf16_items(v).map(|c| {
941+
match c {
942+
ScalarValue(c) => c,
943+
LoneSurrogate(u) => fail!("from_utf16: found lone surrogate {}", u)
944+
}
945+
}).collect()
946+
}
947+
948+
/// Decode a UTF-16 encoded vector `v` into a string, replacing
949+
/// invalid data with the replacement character (U+FFFD).
950+
///
951+
/// # Example
952+
/// ```rust
953+
/// use std::str;
954+
///
955+
/// // 𝄞mus<invalid>ic<invalid>
956+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
957+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
958+
/// 0xD834];
959+
///
960+
/// assert_eq!(str::from_utf16_lossy(v),
961+
/// ~"𝄞mus\uFFFDic\uFFFD");
962+
/// ```
963+
pub fn from_utf16_lossy(v: &[u16]) -> ~str {
964+
utf16_items(v).map(|c| c.to_char_lossy()).collect()
873965
}
874966

875967
/// Allocates a new string with the specified capacity. The string returned is
@@ -3738,12 +3830,30 @@ mod tests {
37383830
let (s, u) = (*p).clone();
37393831
assert!(is_utf16(u));
37403832
assert_eq!(s.to_utf16(), u);
3833+
37413834
assert_eq!(from_utf16(u), s);
3835+
assert_eq!(from_utf16_lossy(u), s);
3836+
37423837
assert_eq!(from_utf16(s.to_utf16()), s);
37433838
assert_eq!(from_utf16(u).to_utf16(), u);
37443839
}
37453840
}
37463841

3842+
#[test]
3843+
fn test_utf16_lossy() {
3844+
// completely positive cases tested above.
3845+
// lead + eof
3846+
assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3847+
// lead + lead
3848+
assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3849+
3850+
// isolated trail
3851+
assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3852+
3853+
// general
3854+
assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
3855+
}
3856+
37473857
#[test]
37483858
fn test_char_at() {
37493859
let s = ~"ศไทย中华Việt Nam";

0 commit comments

Comments
 (0)