@@ -826,50 +826,142 @@ pub fn is_utf16(v: &[u16]) -> bool {
826
826
827
827
/// An iterator that decodes UTF-16 encoded codepoints from a vector
828
828
/// of `u16`s.
829
- ///
830
- /// Fails when it encounters invalid UTF-16 data.
831
- pub struct UTF16Chars < ' a > {
829
+ #[ deriving( Clone ) ]
830
+ pub struct UTF16Items < ' a > {
832
831
priv iter : vec:: Items < ' a , u16 >
833
832
}
834
- impl < ' a > Iterator < char > for UTF16Chars < ' a > {
835
- fn next ( & mut self ) -> Option < char > {
833
+ /// The possibilities for values decoded from a `u16` stream.
834
+ #[ deriving( Eq , TotalEq , Clone ) ]
835
+ pub enum UTF16Item {
836
+ /// A valid codepoint.
837
+ ScalarValue ( char ) ,
838
+ /// An invalid surrogate without its pair.
839
+ LoneSurrogate ( u16 )
840
+ }
841
+
842
+ impl UTF16Item {
843
+ /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
844
+ /// replacement character (U+FFFD).
845
+ #[ inline]
846
+ pub fn to_char_lossy ( & self ) -> char {
847
+ match * self {
848
+ ScalarValue ( c) => c,
849
+ LoneSurrogate ( _) => '\uFFFD'
850
+ }
851
+ }
852
+ }
853
+
854
+ impl < ' a > Iterator < UTF16Item > for UTF16Items < ' a > {
855
+ fn next ( & mut self ) -> Option < UTF16Item > {
836
856
let u = match self . iter . next ( ) {
837
857
Some ( u) => * u,
838
858
None => return None
839
859
} ;
840
- match char:: from_u32 ( u as u32 ) {
841
- Some ( c) => Some ( c) ,
842
- None => {
843
- let u2 = * self . iter . next ( ) . expect ( "UTF16Chars: unmatched lead surrogate" ) ;
844
- if u < 0xD7FF || u > 0xDBFF ||
845
- u2 < 0xDC00 || u2 > 0xDFFF {
846
- fail ! ( "UTF16Chars: invalid surrogate pair" )
847
- }
848
860
849
- let mut c = ( u - 0xD800 ) as u32 << 10 | ( u2 - 0xDC00 ) as u32 | 0x1_0000 ;
850
- char:: from_u32 ( c)
861
+ if u < 0xD800 || 0xDFFF < u {
862
+ // not a surrogate
863
+ Some ( ScalarValue ( unsafe { cast:: transmute ( u as u32 ) } ) )
864
+ } else if u >= 0xDC00 {
865
+ // a trailing surrogate
866
+ Some ( LoneSurrogate ( u) )
867
+ } else {
868
+ // preserve state for rewinding.
869
+ let old = self . iter ;
870
+
871
+ let u2 = match self . iter . next ( ) {
872
+ Some ( u2) => * u2,
873
+ // eof
874
+ None => return Some ( LoneSurrogate ( u) )
875
+ } ;
876
+ if u2 < 0xDC00 || u2 > 0xDFFF {
877
+ // not a trailing surrogate so we're not a valid
878
+ // surrogate pair, so rewind to redecode u2 next time.
879
+ self . iter = old;
880
+ return Some ( LoneSurrogate ( u) )
851
881
}
882
+
883
+ // all ok, so lets decode it.
884
+ let c = ( u - 0xD800 ) as u32 << 10 | ( u2 - 0xDC00 ) as u32 | 0x1_0000 ;
885
+ Some ( ScalarValue ( unsafe { cast:: transmute ( c) } ) )
852
886
}
853
887
}
854
888
889
+ #[ inline]
855
890
fn size_hint ( & self ) -> ( uint , Option < uint > ) {
856
891
let ( low, high) = self . iter . size_hint ( ) ;
857
- // we could be entirely surrogates (2 elements per char), or
858
- // entirely non-surrogates (1 element per char)
892
+ // we could be entirely valid surrogates (2 elements per
893
+ // char), or entirely non-surrogates (1 element per char)
859
894
( low / 2 , high)
860
895
}
861
896
}
862
897
863
- /// Create an iterator over the UTF-16 encoded codepoints in `v`.
898
+ /// Create an iterator over the UTF-16 encoded codepoints in `v`,
899
+ /// returning invalid surrogates as `LoneSurrogate`s.
864
900
///
865
- /// The iterator fails if it attempts to decode invalid UTF-16 data.
866
- pub fn utf16_chars < ' a > ( v : & ' a [ u16 ] ) -> UTF16Chars < ' a > {
867
- UTF16Chars { iter : v. iter ( ) }
901
+ /// # Example
902
+ ///
903
+ /// ```rust
904
+ /// use std::str;
905
+ /// use std::str::{ScalarValue, LoneSurrogate};
906
+ ///
907
+ /// // 𝄞mus<invalid>ic<invalid>
908
+ /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
909
+ /// 0x0073, 0xDD1E, 0x0069, 0x0063,
910
+ /// 0xD834];
911
+ ///
912
+ /// assert_eq!(str::utf16_items(v).to_owned_vec(),
913
+ /// ~[ScalarValue('𝄞'),
914
+ /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
915
+ /// LoneSurrogate(0xDD1E),
916
+ /// ScalarValue('i'), ScalarValue('c'),
917
+ /// LoneSurrogate(0xD834)]);
918
+ /// ```
919
+ pub fn utf16_items < ' a > ( v : & ' a [ u16 ] ) -> UTF16Items < ' a > {
920
+ UTF16Items { iter : v. iter ( ) }
868
921
}
869
922
870
- /// Allocates a new string from the utf-16 slice provided
923
+ /// Decode a UTF-16 encoded vector `v` into a string.
924
+ ///
925
+ /// # Failure
926
+ ///
927
+ /// Fails on invalid UTF-16 data.
928
+ ///
929
+ /// # Example
930
+ ///
931
+ /// ```rust
932
+ /// use std::str;
933
+ ///
934
+ /// // 𝄞music
935
+ /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
936
+ /// 0x0073, 0x0069, 0x0063];
937
+ /// assert_eq!(str::from_utf16(v), ~"𝄞music");
938
+ /// ```
871
939
pub fn from_utf16 ( v : & [ u16 ] ) -> ~str {
872
- utf16_chars ( v) . collect ( )
940
+ utf16_items ( v) . map ( |c| {
941
+ match c {
942
+ ScalarValue ( c) => c,
943
+ LoneSurrogate ( u) => fail ! ( "from_utf16: found lone surrogate {}" , u)
944
+ }
945
+ } ) . collect ( )
946
+ }
947
+
948
+ /// Decode a UTF-16 encoded vector `v` into a string, replacing
949
+ /// invalid data with the replacement character (U+FFFD).
950
+ ///
951
+ /// # Example
952
+ /// ```rust
953
+ /// use std::str;
954
+ ///
955
+ /// // 𝄞mus<invalid>ic<invalid>
956
+ /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
957
+ /// 0x0073, 0xDD1E, 0x0069, 0x0063,
958
+ /// 0xD834];
959
+ ///
960
+ /// assert_eq!(str::from_utf16_lossy(v),
961
+ /// ~"𝄞mus\uFFFDic\uFFFD");
962
+ /// ```
963
+ pub fn from_utf16_lossy ( v : & [ u16 ] ) -> ~str {
964
+ utf16_items ( v) . map ( |c| c. to_char_lossy ( ) ) . collect ( )
873
965
}
874
966
875
967
/// Allocates a new string with the specified capacity. The string returned is
@@ -3738,12 +3830,30 @@ mod tests {
3738
3830
let ( s, u) = ( * p) . clone( ) ;
3739
3831
assert!( is_utf16( u) ) ;
3740
3832
assert_eq!( s. to_utf16( ) , u) ;
3833
+
3741
3834
assert_eq!( from_utf16( u) , s) ;
3835
+ assert_eq!( from_utf16_lossy( u) , s) ;
3836
+
3742
3837
assert_eq!( from_utf16( s. to_utf16( ) ) , s) ;
3743
3838
assert_eq!( from_utf16( u) . to_utf16( ) , u) ;
3744
3839
}
3745
3840
}
3746
3841
3842
#[test]
fn test_utf16_lossy() {
    // Checks that `from_utf16_lossy` maps every unpaired surrogate to
    // exactly one U+FFFD and nothing else: the expected strings must
    // not contain any padding between or after the characters.

    // completely positive cases tested above.
    // lead + eof
    assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
    // lead + lead
    assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");

    // isolated trail
    assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");

    // general: lone lead, valid pair (U+1048B), lone lead at eof
    assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
}
3856
+
3747
3857
#[ test]
3748
3858
fn test_char_at( ) {
3749
3859
let s = ~"ศไทย中华Việt Nam ";
0 commit comments