@@ -729,6 +729,11 @@ Section: Misc
729
729
730
730
/// Determines if a vector of bytes contains valid UTF-8
731
731
pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
732
+ first_non_utf8_index ( v) . is_none ( )
733
+ }
734
+
735
+ #[ inline( always) ]
736
+ fn first_non_utf8_index ( v : & [ u8 ] ) -> Option < uint > {
732
737
let mut i = 0 u;
733
738
let total = v. len ( ) ;
734
739
fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 {
@@ -740,10 +745,10 @@ pub fn is_utf8(v: &[u8]) -> bool {
740
745
i += 1 u;
741
746
} else {
742
747
let w = utf8_char_width ( v_i) ;
743
- if w == 0 u { return false ; }
748
+ if w == 0 u { return Some ( i ) ; }
744
749
745
750
let nexti = i + w;
746
- if nexti > total { return false ; }
751
+ if nexti > total { return Some ( i ) ; }
747
752
748
753
// 2-byte encoding is for codepoints \u0080 to \u07ff
749
754
// first C2 80 last DF BF
@@ -766,7 +771,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
766
771
// UTF8-tail = %x80-BF
767
772
match w {
768
773
2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
769
- return false
774
+ return Some ( i )
770
775
} ,
771
776
3 => match ( v_i,
772
777
unsafe_get ( v, i + 1 ) ,
@@ -775,7 +780,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
775
780
( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
776
781
( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) => ( ) ,
777
782
( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
778
- _ => return false ,
783
+ _ => return Some ( i ) ,
779
784
} ,
780
785
_ => match ( v_i,
781
786
unsafe_get ( v, i + 1 ) ,
@@ -784,14 +789,14 @@ pub fn is_utf8(v: &[u8]) -> bool {
784
789
( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
785
790
( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
786
791
( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
787
- _ => return false ,
792
+ _ => return Some ( i )
788
793
} ,
789
794
}
790
795
791
796
i = nexti;
792
797
}
793
798
}
794
- true
799
+ None
795
800
}
796
801
797
802
/// Determines if a vector of `u16` contains valid UTF-16
@@ -910,6 +915,53 @@ macro_rules! utf8_acc_cont_byte(
910
915
911
916
/// Tag bits of a UTF-8 continuation byte (`10xxxxxx`, range 0x80-0xBF): the validators test a byte `b` with `b & 192u8 != TAG_CONT_U8`.
static TAG_CONT_U8 : u8 = 128u8 ;
912
917
918
+ /// A string that is either borrowed for the lifetime `'a` (`Slice`) or owned (`Owned`); `from_utf8_lossy` returns it so that already-valid input can be handed back without copying.
919
+ #[ deriving( Eq , Clone ) ]
920
+ pub enum MaybeOwned < ' a > {
921
+ /// A borrowed string slice; this variant holds no buffer of its own.
922
+ Slice ( & ' a str ) ,
923
+ /// An owned string (`~str`) held by this value.
924
+ Owned ( ~str )
925
+ }
926
+
927
+ impl < ' a > Str for MaybeOwned < ' a > {
928
+ #[ inline]
929
+ fn as_slice < ' b > ( & ' b self ) -> & ' b str {
930
+ match * self {
931
+ Slice ( s) => s,
932
+ Owned ( ref s) => s. as_slice ( )
933
+ }
934
+ }
935
+
936
+ #[ inline]
937
+ fn into_owned ( self ) -> ~str {
938
+ match self {
939
+ Slice ( s) => s. to_owned ( ) ,
940
+ Owned ( s) => s
941
+ }
942
+ }
943
+ }
944
+
945
+ impl < ' a > ToStr for MaybeOwned < ' a > {
946
+ #[ inline]
947
+ fn to_str ( & self ) -> ~str {
948
+ match * self {
949
+ Slice ( s) => s. to_str ( ) ,
950
+ Owned ( ref s) => s. clone ( )
951
+ }
952
+ }
953
+ }
954
+
955
+ impl < ' a > :: fmt:: Show for MaybeOwned < ' a > {
956
+ #[ inline]
957
+ fn fmt ( mo : & MaybeOwned , f : & mut :: fmt:: Formatter ) -> :: fmt:: Result {
958
+ match * mo {
959
+ Slice ( ref s) => :: fmt:: Show :: fmt ( s, f) ,
960
+ Owned ( ref s) => :: fmt:: Show :: fmt ( & s. as_slice ( ) , f)
961
+ }
962
+ }
963
+ }
964
+
913
965
/// Converts a vector of bytes to a new utf-8 string.
914
966
/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
915
967
///
@@ -918,12 +970,16 @@ static TAG_CONT_U8: u8 = 128u8;
918
970
/// ```rust
919
971
/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
920
972
/// let output = std::str::from_utf8_lossy(input);
921
- /// assert_eq!(output, ~ "Hello \uFFFDWorld");
973
+ /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
922
974
/// ```
923
- pub fn from_utf8_lossy ( v : & [ u8 ] ) -> ~str {
975
+ pub fn from_utf8_lossy < ' a > ( v : & ' a [ u8 ] ) -> MaybeOwned < ' a > {
976
+ let firstbad = match first_non_utf8_index ( v) {
977
+ None => return Slice ( unsafe { cast:: transmute ( v) } ) ,
978
+ Some ( i) => i
979
+ } ;
980
+
924
981
static REPLACEMENT : & ' static [ u8 ] = bytes ! ( 0xEF , 0xBF , 0xBD ) ; // U+FFFD in UTF-8
925
- let mut i = 0 u;
926
- let mut lastgood = 0 u;
982
+ let mut i = firstbad;
927
983
let total = v. len ( ) ;
928
984
fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 {
929
985
unsafe { * xs. unsafe_ref ( i) }
@@ -937,23 +993,32 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
937
993
}
938
994
let mut res = with_capacity ( total) ;
939
995
996
+ if i > 0 {
997
+ unsafe { raw:: push_bytes ( & mut res, v. slice_to ( i) ) } ;
998
+ }
999
+
1000
+ // subseqidx is the index of the first byte of the subsequence we're looking at.
1001
+ // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1002
+ // them one by one.
1003
+ let mut subseqidx = firstbad;
1004
+
940
1005
while i < total {
941
1006
let i_ = i;
942
1007
let byte = unsafe_get ( v, i) ;
943
1008
i += 1 ;
944
1009
945
- macro_rules! error( ( ) => {
1010
+ macro_rules! error( ( ) => ( {
946
1011
unsafe {
947
- if lastgood != i_ {
948
- raw:: push_bytes( & mut res, v. slice( lastgood , i_) ) ;
1012
+ if subseqidx != i_ {
1013
+ raw:: push_bytes( & mut res, v. slice( subseqidx , i_) ) ;
949
1014
}
950
- lastgood = i;
1015
+ subseqidx = i;
951
1016
raw:: push_bytes( & mut res, REPLACEMENT ) ;
952
1017
}
953
- } )
1018
+ } ) )
954
1019
955
1020
if byte < 128u8 {
956
- // lastgood handles this
1021
+ // subseqidx handles this
957
1022
} else {
958
1023
let w = utf8_char_width ( byte) ;
959
1024
@@ -1012,8 +1077,10 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
1012
1077
}
1013
1078
}
1014
1079
}
1015
- unsafe { raw:: push_bytes ( & mut res, v. slice ( lastgood, total) ) } ;
1016
- res
1080
+ if subseqidx < total {
1081
+ unsafe { raw:: push_bytes ( & mut res, v. slice ( subseqidx, total) ) } ;
1082
+ }
1083
+ Owned ( res)
1017
1084
}
1018
1085
1019
1086
/// Unsafe operations
@@ -3943,32 +4010,32 @@ mod tests {
3943
4010
#[test]
3944
4011
fn test_str_from_utf8_lossy() {
3945
4012
// Checks both the text produced by from_utf8_lossy and which MaybeOwned variant carries it.
// Valid input (ASCII first, then multi-byte text) is expected back as a borrowed Slice.
let xs = bytes!(" hello");
3946
- assert_eq!(from_utf8_lossy(xs), ~ " hello");
4013
+ assert_eq!(from_utf8_lossy(xs), Slice( " hello") );
3947
4014
3948
4015
let xs = bytes!(" ศไทย中华Việt Nam ");
3949
- assert_eq!(from_utf8_lossy(xs), ~ " ศไทย中华Việt Nam ");
4016
+ assert_eq!(from_utf8_lossy(xs), Slice( " ศไทย中华Việt Nam ") );
3950
4017
3951
4018
// Every case from here on contains invalid bytes and is expected back as Owned,
// with U+FFFD substituted for the bad sequences.
let xs = bytes!(" Hello ", 0xC2, " There ", 0xFF, " Goodbye ");
3952
- assert_eq!(from_utf8_lossy(xs), ~" Hello \uFFFD There \uFFFD Goodbye ");
4019
+ assert_eq!(from_utf8_lossy(xs), Owned( ~" Hello \uFFFD There \uFFFD Goodbye ") );
3953
4020
3954
4021
let xs = bytes!(" Hello ", 0xC0, 0x80, " There ", 0xE6, 0x83, " Goodbye ");
3955
- assert_eq!(from_utf8_lossy(xs), ~" Hello \uFFFD \uFFFD There \uFFFD Goodbye ");
4022
+ assert_eq!(from_utf8_lossy(xs), Owned( ~" Hello \uFFFD \uFFFD There \uFFFD Goodbye ") );
3956
4023
3957
4024
let xs = bytes!(0xF5, " foo", 0xF5, 0x80, " bar");
3958
- assert_eq!(from_utf8_lossy(xs), ~"\uFFFD foo\uFFFD \uFFFD bar" ) ;
4025
+ assert_eq!(from_utf8_lossy(xs), Owned( ~"\uFFFD foo\uFFFD \uFFFD bar" ) ) ;
3959
4026
3960
4027
let xs = bytes!( 0xF1 , "foo" , 0xF1 , 0x80 , "bar" , 0xF1 , 0x80 , 0x80 , "baz" ) ;
3961
- assert_eq!( from_utf8_lossy( xs) , ~"\uFFFD foo\uFFFD bar\uFFFD baz");
4028
+ assert_eq!( from_utf8_lossy( xs) , Owned ( ~"\uFFFD foo\uFFFD bar\uFFFD baz") );
3962
4029
3963
4030
let xs = bytes!(0xF4, " foo", 0xF4, 0x80, " bar", 0xF4, 0xBF, " baz");
3964
- assert_eq!(from_utf8_lossy(xs), ~"\uFFFD foo\uFFFD bar\uFFFD \uFFFD baz" ) ;
4031
+ assert_eq!(from_utf8_lossy(xs), Owned( ~"\uFFFD foo\uFFFD bar\uFFFD \uFFFD baz" ) ) ;
3965
4032
3966
4033
let xs = bytes!( 0xF0 , 0x80 , 0x80 , 0x80 , "foo" , 0xF0 , 0x90 , 0x80 , 0x80 , "bar" ) ;
3967
- assert_eq!( from_utf8_lossy( xs) , ~"\uFFFD \uFFFD \uFFFD \uFFFD foo\U 00010000 bar");
4034
+ assert_eq!( from_utf8_lossy( xs) , Owned ( ~"\uFFFD \uFFFD \uFFFD \uFFFD foo\U 00010000 bar") );
3968
4035
3969
4036
// surrogates: each of the three bytes in the 0xED 0xA0 0x80 and 0xED 0xBF 0xBF sequences is expected to become its own U+FFFD
3970
4037
let xs = bytes!(0xED, 0xA0, 0x80, " foo", 0xED, 0xBF, 0xBF, " bar");
3971
- assert_eq!(from_utf8_lossy(xs), ~"\uFFFD \uFFFD \uFFFD foo\uFFFD \uFFFD \uFFFD bar" ) ;
4038
+ assert_eq!(from_utf8_lossy(xs), Owned( ~"\uFFFD \uFFFD \uFFFD foo\uFFFD \uFFFD \uFFFD bar" ) ) ;
3972
4039
}
3973
4040
3974
4041
#[ test]
0 commit comments