Skip to content

Commit 28467f5

Browse files
committed
Tweak from_utf8_lossy to return a new MaybeOwned enum
MaybeOwned allows from_utf8_lossy to avoid allocation if there are no invalid bytes in the input.
1 parent dde2e0b commit 28467f5

File tree

2 files changed

+96
-29
lines changed

2 files changed

+96
-29
lines changed

src/libstd/path/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -508,10 +508,10 @@ impl<'a, P: GenericPath> ToStr for Display<'a, P> {
508508
if self.filename {
509509
match self.path.filename() {
510510
None => ~"",
511-
Some(v) => str::from_utf8_lossy(v)
511+
Some(v) => str::from_utf8_lossy(v).into_owned()
512512
}
513513
} else {
514-
str::from_utf8_lossy(self.path.as_vec())
514+
str::from_utf8_lossy(self.path.as_vec()).into_owned()
515515
}
516516
}
517517
}

src/libstd/str.rs

Lines changed: 94 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,11 @@ Section: Misc
729729

730730
/// Determines if a vector of bytes contains valid UTF-8
731731
pub fn is_utf8(v: &[u8]) -> bool {
732+
first_non_utf8_index(v).is_none()
733+
}
734+
735+
#[inline(always)]
736+
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
732737
let mut i = 0u;
733738
let total = v.len();
734739
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
@@ -740,10 +745,10 @@ pub fn is_utf8(v: &[u8]) -> bool {
740745
i += 1u;
741746
} else {
742747
let w = utf8_char_width(v_i);
743-
if w == 0u { return false; }
748+
if w == 0u { return Some(i); }
744749

745750
let nexti = i + w;
746-
if nexti > total { return false; }
751+
if nexti > total { return Some(i); }
747752

748753
// 2-byte encoding is for codepoints \u0080 to \u07ff
749754
// first C2 80 last DF BF
@@ -766,7 +771,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
766771
// UTF8-tail = %x80-BF
767772
match w {
768773
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
769-
return false
774+
return Some(i)
770775
},
771776
3 => match (v_i,
772777
unsafe_get(v, i + 1),
@@ -775,7 +780,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
775780
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
776781
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
777782
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
778-
_ => return false,
783+
_ => return Some(i),
779784
},
780785
_ => match (v_i,
781786
unsafe_get(v, i + 1),
@@ -784,14 +789,14 @@ pub fn is_utf8(v: &[u8]) -> bool {
784789
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
785790
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
786791
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
787-
_ => return false,
792+
_ => return Some(i)
788793
},
789794
}
790795

791796
i = nexti;
792797
}
793798
}
794-
true
799+
None
795800
}
796801

797802
/// Determines if a vector of `u16` contains valid UTF-16
@@ -910,6 +915,53 @@ macro_rules! utf8_acc_cont_byte(
910915

911916
static TAG_CONT_U8: u8 = 128u8;
912917

918+
/// Enum that represents either a borrowed or an owned string.
919+
#[deriving(Eq,Clone)]
920+
pub enum MaybeOwned<'a> {
921+
/// A borrowed string
922+
Slice(&'a str),
923+
/// An owned string
924+
Owned(~str)
925+
}
926+
927+
impl<'a> Str for MaybeOwned<'a> {
928+
#[inline]
929+
fn as_slice<'b>(&'b self) -> &'b str {
930+
match *self {
931+
Slice(s) => s,
932+
Owned(ref s) => s.as_slice()
933+
}
934+
}
935+
936+
#[inline]
937+
fn into_owned(self) -> ~str {
938+
match self {
939+
Slice(s) => s.to_owned(),
940+
Owned(s) => s
941+
}
942+
}
943+
}
944+
945+
impl<'a> ToStr for MaybeOwned<'a> {
946+
#[inline]
947+
fn to_str(&self) -> ~str {
948+
match *self {
949+
Slice(s) => s.to_str(),
950+
Owned(ref s) => s.clone()
951+
}
952+
}
953+
}
954+
955+
impl<'a> ::fmt::Show for MaybeOwned<'a> {
956+
#[inline]
957+
fn fmt(mo: &MaybeOwned, f: &mut ::fmt::Formatter) -> ::fmt::Result {
958+
match *mo {
959+
Slice(ref s) => ::fmt::Show::fmt(s, f),
960+
Owned(ref s) => ::fmt::Show::fmt(&s.as_slice(), f)
961+
}
962+
}
963+
}
964+
913965
/// Converts a vector of bytes to a utf-8 string, borrowing the input when it is already valid.
914966
/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
915967
///
@@ -918,12 +970,16 @@ static TAG_CONT_U8: u8 = 128u8;
918970
/// ```rust
919971
/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
920972
/// let output = std::str::from_utf8_lossy(input);
921-
/// assert_eq!(output, ~"Hello \uFFFDWorld");
973+
/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
922974
/// ```
923-
pub fn from_utf8_lossy(v: &[u8]) -> ~str {
975+
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
976+
let firstbad = match first_non_utf8_index(v) {
977+
None => return Slice(unsafe { cast::transmute(v) }),
978+
Some(i) => i
979+
};
980+
924981
static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
925-
let mut i = 0u;
926-
let mut lastgood = 0u;
982+
let mut i = firstbad;
927983
let total = v.len();
928984
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
929985
unsafe { *xs.unsafe_ref(i) }
@@ -937,23 +993,32 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
937993
}
938994
let mut res = with_capacity(total);
939995

996+
if i > 0 {
997+
unsafe { raw::push_bytes(&mut res, v.slice_to(i)) };
998+
}
999+
1000+
// subseqidx is the index of the first byte of the subsequence we're looking at.
1001+
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
1002+
// them one by one.
1003+
let mut subseqidx = firstbad;
1004+
9401005
while i < total {
9411006
let i_ = i;
9421007
let byte = unsafe_get(v, i);
9431008
i += 1;
9441009

945-
macro_rules! error(() => {
1010+
macro_rules! error(() => ({
9461011
unsafe {
947-
if lastgood != i_ {
948-
raw::push_bytes(&mut res, v.slice(lastgood, i_));
1012+
if subseqidx != i_ {
1013+
raw::push_bytes(&mut res, v.slice(subseqidx, i_));
9491014
}
950-
lastgood = i;
1015+
subseqidx = i;
9511016
raw::push_bytes(&mut res, REPLACEMENT);
9521017
}
953-
})
1018+
}))
9541019

9551020
if byte < 128u8 {
956-
// lastgood handles this
1021+
// ASCII byte: nothing to do here, it is copied later as part of the run starting at subseqidx
9571022
} else {
9581023
let w = utf8_char_width(byte);
9591024

@@ -1012,8 +1077,10 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
10121077
}
10131078
}
10141079
}
1015-
unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) };
1016-
res
1080+
if subseqidx < total {
1081+
unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) };
1082+
}
1083+
Owned(res)
10171084
}
10181085

10191086
/// Unsafe operations
@@ -3943,32 +4010,32 @@ mod tests {
39434010
#[test]
39444011
fn test_str_from_utf8_lossy() {
39454012
let xs = bytes!("hello");
3946-
assert_eq!(from_utf8_lossy(xs), ~"hello");
4013+
assert_eq!(from_utf8_lossy(xs), Slice("hello"));
39474014
39484015
let xs = bytes!("ศไทย中华Việt Nam");
3949-
assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam");
4016+
assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
39504017
39514018
let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3952-
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye");
4019+
assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
39534020
39544021
let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
3955-
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye");
4022+
assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
39564023
39574024
let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
3958-
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar");
4025+
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
39594026

39604027
let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
3961-
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz");
4028+
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
39624029
39634030
let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
3964-
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");
4031+
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
39654032

39664033
let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
3967-
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar");
4034+
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
39684035
39694036
// surrogates
39704037
let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
3971-
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar");
4038+
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
39724039
}
39734040

39744041
#[test]

0 commit comments

Comments
 (0)