From c7ba2281f8da448dd4c1dbb2f8604232a82c031b Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Tue, 24 Jan 2012 01:29:45 -0800 Subject: [PATCH 1/2] Reorganizing str.rs to group and document strings better (no functional changes, though FIXMEs added) --- src/libcore/str.rs | 1566 ++++++++++++++++++++++++-------------------- 1 file changed, 863 insertions(+), 703 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index fa1aca29efa4c..2d4d1a7419d64 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -1,182 +1,130 @@ /* Module: str -String manipulation. -*/ - -export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len, - byte_len_range, index, - rindex, find, starts_with, ends_with, substr, slice, split, splitn, - split_str, split_func, split_char, lines, lines_any, words, - concat, connect, to_lower, to_upper, replace, char_slice, - trim_left, trim_right, trim, unshift_char, shift_char, pop_char, - push_char, is_utf8, from_chars, to_chars, char_len, char_len_range, - char_at, bytes, is_ascii, shift_byte, pop_byte, - unsafe_from_byte, unsafe_from_bytes, from_char, char_range_at, - from_bytes, - from_cstr, sbuf, as_buf, push_byte, utf8_char_width, safe_slice, - contains, iter_chars, chars_iter, bytes_iter, words_iter, lines_iter, - loop_chars, loop_chars_sub, escape, any, all, map, windowed; +String manipulation + +Strings are a packed UTF-8 representation of text, stored as null terminated +buffers of u8 bytes. Strings should be considered by character, +for correctness, but some UTF-8 unsafe functions are also provided. +For some heavy-duty uses, we recommend trying std::rope. 
+*/ + +export + // Creating a string + from_bytes, + unsafe_from_bytes, + unsafe_from_byte, + //push_utf8_bytes, + from_char, + from_chars, + from_cstr, + concat, + connect, + + // Adding things to and removing things from a string + push_char, + pop_char, + shift_char, + unshift_char, + push_byte, + //push_bytes, + pop_byte, + shift_byte, + trim_left, + trim_right, + trim, + + // Transforming strings + bytes, + to_chars, + substr, + char_slice, + slice, + safe_slice, + split, + splitn, + split_str, + split_func, + split_char, + lines, + lines_any, + words, + windowed, + to_lower, + to_upper, + replace, + escape, + + // Comparing strings + eq, + lteq, + hash, + + // Iterating through strings + loop_chars, + all, + any, + map, + bytes_iter, + iter_chars, + chars_iter, + words_iter, + lines_iter, + + // Searching + index, + rindex, + find, + contains, + starts_with, + ends_with, + + // String properties + is_ascii, + is_empty, + is_not_empty, + is_whitespace, + byte_len, + char_len, + + // Misc + // FIXME: perhaps some more of this section shouldn't be exported? + is_utf8, + char_len_range, + byte_len_range, + utf8_char_width, + char_range_at, + char_at, + loop_chars_sub, + escape_char, + as_buf, + //buf, + sbuf; + + #[abi = "cdecl"] native mod rustrt { fn rust_str_push(&s: str, ch: u8); } -/* -Function: eq - -Bytewise string equality -*/ -pure fn eq(&&a: str, &&b: str) -> bool { a == b } - -/* -Function: lteq - -Bytewise less than or equal -*/ -pure fn lteq(&&a: str, &&b: str) -> bool { a <= b } - -/* -Function: hash - -String hash function -*/ -fn hash(&&s: str) -> uint { - // djb hash. - // FIXME: replace with murmur. 
- - let u: uint = 5381u; - for c: u8 in s { u *= 33u; u += c as uint; } - ret u; -} - -// UTF-8 tags and ranges -const tag_cont_u8: u8 = 128u8; -const tag_cont: uint = 128u; -const max_one_b: uint = 128u; -const tag_two_b: uint = 192u; -const max_two_b: uint = 2048u; -const tag_three_b: uint = 224u; -const max_three_b: uint = 65536u; -const tag_four_b: uint = 240u; -const max_four_b: uint = 2097152u; -const tag_five_b: uint = 248u; -const max_five_b: uint = 67108864u; -const tag_six_b: uint = 252u; - -/* -Function: is_utf8 - -Determines if a vector uf bytes contains valid UTF-8 -*/ -fn is_utf8(v: [u8]) -> bool { - let i = 0u; - let total = vec::len::(v); - while i < total { - let chsize = utf8_char_width(v[i]); - if chsize == 0u { ret false; } - if i + chsize > total { ret false; } - i += 1u; - while chsize > 1u { - if v[i] & 192u8 != tag_cont_u8 { ret false; } - i += 1u; - chsize -= 1u; - } - } - ret true; -} - -/* -Function: is_ascii - -Determines if a string contains only ASCII characters -*/ -fn is_ascii(s: str) -> bool { - let i: uint = byte_len(s); - while i > 0u { i -= 1u; if s[i] & 128u8 != 0u8 { ret false; } } - ret true; -} - -/* -Predicate: is_empty - -Returns true if the string has length 0 -*/ -pure fn is_empty(s: str) -> bool { for c: u8 in s { ret false; } ret true; } - -/* -Predicate: is_not_empty - -Returns true if the string has length greater than 0 -*/ -pure fn is_not_empty(s: str) -> bool { !is_empty(s) } - -/* -Function: is_whitespace - -Returns true if the string contains only whitespace -*/ -fn is_whitespace(s: str) -> bool { - ret loop_chars(s, char::is_whitespace); -} - -/* -Function: byte_len - -Returns the length in bytes of a string -*/ -pure fn byte_len(s: str) -> uint unsafe { - let v: [u8] = unsafe::reinterpret_cast(s); - let vlen = vec::len(v); - unsafe::leak(v); - // There should always be a null terminator - assert (vlen > 0u); - ret vlen - 1u; -} +// FIXME: add pure to a lot of functions /* -Function: byte_len_range - -As 
byte_len but for a substring - -Parameters: -s - A string -byte_offset - The byte offset at which to start in the string -char_len - The number of chars (not bytes!) in the range - -Returns: -The number of bytes in the substring starting at `byte_offset` and -containing `char_len` chars. - -Safety note: - -This function fails if `byte_offset` or `char_len` do not represent -valid positions in `s` +Section: Creating a string */ -fn byte_len_range(s: str, byte_offset: uint, char_len: uint) -> uint { - let i = byte_offset; - let chars = 0u; - while chars < char_len { - let chsize = utf8_char_width(s[i]); - assert (chsize > 0u); - i += chsize; - chars += 1u; - } - ret i - byte_offset; -} /* -Function: bytes +Function: from_bytes -Converts a string to a vector of bytes. The result vector is not -null-terminated. +Safely convert a vector of bytes to a UTF-8 string, or error */ -fn bytes(s: str) -> [u8] unsafe { - let v = unsafe::reinterpret_cast(s); - let vcopy = vec::slice(v, 0u, vec::len(v) - 1u); - unsafe::leak(v); - ret vcopy; +fn from_bytes(vv: [u8]) -> result::t { + if is_utf8(vv) { + ret result::ok(unsafe_from_bytes(vv)); + } else { + ret result::err("vector doesn't contain valid UTF-8"); + } } /* @@ -184,6 +132,8 @@ Function: unsafe_from_bytes Converts a vector of bytes to a string. Does not verify that the vector contains valid UTF-8. + +// FIXME: remove? */ fn unsafe_from_bytes(v: [const u8]) -> str unsafe { let vcopy: [u8] = v + [0u8]; @@ -192,24 +142,13 @@ fn unsafe_from_bytes(v: [const u8]) -> str unsafe { ret scopy; } -/* -Function: from_bytes - -Safely convert a vector of bytes to a UTF-8 string, or error -*/ -fn from_bytes(vv: [u8]) -> result::t { - if is_utf8(vv) { - ret result::ok(unsafe_from_bytes(vv)); - } else { - ret result::err("vector doesn't contain valid UTF-8"); - } -} - /* Function: unsafe_from_byte Converts a byte to a string. Does not verify that the byte is valid UTF-8. 
+ +FIXME: rename to 'from_byte' */ fn unsafe_from_byte(u: u8) -> str { unsafe_from_bytes([u]) } @@ -265,237 +204,219 @@ fn from_chars(chs: [char]) -> str { } /* -Function: utf8_char_width +Function: from_cstr -Given a first byte, determine how many bytes are in this UTF-8 character +Create a Rust string from a null-terminated C string */ -pure fn utf8_char_width(b: u8) -> uint { - let byte: uint = b as uint; - if byte < 128u { ret 1u; } - if byte < 192u { - ret 0u; // Not a valid start byte - +unsafe fn from_cstr(cstr: sbuf) -> str { + let res = ""; + let start = cstr; + let curr = start; + let i = 0u; + while *curr != 0u8 { + push_byte(res, *curr); + i += 1u; + curr = ptr::offset(start, i); } - if byte < 224u { ret 2u; } - if byte < 240u { ret 3u; } - if byte < 248u { ret 4u; } - if byte < 252u { ret 5u; } - ret 6u; + ret res; } /* -Function: char_range_at +Function: concat -Pluck a character out of a string and return the index of the next character. -This function can be used to iterate over the unicode characters of a string. 
+Concatenate a vector of strings +*/ +fn concat(v: [str]) -> str { + let s: str = ""; + for ss: str in v { s += ss; } + ret s; +} -Example: -> let s = "中华Việt Nam"; -> let i = 0u; -> while i < str::byte_len(s) { -> let {ch, next} = str::char_range_at(s, i); -> std::io::println(#fmt("%u: %c",i,ch)); -> i = next; -> } +/* +Function: connect -Example output: +Concatenate a vector of strings, placing a given separator between each +*/ +fn connect(v: [str], sep: str) -> str { + let s: str = ""; + let first: bool = true; + for ss: str in v { + if first { first = false; } else { s += sep; } + s += ss; + } + ret s; +} - 0: 中 - 3: 华 - 6: V - 7: i - 8: ệ - 11: t - 12: - 13: N - 14: a - 15: m +/* +Section: Adding to and removing from a string +*/ -Parameters: +/* +Function: push_char -s - The string -i - The byte offset of the char to extract +Append a character to a string +*/ +fn push_char(&s: str, ch: char) { s += from_char(ch); } -Returns: +/* +Function: pop_char -A record {ch: char, next: uint} containing the char value and the byte -index of the next unicode character. +Remove the final character from a string and return it. Failure: -If `i` is greater than or equal to the length of the string. -If `i` is not the index of the beginning of a valid UTF-8 character. +If the string does not contain any characters. */ -fn char_range_at(s: str, i: uint) -> {ch: char, next: uint} { - let b0 = s[i]; - let w = utf8_char_width(b0); - assert (w != 0u); - if w == 1u { ret {ch: b0 as char, next: i + 1u}; } - let val = 0u; - let end = i + w; - let i = i + 1u; - while i < end { - let byte = s[i]; - assert (byte & 192u8 == tag_cont_u8); - val <<= 6u; - val += byte & 63u8 as uint; - i += 1u; - } - // Clunky way to get the right bits from the first byte. Uses two shifts, - // the first to clip off the marker bits at the left of the byte, and then - // a second (as uint) to get it to the right position. 
- val += (b0 << (w + 1u as u8) as uint) << ((w - 1u) * 6u - w - 1u); - ret {ch: val as char, next: i}; +fn pop_char(&s: str) -> char { + let end = byte_len(s); + while end > 0u && s[end - 1u] & 192u8 == tag_cont_u8 { end -= 1u; } + assert (end > 0u); + let ch = char_at(s, end - 1u); + s = substr(s, 0u, end - 1u); + ret ch; } /* -Function: char_at +Function: shift_char -Pluck a character out of a string +Remove the first character from a string and return it. + +Failure: + +If the string does not contain any characters. */ -fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; } +fn shift_char(&s: str) -> char { + let r = char_range_at(s, 0u); + s = substr(s, r.next, byte_len(s) - r.next); + ret r.ch; +} /* -Function: iter_chars +Function: unshift_char -Iterate over the characters in a string +Prepend a char to a string */ -fn iter_chars(s: str, it: fn(char)) { - let pos = 0u, len = byte_len(s); - while (pos < len) { - let {ch, next} = char_range_at(s, pos); - pos = next; - it(ch); - } -} +fn unshift_char(&s: str, ch: char) { s = from_char(ch) + s; } /* -Function: chars_iter +Function: push_byte -Iterate over the characters in a string +Appends a byte to a string. -FIXME: A synonym to iter_chars +This function is not unicode-safe. */ -fn chars_iter(ss: str, it: fn(char)) { - iter_chars(ss, it) -} +fn push_byte(&s: str, b: u8) { rustrt::rust_str_push(s, b); } /* -Function: bytes_iter +Function: push_bytes -Iterate over the bytes in a string +Appends a vector of bytes to a string. -FIXME: Should it really include the last byte '\0'? +This function is not unicode-safe. */ -fn bytes_iter(ss: str, it: fn(u8)) { - let pos = 0u; - let len = byte_len(ss); - - while (pos < len) { - it(ss[pos]); - pos += 1u; - } +fn push_bytes(&s: str, bytes: [u8]) { + for byte in bytes { rustrt::rust_str_push(s, byte); } } /* -Function: loop_chars - -Loop through a string, char by char - -Parameters: -s - A string to traverse. It may be empty. 
-it - A block to execute with each consecutive character of `s`. -Return `true` to continue, `false` to stop. +Function: pop_byte -Returns: +Removes the last byte from a string and returns it. -`true` If execution proceeded correctly, `false` if it was interrupted, -that is if `it` returned `false` at any point. - */ -fn loop_chars(s: str, it: fn(char) -> bool) -> bool{ - ret loop_chars_sub(s, 0u, byte_len(s), it); +This function is not unicode-safe. +*/ +fn pop_byte(&s: str) -> u8 { + let len = byte_len(s); + assert (len > 0u); + let b = s[len - 1u]; + s = substr(s, 0u, len - 1u); + ret b; } /* -Function: loop_chars_sub - -Loop through a substring, char by char +Function: shift_byte -Parameters: -s - A string to traverse. It may be empty. -byte_offset - The byte offset at which to start in the string. -byte_len - The number of bytes to traverse in the string -it - A block to execute with each consecutive character of `s`. -Return `true` to continue, `false` to stop. +Removes the first byte from a string and returns it. -Returns: +This function is not unicode-safe. +*/ +fn shift_byte(&s: str) -> u8 { + let len = byte_len(s); + assert (len > 0u); + let b = s[0]; + s = substr(s, 1u, len - 1u); + ret b; +} -`true` If execution proceeded correctly, `false` if it was interrupted, -that is if `it` returned `false` at any point. +/* +Function: trim_left -Safety note: -- This function does not check whether the substring is valid. -- This function fails if `byte_offset` or `byte_len` do not - represent valid positions inside `s` - */ -fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint, - it: fn(char) -> bool) -> bool { - let i = byte_offset; - let result = true; - while i < byte_len { - let {ch, next} = char_range_at(s, i); - if !it(ch) {result = false; break;} - i = next; - } - ret result; +Returns a string with leading whitespace removed. 
+*/ +fn trim_left(s: str) -> str { + fn count_whities(s: [char]) -> uint { + let i = 0u; + while i < vec::len(s) { + if !char::is_whitespace(s[i]) { break; } + i += 1u; + } + ret i; + } + let chars = to_chars(s); + let whities = count_whities(chars); + ret from_chars(vec::slice(chars, whities, vec::len(chars))); } - /* -Function: char_len +Function: trim_right -Count the number of unicode characters in a string +Returns a string with trailing whitespace removed. */ -fn char_len(s: str) -> uint { - ret char_len_range(s, 0u, byte_len(s)); +fn trim_right(s: str) -> str { + fn count_whities(s: [char]) -> uint { + let i = vec::len(s); + while 0u < i { + if !char::is_whitespace(s[i - 1u]) { break; } + i -= 1u; + } + ret i; + } + let chars = to_chars(s); + let whities = count_whities(chars); + ret from_chars(vec::slice(chars, 0u, whities)); } /* -Function: char_len_range +Function: trim -As char_len but for a slice of a string +Returns a string with leading and trailing whitespace removed +*/ +fn trim(s: str) -> str { trim_left(trim_right(s)) } -Parameters: - s - A valid string - byte_start - The position inside `s` where to start counting in bytes. - byte_len - The number of bytes of `s` to take into account. -Returns: - The number of Unicode characters in `s` in -segment [byte_start, byte_start+len( . +/* +Section: Transforming strings +*/ -Safety note: -- This function does not check whether the substring is valid. -- This function fails if `byte_offset` or `byte_len` do not - represent valid positions inside `s` +/* +Function: bytes + +Converts a string to a vector of bytes. The result vector is not +null-terminated. 
*/ -fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint { - let i = byte_start; - let len = 0u; - while i < byte_len { - let chsize = utf8_char_width(s[i]); - assert (chsize > 0u); - len += 1u; - i += chsize; - } - assert (i == byte_len); - ret len; +fn bytes(s: str) -> [u8] unsafe { + let v = unsafe::reinterpret_cast(s); + let vcopy = vec::slice(v, 0u, vec::len(v) - 1u); + unsafe::leak(v); + ret vcopy; } /* Function: to_chars Convert a string to a vector of characters + +FIXME: rename to 'chars' */ fn to_chars(s: str) -> [char] { let buf: [char] = []; @@ -510,175 +431,37 @@ fn to_chars(s: str) -> [char] { } /* -Function: push_char - -Append a character to a string -*/ -fn push_char(&s: str, ch: char) { s += from_char(ch); } +Function: substr -/* -Function: pop_char +Take a substring of another. Returns a string containing `len` bytes +starting at byte offset `begin`. -Remove the final character from a string and return it. +FIXME: This function is not unicode-safe. Failure: -If the string does not contain any characters. +If `begin` + `len` is is greater than the byte length of the string */ -fn pop_char(&s: str) -> char { - let end = byte_len(s); - while end > 0u && s[end - 1u] & 192u8 == tag_cont_u8 { end -= 1u; } - assert (end > 0u); - let ch = char_at(s, end - 1u); - s = substr(s, 0u, end - 1u); - ret ch; +fn substr(s: str, begin: uint, len: uint) -> str { + ret slice(s, begin, begin + len); } /* -Function: shift_char +Function: char_slice -Remove the first character from a string and return it. +Unicode-safe slice. Returns a slice of the given string containing +the characters in the range [`begin`..`end`). `begin` and `end` are +character indexes, not byte indexes. Failure: -If the string does not contain any characters. 
+- If begin is greater than end +- If end is greater than the character length of the string + +FIXME: rename to slice(), make faster by avoiding char conversion */ -fn shift_char(&s: str) -> char { - let r = char_range_at(s, 0u); - s = substr(s, r.next, byte_len(s) - r.next); - ret r.ch; -} - -/* -Function: unshift_char - -Prepend a char to a string -*/ -fn unshift_char(&s: str, ch: char) { s = from_char(ch) + s; } - -/* -Function: index - -Returns the index of the first matching byte. Returns -1 if -no match is found. -*/ -fn index(s: str, c: u8) -> int { - let i: int = 0; - for k: u8 in s { if k == c { ret i; } i += 1; } - ret -1; -} - -/* -Function: rindex - -Returns the index of the last matching byte. Returns -1 -if no match is found. -*/ -fn rindex(s: str, c: u8) -> int { - let n: int = byte_len(s) as int; - while n >= 0 { if s[n] == c { ret n; } n -= 1; } - ret n; -} - -/* -Function: find - -Finds the index of the first matching substring. -Returns -1 if `haystack` does not contain `needle`. - -Parameters: - -haystack - The string to look in -needle - The string to look for - -Returns: - -The index of the first occurance of `needle`, or -1 if not found. 
-*/ -fn find(haystack: str, needle: str) -> int { - let haystack_len: int = byte_len(haystack) as int; - let needle_len: int = byte_len(needle) as int; - if needle_len == 0 { ret 0; } - fn match_at(haystack: str, needle: str, i: int) -> bool { - let j: int = i; - for c: u8 in needle { if haystack[j] != c { ret false; } j += 1; } - ret true; - } - let i: int = 0; - while i <= haystack_len - needle_len { - if match_at(haystack, needle, i) { ret i; } - i += 1; - } - ret -1; -} - -/* -Function: contains - -Returns true if one string contains another - -Parameters: - -haystack - The string to look in -needle - The string to look for -*/ -fn contains(haystack: str, needle: str) -> bool { - 0 <= find(haystack, needle) -} - -/* -Function: starts_with - -Returns true if one string starts with another - -Parameters: - -haystack - The string to look in -needle - The string to look for -*/ -fn starts_with(haystack: str, needle: str) -> bool { - let haystack_len: uint = byte_len(haystack); - let needle_len: uint = byte_len(needle); - if needle_len == 0u { ret true; } - if needle_len > haystack_len { ret false; } - ret eq(substr(haystack, 0u, needle_len), needle); -} - -/* -Function: ends_with - -Returns true if one string ends with another - -haystack - The string to look in -needle - The string to look for -*/ -fn ends_with(haystack: str, needle: str) -> bool { - let haystack_len: uint = byte_len(haystack); - let needle_len: uint = byte_len(needle); - ret if needle_len == 0u { - true - } else if needle_len > haystack_len { - false - } else { - eq(substr(haystack, haystack_len - needle_len, needle_len), - needle) - }; -} - -/* -Function: substr - -Take a substring of another. Returns a string containing `len` bytes -starting at byte offset `begin`. - -This function is not unicode-safe. 
- -Failure: - -If `begin` + `len` is is greater than the byte length of the string -*/ -fn substr(s: str, begin: uint, len: uint) -> str { - ret slice(s, begin, begin + len); +fn char_slice(s: str, begin: uint, end: uint) -> str { + from_chars(vec::slice(to_chars(s), begin, end)) } /* @@ -693,6 +476,8 @@ Failure: - If begin is greater than end. - If end is greater than the length of the string. + +FIXME: rename to slice_byte or slice_byte_unsafe */ fn slice(s: str, begin: uint, end: uint) -> str unsafe { // FIXME: Typestate precondition @@ -710,6 +495,10 @@ fn slice(s: str, begin: uint, end: uint) -> str unsafe { /* Function: safe_slice + +FIXME: make sure char_slice / slice / byte_slice + have these preconditions and assertions +FIXME: this shouldn't be mistaken for a UTF-8 safe slice */ fn safe_slice(s: str, begin: uint, end: uint) : uint::le(begin, end) -> str { // would need some magic to make this a precondition @@ -717,56 +506,6 @@ fn safe_slice(s: str, begin: uint, end: uint) : uint::le(begin, end) -> str { ret slice(s, begin, end); } -/* -Function: shift_byte - -Removes the first byte from a string and returns it. - -This function is not unicode-safe. -*/ -fn shift_byte(&s: str) -> u8 { - let len = byte_len(s); - assert (len > 0u); - let b = s[0]; - s = substr(s, 1u, len - 1u); - ret b; -} - -/* -Function: pop_byte - -Removes the last byte from a string and returns it. - -This function is not unicode-safe. -*/ -fn pop_byte(&s: str) -> u8 { - let len = byte_len(s); - assert (len > 0u); - let b = s[len - 1u]; - s = substr(s, 0u, len - 1u); - ret b; -} - -/* -Function: push_byte - -Appends a byte to a string. - -This function is not unicode-safe. -*/ -fn push_byte(&s: str, b: u8) { rustrt::rust_str_push(s, b); } - -/* -Function: push_bytes - -Appends a vector of bytes to a string. - -This function is not unicode-safe. 
-*/ -fn push_bytes(&s: str, bytes: [u8]) { - for byte in bytes { rustrt::rust_str_push(s, byte); } -} - /* Function: split @@ -801,6 +540,8 @@ Split a string at each occurance of a given separator up to count times. Returns: A vector containing all the strings between each occurance of the separator + +FIXME: rename to 'splitn_char' */ fn splitn(s: str, sep: u8, count: uint) -> [str] { let v = []; @@ -864,7 +605,7 @@ Function: split_func Splits a string into substrings using a function (unicode safe) -FIXME: will be renamed to split. +FIXME: rename to 'split' */ fn split_func(ss: str, sepfn: fn(cc: char)->bool) -> [str] { let vv: [str] = []; @@ -929,53 +670,32 @@ fn words(ss: str) -> [str] { } /* -Function: words_iter - -Apply a function to each word -*/ -fn words_iter(ss: str, ff: fn(&&str)) { - vec::iter(words(ss), ff) -} - -/* -Function: lines_iter - -Apply a function to each lines (by '\n') -*/ -fn lines_iter(ss: str, ff: fn(&&str)) { - vec::iter(lines(ss), ff) -} - -/* -Function: concat +Function: windowed -Concatenate a vector of strings +Create a vector of substrings of size `nn` */ -fn concat(v: [str]) -> str { - let s: str = ""; - for ss: str in v { s += ss; } - ret s; -} +fn windowed(nn: uint, ss: str) -> [str] { + let ww = []; + let len = str::char_len(ss); -/* -Function: connect + assert 1u <= nn; -Concatenate a vector of strings, placing a given separator between each -*/ -fn connect(v: [str], sep: str) -> str { - let s: str = ""; - let first: bool = true; - for ss: str in v { - if first { first = false; } else { s += sep; } - s += ss; + let ii = 0u; + while ii+nn <= len { + let w = char_slice( ss, ii, ii+nn ); + vec::push(ww,w); + ii += 1u; } - ret s; + + ret ww; } /* Function: to_lower Convert a string to lowercase + +FIXME: rewrite with map */ fn to_lower(s: str) -> str { let outstr = ""; @@ -984,10 +704,13 @@ fn to_lower(s: str) -> str { } ret outstr; } + /* Function: to_upper Convert a string to uppercase + +FIXME: rewrite with map */ fn 
to_upper(s: str) -> str { let outstr = ""; @@ -1031,145 +754,72 @@ fn replace(s: str, from: str, to: str) : is_not_empty(from) -> str { } } -// FIXME: Also not efficient /* -Function: char_slice - -Unicode-safe slice. Returns a slice of the given string containing -the characters in the range [`begin`..`end`). `begin` and `end` are -character indexes, not byte indexes. - -Failure: +Function: escape -- If begin is greater than end -- If end is greater than the character length of the string +Escapes special characters inside the string, making it safe for transfer. */ -fn char_slice(s: str, begin: uint, end: uint) -> str { - from_chars(vec::slice(to_chars(s), begin, end)) +fn escape(s: str) -> str { + let r = ""; + loop_chars(s, { |c| r += escape_char(c); true }); + r } /* -Function: trim_left - -Returns a string with leading whitespace removed. +Section: Comparing strings */ -fn trim_left(s: str) -> str { - fn count_whities(s: [char]) -> uint { - let i = 0u; - while i < vec::len(s) { - if !char::is_whitespace(s[i]) { break; } - i += 1u; - } - ret i; - } - let chars = to_chars(s); - let whities = count_whities(chars); - ret from_chars(vec::slice(chars, whities, vec::len(chars))); -} /* -Function: trim_right +Function: eq -Returns a string with trailing whitespace removed. +Bytewise string equality */ -fn trim_right(s: str) -> str { - fn count_whities(s: [char]) -> uint { - let i = vec::len(s); - while 0u < i { - if !char::is_whitespace(s[i - 1u]) { break; } - i -= 1u; - } - ret i; - } - let chars = to_chars(s); - let whities = count_whities(chars); - ret from_chars(vec::slice(chars, 0u, whities)); -} +pure fn eq(&&a: str, &&b: str) -> bool { a == b } /* -Function: trim +Function: lteq -Returns a string with leading and trailing whitespace removed +Bytewise less than or equal */ -fn trim(s: str) -> str { trim_left(trim_right(s)) } +pure fn lteq(&&a: str, &&b: str) -> bool { a <= b } /* -Type: sbuf +Function: hash -An unsafe buffer of bytes. 
Corresponds to a C char pointer. +String hash function */ -type sbuf = *u8; +fn hash(&&s: str) -> uint { + // djb hash. + // FIXME: replace with murmur. -// NB: This is intentionally unexported because it's easy to misuse (there's -// no guarantee that the string is rooted). Instead, use as_buf below. -unsafe fn buf(s: str) -> sbuf { - let saddr = ptr::addr_of(s); - let vaddr: *[u8] = unsafe::reinterpret_cast(saddr); - let buf = vec::to_ptr(*vaddr); - ret buf; + let u: uint = 5381u; + for c: u8 in s { u *= 33u; u += c as uint; } + ret u; } /* -Function: as_buf - -Work with the byte buffer of a string. Allows for unsafe manipulation -of strings, which is useful for native interop. - -Example: - -> let s = str::as_buf("PATH", { |path_buf| libc::getenv(path_buf) }); - +Section: Iterating through strings */ -fn as_buf(s: str, f: fn(sbuf) -> T) -> T unsafe { - let buf = buf(s); f(buf) -} /* -Function: from_cstr +Function: loop_chars -Create a Rust string from a null-terminated C string -*/ -unsafe fn from_cstr(cstr: sbuf) -> str { - let res = ""; - let start = cstr; - let curr = start; - let i = 0u; - while *curr != 0u8 { - push_byte(res, *curr); - i += 1u; - curr = ptr::offset(start, i); - } - ret res; -} +Loop through a string, char by char -/* -Function: escape_char +Parameters: +s - A string to traverse. It may be empty. +it - A block to execute with each consecutive character of `s`. +Return `true` to continue, `false` to stop. -Escapes a single character. -*/ -fn escape_char(c: char) -> str { - alt c { - '"' { "\\\"" } - '\\' { "\\\\" } - '\n' { "\\n" } - '\t' { "\\t" } - '\r' { "\\r" } - // FIXME: uncomment this when extfmt is moved to core - // in a snapshot. - // '\x00' to '\x1f' { #fmt["\\x%02x", c as uint] } - v { from_char(c) } - } -} +Returns: -/* -Function: escape +`true` If execution proceeded correctly, `false` if it was interrupted, +that is if `it` returned `false` at any point. 
-Escapes special characters inside the string, making it safe for transfer. -*/ -fn escape(s: str) -> str { - let r = ""; - loop_chars(s, { |c| r += escape_char(c); true }); - r +FIXME: rename to 'chars_loop' (change? currently a synonym to 'all') + */ +fn loop_chars(s: str, it: fn(char) -> bool) -> bool{ + ret loop_chars_sub(s, 0u, byte_len(s), it); } /* @@ -1210,32 +860,542 @@ fn map(ss: str, ff: fn(char) -> char) -> str { } /* -Function: windowed +Function: bytes_iter -Create a vector of substrings of size `nn` +Iterate over the bytes in a string + +FIXME: Should it really include the last byte '\0'? */ -fn windowed(nn: uint, ss: str) -> [str] { - let ww = []; - let len = str::char_len(ss); +fn bytes_iter(ss: str, it: fn(u8)) { + let pos = 0u; + let len = byte_len(ss); - assert 1u <= nn; + while (pos < len) { + it(ss[pos]); + pos += 1u; + } +} - let ii = 0u; - while ii+nn <= len { - let w = char_slice( ss, ii, ii+nn ); - vec::push(ww,w); - ii += 1u; +/* +Function: iter_chars + +Iterate over the characters in a string + +FIXME: rename to 'chars_iter' +*/ +fn iter_chars(s: str, it: fn(char)) { + let pos = 0u, len = byte_len(s); + while (pos < len) { + let {ch, next} = char_range_at(s, pos); + pos = next; + it(ch); } +} - ret ww; +/* +Function: chars_iter + +Iterate over the characters in a string + +FIXME: A synonym to iter_chars +*/ +fn chars_iter(ss: str, it: fn(char)) { + iter_chars(ss, it) +} + +/* +Function: words_iter + +Apply a function to each word +*/ +fn words_iter(ss: str, ff: fn(&&str)) { + vec::iter(words(ss), ff) +} + +/* +Function: lines_iter + +Apply a function to each lines (by '\n') +*/ +fn lines_iter(ss: str, ff: fn(&&str)) { + vec::iter(lines(ss), ff) +} + +// FIXME: ADD split_char_iter +// FIXME: ADD splitn_char_iter + +/* +Section: Searching +*/ + +/* +Function: index + +Returns the index of the first matching byte. Returns -1 if +no match is found. 
+*/ +fn index(s: str, c: u8) -> int { + let i: int = 0; + for k: u8 in s { if k == c { ret i; } i += 1; } + ret -1; +} + +/* +Function: rindex + +Returns the index of the last matching byte. Returns -1 +if no match is found. +*/ +fn rindex(s: str, c: u8) -> int { + let n: int = byte_len(s) as int; + while n >= 0 { if s[n] == c { ret n; } n -= 1; } + ret n; +} + +/* +Function: find + +Finds the index of the first matching substring. +Returns -1 if `haystack` does not contain `needle`. + +Parameters: + +haystack - The string to look in +needle - The string to look for + +Returns: + +The index of the first occurance of `needle`, or -1 if not found. +*/ +fn find(haystack: str, needle: str) -> int { + let haystack_len: int = byte_len(haystack) as int; + let needle_len: int = byte_len(needle) as int; + if needle_len == 0 { ret 0; } + fn match_at(haystack: str, needle: str, i: int) -> bool { + let j: int = i; + for c: u8 in needle { if haystack[j] != c { ret false; } j += 1; } + ret true; + } + let i: int = 0; + while i <= haystack_len - needle_len { + if match_at(haystack, needle, i) { ret i; } + i += 1; + } + ret -1; } +/* +Function: contains + +Returns true if one string contains another + +Parameters: + +haystack - The string to look in +needle - The string to look for +*/ +fn contains(haystack: str, needle: str) -> bool { + 0 <= find(haystack, needle) +} + +/* +Function: starts_with + +Returns true if one string starts with another + +Parameters: + +haystack - The string to look in +needle - The string to look for +*/ +fn starts_with(haystack: str, needle: str) -> bool { + let haystack_len: uint = byte_len(haystack); + let needle_len: uint = byte_len(needle); + if needle_len == 0u { ret true; } + if needle_len > haystack_len { ret false; } + ret eq(substr(haystack, 0u, needle_len), needle); +} + +/* +Function: ends_with + +Returns true if one string ends with another + +haystack - The string to look in +needle - The string to look for +*/ +fn ends_with(haystack: str, 
needle: str) -> bool { + let haystack_len: uint = byte_len(haystack); + let needle_len: uint = byte_len(needle); + ret if needle_len == 0u { + true + } else if needle_len > haystack_len { + false + } else { + eq(substr(haystack, haystack_len - needle_len, needle_len), + needle) + }; +} + +/* +Section: String properties +*/ + +/* +Function: is_ascii + +Determines if a string contains only ASCII characters + +FIXME: possibly implement using char::is_ascii when it exists +*/ +fn is_ascii(s: str) -> bool { + let i: uint = byte_len(s); + while i > 0u { i -= 1u; if s[i] & 128u8 != 0u8 { ret false; } } + ret true; +} + +/* +Predicate: is_empty + +Returns true if the string has length 0 +*/ +pure fn is_empty(s: str) -> bool { for c: u8 in s { ret false; } ret true; } + +/* +Predicate: is_not_empty + +Returns true if the string has length greater than 0 +*/ +pure fn is_not_empty(s: str) -> bool { !is_empty(s) } + +/* +Function: is_whitespace + +Returns true if the string contains only whitespace +*/ +fn is_whitespace(s: str) -> bool { + ret loop_chars(s, char::is_whitespace); +} + +/* +Function: byte_len + +Returns the length in bytes of a string + +FIXME: rename to 'len_bytes'? +*/ +pure fn byte_len(s: str) -> uint unsafe { + let v: [u8] = unsafe::reinterpret_cast(s); + let vlen = vec::len(v); + unsafe::leak(v); + // There should always be a null terminator + assert (vlen > 0u); + ret vlen - 1u; +} + +/* +Function: char_len + +Count the number of unicode characters in a string + +FIXME: rename to 'len_chars'? 
+*/ +fn char_len(s: str) -> uint { + ret char_len_range(s, 0u, byte_len(s)); +} + +/* +Section: Misc +*/ + +/* +Function: is_utf8 + +Determines if a vector of bytes contains valid UTF-8 +*/ +fn is_utf8(v: [u8]) -> bool { + let i = 0u; + let total = vec::len::(v); + while i < total { + let chsize = utf8_char_width(v[i]); + if chsize == 0u { ret false; } + if i + chsize > total { ret false; } + i += 1u; + while chsize > 1u { + if v[i] & 192u8 != tag_cont_u8 { ret false; } + i += 1u; + chsize -= 1u; + } + } + ret true; +} + +/* +Function: char_len_range + +As char_len but for a slice of a string + +Parameters: + s - A valid string + byte_start - The position inside `s` where to start counting in bytes. + byte_len - The number of bytes of `s` to take into account. + +Returns: + The number of Unicode characters in `s` in +the segment [byte_start, byte_start + byte_len). + +Safety note: +- This function does not check whether the substring is valid. +- This function fails if `byte_start` or `byte_len` do not + represent valid positions inside `s` + +FIXME: rename to 'substr_len_chars' +*/ +fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint { + let i = byte_start; + let len = 0u; + while i < byte_len { + let chsize = utf8_char_width(s[i]); + assert (chsize > 0u); + len += 1u; + i += chsize; + } + assert (i == byte_len); + ret len; +} + +/* +Function: byte_len_range + +As byte_len but for a substring + +Parameters: +s - A string +byte_offset - The byte offset at which to start in the string +char_len - The number of chars (not bytes!) in the range + +Returns: +The number of bytes in the substring starting at `byte_offset` and +containing `char_len` chars. 
+ +Safety note: + +This function fails if `byte_offset` or `char_len` do not represent +valid positions in `s` + +FIXME: rename to 'substr_len_bytes' +*/ +fn byte_len_range(s: str, byte_offset: uint, char_len: uint) -> uint { + let i = byte_offset; + let chars = 0u; + while chars < char_len { + let chsize = utf8_char_width(s[i]); + assert (chsize > 0u); + i += chsize; + chars += 1u; + } + ret i - byte_offset; +} + +/* +Function: utf8_char_width + +Given a first byte, determine how many bytes are in this UTF-8 character + +*/ +pure fn utf8_char_width(b: u8) -> uint { + let byte: uint = b as uint; + if byte < 128u { ret 1u; } + if byte < 192u { + ret 0u; // Not a valid start byte + + } + if byte < 224u { ret 2u; } + if byte < 240u { ret 3u; } + if byte < 248u { ret 4u; } + if byte < 252u { ret 5u; } + ret 6u; +} + +/* +Function: char_range_at + +Pluck a character out of a string and return the index of the next character. +This function can be used to iterate over the unicode characters of a string. + +Example: +> let s = "中华Việt Nam"; +> let i = 0u; +> while i < str::byte_len(s) { +> let {ch, next} = str::char_range_at(s, i); +> std::io::println(#fmt("%u: %c",i,ch)); +> i = next; +> } + +Example output: + + 0: 中 + 3: 华 + 6: V + 7: i + 8: ệ + 11: t + 12: + 13: N + 14: a + 15: m + +Parameters: + +s - The string +i - The byte offset of the char to extract + +Returns: + +A record {ch: char, next: uint} containing the char value and the byte +index of the next unicode character. + +Failure: + +If `i` is greater than or equal to the length of the string. +If `i` is not the index of the beginning of a valid UTF-8 character. 
+*/ +fn char_range_at(s: str, i: uint) -> {ch: char, next: uint} { + let b0 = s[i]; + let w = utf8_char_width(b0); + assert (w != 0u); + if w == 1u { ret {ch: b0 as char, next: i + 1u}; } + let val = 0u; + let end = i + w; + let i = i + 1u; + while i < end { + let byte = s[i]; + assert (byte & 192u8 == tag_cont_u8); + val <<= 6u; + val += byte & 63u8 as uint; + i += 1u; + } + // Clunky way to get the right bits from the first byte. Uses two shifts, + // the first to clip off the marker bits at the left of the byte, and then + // a second (as uint) to get it to the right position. + val += (b0 << (w + 1u as u8) as uint) << ((w - 1u) * 6u - w - 1u); + ret {ch: val as char, next: i}; +} + +/* +Function: char_at + +Pluck a character out of a string +*/ +fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; } + +/* +Function: loop_chars_sub + +Loop through a substring, char by char + +Parameters: +s - A string to traverse. It may be empty. +byte_offset - The byte offset at which to start in the string. +byte_len - The number of bytes to traverse in the string +it - A block to execute with each consecutive character of `s`. +Return `true` to continue, `false` to stop. + +Returns: + +`true` If execution proceeded correctly, `false` if it was interrupted, +that is if `it` returned `false` at any point. + +Safety note: +- This function does not check whether the substring is valid. +- This function fails if `byte_offset` or `byte_len` do not + represent valid positions inside `s` + +FIXME: rename to 'substr_all' + */ +fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint, + it: fn(char) -> bool) -> bool { + let i = byte_offset; + let result = true; + while i < byte_len { + let {ch, next} = char_range_at(s, i); + if !it(ch) {result = false; break;} + i = next; + } + ret result; +} + + +/* +Function: escape_char + +Escapes a single character. 
+*/ +fn escape_char(c: char) -> str { + alt c { + '"' { "\\\"" } + '\\' { "\\\\" } + '\n' { "\\n" } + '\t' { "\\t" } + '\r' { "\\r" } + // FIXME: uncomment this when extfmt is moved to core + // in a snapshot. + // '\x00' to '\x1f' { #fmt["\\x%02x", c as uint] } + v { from_char(c) } + } +} + +// UTF-8 tags and ranges +const tag_cont_u8: u8 = 128u8; +const tag_cont: uint = 128u; +const max_one_b: uint = 128u; +const tag_two_b: uint = 192u; +const max_two_b: uint = 2048u; +const tag_three_b: uint = 224u; +const max_three_b: uint = 65536u; +const tag_four_b: uint = 240u; +const max_four_b: uint = 2097152u; +const tag_five_b: uint = 248u; +const max_five_b: uint = 67108864u; +const tag_six_b: uint = 252u; + +// NB: This is intentionally unexported because it's easy to misuse (there's +// no guarantee that the string is rooted). Instead, use as_buf below. +unsafe fn buf(s: str) -> sbuf { + let saddr = ptr::addr_of(s); + let vaddr: *[u8] = unsafe::reinterpret_cast(saddr); + let buf = vec::to_ptr(*vaddr); + ret buf; +} + +/* +Function: as_buf + +Work with the byte buffer of a string. Allows for unsafe manipulation +of strings, which is useful for native interop. + +Example: + +> let s = str::as_buf("PATH", { |path_buf| libc::getenv(path_buf) }); + +*/ +fn as_buf(s: str, f: fn(sbuf) -> T) -> T unsafe { + let buf = buf(s); f(buf) +} + +/* +Type: sbuf + +An unsafe buffer of bytes. Corresponds to a C char pointer. 
+*/ +type sbuf = *u8; + + #[cfg(test)] mod tests { #[test] - fn test_eq() { - assert (eq("", "")); + fn test_eq() { assert (eq("", "")); assert (eq("foo", "foo")); assert (!eq("foo", "bar")); } From 3c81aa9c2b86d7d5f9bf498c8835c1f7386ecddb Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Tue, 24 Jan 2012 01:34:18 -0800 Subject: [PATCH 2/2] fix a typo --- src/libcore/str.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 2d4d1a7419d64..823f7cc78c6b2 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -1395,7 +1395,8 @@ type sbuf = *u8; mod tests { #[test] - fn test_eq() { assert (eq("", "")); + fn test_eq() { + assert (eq("", "")); assert (eq("foo", "foo")); assert (!eq("foo", "bar")); }