Skip to content

Commit 75c07a3

Browse files
committed
Add memchr search support for multibyte characters
1 parent f865164 commit 75c07a3

File tree

1 file changed

+102
-48
lines changed

1 file changed

+102
-48
lines changed

src/libcore/str/pattern.rs

Lines changed: 102 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -252,13 +252,28 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
252252
#[derive(Clone, Debug)]
253253
pub struct CharSearcher<'a> {
254254
haystack: &'a str,
255-
// invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
255+
// safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
256+
// This invariant can be broken *within* next_match and next_match_back, however
257+
// they must exit with fingers on valid code point boundaries.
258+
259+
/// `finger` is the current byte index of the forward search.
260+
/// Imagine that it exists before the byte at its index, i.e.
261+
/// haystack[finger] is the first byte of the slice we must inspect during
262+
/// forward searching
256263
finger: usize,
264+
/// `finger_back` is the current byte index of the reverse search.
265+
/// Imagine that it exists after the byte at its index, i.e.
266+
/// haystack[finger_back - 1] is the last byte of the slice we must inspect during
267+
/// forward searching (and thus the first byte to be inspected when calling next_back())
257268
finger_back: usize,
269+
/// The character being searched for
258270
needle: char,
259-
// For ascii chars
260-
// invariant: must be an ASCII byte (no high bit)
261-
single_byte: Option<u8>,
271+
272+
// safety invariant: `utf8_size` must be less than 5
273+
/// The number of bytes `needle` takes up when encoded in utf8
274+
utf8_size: usize,
275+
/// A utf8 encoded copy of the `needle`
276+
utf8_encoded: [u8; 4],
262277
}
263278

264279
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
@@ -269,12 +284,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
269284
#[inline]
270285
fn next(&mut self) -> SearchStep {
271286
let old_finger = self.finger;
272-
let slice = unsafe { self.haystack.get_unchecked(old_finger..) };
287+
let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) };
273288
let mut iter = slice.chars();
274289
let old_len = iter.iter.len();
275290
if let Some(ch) = iter.next() {
276291
// add byte offset of current character
277-
// without recalculating
292+
// without re-encoding as utf-8
278293
self.finger += old_len - iter.iter.len();
279294
if ch == self.needle {
280295
SearchStep::Match(old_finger, self.finger)
@@ -287,25 +302,44 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
287302
}
288303
#[inline]
289304
fn next_match(&mut self) -> Option<(usize, usize)> {
290-
if let Some(byte) = self.single_byte {
291-
let old_finger = self.finger;
292-
let slice = unsafe { self.haystack.get_unchecked(old_finger..) };
293-
let bytes = slice.as_bytes();
294-
if let Some(index) = memchr::memchr(byte, bytes) {
295-
// index is the index of a valid ASCII byte,
296-
// so we can add one to it
297-
self.finger += index + 1;
298-
Some((self.finger - 1, self.finger))
305+
loop {
306+
// get the haystack after the last character found
307+
let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) {
308+
slice
299309
} else {
300-
None
301-
}
302-
} else {
303-
loop {
304-
match self.next() {
305-
SearchStep::Match(a, b) => break Some((a, b)),
306-
SearchStep::Done => break None,
307-
_ => continue,
310+
return None;
311+
};
312+
// the last byte of the utf8 encoded needle
313+
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
314+
if let Some(index) = memchr::memchr(last_byte, bytes) {
315+
// The new finger is the index of the byte we found,
316+
// plus one, since we memchr'd for the last byte of the character.
317+
//
318+
// Note that this doesn't always give us a finger on a UTF8 boundary.
319+
// If we *didn't* find our character
320+
// we may have indexed to the non-last byte of a 3-byte or 4-byte character.
321+
// We can't just skip to the next valid starting byte because a character like
322+
// ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
323+
// the second byte when searching for the third.
324+
//
325+
// However, this is totally okay. While we have the invariant that
326+
// self.finger is on a UTF8 boundary, this invariant is not relied upon
327+
// within this method (it is relied upon in CharSearcher::next()).
328+
//
329+
// We only exit this method when we reach the end of the string, or if we
330+
// find something. When we find something the `finger` will be set
331+
// to a UTF8 boundary.
332+
self.finger += index + 1;
333+
let found_char = self.finger - self.utf8_size;
334+
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
335+
if slice == &self.utf8_encoded[0..self.utf8_size] {
336+
return Some((found_char, self.finger));
337+
}
308338
}
339+
} else {
340+
// found nothing, exit
341+
self.finger = self.haystack.len();
342+
return None;
309343
}
310344
}
311345
}
@@ -322,7 +356,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
322356
let old_len = iter.iter.len();
323357
if let Some(ch) = iter.next_back() {
324358
// subtract byte offset of current character
325-
// without recalculating
359+
// without re-encoding as utf-8
326360
self.finger_back -= old_len - iter.iter.len();
327361
if ch == self.needle {
328362
SearchStep::Match(self.finger_back, old_finger)
@@ -335,24 +369,47 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
335369
}
336370
#[inline]
337371
fn next_match_back(&mut self) -> Option<(usize, usize)> {
338-
if let Some(byte) = self.single_byte {
339-
let old_finger = self.finger_back;
340-
let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) };
341-
let bytes = slice.as_bytes();
342-
if let Some(index) = memchr::memrchr(byte, bytes) {
343-
// index is the index of a valid ASCII byte
344-
self.finger_back = index;
345-
Some((self.finger_back, self.finger_back + 1))
372+
let haystack = self.haystack.as_bytes();
373+
loop {
374+
// get the haystack up to but not including the last character searched
375+
let bytes = if let Some(slice) = haystack.get(..self.finger_back) {
376+
slice
346377
} else {
347-
None
348-
}
349-
} else {
350-
loop {
351-
match self.next_back() {
352-
SearchStep::Match(a, b) => break Some((a, b)),
353-
SearchStep::Done => break None,
354-
_ => continue,
378+
return None;
379+
};
380+
// the last byte of the utf8 encoded needle
381+
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
382+
if let Some(index) = memchr::memrchr(last_byte, bytes) {
383+
// memrchr will return the index of the byte we wish to
384+
// find. In case of an ASCII character, this is indeed
385+
// where we wish our new finger to be ("after" the found
386+
// char in the paradigm of reverse iteration). For
387+
// multibyte chars we need to skip down by the number of extra
388+
// bytes they occupy beyond the single byte of an ASCII char
389+
let found_char = index - (self.utf8_size - 1);
390+
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
391+
if slice == &self.utf8_encoded[0..self.utf8_size] {
392+
// move finger to before the character found (i.e. at its start index)
393+
self.finger_back = found_char;
394+
return Some((self.finger_back, self.finger_back + self.utf8_size));
395+
}
355396
}
397+
// We can't use finger_back = index - size + 1 here. If we found the last byte
398+
// of a different-sized character (or the middle byte of a different character)
399+
// we need to bump the finger_back down to `index`. This similarly makes
400+
// `finger_back` have the potential to no longer be on a boundary,
401+
// but this is OK since we only exit this function on a boundary
402+
// or when the haystack has been searched completely.
403+
//
404+
// Unlike next_match this does not
405+
// have the problem of repeated bytes in utf-8 because
406+
// we're searching for the last byte, and we can only have
407+
// found the last byte when searching in reverse.
408+
self.finger_back = index;
409+
} else {
410+
self.finger_back = 0;
411+
// found nothing, exit
412+
return None;
356413
}
357414
}
358415
}
@@ -368,19 +425,16 @@ impl<'a> Pattern<'a> for char {
368425

369426
#[inline]
370427
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
371-
let single_byte = if self.len_utf8() == 1 {
372-
let mut storage = [0];
373-
self.encode_utf8(&mut storage);
374-
Some(storage[0])
375-
} else {
376-
None
377-
};
428+
let mut utf8_encoded = [0; 4];
429+
self.encode_utf8(&mut utf8_encoded);
430+
let utf8_size = self.len_utf8();
378431
CharSearcher {
379432
haystack,
380433
finger: 0,
381434
finger_back: haystack.len(),
382435
needle: self,
383-
single_byte,
436+
utf8_size,
437+
utf8_encoded
384438
}
385439
}
386440

0 commit comments

Comments
 (0)