@@ -252,13 +252,28 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
252
252
/// Searcher state used when the pattern is a single `char`.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
    /// The string being searched.
    haystack: &'a str,
    // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
    // This invariant can be broken *within* next_match and next_match_back, however
    // they must exit with fingers on valid code point boundaries.

    /// `finger` is the current byte index of the forward search.
    /// Imagine that it exists before the byte at its index, i.e.
    /// `haystack[finger]` is the first byte of the slice we must inspect during
    /// forward searching.
    finger: usize,
    /// `finger_back` is the current byte index of the reverse search.
    /// Imagine that it exists after the byte at its index, i.e.
    /// `haystack[finger_back - 1]` is the last byte of the slice we must inspect during
    /// forward searching (and thus the first byte to be inspected when calling `next_back()`).
    finger_back: usize,
    /// The character being searched for.
    needle: char,

    // safety invariant: `utf8_size` must be in `1..=4`. It is used as the length of the
    // meaningful prefix of `utf8_encoded`, and `utf8_size - 1` is used as an *unchecked*
    // index into `utf8_encoded`, so both 0 and anything above 4 are unsound.
    /// The number of bytes `needle` takes up when encoded in utf8.
    utf8_size: usize,
    /// A utf8 encoded copy of the `needle`; only the first `utf8_size` bytes are meaningful.
    utf8_encoded: [u8; 4],
}
263
278
264
279
unsafe impl < ' a > Searcher < ' a > for CharSearcher < ' a > {
@@ -269,12 +284,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
269
284
#[ inline]
270
285
fn next ( & mut self ) -> SearchStep {
271
286
let old_finger = self . finger ;
272
- let slice = unsafe { self . haystack . get_unchecked ( old_finger..) } ;
287
+ let slice = unsafe { self . haystack . get_unchecked ( old_finger..self . haystack . len ( ) ) } ;
273
288
let mut iter = slice. chars ( ) ;
274
289
let old_len = iter. iter . len ( ) ;
275
290
if let Some ( ch) = iter. next ( ) {
276
291
// add byte offset of current character
277
- // without recalculating
292
+ // without re-encoding as utf-8
278
293
self . finger += old_len - iter. iter . len ( ) ;
279
294
if ch == self . needle {
280
295
SearchStep :: Match ( old_finger, self . finger )
@@ -287,25 +302,44 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
287
302
}
288
303
#[ inline]
289
304
fn next_match ( & mut self ) -> Option < ( usize , usize ) > {
290
- if let Some ( byte) = self . single_byte {
291
- let old_finger = self . finger ;
292
- let slice = unsafe { self . haystack . get_unchecked ( old_finger..) } ;
293
- let bytes = slice. as_bytes ( ) ;
294
- if let Some ( index) = memchr:: memchr ( byte, bytes) {
295
- // index is the index of a valid ASCII byte,
296
- // so we can add one to it
297
- self . finger += index + 1 ;
298
- Some ( ( self . finger - 1 , self . finger ) )
305
+ loop {
306
+ // get the haystack after the last character found
307
+ let bytes = if let Some ( slice) = self . haystack . as_bytes ( ) . get ( self . finger ..) {
308
+ slice
299
309
} else {
300
- None
301
- }
302
- } else {
303
- loop {
304
- match self . next ( ) {
305
- SearchStep :: Match ( a, b) => break Some ( ( a, b) ) ,
306
- SearchStep :: Done => break None ,
307
- _ => continue ,
310
+ return None ;
311
+ } ;
312
+ // the last byte of the utf8 encoded needle
313
+ let last_byte = unsafe { * self . utf8_encoded . get_unchecked ( self . utf8_size - 1 ) } ;
314
+ if let Some ( index) = memchr:: memchr ( last_byte, bytes) {
315
+ // The new finger is the index of the byte we found,
316
+ // plus one, since we memchr'd for the last byte of the character.
317
+ //
318
+ // Note that this doesn't always give us a finger on a UTF8 boundary.
319
+ // If we *didn't* find our character
320
+ // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
321
+ // We can't just skip to the next valid starting byte because a character like
322
+ // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
323
+ // the second byte when searching for the third.
324
+ //
325
+ // However, this is totally okay. While we have the invariant that
326
+ // self.finger is on a UTF8 boundary, this invariant is not relid upon
327
+ // within this method (it is relied upon in CharSearcher::next()).
328
+ //
329
+ // We only exit this method when we reach the end of the string, or if we
330
+ // find something. When we find something the `finger` will be set
331
+ // to a UTF8 boundary.
332
+ self . finger += index + 1 ;
333
+ let found_char = self . finger - self . utf8_size ;
334
+ if let Some ( slice) = self . haystack . as_bytes ( ) . get ( found_char..self . finger ) {
335
+ if slice == & self . utf8_encoded [ 0 ..self . utf8_size ] {
336
+ return Some ( ( found_char, self . finger ) ) ;
337
+ }
308
338
}
339
+ } else {
340
+ // found nothing, exit
341
+ self . finger = self . haystack . len ( ) ;
342
+ return None ;
309
343
}
310
344
}
311
345
}
@@ -322,7 +356,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
322
356
let old_len = iter. iter . len ( ) ;
323
357
if let Some ( ch) = iter. next_back ( ) {
324
358
// subtract byte offset of current character
325
- // without recalculating
359
+ // without re-encoding as utf-8
326
360
self . finger_back -= old_len - iter. iter . len ( ) ;
327
361
if ch == self . needle {
328
362
SearchStep :: Match ( self . finger_back , old_finger)
@@ -335,24 +369,47 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
335
369
}
336
370
#[ inline]
337
371
fn next_match_back ( & mut self ) -> Option < ( usize , usize ) > {
338
- if let Some ( byte) = self . single_byte {
339
- let old_finger = self . finger_back ;
340
- let slice = unsafe { self . haystack . slice_unchecked ( 0 , old_finger) } ;
341
- let bytes = slice. as_bytes ( ) ;
342
- if let Some ( index) = memchr:: memrchr ( byte, bytes) {
343
- // index is the index of a valid ASCII byte
344
- self . finger_back = index;
345
- Some ( ( self . finger_back , self . finger_back + 1 ) )
372
+ let haystack = self . haystack . as_bytes ( ) ;
373
+ loop {
374
+ // get the haystack up to but not including the last character searched
375
+ let bytes = if let Some ( slice) = haystack. get ( ..self . finger_back ) {
376
+ slice
346
377
} else {
347
- None
348
- }
349
- } else {
350
- loop {
351
- match self . next_back ( ) {
352
- SearchStep :: Match ( a, b) => break Some ( ( a, b) ) ,
353
- SearchStep :: Done => break None ,
354
- _ => continue ,
378
+ return None ;
379
+ } ;
380
+ // the last byte of the utf8 encoded needle
381
+ let last_byte = unsafe { * self . utf8_encoded . get_unchecked ( self . utf8_size - 1 ) } ;
382
+ if let Some ( index) = memchr:: memrchr ( last_byte, bytes) {
383
+ // memrchr will return the index of the byte we wish to
384
+ // find. In case of an ASCII character, this is indeed
385
+ // were we wish our new finger to be ("after" the found
386
+ // char in the paradigm of reverse iteration). For
387
+ // multibyte chars we need to skip down by the number of more
388
+ // bytes they have than ASCII
389
+ let found_char = index - ( self . utf8_size - 1 ) ;
390
+ if let Some ( slice) = haystack. get ( found_char..( found_char + self . utf8_size ) ) {
391
+ if slice == & self . utf8_encoded [ 0 ..self . utf8_size ] {
392
+ // move finger to before the character found (i.e. at its start index)
393
+ self . finger_back = found_char;
394
+ return Some ( ( self . finger_back , self . finger_back + self . utf8_size ) ) ;
395
+ }
355
396
}
397
+ // We can't use finger_back = index - size + 1 here. If we found the last char
398
+ // of a different-sized character (or the middle byte of a different character)
399
+ // we need to bump the finger_back down to `index`. This similarly makes
400
+ // `finger_back` have the potential to no longer be on a boundary,
401
+ // but this is OK since we only exit this function on a boundary
402
+ // or when the haystack has been searched completely.
403
+ //
404
+ // Unlike next_match this does not
405
+ // have the problem of repeated bytes in utf-8 because
406
+ // we're searching for the last byte, and we can only have
407
+ // found the last byte when searching in reverse.
408
+ self . finger_back = index;
409
+ } else {
410
+ self . finger_back = 0 ;
411
+ // found nothing, exit
412
+ return None ;
356
413
}
357
414
}
358
415
}
@@ -368,19 +425,16 @@ impl<'a> Pattern<'a> for char {
368
425
369
426
#[ inline]
370
427
fn into_searcher ( self , haystack : & ' a str ) -> Self :: Searcher {
371
- let single_byte = if self . len_utf8 ( ) == 1 {
372
- let mut storage = [ 0 ] ;
373
- self . encode_utf8 ( & mut storage) ;
374
- Some ( storage[ 0 ] )
375
- } else {
376
- None
377
- } ;
428
+ let mut utf8_encoded = [ 0 ; 4 ] ;
429
+ self . encode_utf8 ( & mut utf8_encoded) ;
430
+ let utf8_size = self . len_utf8 ( ) ;
378
431
CharSearcher {
379
432
haystack,
380
433
finger : 0 ,
381
434
finger_back : haystack. len ( ) ,
382
435
needle : self ,
383
- single_byte,
436
+ utf8_size,
437
+ utf8_encoded
384
438
}
385
439
}
386
440
0 commit comments