@@ -19,6 +19,7 @@ use memchr::{memchr, memchr2, memchr3};
19
19
use char_utf8:: encode_utf8;
20
20
use inst:: { Insts , Inst , InstBytes , InstRanges } ;
21
21
22
+ #[ derive( Clone , Eq , PartialEq ) ]
22
23
pub struct AlternateLiterals {
23
24
at_match : bool ,
24
25
literals : Vec < Vec < u8 > > ,
@@ -29,10 +30,11 @@ impl AlternateLiterals {
29
30
if self . literals . is_empty ( ) {
30
31
Literals :: empty ( )
31
32
} else {
32
- let at_match = self . at_match ;
33
+ let alts = self . unambiguous ( ) ;
34
+ let at_match = alts. at_match ;
33
35
Literals {
34
36
at_match : at_match,
35
- matcher : LiteralMatcher :: new ( self ) ,
37
+ matcher : LiteralMatcher :: new ( alts ) ,
36
38
}
37
39
}
38
40
}
@@ -53,6 +55,13 @@ impl AlternateLiterals {
53
55
self . literals . len ( ) >= 1 && self . literals . iter ( ) . all ( |s| s. len ( ) == 1 )
54
56
}
55
57
58
+ fn all_same_length ( & self ) -> bool {
59
+ if self . literals . is_empty ( ) {
60
+ return true ;
61
+ }
62
+ self . literals . iter ( ) . all ( |s| s. len ( ) == self . literals [ 0 ] . len ( ) )
63
+ }
64
+
56
65
fn is_one_literal ( & self ) -> bool {
57
66
self . literals . len ( ) == 1
58
67
}
@@ -123,6 +132,71 @@ impl AlternateLiterals {
123
132
}
124
133
}
125
134
}
135
+
136
+ /// Returns a new set of alternate literals that are guaranteed to be
137
+ /// unambiguous.
138
+ ///
139
+ /// State differently, the set of literals returned is guaranteed to never
140
+ /// result in any overlapping matches.
141
+ ///
142
+ /// Duplicate literals are dropped. Literals that are otherwise distinct
143
+ /// will be possibly truncated.
144
+ fn unambiguous ( & self ) -> AlternateLiterals {
145
+ fn position ( needle : & [ u8 ] , mut haystack : & [ u8 ] ) -> Option < usize > {
146
+ let mut i = 0 ;
147
+ while haystack. len ( ) >= needle. len ( ) {
148
+ if needle == & haystack[ ..needle. len ( ) ] {
149
+ return Some ( i) ;
150
+ }
151
+ i += 1 ;
152
+ haystack = & haystack[ 1 ..] ;
153
+ }
154
+ None
155
+ }
156
+
157
+ // This function is a bit gratuitous and allocation heavy, but in
158
+ // general, we limit the number of alternates to be pretty small.
159
+
160
+ if self . all_same_length ( ) {
161
+ return self . clone ( ) ;
162
+ }
163
+ let mut new = AlternateLiterals {
164
+ at_match : self . at_match ,
165
+ literals : Vec :: with_capacity ( self . literals . len ( ) ) ,
166
+ } ;
167
+ ' OUTER :
168
+ for lit1 in & self . literals {
169
+ if new. literals . is_empty ( ) {
170
+ new. literals . push ( lit1. clone ( ) ) ;
171
+ continue ;
172
+ }
173
+ let mut candidate = lit1. clone ( ) ;
174
+ for lit2 in & mut new. literals {
175
+ if & candidate == lit2 {
176
+ // If the literal is already in the set, then we can
177
+ // just drop it.
178
+ continue ' OUTER ;
179
+ }
180
+ if lit1. len ( ) <= lit2. len ( ) {
181
+ if let Some ( i) = position ( & candidate, lit2) {
182
+ new. at_match = false ;
183
+ lit2. truncate ( i) ;
184
+ }
185
+ } else {
186
+ if let Some ( i) = position ( lit2, & candidate) {
187
+ new. at_match = false ;
188
+ candidate. truncate ( i) ;
189
+ }
190
+ }
191
+ }
192
+ new. literals . push ( candidate) ;
193
+ }
194
+ new. literals . retain ( |lit| !lit. is_empty ( ) ) ;
195
+ // This is OK only because the alternates are unambiguous.
196
+ new. literals . sort ( ) ;
197
+ new. literals . dedup ( ) ;
198
+ new
199
+ }
126
200
}
127
201
128
202
pub struct BuildPrefixes < ' a > {
@@ -324,20 +398,59 @@ enum LiteralMatcher {
324
398
/// A single byte prefix.
325
399
Byte ( u8 ) ,
326
400
/// A set of two or more single byte prefixes.
327
- /// This could be reduced to a bitset, which would use only 8 bytes,
328
- /// but I don't think we care.
329
401
Bytes {
330
402
chars : Vec < u8 > ,
331
403
sparse : Vec < bool > ,
332
404
} ,
405
+ /// A single substring. (Likely using Boyer-Moore with memchr.)
333
406
Single ( SingleSearch ) ,
334
- /// A full Aho-Corasick DFA. A "full" DFA in this case means that all of
335
- /// the failure transitions have been expanded and the entire DFA is
336
- /// represented by a memory inefficient sparse matrix. This makes matching
337
- /// extremely fast. We only use this "full" DFA when the number of bytes
338
- /// in our literals does not exceed 250. This generally leads to a DFA that
339
- /// consumes 250KB of memory.
340
- FullAutomaton ( FullAcAutomaton < Vec < u8 > > ) ,
407
+ /// An Aho-Corasick automaton.
408
+ AC ( FullAcAutomaton < Vec < u8 > > ) ,
409
+ }
410
+
411
+ impl LiteralMatcher {
412
+ /// Create a new prefix matching machine.
413
+ fn new ( mut alts : AlternateLiterals ) -> Self {
414
+ use self :: LiteralMatcher :: * ;
415
+
416
+ if alts. is_empty ( ) {
417
+ Empty
418
+ } else if alts. distinct_single_bytes ( ) >= 26 {
419
+ // Why do we do this? Well, it's a heuristic to prevent thrashing.
420
+ // Basically, if our literal matcher has lots of literals that are
421
+ // a single byte, then we lose a lot of the benefits of fast
422
+ // literal searching. In particular, single bytes have a high
423
+ // probability of matching. In a regex that rarely matches, we end
424
+ // up ping-ponging between the literal matcher and the regex engine
425
+ // for every byte of input. That's bad juju.
426
+ //
427
+ // Note that we only count distinct starting bytes from literals of
428
+ // length 1. For literals longer than that, we assume they have
429
+ // a lower probability of matching.
430
+ //
431
+ // This particular heuristic would be triggered on, e.g.,
432
+ // `[a-z].+`. The prefix here is a single byte that is very likely
433
+ // to match on any given byte in the input, so it's quicker just
434
+ // to let the matching engine process it.
435
+ //
436
+ // TODO(burntsushi): Consider lowering the threshold!
437
+ Empty
438
+ } else if alts. is_single_byte ( ) {
439
+ Byte ( alts. literals [ 0 ] [ 0 ] )
440
+ } else if alts. all_single_bytes ( ) {
441
+ let mut set = vec ! [ false ; 256 ] ;
442
+ let mut bytes = vec ! [ ] ;
443
+ for lit in alts. literals {
444
+ bytes. push ( lit[ 0 ] ) ;
445
+ set[ lit[ 0 ] as usize ] = true ;
446
+ }
447
+ Bytes { chars : bytes, sparse : set }
448
+ } else if alts. is_one_literal ( ) {
449
+ Single ( SingleSearch :: new ( alts. literals . pop ( ) . unwrap ( ) ) )
450
+ } else {
451
+ AC ( AcAutomaton :: new ( alts. literals ) . into_full ( ) )
452
+ }
453
+ }
341
454
}
342
455
343
456
impl Literals {
@@ -377,7 +490,7 @@ impl Literals {
377
490
Single ( ref searcher) => {
378
491
searcher. find ( haystack) . map ( |i| ( i, i + searcher. pat . len ( ) ) )
379
492
}
380
- FullAutomaton ( ref aut) => {
493
+ AC ( ref aut) => {
381
494
aut. find ( haystack) . next ( ) . map ( |m| ( m. start , m. end ) )
382
495
}
383
496
}
@@ -396,34 +509,7 @@ impl Literals {
396
509
Byte ( _) => 1 ,
397
510
Bytes { ref chars, .. } => chars. len ( ) ,
398
511
Single ( _) => 1 ,
399
- FullAutomaton ( ref aut) => aut. len ( ) ,
400
- }
401
- }
402
-
403
- /// Returns true iff the prefix match preserves priority.
404
- ///
405
- /// For example, given the alternation `ab|a` and the target string `ab`,
406
- /// does the prefix machine guarantee that `ab` will match? (A full
407
- /// Aho-Corasick automaton does not!)
408
- pub fn preserves_priority ( & self ) -> bool {
409
- use self :: LiteralMatcher :: * ;
410
- match self . matcher {
411
- Empty => true ,
412
- Byte ( _) => true ,
413
- Bytes { ..} => true ,
414
- Single ( _) => true ,
415
- FullAutomaton ( ref aut) => {
416
- // Okay, so the automaton can respect priority in one
417
- // particular case: when every pattern is of the same length.
418
- // The trick is that the automaton will report the leftmost
419
- // match, which in this case, corresponds to the correct
420
- // match for the regex engine. If any other alternate matches
421
- // at the same position, then they must be exactly equivalent.
422
-
423
- // Guaranteed at least one prefix by construction, so use
424
- // that for the length.
425
- aut. patterns ( ) . iter ( ) . all ( |p| p. len ( ) == aut. pattern ( 0 ) . len ( ) )
426
- }
512
+ AC ( ref aut) => aut. len ( ) ,
427
513
}
428
514
}
429
515
@@ -440,7 +526,7 @@ impl Literals {
440
526
( single. pat . len ( ) * mem:: size_of :: < u8 > ( ) )
441
527
+ ( single. shift . len ( ) * mem:: size_of :: < usize > ( ) )
442
528
}
443
- FullAutomaton ( ref aut) => aut. heap_bytes ( ) ,
529
+ AC ( ref aut) => aut. heap_bytes ( ) ,
444
530
}
445
531
}
446
532
@@ -465,52 +551,7 @@ impl Literals {
465
551
chars. iter ( ) . map ( |& byte| vec ! [ byte] ) . collect ( )
466
552
}
467
553
Single ( ref searcher) => vec ! [ searcher. pat. clone( ) ] ,
468
- FullAutomaton ( ref aut) => aut. patterns ( ) . iter ( ) . cloned ( ) . collect ( ) ,
469
- }
470
- }
471
- }
472
-
473
- impl LiteralMatcher {
474
- /// Create a new prefix matching machine.
475
- fn new ( mut alts : AlternateLiterals ) -> Self {
476
- use self :: LiteralMatcher :: * ;
477
-
478
- if alts. is_empty ( ) {
479
- Empty
480
- } else if alts. distinct_single_bytes ( ) >= 26 {
481
- // Why do we do this? Well, it's a heuristic to prevent thrashing.
482
- // Basically, if our literal matcher has lots of literals that are
483
- // a single byte, then we lose a lot of the benefits of fast
484
- // literal searching. In particular, single bytes have a high
485
- // probability of matching. In a regex that rarely matches, we end
486
- // up ping-ponging between the literal matcher and the regex engine
487
- // for every byte of input. That's bad juju.
488
- //
489
- // Note that we only count distinct starting bytes from literals of
490
- // length 1. For literals longer than that, we assume they have
491
- // a lower probability of matching.
492
- //
493
- // This particular heuristic would be triggered on, e.g.,
494
- // `[a-z].+`. The prefix here is a single byte that is very likely
495
- // to match on any given byte in the input, so it's quicker just
496
- // to let the matching engine process it.
497
- //
498
- // TODO(burntsushi): Consider lowering the threshold!
499
- Empty
500
- } else if alts. is_single_byte ( ) {
501
- Byte ( alts. literals [ 0 ] [ 0 ] )
502
- } else if alts. all_single_bytes ( ) {
503
- let mut set = vec ! [ false ; 256 ] ;
504
- let mut bytes = vec ! [ ] ;
505
- for lit in alts. literals {
506
- bytes. push ( lit[ 0 ] ) ;
507
- set[ lit[ 0 ] as usize ] = true ;
508
- }
509
- Bytes { chars : bytes, sparse : set }
510
- } else if alts. is_one_literal ( ) {
511
- Single ( SingleSearch :: new ( alts. literals . pop ( ) . unwrap ( ) ) )
512
- } else {
513
- FullAutomaton ( AcAutomaton :: new ( alts. literals ) . into_full ( ) )
554
+ AC ( ref aut) => aut. patterns ( ) . iter ( ) . cloned ( ) . collect ( ) ,
514
555
}
515
556
}
516
557
}
@@ -586,6 +627,19 @@ fn find_singles(
586
627
None
587
628
}
588
629
630
+ impl fmt:: Debug for AlternateLiterals {
631
+ fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
632
+ let mut strings = vec ! [ ] ;
633
+ for lit in & self . literals {
634
+ strings. push ( String :: from_utf8_lossy ( lit) . into_owned ( ) ) ;
635
+ }
636
+ f. debug_struct ( "AlternateLiterals" )
637
+ . field ( "at_match" , & self . at_match )
638
+ . field ( "literals" , & strings)
639
+ . finish ( )
640
+ }
641
+ }
642
+
589
643
impl fmt:: Debug for Literals {
590
644
fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
591
645
use self :: LiteralMatcher :: * ;
@@ -601,14 +655,15 @@ impl fmt::Debug for Literals {
601
655
write ! ( f, "alternate single bytes: {}" , chars. join( ", " ) )
602
656
}
603
657
Single ( ref searcher) => write ! ( f, "{:?}" , searcher) ,
604
- FullAutomaton ( ref aut) => write ! ( f, "{:?}" , aut) ,
658
+ AC ( ref aut) => write ! ( f, "{:?}" , aut) ,
605
659
}
606
660
}
607
661
}
608
662
609
663
#[ cfg( test) ]
610
664
mod tests {
611
665
use program:: ProgramBuilder ;
666
+ use super :: AlternateLiterals ;
612
667
613
668
macro_rules! prog {
614
669
( $re: expr) => { ProgramBuilder :: new( $re) . compile( ) . unwrap( ) }
@@ -702,6 +757,46 @@ mod tests {
702
757
assert_eq ! ( prefixes_complete!( "☃" ) , vec![ "☃" ] ) ;
703
758
}
704
759
760
+ macro_rules! alts {
761
+ ( $( $s: expr) ,* ) => { {
762
+ AlternateLiterals {
763
+ at_match: false ,
764
+ literals: vec![ $( $s. as_bytes( ) . to_owned( ) ) ,* ] ,
765
+ }
766
+ } }
767
+ }
768
+
769
+ #[ test]
770
+ fn unambiguous ( ) {
771
+ let given = alts ! ( "z" , "azb" ) ;
772
+ let expected = alts ! ( "a" , "z" ) ;
773
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
774
+
775
+ let given = alts ! ( "zaaaaaaaaaa" , "aa" ) ;
776
+ let expected = alts ! ( "aa" , "z" ) ;
777
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
778
+
779
+ let given = alts ! ( "Sherlock" , "Watson" ) ;
780
+ let expected = alts ! ( "Sherlock" , "Watson" ) ;
781
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
782
+
783
+ let given = alts ! ( "abc" , "bc" ) ;
784
+ let expected = alts ! ( "a" , "bc" ) ;
785
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
786
+
787
+ let given = alts ! ( "bc" , "abc" ) ;
788
+ let expected = alts ! ( "a" , "bc" ) ;
789
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
790
+
791
+ let given = alts ! ( "a" , "aa" ) ;
792
+ let expected = alts ! ( "a" ) ;
793
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
794
+
795
+ let given = alts ! ( "ab" , "a" ) ;
796
+ let expected = alts ! ( "a" ) ;
797
+ assert_eq ! ( expected, given. unambiguous( ) ) ;
798
+ }
799
+
705
800
// That this test fails suggests that the literal finder needs to be
706
801
// completely rewritten. Ug. It's not that it is wrong currently, but
707
802
// it's not as good at finding literals as it should be.
0 commit comments