From 4844e5162c33c743c9a7999ac781b13a829fc68c Mon Sep 17 00:00:00 2001 From: The 8472 Date: Sun, 30 Oct 2022 21:26:05 +0100 Subject: [PATCH 1/5] black_box test strings in str.contains(str) benchmarks --- library/alloc/benches/str.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/alloc/benches/str.rs b/library/alloc/benches/str.rs index 391475bc0c75d..b355a4da9dcb1 100644 --- a/library/alloc/benches/str.rs +++ b/library/alloc/benches/str.rs @@ -123,7 +123,7 @@ fn bench_contains_short_short(b: &mut Bencher) { let needle = "sit"; b.iter(|| { - assert!(haystack.contains(needle)); + assert!(black_box(haystack).contains(black_box(needle))); }) } @@ -167,7 +167,7 @@ malesuada sollicitudin quam eu fermentum."; let needle = "english"; b.iter(|| { - assert!(!haystack.contains(needle)); + assert!(!black_box(haystack).contains(black_box(needle))); }) } @@ -177,7 +177,7 @@ fn bench_contains_bad_naive(b: &mut Bencher) { let needle = "aaaaaaaab"; b.iter(|| { - assert!(!haystack.contains(needle)); + assert!(!black_box(haystack).contains(black_box(needle))); }) } @@ -187,7 +187,7 @@ fn bench_contains_equal(b: &mut Bencher) { let needle = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; b.iter(|| { - assert!(haystack.contains(needle)); + assert!(black_box(haystack).contains(black_box(needle))); }) } From 467b299e537cc94e29c1db252557cb7365924d9a Mon Sep 17 00:00:00 2001 From: The 8472 Date: Sun, 30 Oct 2022 21:50:49 +0100 Subject: [PATCH 2/5] update str.contains benchmarks --- library/alloc/benches/str.rs | 57 ++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/library/alloc/benches/str.rs b/library/alloc/benches/str.rs index b355a4da9dcb1..54af389dedcd2 100644 --- a/library/alloc/benches/str.rs +++ b/library/alloc/benches/str.rs @@ -1,3 +1,4 @@ +use core::iter::Iterator; use test::{black_box, Bencher}; #[bench] @@ -122,14 +123,13 @@ fn bench_contains_short_short(b: &mut Bencher) { let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; let needle = "sit"; + b.bytes = haystack.len() as u64; b.iter(|| { assert!(black_box(haystack).contains(black_box(needle))); }) } -#[bench] -fn bench_contains_short_long(b: &mut Bencher) { - let haystack = "\ +static LONG_HAYSTACK: &str = "\ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse quis lorem sit amet dolor \ ultricies condimentum. Praesent iaculis purus elit, ac malesuada quam malesuada in. Duis sed orci \ eros. Suspendisse sit amet magna mollis, mollis nunc luctus, imperdiet mi. Integer fringilla non \ @@ -164,8 +164,46 @@ feugiat. Etiam quis mauris vel risus luctus mattis a a nunc. Nullam orci quam, i vehicula in, porttitor ut nibh. Duis sagittis adipiscing nisl vitae congue. Donec mollis risus eu \ leo suscipit, varius porttitor nulla porta. Pellentesque ut sem nec nisi euismod vehicula. 
Nulla \
malesuada sollicitudin quam eu fermentum.";
+
+#[bench]
+fn bench_contains_2b_repeated_long(b: &mut Bencher) {
+    let haystack = LONG_HAYSTACK;
+    let needle = "::";
+
+    b.bytes = haystack.len() as u64;
+    b.iter(|| {
+        assert!(!black_box(haystack).contains(black_box(needle)));
+    })
+}
+
+#[bench]
+fn bench_contains_short_long(b: &mut Bencher) {
+    let haystack = LONG_HAYSTACK;
     let needle = "english";
 
+    b.bytes = haystack.len() as u64;
+    b.iter(|| {
+        assert!(!black_box(haystack).contains(black_box(needle)));
+    })
+}
+
+#[bench]
+fn bench_contains_16b_in_long(b: &mut Bencher) {
+    let haystack = LONG_HAYSTACK;
+    let needle = "english language";
+
+    b.bytes = haystack.len() as u64;
+    b.iter(|| {
+        assert!(!black_box(haystack).contains(black_box(needle)));
+    })
+}
+
+#[bench]
+fn bench_contains_32b_in_long(b: &mut Bencher) {
+    let haystack = LONG_HAYSTACK;
+    let needle = "the english language sample text";
+
+    b.bytes = haystack.len() as u64;
     b.iter(|| {
         assert!(!black_box(haystack).contains(black_box(needle)));
     })
@@ -176,6 +214,18 @@ fn bench_contains_bad_naive(b: &mut Bencher) {
     let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
     let needle = "aaaaaaaab";
 
+    b.bytes = haystack.len() as u64;
+    b.iter(|| {
+        assert!(!black_box(haystack).contains(black_box(needle)));
+    })
+}
+
+#[bench]
+fn bench_contains_bad_simd(b: &mut Bencher) {
+    let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+    let needle = "aaabaaaa";
+
+    b.bytes = haystack.len() as u64;
     b.iter(|| {
         assert!(!black_box(haystack).contains(black_box(needle)));
     })
@@ -186,6 +236,7 @@ fn bench_contains_equal(b: &mut Bencher) {
     let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
     let needle = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
 
+    b.bytes = haystack.len() as u64;
     b.iter(|| {
         assert!(black_box(haystack).contains(black_box(needle)));
     })

From 3d4a8482b93313be4d6e8dc62030860fa2fc46ef Mon Sep 17 00:00:00 2001
From: The 8472
Date: Sun, 30 Oct 2022 21:47:04 +0100
Subject: [PATCH 3/5] x86_64 SSE2 fast-path for str.contains(&str) and short needles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0]

The two-way algorithm is Big-O efficient, but it needs to preprocess the needle
to find a "critical factorization" of it. This additional work is significant
for short needles. Additionally, it mostly advances needle.len() bytes at a time.

The SIMD-based approach used here, on the other hand, can advance by its vector
width, which can exceed the needle length, except in pathological cases; and
since it is limited to small needles, the worst-case blowup in those cases is
also small.

Benchmarks taken on a Zen2:

```
16CGU, OLD:
test str::bench_contains_short_short ... bench: 27 ns/iter (+/- 1)
test str::bench_contains_short_long ... bench: 667 ns/iter (+/- 29)
test str::bench_contains_bad_naive ... bench: 131 ns/iter (+/- 2)
test str::bench_contains_bad_simd ... bench: 130 ns/iter (+/- 2)
test str::bench_contains_equal ... bench: 148 ns/iter (+/- 4)

16CGU, NEW:
test str::bench_contains_short_short ... bench: 8 ns/iter (+/- 0)
test str::bench_contains_short_long ... bench: 135 ns/iter (+/- 4)
test str::bench_contains_bad_naive ... bench: 130 ns/iter (+/- 2)
test str::bench_contains_bad_simd ... bench: 292 ns/iter (+/- 1)
test str::bench_contains_equal ...
bench: 3 ns/iter (+/- 0) 1CGU, OLD: test str::bench_contains_short_short ... bench: 30 ns/iter (+/- 0) test str::bench_contains_short_long ... bench: 713 ns/iter (+/- 17) test str::bench_contains_bad_naive ... bench: 131 ns/iter (+/- 3) test str::bench_contains_bad_simd ... bench: 130 ns/iter (+/- 3) test str::bench_contains_equal ... bench: 148 ns/iter (+/- 6) 1CGU, NEW: test str::bench_contains_short_short ... bench: 10 ns/iter (+/- 0) test str::bench_contains_short_long ... bench: 111 ns/iter (+/- 0) test str::bench_contains_bad_naive ... bench: 135 ns/iter (+/- 3) test str::bench_contains_bad_simd ... bench: 274 ns/iter (+/- 2) test str::bench_contains_equal ... bench: 4 ns/iter (+/- 0) ``` [0] http://0x80.pl/articles/simd-strfind.html#sse-avx2 --- library/core/src/str/pattern.rs | 102 ++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index ec2cb429e67bf..def11ca45c05e 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -39,6 +39,7 @@ )] use crate::cmp; +use crate::cmp::Ordering; use crate::fmt; use crate::slice::memchr; @@ -946,6 +947,27 @@ impl<'a, 'b> Pattern<'a> for &'b str { haystack.as_bytes().starts_with(self.as_bytes()) } + /// Checks whether the pattern matches anywhere in the haystack + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + if self.len() == 0 { + return true; + } + + match self.len().cmp(&haystack.len()) { + Ordering::Less => { + #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] + if self.as_bytes().len() <= 8 { + return simd_contains(self, haystack); + } + + self.into_searcher(haystack).next_match().is_some() + } + Ordering::Equal => self == haystack, + Ordering::Greater => false, + } + } + /// Removes the pattern from the front of haystack, if it matches. 
#[inline] fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { @@ -1684,3 +1706,83 @@ impl TwoWayStrategy for RejectAndMatch { SearchStep::Match(a, b) } } + +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[inline] +fn simd_contains(needle: &str, haystack: &str) -> bool { + let needle = needle.as_bytes(); + let haystack = haystack.as_bytes(); + + if needle.len() == 1 { + return haystack.contains(&needle[0]); + } + + const CHUNK: usize = 16; + + // do a naive search if if the haystack is too small to fit + if haystack.len() < CHUNK + needle.len() - 1 { + return haystack.windows(needle.len()).any(|c| c == needle); + } + + use crate::arch::x86_64::{ + __m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8, + }; + + // SAFETY: no preconditions other than sse2 being available + let first: __m128i = unsafe { _mm_set1_epi8(needle[0] as i8) }; + // SAFETY: no preconditions other than sse2 being available + let last: __m128i = unsafe { _mm_set1_epi8(*needle.last().unwrap() as i8) }; + + let check_mask = #[cold] + |idx, mut mask: u32| -> bool { + while mask != 0 { + let trailing = mask.trailing_zeros(); + let offset = idx + trailing as usize + 1; + let sub = &haystack[offset..][..needle.len() - 2]; + let trimmed_needle = &needle[1..needle.len() - 1]; + + if sub == trimmed_needle { + return true; + } + mask &= !(1 << trailing); + } + return false; + }; + + let test_chunk = |i| -> bool { + // SAFETY: this requires at least CHUNK bytes being readable at offset i + // that is ensured by the loop ranges (see comments below) + let a: __m128i = unsafe { _mm_loadu_si128(haystack.as_ptr().add(i) as *const _) }; + let b: __m128i = + // SAFETY: this requires CHUNK + needle.len() - 1 bytes being readable at offset i + unsafe { _mm_loadu_si128(haystack.as_ptr().add(i + needle.len() - 1) as *const _) }; + + // SAFETY: no preconditions other than sse2 being available + let eq_first: __m128i = unsafe { _mm_cmpeq_epi8(first, a) }; + // SAFETY: no preconditions other than sse2 being available + let eq_last: __m128i = unsafe { _mm_cmpeq_epi8(last, b) }; + + // SAFETY: no preconditions other than sse2 being available + let mask: u32 = unsafe { _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)) } as u32; + + if mask != 0 { + return check_mask(i, mask); + } + return false; + }; + + let mut i = 0; + let mut result = false; + while !result && i + CHUNK + needle.len() <= haystack.len() { + result |= test_chunk(i); + i += CHUNK; + } + + // process the tail that didn't fit into CHUNK-sized steps + // this simply repeats the same procedure but as right-aligned chunk instead + // of a left-aligned one. The last byte must be exactly flush with the string end so + // we don't miss a single byte or read out of bounds. 
+ result |= test_chunk(haystack.len() + 1 - needle.len() - CHUNK); + + return result; +} From c37e8fae57862383fad43f3201a28b1fb8249904 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Sun, 30 Oct 2022 21:50:08 +0100 Subject: [PATCH 4/5] generalize str.contains() tests to a range of haystack sizes The Big-O is cubic, but this is only called with ~70 chars so it's still fast enough --- library/alloc/tests/str.rs | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index e30329aa1cb6c..9689196ef21ac 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1590,11 +1590,27 @@ fn test_bool_from_str() { assert_eq!("not even a boolean".parse::().ok(), None); } -fn check_contains_all_substrings(s: &str) { - assert!(s.contains("")); - for i in 0..s.len() { - for j in i + 1..=s.len() { - assert!(s.contains(&s[i..j])); +fn check_contains_all_substrings(haystack: &str) { + let mut modified_needle = String::new(); + + for i in 0..haystack.len() { + // check different haystack lengths since we special-case short haystacks. + let haystack = &haystack[0..i]; + assert!(haystack.contains("")); + for j in 0..haystack.len() { + for k in j + 1..=haystack.len() { + let needle = &haystack[j..k]; + assert!(haystack.contains(needle)); + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(0..1, "\0"); + assert!(!haystack.contains(&modified_needle)); + + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(needle.len() - 1..needle.len(), "\0"); + assert!(!haystack.contains(&modified_needle)); + } } } } From a2b2010891224cbaf448d5d799b3c47c87e863be Mon Sep 17 00:00:00 2001 From: The 8472 Date: Thu, 3 Nov 2022 23:31:00 +0100 Subject: [PATCH 5/5] - convert from core::arch to core::simd - bump simd compare to 32bytes - import small slice compare code from memmem crate - try a few different probe bytes to avoid degenerate cases - but special-case 2-byte needles --- library/core/src/str/pattern.rs | 234 +++++++++++++++++++++++++------- 1 file changed, 182 insertions(+), 52 deletions(-) diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index def11ca45c05e..c5be32861f9a5 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -956,15 +956,20 @@ impl<'a, 'b> Pattern<'a> for &'b str { match self.len().cmp(&haystack.len()) { Ordering::Less => { + if self.len() == 1 { + return haystack.as_bytes().contains(&self.as_bytes()[0]); + } + #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] - if self.as_bytes().len() <= 8 { - return simd_contains(self, haystack); + if self.len() <= 32 { + if let Some(result) = simd_contains(self, haystack) { + return result; + } } self.into_searcher(haystack).next_match().is_some() } - Ordering::Equal => self == haystack, - Ordering::Greater => false, + _ => self == haystack, } } @@ -1707,82 +1712,207 @@ impl TwoWayStrategy for RejectAndMatch { } } +/// SIMD search for short needles based on +/// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0] +/// +/// It skips ahead by the vector width on each iteration (rather than the needle length as two-way +/// does) by probing the first and last byte of the needle for the whole vector width +/// and only doing full needle comparisons when the vectorized probe indicated potential matches. +/// +/// Since the x86_64 baseline only offers SSE2 we only use u8x16 here. 
+/// If we ever ship std with for x86-64-v3 or adapt this for other platforms then wider vectors +/// should be evaluated. +/// +/// For haystacks smaller than vector-size + needle length it falls back to +/// a naive O(n*m) search so this implementation should not be called on larger needles. +/// +/// [0]: http://0x80.pl/articles/simd-strfind.html#sse-avx2 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] #[inline] -fn simd_contains(needle: &str, haystack: &str) -> bool { +fn simd_contains(needle: &str, haystack: &str) -> Option { let needle = needle.as_bytes(); let haystack = haystack.as_bytes(); - if needle.len() == 1 { - return haystack.contains(&needle[0]); - } - - const CHUNK: usize = 16; + debug_assert!(needle.len() > 1); + + use crate::ops::BitAnd; + use crate::simd::mask8x16 as Mask; + use crate::simd::u8x16 as Block; + use crate::simd::{SimdPartialEq, ToBitMask}; + + let first_probe = needle[0]; + + // the offset used for the 2nd vector + let second_probe_offset = if needle.len() == 2 { + // never bail out on len=2 needles because the probes will fully cover them and have + // no degenerate cases. + 1 + } else { + // try a few bytes in case first and last byte of the needle are the same + let Some(second_probe_offset) = (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first_probe) else { + // fall back to other search methods if we can't find any different bytes + // since we could otherwise hit some degenerate cases + return None; + }; + second_probe_offset + }; - // do a naive search if if the haystack is too small to fit - if haystack.len() < CHUNK + needle.len() - 1 { - return haystack.windows(needle.len()).any(|c| c == needle); + // do a naive search if the haystack is too small to fit + if haystack.len() < Block::LANES + second_probe_offset { + return Some(haystack.windows(needle.len()).any(|c| c == needle)); } - use crate::arch::x86_64::{ - __m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8, - }; - - // SAFETY: no preconditions other than sse2 being available - let first: __m128i = unsafe { _mm_set1_epi8(needle[0] as i8) }; - // SAFETY: no preconditions other than sse2 being available - let last: __m128i = unsafe { _mm_set1_epi8(*needle.last().unwrap() as i8) }; + let first_probe: Block = Block::splat(first_probe); + let second_probe: Block = Block::splat(needle[second_probe_offset]); + // first byte are already checked by the outer loop. to verify a match only the + // remainder has to be compared. + let trimmed_needle = &needle[1..]; + // this #[cold] is load-bearing, benchmark before removing it... let check_mask = #[cold] - |idx, mut mask: u32| -> bool { + |idx, mask: u16, skip: bool| -> bool { + if skip { + return false; + } + + // and so is this. optimizations are weird. + let mut mask = mask; + while mask != 0 { let trailing = mask.trailing_zeros(); let offset = idx + trailing as usize + 1; - let sub = &haystack[offset..][..needle.len() - 2]; - let trimmed_needle = &needle[1..needle.len() - 1]; - - if sub == trimmed_needle { - return true; + // SAFETY: mask is between 0 and 15 trailing zeroes, we skip one additional byte that was already compared + // and then take trimmed_needle.len() bytes. 
This is within the bounds defined by the outer loop + unsafe { + let sub = haystack.get_unchecked(offset..).get_unchecked(..trimmed_needle.len()); + if small_slice_eq(sub, trimmed_needle) { + return true; + } } mask &= !(1 << trailing); } return false; }; - let test_chunk = |i| -> bool { - // SAFETY: this requires at least CHUNK bytes being readable at offset i + let test_chunk = |idx| -> u16 { + // SAFETY: this requires at least LANES bytes being readable at idx // that is ensured by the loop ranges (see comments below) - let a: __m128i = unsafe { _mm_loadu_si128(haystack.as_ptr().add(i) as *const _) }; - let b: __m128i = - // SAFETY: this requires CHUNK + needle.len() - 1 bytes being readable at offset i - unsafe { _mm_loadu_si128(haystack.as_ptr().add(i + needle.len() - 1) as *const _) }; - - // SAFETY: no preconditions other than sse2 being available - let eq_first: __m128i = unsafe { _mm_cmpeq_epi8(first, a) }; - // SAFETY: no preconditions other than sse2 being available - let eq_last: __m128i = unsafe { _mm_cmpeq_epi8(last, b) }; - - // SAFETY: no preconditions other than sse2 being available - let mask: u32 = unsafe { _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)) } as u32; + let a: Block = unsafe { haystack.as_ptr().add(idx).cast::().read_unaligned() }; + // SAFETY: this requires LANES + block_offset bytes being readable at idx + let b: Block = unsafe { + haystack.as_ptr().add(idx).add(second_probe_offset).cast::().read_unaligned() + }; + let eq_first: Mask = a.simd_eq(first_probe); + let eq_last: Mask = b.simd_eq(second_probe); + let both = eq_first.bitand(eq_last); + let mask = both.to_bitmask(); - if mask != 0 { - return check_mask(i, mask); - } - return false; + return mask; }; let mut i = 0; let mut result = false; - while !result && i + CHUNK + needle.len() <= haystack.len() { - result |= test_chunk(i); - i += CHUNK; + // The loop condition must ensure that there's enough headroom to read LANE bytes, + // and not only at the current index but also at the index shifted by block_offset + const UNROLL: usize = 4; + while i + second_probe_offset + UNROLL * Block::LANES < haystack.len() && !result { + let mut masks = [0u16; UNROLL]; + for j in 0..UNROLL { + masks[j] = test_chunk(i + j * Block::LANES); + } + for j in 0..UNROLL { + let mask = masks[j]; + if mask != 0 { + result |= check_mask(i + j * Block::LANES, mask, result); + } + } + i += UNROLL * Block::LANES; + } + while i + second_probe_offset + Block::LANES < haystack.len() && !result { + let mask = test_chunk(i); + if mask != 0 { + result |= check_mask(i, mask, result); + } + i += Block::LANES; } - // process the tail that didn't fit into CHUNK-sized steps - // this simply repeats the same procedure but as right-aligned chunk instead + // Process the tail that didn't fit into LANES-sized steps. + // This simply repeats the same procedure but as right-aligned chunk instead // of a left-aligned one. The last byte must be exactly flush with the string end so // we don't miss a single byte or read out of bounds. - result |= test_chunk(haystack.len() + 1 - needle.len() - CHUNK); + let i = haystack.len() - second_probe_offset - Block::LANES; + let mask = test_chunk(i); + if mask != 0 { + result |= check_mask(i, mask, result); + } + + Some(result) +} + +/// Compares short slices for equality. +/// +/// It avoids a call to libc's memcmp which is faster on long slices +/// due to SIMD optimizations but it incurs a function call overhead. +/// +/// # Safety +/// +/// Both slices must have the same length. 
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] // only called on x86 +#[inline] +unsafe fn small_slice_eq(x: &[u8], y: &[u8]) -> bool { + // This function is adapted from + // https://github.com/BurntSushi/memchr/blob/8037d11b4357b0f07be2bb66dc2659d9cf28ad32/src/memmem/util.rs#L32 - return result; + // If we don't have enough bytes to do 4-byte at a time loads, then + // fall back to the naive slow version. + // + // Potential alternative: We could do a copy_nonoverlapping combined with a mask instead + // of a loop. Benchmark it. + if x.len() < 4 { + for (&b1, &b2) in x.iter().zip(y) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 4 or more bytes to compare, then proceed in chunks of 4 at + // a time using unaligned loads. + // + // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is + // that this particular version of memcmp is likely to be called with tiny + // needles. That means that if we do 8 byte loads, then a higher proportion + // of memcmp calls will use the slower variant above. With that said, this + // is a hypothesis and is only loosely supported by benchmarks. There's + // likely some improvement that could be made here. The main thing here + // though is to optimize for latency, not throughput. + + // SAFETY: Via the conditional above, we know that both `px` and `py` + // have the same length, so `px < pxend` implies that `py < pyend`. + // Thus, derefencing both `px` and `py` in the loop below is safe. + // + // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual + // end of of `px` and `py`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. (The final comparison will overlap with + // the last comparison done in the loop for lengths that aren't multiples + // of four.) + // + // Finally, we needn't worry about alignment here, since we do unaligned + // loads. + unsafe { + let (mut px, mut py) = (x.as_ptr(), y.as_ptr()); + let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4)); + while px < pxend { + let vx = (px as *const u32).read_unaligned(); + let vy = (py as *const u32).read_unaligned(); + if vx != vy { + return false; + } + px = px.add(4); + py = py.add(4); + } + let vx = (pxend as *const u32).read_unaligned(); + let vy = (pyend as *const u32).read_unaligned(); + vx == vy + } }
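
A few illustrative sketches of the techniques used in these patches follow; none of this code is part of the patches themselves. The core idea of the SSE2/`core::simd` fast path can be shown without any vector intrinsics: the sketch below is a plain scalar model of the same two-probe filter (first byte plus a second probe byte near the end of the needle). The function name and structure are assumptions made for this note.

```rust
// Scalar model of the two-probe filter used by simd_contains: only positions where
// both probe bytes match are verified with a full comparison. The real code checks
// 16 positions per iteration with SSE2 compares and a movemask-style bitmask.
fn two_probe_contains(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    if needle.len() > haystack.len() {
        return false;
    }
    let first = needle[0];
    // here simply the last byte; the patch picks a byte that differs from `first` when it can
    let second_offset = needle.len() - 1;
    let second = needle[second_offset];
    (0..=haystack.len() - needle.len()).any(|i| {
        // cheap probes first, full comparison only for surviving candidates
        haystack[i] == first
            && haystack[i + second_offset] == second
            && &haystack[i..i + needle.len()] == needle
    })
}

fn main() {
    assert!(two_probe_contains(b"Lorem ipsum dolor sit amet", b"dolor"));
    assert!(!two_probe_contains(b"Lorem ipsum dolor sit amet", b"Duis"));
}
```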
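
Patch 5's choice of the second probe byte is what keeps needles such as "aaaaaaaab" from firing both probes at every position. A standalone sketch of that selection logic, with an illustrative function name, could look like this:

```rust
// Pick the offset of the second probe byte: prefer one of the last four bytes that
// differs from the first byte. Returning None means "no usable probe pair"; the patch
// falls back to the two-way searcher in that case.
fn pick_second_probe(needle: &[u8]) -> Option<usize> {
    assert!(needle.len() >= 2);
    if needle.len() == 2 {
        // a 2-byte needle is fully covered by the two probes, so there is no degenerate case
        return Some(1);
    }
    let first = needle[0];
    (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first)
}

fn main() {
    assert_eq!(pick_second_probe(b"ab"), Some(1));
    assert_eq!(pick_second_probe(b"aaaaaaaab"), Some(8)); // the trailing 'b'
    assert_eq!(pick_second_probe(b"aaaaaaaa"), None); // all-equal needle: bail out
}
```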
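
`small_slice_eq` can also be modelled in safe Rust. The version below uses `u32::from_ne_bytes` on possibly overlapping 4-byte chunks instead of raw unaligned loads; it is an illustration written for this note, not the code the patch imports from the memchr crate.

```rust
// Safe-Rust model of the overlapping 4-byte comparison in small_slice_eq: compare
// 4 bytes at a time, and let the final chunk overlap the previous one so it ends
// exactly at the last byte. Overlap is harmless for an equality check.
fn small_eq(x: &[u8], y: &[u8]) -> bool {
    assert_eq!(x.len(), y.len());
    if x.len() < 4 {
        return x == y;
    }
    let mut i = 0;
    while i + 4 < x.len() {
        let vx = u32::from_ne_bytes(x[i..i + 4].try_into().unwrap());
        let vy = u32::from_ne_bytes(y[i..i + 4].try_into().unwrap());
        if vx != vy {
            return false;
        }
        i += 4;
    }
    // final, possibly overlapping 4-byte chunk flush with the end of the slices
    let vx = u32::from_ne_bytes(x[x.len() - 4..].try_into().unwrap());
    let vy = u32::from_ne_bytes(y[y.len() - 4..].try_into().unwrap());
    vx == vy
}

fn main() {
    assert!(small_eq(b"abcdefg", b"abcdefg"));
    assert!(!small_eq(b"abcdefg", b"abcdefX"));
}
```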
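
Finally, the generalized test in patch 4 can be read as a property check: for every prefix of an ASCII sample haystack, every substring must be found, and a corrupted copy of it must not be, so that both the SIMD path and the fallback get exercised across haystack lengths. The self-contained variant below is an illustration, not the test the patch adds.

```rust
// Check all substrings of all prefixes of an ASCII sample haystack. ASCII keeps every
// byte index a valid char boundary, and a NUL byte never occurs in the sample text,
// so the corrupted needle must never be found.
fn check_contains_all_prefixes(haystack: &str) {
    for i in 0..=haystack.len() {
        // vary the haystack length, since short haystacks take a different code path
        let haystack = &haystack[..i];
        assert!(haystack.contains(""));
        for j in 0..haystack.len() {
            for k in j + 1..=haystack.len() {
                let needle = &haystack[j..k];
                assert!(haystack.contains(needle));

                let mut corrupted = String::from(needle);
                corrupted.replace_range(0..1, "\0");
                assert!(!haystack.contains(&corrupted));
            }
        }
    }
}

fn main() {
    check_contains_all_prefixes("Lorem ipsum dolor sit amet, consectetur adipiscing elit.");
}
```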