safety: guard in Input::new against incorrect AsRef implementations

SkiFire13 · BurntSushi · commit fbd2537a5800 · 2024-01-21T09:07:41.000-05:00
Before this commit, Input::new calls haystack.as_ref() twice, once to get the actual haystack slice and the second time to get its length. It makes the assumption that the second call will return the same slice, but malicious implementations of AsRef can return different slices and thus different lengths. This is important because there's unsafe code relying on the Input's span being inbounds with respect to the haystack, but if the second call to .as_ref() returns a bigger slice this won't be true. For example, this snippet causes Miri to report UB on an unchecked slice access in find_fwd_imp (though it will also panic sometime later when run normally, but at that point the UB already happened): use regex_automata::{Input, meta::{Builder, Config}}; use std::cell::Cell; struct Bad(Cell<bool>); impl AsRef<[u8]> for Bad { fn as_ref(&self) -> &[u8] { if self.0.replace(false) { &[] } else { &[0; 1000] } } } let bad = Bad(Cell::new(true)); let input = Input::new(&bad); let regex = Builder::new() // Not setting this causes some checked access to occur before // the unchecked ones, avoiding the UB .configure(Config::new().auto_prefilter(false)) .build("a+") .unwrap(); regex.find(input); This commit fixes the problem by just calling .as_ref() once and use the length of the returned slice as the span's end value. A regression test has also been added. Closes #1154
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,12 +1,14 @@
 1.10.3 (TBD)
 ============
 This is a new patch release that fixes the feature configuration of optional
-dependencies.
+dependencies, and fixes an unsound use of bounds check elision.
 
 Bug fixes:
 
 * [BUG #1147](https://github.com/rust-lang/regex/issues/1147):
 Set `default-features=false` for the `memchr` and `aho-corasick` dependencies.
+* [BUG #1154](https://github.com/rust-lang/regex/pull/1154):
+Fix unsound bounds check elision.
 
 
 1.10.2 (2023-10-16)
diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs
@@ -110,9 +110,14 @@ impl<'h> Input<'h> {
     /// Create a new search configuration for the given haystack.
     #[inline]
     pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+        // Perform only one call to `haystack.as_ref()` to protect from incorrect
+        // implementations that return different values from multiple calls.
+        // This is important because there's code that relies on `span` not being
+        // out of bounds with respect to the stored `haystack`.
+        let haystack = haystack.as_ref();
         Input {
-            haystack: haystack.as_ref(),
-            span: Span { start: 0, end: haystack.as_ref().len() },
+            haystack,
+            span: Span { start: 0, end: haystack.len() },
             anchored: Anchored::No,
             earliest: false,
         }
@@ -1966,4 +1971,23 @@ mod tests {
         let expected_size = 3 * core::mem::size_of::<usize>();
         assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
     }
+
+    #[test]
+    fn incorrect_asref_guard() {
+        struct Bad(std::cell::Cell<bool>);
+
+        impl AsRef<[u8]> for Bad {
+            fn as_ref(&self) -> &[u8] {
+                if self.0.replace(false) {
+                    &[]
+                } else {
+                    &[0; 1000]
+                }
+            }
+        }
+
+        let bad = Bad(std::cell::Cell::new(true));
+        let input = Input::new(&bad);
+        assert!(input.end() <= input.haystack().len());
+    }
 }