From 85ec3c85e1207df8c31b2d050dbcb95f57bb192d Mon Sep 17 00:00:00 2001 From: Nathaniel Brough Date: Mon, 20 Feb 2023 09:43:11 -0800 Subject: [PATCH] fuzz: Add a roundtrip regex fuzz harness This change adds an optional dependency on 'arbitrary' for regex-syntax. This allows us to generate arbitrary high-level intermediate representations (HIR). Using this generated HIR we convert this back to a regex string and exercise the regex matching code under src. Using this approach we can generate arbitrary well-formed regex strings, allowing the fuzzer to penetrate deeper into the regex code. --- fuzz/Cargo.toml | 13 ++- fuzz/fuzz_targets/fuzz_regex.rs | 161 +++++++++++++++++++++++++++++++ regex-syntax/Cargo.toml | 13 +-- regex-syntax/src/hir/interval.rs | 3 + regex-syntax/src/hir/mod.rs | 19 ++++ 5 files changed, 198 insertions(+), 11 deletions(-) create mode 100644 fuzz/fuzz_targets/fuzz_regex.rs diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 874e19a0f8..f66cf8ce7f 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -1,5 +1,3 @@ -cargo-features = ['named-profiles'] - [package] name = "regex-fuzz" version = "0.0.0" @@ -11,11 +9,16 @@ edition = "2018" cargo-fuzz = true [dependencies] +arbitrary = { version = "1.2.3", features = ["derive"] } libfuzzer-sys = "0.4.1" [dependencies.regex] path = ".." +[dependencies.regex-syntax] +path = "../regex-syntax" +features = ["arbitrary"] + # Prevent this from interfering with workspaces [workspace] members = ["."] @@ -24,6 +27,12 @@ members = ["."] name = "fuzz_regex_match" path = "fuzz_targets/fuzz_regex_match.rs" +[[bin]] +name = "fuzz_regex" +path = "fuzz_targets/fuzz_regex.rs" +test = false +doc = false + [profile.release] opt-level = 3 debug = true diff --git a/fuzz/fuzz_targets/fuzz_regex.rs b/fuzz/fuzz_targets/fuzz_regex.rs new file mode 100644 index 0000000000..ca0a0f0d7d --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_regex.rs @@ -0,0 +1,161 @@ +#![no_main] + +use arbitrary::Arbitrary; +use libfuzzer_sys::fuzz_target; +use regex_syntax::hir::print::Printer; +use regex_syntax::hir::Hir; +use std::{convert::TryFrom, hint::black_box}; + +#[derive(Arbitrary, Debug, Clone)] +enum Pattern { + WellFormed(Hir), + Random(String), +} + +impl TryFrom for String { + type Error = std::fmt::Error; + + fn try_from(pattern: Pattern) -> Result { + match pattern { + Pattern::WellFormed(hir) => { + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&hir, &mut dst)?; + return Ok(dst); + } + Pattern::Random(s) => { + return Ok(s); + } + } + } +} + +#[derive(Arbitrary, Debug)] +struct Data<'a> { + pattern: Pattern, + replacen: (usize, &'a str), + replacen_bytes: (usize, &'a [u8]), + input: &'a str, + input_bytes: &'a [u8], + pattern_set: Vec, + set_input: &'a str, + set_input_bytes: &'a [u8], +} + +fn fuzz_regex( + pattern: &Pattern, + input: &str, + replacen: &(usize, &str), +) -> Result<(), Box> { + let re = regex::Regex::new(&String::try_from(pattern.clone())?)?; + _ = black_box(re.is_match(&input)); + _ = black_box(re.captures_iter(&input).collect::>()); + _ = black_box(re.split(&input).collect::>()); + + let (limit, replace) = *replacen; + _ = black_box(re.replacen(&input, limit, replace)); + + _ = black_box(re.find(&input)); + _ = black_box(re.shortest_match(&input)); + Ok(()) +} + +fn fuzz_regex_bytes( + pattern: &Pattern, + input: &[u8], + replacen: &(usize, &[u8]), +) -> Result<(), Box> { + let re = regex::bytes::Regex::new(&String::try_from(pattern.clone())?)?; + _ = black_box(re.is_match(&input)); + _ = black_box( + re.captures_iter(&input).collect::>(), + ); + _ = black_box(re.split(&input).collect::>()); + + let (limit, replace) = *replacen; + _ = black_box(re.replacen(&input, limit, replace)); + + _ = black_box(re.find(&input)); + _ = black_box(re.shortest_match(&input)); + Ok(()) +} + +fn fuzz_regex_set( + pattern_set: &Vec, + input: &str, +) -> Result<(), Box> { + let set = regex::RegexSet::new( + pattern_set + .into_iter() + .filter_map(|x| String::try_from(x.clone()).ok()), + )?; + _ = black_box(set.is_match(&input)); + _ = black_box(set.matches(&input).into_iter().collect::>()); + Ok(()) +} + +fn fuzz_regex_set_bytes( + pattern_set: &Vec, + input: &[u8], +) -> Result<(), Box> { + let set = regex::bytes::RegexSet::new( + pattern_set + .into_iter() + .filter_map(|x| String::try_from(x.clone()).ok()), + )?; + _ = black_box(set.is_match(&input)); + _ = black_box(set.matches(&input).into_iter().collect::>()); + Ok(()) +} + +fuzz_target!(|data: Data| { + if data.pattern_set.len() > 10 { + return; + } + let (_, replace) = data.replacen; + if replace.len() > 100 { + return; + } + let (_, replace) = data.replacen_bytes; + if replace.len() > 100 { + return; + } + if data.set_input.len() > 500 { + return; + } + if data.set_input_bytes.len() > 500 { + return; + } + if data.input_bytes.len() > 500 { + return; + } + if data.input.len() > 500 { + return; + } + + if let Err(e) = + black_box(fuzz_regex(&data.pattern, &data.input, &data.replacen)) + { + black_box(format!("{e:?}")); + } + + if let Err(e) = black_box(fuzz_regex_bytes( + &data.pattern, + &data.input_bytes, + &data.replacen_bytes, + )) { + black_box(format!("{e:?}")); + } + if let Err(e) = + black_box(fuzz_regex_set(&data.pattern_set, &data.set_input)) + { + black_box(format!("{e:?}")); + } + + if let Err(e) = black_box(fuzz_regex_set_bytes( + &data.pattern_set, + &data.set_input_bytes, + )) { + black_box(format!("{e:?}")); + } +}); diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e491bf1d4d..fd311edada 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -15,15 +15,7 @@ edition = "2018" [features] default = ["unicode"] -unicode = [ - "unicode-age", - "unicode-bool", - "unicode-case", - "unicode-gencat", - "unicode-perl", - "unicode-script", - "unicode-segment", -] +unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] unicode-age = [] unicode-bool = [] unicode-case = [] @@ -31,3 +23,6 @@ unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] + +[dependencies] +arbitrary = { version = "1.2.3", features = ["derive"], optional = true } diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 56698c53af..344f3503cc 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -5,6 +5,8 @@ use std::slice; use std::u8; use crate::unicode; +#[cfg(feature = "arbitrary")] +use arbitrary::Arbitrary; // This module contains an *internal* implementation of interval sets. // @@ -33,6 +35,7 @@ use crate::unicode; // Tests on this are relegated to the public API of HIR in src/hir.rs. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct IntervalSet { ranges: Vec, } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 1096e9f05a..dd0ba71963 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -8,6 +8,9 @@ use std::fmt; use std::result; use std::u8; +#[cfg(feature = "arbitrary")] +use arbitrary::Arbitrary; + use crate::ast::Span; use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; use crate::unicode; @@ -172,6 +175,7 @@ impl fmt::Display for ErrorKind { /// expression pattern string, and uses constant stack space and heap space /// proportional to the size of the `Hir`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct Hir { /// The underlying HIR kind. kind: HirKind, @@ -181,6 +185,7 @@ pub struct Hir { /// The kind of an arbitrary `Hir` expression. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum HirKind { /// The empty regular expression, which matches everything, including the /// empty string. @@ -744,6 +749,7 @@ impl fmt::Display for Hir { /// are preferred whenever possible. In particular, a `Byte` variant is only /// ever produced when it could match invalid UTF-8. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum Literal { /// A single character represented by a Unicode scalar value. Unicode(char), @@ -780,6 +786,7 @@ impl Literal { /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not /// match the same set of strings. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum Class { /// A set of characters represented by Unicode scalar values. Unicode(ClassUnicode), @@ -834,6 +841,7 @@ impl Class { /// A set of characters represented by Unicode scalar values. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct ClassUnicode { set: IntervalSet, } @@ -970,6 +978,7 @@ impl<'a> Iterator for ClassUnicodeIter<'a> { /// The range is closed. That is, the start and end of the range are included /// in the range. #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct ClassUnicodeRange { start: char, end: char, @@ -1077,6 +1086,7 @@ impl ClassUnicodeRange { /// A set of characters represented by arbitrary bytes (where one byte /// corresponds to one character). #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct ClassBytes { set: IntervalSet, } @@ -1187,6 +1197,7 @@ impl<'a> Iterator for ClassBytesIter<'a> { /// The range is closed. That is, the start and end of the range are included /// in the range. #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct ClassBytesRange { start: u8, end: u8, @@ -1282,6 +1293,7 @@ impl fmt::Debug for ClassBytesRange { /// /// A matching anchor assertion is always zero-length. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum Anchor { /// Match the beginning of a line or the beginning of text. Specifically, /// this matches at the starting position of the input, or at the position @@ -1303,6 +1315,7 @@ pub enum Anchor { /// /// A matching word boundary assertion is always zero-length. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum WordBoundary { /// Match a Unicode-aware word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character @@ -1336,6 +1349,7 @@ impl WordBoundary { /// 2. A capturing group (e.g., `(expr)`). /// 3. A named capturing group (e.g., `(?Pexpr)`). #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct Group { /// The kind of this group. If it is a capturing group, then the kind /// contains the capture group index (and the name, if it is a named @@ -1347,6 +1361,7 @@ pub struct Group { /// The kind of group. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum GroupKind { /// A normal unnamed capturing group. /// @@ -1368,6 +1383,7 @@ pub enum GroupKind { /// A repetition operator permits the repetition of an arbitrary /// sub-expression. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub struct Repetition { /// The kind of this repetition operator. pub kind: RepetitionKind, @@ -1407,6 +1423,7 @@ impl Repetition { /// The kind of a repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum RepetitionKind { /// Matches a sub-expression zero or one times. ZeroOrOne, @@ -1420,6 +1437,7 @@ pub enum RepetitionKind { /// The kind of a counted repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] pub enum RepetitionRange { /// Matches a sub-expression exactly this many times. Exactly(u32), @@ -1477,6 +1495,7 @@ impl Drop for Hir { /// /// These attributes are typically defined inductively on the HIR. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] struct HirInfo { /// Represent yes/no questions by a bitfield to conserve space, since /// this is included in every HIR expression.