From b311a75884408193fe2d4611b33057519e437c3a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 6 Jan 2025 15:02:01 -0600 Subject: [PATCH 1/2] Fix repeated case-insensitive ASCII match The optimized bytecode for matching repetition of a single character overlooks case insensitivity. This resolves that by falling back to use an ASCII bitset when doing a case-insensitive match of a cased character. Fixes #785. --- Sources/_StringProcessing/ByteCodeGen.swift | 16 ++++++++++++---- Tests/RegexTests/MatchTests.swift | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index ae8a1c8b..6a00a0df 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -773,11 +773,19 @@ fileprivate extension Compiler.ByteCodeGen { case .atom(let atom): switch atom { case .char(let c): - // Single scalar ascii value character - guard let val = c._singleScalarAsciiValue else { - return false + if options.isCaseInsensitive && c.isCased { + // Cased character with case-insensitive matching; match only as an ASCII bitset + guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } else { + // Uncased character OR case-sensitive matching; match as a single scalar ascii value character + guard let val = c._singleScalarAsciiValue else { + return false + } + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) } - builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) case .any: builder.buildQuantifyAny( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 1b5c67e0..256372b3 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -2193,6 +2193,25 @@ extension RegexTests { ("cafe", true), ("CaFe", true), ("EfAc", true)) + + matchTest( + #"(?i)a+b"#, + ("ab", true), + ("Ab", true), + ("aB", true), + ("AB", true)) + matchTest( + #"(?i)a?b"#, + ("ab", true), + ("Ab", true), + ("aB", true), + ("AB", true)) + matchTest( + #"(?i)[a]?b"#, + ("ab", true), + ("Ab", true), + ("aB", true), + ("AB", true)) } func testNonSemanticWhitespace() { From cf27df1d897f156716bdc4abd1c628720a6003db Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 7 Jan 2025 17:37:11 -0600 Subject: [PATCH 2/2] Expand test coverage --- Tests/RegexTests/MatchTests.swift | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 256372b3..c52560d6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -2199,19 +2199,35 @@ extension RegexTests { ("ab", true), ("Ab", true), ("aB", true), - ("AB", true)) + ("AB", true), + ("AaAab", true), + ("aaaAB", true)) matchTest( - #"(?i)a?b"#, + #"^(?i)a?b$"#, ("ab", true), ("Ab", true), ("aB", true), - ("AB", true)) + ("AB", true), + ("aaB", false), + ("b", true), + ("B", true)) matchTest( - #"(?i)[a]?b"#, + #"^(?i)[a]?b$"#, ("ab", true), ("Ab", true), ("aB", true), - ("AB", true)) + ("AB", true), + ("b", true), + ("B", true)) + matchTest( + #"^(?i)a{2,4}b$"#, + ("ab", false), + ("Ab", false), + ("AaB", true), + ("aAB", true), + ("aAaB", true), + ("aAaAB", true), + ("AaAaAB", false)) } func testNonSemanticWhitespace() {