From 29faff1e83e2bdedb0067dc79ccd8c53dca23cfe Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 10:09:08 -0600 Subject: [PATCH 01/15] Add MatchingOptionSet AST.MatchingOptionSet is an OptionSet representation of the AST.MatchingOption enum (I wish we had a language feature for this transformation). This also adds the capability for AST.MatchingOptionSequence to convert itself to an OptionSet, adding or removing options as needed. Include additional options for the three potential semantic levels. --- .../Regex/AST/MatchingOptions.swift | 84 ++++++++++++++++++- .../Regex/Parse/LexicalAnalysis.swift | 5 ++ 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift index dd331ca3e..7b9ea8358 100644 --- a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift +++ b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift @@ -12,7 +12,7 @@ extension AST { /// An option written in source that changes matching semantics. public struct MatchingOption: Hashable { - public enum Kind { + public enum Kind: Int { // PCRE options case caseInsensitive // i case allowDuplicateGroupNames // J @@ -36,6 +36,11 @@ extension AST { // be unset, only flipped between) case textSegmentGraphemeMode // y{g} case textSegmentWordMode // y{w} + + // Swift semantic matching level + case graphemeClusterSemantics // X + case unicodeScalarSemantics // u + case byteSemantics // b } public var kind: Kind public var location: SourceLocation @@ -53,6 +58,15 @@ extension AST { return false } } + + public var isSemanticMatchingLevel: Bool { + switch kind { + case .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics: + return true + default: + return false + } + } } /// A sequence of matching options written in source. 
@@ -79,6 +93,74 @@ extension AST { self.minusLoc = minusLoc self.removing = removing } + + public func options(merging optionSet: MatchingOptionSet = []) -> MatchingOptionSet { + var result = optionSet + for opt in adding { + if opt.isSemanticMatchingLevel { + result.remove(.semanticMatchingLevels) + } + if opt.isTextSegmentMode { + result.remove(.textSegmentOptions) + } + + result.insert(.init(opt.kind)) + } + for opt in removing { + result.remove(.init(opt.kind)) + } + return result + } + } + + /// A set of matching options. + public struct MatchingOptionSet: OptionSet { + public var rawValue: UInt32 + + public init(rawValue: UInt32) { + self.rawValue = rawValue + } + + public init(_ kind: AST.MatchingOption.Kind) { + self.rawValue = 1 << kind.rawValue + } + + // PCRE options + public static var caseInsensitive: Self { .init(.caseInsensitive) } + public static var allowDuplicateGroupNames: Self { .init(.allowDuplicateGroupNames) } + public static var multiline: Self { .init(.multiline) } + public static var noAutoCapture: Self { .init(.noAutoCapture) } + public static var singleLine: Self { .init(.singleLine) } + public static var reluctantByDefault: Self { .init(.reluctantByDefault) } + public static var extended: Self { .init(.extended) } + public static var extraExtended: Self { .init(.extraExtended) } + + // ICU options + public static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) } + + // Oniguruma options + public static var asciiOnlyDigit: Self { .init(.asciiOnlyDigit) } + public static var asciiOnlyPOSIXProps: Self { .init(.asciiOnlyPOSIXProps) } + public static var asciiOnlySpace: Self { .init(.asciiOnlySpace) } + public static var asciiOnlyWord: Self { .init(.asciiOnlyWord) } + + // Oniguruma text segment options (these are mutually exclusive and cannot + // be unset, only flipped between) + public static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } + public static var textSegmentWordMode: Self { 
.init(.textSegmentWordMode) } + + public static var textSegmentOptions: Self { + [.textSegmentGraphemeMode, .textSegmentWordMode] + } + + // Swift semantic matching level + public static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } + public static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } + public static var byteSemantics: Self { .init(.byteSemantics) } + + public static var semanticMatchingLevels: Self { + [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] + } } } diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 99e97727e..db220faf5 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -550,6 +550,11 @@ extension Source { try src.expect("}") return opt + // Swift semantic level options + case "X": return advanceAndReturn(.graphemeClusterSemantics) + case "u": return advanceAndReturn(.unicodeScalarSemantics) + case "b": return advanceAndReturn(.byteSemantics) + default: return nil } From 2086a063f6f1e7105f0e7bee922d47600607f4c4 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 10:10:48 -0600 Subject: [PATCH 02/15] Remove shortcut for `.` matching Nothing's ever this simple -- `.` behavior depends on current options --- Sources/_StringProcessing/Compiler.swift | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 1dbee7bcd..445b410e1 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,11 +42,6 @@ class Compiler { func emit(_ node: AST) throws { switch node { - // Any: . 
- // consume 1 - case .atom(let a) where a.kind == .any && matchLevel == .graphemeCluster: - builder.buildAdvance(1) - // Single characters we just match case .atom(let a) where a.singleCharacter != nil : builder.buildMatch(a.singleCharacter!) From db077d56949fd8d32b1d64eae14e6664f191544f Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 10:27:00 -0600 Subject: [PATCH 03/15] Track the current matching options within the compiler This also makes a little progress in getting off the old CharacterClass / CC.MatchLevel types, and implements the correct matching behavior for `.` depending on matching level and the 's' flag. --- .../_StringProcessing/CharacterClass.swift | 2 - Sources/_StringProcessing/Compiler.swift | 63 ++++++++++++++++--- .../_StringProcessing/ConsumerInterface.swift | 41 +++++++----- Tests/RegexTests/LegacyTests.swift | 8 +-- Tests/RegexTests/MatchTests.swift | 8 ++- 5 files changed, 90 insertions(+), 32 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 149af7d53..0594f7a35 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -338,8 +338,6 @@ extension AST.Atom { switch kind { case let .escaped(b): return b.characterClass - case .any: return .any - case .property: // TODO: Would our model type for character classes include // this? Or does grapheme-semantic mode complicate that? 
diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 445b410e1..1de9b245d 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -18,18 +18,17 @@ struct RegexProgram { class Compiler { let ast: AST - let matchLevel: CharacterClass.MatchLevel - let options: REOptions + private var optionStack: MatchingOptionSetStack private var builder = RegexProgram.Program.Builder() + private var currentOptions: AST.MatchingOptionSet { optionStack.top } + init( ast: AST, - matchLevel: CharacterClass.MatchLevel = .graphemeCluster, - options: REOptions = [] + options: AST.MatchingOptionSet = [.init(.graphemeClusterSemantics)] ) { self.ast = ast - self.matchLevel = matchLevel - self.options = options + self.optionStack = MatchingOptionSetStack(options) } __consuming func emit() throws -> RegexProgram { @@ -102,6 +101,17 @@ class Compiler { try emit(g.child) builder.buildEndCapture(cap) + case .changeMatchingOptions(let optionSequence, isIsolated: let isIsolated): + let updated = optionSequence.options(merging: currentOptions) + optionStack.push(updated) + + try emit(g.child) + if !isIsolated { + optionStack.pop() + } + + // TODO: `optionStack.pop()` whenever a group ends + default: // FIXME: Other kinds... try emit(g.child) @@ -113,8 +123,8 @@ class Compiler { // For now, we model sets and atoms as consumers. // This lets us rapidly expand support, and we can better // design the actual instruction set with real examples - case _ where try node.generateConsumer(matchLevel) != nil: - try builder.buildConsume(by: node.generateConsumer(matchLevel)!) + case _ where try node.generateConsumer(optionStack.top) != nil: + try builder.buildConsume(by: node.generateConsumer(optionStack.top)!) case .quote(let q): // We stick quoted content into read-only constant strings @@ -469,6 +479,43 @@ class Compiler { } } +/// A stack of MatchingOptionSets that never pops its initial element. 
+fileprivate struct MatchingOptionSetStack { + internal var stack: [AST.MatchingOptionSet] + + init(_ initial: AST.MatchingOptionSet) { + self.stack = [initial] + } + + var top: AST.MatchingOptionSet { stack.last.unsafelyUnwrapped } + + mutating func push(_ set: AST.MatchingOptionSet) { + stack.append(set) + } + + mutating func pop() { + if stack.count > 1 { + _ = stack.removeLast() + } + } +} + +// Deprecated matchLevel-based initializer +extension Compiler { + @available(*, deprecated) + convenience init( + ast: AST, + matchLevel: CharacterClass.MatchLevel, + options: REOptions = [] + ) { + if matchLevel == .graphemeCluster { + self.init(ast: ast, options: .init(.graphemeClusterSemantics)) + } else { + self.init(ast: ast, options: .init(.unicodeScalarSemantics)) + } + } +} + public func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index fb414e1df..e34b850b2 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -28,12 +28,8 @@ func unsupported( file: StaticString = #file, line: UInt = #line ) -> Unsupported { - // TODO: how do we not have a public init for this? - let fStr = file.withUTF8Buffer { - String(decoding: $0, as: UTF8.self) - } return Unsupported( - message: s, file: fStr, line: Int(line)) + message: s, file: String(describing: file), line: Int(line)) } extension AST { @@ -42,8 +38,7 @@ extension AST { /// A consumer is a Swift closure that matches against /// the front of an input range func generateConsumer( - // TODO: Better option modeling - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction? 
{ switch self { case .atom(let a): @@ -77,10 +72,13 @@ extension AST.Atom { } func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction? { // TODO: Wean ourselves off of this type... - if let cc = self.characterClass?.withMatchLevel(opts) { + let matchLevel: CharacterClass.MatchLevel = opts.contains(.unicodeScalarSemantics) + ? .unicodeScalar + : .graphemeCluster + if let cc = self.characterClass?.withMatchLevel(matchLevel) { return { input, bounds in // FIXME: should we worry about out of bounds? cc.matches(in: input, at: bounds.lowerBound) @@ -109,9 +107,20 @@ extension AST.Atom { // TODO: alias? casing? $0.name == name || $0.nameAlias == name } + + case .any: + return { input, bounds in + let curIndex = bounds.lowerBound + if !opts.contains(.singleLine) && input[curIndex].isNewline { + return nil + } + return opts.contains(.graphemeClusterSemantics) + ? input.index(after: curIndex) + : input.unicodeScalars.index(after: curIndex) + } case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .any, .startOfLine, .endOfLine, + .startOfLine, .endOfLine, .backreference, .subpattern, .condition: // FIXME: implement return nil @@ -121,7 +130,7 @@ extension AST.Atom { extension AST.CustomCharacterClass.Member { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { case .custom(let ccc): @@ -212,7 +221,7 @@ extension AST.CustomCharacterClass.Member { extension AST.CustomCharacterClass { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant let consumers = try members.map { @@ -265,7 +274,7 @@ private func consumeScalar( extension AST.Atom.CharacterProperty { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> 
Program.ConsumeFunction { // Handle inversion for us, albeit not efficiently func invert( @@ -335,7 +344,7 @@ extension AST.Atom.CharacterProperty { extension Unicode.BinaryProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { @@ -499,7 +508,7 @@ extension Unicode.BinaryProperty { extension Unicode.POSIXProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) -> Program.ConsumeFunction { // FIXME: semantic levels, modes, etc switch self { @@ -545,7 +554,7 @@ extension Unicode.POSIXProperty { extension Unicode.ExtendedGeneralCategory { // FIXME: Semantic level func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: AST.MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { case .letter: diff --git a/Tests/RegexTests/LegacyTests.swift b/Tests/RegexTests/LegacyTests.swift index 785d4293e..ede9d2066 100644 --- a/Tests/RegexTests/LegacyTests.swift +++ b/Tests/RegexTests/LegacyTests.swift @@ -154,14 +154,14 @@ private func performTest( extension RegexTests { func testLegacyCompile() { - func performTest(_ input: String, _ expecting: RECode) { + func performTest(_ input: String, _ expecting: RECode, line: UInt = #line) { let recode = try! 
compile(input) guard recode == expecting else { XCTFail(""" Expected: \(expecting) Found: \(recode) - """) + """, line: line) return } } @@ -292,7 +292,7 @@ extension RegexTests { recode( label(0), split(disfavoring: 1), .beginGroup, - label(2), split(disfavoring: 3), .characterClass(.any), goto(label: 2), + label(2), split(disfavoring: 3), .any, goto(label: 2), label(3), .endGroup, goto(label: 0), @@ -302,7 +302,7 @@ extension RegexTests { "a.*?b+?c??", recode("a", label(0), split(disfavoring: 1), goto(label: 2), - label(1), .characterClass(.any), goto(label: 0), + label(1), .any, goto(label: 0), label(2), label(3), "b", split(disfavoring: 3), split(disfavoring: 4), goto(label: 5), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 1717b6a7d..4409d2f3d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1098,6 +1098,11 @@ extension RegexTests { ) } + func testSingleLineMode() { + firstMatchTest(#".+"#, input: "a\nb", match: "a") + firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") + } + // MARK: Character Semantics var eComposed: String { "é" } @@ -1256,8 +1261,7 @@ extension RegexTests { // a single Unicode scalar value, leaving any other grapheme scalar // components to be matched. 
- firstMatchTest(#"(?u:.)"#, input: eDecomposed, match: "e", - xfail: true) + firstMatchTest(#"(?u:.)"#, input: eDecomposed, match: "e") matchTest( #".\u{301}"#, From bf0a8f583a8add4467d2df66918be7c057c6b19a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 11:35:08 -0600 Subject: [PATCH 04/15] Correctly scope matching options within groups --- Sources/_StringProcessing/Compiler.swift | 41 ++++++++++++++++-------- Tests/RegexTests/MatchTests.swift | 21 ++++++++++++ 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 1de9b245d..d898da62f 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -21,7 +21,12 @@ class Compiler { private var optionStack: MatchingOptionSetStack private var builder = RegexProgram.Program.Builder() - private var currentOptions: AST.MatchingOptionSet { optionStack.top } + private var currentOptions: AST.MatchingOptionSet { + guard let top = optionStack.top else { + fatalError("Unbalanced matching options removal") + } + return top + } init( ast: AST, @@ -85,6 +90,9 @@ class Compiler { break case .group(let g): + optionStack.push(currentOptions) + defer { optionStack.pop() } + if let lookaround = g.lookaroundKind { try emitLookaround(lookaround, g.child) return @@ -103,14 +111,15 @@ class Compiler { case .changeMatchingOptions(let optionSequence, isIsolated: let isIsolated): let updated = optionSequence.options(merging: currentOptions) - optionStack.push(updated) - try emit(g.child) - if !isIsolated { + if isIsolated { + optionStack.replaceTop(updated) + try emit(g.child) + } else { + optionStack.push(updated) + try emit(g.child) optionStack.pop() } - - // TODO: `optionStack.pop()` whenever a group ends default: // FIXME: Other kinds... @@ -123,8 +132,8 @@ class Compiler { // For now, we model sets and atoms as consumers. 
// This lets us rapidly expand support, and we can better // design the actual instruction set with real examples - case _ where try node.generateConsumer(optionStack.top) != nil: - try builder.buildConsume(by: node.generateConsumer(optionStack.top)!) + case _ where try node.generateConsumer(currentOptions) != nil: + try builder.buildConsume(by: node.generateConsumer(currentOptions)!) case .quote(let q): // We stick quoted content into read-only constant strings @@ -479,7 +488,7 @@ class Compiler { } } -/// A stack of MatchingOptionSets that never pops its initial element. +/// A stack of `MatchingOptionSet`s. fileprivate struct MatchingOptionSetStack { internal var stack: [AST.MatchingOptionSet] @@ -487,16 +496,20 @@ fileprivate struct MatchingOptionSetStack { self.stack = [initial] } - var top: AST.MatchingOptionSet { stack.last.unsafelyUnwrapped } + var top: AST.MatchingOptionSet? { stack.last } mutating func push(_ set: AST.MatchingOptionSet) { stack.append(set) } - mutating func pop() { - if stack.count > 1 { - _ = stack.removeLast() - } + mutating func replaceTop(_ set: AST.MatchingOptionSet) { + stack.removeLast() + stack.append(set) + } + + @discardableResult + mutating func pop() -> AST.MatchingOptionSet { + stack.removeLast() } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4409d2f3d..a4d971605 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1103,6 +1103,27 @@ extension RegexTests { firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + func testMatchingOptionsScope() { + // `.` only matches newlines when the 's' option (single-line mode) + // is turned on. Standalone option-setting groups (e.g. `(?s)`) are + // scoped only to the current group. 
+ + firstMatchTest(#"(?s)a.b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"((?s)a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?-s)((?s)a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?-s)(?s:a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"((?s)a).b"#, input: "a\nb", match: nil) + firstMatchTest(#"((?s))a.b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?:(?s))a.b"#, input: "a\nb", match: nil) + firstMatchTest(#"((?s)a(?s)).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)a(?-s).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)a(?-s:.b)"#, input: "a\nb", match: nil) + firstMatchTest(#"(?:(?s)a).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + } + // MARK: Character Semantics var eComposed: String { "é" } From 29d8f0de59f8646f36826619a46b83dbdf9c5601 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 14 Jan 2022 12:49:53 -0600 Subject: [PATCH 05/15] Move MatchingOptionSet up to the compiler level --- .../Regex/AST/MatchingOptions.swift | 68 ---------- Sources/_StringProcessing/Compiler.swift | 48 +------ .../_StringProcessing/ConsumerInterface.swift | 16 +-- .../_StringProcessing/MatchingOptionSet.swift | 121 ++++++++++++++++++ 4 files changed, 135 insertions(+), 118 deletions(-) create mode 100644 Sources/_StringProcessing/MatchingOptionSet.swift diff --git a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift index 7b9ea8358..8efbafb51 100644 --- a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift +++ b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift @@ -93,74 +93,6 @@ extension AST { self.minusLoc = minusLoc self.removing = removing } - - public func options(merging optionSet: MatchingOptionSet = []) -> MatchingOptionSet { - var result = optionSet - for 
opt in adding { - if opt.isSemanticMatchingLevel { - result.remove(.semanticMatchingLevels) - } - if opt.isTextSegmentMode { - result.remove(.textSegmentOptions) - } - - result.insert(.init(opt.kind)) - } - for opt in removing { - result.remove(.init(opt.kind)) - } - return result - } - } - - /// A set of matching options. - public struct MatchingOptionSet: OptionSet { - public var rawValue: UInt32 - - public init(rawValue: UInt32) { - self.rawValue = rawValue - } - - public init(_ kind: AST.MatchingOption.Kind) { - self.rawValue = 1 << kind.rawValue - } - - // PCRE options - public static var caseInsensitive: Self { .init(.caseInsensitive) } - public static var allowDuplicateGroupNames: Self { .init(.allowDuplicateGroupNames) } - public static var multiline: Self { .init(.multiline) } - public static var noAutoCapture: Self { .init(.noAutoCapture) } - public static var singleLine: Self { .init(.singleLine) } - public static var reluctantByDefault: Self { .init(.reluctantByDefault) } - public static var extended: Self { .init(.extended) } - public static var extraExtended: Self { .init(.extraExtended) } - - // ICU options - public static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) } - - // Oniguruma options - public static var asciiOnlyDigit: Self { .init(.asciiOnlyDigit) } - public static var asciiOnlyPOSIXProps: Self { .init(.asciiOnlyPOSIXProps) } - public static var asciiOnlySpace: Self { .init(.asciiOnlySpace) } - public static var asciiOnlyWord: Self { .init(.asciiOnlyWord) } - - // Oniguruma text segment options (these are mutually exclusive and cannot - // be unset, only flipped between) - public static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } - public static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } - - public static var textSegmentOptions: Self { - [.textSegmentGraphemeMode, .textSegmentWordMode] - } - - // Swift semantic matching level - public static var graphemeClusterSemantics: Self 
{ .init(.graphemeClusterSemantics) } - public static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } - public static var byteSemantics: Self { .init(.byteSemantics) } - - public static var semanticMatchingLevels: Self { - [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] - } } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index d898da62f..096672118 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -21,16 +21,13 @@ class Compiler { private var optionStack: MatchingOptionSetStack private var builder = RegexProgram.Program.Builder() - private var currentOptions: AST.MatchingOptionSet { - guard let top = optionStack.top else { - fatalError("Unbalanced matching options removal") - } - return top + private var currentOptions: MatchingOptionSet { + return optionStack.top } init( ast: AST, - options: AST.MatchingOptionSet = [.init(.graphemeClusterSemantics)] + options: MatchingOptionSet = .default ) { self.ast = ast self.optionStack = MatchingOptionSetStack(options) @@ -109,17 +106,9 @@ class Compiler { try emit(g.child) builder.buildEndCapture(cap) - case .changeMatchingOptions(let optionSequence, isIsolated: let isIsolated): - let updated = optionSequence.options(merging: currentOptions) - - if isIsolated { - optionStack.replaceTop(updated) - try emit(g.child) - } else { - optionStack.push(updated) - try emit(g.child) - optionStack.pop() - } + case .changeMatchingOptions(let optionSequence, _): + optionStack.replaceTop(currentOptions.merging(optionSequence)) + try emit(g.child) default: // FIXME: Other kinds... @@ -488,31 +477,6 @@ class Compiler { } } -/// A stack of `MatchingOptionSet`s. -fileprivate struct MatchingOptionSetStack { - internal var stack: [AST.MatchingOptionSet] - - init(_ initial: AST.MatchingOptionSet) { - self.stack = [initial] - } - - var top: AST.MatchingOptionSet? 
{ stack.last } - - mutating func push(_ set: AST.MatchingOptionSet) { - stack.append(set) - } - - mutating func replaceTop(_ set: AST.MatchingOptionSet) { - stack.removeLast() - stack.append(set) - } - - @discardableResult - mutating func pop() -> AST.MatchingOptionSet { - stack.removeLast() - } -} - // Deprecated matchLevel-based initializer extension Compiler { @available(*, deprecated) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index e34b850b2..070fdff6f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -38,7 +38,7 @@ extension AST { /// A consumer is a Swift closure that matches against /// the front of an input range func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction? { switch self { case .atom(let a): @@ -72,7 +72,7 @@ extension AST.Atom { } func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction? { // TODO: Wean ourselves off of this type... 
let matchLevel: CharacterClass.MatchLevel = opts.contains(.unicodeScalarSemantics) @@ -130,7 +130,7 @@ extension AST.Atom { extension AST.CustomCharacterClass.Member { func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { case .custom(let ccc): @@ -221,7 +221,7 @@ extension AST.CustomCharacterClass.Member { extension AST.CustomCharacterClass { func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant let consumers = try members.map { @@ -274,7 +274,7 @@ private func consumeScalar( extension AST.Atom.CharacterProperty { func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction { // Handle inversion for us, albeit not efficiently func invert( @@ -344,7 +344,7 @@ extension AST.Atom.CharacterProperty { extension Unicode.BinaryProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { @@ -508,7 +508,7 @@ extension Unicode.BinaryProperty { extension Unicode.POSIXProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) -> Program.ConsumeFunction { // FIXME: semantic levels, modes, etc switch self { @@ -554,7 +554,7 @@ extension Unicode.POSIXProperty { extension Unicode.ExtendedGeneralCategory { // FIXME: Semantic level func generateConsumer( - _ opts: AST.MatchingOptionSet + _ opts: MatchingOptionSet ) throws -> Program.ConsumeFunction { switch self { case .letter: diff --git a/Sources/_StringProcessing/MatchingOptionSet.swift b/Sources/_StringProcessing/MatchingOptionSet.swift new file mode 100644 index 000000000..ad8775f02 --- /dev/null +++ b/Sources/_StringProcessing/MatchingOptionSet.swift @@ 
-0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import _MatchingEngine + +/// A set of matching options. +public struct MatchingOptionSet: OptionSet { + public var rawValue: UInt32 + + public init(rawValue: UInt32) { + self.rawValue = rawValue + } + + public init(_ kind: AST.MatchingOption.Kind) { + self.rawValue = 1 << kind.rawValue + } + + public static var `default`: Self { + [.graphemeClusterSemantics, .textSegmentGraphemeMode] + } + + // PCRE options + public static var caseInsensitive: Self { .init(.caseInsensitive) } + public static var allowDuplicateGroupNames: Self { .init(.allowDuplicateGroupNames) } + public static var multiline: Self { .init(.multiline) } + public static var noAutoCapture: Self { .init(.noAutoCapture) } + public static var singleLine: Self { .init(.singleLine) } + public static var reluctantByDefault: Self { .init(.reluctantByDefault) } + public static var extended: Self { .init(.extended) } + public static var extraExtended: Self { .init(.extraExtended) } + + // ICU options + public static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) } + + // Oniguruma options + public static var asciiOnlyDigit: Self { .init(.asciiOnlyDigit) } + public static var asciiOnlyPOSIXProps: Self { .init(.asciiOnlyPOSIXProps) } + public static var asciiOnlySpace: Self { .init(.asciiOnlySpace) } + public static var asciiOnlyWord: Self { .init(.asciiOnlyWord) } + + // Oniguruma text segment options (these are mutually exclusive and cannot + // be unset, only flipped between) + public static var textSegmentGraphemeMode: Self { 
.init(.textSegmentGraphemeMode) } + public static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } + + public static var textSegmentOptions: Self { + [.textSegmentGraphemeMode, .textSegmentWordMode] + } + + // Swift semantic matching level + public static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } + public static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } + public static var byteSemantics: Self { .init(.byteSemantics) } + + public static var semanticMatchingLevels: Self { + [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] + } +} + +extension MatchingOptionSet { + public func merging(_ sequence: AST.MatchingOptionSequence) -> MatchingOptionSet { + var result = self + for opt in sequence.adding { + if opt.isSemanticMatchingLevel { + result.remove(.semanticMatchingLevels) + } + if opt.isTextSegmentMode { + result.remove(.textSegmentOptions) + } + + result.insert(.init(opt.kind)) + } + for opt in sequence.removing { + result.remove(.init(opt.kind)) + } + return result + } +} + +/// A never-empty stack of `MatchingOptionSet`s. +struct MatchingOptionSetStack { + var stack: [MatchingOptionSet] + + init(_ initial: MatchingOptionSet) { + self.stack = [initial] + } + + private func _invariantCheck() { + assert(!stack.isEmpty, "Unbalanced matching options pop") + } + + var top: MatchingOptionSet { + _invariantCheck() + return stack.last! 
+ } + + mutating func push(_ set: MatchingOptionSet) { + stack.append(set) + } + + mutating func replaceTop(_ set: MatchingOptionSet) { + _invariantCheck() + stack.removeLast() + stack.append(set) + } + + @discardableResult + mutating func pop() -> MatchingOptionSet { + let result = stack.removeLast() + _invariantCheck() + return result + } +} From ffc1f5f1d2cda085cecd5bda5d8baf76a33305e1 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 14 Jan 2022 14:05:27 -0600 Subject: [PATCH 06/15] Simplify MatchingOptionSet down to what the compiler needs --- .../_StringProcessing/MatchingOptionSet.swift | 75 ++++++++----------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/Sources/_StringProcessing/MatchingOptionSet.swift b/Sources/_StringProcessing/MatchingOptionSet.swift index ad8775f02..8d3237540 100644 --- a/Sources/_StringProcessing/MatchingOptionSet.swift +++ b/Sources/_StringProcessing/MatchingOptionSet.swift @@ -12,61 +12,50 @@ import _MatchingEngine /// A set of matching options. -public struct MatchingOptionSet: OptionSet { - public var rawValue: UInt32 - - public init(rawValue: UInt32) { - self.rawValue = rawValue - } +struct MatchingOptionSet: RawRepresentable { + var rawValue: UInt32 +} - public init(_ kind: AST.MatchingOption.Kind) { +extension MatchingOptionSet { + init(_ kind: AST.MatchingOption.Kind) { self.rawValue = 1 << kind.rawValue } - - public static var `default`: Self { - [.graphemeClusterSemantics, .textSegmentGraphemeMode] + + fileprivate init(unchecked kinds: AST.MatchingOption.Kind...) 
{ + self.rawValue = 0 + for kind in kinds { + self.rawValue |= 1 << kind.rawValue + } } - // PCRE options - public static var caseInsensitive: Self { .init(.caseInsensitive) } - public static var allowDuplicateGroupNames: Self { .init(.allowDuplicateGroupNames) } - public static var multiline: Self { .init(.multiline) } - public static var noAutoCapture: Self { .init(.noAutoCapture) } - public static var singleLine: Self { .init(.singleLine) } - public static var reluctantByDefault: Self { .init(.reluctantByDefault) } - public static var extended: Self { .init(.extended) } - public static var extraExtended: Self { .init(.extraExtended) } - - // ICU options - public static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) } - - // Oniguruma options - public static var asciiOnlyDigit: Self { .init(.asciiOnlyDigit) } - public static var asciiOnlyPOSIXProps: Self { .init(.asciiOnlyPOSIXProps) } - public static var asciiOnlySpace: Self { .init(.asciiOnlySpace) } - public static var asciiOnlyWord: Self { .init(.asciiOnlyWord) } - - // Oniguruma text segment options (these are mutually exclusive and cannot - // be unset, only flipped between) - public static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } - public static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } - - public static var textSegmentOptions: Self { - [.textSegmentGraphemeMode, .textSegmentWordMode] + fileprivate mutating func remove(_ kind: Self) { + self.rawValue &= ~kind.rawValue + } + + fileprivate mutating func insert(_ kind: Self) { + self.rawValue |= kind.rawValue } - // Swift semantic matching level - public static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } - public static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } - public static var byteSemantics: Self { .init(.byteSemantics) } + fileprivate static var textSegmentOptions: Self { + Self(unchecked: .textSegmentGraphemeMode, .textSegmentWordMode) 
+ } - public static var semanticMatchingLevels: Self { - [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] + fileprivate static var semanticMatchingLevels: Self { + Self(unchecked: .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics) } } +// Compiler API extension MatchingOptionSet { - public func merging(_ sequence: AST.MatchingOptionSequence) -> MatchingOptionSet { + static var `default`: Self { + Self(unchecked: .graphemeClusterSemantics, .textSegmentGraphemeMode) + } + + func contains(_ kind: AST.MatchingOption.Kind) -> Bool { + self.rawValue & (1 << kind.rawValue) != 0 + } + + func merging(_ sequence: AST.MatchingOptionSequence) -> MatchingOptionSet { var result = self for opt in sequence.adding { if opt.isSemanticMatchingLevel { From 6e9a5db737c1e660e6858b75ffc454a722363ffb Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 14 Jan 2022 23:23:41 -0600 Subject: [PATCH 07/15] Don't allow removing semantic level options --- .../Regex/Parse/Diagnostics.swift | 3 +++ .../Regex/Parse/LexicalAnalysis.swift | 4 ++++ Tests/RegexTests/CompileTests.swift | 14 +++++++++++--- Tests/RegexTests/LexTests.swift | 19 ++++++++++++++++--- Tests/RegexTests/ParseTests.swift | 11 ++++++----- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index 7dd49049e..a9075799c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -45,6 +45,7 @@ enum ParseError: Error, Hashable { case expectedGroupSpecifier case cannotRemoveTextSegmentOptions + case cannotRemoveSemanticsOptions } extension ParseError: CustomStringConvertible { @@ -90,6 +91,8 @@ extension ParseError: CustomStringConvertible { return "expected group specifier" case .cannotRemoveTextSegmentOptions: return "text segment mode cannot be unset, only changed" + case .cannotRemoveSemanticsOptions: + 
return "matching semantics cannot be unset, only changed" } } } diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index db220faf5..db441a4a6 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -599,6 +599,10 @@ extension Source { if opt.isTextSegmentMode { throw ParseError.cannotRemoveTextSegmentOptions } + // Matching semantics options can only be added, not removed. + if opt.isSemanticMatchingLevel { + throw ParseError.cannotRemoveSemanticsOptions + } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6d8a45544..3bc7905fc 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -18,7 +18,9 @@ import XCTest extension RegexTests { private func testCompilationEquivalence( - _ equivs: [String] + _ equivs: [String], + file: StaticString = #file, + line: UInt = #line ) throws { assert(!equivs.isEmpty) let progs = try equivs.map { @@ -33,7 +35,8 @@ extension RegexTests { \(ref) Current: \(prog) - """) + """, + file: file, line: line) continue } } @@ -70,7 +73,12 @@ extension RegexTests { "(*positive_lookahead: assert)"], ["(?! 
assert)", "(*nla: assert)", - "(*negative_lookahead: assert)"] + "(*negative_lookahead: assert)"], + + ["(?i) case-insensitive", + "(?i: case-insensitive)"], + ["(?i) case-insensitive(?-i) post", + "(?i: case-insensitive) post"], ] for row in equivalents { diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 0bd01f930..f18b6839c 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -18,21 +18,23 @@ func diagnose( _ input: String, expecting expected: ParseError, _ syntax: SyntaxOptions = .traditional, - _ f: (inout Source) throws -> () + _ f: (inout Source) throws -> (), + file: StaticString = #file, + line: UInt = #line ) { var src = Source(input, syntax) do { try f(&src) XCTFail(""" Passed, but expected error: \(expected) - """) + """, file: file, line: line) } catch let e as Source.LocatedError { guard e.error == expected else { XCTFail(""" Expected: \(expected) Actual: \(e.error) - """) + """, file: file, line: line) return } } catch let e { @@ -108,6 +110,17 @@ extension RegexTests { diagnose("(?-y{w})", expecting: .cannotRemoveTextSegmentOptions) { _ = try $0.lexGroupStart() } + + // Semantic level options + diagnose("(?-X)", expecting: .cannotRemoveSemanticsOptions) { + _ = try $0.lexGroupStart() + } + diagnose("(?-u)", expecting: .cannotRemoveSemanticsOptions) { + _ = try $0.lexGroupStart() + } + diagnose("(?-b)", expecting: .cannotRemoveSemanticsOptions) { + _ = try $0.lexGroupStart() + } // Test expected group. 
diagnose(#"(*"#, expecting: .misc("Quantifier '*' must follow operand")) { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6dd38d28d..020fad88f 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -655,19 +655,20 @@ extension RegexTests { .singleLine, .reluctantByDefault, .extraExtended, .extended, .unicodeWordBoundaries, .asciiOnlyDigit, .asciiOnlyPOSIXProps, .asciiOnlySpace, .asciiOnlyWord, .textSegmentGraphemeMode, - .textSegmentWordMode + .textSegmentWordMode, .graphemeClusterSemantics, .unicodeScalarSemantics, + .byteSemantics ] - parseTest("(?iJmnsUxxxwDPSWy{g}y{w}-iJmnsUxxxwDPSW)", changeMatchingOptions( + parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( matchingOptions( adding: allOptions, - removing: allOptions.dropLast(2) + removing: allOptions.dropLast(5) ), isIsolated: true, empty()) ) - parseTest("(?iJmnsUxxxwDPSWy{g}y{w}-iJmnsUxxxwDPSW:)", changeMatchingOptions( + parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( matchingOptions( adding: allOptions, - removing: allOptions.dropLast(2) + removing: allOptions.dropLast(5) ), isIsolated: false, empty()) ) From f1999102cb28185ae65bff02238a70ee6e14b98e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 15 Jan 2022 00:16:35 -0600 Subject: [PATCH 08/15] Implement the 'U' reluctant default option --- Sources/_StringProcessing/Compiler.swift | 13 ++++++++++++- Tests/RegexTests/MatchTests.swift | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 096672118..5ed5eb220 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -455,7 +455,18 @@ class Compiler { func emitQuantification(_ quant: AST.Quantification) throws { let child = quant.child - let kind = quant.kind.value + + // If in reluctant-by-default mode, 
eager and reluctant need to be switched. + let kind: AST.Quantification.Kind + if currentOptions.contains(.reluctantByDefault) + && quant.kind.value != .possessive + { + kind = quant.kind.value == .eager + ? .reluctant + : .eager + } else { + kind = quant.kind.value + } switch quant.amount.value.bounds { case (_, atMost: 0): diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a4d971605..745be056d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -464,6 +464,20 @@ extension RegexTests { // TODO: Nested reluctant reentrant example, xfailed ) + // Reluctant by default - '*/+/?' and '*?/+?/??' are swapped + firstMatchTest("(?U)a*", input: "aaa", match: "") + firstMatchTest("(?U)a*a", input: "aaa", match: "a") + firstMatchTest("(?U)a*?", input: "aaa", match: "aaa") + firstMatchTest("(?U)a*?a", input: "aaa", match: "aaa") + + firstMatchTest("(?U)a+", input: "aaa", match: "a") + firstMatchTest("(?U)a+?", input: "aaa", match: "aaa") + + firstMatchTest("(?U)a?", input: "a", match: "") + firstMatchTest("(?U)a?a", input: "aaa", match: "a") + firstMatchTest("(?U)a??", input: "a", match: "a") + firstMatchTest("(?U)a??a", input: "aaa", match: "aa") + // TODO: After captures, easier to test these } From 57a550cac0c9c69f5344b99be8a1ed681254e051 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 15 Jan 2022 00:18:44 -0600 Subject: [PATCH 09/15] Implement options resetting When a group sets options with (?^abc), the options should reset before the new options are applied. 
--- Sources/_StringProcessing/MatchingOptionSet.swift | 4 ++++ Tests/RegexTests/CompileTests.swift | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Sources/_StringProcessing/MatchingOptionSet.swift b/Sources/_StringProcessing/MatchingOptionSet.swift index 8d3237540..aba40db39 100644 --- a/Sources/_StringProcessing/MatchingOptionSet.swift +++ b/Sources/_StringProcessing/MatchingOptionSet.swift @@ -57,6 +57,10 @@ extension MatchingOptionSet { func merging(_ sequence: AST.MatchingOptionSequence) -> MatchingOptionSet { var result = self + if sequence.caretLoc != nil { + result = .default + } + for opt in sequence.adding { if opt.isSemanticMatchingLevel { result.remove(.semanticMatchingLevels) diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 3bc7905fc..63e48fa61 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -27,7 +27,7 @@ extension RegexTests { try _compileRegex($0).engine.program } let ref = progs.first! - for prog in progs.dropFirst() { + for (prog, equiv) in zip(progs, equivs).dropFirst() { guard ref.instructions.elementsEqual( prog.instructions) else { XCTFail(""" @@ -35,6 +35,8 @@ extension RegexTests { \(ref) Current: \(prog) + Compiled from: + \(equiv) """, file: file, line: line) continue @@ -75,10 +77,12 @@ extension RegexTests { "(*nla: assert)", "(*negative_lookahead: assert)"], - ["(?i) case-insensitive", - "(?i: case-insensitive)"], - ["(?i) case-insensitive(?-i) post", - "(?i: case-insensitive) post"], + ["a+?", + "(?U)a+", + "(?U:a+)"], + ["a+", + "(?U)(?-U)a+", + "(?U)(?^s)a+"], ] for row in equivalents { From 4fd82dd3d03cf05b98d615f46a12c2bf0f832f16 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 18 Jan 2022 11:11:04 -0600 Subject: [PATCH 10/15] Collapse MatchingOptionSet/Stack interface This converts MatchingOptionSet & ...Stack to a single interface that is more semantically oriented toward the compiler. 
--- Sources/_StringProcessing/Compiler.swift | 57 ++++--- .../_StringProcessing/ConsumerInterface.swift | 36 ++--- .../_StringProcessing/MatchingOptionSet.swift | 114 ------------- .../_StringProcessing/MatchingOptions.swift | 151 ++++++++++++++++++ 4 files changed, 203 insertions(+), 155 deletions(-) delete mode 100644 Sources/_StringProcessing/MatchingOptionSet.swift create mode 100644 Sources/_StringProcessing/MatchingOptions.swift diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5ed5eb220..09eef0816 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -18,19 +18,13 @@ struct RegexProgram { class Compiler { let ast: AST - private var optionStack: MatchingOptionSetStack + private var options = MatchingOptions() private var builder = RegexProgram.Program.Builder() - private var currentOptions: MatchingOptionSet { - return optionStack.top - } - init( - ast: AST, - options: MatchingOptionSet = .default + ast: AST ) { self.ast = ast - self.optionStack = MatchingOptionSetStack(options) } __consuming func emit() throws -> RegexProgram { @@ -43,6 +37,9 @@ class Compiler { func emit(_ node: AST) throws { switch node { + case .atom(let a) where a.kind == .any: + try emitAny() + // Single characters we just match case .atom(let a) where a.singleCharacter != nil : builder.buildMatch(a.singleCharacter!) 
@@ -87,8 +84,8 @@ class Compiler { break case .group(let g): - optionStack.push(currentOptions) - defer { optionStack.pop() } + options.beginScope() + defer { options.endScope() } if let lookaround = g.lookaroundKind { try emitLookaround(lookaround, g.child) @@ -107,7 +104,7 @@ class Compiler { builder.buildEndCapture(cap) case .changeMatchingOptions(let optionSequence, _): - optionStack.replaceTop(currentOptions.merging(optionSequence)) + options.replaceCurrent(optionSequence) try emit(g.child) default: @@ -121,8 +118,8 @@ class Compiler { // For now, we model sets and atoms as consumers. // This lets us rapidly expand support, and we can better // design the actual instruction set with real examples - case _ where try node.generateConsumer(currentOptions) != nil: - try builder.buildConsume(by: node.generateConsumer(currentOptions)!) + case _ where try node.generateConsumer(options) != nil: + try builder.buildConsume(by: node.generateConsumer(options)!) case .quote(let q): // We stick quoted content into read-only constant strings @@ -155,6 +152,31 @@ class Compiler { throw unsupported(node.renderAsCanonical()) } } + + func emitAny() throws { + switch (options.semanticLevel, options.dotMatchesNewline) { + case (.graphemeCluster, true): + builder.buildAdvance(1) + case (.graphemeCluster, false): + builder.buildConsume { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + + case (.unicodeScalar, true): + // TODO: builder.buildAdvanceUnicodeScalar(1) + builder.buildConsume { input, bounds in + input.unicodeScalars.index(after: bounds.lowerBound) + } + case (.unicodeScalar, false): + builder.buildConsume { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + } func emitAssertion(_ kind: AST.Atom.AssertionKind) throws { // FIXME: Depends on API model we have... 
We may want to @@ -458,7 +480,7 @@ class Compiler { // If in reluctant-by-default mode, eager and reluctant need to be switched. let kind: AST.Quantification.Kind - if currentOptions.contains(.reluctantByDefault) + if options.isReluctantByDefault && quant.kind.value != .possessive { kind = quant.kind.value == .eager @@ -496,11 +518,8 @@ extension Compiler { matchLevel: CharacterClass.MatchLevel, options: REOptions = [] ) { - if matchLevel == .graphemeCluster { - self.init(ast: ast, options: .init(.graphemeClusterSemantics)) - } else { - self.init(ast: ast, options: .init(.unicodeScalarSemantics)) - } + self.init(ast: ast) + self.options.replaceMatchLevel(matchLevel) } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 070fdff6f..1badf1dfa 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -38,7 +38,7 @@ extension AST { /// A consumer is a Swift closure that matches against /// the front of an input range func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction? { switch self { case .atom(let a): @@ -72,13 +72,10 @@ extension AST.Atom { } func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction? { // TODO: Wean ourselves off of this type... - let matchLevel: CharacterClass.MatchLevel = opts.contains(.unicodeScalarSemantics) - ? .unicodeScalar - : .graphemeCluster - if let cc = self.characterClass?.withMatchLevel(matchLevel) { + if let cc = self.characterClass?.withMatchLevel(opts.matchLevel) { return { input, bounds in // FIXME: should we worry about out of bounds? 
cc.matches(in: input, at: bounds.lowerBound) @@ -109,18 +106,13 @@ extension AST.Atom { } case .any: - return { input, bounds in - let curIndex = bounds.lowerBound - if !opts.contains(.singleLine) && input[curIndex].isNewline { - return nil - } - return opts.contains(.graphemeClusterSemantics) - ? input.index(after: curIndex) - : input.unicodeScalars.index(after: curIndex) - } + fatalError(".atom(.any) is handled in emitAny") + case .startOfLine, .endOfLine: + // handled in emitAssertion + return nil + case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .startOfLine, .endOfLine, .backreference, .subpattern, .condition: // FIXME: implement return nil @@ -130,7 +122,7 @@ extension AST.Atom { extension AST.CustomCharacterClass.Member { func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { case .custom(let ccc): @@ -221,7 +213,7 @@ extension AST.CustomCharacterClass.Member { extension AST.CustomCharacterClass { func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant let consumers = try members.map { @@ -274,7 +266,7 @@ private func consumeScalar( extension AST.Atom.CharacterProperty { func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { // Handle inversion for us, albeit not efficiently func invert( @@ -344,7 +336,7 @@ extension AST.Atom.CharacterProperty { extension Unicode.BinaryProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { @@ -508,7 +500,7 @@ extension Unicode.BinaryProperty { extension Unicode.POSIXProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) -> Program.ConsumeFunction 
{ // FIXME: semantic levels, modes, etc switch self { @@ -554,7 +546,7 @@ extension Unicode.POSIXProperty { extension Unicode.ExtendedGeneralCategory { // FIXME: Semantic level func generateConsumer( - _ opts: MatchingOptionSet + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { case .letter: diff --git a/Sources/_StringProcessing/MatchingOptionSet.swift b/Sources/_StringProcessing/MatchingOptionSet.swift deleted file mode 100644 index aba40db39..000000000 --- a/Sources/_StringProcessing/MatchingOptionSet.swift +++ /dev/null @@ -1,114 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -import _MatchingEngine - -/// A set of matching options. -struct MatchingOptionSet: RawRepresentable { - var rawValue: UInt32 -} - -extension MatchingOptionSet { - init(_ kind: AST.MatchingOption.Kind) { - self.rawValue = 1 << kind.rawValue - } - - fileprivate init(unchecked kinds: AST.MatchingOption.Kind...) 
{ - self.rawValue = 0 - for kind in kinds { - self.rawValue |= 1 << kind.rawValue - } - } - - fileprivate mutating func remove(_ kind: Self) { - self.rawValue &= ~kind.rawValue - } - - fileprivate mutating func insert(_ kind: Self) { - self.rawValue |= kind.rawValue - } - - fileprivate static var textSegmentOptions: Self { - Self(unchecked: .textSegmentGraphemeMode, .textSegmentWordMode) - } - - fileprivate static var semanticMatchingLevels: Self { - Self(unchecked: .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics) - } -} - -// Compiler API -extension MatchingOptionSet { - static var `default`: Self { - Self(unchecked: .graphemeClusterSemantics, .textSegmentGraphemeMode) - } - - func contains(_ kind: AST.MatchingOption.Kind) -> Bool { - self.rawValue & (1 << kind.rawValue) != 0 - } - - func merging(_ sequence: AST.MatchingOptionSequence) -> MatchingOptionSet { - var result = self - if sequence.caretLoc != nil { - result = .default - } - - for opt in sequence.adding { - if opt.isSemanticMatchingLevel { - result.remove(.semanticMatchingLevels) - } - if opt.isTextSegmentMode { - result.remove(.textSegmentOptions) - } - - result.insert(.init(opt.kind)) - } - for opt in sequence.removing { - result.remove(.init(opt.kind)) - } - return result - } -} - -/// A never-empty stack of `MatchingOptionSet`s. -struct MatchingOptionSetStack { - var stack: [MatchingOptionSet] - - init(_ initial: MatchingOptionSet) { - self.stack = [initial] - } - - private func _invariantCheck() { - assert(!stack.isEmpty, "Unbalanced matching options pop") - } - - var top: MatchingOptionSet { - _invariantCheck() - return stack.last! 
- } - - mutating func push(_ set: MatchingOptionSet) { - stack.append(set) - } - - mutating func replaceTop(_ set: MatchingOptionSet) { - _invariantCheck() - stack.removeLast() - stack.append(set) - } - - @discardableResult - mutating func pop() -> MatchingOptionSet { - let result = stack.removeLast() - _invariantCheck() - return result - } -} diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift new file mode 100644 index 000000000..6db41595e --- /dev/null +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -0,0 +1,151 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import _MatchingEngine + +/// A type that represents the current state of regex matching options, with +/// stack-based scoping. +struct MatchingOptions { + /// A set of matching options. + fileprivate struct Representation: OptionSet, RawRepresentable { + var rawValue: UInt32 + + /// Options that comprise the mutually exclusive test segmentation group. + static var textSegmentOptions: Self { + Self(unchecked: .textSegmentGraphemeMode, .textSegmentWordMode) + } + + /// Options that comprise the mutually exclusive semantic matching level + /// group. + static var semanticMatchingLevels: Self { + Self(unchecked: .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics) + } + + /// The default set of options. 
+ static var `default`: Self { + Self(unchecked: .graphemeClusterSemantics, .textSegmentGraphemeMode) + } + + func contains(_ kind: AST.MatchingOption.Kind) -> Bool { + self.rawValue & (1 << kind.rawValue) != 0 + } + + /// Merges `sequence` with this option set, preserving the + mutating func merge(with sequence: AST.MatchingOptionSequence) { + if sequence.caretLoc != nil { + self = .default + } + + for opt in sequence.adding { + // If opt is in one of the mutually exclusive groups, clear out the + // group before inserting. + if opt.isSemanticMatchingLevel { + remove(.semanticMatchingLevels) + } + if opt.isTextSegmentMode { + remove(.textSegmentOptions) + } + + insert(.init(opt.kind)) + } + for opt in sequence.removing { + remove(.init(opt.kind)) + } + } + } + + fileprivate var stack: [Representation] + + fileprivate func _invariantCheck() { + assert(!stack.isEmpty, "Unbalanced call to endScope") + } +} + +// Compiler API +extension MatchingOptions { + /// Creates an instance with the default options. + init() { + self.stack = [.default] + } + + mutating func beginScope() { + stack.append(stack.last!) + } + + mutating func endScope() { + _ = stack.removeLast() + _invariantCheck() + } + + mutating func replaceCurrent(_ sequence: AST.MatchingOptionSequence) { + stack[stack.count - 1].merge(with: sequence) + } + + var isReluctantByDefault: Bool { + stack.last!.contains(.reluctantByDefault) + } + + var dotMatchesNewline: Bool { + stack.last!.contains(.singleLine) + } + + enum SemanticLevel { + case graphemeCluster + case unicodeScalar + // TODO: include? + // case byte + } + + var semanticLevel: SemanticLevel { + stack.last!.contains(.graphemeClusterSemantics) + ? .graphemeCluster + : .unicodeScalar + } +} + +// Deprecated CharacterClass.MatchLevel API +extension MatchingOptions { + @available(*, deprecated) + mutating func replaceMatchLevel(_ matchLevel: CharacterClass.MatchLevel) { + var result = stack.last! 
+ result.remove(.semanticMatchingLevels) + switch matchLevel { + case .graphemeCluster: + result.insert(.init(.graphemeClusterSemantics)) + case .unicodeScalar: + result.insert(.init(.unicodeScalarSemantics)) + } + stack[stack.count - 1] = result + } + + @available(*, deprecated) + var matchLevel: CharacterClass.MatchLevel { + switch semanticLevel { + case .graphemeCluster: + return .graphemeCluster + case .unicodeScalar: + return .unicodeScalar + } + } +} + +extension MatchingOptions.Representation { + fileprivate init(_ kind: AST.MatchingOption.Kind) { + self.rawValue = 1 << kind.rawValue + } + + fileprivate init(unchecked kinds: AST.MatchingOption.Kind...) { + self.rawValue = 0 + for kind in kinds { + self.rawValue |= 1 << kind.rawValue + } + } +} From a7a3917592b77c9931dfe9c93a1e6bf78d033d19 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 24 Jan 2022 14:56:19 -0600 Subject: [PATCH 11/15] Enforce exclusive options as a MatchingOptions invariant --- .../Regex/Parse/Diagnostics.swift | 2 +- Sources/_StringProcessing/Compiler.swift | 15 +----- .../_StringProcessing/MatchingOptions.swift | 51 +++++++++---------- Tests/RegexTests/LegacyTests.swift | 28 ---------- Tests/RegexTests/MatchTests.swift | 29 +++++++++-- 5 files changed, 50 insertions(+), 75 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index f1bc51a34..28421223d 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -106,7 +106,7 @@ extension ParseError: CustomStringConvertible { case .cannotRemoveTextSegmentOptions: return "text segment mode cannot be unset, only changed" case .cannotRemoveSemanticsOptions: - return "matching semantics cannot be unset, only changed" + return "semantic level cannot be unset, only changed" } } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 
7bbd5a6de..82fa3b994 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -107,7 +107,7 @@ class Compiler { builder.buildEndCapture(cap) case .changeMatchingOptions(let optionSequence, _): - options.replaceCurrent(optionSequence) + options.apply(optionSequence) try emit(g.child) default: @@ -513,19 +513,6 @@ class Compiler { } } -// Deprecated matchLevel-based initializer -extension Compiler { - @available(*, deprecated) - convenience init( - ast: AST, - matchLevel: CharacterClass.MatchLevel, - options: REOptions = [] - ) { - self.init(ast: ast) - self.options.replaceMatchLevel(matchLevel) - } -} - public func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 6db41595e..4259a0e5e 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -18,28 +18,38 @@ struct MatchingOptions { fileprivate struct Representation: OptionSet, RawRepresentable { var rawValue: UInt32 + // Text segmentation options + static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } + static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } + /// Options that comprise the mutually exclusive test segmentation group. static var textSegmentOptions: Self { - Self(unchecked: .textSegmentGraphemeMode, .textSegmentWordMode) + [.textSegmentGraphemeMode, .textSegmentWordMode] } + // Semantic matching level options + static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } + static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } + static var byteSemantics: Self { .init(.byteSemantics) } + /// Options that comprise the mutually exclusive semantic matching level /// group. 
static var semanticMatchingLevels: Self { - Self(unchecked: .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics) + [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] } /// The default set of options. static var `default`: Self { - Self(unchecked: .graphemeClusterSemantics, .textSegmentGraphemeMode) + [.graphemeClusterSemantics, .textSegmentGraphemeMode] } + /// Tests to see if the option denoted by `kind` is a member of this set. func contains(_ kind: AST.MatchingOption.Kind) -> Bool { self.rawValue & (1 << kind.rawValue) != 0 } - /// Merges `sequence` with this option set, preserving the - mutating func merge(with sequence: AST.MatchingOptionSequence) { + /// Applies the changes described by `sequence` to this set of options. + mutating func apply(_ sequence: AST.MatchingOptionSequence) { if sequence.caretLoc != nil { self = .default } @@ -66,6 +76,10 @@ struct MatchingOptions { fileprivate func _invariantCheck() { assert(!stack.isEmpty, "Unbalanced call to endScope") + + // Must contain exactly one of each mutually exclusive group + assert(stack.last!.intersection(.textSegmentOptions).rawValue.nonzeroBitCount == 1) + assert(stack.last!.intersection(.semanticMatchingLevels).rawValue.nonzeroBitCount == 1) } } @@ -74,10 +88,12 @@ extension MatchingOptions { /// Creates an instance with the default options. init() { self.stack = [.default] + _invariantCheck() } mutating func beginScope() { stack.append(stack.last!) 
+ _invariantCheck() } mutating func endScope() { @@ -85,8 +101,9 @@ extension MatchingOptions { _invariantCheck() } - mutating func replaceCurrent(_ sequence: AST.MatchingOptionSequence) { - stack[stack.count - 1].merge(with: sequence) + mutating func apply(_ sequence: AST.MatchingOptionSequence) { + stack[stack.count - 1].apply(sequence) + _invariantCheck() } var isReluctantByDefault: Bool { @@ -113,19 +130,6 @@ extension MatchingOptions { // Deprecated CharacterClass.MatchLevel API extension MatchingOptions { - @available(*, deprecated) - mutating func replaceMatchLevel(_ matchLevel: CharacterClass.MatchLevel) { - var result = stack.last! - result.remove(.semanticMatchingLevels) - switch matchLevel { - case .graphemeCluster: - result.insert(.init(.graphemeClusterSemantics)) - case .unicodeScalar: - result.insert(.init(.unicodeScalarSemantics)) - } - stack[stack.count - 1] = result - } - @available(*, deprecated) var matchLevel: CharacterClass.MatchLevel { switch semanticLevel { @@ -141,11 +145,4 @@ extension MatchingOptions.Representation { fileprivate init(_ kind: AST.MatchingOption.Kind) { self.rawValue = 1 << kind.rawValue } - - fileprivate init(unchecked kinds: AST.MatchingOption.Kind...) 
{ - self.rawValue = 0 - for kind in kinds { - self.rawValue |= 1 << kind.rawValue - } - } } diff --git a/Tests/RegexTests/LegacyTests.swift b/Tests/RegexTests/LegacyTests.swift index ede9d2066..316d1dac7 100644 --- a/Tests/RegexTests/LegacyTests.swift +++ b/Tests/RegexTests/LegacyTests.swift @@ -419,34 +419,6 @@ extension RegexTests { // expecting: .init(captures: "aaaa", capturesEqual: ==)) } - func testLegacyMatchLevel() throws { - let tests: Array<(String, chars: [String], unicodes: [String])> = [ - ("..", ["e\u{301}e\u{301}"], ["e\u{301}"]), - ] - - for (regex, characterInputs, scalarInputs) in tests { - let ast = try parse(regex, .traditional) - let program = try Compiler(ast: ast).emit() - let executor = Executor(program: program) - - let scalarProgram = try Compiler( - ast: ast, matchLevel: .unicodeScalar - ).emit() - let scalarExecutor = Executor( - program: scalarProgram, enablesTracing: false) - - for input in characterInputs { - XCTAssertNotNil(executor.execute(input: input)) - XCTAssertNil(scalarExecutor.execute(input: input)) - } - - for input in scalarInputs { - XCTAssertNotNil(scalarExecutor.execute(input: input)) - XCTAssertNil(executor.execute(input: input)) - } - } - } - func testLegacyPartialMatches() { let tests: Array<(String, pass: [(String, matched: String)], fail: [String])> = [ ("a+", diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 745be056d..e29907d8c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1317,12 +1317,31 @@ extension RegexTests { firstMatchTest(#"e\O"#, input: eComposed, match: nil, xfail: true) - // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.` matchTest( - #"(?U).\u{301}"#, - (eComposed, true), - (eDecomposed, true), - xfail: true) + #"(?u).\u{301}"#, + (eComposed, false), + (eDecomposed, true)) + firstMatchTest(#"(?u).$"#, input: eComposed, match: eComposed) + + // Option permutations for 'u' and 's' + matchTest( + 
#"...."#, + ("e\u{301}ab", false), + ("e\u{301}abc", true), + ("e\u{301}\nab", false)) + matchTest( + #"(?s)...."#, + ("e\u{301}ab", false), + ("e\u{301}abc", true), + ("e\u{301}\nab", true)) + matchTest( + #"(?u)...."#, + ("e\u{301}ab", true), + ("e\u{301}\na", false)) + matchTest( + #"(?us)...."#, + ("e\u{301}ab", true), + ("e\u{301}\na", true)) } // TODO: Add test for implied grapheme cluster requirement at group boundaries From 1345bfcc9c5c16ac7991062d0b0c2bbc447154bf Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 24 Jan 2022 23:30:08 -0600 Subject: [PATCH 12/15] Handle exclusivity of `x` and `xx` option flags --- .../_StringProcessing/CharacterClass.swift | 4 ++++ .../_StringProcessing/MatchingOptions.swift | 21 +++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 6fcd145de..a527ac363 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -342,6 +342,10 @@ extension AST.Atom { // TODO: Would our model type for character classes include // this? Or does grapheme-semantic mode complicate that? 
return nil + + case .any: + // 'any' is handled by Compiler.emitAny(), not `CharacterClass` + return nil default: return nil diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 4259a0e5e..592fbd13e 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -37,7 +37,16 @@ struct MatchingOptions { static var semanticMatchingLevels: Self { [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] } + + // Extended whitespace literal options + static var extended: Self { .init(.extended) } + static var extraExtended: Self { .init(.extraExtended) } + /// Options that affect whitespace in literals + static var literalWhitespaceOptions: Self { + [.extended, .extraExtended] + } + /// The default set of options. static var `default`: Self { [.graphemeClusterSemantics, .textSegmentGraphemeMode] @@ -57,17 +66,25 @@ struct MatchingOptions { for opt in sequence.adding { // If opt is in one of the mutually exclusive groups, clear out the // group before inserting. 
- if opt.isSemanticMatchingLevel { + if Self.semanticMatchingLevels.contains(opt.kind) { remove(.semanticMatchingLevels) } - if opt.isTextSegmentMode { + if Self.textSegmentOptions.contains(opt.kind) { remove(.textSegmentOptions) } + if Self.literalWhitespaceOptions.contains(opt.kind) { + remove(.literalWhitespaceOptions) + } insert(.init(opt.kind)) } for opt in sequence.removing { remove(.init(opt.kind)) + + // Removing either extended whitespace option removes both + if Self.literalWhitespaceOptions.contains(opt.kind) { + remove(.literalWhitespaceOptions) + } } } } From 25345cccca0bc667fdf31bc1dfb7417f2beaab6b Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 26 Jan 2022 16:39:36 -0600 Subject: [PATCH 13/15] Separate AST options from options in the compiler --- .../Regex/AST/MatchingOptions.swift | 2 +- .../_StringProcessing/MatchingOptions.swift | 236 ++++++++++++------ 2 files changed, 161 insertions(+), 77 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift index 8efbafb51..115a28af1 100644 --- a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift +++ b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift @@ -12,7 +12,7 @@ extension AST { /// An option written in source that changes matching semantics. public struct MatchingOption: Hashable { - public enum Kind: Int { + public enum Kind { // PCRE options case caseInsensitive // i case allowDuplicateGroupNames // J diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 592fbd13e..7509d3e2b 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -14,81 +14,6 @@ import _MatchingEngine /// A type that represents the current state of regex matching options, with /// stack-based scoping. struct MatchingOptions { - /// A set of matching options. 
- fileprivate struct Representation: OptionSet, RawRepresentable { - var rawValue: UInt32 - - // Text segmentation options - static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } - static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } - - /// Options that comprise the mutually exclusive test segmentation group. - static var textSegmentOptions: Self { - [.textSegmentGraphemeMode, .textSegmentWordMode] - } - - // Semantic matching level options - static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } - static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } - static var byteSemantics: Self { .init(.byteSemantics) } - - /// Options that comprise the mutually exclusive semantic matching level - /// group. - static var semanticMatchingLevels: Self { - [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] - } - - // Extended whitespace literal options - static var extended: Self { .init(.extended) } - static var extraExtended: Self { .init(.extraExtended) } - - /// Options that affect whitespace in literals - static var literalWhitespaceOptions: Self { - [.extended, .extraExtended] - } - - /// The default set of options. - static var `default`: Self { - [.graphemeClusterSemantics, .textSegmentGraphemeMode] - } - - /// Tests to see if the option denoted by `kind` is a member of this set. - func contains(_ kind: AST.MatchingOption.Kind) -> Bool { - self.rawValue & (1 << kind.rawValue) != 0 - } - - /// Applies the changes described by `sequence` to this set of options. - mutating func apply(_ sequence: AST.MatchingOptionSequence) { - if sequence.caretLoc != nil { - self = .default - } - - for opt in sequence.adding { - // If opt is in one of the mutually exclusive groups, clear out the - // group before inserting. 
- if Self.semanticMatchingLevels.contains(opt.kind) { - remove(.semanticMatchingLevels) - } - if Self.textSegmentOptions.contains(opt.kind) { - remove(.textSegmentOptions) - } - if Self.literalWhitespaceOptions.contains(opt.kind) { - remove(.literalWhitespaceOptions) - } - - insert(.init(opt.kind)) - } - for opt in sequence.removing { - remove(.init(opt.kind)) - - // Removing either extended whitespace option removes both - if Self.literalWhitespaceOptions.contains(opt.kind) { - remove(.literalWhitespaceOptions) - } - } - } - } - fileprivate var stack: [Representation] fileprivate func _invariantCheck() { @@ -108,16 +33,20 @@ extension MatchingOptions { _invariantCheck() } + /// Starts a new scope with the current options. mutating func beginScope() { stack.append(stack.last!) _invariantCheck() } + /// Ends the current scope. mutating func endScope() { _ = stack.removeLast() _invariantCheck() } + /// Updates the options in the current scope with the changes described by + /// `sequence`. mutating func apply(_ sequence: AST.MatchingOptionSequence) { stack[stack.count - 1].apply(sequence) _invariantCheck() @@ -158,8 +87,163 @@ extension MatchingOptions { } } +extension MatchingOptions { + /// An option that changes the behavior of a regular expression. 
+ fileprivate enum Option: Int { + // PCRE options + case caseInsensitive + case allowDuplicateGroupNames + case multiline + case noAutoCapture + case singleLine + case reluctantByDefault + + // ICU options + case unicodeWordBoundaries + + // NSRegularExpression compatibility options + // Not available via regex literal flags + case transparentBounds + case withoutAnchoringBounds + + // Oniguruma options + case asciiOnlyDigit + case asciiOnlyPOSIXProps + case asciiOnlySpace + case asciiOnlyWord + + // Oniguruma text segment options (these are mutually exclusive and cannot + // be unset, only flipped between) + case textSegmentGraphemeMode + case textSegmentWordMode + + // Swift semantic matching level + case graphemeClusterSemantics + case unicodeScalarSemantics + case byteSemantics + + init?(_ astKind: AST.MatchingOption.Kind) { + switch astKind { + case .caseInsensitive: + self = .caseInsensitive + case .allowDuplicateGroupNames: + self = .allowDuplicateGroupNames + case .multiline: + self = .multiline + case .noAutoCapture: + self = .noAutoCapture + case .singleLine: + self = .singleLine + case .reluctantByDefault: + self = .reluctantByDefault + case .unicodeWordBoundaries: + self = .unicodeWordBoundaries + case .asciiOnlyDigit: + self = .asciiOnlyDigit + case .asciiOnlyPOSIXProps: + self = .asciiOnlyPOSIXProps + case .asciiOnlySpace: + self = .asciiOnlySpace + case .asciiOnlyWord: + self = .asciiOnlyWord + case .textSegmentGraphemeMode: + self = .textSegmentGraphemeMode + case .textSegmentWordMode: + self = .textSegmentWordMode + case .graphemeClusterSemantics: + self = .graphemeClusterSemantics + case .unicodeScalarSemantics: + self = .unicodeScalarSemantics + case .byteSemantics: + self = .byteSemantics + + // Whitespace options are only relevant during parsing, not compilation. 
+ case .extended, .extraExtended: + return nil + @unknown default: + // Ignore unknown + return nil + } + } + + fileprivate var representation: Representation { + return .init(self) + } + } +} + +extension MatchingOptions { + /// A set of matching options. + fileprivate struct Representation: OptionSet, RawRepresentable { + var rawValue: UInt32 + + /// Returns `true` if the option denoted by `kind` is a member of this set. + func contains(_ kind: Option) -> Bool { + contains(.init(kind)) + } + + /// Applies the changes described by `sequence` to this set of options. + mutating func apply(_ sequence: AST.MatchingOptionSequence) { + // Replace entirely if the sequence includes a caret, e.g. `(?^is)`. + if sequence.caretLoc != nil { + self = .default + } + + for opt in sequence.adding { + guard let opt = Option(opt.kind)?.representation else { + continue + } + + // If opt is in one of the mutually exclusive groups, clear out the + // group before inserting. + if Self.semanticMatchingLevels.contains(opt) { + remove(.semanticMatchingLevels) + } + if Self.textSegmentOptions.contains(opt) { + remove(.textSegmentOptions) + } + + insert(opt) + } + + for opt in sequence.removing { + guard let opt = Option(opt.kind)?.representation else { + continue + } + + remove(opt) + } + } + } +} + extension MatchingOptions.Representation { - fileprivate init(_ kind: AST.MatchingOption.Kind) { + fileprivate init(_ kind: MatchingOptions.Option) { self.rawValue = 1 << kind.rawValue } + + // Text segmentation options + static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } + static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } + + /// Options that comprise the mutually exclusive text segmentation group.
+ static var textSegmentOptions: Self { + [.textSegmentGraphemeMode, .textSegmentWordMode] + } + + // Semantic matching level options + static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } + static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } + static var byteSemantics: Self { .init(.byteSemantics) } + + /// Options that comprise the mutually exclusive semantic matching level + /// group. + static var semanticMatchingLevels: Self { + [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] + } + + /// The default set of options. + static var `default`: Self { + [.graphemeClusterSemantics, .textSegmentGraphemeMode] + } } From 550068f831e8a43c3780d8ac797bb71db2121621 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 26 Jan 2022 16:42:04 -0600 Subject: [PATCH 14/15] Clarify Atom.any compilation --- Sources/_StringProcessing/CharacterClass.swift | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index a527ac363..b9f62fd35 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -344,8 +344,11 @@ extension AST.Atom { return nil case .any: - // 'any' is handled by Compiler.emitAny(), not `CharacterClass` - return nil + // 'any' is handled by Compiler.emitAny(), not `CharacterClass`, to + // provide lower level instructions than the CharacterClass-generated + // consumer closure + // TODO: throw an error instead of trapping here + fatalError(".any is handled in Compiler.emitAny()") default: return nil From 15c470d0ed65374d64b613d83ad6eb4cf4e9daaa Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 28 Jan 2022 10:53:14 -0600 Subject: [PATCH 15/15] Update Sources/_StringProcessing/CharacterClass.swift Co-authored-by: Michael Ilseman --- Sources/_StringProcessing/CharacterClass.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 280b70d1b..b51c2018d 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -347,6 +347,9 @@ extension AST.Atom { // `.any` is handled in the matching engine by Compiler.emitAny() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure + // + // FIXME: We shouldn't be returning `nil` here, but instead fixing the call + // site to check for `.any` before trying to construct a character class. return nil default: return nil