From 1784c0e2781da84f95a2b503334d4aa378ece0b1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 20 Jan 2022 21:05:32 +0000 Subject: [PATCH 1/4] Parse PCRE callout syntax Parse the `(?C)` syntax with an integer or string argument. This doesn't yet handle the Oniguruma specific callout syntax, which is a little more involved. --- Sources/_MatchingEngine/Regex/AST/Atom.swift | 20 ++- .../Regex/Parse/Diagnostics.swift | 3 + .../Regex/Parse/LexicalAnalysis.swift | 121 +++++++++++++++--- .../_MatchingEngine/Regex/Parse/Source.swift | 2 +- .../Regex/Printing/DumpAST.swift | 6 + .../Regex/Printing/PrintAsPattern.swift | 3 + Sources/_StringProcessing/ASTBuilder.swift | 4 + .../_StringProcessing/ConsumerInterface.swift | 2 +- Tests/RegexTests/ParseTests.swift | 21 +++ 9 files changed, 162 insertions(+), 20 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/AST/Atom.swift b/Sources/_MatchingEngine/Regex/AST/Atom.swift index 240ee729f..750351cb1 100644 --- a/Sources/_MatchingEngine/Regex/AST/Atom.swift +++ b/Sources/_MatchingEngine/Regex/AST/Atom.swift @@ -66,6 +66,9 @@ extension AST { // References case backreference(Reference) case subpattern(Reference) + + // (?C) + case callout(Callout) } } } @@ -443,6 +446,19 @@ extension AST.Atom { } } +extension AST.Atom { + public struct Callout: Hashable { + public enum Argument: Hashable { + case number(Int) + case string(String) + } + public var arg: AST.Located + public init(_ arg: AST.Located) { + self.arg = arg + } + } +} + extension AST.Atom { /// Retrieve the character value of the atom if it represents a literal /// character or unicode scalar, nil otherwise. 
@@ -458,7 +474,7 @@ extension AST.Atom { fallthrough case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter: + .backreference, .subpattern, .namedCharacter, .callout: return nil } } @@ -483,7 +499,7 @@ extension AST.Atom { return "\\M-\\C-\(x)" case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter: + .backreference, .subpattern, .namedCharacter, .callout: return nil } } diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index dcc7afc1c..574691288 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern case unknownGroupKind(String) + case unknownCalloutKind(String) case invalidMatchingOption(Character) case cannotRemoveMatchingOptionsAfterCaret @@ -86,6 +87,8 @@ extension ParseError: CustomStringConvertible { return "\(str) cannot be used as condition" case let .unknownGroupKind(str): return "unknown group kind '(\(str)'" + case let .unknownCalloutKind(str): + return "unknown callout kind '\(str)'" case let .invalidMatchingOption(c): return "invalid matching option '\(c)'" case .cannotRemoveMatchingOptionsAfterCaret: diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 26cc863cf..30d887931 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -631,8 +631,15 @@ extension Source { ) throws -> Located? { try recordLoc { src in try src.tryEating { src in - guard src.tryEat("(") else { return nil } + // There are some atoms that syntactically look like groups, bail here + // if we see any. 
Care needs to be taken here as e.g a group starting + // with '(?-' is a subpattern if the next character is a digit, + // otherwise a matching option specifier. Conversely, '(?P' can be the + // start of a matching option sequence, or a reference if it is followed + // by '=' or '<'. + guard !src.shouldLexGroupLikeAtom() else { return nil } + guard src.tryEat("(") else { return nil } if src.tryEat("?") { if src.tryEat(":") { return .nonCapture } if src.tryEat("|") { return .nonCaptureReset } @@ -658,15 +665,6 @@ extension Source { return .namedCapture(name) } - // Check if we can lex a group-like reference. Do this before matching - // options to avoid ambiguity with a group starting with (?-, which - // is a subpattern if the next character is a digit, otherwise a - // matching option specifier. In addition, we need to be careful with - // (?P, which can also be the start of a matching option sequence. - if src.canLexGroupLikeReference() { - return nil - } - // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). if let seq = try src.lexMatchingOptionSequence() { if src.tryEat(":") { @@ -1059,11 +1057,11 @@ extension Source { for openChar: Character ) -> Character { switch openChar { + // Identically-balanced delimiters. + case "'", "\"", "`", "^", "%", "#", "$": return openChar case "<": return ">" - case "'": return "'" case "{": return "}" - default: - fatalError("Not implemented") + default: fatalError("Not implemented") } } @@ -1204,6 +1202,24 @@ extension Source { return src.canLexNumberedReference() } + /// Whether a group specifier should be lexed as an atom instead of a group. + private func shouldLexGroupLikeAtom() -> Bool { + var src = self + guard src.tryEat("(") else { return false } + + if src.tryEat("?") { + // The start of a reference '(?P=', '(?R', ... + if src.canLexGroupLikeReference() { return true } + + // The start of a callout. 
+ if src.tryEat("C") { return true } + + return false + } + + return false + } + /// Consume an escaped atom, starting from after the backslash /// /// Escaped -> KeyboardModified | Builtin @@ -1265,6 +1281,78 @@ extension Source { } } + /// Try to consume a callout. + /// + /// Callout -> '(?C' CalloutBody ')' + /// CalloutBody -> '' | + /// | '`' '`' + /// | "'" "'" + /// | '"' '"' + /// | '^' '^' + /// | '%' '%' + /// | '#' '#' + /// | '$' '$' + /// | '{' '}' + /// + mutating func lexCallout() throws -> AST.Atom.Callout? { + guard tryEat(sequence: "(?C") else { return nil } + let arg = try recordLoc { src -> AST.Atom.Callout.Argument in + // Parse '(?C' followed by a number. + if let num = try src.lexNumber() { + return .number(num.value) + } + // '(?C)' is implicitly '(?C0)'. + if src.peek() == ")" { + return .number(0) + } + // Parse '(?C' followed by a set of balanced delimiters as defined by + // http://pcre.org/current/doc/html/pcre2pattern.html#SEC28 + if let open = src.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") { + let closing = String(Source.getClosingDelimiter(for: open)) + return .string(try src.expectQuoted(endingWith: closing).value) + } + // If we don't know what this syntax is, consume up to the ending ')' and + // emit an error. + let remaining = src.lexUntil { $0.isEmpty || $0.tryEat(")") }.value + if remaining.isEmpty { + throw ParseError.expected(")") + } + throw ParseError.unknownCalloutKind("(?C\(remaining))") + } + try expect(")") + return .init(arg) + } + + /// Consume a group-like atom, throwing an error if an atom could not be + /// produced. + /// + /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective + /// + mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind { + try recordLoc { src in + // References that look like groups, e.g (?R), (?1), ... 
+ if let ref = try src.lexGroupLikeReference() { + return ref.value + } + + // (?C) + if let callout = try src.lexCallout() { + return .callout(callout) + } + + // If we didn't produce an atom, consume up until a reasonable end-point + // and throw an error. + try src.expect("(") + let remaining = src.lexUntil { + $0.isEmpty || $0.tryEat(anyOf: ":", ")") != nil + }.value + if remaining.isEmpty { + throw ParseError.expected(")") + } + throw ParseError.unknownGroupKind(remaining) + }.value + } + /// Try to consume an Atom. /// @@ -1293,9 +1381,10 @@ extension Source { return .property(prop) } - // References that look like groups, e.g (?R), (?1), ... - if !customCC, let ref = try src.lexGroupLikeReference() { - return ref.value + // If we have group syntax that was skipped over in lexGroupStart, we + // need to handle it as an atom, or throw an error. + if !customCC && src.shouldLexGroupLikeAtom() { + return try src.expectGroupLikeAtom() } let char = src.eat() diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index d373fefa7..13188767b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -60,7 +60,7 @@ extension Source { var isEmpty: Bool { _slice.isEmpty } - mutating func peek() -> Char? { _slice.first } + func peek() -> Char? 
{ _slice.first } mutating func advance() { assert(!isEmpty) diff --git a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift index 3587f061e..95b3bc843 100644 --- a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift +++ b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift @@ -133,12 +133,18 @@ extension AST.Atom { case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" + case .callout(let c): return "\(c)" + case .char, .scalar: fatalError("Unreachable") } } } +extension AST.Atom.Callout: _ASTPrintable { + public var _dumpBase: String { "callout <\(arg.value)>" } +} + extension AST.Reference: _ASTPrintable { public var _dumpBase: String { "\(kind)" diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift index 298b2cad2..dbbddba1a 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift @@ -266,6 +266,9 @@ extension AST.Atom { case .subpattern: return " /* TODO: subpattern */" + + case .callout: + return " /* TODO: callout */" } } } diff --git a/Sources/_StringProcessing/ASTBuilder.swift b/Sources/_StringProcessing/ASTBuilder.swift index 1c7d4fc6b..ecf16bc8b 100644 --- a/Sources/_StringProcessing/ASTBuilder.swift +++ b/Sources/_StringProcessing/ASTBuilder.swift @@ -165,6 +165,10 @@ func groupCondition( .group(.init(.init(faking: kind), child, .fake)) } +func callout(_ arg: AST.Atom.Callout.Argument) -> AST { + atom(.callout(.init(.init(faking: arg)))) +} + func quant( _ amount: AST.Quantification.Amount, _ kind: AST.Quantification.Kind = .eager, diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index c417e40bc..2358c4f30 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -112,7 +112,7 @@ extension 
AST.Atom { case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .any, .startOfLine, .endOfLine, - .backreference, .subpattern: + .backreference, .subpattern, .callout: // FIXME: implement return nil } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 8f4590081..831b903b1 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1087,6 +1087,17 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()) ) + // MARK: Callouts + + parseTest(#"(?C)"#, callout(.number(0))) + parseTest(#"(?C0)"#, callout(.number(0))) + parseTest(#"(?C20)"#, callout(.number(20))) + parseTest("(?C{abc})", callout(.string("abc"))) + + for delim in ["`", "'", "\"", "^", "%", "#", "$"] { + parseTest("(?C\(delim)hello\(delim))", callout(.string("hello"))) + } + // MARK: Parse with delimiters parseWithDelimitersTest("'/a b/'", concat("a", " ", "b")) @@ -1150,6 +1161,9 @@ extension RegexTests { parseNotEqualTest(#"(?(VERSION=0.1))"#, #"(?(VERSION=0.2))"#) parseNotEqualTest(#"(?(VERSION=0.1))"#, #"(?(VERSION>=0.1))"#) + parseNotEqualTest("(?C0)", "(?C1)") + parseNotEqualTest("(?C0)", "(?C'hello')") + // TODO: failure tests } @@ -1242,6 +1256,8 @@ extension RegexTests { diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) + diagnosticTest("(?C", .expected(")")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) @@ -1277,5 +1293,10 @@ extension RegexTests { diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group")) + + // MARK: Callouts + + diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)")) + diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)")) } } From 7f3ee1f5ca5e549e299883e253013d392251e24d Mon Sep 17 00:00:00 2001 From: Hamish 
Knight Date: Thu, 20 Jan 2022 21:05:32 +0000 Subject: [PATCH 2/4] Parse PCRE backtracking directives This requires generalizing `shouldLexGroupLikeAtom` a bit to treat all `(*` groups as being atoms, and as such we need to special-case the PCRE2 explicit group syntax. We do it this way around to accommodate the extended Oniguruma callout syntax which also uses `(*`, which we aim to support. --- Sources/_MatchingEngine/Regex/AST/AST.swift | 14 ++ Sources/_MatchingEngine/Regex/AST/Atom.swift | 59 +++++++- .../Regex/Parse/Diagnostics.swift | 8 ++ .../Regex/Parse/LexicalAnalysis.swift | 132 ++++++++++++------ .../_MatchingEngine/Regex/Parse/Parse.swift | 7 +- .../_MatchingEngine/Regex/Parse/Source.swift | 2 + .../Regex/Printing/DumpAST.swift | 12 ++ .../Regex/Printing/PrintAsPattern.swift | 3 + Sources/_StringProcessing/ASTBuilder.swift | 8 ++ .../_StringProcessing/ConsumerInterface.swift | 2 +- Tests/RegexTests/ParseTests.swift | 36 ++++- 11 files changed, 238 insertions(+), 45 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/AST/AST.swift b/Sources/_MatchingEngine/Regex/AST/AST.swift index 168222133..7ad5e3edb 100644 --- a/Sources/_MatchingEngine/Regex/AST/AST.swift +++ b/Sources/_MatchingEngine/Regex/AST/AST.swift @@ -100,6 +100,20 @@ extension AST { return self.children?.any(\.hasCapture) ?? false } + + /// Whether this AST node may be used as the operand of a quantifier such as + /// `?`, `+` or `*`. 
+ public var isQuantifiable: Bool { + switch self { + case .atom(let a): + return a.isQuantifiable + case .group, .conditional, .customCharacterClass: + return true + case .alternation, .concatenation, .quantification, .quote, .trivia, + .empty, .groupTransform: + return false + } + } } // MARK: - AST types diff --git a/Sources/_MatchingEngine/Regex/AST/Atom.swift b/Sources/_MatchingEngine/Regex/AST/Atom.swift index 750351cb1..b47266432 100644 --- a/Sources/_MatchingEngine/Regex/AST/Atom.swift +++ b/Sources/_MatchingEngine/Regex/AST/Atom.swift @@ -69,6 +69,9 @@ extension AST { // (?C) case callout(Callout) + + // (*ACCEPT), (*FAIL), ... + case backtrackingDirective(BacktrackingDirective) } } } @@ -459,6 +462,46 @@ extension AST.Atom { } } +extension AST.Atom { + public struct BacktrackingDirective: Hashable { + public enum Kind: Hashable { + /// (*ACCEPT) + case accept + + /// (*FAIL) + case fail + + /// (*MARK:NAME) + case mark + + /// (*COMMIT) + case commit + + /// (*PRUNE) + case prune + + /// (*SKIP) + case skip + + /// (*THEN) + case then + } + public var kind: AST.Located + public var name: AST.Located? + + public init(_ kind: AST.Located, name: AST.Located?) { + self.kind = kind + self.name = name + } + + public var isQuantifiable: Bool { + // As per http://pcre.org/current/doc/html/pcre2pattern.html#SEC29, only + // (*ACCEPT) is quantifiable. + kind.value == .accept + } + } +} + extension AST.Atom { /// Retrieve the character value of the atom if it represents a literal /// character or unicode scalar, nil otherwise. 
@@ -474,7 +517,8 @@ extension AST.Atom { fallthrough case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout: + .backreference, .subpattern, .namedCharacter, .callout, + .backtrackingDirective: return nil } } @@ -499,10 +543,21 @@ extension AST.Atom { return "\\M-\\C-\(x)" case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout: + .backreference, .subpattern, .namedCharacter, .callout, + .backtrackingDirective: return nil } } + + public var isQuantifiable: Bool { + switch kind { + case .backtrackingDirective(let b): + return b.isQuantifiable + // TODO: Are callouts quantifiable? + default: + return true + } + } } extension AST { diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index 574691288..adb8810d7 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -38,6 +38,10 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern + case notQuantifiable + + case backtrackingDirectiveMustHaveName(String) + case unknownGroupKind(String) case unknownCalloutKind(String) @@ -81,6 +85,10 @@ extension ParseError: CustomStringConvertible { return "expected escape sequence" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" + case .notQuantifiable: + return "expression is not quantifiable" + case .backtrackingDirectiveMustHaveName(let b): + return "backtracking directive '\(b)' must include name" case let .tooManyBranchesInConditional(i): return "expected 2 branches in conditional, have \(i)" case let .unsupportedCondition(str): diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 30d887931..1cd1404be 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ 
b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -608,6 +608,49 @@ extension Source { return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) } + /// Try to consume explicitly spelled-out PCRE2 group syntax. + mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? { + tryEating { src in + guard src.tryEat(sequence: "(*") else { return nil } + + if src.tryEat(sequence: "atomic:") { + return .atomicNonCapturing + } + if src.tryEat(sequence: "pla:") || + src.tryEat(sequence: "positive_lookahead:") { + return .lookahead + } + if src.tryEat(sequence: "nla:") || + src.tryEat(sequence: "negative_lookahead:") { + return .negativeLookahead + } + if src.tryEat(sequence: "plb:") || + src.tryEat(sequence: "positive_lookbehind:") { + return .lookbehind + } + if src.tryEat(sequence: "nlb:") || + src.tryEat(sequence: "negative_lookbehind:") { + return .negativeLookbehind + } + if src.tryEat(sequence: "napla:") || + src.tryEat(sequence: "non_atomic_positive_lookahead:") { + return .nonAtomicLookahead + } + if src.tryEat(sequence: "naplb:") || + src.tryEat(sequence: "non_atomic_positive_lookbehind:") { + return .nonAtomicLookbehind + } + if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") { + return .scriptRun + } + if src.tryEat(sequence: "asr:") || + src.tryEat(sequence: "atomic_script_run:") { + return .atomicScriptRun + } + return nil + } + } + /// Try to consume the start of a group /// /// GroupStart -> '(?' GroupKind | '(' @@ -631,6 +674,11 @@ extension Source { ) throws -> Located? { try recordLoc { src in try src.tryEating { src in + // Explicitly spelled out PCRE2 syntax for some groups. This needs to be + // done before group-like atoms, as it uses the '(*' syntax, which is + // otherwise a group-like atom. + if let g = src.lexExplicitPCRE2GroupStart() { return g } + // There are some atoms that syntactically look like groups, bail here // if we see any. 
Care needs to be taken here as e.g a group starting // with '(?-' is a subpattern if the next character is a digit, @@ -691,45 +739,6 @@ extension Source { throw ParseError.unknownGroupKind("?\(next)") } - // Explicitly spelled out PRCE2 syntax for some groups. - if src.tryEat("*") { - if src.tryEat(sequence: "atomic:") { return .atomicNonCapturing } - - if src.tryEat(sequence: "pla:") || - src.tryEat(sequence: "positive_lookahead:") { - return .lookahead - } - if src.tryEat(sequence: "nla:") || - src.tryEat(sequence: "negative_lookahead:") { - return .negativeLookahead - } - if src.tryEat(sequence: "plb:") || - src.tryEat(sequence: "positive_lookbehind:") { - return .lookbehind - } - if src.tryEat(sequence: "nlb:") || - src.tryEat(sequence: "negative_lookbehind:") { - return .negativeLookbehind - } - if src.tryEat(sequence: "napla:") || - src.tryEat(sequence: "non_atomic_positive_lookahead:") { - return .nonAtomicLookahead - } - if src.tryEat(sequence: "naplb:") || - src.tryEat(sequence: "non_atomic_positive_lookbehind:") { - return .nonAtomicLookbehind - } - if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") { - return .scriptRun - } - if src.tryEat(sequence: "asr:") || - src.tryEat(sequence: "atomic_script_run:") { - return .atomicScriptRun - } - - throw ParseError.misc("Quantifier '*' must follow operand") - } - // (_:) if src.experimentalCaptures && src.tryEat(sequence: "_:") { return .nonCapture @@ -1216,6 +1225,8 @@ extension Source { return false } + // The start of a backtracking directive. + if src.tryEat("*") { return true } return false } @@ -1323,6 +1334,44 @@ extension Source { return .init(arg) } + /// Try to consume a backtracking directive. + /// + /// BacktrackingDirective -> '(*' BacktrackingDirectiveKind (':' )? ')' + /// BacktrackingDirectiveKind -> 'ACCEPT' | 'FAIL' | 'F' | 'MARK' | '' + /// | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN' + /// + mutating func lexBacktrackingDirective( + ) throws -> AST.Atom.BacktrackingDirective? 
{ + try tryEating { src in + guard src.tryEat(sequence: "(*") else { return nil } + let kind = src.recordLoc { src -> AST.Atom.BacktrackingDirective.Kind? in + if src.tryEat(sequence: "ACCEPT") { return .accept } + if src.tryEat(sequence: "FAIL") || src.tryEat("F") { return .fail } + if src.tryEat(sequence: "MARK") || src.peek() == ":" { return .mark } + if src.tryEat(sequence: "COMMIT") { return .commit } + if src.tryEat(sequence: "PRUNE") { return .prune } + if src.tryEat(sequence: "SKIP") { return .skip } + if src.tryEat(sequence: "THEN") { return .then } + return nil + } + guard let kind = kind else { return nil } + var name: Located? + if src.tryEat(":") { + // TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the + // name under PCRE2_ALT_VERBNAMES. + name = try src.expectQuoted(endingWith: ")", eatEnding: false) + } + try src.expect(")") + + // MARK directives must be named. + if name == nil && kind.value == .mark { + throw ParseError.backtrackingDirectiveMustHaveName( + String(src[kind.location.range])) + } + return .init(kind, name: name) + } + } + /// Consume a group-like atom, throwing an error if an atom could not be /// produced. /// @@ -1340,6 +1389,11 @@ extension Source { return .callout(callout) } + // (*ACCEPT), (*FAIL), (*MARK), ... + if let b = try src.lexBacktrackingDirective() { + return .backtrackingDirective(b) + } + // If we didn't produce an atom, consume up until a reasonable end-point // and throw an error. try src.expect("(") diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift index c8ff36b11..a633ee95b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Parse.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Parse.swift @@ -176,8 +176,11 @@ extension Parser { // Quantification -> QuantOperand Quantifier? 
if let operand = try parseQuantifierOperand() { if let (amt, kind) = try source.lexQuantifier() { - result.append(.quantification(.init( - amt, kind, operand, loc(_start)))) + let location = loc(_start) + guard operand.isQuantifiable else { + throw Source.LocatedError(ParseError.notQuantifiable, location) + } + result.append(.quantification(.init(amt, kind, operand, location))) } else { result.append(operand) } diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 13188767b..260ee3b63 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -27,6 +27,8 @@ public struct Source { self.bounds = str.startIndex ..< str.endIndex self.syntax = syntax } + + subscript(_ range: Range) -> Input.SubSequence { input[range] } } // MARK: - Prototype uses String diff --git a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift index 95b3bc843..f3a3a0252 100644 --- a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift +++ b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift @@ -135,6 +135,8 @@ extension AST.Atom { case .callout(let c): return "\(c)" + case .backtrackingDirective(let d): return "\(d)" + case .char, .scalar: fatalError("Unreachable") } @@ -245,3 +247,13 @@ extension AST.CustomCharacterClass.Range: _ASTPrintable { "\(lhs)-\(rhs)" } } + +extension AST.Atom.BacktrackingDirective: _ASTPrintable { + public var _dumpBase: String { + var result = "\(kind.value)" + if let name = name { + result += ": \(name.value)" + } + return result + } +} diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift index dbbddba1a..6b4e8ac8c 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift @@ -269,6 +269,9 @@ extension AST.Atom { case .callout: 
return " /* TODO: callout */" + + case .backtrackingDirective: + return " /* TODO: backtracking directive */" } } } diff --git a/Sources/_StringProcessing/ASTBuilder.swift b/Sources/_StringProcessing/ASTBuilder.swift index ecf16bc8b..ea30a09b4 100644 --- a/Sources/_StringProcessing/ASTBuilder.swift +++ b/Sources/_StringProcessing/ASTBuilder.swift @@ -169,6 +169,14 @@ func callout(_ arg: AST.Atom.Callout.Argument) -> AST { atom(.callout(.init(.init(faking: arg)))) } +func backtrackingDirective( + _ kind: AST.Atom.BacktrackingDirective.Kind, name: String? = nil +) -> AST { + atom(.backtrackingDirective( + .init(.init(faking: kind), name: name.map { .init(faking: $0) }) + )) +} + func quant( _ amount: AST.Quantification.Amount, _ kind: AST.Quantification.Kind = .eager, diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 2358c4f30..1961dae9c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -112,7 +112,7 @@ extension AST.Atom { case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout: + .backreference, .subpattern, .callout, .backtrackingDirective: // FIXME: implement return nil } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 831b903b1..1a10f9d53 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1098,6 +1098,22 @@ extension RegexTests { parseTest("(?C\(delim)hello\(delim))", callout(.string("hello"))) } + // MARK: Backtracking directives + + parseTest("(*ACCEPT)?", zeroOrOne(.eager, backtrackingDirective(.accept))) + parseTest( + "(*ACCEPT:a)??", + zeroOrOne(.reluctant, backtrackingDirective(.accept, name: "a")) + ) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) + parseTest("(*F)", 
backtrackingDirective(.fail)) + parseTest("(*COMMIT)", backtrackingDirective(.commit)) + parseTest("(*SKIP)", backtrackingDirective(.skip)) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP")) + parseTest("(*PRUNE)", backtrackingDirective(.prune)) + parseTest("(*THEN)", backtrackingDirective(.then)) + // MARK: Parse with delimiters parseWithDelimitersTest("'/a b/'", concat("a", " ", "b")) @@ -1164,6 +1180,11 @@ extension RegexTests { parseNotEqualTest("(?C0)", "(?C1)") parseNotEqualTest("(?C0)", "(?C'hello')") + parseNotEqualTest("(*ACCEPT)", "(*ACCEPT:a)") + parseNotEqualTest("(*MARK:a)", "(*MARK:b)") + parseNotEqualTest("(*:a)", "(*:b)") + parseNotEqualTest("(*FAIL)", "(*SKIP)") + // TODO: failure tests } @@ -1265,7 +1286,8 @@ extension RegexTests { // MARK: Group specifiers - diagnosticTest(#"(*"#, .misc("Quantifier '*' must follow operand")) + diagnosticTest(#"(*"#, .unknownGroupKind("*")) + diagnosticTest("(*X)", .unknownGroupKind("*X")) diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) @@ -1298,5 +1320,17 @@ extension RegexTests { diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)")) diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)")) + + // MARK: Backtracking directives + + diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) + diagnosticTest("(*:)", .expectedNonEmptyContents) + diagnosticTest("(*MARK:a)?", .notQuantifiable) + diagnosticTest("(*FAIL)+", .notQuantifiable) + diagnosticTest("(*COMMIT:b)*", .notQuantifiable) + diagnosticTest("(*PRUNE:a)??", .notQuantifiable) + diagnosticTest("(*SKIP:a)*?", .notQuantifiable) + diagnosticTest("(*F)+?", .notQuantifiable) + diagnosticTest("(*:a){2}", .notQuantifiable) } } From 4254dd8bfb7a08a417d4dbf0667572e805b63e13 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 20 Jan 2022 21:05:33 +0000 Subject: [PATCH 3/4] Add Source.tryEatWithLoc --- .../Regex/Parse/LexicalAnalysis.swift | 20 ++++++++++++------- 1 file 
changed, 13 insertions(+), 7 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 1cd1404be..1d3e1d426 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -140,6 +140,14 @@ extension Source { return result } + /// Attempt to eat the given character, returning its source location if + /// successful, `nil` otherwise. + mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? { + let start = currentPosition + guard tryEat(c) else { return nil } + return .init(start ..< currentPosition) + } + /// Throws an expected ASCII character error if not matched mutating func expectASCII() throws -> Located { try recordLoc { src in @@ -1475,15 +1483,13 @@ extension Source { ) throws -> (dashLoc: SourceLocation, AST.Atom)? { // Make sure we don't have a binary operator e.g '--', and the '-' is not // ending the custom character class (in which case it is literal). - let start = currentPosition - guard peekCCBinOp() == nil && !starts(with: "-]") && tryEat("-") else { - return nil - } - let dashLoc = Location(start ..< currentPosition) - guard let end = try lexAtom(context: context) else { + guard peekCCBinOp() == nil, !starts(with: "-]"), + let dash = tryEatWithLoc("-"), + let end = try lexAtom(context: context) + else { return nil } - return (dashLoc, end) + return (dash, end) } } From df6be0cede92ab9820b67adb29478d6fec3b200b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 20 Jan 2022 21:05:33 +0000 Subject: [PATCH 4/4] Parse .NET balanced captures This requires imposing some restrictions on what can be used as a group name to allow for the syntax `(?)`. For now, restrict the characters to letters, numbers and `_`, and forbid the first character from being a number. This should be no stricter than the rules imposed by PCRE, Oniguruma, ICU, Java and .NET. 
--- Sources/_MatchingEngine/Regex/AST/Group.swift | 27 +++++- .../Regex/Parse/CaptureStructure.swift | 3 + .../Regex/Parse/Diagnostics.swift | 9 ++ .../Regex/Parse/LexicalAnalysis.swift | 84 +++++++++++++++---- .../Regex/Printing/DumpAST.swift | 33 +++++--- .../Regex/Printing/PrintAsCanonical.swift | 33 +++++--- .../Regex/Printing/PrintAsPattern.swift | 3 + .../Utility/MissingUnicode.swift | 4 +- Sources/_StringProcessing/ASTBuilder.swift | 6 ++ .../_StringProcessing/CharacterClass.swift | 2 +- Tests/RegexTests/ParseTests.swift | 52 +++++++++--- 11 files changed, 199 insertions(+), 57 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/AST/Group.swift b/Sources/_MatchingEngine/Regex/AST/Group.swift index 3714fedf6..2399a509c 100644 --- a/Sources/_MatchingEngine/Regex/AST/Group.swift +++ b/Sources/_MatchingEngine/Regex/AST/Group.swift @@ -31,6 +31,9 @@ extension AST { // (?...) (?'name'...) (?P...) case namedCapture(Located) + // (?) (?'name-priorName') + case balancedCapture(BalancedCapture) + // (?:...) case nonCapture @@ -79,7 +82,7 @@ extension AST { extension AST.Group.Kind { public var isCapturing: Bool { switch self { - case .capture, .namedCapture: return true + case .capture, .namedCapture, .balancedCapture: return true default: return false } } @@ -103,6 +106,7 @@ extension AST.Group.Kind { public var name: String? { switch self { case .namedCapture(let name): return name.value + case .balancedCapture(let b): return b.name?.value default: return nil } } @@ -121,5 +125,26 @@ extension AST.Group { default: return nil } } +} + +extension AST.Group { + public struct BalancedCapture: Hashable { + /// The name of the group, or nil if the group has no name. + public var name: AST.Located? + + /// The location of the `-` in the group. + public var dash: SourceLocation + /// The name of the prior group that the balancing group references. 
+ public var priorName: AST.Located + + public init( + name: AST.Located?, dash: SourceLocation, + priorName: AST.Located + ) { + self.name = name + self.dash = dash + self.priorName = priorName + } + } } diff --git a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift index 1511c4499..965b6b246 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift @@ -44,7 +44,10 @@ extension AST { return .atom() + innerCaptures case .namedCapture(let name): return .atom(name: name.value) + innerCaptures + case .balancedCapture(let b): + return .atom(name: b.name?.value) + innerCaptures default: + precondition(!group.kind.value.isCapturing) return innerCaptures } case .conditional(let c): diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index adb8810d7..d5e4f122b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -55,6 +55,9 @@ enum ParseError: Error, Hashable { case emptyProperty case expectedGroupSpecifier + case expectedGroupName + case groupNameMustBeAlphaNumeric + case groupNameCannotStartWithNumber case cannotRemoveTextSegmentOptions } @@ -113,6 +116,12 @@ extension ParseError: CustomStringConvertible { return "empty property" case .expectedGroupSpecifier: return "expected group specifier" + case .expectedGroupName: + return "expected group name" + case .groupNameMustBeAlphaNumeric: + return "group name must only contain alphanumeric characters" + case .groupNameCannotStartWithNumber: + return "group name must not start with number" case .cannotRemoveTextSegmentOptions: return "text segment mode cannot be unset, only changed" } diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 
1d3e1d426..d3c074574 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -659,13 +659,61 @@ extension Source { } } + /// Consume a group name. + private mutating func expectGroupName( + endingWith ending: String, eatEnding: Bool = true + ) throws -> Located { + let str = try recordLoc { src -> String in + if src.isEmpty || src.tryEat(sequence: ending) { + throw ParseError.expectedGroupName + } + if src.peek()!.isNumber { + throw ParseError.groupNameCannotStartWithNumber + } + guard let str = src.tryEatPrefix(\.isWordCharacter)?.string else { + throw ParseError.groupNameMustBeAlphaNumeric + } + return str + } + if eatEnding { + try expect(sequence: ending) + } + return str + } + + /// Consume a named group field, producing either a named capture or balanced + /// capture. + /// + /// NamedGroup -> 'P<' GroupNameBody '>' + /// | '<' GroupNameBody '>' + /// | "'" GroupNameBody "'" + /// GroupNameBody -> \w+ | \w* '-' \w+ + /// + private mutating func expectNamedGroup( + endingWith ending: String + ) throws -> AST.Group.Kind { + func lexBalanced(_ lhs: Located? = nil) throws -> AST.Group.Kind? { + // If we have a '-', this is a .NET-style 'balanced group'. + guard let dash = tryEatWithLoc("-") else { return nil } + let rhs = try expectGroupName(endingWith: ending) + return .balancedCapture(.init(name: lhs, dash: dash, priorName: rhs)) + } + + // Lex a group name, trying to lex a '-rhs' for a balanced capture group + // both before and after. + if let b = try lexBalanced() { return b } + let name = try expectGroupName(endingWith: ending, eatEnding: false) + if let b = try lexBalanced(name) { return b } + + try expect(sequence: ending) + return .namedCapture(name) + } + /// Try to consume the start of a group /// /// GroupStart -> '(?' GroupKind | '(' - /// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' 
| '*' | '<=' | ' '<' [^'>']+ '>' | 'P<' [^'>']+ '>' - /// | '\'' [^'\'']+ '\'' + /// GroupKind -> ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '") - return .namedCapture(name) + return try src.expectNamedGroup(endingWith: ">") } if src.tryEat("'") { - let name = try src.expectQuoted(endingWith: "'") - return .namedCapture(name) + return try src.expectNamedGroup(endingWith: "'") } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). @@ -853,9 +896,9 @@ extension Source { // FIXME: This should apply to future groups too. // TODO: We should probably advise users to use the more explicit // syntax. - let nameRef = try src.expectNamedReference( - endingWith: ")", eatEnding: false) - if context.isPriorGroupRef(nameRef.kind) { + if let nameRef = src.lexNamedReference(endingWith: ")", + eatEnding: false), + context.isPriorGroupRef(nameRef.kind) { return .groupMatched(nameRef) } return nil @@ -1046,11 +1089,20 @@ extension Source { private mutating func expectNamedReference( endingWith end: String, eatEnding: Bool = true ) throws -> AST.Reference { - // TODO: Group name validation, see comment in lexGroupStart. - let str = try expectQuoted(endingWith: end, eatEnding: eatEnding) + let str = try expectGroupName(endingWith: end, eatEnding: eatEnding) return .init(.named(str.value), innerLoc: str.location) } + /// Try to consume a named reference up to a closing delimiter, returning + /// `nil` if the characters aren't valid for a named reference. + private mutating func lexNamedReference( + endingWith end: String, eatEnding: Bool = true + ) -> AST.Reference? { + tryEating { src in + try? src.expectNamedReference(endingWith: end, eatEnding: eatEnding) + } + } + /// Try to lex a numbered reference, or otherwise a named reference. 
/// /// NameOrNumberRef -> NumberRef | diff --git a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift index f3a3a0252..88c1b7949 100644 --- a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift +++ b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift @@ -156,19 +156,20 @@ extension AST.Reference: _ASTPrintable { extension AST.Group.Kind: _ASTPrintable { public var _dumpBase: String { switch self { - case .capture: return "capture" - case .namedCapture(let s): return "capture<\(s.value)>" - case .nonCapture: return "nonCapture" - case .nonCaptureReset: return "nonCaptureReset" - case .atomicNonCapturing: return "atomicNonCapturing" - case .lookahead: return "lookahead" - case .negativeLookahead: return "negativeLookahead" - case .nonAtomicLookahead: return "nonAtomicLookahead" - case .lookbehind: return "lookbehind" - case .negativeLookbehind: return "negativeLookbehind" - case .nonAtomicLookbehind: return "nonAtomicLookbehind" - case .scriptRun: return "scriptRun" - case .atomicScriptRun: return "atomicScriptRun" + case .capture: return "capture" + case .namedCapture(let s): return "capture<\(s.value)>" + case .balancedCapture(let b): return "balanced capture \(b)" + case .nonCapture: return "nonCapture" + case .nonCaptureReset: return "nonCaptureReset" + case .atomicNonCapturing: return "atomicNonCapturing" + case .lookahead: return "lookahead" + case .negativeLookahead: return "negativeLookahead" + case .nonAtomicLookahead: return "nonAtomicLookahead" + case .lookbehind: return "lookbehind" + case .negativeLookbehind: return "negativeLookbehind" + case .nonAtomicLookbehind: return "nonAtomicLookbehind" + case .scriptRun: return "scriptRun" + case .atomicScriptRun: return "atomicScriptRun" case .changeMatchingOptions(let seq, let isIsolated): return "changeMatchingOptions<\(seq), \(isIsolated)>" } @@ -257,3 +258,9 @@ extension AST.Atom.BacktrackingDirective: _ASTPrintable { return result } } + +extension 
AST.Group.BalancedCapture: _ASTPrintable { + public var _dumpBase: String { + "\(name?.value ?? "")-\(priorName.value)" + } +} diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift index e33d125fe..41cfcd4e1 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift @@ -138,19 +138,20 @@ extension AST.Quote { extension AST.Group.Kind { var _canonicalBase: String { switch self { - case .capture: return "(" - case .namedCapture(let n): return "(?<\(n.value)>" - case .nonCapture: return "(?:" - case .nonCaptureReset: return "(?|" - case .atomicNonCapturing: return "(?>" - case .lookahead: return "(?=" - case .negativeLookahead: return "(?!" - case .nonAtomicLookahead: return "(?*" - case .lookbehind: return "(?<=" - case .negativeLookbehind: return "(?" + case .balancedCapture(let b): return "(?<\(b._canonicalBase)>" + case .nonCapture: return "(?:" + case .nonCaptureReset: return "(?|" + case .atomicNonCapturing: return "(?>" + case .lookahead: return "(?=" + case .negativeLookahead: return "(?!" + case .nonAtomicLookahead: return "(?*" + case .lookbehind: return "(?<=" + case .negativeLookbehind: return "(? 
AST { group(.namedCapture(.init(faking: name)), child) } +func balancedCapture(name: String?, priorName: String, _ child: AST) -> AST { + group(.balancedCapture( + .init(name: name.map { .init(faking: $0) }, dash: .fake, + priorName: .init(faking: priorName)) + ), child) +} func nonCaptureReset( _ child: AST ) -> AST { diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 149af7d53..ce9d6242f 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -148,7 +148,7 @@ public struct CharacterClass: Hashable { case .newlineSequence: matched = c.isNewline case .verticalWhitespace: fatalError("Not implemented") case .whitespace: matched = c.isWhitespace - case .word: matched = c.isLetter || c.isNumber || c == "_" + case .word: matched = c.isWordCharacter case .custom(let set): matched = set.any { $0.matches(c) } } if isInverted { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1a10f9d53..6171f2ee5 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -549,6 +549,14 @@ extension RegexTests { concat("a", namedCapture("label", "b"), "c"), captures: .atom(name: "label")) + // Balanced captures + parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), + captures: .atom(name: "a")) + parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), + captures: .atom()) + parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), + captures: .atom(name: "a")) + // Other groups parseTest( #"a(?:b)c"#, @@ -852,12 +860,6 @@ extension RegexTests { parseTest(#"(?&hello)"#, subpattern(.named("hello"))) parseTest(#"(?P>P)"#, subpattern(.named("P"))) - // TODO: Should we enforce that names only use certain characters? 
- parseTest(#"(?&&)"#, subpattern(.named("&"))) - parseTest(#"(?&-1)"#, subpattern(.named("-1"))) - parseTest(#"(?P>+1)"#, subpattern(.named("+1"))) - parseTest(#"(?P=+1)"#, backreference(.named("+1"))) - parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) @@ -1185,6 +1187,10 @@ extension RegexTests { parseNotEqualTest("(*:a)", "(*:b)") parseNotEqualTest("(*FAIL)", "(*SKIP)") + parseNotEqualTest("(?)", "(?)") + parseNotEqualTest("(?)", "(?)") + parseNotEqualTest("(?<-b>)", "(?)") + // TODO: failure tests } @@ -1279,6 +1285,13 @@ extension RegexTests { diagnosticTest("(?C", .expected(")")) + diagnosticTest("(?<", .expectedGroupName) + diagnosticTest("(?")) + diagnosticTest("(?")) + diagnosticTest("(?", .expected(")")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) @@ -1292,6 +1305,13 @@ extension RegexTests { diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) + diagnosticTest(#"(?<#>)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?'1A')"#, .groupNameCannotStartWithNumber) + + diagnosticTest(#"(?'-')"#, .expectedGroupName) + diagnosticTest(#"(?'--')"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?'a-b-c')"#, .expected("'")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) @@ -1299,17 +1319,25 @@ extension RegexTests { diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) - // MARK: Quotes - - diagnosticTest(#"\k''"#, .expectedNonEmptyContents) - diagnosticTest(#"(?&)"#, .expectedNonEmptyContents) - diagnosticTest(#"(?P>)"#, .expectedNonEmptyContents) - // MARK: References + diagnosticTest(#"\k''"#, .expectedGroupName) + diagnosticTest(#"(?&)"#, .expectedGroupName) + diagnosticTest(#"(?P>)"#, .expectedGroupName) + 
diagnosticTest(#"\g{0}"#, .cannotReferToWholePattern) diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern) + diagnosticTest(#"(?&&)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?&-1)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?P>+1)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?P=+1)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"\k'#'"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?&#)"#, .groupNameMustBeAlphaNumeric) + + diagnosticTest(#"\k'1'"#, .groupNameCannotStartWithNumber) + diagnosticTest(#"(?P>1)"#, .groupNameCannotStartWithNumber) + // MARK: Conditionals diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))