Skip to content

Commit aea6d9d

Browse files
committed
Parse PCRE backtracking directives
This requires generalizing `canLexGroupLikeAtom` a bit to treat all `(*` groups as being atoms, and as such we need to special-case the PCRE2 explicit group syntax. We do it this way around to accommodate the extended Oniguruma callout syntax which also uses `(*`, which we aim to support.
1 parent 3048988 commit aea6d9d

File tree

11 files changed

+231
-45
lines changed

11 files changed

+231
-45
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,20 @@ extension AST {
100100

101101
return self.children?.any(\.hasCapture) ?? false
102102
}
103+
104+
/// Whether this AST node may be used as the operand of a quantifier such as
105+
/// `?`, `+` or `*`.
106+
public var isQuantifiable: Bool {
107+
switch self {
108+
case .atom(let a):
109+
return a.isQuantifiable
110+
case .group, .conditional, .customCharacterClass:
111+
return true
112+
case .alternation, .concatenation, .quantification, .quote, .trivia,
113+
.empty, .groupTransform:
114+
return false
115+
}
116+
}
103117
}
104118

105119
// MARK: - AST types

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ extension AST {
6969

7070
// (?C)
7171
case callout(Callout)
72+
73+
// (*ACCEPT), (*FAIL), ...
74+
case backtrackingDirective(BacktrackingDirective)
7275
}
7376
}
7477
}
@@ -459,6 +462,40 @@ extension AST.Atom {
459462
}
460463
}
461464

465+
extension AST.Atom {
466+
public struct BacktrackingDirective: Hashable {
467+
public enum Kind: Hashable {
468+
/// (*ACCEPT)
469+
case accept
470+
/// (*FAIL) or (*F)
471+
case fail
472+
/// (*MARK:NAME)
473+
case mark
474+
/// (*COMMIT)
475+
case commit
476+
/// (*PRUNE)
477+
case prune
478+
/// (*SKIP)
479+
case skip
480+
/// (*THEN)
481+
case then
482+
}
483+
public var kind: AST.Located<Kind>
484+
public var name: AST.Located<String>?
485+
486+
public init(_ kind: AST.Located<Kind>, name: AST.Located<String>?) {
487+
self.kind = kind
488+
self.name = name
489+
}
490+
491+
public var isQuantifiable: Bool {
492+
// As per http://pcre.org/current/doc/html/pcre2pattern.html#SEC29, only
493+
// (*ACCEPT) is quantifiable.
494+
kind.value == .accept
495+
}
496+
}
497+
}
498+
462499
extension AST.Atom {
463500
/// Retrieve the character value of the atom if it represents a literal
464501
/// character or unicode scalar, nil otherwise.
@@ -474,7 +511,8 @@ extension AST.Atom {
474511
fallthrough
475512

476513
case .property, .escaped, .any, .startOfLine, .endOfLine,
477-
.backreference, .subpattern, .namedCharacter, .callout:
514+
.backreference, .subpattern, .namedCharacter, .callout,
515+
.backtrackingDirective:
478516
return nil
479517
}
480518
}
@@ -499,10 +537,21 @@ extension AST.Atom {
499537
return "\\M-\\C-\(x)"
500538

501539
case .property, .escaped, .any, .startOfLine, .endOfLine,
502-
.backreference, .subpattern, .namedCharacter, .callout:
540+
.backreference, .subpattern, .namedCharacter, .callout,
541+
.backtrackingDirective:
503542
return nil
504543
}
505544
}
545+
546+
public var isQuantifiable: Bool {
547+
switch kind {
548+
case .backtrackingDirective(let b):
549+
return b.isQuantifiable
550+
// TODO: Are callouts quantifiable?
551+
default:
552+
return true
553+
}
554+
}
506555
}
507556

508557
extension AST {

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ enum ParseError: Error, Hashable {
3838

3939
case cannotReferToWholePattern
4040

41+
case notQuantifiable
42+
43+
case backtrackingDirectiveMustHaveName(String)
44+
4145
case unknownGroupKind(String)
4246
case unknownCalloutKind(String)
4347

@@ -81,6 +85,10 @@ extension ParseError: CustomStringConvertible {
8185
return "expected escape sequence"
8286
case .cannotReferToWholePattern:
8387
return "cannot refer to whole pattern here"
88+
case .notQuantifiable:
89+
return "expression is not quantifiable"
90+
case .backtrackingDirectiveMustHaveName(let b):
91+
return "backtracking directive '\(b)' must include name"
8492
case let .tooManyBranchesInConditional(i):
8593
return "expected 2 branches in conditional, have \(i)"
8694
case let .unsupportedCondition(str):

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 92 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,49 @@ extension Source {
608608
return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: [])
609609
}
610610

611+
/// Try to consume explicitly spelled-out PCRE2 group syntax.
612+
mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? {
613+
tryEating { src in
614+
guard src.tryEat(sequence: "(*") else { return nil }
615+
616+
if src.tryEat(sequence: "atomic:") {
617+
return .atomicNonCapturing
618+
}
619+
if src.tryEat(sequence: "pla:") ||
620+
src.tryEat(sequence: "positive_lookahead:") {
621+
return .lookahead
622+
}
623+
if src.tryEat(sequence: "nla:") ||
624+
src.tryEat(sequence: "negative_lookahead:") {
625+
return .negativeLookahead
626+
}
627+
if src.tryEat(sequence: "plb:") ||
628+
src.tryEat(sequence: "positive_lookbehind:") {
629+
return .lookbehind
630+
}
631+
if src.tryEat(sequence: "nlb:") ||
632+
src.tryEat(sequence: "negative_lookbehind:") {
633+
return .negativeLookbehind
634+
}
635+
if src.tryEat(sequence: "napla:") ||
636+
src.tryEat(sequence: "non_atomic_positive_lookahead:") {
637+
return .nonAtomicLookahead
638+
}
639+
if src.tryEat(sequence: "naplb:") ||
640+
src.tryEat(sequence: "non_atomic_positive_lookbehind:") {
641+
return .nonAtomicLookbehind
642+
}
643+
if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") {
644+
return .scriptRun
645+
}
646+
if src.tryEat(sequence: "asr:") ||
647+
src.tryEat(sequence: "atomic_script_run:") {
648+
return .atomicScriptRun
649+
}
650+
return nil
651+
}
652+
}
653+
611654
/// Try to consume the start of a group
612655
///
613656
/// GroupStart -> '(?' GroupKind | '('
@@ -631,6 +674,11 @@ extension Source {
631674
) throws -> Located<AST.Group.Kind>? {
632675
try recordLoc { src in
633676
try src.tryEating { src in
677+
// Explicitly spelled out PCRE2 syntax for some groups. This needs to be
678+
// done before group-like atoms, as it uses the '(*' syntax, which is
679+
// otherwise a group-like atom.
680+
if let g = src.lexExplicitPCRE2GroupStart() { return g }
681+
634682
// There are some atoms that syntactically look like groups, bail here
635683
// if we see any. Care needs to be taken here as e.g. a group starting
636684
// with '(?-' is a subpattern if the next character is a digit,
@@ -691,45 +739,6 @@ extension Source {
691739
throw ParseError.unknownGroupKind("?\(next)")
692740
}
693741

694-
// Explicitly spelled out PRCE2 syntax for some groups.
695-
if src.tryEat("*") {
696-
if src.tryEat(sequence: "atomic:") { return .atomicNonCapturing }
697-
698-
if src.tryEat(sequence: "pla:") ||
699-
src.tryEat(sequence: "positive_lookahead:") {
700-
return .lookahead
701-
}
702-
if src.tryEat(sequence: "nla:") ||
703-
src.tryEat(sequence: "negative_lookahead:") {
704-
return .negativeLookahead
705-
}
706-
if src.tryEat(sequence: "plb:") ||
707-
src.tryEat(sequence: "positive_lookbehind:") {
708-
return .lookbehind
709-
}
710-
if src.tryEat(sequence: "nlb:") ||
711-
src.tryEat(sequence: "negative_lookbehind:") {
712-
return .negativeLookbehind
713-
}
714-
if src.tryEat(sequence: "napla:") ||
715-
src.tryEat(sequence: "non_atomic_positive_lookahead:") {
716-
return .nonAtomicLookahead
717-
}
718-
if src.tryEat(sequence: "naplb:") ||
719-
src.tryEat(sequence: "non_atomic_positive_lookbehind:") {
720-
return .nonAtomicLookbehind
721-
}
722-
if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") {
723-
return .scriptRun
724-
}
725-
if src.tryEat(sequence: "asr:") ||
726-
src.tryEat(sequence: "atomic_script_run:") {
727-
return .atomicScriptRun
728-
}
729-
730-
throw ParseError.misc("Quantifier '*' must follow operand")
731-
}
732-
733742
// (_:)
734743
if src.experimentalCaptures && src.tryEat(sequence: "_:") {
735744
return .nonCapture
@@ -1216,6 +1225,8 @@ extension Source {
12161225

12171226
return false
12181227
}
1228+
// The start of a backtracking directive.
1229+
if src.tryEat("*") { return true }
12191230

12201231
return false
12211232
}
@@ -1323,6 +1334,44 @@ extension Source {
13231334
return .init(arg)
13241335
}
13251336

1337+
/// Try to consume a backtracking directive.
1338+
///
1339+
/// BacktrackingDirective -> '(*' BacktrackingDirectiveKind (':' <String>)? ')'
1340+
/// BacktrackingDirectiveKind -> 'ACCEPT' | 'FAIL' | 'F' | 'MARK' | ''
1341+
/// | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN'
1342+
///
1343+
mutating func lexBacktrackingDirective(
1344+
) throws -> AST.Atom.BacktrackingDirective? {
1345+
try tryEating { src in
1346+
guard src.tryEat(sequence: "(*") else { return nil }
1347+
let kind = src.recordLoc { src -> AST.Atom.BacktrackingDirective.Kind? in
1348+
if src.tryEat(sequence: "ACCEPT") { return .accept }
1349+
if src.tryEat(sequence: "FAIL") || src.tryEat("F") { return .fail }
1350+
if src.tryEat(sequence: "MARK") || src.peek() == ":" { return .mark }
1351+
if src.tryEat(sequence: "COMMIT") { return .commit }
1352+
if src.tryEat(sequence: "PRUNE") { return .prune }
1353+
if src.tryEat(sequence: "SKIP") { return .skip }
1354+
if src.tryEat(sequence: "THEN") { return .then }
1355+
return nil
1356+
}
1357+
guard let kind = kind else { return nil }
1358+
var name: Located<String>?
1359+
if src.tryEat(":") {
1360+
// TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the
1361+
// name under PCRE2_ALT_VERBNAMES.
1362+
name = try src.expectQuoted(endingWith: ")", eatEnding: false)
1363+
}
1364+
try src.expect(")")
1365+
1366+
// MARK directives must be named.
1367+
if name == nil && kind.value == .mark {
1368+
throw ParseError.backtrackingDirectiveMustHaveName(
1369+
String(src[kind.location.range]))
1370+
}
1371+
return .init(kind, name: name)
1372+
}
1373+
}
1374+
13261375
/// Consume a group-like atom, throwing an error if an atom could not be
13271376
/// produced.
13281377
///
@@ -1338,6 +1387,10 @@ extension Source {
13381387
return .callout(callout)
13391388
}
13401389

1390+
if let b = try src.lexBacktrackingDirective() {
1391+
return .backtrackingDirective(b)
1392+
}
1393+
13411394
// If we didn't produce an atom, consume up until a reasonable end-point
13421395
// and throw an error.
13431396
try src.expect("(")

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,11 @@ extension Parser {
176176
// Quantification -> QuantOperand Quantifier?
177177
if let operand = try parseQuantifierOperand() {
178178
if let (amt, kind) = try source.lexQuantifier() {
179-
result.append(.quantification(.init(
180-
amt, kind, operand, loc(_start))))
179+
let location = loc(_start)
180+
guard operand.isQuantifiable else {
181+
throw Source.LocatedError(ParseError.notQuantifiable, location)
182+
}
183+
result.append(.quantification(.init(amt, kind, operand, location)))
181184
} else {
182185
result.append(operand)
183186
}

Sources/_MatchingEngine/Regex/Parse/Source.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ public struct Source {
2727
self.bounds = str.startIndex ..< str.endIndex
2828
self.syntax = syntax
2929
}
30+
31+
subscript(_ range: Range<Input.Index>) -> Input.SubSequence { input[range] }
3032
}
3133

3234
// MARK: - Prototype uses String

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ extension AST.Atom {
135135

136136
case .callout(let c): return "\(c)"
137137

138+
case .backtrackingDirective(let d): return "\(d)"
139+
138140
case .char, .scalar:
139141
fatalError("Unreachable")
140142
}
@@ -245,3 +247,13 @@ extension AST.CustomCharacterClass.Range: _ASTPrintable {
245247
"\(lhs)-\(rhs)"
246248
}
247249
}
250+
251+
extension AST.Atom.BacktrackingDirective: _ASTPrintable {
252+
public var _dumpBase: String {
253+
var result = "\(kind.value)"
254+
if let name = name {
255+
result += ": \(name.value)"
256+
}
257+
return result
258+
}
259+
}

Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ extension AST.Atom {
269269

270270
case .callout:
271271
return " /* TODO: callout */"
272+
273+
case .backtrackingDirective:
274+
return " /* TODO: backtracking directive */"
272275
}
273276
}
274277
}

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,14 @@ func callout(_ arg: AST.Atom.Callout.Argument) -> AST {
169169
atom(.callout(.init(.init(faking: arg))))
170170
}
171171

172+
func backtrackingDirective(
173+
_ kind: AST.Atom.BacktrackingDirective.Kind, name: String? = nil
174+
) -> AST {
175+
atom(.backtrackingDirective(
176+
.init(.init(faking: kind), name: name.map { .init(faking: $0) })
177+
))
178+
}
179+
172180
func quant(
173181
_ amount: AST.Quantification.Amount,
174182
_ kind: AST.Quantification.Kind = .eager,

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ extension AST.Atom {
112112

113113
case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
114114
.any, .startOfLine, .endOfLine,
115-
.backreference, .subpattern, .callout:
115+
.backreference, .subpattern, .callout, .backtrackingDirective:
116116
// FIXME: implement
117117
return nil
118118
}

0 commit comments

Comments
 (0)