Skip to content

Commit 3048988

Browse files
committed
Parse PCRE callout syntax
Parse the `(?C)` syntax with an integer or string argument. This doesn't yet handle the Oniguruma-specific callout syntax, which is a little more involved.
1 parent 83c94bf commit 3048988

File tree

9 files changed

+160
-20
lines changed

9 files changed

+160
-20
lines changed

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ extension AST {
6666
// References
6767
case backreference(Reference)
6868
case subpattern(Reference)
69+
70+
// (?C)
71+
case callout(Callout)
6972
}
7073
}
7174
}
@@ -443,6 +446,19 @@ extension AST.Atom {
443446
}
444447
}
445448

449+
extension AST.Atom {
450+
/// A callout atom, written with the `(?C…)` syntax.
public struct Callout: Hashable {
451+
/// The argument supplied to a callout.
public enum Argument: Hashable {
452+
/// An integer argument, e.g. the `20` in `(?C20)`.
case number(Int)
453+
/// A delimited string argument, e.g. the `abc` in `(?C{abc})`.
case string(String)
454+
}
455+
/// The callout's argument, wrapped in `AST.Located`.
public var arg: AST.Located<Argument>
456+
/// Creates a callout from an argument wrapped in `AST.Located`.
public init(_ arg: AST.Located<Argument>) {
457+
self.arg = arg
458+
}
459+
}
460+
}
461+
446462
extension AST.Atom {
447463
/// Retrieve the character value of the atom if it represents a literal
448464
/// character or unicode scalar, nil otherwise.
@@ -458,7 +474,7 @@ extension AST.Atom {
458474
fallthrough
459475

460476
case .property, .escaped, .any, .startOfLine, .endOfLine,
461-
.backreference, .subpattern, .namedCharacter:
477+
.backreference, .subpattern, .namedCharacter, .callout:
462478
return nil
463479
}
464480
}
@@ -483,7 +499,7 @@ extension AST.Atom {
483499
return "\\M-\\C-\(x)"
484500

485501
case .property, .escaped, .any, .startOfLine, .endOfLine,
486-
.backreference, .subpattern, .namedCharacter:
502+
.backreference, .subpattern, .namedCharacter, .callout:
487503
return nil
488504
}
489505
}

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ enum ParseError: Error, Hashable {
3939
case cannotReferToWholePattern
4040

4141
case unknownGroupKind(String)
42+
case unknownCalloutKind(String)
4243

4344
case invalidMatchingOption(Character)
4445
case cannotRemoveMatchingOptionsAfterCaret
@@ -86,6 +87,8 @@ extension ParseError: CustomStringConvertible {
8687
return "\(str) cannot be used as condition"
8788
case let .unknownGroupKind(str):
8889
return "unknown group kind '(\(str)'"
90+
case let .unknownCalloutKind(str):
91+
return "unknown callout kind '\(str)'"
8992
case let .invalidMatchingOption(c):
9093
return "invalid matching option '\(c)'"
9194
case .cannotRemoveMatchingOptionsAfterCaret:

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 103 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -631,8 +631,15 @@ extension Source {
631631
) throws -> Located<AST.Group.Kind>? {
632632
try recordLoc { src in
633633
try src.tryEating { src in
634-
guard src.tryEat("(") else { return nil }
634+
// There are some atoms that syntactically look like groups, bail here
635+
// if we see any. Care needs to be taken here as e.g. a group starting
636+
// with '(?-' is a subpattern if the next character is a digit,
637+
// otherwise a matching option specifier. Conversely, '(?P' can be the
638+
// start of a matching option sequence, or a reference if it is followed
639+
// by '=' or '<'.
640+
guard !src.canLexGroupLikeAtom() else { return nil }
635641

642+
guard src.tryEat("(") else { return nil }
636643
if src.tryEat("?") {
637644
if src.tryEat(":") { return .nonCapture }
638645
if src.tryEat("|") { return .nonCaptureReset }
@@ -658,15 +665,6 @@ extension Source {
658665
return .namedCapture(name)
659666
}
660667

661-
// Check if we can lex a group-like reference. Do this before matching
662-
// options to avoid ambiguity with a group starting with (?-, which
663-
// is a subpattern if the next character is a digit, otherwise a
664-
// matching option specifier. In addition, we need to be careful with
665-
// (?P, which can also be the start of a matching option sequence.
666-
if src.canLexGroupLikeReference() {
667-
return nil
668-
}
669-
670668
// Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
671669
if let seq = try src.lexMatchingOptionSequence() {
672670
if src.tryEat(":") {
@@ -1059,11 +1057,11 @@ extension Source {
10591057
for openChar: Character
10601058
) -> Character {
10611059
switch openChar {
1060+
// Identically-balanced delimiters.
1061+
case "'", "\"", "`", "^", "%", "#", "$": return openChar
10621062
case "<": return ">"
1063-
case "'": return "'"
10641063
case "{": return "}"
1065-
default:
1066-
fatalError("Not implemented")
1064+
default: fatalError("Not implemented")
10671065
}
10681066
}
10691067

@@ -1204,6 +1202,24 @@ extension Source {
12041202
return src.canLexNumberedReference()
12051203
}
12061204

1205+
/// Whether a group specifier should be lexed as an atom instead of a group.
///
/// This is a pure lookahead: all eating is done on a local copy of the
/// source, so `self` is left untouched.
///
/// - Returns: `true` if the input starts with `(?` followed by either a
///   group-like reference or a `C` (the start of a callout), `false`
///   otherwise.
1206+
private func canLexGroupLikeAtom() -> Bool {
1207+
// Work on a copy so that nothing is consumed from `self`.
var src = self
1208+
guard src.tryEat("(") else { return false }
1209+
1210+
if src.tryEat("?") {
1211+
// The start of a reference '(?P=', '(?R', ...
1212+
if src.canLexGroupLikeReference() { return true }
1213+
1214+
// The start of a callout.
1215+
if src.tryEat("C") { return true }
1216+
1217+
// '(?' followed by anything else is left for group lexing.
return false
1218+
}
1219+
1220+
// A plain '(' that is not followed by '?' is never a group-like atom.
return false
1221+
}
1222+
12071223
/// Consume an escaped atom, starting from after the backslash
12081224
///
12091225
/// Escaped -> KeyboardModified | Builtin
@@ -1265,6 +1281,76 @@ extension Source {
12651281
}
12661282
}
12671283

1284+
/// Try to consume a callout.
1285+
///
1286+
/// Callout -> '(?C' CalloutBody ')'
1287+
/// CalloutBody -> '' | <Number>
1288+
/// | '`' <String> '`'
1289+
/// | "'" <String> "'"
1290+
/// | '"' <String> '"'
1291+
/// | '^' <String> '^'
1292+
/// | '%' <String> '%'
1293+
/// | '#' <String> '#'
1294+
/// | '$' <String> '$'
1295+
/// | '{' <String> '}'
1296+
///
/// - Returns: The lexed callout, or `nil` if the input does not start with
///   `(?C`.
/// - Throws: `ParseError.unknownCalloutKind` if the text after `(?C` is not
///   a number, an immediate `)`, or a recognized opening delimiter;
///   `ParseError.expected(")")` if no text could be lexed for the unknown
///   kind. Errors from the called lexing helpers and from the final
///   `expect(")")` are propagated.
1297+
mutating func lexCallout() throws -> AST.Atom.Callout? {
1298+
guard tryEat(sequence: "(?C") else { return nil }
1299+
let arg = try recordLoc { src -> AST.Atom.Callout.Argument in
1300+
// Parse '(?C' followed by a number.
1301+
if let num = try src.lexNumber() {
1302+
return .number(num.value)
1303+
}
1304+
// '(?C)' is implicitly '(?C0)'.
// The ')' is only peeked here; it is consumed by the `expect(")")` below.
1305+
if src.peek() == ")" {
1306+
return .number(0)
1307+
}
1308+
// Parse '(?C' followed by a set of balanced delimiters as defined by
1309+
// http://pcre.org/current/doc/html/pcre2pattern.html#SEC28
1310+
if let open = src.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") {
1311+
let closing = String(Source.getClosingDelimiter(for: open))
1312+
return .string(try src.expectQuoted(endingWith: closing).value)
1313+
}
1314+
// If we don't know what this syntax is, consume up to the ending ')' and
1315+
// emit an error.
1316+
let remaining = src.lexUntil { $0.isEmpty || $0.tryEat(")") }.value
1317+
// No text was lexed, so report a missing ')' rather than an empty
// callout kind.
if remaining.isEmpty {
1318+
throw ParseError.expected(")")
1319+
}
1320+
// The reported kind is rebuilt as '(?C' + lexed text + ')'.
throw ParseError.unknownCalloutKind("(?C\(remaining))")
1321+
}
1322+
try expect(")")
1323+
return .init(arg)
1324+
}
1325+
1326+
/// Consume a group-like atom, throwing an error if an atom could not be
1327+
/// produced.
1328+
///
1329+
/// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective
///
/// NOTE(review): BacktrackingDirective is listed in the grammar above, but
/// this body only attempts group-like references and callouts.
///
/// - Returns: The kind of the atom that was lexed.
/// - Throws: `ParseError.unknownGroupKind` with the text lexed after the
///   opening `(` when neither a reference nor a callout could be lexed, or
///   `ParseError.expected(")")` when that text is empty. Errors from the
///   called lexing helpers are propagated.
1330+
mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind {
1331+
try recordLoc { src in
1332+
// References that look like groups, e.g. (?R), (?1), ...
1333+
if let ref = try src.lexGroupLikeReference() {
1334+
return ref.value
1335+
}
1336+
1337+
// Callouts, which start with '(?C'.
if let callout = try src.lexCallout() {
1338+
return .callout(callout)
1339+
}
1340+
1341+
// If we didn't produce an atom, consume up until a reasonable end-point
1342+
// and throw an error.
1343+
try src.expect("(")
1344+
// Lex until the input is exhausted or a ':' or ')' is eaten.
let remaining = src.lexUntil {
1345+
$0.isEmpty || $0.tryEat(anyOf: ":", ")") != nil
1346+
}.value
1347+
if remaining.isEmpty {
1348+
throw ParseError.expected(")")
1349+
}
1350+
throw ParseError.unknownGroupKind(remaining)
1351+
}.value
1352+
}
1353+
12681354

12691355
/// Try to consume an Atom.
12701356
///
@@ -1293,9 +1379,10 @@ extension Source {
12931379
return .property(prop)
12941380
}
12951381

1296-
// References that look like groups, e.g (?R), (?1), ...
1297-
if !customCC, let ref = try src.lexGroupLikeReference() {
1298-
return ref.value
1382+
// If we have group syntax that was skipped over in lexGroupStart, we
1383+
// need to handle it as an atom, or throw an error.
1384+
if !customCC && src.canLexGroupLikeAtom() {
1385+
return try src.expectGroupLikeAtom()
12991386
}
13001387

13011388
let char = src.eat()

Sources/_MatchingEngine/Regex/Parse/Source.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ extension Source {
6060

6161
var isEmpty: Bool { _slice.isEmpty }
6262

63-
mutating func peek() -> Char? { _slice.first }
63+
/// Returns the first element of the underlying slice without consuming it,
/// or `nil` if the slice is empty.
func peek() -> Char? { _slice.first }
6464

6565
mutating func advance() {
6666
assert(!isEmpty)

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,18 @@ extension AST.Atom {
133133
case .backreference(let r), .subpattern(let r):
134134
return "\(r._dumpBase)"
135135

136+
case .callout(let c): return "\(c)"
137+
136138
case .char, .scalar:
137139
fatalError("Unreachable")
138140
}
139141
}
140142
}
141143

144+
extension AST.Atom.Callout: _ASTPrintable {
145+
/// The dump representation of a callout: the word `callout` followed by the
/// argument value in angle brackets.
public var _dumpBase: String { "callout <\(arg.value)>" }
146+
}
147+
142148
extension AST.Reference: _ASTPrintable {
143149
public var _dumpBase: String {
144150
"\(kind)"

Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,9 @@ extension AST.Atom {
266266

267267
case .subpattern:
268268
return " /* TODO: subpattern */"
269+
270+
case .callout:
271+
return " /* TODO: callout */"
269272
}
270273
}
271274
}

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ func groupCondition(
165165
.group(.init(.init(faking: kind), child, .fake))
166166
}
167167

168+
/// Builds an AST consisting of a single callout atom with the given argument.
///
/// The argument is wrapped with `.init(faking:)` before being handed to the
/// callout — presumably a placeholder source location; confirm against the
/// `AST.Located` definition.
func callout(_ arg: AST.Atom.Callout.Argument) -> AST {
169+
atom(.callout(.init(.init(faking: arg))))
170+
}
171+
168172
func quant(
169173
_ amount: AST.Quantification.Amount,
170174
_ kind: AST.Quantification.Kind = .eager,

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ extension AST.Atom {
112112

113113
case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
114114
.any, .startOfLine, .endOfLine,
115-
.backreference, .subpattern:
115+
.backreference, .subpattern, .callout:
116116
// FIXME: implement
117117
return nil
118118
}

Tests/RegexTests/ParseTests.swift

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,17 @@ extension RegexTests {
10871087
trueBranch: empty(), falseBranch: empty())
10881088
)
10891089

1090+
// MARK: Callouts
1091+
1092+
parseTest(#"(?C)"#, callout(.number(0)))
1093+
parseTest(#"(?C0)"#, callout(.number(0)))
1094+
parseTest(#"(?C20)"#, callout(.number(20)))
1095+
parseTest("(?C{abc})", callout(.string("abc")))
1096+
1097+
for delim in ["`", "'", "\"", "^", "%", "#", "$"] {
1098+
parseTest("(?C\(delim)hello\(delim))", callout(.string("hello")))
1099+
}
1100+
10901101
// MARK: Parse with delimiters
10911102

10921103
parseWithDelimitersTest("'/a b/'", concat("a", " ", "b"))
@@ -1150,6 +1161,9 @@ extension RegexTests {
11501161
parseNotEqualTest(#"(?(VERSION=0.1))"#, #"(?(VERSION=0.2))"#)
11511162
parseNotEqualTest(#"(?(VERSION=0.1))"#, #"(?(VERSION>=0.1))"#)
11521163

1164+
parseNotEqualTest("(?C0)", "(?C1)")
1165+
parseNotEqualTest("(?C0)", "(?C'hello')")
1166+
11531167
// TODO: failure tests
11541168
}
11551169

@@ -1242,6 +1256,8 @@ extension RegexTests {
12421256
diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental)
12431257
diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental)
12441258

1259+
diagnosticTest("(?C", .expected(")"))
1260+
12451261
// MARK: Text Segment options
12461262

12471263
diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)
@@ -1277,5 +1293,10 @@ extension RegexTests {
12771293
diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))
12781294
diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3))
12791295
diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group"))
1296+
1297+
// MARK: Callouts
1298+
1299+
diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)"))
1300+
diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)"))
12801301
}
12811302
}

0 commit comments

Comments
 (0)