Skip to content

Commit 9d5529e

Browse files
committed
Update \DDD parsing
Previously we followed PCRE's parsing of this syntax such that it may either be an octal sequence or backreference depending on a list of heuristics. However this model is complicated and not particularly intuitive, especially as there are other engines that disambiguate using subtly different rules. Instead, always parse `\DDD` as a backreference, unless it begins with `0`, in which case it is an octal sequence. This matches ICU and Java's behavior. Once we start validating group references, we can then start emitting an error on invalid backreferences using this syntax, and suggest prefixing with 0 if an octal sequence is desired.
1 parent 04ad64a commit 9d5529e

File tree

3 files changed

+39
-59
lines changed

3 files changed

+39
-59
lines changed

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ extension Source {
279279
/// | 'x' HexDigit{0...2}
280280
/// | 'U' HexDigit{8}
281281
/// | 'o{' OctalDigit{1...} '}'
282-
/// | OctalDigit{1...3}
282+
/// | '0' OctalDigit{0...3}
283283
///
284284
mutating func expectUnicodeScalar(
285285
escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
313313
let str = try src.lexUntil(eating: "}").value
314314
return try Source.validateUnicodeScalar(str, .octal)
315315

316-
case let c where c.isOctalDigit:
317-
// We can read *up to* 2 more octal digits per PCRE.
318-
// FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
319-
// we should have a parser mode to switch.
320-
let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
321-
let str = String(c) + (nextDigits?.string ?? "")
322-
return try Source.validateUnicodeScalar(str, .octal)
316+
case "0":
317+
// We can read *up to* 3 more octal digits.
318+
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
319+
// PCRE mode, we should limit it here.
320+
guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
321+
return Unicode.Scalar(0)
322+
}
323+
return try Source.validateUnicodeScalar(digits.string, .octal)
323324

324325
default:
325326
fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
13411342
return nil
13421343
}
13431344

1344-
// Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
1345-
// it is treated as a backreference if its first digit is not 0 (as that
1346-
// is always octal) and one of the following holds:
1347-
//
1348-
// - It's 0 < n < 10 (as octal would be pointless here)
1349-
// - Its first digit is 8 or 9 (as not valid octal)
1350-
// - There have been as many prior groups as the reference.
1351-
//
1352-
// Oniguruma follows the same rules except the second one. e.g \81 and
1353-
// \91 are instead treated as literal 81 and 91 respectively.
1354-
// TODO: If we want a strict Oniguruma mode, we'll need to add a check
1355-
// here.
1345+
// Backslash followed by a non-0 digit character is a backreference.
13561346
if firstChar != "0", let numAndLoc = try src.lexNumber() {
1357-
let num = numAndLoc.value
1358-
let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
1359-
if num < 10 || firstChar == "8" || firstChar == "9" ||
1360-
context.isPriorGroupRef(ref.kind) {
1361-
return .backreference(ref)
1362-
}
1363-
return nil
1347+
return .backreference(.init(
1348+
.absolute(numAndLoc.value), innerLoc: numAndLoc.location))
13641349
}
13651350
return nil
13661351
}
@@ -1497,10 +1482,8 @@ extension Source {
14971482
}
14981483

14991484
switch char {
1500-
// Hexadecimal and octal unicode scalars. This must be done after
1501-
// backreference lexing due to the ambiguity with \nnn.
1502-
case let c where c.isOctalDigit: fallthrough
1503-
case "u", "x", "U", "o":
1485+
// Hexadecimal and octal unicode scalars.
1486+
case "u", "x", "U", "o", "0":
15041487
return try .scalar(
15051488
src.expectUnicodeScalar(escapedCharacter: char).value)
15061489
default:

Tests/RegexTests/MatchTests.swift

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ extension RegexTests {
261261
firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
262262
firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
263263
firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
264-
firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
264+
firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
265265

266266
// code point sequence
267267
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
@@ -1021,9 +1021,6 @@ extension RegexTests {
10211021
firstMatchTest(
10221022
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
10231023
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
1024-
firstMatchTest(
1025-
#"(.)\10"#,
1026-
input: "a\u{8}b", match: "a\u{8}")
10271024

10281025
firstMatchTest(
10291026
#"(.)\g001"#,

Tests/RegexTests/ParseTests.swift

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -325,21 +325,23 @@ extension RegexTests {
325325
parseTest(#"\070"#, scalar("\u{38}"))
326326
parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
327327
parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
328-
parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
328+
parseTest(#"\0707"#, scalar("\u{1C7}"))
329329

330330
parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
331331
parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
332332
parseTest(#"[\070]"#, charClass(scalar_m("\u{38}")))
333333

334334
parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
335335
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
336-
parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
336+
parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
337337

338-
parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
339-
parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
340-
parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
341-
parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
342-
parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
338+
// TODO: These are treated as octal sequences by PCRE, we should warn and
339+
// suggest user prefix with 0.
340+
parseTest(#"[\1]"#, charClass("1"))
341+
parseTest(#"[\123]"#, charClass("1", "2", "3"))
342+
parseTest(#"[\101]"#, charClass("1", "0", "1"))
343+
parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
344+
parseTest(#"[\181]"#, charClass("1", "8", "1"))
343345

344346
// We take *up to* the first two valid digits for \x. No valid digits is 0.
345347
parseTest(#"\x"#, scalar("\u{0}"))
@@ -797,11 +799,9 @@ extension RegexTests {
797799
)
798800
}
799801

800-
// TODO: Some of these behaviors are unintuitive, we should likely warn on
801-
// some of them.
802-
parseTest(#"\10"#, scalar("\u{8}"))
803-
parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
804-
parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
802+
parseTest(#"\10"#, backreference(.absolute(10)))
803+
parseTest(#"\18"#, backreference(.absolute(18)))
804+
parseTest(#"\7777"#, backreference(.absolute(7777)))
805805
parseTest(#"\91"#, backreference(.absolute(91)))
806806

807807
parseTest(
@@ -813,21 +813,22 @@ extension RegexTests {
813813
parseTest(
814814
#"()()()()()()()()()\10()"#,
815815
concat(Array(repeating: capture(empty()), count: 9)
816-
+ [scalar("\u{8}"), capture(empty())]),
816+
+ [backreference(.absolute(10)), capture(empty())]),
817817
captures: .tuple(Array(repeating: .atom(), count: 10))
818818
)
819-
parseTest(#"()()\10"#,
820-
concat(capture(empty()), capture(empty()), scalar("\u{8}")),
821-
captures: .tuple(.atom(), .atom()))
819+
parseTest(#"()()\10"#, concat(
820+
capture(empty()), capture(empty()), backreference(.absolute(10))),
821+
captures: .tuple(.atom(), .atom())
822+
)
822823

823824
// A capture of three empty captures.
824825
let fourCaptures = capture(
825826
concat(capture(empty()), capture(empty()), capture(empty()))
826827
)
827828
parseTest(
828829
// There are 9 capture groups in total here.
829-
#"((()()())(()()()))\10"#,
830-
concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
830+
#"((()()())(()()()))\10"#, concat(capture(concat(
831+
fourCaptures, fourCaptures)), backreference(.absolute(10))),
831832
captures: .tuple(Array(repeating: .atom(), count: 9))
832833
)
833834
parseTest(
@@ -852,7 +853,7 @@ extension RegexTests {
852853
concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
853854
captures: .tuple(Array(repeating: .atom(), count: 40))
854855
)
855-
parseTest(#"\40"#, scalar(" "))
856+
parseTest(#"\40"#, backreference(.absolute(40)))
856857
parseTest(
857858
String(repeating: "()", count: 40) + #"\40"#,
858859
concat(Array(repeating: capture(empty()), count: 40)
@@ -862,7 +863,7 @@ extension RegexTests {
862863

863864
parseTest(#"\7"#, backreference(.absolute(7)))
864865

865-
parseTest(#"\11"#, scalar("\u{9}"))
866+
parseTest(#"\11"#, backreference(.absolute(11)))
866867
parseTest(
867868
String(repeating: "()", count: 11) + #"\11"#,
868869
concat(Array(repeating: capture(empty()), count: 11)
@@ -876,12 +877,11 @@ extension RegexTests {
876877
captures: .tuple(Array(repeating: .atom(), count: 11))
877878
)
878879

879-
parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
880-
parseTest(#"\113"#, scalar("\u{4B}"))
881-
parseTest(#"\377"#, scalar("\u{FF}"))
880+
parseTest(#"\0113"#, scalar("\u{4B}"))
881+
parseTest(#"\113"#, backreference(.absolute(113)))
882+
parseTest(#"\377"#, backreference(.absolute(377)))
882883
parseTest(#"\81"#, backreference(.absolute(81)))
883884

884-
885885
parseTest(#"\g1"#, backreference(.absolute(1)))
886886
parseTest(#"\g001"#, backreference(.absolute(1)))
887887
parseTest(#"\g52"#, backreference(.absolute(52)))

0 commit comments

Comments
 (0)