Update \DDD parsing

hamishknight · hamishknight · commit 9d5529ee0585 · 2022-02-17T17:15:55.000Z
Previously we followed PCRE's parsing of this syntax
such that it may either be an octal sequence or
backreference depending on a list of heuristics.
However this model is complicated and not particularly
intuitive, especially as there are other engines
that disambiguate using subtly different rules.

Instead, always parse `\DDD` as a backreference,
unless it begins with `0`, in which case it is an
octal sequence. This matches ICU and Java's
behavior. Once we start validating group references,
we can then start emitting an error on invalid
backreferences using this syntax, and suggest
prefixing with 0 if an octal sequence is desired.
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
   ///                | 'x'  HexDigit{0...2}
   ///                | 'U'  HexDigit{8}
   ///                | 'o{' OctalDigit{1...} '}'
-  ///                | OctalDigit{1...3}
+  ///                | '0' OctalDigit{0...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
         let str = try src.lexUntil(eating: "}").value
         return try Source.validateUnicodeScalar(str, .octal)
 
-      case let c where c.isOctalDigit:
-        // We can read *up to* 2 more octal digits per PCRE.
-        // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
-        // we should have a parser mode to switch.
-        let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
-        let str = String(c) + (nextDigits?.string ?? "")
-        return try Source.validateUnicodeScalar(str, .octal)
+      case "0":
+        // We can read *up to* 3 more octal digits.
+        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+        // PCRE mode, we should limit it here.
+        guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
+          return Unicode.Scalar(0)
+        }
+        return try Source.validateUnicodeScalar(digits.string, .octal)
 
       default:
         fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
           return nil
         }
 
-        // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
-        // it is treated as a backreference if its first digit is not 0 (as that
-        // is always octal) and one of the following holds:
-        //
-        // - It's 0 < n < 10 (as octal would be pointless here)
-        // - Its first digit is 8 or 9 (as not valid octal)
-        // - There have been as many prior groups as the reference.
-        //
-        // Oniguruma follows the same rules except the second one. e.g \81 and
-        // \91 are instead treated as literal 81 and 91 respectively.
-        // TODO: If we want a strict Oniguruma mode, we'll need to add a check
-        // here.
+        // Backslash followed by a non-0 digit character is a backreference.
         if firstChar != "0", let numAndLoc = try src.lexNumber() {
-          let num = numAndLoc.value
-          let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
-          if num < 10 || firstChar == "8" || firstChar == "9" ||
-              context.isPriorGroupRef(ref.kind) {
-            return .backreference(ref)
-          }
-          return nil
+          return .backreference(.init(
+            .absolute(numAndLoc.value), innerLoc: numAndLoc.location))
         }
         return nil
       }
@@ -1497,10 +1482,8 @@ extension Source {
       }
 
       switch char {
-      // Hexadecimal and octal unicode scalars. This must be done after
-      // backreference lexing due to the ambiguity with \nnn.
-      case let c where c.isOctalDigit: fallthrough
-      case "u", "x", "U", "o":
+      // Hexadecimal and octal unicode scalars.
+      case "u", "x", "U", "o", "0":
         return try .scalar(
           src.expectUnicodeScalar(escapedCharacter: char).value)
       default:
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -261,7 +261,7 @@ extension RegexTests {
     firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
     firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
     firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
-    firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
+    firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
 
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
@@ -1021,9 +1021,6 @@ extension RegexTests {
     firstMatchTest(
       #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
       input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
-    firstMatchTest(
-      #"(.)\10"#,
-      input: "a\u{8}b", match: "a\u{8}")
 
     firstMatchTest(
       #"(.)\g001"#,
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -325,21 +325,23 @@ extension RegexTests {
     parseTest(#"\070"#, scalar("\u{38}"))
     parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
     parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
-    parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
+    parseTest(#"\0707"#, scalar("\u{1C7}"))
 
     parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
     parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
     parseTest(#"[\070]"#, charClass(scalar_m("\u{38}")))
 
     parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
     parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
-    parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
+    parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
 
-    parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
-    parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
-    parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
-    parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
-    parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
+    // TODO: These are treated as octal sequences by PCRE, we should warn and
+    // suggest user prefix with 0.
+    parseTest(#"[\1]"#, charClass("1"))
+    parseTest(#"[\123]"#, charClass("1", "2", "3"))
+    parseTest(#"[\101]"#, charClass("1", "0", "1"))
+    parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
+    parseTest(#"[\181]"#, charClass("1", "8", "1"))
 
     // We take *up to* the first two valid digits for \x. No valid digits is 0.
     parseTest(#"\x"#, scalar("\u{0}"))
@@ -797,11 +799,9 @@ extension RegexTests {
       )
     }
 
-    // TODO: Some of these behaviors are unintuitive, we should likely warn on
-    // some of them.
-    parseTest(#"\10"#, scalar("\u{8}"))
-    parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
-    parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
+    parseTest(#"\10"#, backreference(.absolute(10)))
+    parseTest(#"\18"#, backreference(.absolute(18)))
+    parseTest(#"\7777"#, backreference(.absolute(7777)))
     parseTest(#"\91"#, backreference(.absolute(91)))
 
     parseTest(
@@ -813,21 +813,22 @@ extension RegexTests {
     parseTest(
       #"()()()()()()()()()\10()"#,
       concat(Array(repeating: capture(empty()), count: 9)
-             + [scalar("\u{8}"), capture(empty())]),
+             + [backreference(.absolute(10)), capture(empty())]),
       captures: .tuple(Array(repeating: .atom(), count: 10))
     )
-    parseTest(#"()()\10"#,
-              concat(capture(empty()), capture(empty()), scalar("\u{8}")),
-              captures: .tuple(.atom(), .atom()))
+    parseTest(#"()()\10"#, concat(
+      capture(empty()), capture(empty()), backreference(.absolute(10))),
+              captures: .tuple(.atom(), .atom())
+    )
 
     // A capture of three empty captures.
     let fourCaptures = capture(
       concat(capture(empty()), capture(empty()), capture(empty()))
     )
     parseTest(
       // There are 9 capture groups in total here.
-      #"((()()())(()()()))\10"#,
-      concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
+      #"((()()())(()()()))\10"#, concat(capture(concat(
+        fourCaptures, fourCaptures)), backreference(.absolute(10))),
       captures: .tuple(Array(repeating: .atom(), count: 9))
     )
     parseTest(
@@ -852,7 +853,7 @@ extension RegexTests {
       concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
       captures: .tuple(Array(repeating: .atom(), count: 40))
     )
-    parseTest(#"\40"#, scalar(" "))
+    parseTest(#"\40"#, backreference(.absolute(40)))
     parseTest(
       String(repeating: "()", count: 40) + #"\40"#,
       concat(Array(repeating: capture(empty()), count: 40)
@@ -862,7 +863,7 @@ extension RegexTests {
 
     parseTest(#"\7"#, backreference(.absolute(7)))
 
-    parseTest(#"\11"#, scalar("\u{9}"))
+    parseTest(#"\11"#, backreference(.absolute(11)))
     parseTest(
       String(repeating: "()", count: 11) + #"\11"#,
       concat(Array(repeating: capture(empty()), count: 11)
@@ -876,12 +877,11 @@ extension RegexTests {
       captures: .tuple(Array(repeating: .atom(), count: 11))
     )
 
-    parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
-    parseTest(#"\113"#, scalar("\u{4B}"))
-    parseTest(#"\377"#, scalar("\u{FF}"))
+    parseTest(#"\0113"#, scalar("\u{4B}"))
+    parseTest(#"\113"#, backreference(.absolute(113)))
+    parseTest(#"\377"#, backreference(.absolute(377)))
     parseTest(#"\81"#, backreference(.absolute(81)))
 
-
     parseTest(#"\g1"#, backreference(.absolute(1)))
     parseTest(#"\g001"#, backreference(.absolute(1)))
     parseTest(#"\g52"#, backreference(.absolute(52)))