swiftlang · hamishknight · Feb 28, 2022 · Feb 17, 2022 · Feb 17, 2022
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
   ///                | 'x'  HexDigit{0...2}
   ///                | 'U'  HexDigit{8}
   ///                | 'o{' OctalDigit{1...} '}'
-  ///                | OctalDigit{1...3}
+  ///                | '0' OctalDigit{0...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
         let str = try src.lexUntil(eating: "}").value
         return try Source.validateUnicodeScalar(str, .octal)
 
-      case let c where c.isOctalDigit:
-        // We can read *up to* 2 more octal digits per PCRE.
-        // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
-        // we should have a parser mode to switch.
-        let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
-        let str = String(c) + (nextDigits?.string ?? "")
-        return try Source.validateUnicodeScalar(str, .octal)
+      case "0":
+        // We can read *up to* 3 more octal digits.
+        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+        // PCRE mode, we should limit it here.
+        guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
+          return Unicode.Scalar(0)
+        }
+        return try Source.validateUnicodeScalar(digits.string, .octal)
 
       default:
         fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
           return nil
         }
 
-        // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
-        // it is treated as a backreference if its first digit is not 0 (as that
-        // is always octal) and one of the following holds:
-        //
-        // - It's 0 < n < 10 (as octal would be pointless here)
-        // - Its first digit is 8 or 9 (as not valid octal)
-        // - There have been as many prior groups as the reference.
-        //
-        // Oniguruma follows the same rules except the second one. e.g \81 and
-        // \91 are instead treated as literal 81 and 91 respectively.
-        // TODO: If we want a strict Oniguruma mode, we'll need to add a check
-        // here.
+        // Backslash followed by a non-0 digit character is a backreference.
         if firstChar != "0", let numAndLoc = try src.lexNumber() {
-          let num = numAndLoc.value
-          let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
-          if num < 10 || firstChar == "8" || firstChar == "9" ||
-              context.isPriorGroupRef(ref.kind) {
-            return .backreference(ref)
-          }
-          return nil
+          return .backreference(.init(
+            .absolute(numAndLoc.value), innerLoc: numAndLoc.location))
         }
         return nil
       }
@@ -1497,10 +1482,8 @@ extension Source {
       }
 
       switch char {
-      // Hexadecimal and octal unicode scalars. This must be done after
-      // backreference lexing due to the ambiguity with \nnn.
-      case let c where c.isOctalDigit: fallthrough
-      case "u", "x", "U", "o":
+      // Hexadecimal and octal unicode scalars.
+      case "u", "x", "U", "o", "0":
         return try .scalar(
           src.expectUnicodeScalar(escapedCharacter: char).value)
       default:

diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -261,7 +261,7 @@ extension RegexTests {
     firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
     firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
     firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
-    firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
+    firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
 
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
@@ -1021,9 +1021,6 @@ extension RegexTests {
     firstMatchTest(
       #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
       input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
-    firstMatchTest(
-      #"(.)\10"#,
-      input: "a\u{8}b", match: "a\u{8}")
 
     firstMatchTest(
       #"(.)\g001"#,

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -325,21 +325,23 @@ extension RegexTests {
     parseTest(#"\070"#, scalar("\u{38}"))
     parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
     parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
-    parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
+    parseTest(#"\0707"#, scalar("\u{1C7}"))
 
     parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
     parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
     parseTest(#"[\070]"#, charClass(scalar_m("\u{38}")))
 
     parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
     parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
-    parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
+    parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
 
-    parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
-    parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
-    parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
-    parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
-    parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
+    // TODO: These are treated as octal sequences by PCRE, we should warn and
+    // suggest user prefix with 0.
+    parseTest(#"[\1]"#, charClass("1"))
+    parseTest(#"[\123]"#, charClass("1", "2", "3"))
+    parseTest(#"[\101]"#, charClass("1", "0", "1"))
+    parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
+    parseTest(#"[\181]"#, charClass("1", "8", "1"))
 
     // We take *up to* the first two valid digits for \x. No valid digits is 0.
     parseTest(#"\x"#, scalar("\u{0}"))
@@ -488,6 +490,10 @@ extension RegexTests {
       #"a\Q \Q \\.\Eb"#,
       concat("a", quote(#" \Q \\."#), "b"))
 
+    // These follow the PCRE behavior.
+    parseTest(#"\Q\\E"#, quote("\\"))
+    parseTest(#"\E"#, "E")
+
     parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
               syntax: .experimental)
     parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
@@ -793,11 +799,9 @@ extension RegexTests {
       )
     }
 
-    // TODO: Some of these behaviors are unintuitive, we should likely warn on
-    // some of them.
-    parseTest(#"\10"#, scalar("\u{8}"))
-    parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
-    parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
+    parseTest(#"\10"#, backreference(.absolute(10)))
+    parseTest(#"\18"#, backreference(.absolute(18)))
+    parseTest(#"\7777"#, backreference(.absolute(7777)))
     parseTest(#"\91"#, backreference(.absolute(91)))
 
     parseTest(
@@ -809,21 +813,22 @@ extension RegexTests {
     parseTest(
       #"()()()()()()()()()\10()"#,
       concat(Array(repeating: capture(empty()), count: 9)
-             + [scalar("\u{8}"), capture(empty())]),
+             + [backreference(.absolute(10)), capture(empty())]),
       captures: .tuple(Array(repeating: .atom(), count: 10))
     )
-    parseTest(#"()()\10"#,
-              concat(capture(empty()), capture(empty()), scalar("\u{8}")),
-              captures: .tuple(.atom(), .atom()))
+    parseTest(#"()()\10"#, concat(
+      capture(empty()), capture(empty()), backreference(.absolute(10))),
+              captures: .tuple(.atom(), .atom())
+    )
 
     // A capture of three empty captures.
     let fourCaptures = capture(
       concat(capture(empty()), capture(empty()), capture(empty()))
     )
     parseTest(
       // There are 9 capture groups in total here.
-      #"((()()())(()()()))\10"#,
-      concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
+      #"((()()())(()()()))\10"#, concat(capture(concat(
+        fourCaptures, fourCaptures)), backreference(.absolute(10))),
       captures: .tuple(Array(repeating: .atom(), count: 9))
     )
     parseTest(
@@ -848,7 +853,7 @@ extension RegexTests {
       concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
       captures: .tuple(Array(repeating: .atom(), count: 40))
     )
-    parseTest(#"\40"#, scalar(" "))
+    parseTest(#"\40"#, backreference(.absolute(40)))
     parseTest(
       String(repeating: "()", count: 40) + #"\40"#,
       concat(Array(repeating: capture(empty()), count: 40)
@@ -858,7 +863,7 @@ extension RegexTests {
 
     parseTest(#"\7"#, backreference(.absolute(7)))
 
-    parseTest(#"\11"#, scalar("\u{9}"))
+    parseTest(#"\11"#, backreference(.absolute(11)))
     parseTest(
       String(repeating: "()", count: 11) + #"\11"#,
       concat(Array(repeating: capture(empty()), count: 11)
@@ -872,12 +877,11 @@ extension RegexTests {
       captures: .tuple(Array(repeating: .atom(), count: 11))
     )
 
-    parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
-    parseTest(#"\113"#, scalar("\u{4B}"))
-    parseTest(#"\377"#, scalar("\u{FF}"))
+    parseTest(#"\0113"#, scalar("\u{4B}"))
+    parseTest(#"\113"#, backreference(.absolute(113)))
+    parseTest(#"\377"#, backreference(.absolute(377)))
     parseTest(#"\81"#, backreference(.absolute(81)))
 
-
     parseTest(#"\g1"#, backreference(.absolute(1)))
     parseTest(#"\g001"#, backreference(.absolute(1)))
     parseTest(#"\g52"#, backreference(.absolute(52)))