From 04ad64ab7e936cb7427f5f00df017ab86472a3b3 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 17 Feb 2022 17:15:54 +0000
Subject: [PATCH 1/2] Add a couple of quoted test cases

---
 Tests/RegexTests/ParseTests.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 4872e256e..e70939d53 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -488,6 +488,10 @@ extension RegexTests {
       #"a\Q \Q \\.\Eb"#,
       concat("a", quote(#" \Q \\."#), "b"))
 
+    // These follow the PCRE behavior.
+    parseTest(#"\Q\\E"#, quote("\\"))
+    parseTest(#"\E"#, "E")
+
     parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
               syntax: .experimental)
     parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),

From 9d5529ee05858ccb49efeec1ba1a0cf50ee9046f Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 17 Feb 2022 17:15:55 +0000
Subject: [PATCH 2/2] Update \DDD parsing

Previously we followed PCRE's parsing of this syntax
such that it may either be an octal sequence or
backreference depending on a list of heuristics.
However this model is complicated and not particularly
intuitive, especially as there are other engines
that disambiguate using subtly different rules.

Instead, always parse `\DDD` as a backreference,
unless it begins with `0`, in which case it is an
octal sequence. This matches ICU and Java's
behavior. Once we start validating group references,
we can then start emitting an error on invalid
backreferences using this syntax, and suggest
prefixing with 0 if an octal sequence is desired.
---
 .../Regex/Parse/LexicalAnalysis.swift         | 45 ++++++-----------
 Tests/RegexTests/MatchTests.swift             |  5 +-
 Tests/RegexTests/ParseTests.swift             | 48 +++++++++----------
 3 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
index 727727ce1..dd785f12d 100644
--- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
   ///                | 'x'  HexDigit{0...2}
   ///                | 'U'  HexDigit{8}
   ///                | 'o{' OctalDigit{1...} '}'
-  ///                | OctalDigit{1...3}
+  ///                | '0' OctalDigit{0...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
         let str = try src.lexUntil(eating: "}").value
         return try Source.validateUnicodeScalar(str, .octal)
 
-      case let c where c.isOctalDigit:
-        // We can read *up to* 2 more octal digits per PCRE.
-        // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
-        // we should have a parser mode to switch.
-        let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
-        let str = String(c) + (nextDigits?.string ?? "")
-        return try Source.validateUnicodeScalar(str, .octal)
+      case "0":
+        // We can read *up to* 3 more octal digits.
+        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+        // PCRE mode, we should limit it here.
+        guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
+          return Unicode.Scalar(0)
+        }
+        return try Source.validateUnicodeScalar(digits.string, .octal)
 
       default:
         fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
           return nil
         }
 
-        // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
-        // it is treated as a backreference if its first digit is not 0 (as that
-        // is always octal) and one of the following holds:
-        //
-        // - It's 0 < n < 10 (as octal would be pointless here)
-        // - Its first digit is 8 or 9 (as not valid octal)
-        // - There have been as many prior groups as the reference.
-        //
-        // Oniguruma follows the same rules except the second one. e.g \81 and
-        // \91 are instead treated as literal 81 and 91 respectively.
-        // TODO: If we want a strict Oniguruma mode, we'll need to add a check
-        // here.
+        // Backslash followed by a non-0 digit character is a backreference.
         if firstChar != "0", let numAndLoc = try src.lexNumber() {
-          let num = numAndLoc.value
-          let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
-          if num < 10 || firstChar == "8" || firstChar == "9" ||
-              context.isPriorGroupRef(ref.kind) {
-            return .backreference(ref)
-          }
-          return nil
+          return .backreference(.init(
+            .absolute(numAndLoc.value), innerLoc: numAndLoc.location))
         }
         return nil
       }
@@ -1497,10 +1482,8 @@ extension Source {
       }
 
       switch char {
-      // Hexadecimal and octal unicode scalars. This must be done after
-      // backreference lexing due to the ambiguity with \nnn.
-      case let c where c.isOctalDigit: fallthrough
-      case "u", "x", "U", "o":
+      // Hexadecimal and octal unicode scalars.
+      case "u", "x", "U", "o", "0":
         return try .scalar(
           src.expectUnicodeScalar(escapedCharacter: char).value)
       default:
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 088deb151..2c07f4c79 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -261,7 +261,7 @@ extension RegexTests {
     firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
     firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
     firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
-    firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
+    firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
 
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
@@ -1021,9 +1021,6 @@ extension RegexTests {
     firstMatchTest(
       #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
       input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
-    firstMatchTest(
-      #"(.)\10"#,
-      input: "a\u{8}b", match: "a\u{8}")
 
     firstMatchTest(
       #"(.)\g001"#,
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index e70939d53..ce070cc38 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -325,7 +325,7 @@ extension RegexTests {
     parseTest(#"\070"#, scalar("\u{38}"))
     parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
     parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
-    parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
+    parseTest(#"\0707"#, scalar("\u{1C7}"))
 
     parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
     parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
@@ -333,13 +333,15 @@ extension RegexTests {
 
     parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
     parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
-    parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
+    parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
 
-    parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
-    parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
-    parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
-    parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
-    parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
+    // TODO: These are treated as octal sequences by PCRE, we should warn and
+    // suggest user prefix with 0.
+    parseTest(#"[\1]"#, charClass("1"))
+    parseTest(#"[\123]"#, charClass("1", "2", "3"))
+    parseTest(#"[\101]"#, charClass("1", "0", "1"))
+    parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
+    parseTest(#"[\181]"#, charClass("1", "8", "1"))
 
     // We take *up to* the first two valid digits for \x. No valid digits is 0.
     parseTest(#"\x"#, scalar("\u{0}"))
@@ -797,11 +799,9 @@ extension RegexTests {
       )
     }
 
-    // TODO: Some of these behaviors are unintuitive, we should likely warn on
-    // some of them.
-    parseTest(#"\10"#, scalar("\u{8}"))
-    parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
-    parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
+    parseTest(#"\10"#, backreference(.absolute(10)))
+    parseTest(#"\18"#, backreference(.absolute(18)))
+    parseTest(#"\7777"#, backreference(.absolute(7777)))
     parseTest(#"\91"#, backreference(.absolute(91)))
 
     parseTest(
@@ -813,12 +813,13 @@ extension RegexTests {
     parseTest(
       #"()()()()()()()()()\10()"#,
       concat(Array(repeating: capture(empty()), count: 9)
-             + [scalar("\u{8}"), capture(empty())]),
+             + [backreference(.absolute(10)), capture(empty())]),
       captures: .tuple(Array(repeating: .atom(), count: 10))
     )
-    parseTest(#"()()\10"#,
-              concat(capture(empty()), capture(empty()), scalar("\u{8}")),
-              captures: .tuple(.atom(), .atom()))
+    parseTest(#"()()\10"#, concat(
+      capture(empty()), capture(empty()), backreference(.absolute(10))),
+              captures: .tuple(.atom(), .atom())
+    )
 
     // A capture of three empty captures.
     let fourCaptures = capture(
@@ -826,8 +827,8 @@ extension RegexTests {
     )
     parseTest(
       // There are 9 capture groups in total here.
-      #"((()()())(()()()))\10"#,
-      concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
+      #"((()()())(()()()))\10"#, concat(capture(concat(
+        fourCaptures, fourCaptures)), backreference(.absolute(10))),
       captures: .tuple(Array(repeating: .atom(), count: 9))
     )
     parseTest(
@@ -852,7 +853,7 @@ extension RegexTests {
       concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
       captures: .tuple(Array(repeating: .atom(), count: 40))
     )
-    parseTest(#"\40"#, scalar(" "))
+    parseTest(#"\40"#, backreference(.absolute(40)))
     parseTest(
       String(repeating: "()", count: 40) + #"\40"#,
       concat(Array(repeating: capture(empty()), count: 40)
@@ -862,7 +863,7 @@ extension RegexTests {
 
     parseTest(#"\7"#, backreference(.absolute(7)))
 
-    parseTest(#"\11"#, scalar("\u{9}"))
+    parseTest(#"\11"#, backreference(.absolute(11)))
     parseTest(
       String(repeating: "()", count: 11) + #"\11"#,
       concat(Array(repeating: capture(empty()), count: 11)
@@ -876,12 +877,11 @@ extension RegexTests {
       captures: .tuple(Array(repeating: .atom(), count: 11))
     )
 
-    parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
-    parseTest(#"\113"#, scalar("\u{4B}"))
-    parseTest(#"\377"#, scalar("\u{FF}"))
+    parseTest(#"\0113"#, scalar("\u{4B}"))
+    parseTest(#"\113"#, backreference(.absolute(113)))
+    parseTest(#"\377"#, backreference(.absolute(377)))
     parseTest(#"\81"#, backreference(.absolute(81)))
 
-
     parseTest(#"\g1"#, backreference(.absolute(1)))
     parseTest(#"\g001"#, backreference(.absolute(1)))
     parseTest(#"\g52"#, backreference(.absolute(52)))