From 1b7779a70402a3ef100436895ca415a06a739924 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Tue, 2 Aug 2022 20:36:00 +0100
Subject: [PATCH] Remove `re'...'` and `rx'...'` delimiters

We didn't end up choosing these, so remove their
lexing code. `#|...|#` remains to test the
experimental syntax.
---
 .../Regex/Parse/DelimiterLexing.swift         | 100 +-----------------
 Sources/_RegexParser/Regex/Parse/Parse.swift  |   4 +-
 Tests/RegexTests/LexTests.swift               |   3 -
 Tests/RegexTests/ParseTests.swift             |  89 ++++++----------
 4 files changed, 37 insertions(+), 159 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
index dd142f016..4d86f9d93 100644
--- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -31,7 +31,7 @@ public struct Delimiter: Hashable {
     switch kind {
     case .forwardSlash:
       return poundCount > 0
-    case .experimental, .reSingleQuote, .rxSingleQuote:
+    case .experimental:
       return false
     }
   }
@@ -47,15 +47,11 @@ extension Delimiter {
   enum Kind: Hashable, CaseIterable {
     case forwardSlash
     case experimental
-    case reSingleQuote
-    case rxSingleQuote
 
     var openingAndClosing: (opening: String, closing: String) {
       switch self {
       case .forwardSlash: return ("/", "/")
       case .experimental: return ("#|", "|#")
-      case .reSingleQuote: return ("re'", "'")
-      case .rxSingleQuote: return ("rx'", "'")
       }
     }
     var opening: String { openingAndClosing.opening }
@@ -67,7 +63,7 @@ extension Delimiter {
       switch self {
       case .forwardSlash:
         return true
-      case .experimental, .reSingleQuote, .rxSingleQuote:
+      case .experimental:
         return false
       }
     }
@@ -150,14 +146,6 @@ fileprivate struct DelimiterLexer {
     slice(at: cursor, count)
   }
 
-  /// Return the slice of `count` bytes preceding the current cursor, or `nil`
-  /// if there are fewer than `count` bytes before the cursor.
-  func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
-    let priorCursor = cursor - count
-    guard priorCursor >= start else { return nil }
-    return slice(at: priorCursor, count)
-  }
-
   /// Advance the cursor `n` bytes.
   mutating func advanceCursor(_ n: Int = 1) {
     cursor += n
@@ -186,86 +174,6 @@ fileprivate struct DelimiterLexer {
     return true
   }
 
-  /// Attempt to skip over a closing delimiter character that is unlikely to be
-  /// the actual closing delimiter.
-  mutating func trySkipDelimiter(_ delimiter: Delimiter) {
-    // Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
-    switch delimiter.kind {
-    case .forwardSlash, .experimental:
-      return
-    case .reSingleQuote, .rxSingleQuote:
-      break
-    }
-    guard load() == ascii("'") else { return }
-
-    /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
-    /// are the cases that could use single quotes. Note that none of these
-    /// would be valid regex endings anyway.
-    let calloutPrefix = "(?C"
-    let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
-      guard let priorSlice = sliceBehind(prior.utf8.count),
-            priorSlice.elementsEqual(prior.utf8)
-      else { return false }
-
-      // Make sure the slice isn't preceded by a '\', as that invalidates this
-      // analysis.
-      if let prior = sliceBehind(priorSlice.count + 1) {
-        return prior[0] != ascii("\\")
-      }
-      return true
-    }
-    guard let prefix = prefix else { return }
-    let isCallout = prefix == calloutPrefix
-
-    func isPossiblyGroupReference(_ c: UInt8) -> Bool {
-      // If this is an ASCII character, make sure it's for a group name. Leave
-      // other UTF-8 encoded scalars alone, this should at least catch cases
-      // where we run into a symbol such as `{`, `.`, `;` that would indicate
-      // we've likely advanced out of the bounds of the regex.
-      let scalar = UnicodeScalar(c)
-      guard scalar.isASCII else { return true }
-      switch scalar {
-      // Include '-' and '+' which may be used in recursion levels and relative
-      // references.
-      case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
-        return true
-      default:
-        return false
-      }
-    }
-
-    // Make a note of the current lexing position, as we may need to revert
-    // back to it.
-    let originalCursor = cursor
-    advanceCursor()
-
-    // Try skip over what would be the contents of a group identifier/reference.
-    while let next = load() {
-      // Found the ending, we're done. Return so we can continue to lex to the
-      // real delimiter.
-      if next == ascii("'") {
-        advanceCursor()
-        return
-      }
-
-      // If this isn't a callout, make sure we have something that could be a
-      // group reference. We limit the character set here to improve diagnostic
-      // behavior in the case where the literal is actually unterminated. We
-      // ideally don't want to go wandering off into Swift source code. We can't
-      // do the same for callouts, as they take arbitrary strings.
-      guard isCallout || isPossiblyGroupReference(next) else { break }
-      do {
-        try advance()
-      } catch {
-        break
-      }
-    }
-    // We bailed out, either because we ran into something that didn't look like
-    // an identifier, or we reached the end of the line. Revert back to the
-    // original guess of delimiter.
-    cursor = originalCursor
-  }
-
   /// Attempt to eat a particular closing delimiter, returning the contents of
   /// the literal, and ending pointer, or `nil` if this is not a delimiter
   /// ending.
@@ -401,10 +309,6 @@ fileprivate struct DelimiterLexer {
       }
     }
     while true {
-      // Check to see if we're at a character that looks like a delimiter, but
-      // likely isn't. In such a case, we can attempt to skip over it.
-      trySkipDelimiter(delimiter)
-
       // Try to lex the closing delimiter.
       if let (contents, end) = try tryEatEnding(delimiter,
                                                 contentsStart: contentsStart) {
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index 0aae031d5..d9b6f23a0 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -672,9 +672,7 @@ fileprivate func defaultSyntaxOptions(
       return [.multilineCompilerLiteral, .extendedSyntax]
     }
     return .traditional
-  case .reSingleQuote:
-    return .traditional
-  case .experimental, .rxSingleQuote:
+  case .experimental:
     return .experimental
   }
 }
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
index 49184deb3..53775e66e 100644
--- a/Tests/RegexTests/LexTests.swift
+++ b/Tests/RegexTests/LexTests.swift
@@ -96,9 +96,6 @@ extension RegexTests {
       ("#|abc/#def#", nil),
       ("#/abc\n/#", nil),
       ("#/abc\r/#", nil),
-
-      (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))),
-      (#"re'\'"#, nil)
     ]
 
     for (input, expected) in testCases {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 84ce361f3..0e7d41eed 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -2151,9 +2151,6 @@ extension RegexTests {
     parseWithDelimitersTest("##/a b/##", concat("a", " ", "b"))
     parseWithDelimitersTest("#|a b|#", concat("a", "b"))
 
-    parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
-    parseWithDelimitersTest("rx'a b'", concat("a", "b"))
-
     parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
     parseWithDelimitersTest(
       "#|(?-x)[a b]|#", concat(
@@ -2176,13 +2173,13 @@ extension RegexTests {
     parseWithDelimitersTest("#||||#", alt(empty(), empty(), empty()))
     parseWithDelimitersTest("#|a||#", alt("a", empty()))
 
-    parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
+    parseWithDelimitersTest("/x*/", zeroOrMore(of: "x"))
 
-    parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
-    parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))
+    parseWithDelimitersTest(#"/🔥🇩🇰/"#, concat("🔥", "🇩🇰"))
+    parseWithDelimitersTest(#"/🔥✅/"#, concat("🔥", "✅"))
 
     // Printable ASCII characters.
-    delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
+    delimiterLexingTest(##"#/ !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~/#"##)
 
     // Make sure we can handle a combining accent as first character.
     parseWithDelimitersTest("/\u{301}/", "\u{301}")
@@ -2294,72 +2291,61 @@ extension RegexTests {
       /#
       """#, charClass(range_m("a", "b")))
 
-
-    // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
-    // if it's clear that it's part of the regex syntax.
-
     parseWithDelimitersTest(
-      #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
+      #"/(?'a_bcA0'\')/"#, namedCapture("a_bcA0", "'"))
     parseWithDelimitersTest(
-      #"re'(?'a_bcA0-c1A'x*)'"#,
+      #"/(?'a_bcA0-c1A'x*)/"#,
       balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")),
       unsupported: true)
 
     parseWithDelimitersTest(
-      #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b"))))
+      #"/ (?'a_bcA0' a b)/"#, concat(" ", namedCapture("a_bcA0", concat(" ", "a", " ", "b"))))
 
     parseWithDelimitersTest(
-      #"re'(?('a_bcA0')x|y)'"#, conditional(
+      #"/(?('a_bcA0')x|y)/"#, conditional(
         .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"),
       unsupported: true
     )
     parseWithDelimitersTest(
-      #"re'(?('+20')\')'"#, conditional(
+      #"/(?('+20')\')/"#, conditional(
         .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()),
       unsupported: true
     )
     parseWithDelimitersTest(
-      #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
+      #"/a\k'b0A'/"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
     parseWithDelimitersTest(
-      #"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1),
+      #"/\k'+2-1'/"#, backreference(ref(plus: 2), recursionLevel: -1),
       unsupported: true
     )
 
     parseWithDelimitersTest(
-      #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
+      #"/a\g'b0A'/"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
     parseWithDelimitersTest(
-      #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)
+      #"/\g'-1'\'/"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)
 
     parseWithDelimitersTest(
-      #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
+      #"/(?C'a*b\c 🔥_ ;')/"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
       unsupported: true)
 
-    // Fine, because we don't end up skipping.
-    delimiterLexingTest(#"re'(?'"#)
-    delimiterLexingTest(#"re'(?('"#)
-    delimiterLexingTest(#"re'\k'"#)
-    delimiterLexingTest(#"re'\g'"#)
-    delimiterLexingTest(#"re'(?C'"#)
+    delimiterLexingTest(#"/(?/"#)
+    delimiterLexingTest(#"/(?(/"#)
+    delimiterLexingTest(#"/\k/"#)
+    delimiterLexingTest(#"/\g/"#)
+    delimiterLexingTest(#"/(?C/"#)
 
-    // Not a valid group name, but we can still skip over it.
-    delimiterLexingTest(#"re'(?'🔥')'"#)
+    delimiterLexingTest(#"/(?'🔥')/"#)
 
-    // Escaped, so don't skip. These will ignore the ending `'` as we've already
-    // closed the literal.
     parseWithDelimitersTest(
-      #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
-    )
+      #"/\(?/"#, zeroOrOne(of: "("))
     parseWithDelimitersTest(
-      #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
-    )
+      #"/\\k/"#, concat("\\", "k"))
     parseWithDelimitersTest(
-      #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
-    )
+      #"/\\g/"#, concat("\\", "g"))
     parseWithDelimitersTest(
-      #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
-    )
-    delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
-    delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
+      #"/\(?C/"#, concat(zeroOrOne(of: "("), "C"))
+
+    delimiterLexingTest(#"/(\?/"#)
+    delimiterLexingTest(#"/\(?(/"#)
 
     // MARK: Parse not-equal
 
@@ -3322,21 +3308,17 @@ extension RegexTests {
 
     // MARK: Printable ASCII
 
-    delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated)
     for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
-      delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
+      delimiterLexingDiagnosticTest("/\(UnicodeScalar(i))/", .unprintableASCII)
     }
-    delimiterLexingDiagnosticTest("re'\n'", .unterminated)
-    delimiterLexingDiagnosticTest("re'\r'", .unterminated)
-    delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
 
-    // MARK: Delimiter skipping
+    // A literal can only span multiple lines if pound signs are used.
+    delimiterLexingDiagnosticTest("/\n/", .unterminated)
+    delimiterLexingDiagnosticTest("/\r/", .unterminated)
+    delimiterLexingDiagnosticTest("/\u{7F}/", .unprintableASCII)
 
-    delimiterLexingDiagnosticTest("re'(?''", .unterminated)
-    delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated)
-    delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
-    delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
-    delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
+    delimiterLexingDiagnosticTest("/", .unterminated)
+    delimiterLexingDiagnosticTest("/x", .unterminated)
 
     // MARK: Unbalanced extended syntax
     delimiterLexingDiagnosticTest("#/a/", .unterminated)
@@ -3344,9 +3326,6 @@ extension RegexTests {
 
     // MARK: Multiline
 
-    // Can only be done if pound signs are used.
-    delimiterLexingDiagnosticTest("/\n/", .unterminated)
-
     // Opening and closing delimiters must be on a newline.
     delimiterLexingDiagnosticTest("#/a\n/#", .unterminated)
     delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline)