From cebf4a6a20fde052287358c6706c27084aa4d27e Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:55 +0000
Subject: [PATCH 1/8] Fix crash on lone backslash

---
 Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift | 4 +++-
 Sources/_MatchingEngine/Regex/Parse/Source.swift          | 6 ++++++
 Tests/RegexTests/ParseTests.swift                         | 4 ++++
 3 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
index dd785f12d..cfab75312 100644
--- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -1472,7 +1472,9 @@ extension Source {
         return ref
       }
 
-      let char = src.eat()
+      guard let char = src.tryEat() else {
+        throw ParseError.expectedEscape
+      }
 
       // Single-character builtins.
       if let builtin = AST.Atom.EscapedBuiltin(
diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift
index 11bd8152f..ddf0475f3 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Source.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift
@@ -86,6 +86,12 @@ extension Source {
     tryEat(anyOf: set)
   }
 
+  /// Try to eat any character, returning `nil` if the input has been exhausted.
+  mutating func tryEat() -> Char? {
+    guard !isEmpty else { return nil }
+    return eat()
+  }
+
   mutating func eat(asserting c: Char) {
     assert(peek() == c)
     advance()
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index e55abcbb9..23a3b910f 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -1753,6 +1753,10 @@ extension RegexTests {
     diagnosticTest("(?<a-b", .expected(">"))
     diagnosticTest("(?<a-b>", .expected(")"))
 
+    // MARK: Bad escapes
+
+    diagnosticTest("\\", .expectedEscape)
+
     // MARK: Text Segment options
 
     diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)

From c48fb1cbdeeb37ba55ff449a1046bbefe48e7d24 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:55 +0000
Subject: [PATCH 2/8] Separate out DelimiterLexing.swift

---
 .../Regex/Parse/DelimiterLexing.swift         | 153 ++++++++++++++++++
 .../_MatchingEngine/Regex/Parse/Mocking.swift | 144 -----------------
 2 files changed, 153 insertions(+), 144 deletions(-)
 create mode 100644 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
new file mode 100644
index 000000000..70532f9e7
--- /dev/null
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: mock up multi-line soon
+
+enum Delimiter: Hashable, CaseIterable {
+  case traditional
+  case experimental
+  case reSingleQuote
+
+  var openingAndClosing: (opening: String, closing: String) {
+    switch self {
+    case .traditional: return ("#/", "/#")
+    case .experimental: return ("#|", "|#")
+    case .reSingleQuote: return ("re'", "'")
+    }
+  }
+  var opening: String { openingAndClosing.opening }
+  var closing: String { openingAndClosing.closing }
+
+  /// The default set of syntax options that the delimiter indicates.
+  var defaultSyntaxOptions: SyntaxOptions {
+    switch self {
+    case .traditional, .reSingleQuote:
+      return .traditional
+    case .experimental:
+      return .experimental
+    }
+  }
+}
+
+struct LexError: Error, CustomStringConvertible {
+  enum Kind: Hashable {
+    case endOfString
+    case invalidUTF8 // TODO: better range reporting
+    case unknownDelimiter
+  }
+
+  var kind: Kind
+
+  /// The pointer at which to resume lexing.
+  var resumePtr: UnsafeRawPointer
+
+  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
+    self.kind = kind
+    self.resumePtr = resumePtr
+  }
+
+  var description: String {
+    switch kind {
+    case .endOfString: return "unterminated regex literal"
+    case .invalidUTF8: return "invalid UTF-8 found in source file"
+    case .unknownDelimiter: return "unknown regex literal delimiter"
+    }
+  }
+}
+
+/// Attempt to lex a regex literal between `start` and `end`, returning either
+/// the contents and pointer from which to resume lexing, or an error.
+func lexRegex(
+  start: UnsafeRawPointer, end: UnsafeRawPointer
+) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+  precondition(start <= end)
+  var current = start
+
+  func ascii(_ s: Unicode.Scalar) -> UInt8 {
+    assert(s.value <= 0x7F)
+    return UInt8(asserting: s.value)
+  }
+  func load(offset: Int) -> UInt8? {
+    guard current + offset < end else { return nil }
+    return current.load(fromByteOffset: offset, as: UInt8.self)
+  }
+  func load() -> UInt8? { load(offset: 0) }
+  func advance(_ n: Int = 1) {
+    precondition(current + n <= end, "Cannot advance past end")
+    current = current.advanced(by: n)
+  }
+
+  func tryEat(_ utf8: String.UTF8View) -> Bool {
+    for (i, idx) in utf8.indices.enumerated() {
+      guard load(offset: i) == utf8[idx] else { return false }
+    }
+    advance(utf8.count)
+    return true
+  }
+
+  // Try to lex the opening delimiter.
+  guard let delimiter = Delimiter.allCases.first(
+    where: { tryEat($0.opening.utf8) }
+  ) else {
+    throw LexError(.unknownDelimiter, resumeAt: current.successor())
+  }
+
+  let contentsStart = current
+  while true {
+    switch load() {
+    case nil, ascii("\n"), ascii("\r"):
+      throw LexError(.endOfString, resumeAt: current)
+
+    case ascii("\\"):
+      // Skip next byte.
+      advance(2)
+
+    default:
+      // Try to lex the closing delimiter.
+      let contentsEnd = current
+      guard tryEat(delimiter.closing.utf8) else {
+        advance()
+        continue
+      }
+
+      // Form a string from the contents and make sure it's valid UTF-8.
+      let count = contentsEnd - contentsStart
+      let contents = UnsafeRawBufferPointer(
+        start: contentsStart, count: count)
+      let s = String(decoding: contents, as: UTF8.self)
+
+      guard s.utf8.elementsEqual(contents) else {
+        throw LexError(.invalidUTF8, resumeAt: current)
+      }
+      return (contents: s, delimiter, end: current)
+    }
+  }
+}
+
+/// Drop a set of regex delimiters from the input string, returning the contents
+/// and the delimiter used. The input string must have valid delimiters.
+func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
+  let utf8 = str.utf8
+  func stripDelimiter(_ delim: Delimiter) -> String? {
+    let prefix = delim.opening.utf8
+    let suffix = delim.closing.utf8
+    guard utf8.prefix(prefix.count).elementsEqual(prefix),
+          utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }
+
+    return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
+  }
+  for d in Delimiter.allCases {
+    if let contents = stripDelimiter(d) {
+      return (contents, d)
+    }
+  }
+  fatalError("No valid delimiters")
+}
diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
index e3a178a15..dfba4757e 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
@@ -9,150 +9,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-
-// TODO: mock up multi-line soon
-
-enum Delimiter: Hashable, CaseIterable {
-  case traditional
-  case experimental
-  case reSingleQuote
-
-  var openingAndClosing: (opening: String, closing: String) {
-    switch self {
-    case .traditional: return ("#/", "/#")
-    case .experimental: return ("#|", "|#")
-    case .reSingleQuote: return ("re'", "'")
-    }
-  }
-  var opening: String { openingAndClosing.opening }
-  var closing: String { openingAndClosing.closing }
-
-  /// The default set of syntax options that the delimiter indicates.
-  var defaultSyntaxOptions: SyntaxOptions {
-    switch self {
-    case .traditional, .reSingleQuote:
-      return .traditional
-    case .experimental:
-      return .experimental
-    }
-  }
-}
-
-struct LexError: Error, CustomStringConvertible {
-  enum Kind: Hashable {
-    case endOfString
-    case invalidUTF8 // TODO: better range reporting
-    case unknownDelimiter
-  }
-
-  var kind: Kind
-
-  /// The pointer at which to resume lexing.
-  var resumePtr: UnsafeRawPointer
-
-  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
-    self.kind = kind
-    self.resumePtr = resumePtr
-  }
-
-  var description: String {
-    switch kind {
-    case .endOfString: return "unterminated regex literal"
-    case .invalidUTF8: return "invalid UTF-8 found in source file"
-    case .unknownDelimiter: return "unknown regex literal delimiter"
-    }
-  }
-}
-
-/// Drop a set of regex delimiters from the input string, returning the contents
-/// and the delimiter used. The input string must have valid delimiters.
-func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
-  let utf8 = str.utf8
-  func stripDelimiter(_ delim: Delimiter) -> String? {
-    let prefix = delim.opening.utf8
-    let suffix = delim.closing.utf8
-    guard utf8.prefix(prefix.count).elementsEqual(prefix),
-          utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }
-
-    return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
-  }
-  for d in Delimiter.allCases {
-    if let contents = stripDelimiter(d) {
-      return (contents, d)
-    }
-  }
-  fatalError("No valid delimiters")
-}
-
-/// Attempt to lex a regex literal between `start` and `end`, returning either
-/// the contents and pointer from which to resume lexing, or an error.
-func lexRegex(
-  start: UnsafeRawPointer, end: UnsafeRawPointer
-) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
-  precondition(start <= end)
-  var current = start
-
-  func ascii(_ s: Unicode.Scalar) -> UInt8 {
-    assert(s.value <= 0x7F)
-    return UInt8(asserting: s.value)
-  }
-  func load(offset: Int) -> UInt8? {
-    guard current + offset < end else { return nil }
-    return current.load(fromByteOffset: offset, as: UInt8.self)
-  }
-  func load() -> UInt8? { load(offset: 0) }
-  func advance(_ n: Int = 1) {
-    precondition(current + n <= end, "Cannot advance past end")
-    current = current.advanced(by: n)
-  }
-
-  func tryEat(_ utf8: String.UTF8View) -> Bool {
-    for (i, idx) in utf8.indices.enumerated() {
-      guard load(offset: i) == utf8[idx] else { return false }
-    }
-    advance(utf8.count)
-    return true
-  }
-
-  // Try to lex the opening delimiter.
-  guard let delimiter = Delimiter.allCases.first(
-    where: { tryEat($0.opening.utf8) }
-  ) else {
-    throw LexError(.unknownDelimiter, resumeAt: current.successor())
-  }
-
-  let contentsStart = current
-  while true {
-    switch load() {
-    case nil, ascii("\n"), ascii("\r"):
-      throw LexError(.endOfString, resumeAt: current)
-
-    case ascii("\\"):
-      // Skip next byte.
-      advance(2)
-
-    default:
-      // Try to lex the closing delimiter.
-      let contentsEnd = current
-      guard tryEat(delimiter.closing.utf8) else {
-        advance()
-        continue
-      }
-
-      // Form a string from the contents and make sure it's valid UTF-8.
-      let count = contentsEnd - contentsStart
-      let contents = UnsafeRawBufferPointer(
-        start: contentsStart, count: count)
-      let s = String(decoding: contents, as: UTF8.self)
-
-      guard s.utf8.elementsEqual(contents) else {
-        throw LexError(.invalidUTF8, resumeAt: current)
-      }
-      return (contents: s, delimiter, end: current)
-    }
-  }
-}
-
 private func copyCString(_ str: String) -> UnsafePointer<CChar> {
   let count = str.utf8.count + 1
   return str.withCString {

From 0cbb9af76935b22a32c5ce5a8d02b1e6ad285700 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:55 +0000
Subject: [PATCH 3/8] Rename LexError -> DelimiterLexError

To avoid confusion with more general regex lexical
analysis.
---
 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 ++++----
 Sources/_MatchingEngine/Regex/Parse/Mocking.swift         | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index 70532f9e7..c023a069c 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -37,7 +37,7 @@ enum Delimiter: Hashable, CaseIterable {
   }
 }
 
-struct LexError: Error, CustomStringConvertible {
+struct DelimiterLexError: Error, CustomStringConvertible {
   enum Kind: Hashable {
     case endOfString
     case invalidUTF8 // TODO: better range reporting
@@ -97,14 +97,14 @@ func lexRegex(
   guard let delimiter = Delimiter.allCases.first(
     where: { tryEat($0.opening.utf8) }
   ) else {
-    throw LexError(.unknownDelimiter, resumeAt: current.successor())
+    throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor())
   }
 
   let contentsStart = current
   while true {
     switch load() {
     case nil, ascii("\n"), ascii("\r"):
-      throw LexError(.endOfString, resumeAt: current)
+      throw DelimiterLexError(.endOfString, resumeAt: current)
 
     case ascii("\\"):
       // Skip next byte.
@@ -125,7 +125,7 @@ func lexRegex(
       let s = String(decoding: contents, as: UTF8.self)
 
       guard s.utf8.elementsEqual(contents) else {
-        throw LexError(.invalidUTF8, resumeAt: current)
+        throw DelimiterLexError(.invalidUTF8, resumeAt: current)
       }
       return (contents: s, delimiter, end: current)
     }
diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
index dfba4757e..b535edf1b 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
@@ -52,7 +52,7 @@ func libswiftLexRegexLiteral(
     let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr)
     curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self)
     return false
-  } catch let error as LexError {
+  } catch let error as DelimiterLexError {
     if error.kind == .unknownDelimiter {
       // An unknown delimiter should be recovered from, as we may want to try
       // lex something else.
@@ -66,7 +66,7 @@ func libswiftLexRegexLiteral(
     // closing delimiters, which would help with code completion.
     return true
   } catch {
-    fatalError("Should be a LexError")
+    fatalError("Should be a DelimiterLexError")
   }
 }
 

From 7e820821f296436b504bb0d9cd782a46eda11f04 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:56 +0000
Subject: [PATCH 4/8] Refactor delimiter lexing logic

Introduce a DelimiterLexer type to perform the
lexing.
---
 .../Regex/Parse/DelimiterLexing.swift         | 167 +++++++++++++-----
 1 file changed, 121 insertions(+), 46 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index c023a069c..e49a442e7 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible {
   }
 }
 
-/// Attempt to lex a regex literal between `start` and `end`, returning either
-/// the contents and pointer from which to resume lexing, or an error.
-func lexRegex(
-  start: UnsafeRawPointer, end: UnsafeRawPointer
-) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
-  precondition(start <= end)
-  var current = start
+fileprivate struct DelimiterLexer {
+  let start: UnsafeRawPointer
+  var cursor: UnsafeRawPointer
+  let end: UnsafeRawPointer
+
+  init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
+    precondition(start <= end)
+    self.start = start
+    self.cursor = start
+    self.end = end
+  }
 
   func ascii(_ s: Unicode.Scalar) -> UInt8 {
     assert(s.value <= 0x7F)
     return UInt8(asserting: s.value)
   }
-  func load(offset: Int) -> UInt8? {
-    guard current + offset < end else { return nil }
-    return current.load(fromByteOffset: offset, as: UInt8.self)
+
+  /// Return the byte at the current cursor, or `nil` if the end of the buffer
+  /// has been reached.
+  func load() -> UInt8? {
+    guard cursor < end else { return nil }
+    return cursor.load(as: UInt8.self)
   }
-  func load() -> UInt8? { load(offset: 0) }
-  func advance(_ n: Int = 1) {
-    precondition(current + n <= end, "Cannot advance past end")
-    current = current.advanced(by: n)
+
+  /// Return the slice of `count` bytes from a specified cursor position, or
+  /// `nil` if there are fewer than `count` bytes until the end of the buffer.
+  func slice(
+    at cursor: UnsafeRawPointer, _ count: Int
+  ) -> UnsafeRawBufferPointer? {
+    guard cursor + count <= end else { return nil }
+    return UnsafeRawBufferPointer(start: cursor, count: count)
   }
 
-  func tryEat(_ utf8: String.UTF8View) -> Bool {
-    for (i, idx) in utf8.indices.enumerated() {
-      guard load(offset: i) == utf8[idx] else { return false }
-    }
-    advance(utf8.count)
+  /// Return the slice of `count` bytes from the current cursor, or `nil` if
+  /// there are fewer than `count` bytes until the end of the buffer.
+  func slice(_ count: Int) -> UnsafeRawBufferPointer? {
+    slice(at: cursor, count)
+  }
+
+  /// Advance the cursor `n` bytes.
+  mutating func advanceCursor(_ n: Int = 1) {
+    cursor += n
+    precondition(cursor <= end, "Cannot advance past end")
+  }
+
+  /// Check to see if a UTF-8 sequence can be eaten from the current cursor.
+  func canEat(_ utf8: String.UTF8View) -> Bool {
+    guard let slice = slice(utf8.count) else { return false }
+    return slice.elementsEqual(utf8)
+  }
+
+  /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
+  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
+    guard canEat(utf8) else { return false }
+    advanceCursor(utf8.count)
     return true
   }
 
-  // Try to lex the opening delimiter.
-  guard let delimiter = Delimiter.allCases.first(
-    where: { tryEat($0.opening.utf8) }
-  ) else {
-    throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor())
+  /// Attempt to eat a particular closing delimiter, returning the contents of
+  /// the literal, and ending pointer, or `nil` if this is not a delimiter
+  /// ending.
+  mutating func tryEatEnding(
+    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
+  ) throws -> (contents: String, end: UnsafeRawPointer)? {
+    let contentsEnd = cursor
+    guard tryEat(delimiter.closing.utf8) else { return nil }
+
+    // Form a string from the contents and make sure it's valid UTF-8.
+    let count = contentsEnd - contentsStart
+    let contents = UnsafeRawBufferPointer(
+      start: contentsStart, count: count)
+    let s = String(decoding: contents, as: UTF8.self)
+
+    guard s.utf8.elementsEqual(contents) else {
+      throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
+    }
+    return (contents: s, end: cursor)
   }
 
-  let contentsStart = current
-  while true {
-    switch load() {
-    case nil, ascii("\n"), ascii("\r"):
-      throw DelimiterLexError(.endOfString, resumeAt: current)
+  /// Attempt to advance the lexer, throwing an error if the end of a line or
+  /// the end of the buffer is reached.
+  mutating func advance(escaped: Bool = false) throws {
+    guard let next = load() else {
+      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+    }
+    switch UnicodeScalar(next) {
+    case let next where !next.isASCII:
+      // Just advance into a UTF-8 sequence. It shouldn't matter that we'll
+      // iterate through each byte as we only match against ASCII, and we
+      // validate it at the end. This case is separated out so we can just deal
+      // with the ASCII cases below.
+      advanceCursor()
+
+    case "\n", "\r":
+      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+
+    case "\0":
+      // TODO: Warn to match the behavior of String literal lexer? Or should
+      // we error as unprintable?
+      advanceCursor()
+
+    case "\\" where !escaped:
+      // Advance again for an escape sequence.
+      advanceCursor()
+      try advance(escaped: true)
 
-    case ascii("\\"):
-      // Skip next byte.
-      advance(2)
 
     default:
-      // Try to lex the closing delimiter.
-      let contentsEnd = current
-      guard tryEat(delimiter.closing.utf8) else {
-        advance()
-        continue
-      }
+      advanceCursor()
+    }
+  }
 
-      // Form a string from the contents and make sure it's valid UTF-8.
-      let count = contentsEnd - contentsStart
-      let contents = UnsafeRawBufferPointer(
-        start: contentsStart, count: count)
-      let s = String(decoding: contents, as: UTF8.self)
+  /*consuming*/ mutating func lex(
+  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+
+    // Try to lex the opening delimiter.
+    guard let delimiter = Delimiter.allCases.first(
+      where: { tryEat($0.opening.utf8) }
+    ) else {
+      throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
+    }
 
-      guard s.utf8.elementsEqual(contents) else {
-        throw DelimiterLexError(.invalidUTF8, resumeAt: current)
+    let contentsStart = cursor
+    while true {
+      // Try to lex the closing delimiter.
+      if let (contents, end) = try tryEatEnding(delimiter,
+                                                contentsStart: contentsStart) {
+        return (contents, delimiter, end)
       }
-      return (contents: s, delimiter, end: current)
+      // Try to advance the lexer.
+      try advance()
     }
   }
 }
@@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
   }
   fatalError("No valid delimiters")
 }
+
+/// Attempt to lex a regex literal between `start` and `end`, returning either
+/// the contents and pointer from which to resume lexing, or an error.
+func lexRegex(
+  start: UnsafeRawPointer, end: UnsafeRawPointer
+) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+  var lexer = DelimiterLexer(start: start, end: end)
+  return try lexer.lex()
+}

From 8b3e2ef4bfd748159171332d3f8dc94d7bbb8ce0 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:56 +0000
Subject: [PATCH 5/8] Diagnose unprintable ASCII characters

This matches the behavior of the C++ lexer for
string literals.
---
 .../Regex/Parse/DelimiterLexing.swift         |  7 +++
 .../Utility/MissingUnicode.swift              |  8 +++
 Tests/RegexTests/ParseTests.swift             | 63 ++++++++++++++++---
 3 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index e49a442e7..4b4618318 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -42,6 +42,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
     case endOfString
     case invalidUTF8 // TODO: better range reporting
     case unknownDelimiter
+    case unprintableASCII
   }
 
   var kind: Kind
@@ -59,6 +60,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
     case .endOfString: return "unterminated regex literal"
     case .invalidUTF8: return "invalid UTF-8 found in source file"
     case .unknownDelimiter: return "unknown regex literal delimiter"
+    case .unprintableASCII: return "unprintable ASCII character found in source file"
     }
   }
 }
@@ -169,6 +171,11 @@ fileprivate struct DelimiterLexer {
       advanceCursor()
       try advance(escaped: true)
 
+    case let next where !next.isPrintableASCII:
+      // Diagnose unprintable ASCII.
+      // TODO: Ideally we would recover and continue to lex until the ending
+      // delimiter.
+      throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
 
     default:
       advanceCursor()
diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift
index a6aae0b82..dccba3286 100644
--- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift
+++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift
@@ -661,3 +661,11 @@ extension Character {
 
   public var isWordCharacter: Bool { isLetter || isNumber || self == "_" }
 }
+
+extension UnicodeScalar {
+  public var isPrintableASCII: Bool {
+    // Exclude non-printables before the space character U+20, and anything
+    // including and above the DEL character U+7F.
+    value >= 0x20 && value < 0x7F
+  }
+}
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 23a3b910f..b0b2e5309 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -107,20 +107,26 @@ func parseTest(
   serializedCaptures.deallocate()
 }
 
-func parseWithDelimitersTest(
-  _ input: String, _ expecting: AST.Node,
-  file: StaticString = #file, line: UInt = #line
+func delimiterLexingTest(
+  _ input: String, file: StaticString = #file, line: UInt = #line
 ) {
-  // First try lexing.
-  input.withCString { ptr in
-    let (contents, delim, end) = try! lexRegex(start: ptr,
-                                               end: ptr + input.count)
-    XCTAssertEqual(end, ptr + input.count, file: file, line: line)
+  input.withCString(encodedAs: UTF8.self) { ptr in
+    let endPtr = ptr + input.utf8.count
+    let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
+    XCTAssertEqual(end, endPtr, file: file, line: line)
 
     let (parseContents, parseDelim) = droppingRegexDelimiters(input)
     XCTAssertEqual(contents, parseContents, file: file, line: line)
     XCTAssertEqual(delim, parseDelim, file: file, line: line)
   }
+}
+
+func parseWithDelimitersTest(
+  _ input: String, _ expecting: AST.Node,
+  file: StaticString = #file, line: UInt = #line
+) {
+  // First try lexing.
+  delimiterLexingTest(input, file: file, line: line)
 
   let orig = try! parseWithDelimiters(input)
   let ast = orig.root
@@ -199,6 +205,32 @@ func diagnosticTest(
   }
 }
 
+func delimiterLexingDiagnosticTest(
+  _ input: String, _ expected: DelimiterLexError.Kind,
+  syntax: SyntaxOptions = .traditional,
+  file: StaticString = #file, line: UInt = #line
+) {
+  do {
+    _ = try input.withCString { ptr in
+      try lexRegex(start: ptr, end: ptr + input.utf8.count) // UTF-8 byte length
+    }
+    XCTFail("""
+      Passed, but expected error: \(expected)
+    """, file: file, line: line)
+  } catch let e as DelimiterLexError {
+    guard e.kind == expected else {
+      XCTFail("""
+
+        Expected: \(expected)
+        Actual: \(e.kind)
+      """, file: file, line: line)
+      return
+    }
+  } catch let e {
+    XCTFail("Unexpected error type: \(e)", file: file, line: line)
+  }
+}
+
 func libswiftDiagnosticMessageTest(
   _ input: String, _ expectedErr: String, file: StaticString = #file,
   line: UInt = #line
@@ -1472,6 +1504,11 @@ extension RegexTests {
 
     parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
 
+    parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
+    parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅"))
+
+    // Printable ASCII characters.
+    delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
     // MARK: Parse not-equal
 
     // Make sure dumping output correctly reflects differences in AST.
@@ -1890,6 +1927,16 @@ extension RegexTests {
     diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal))
   }
 
+  func testDelimiterLexingErrors() {
+    delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
+    for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
+      delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
+    }
+    delimiterLexingDiagnosticTest("re'\n'", .endOfString)
+    delimiterLexingDiagnosticTest("re'\r'", .endOfString)
+    delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
+  }
+
   func testlibswiftDiagnostics() {
     libswiftDiagnosticMessageTest(
       "#/[x*/#", "cannot parse regular expression: expected ']'")

From 56414b8afd3cacaeb31ed3bf7b7a1186b889b624 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:56 +0000
Subject: [PATCH 6/8] Allow lexer recovery for missing closing delimiter

Allow the C++ lexer to form a tok::regex_literal.
This avoids generic fallback behavior, and better
allows for things like code completion. The test
case for this will be in the C++ repo.
---
 .../Regex/Parse/DelimiterLexing.swift         | 17 ++++++++-------
 .../_MatchingEngine/Regex/Parse/Mocking.swift | 14 +++++++++----
 Sources/_MatchingEngine/Utility/Misc.swift    | 21 +++++++++++++++++++
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index 4b4618318..c4be948ac 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -208,14 +208,17 @@ fileprivate struct DelimiterLexer {
 /// Drop a set of regex delimiters from the input string, returning the contents
 /// and the delimiter used. The input string must have valid delimiters.
 func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
-  let utf8 = str.utf8
   func stripDelimiter(_ delim: Delimiter) -> String? {
-    let prefix = delim.opening.utf8
-    let suffix = delim.closing.utf8
-    guard utf8.prefix(prefix.count).elementsEqual(prefix),
-          utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }
-
-    return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
+    // The opening delimiter must match.
+    guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
+    else { return nil }
+
+    // The closing delimiter may optionally match, as it may not be present in
+    // invalid code.
+    if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
+      slice = newSlice
+    }
+    return String(slice)
   }
   for d in Delimiter.allCases {
     if let contents = stripDelimiter(d) {
diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
index b535edf1b..5994a4f52 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
@@ -61,10 +61,16 @@ func libswiftLexRegexLiteral(
     errOut.pointee = copyCString("\(error)")
     curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self)
 
-    // For now, treat every error as unrecoverable.
-    // TODO: We should ideally be able to recover from a regex with missing
-    // closing delimiters, which would help with code completion.
-    return true
+    switch error.kind {
+    case .endOfString:
+      // Missing closing delimiter can be recovered from.
+      return false
+    case .unprintableASCII, .invalidUTF8:
+      // We don't currently have good recovery behavior for these.
+      return true
+    case .unknownDelimiter:
+      fatalError("Already handled")
+    }
   } catch {
     fatalError("Should be a DelimiterLexError")
   }
diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift
index bd1e395b5..55d3d3adc 100644
--- a/Sources/_MatchingEngine/Utility/Misc.swift
+++ b/Sources/_MatchingEngine/Utility/Misc.swift
@@ -108,7 +108,28 @@ extension Collection {
   >(_ idx: Index, in c: C) -> C.Index {
     c.index(atOffset: offset(of: idx))
   }
+}
 
+extension Collection where Element: Equatable {
+  /// Attempt to drop a given prefix from the collection, returning the
+  /// resulting subsequence, or `nil` if the prefix does not match.
+  public func tryDropPrefix<C : Collection>(
+    _ other: C
+  ) -> SubSequence? where C.Element == Element {
+    let prefixCount = other.count
+    guard prefix(prefixCount).elementsEqual(other) else { return nil }
+    return dropFirst(prefixCount)
+  }
+
+  /// Attempt to drop a given suffix from the collection, returning the
+  /// resulting subsequence, or `nil` if the suffix does not match.
+  public func tryDropSuffix<C : Collection>(
+    _ other: C
+  ) -> SubSequence? where C.Element == Element {
+    let suffixCount = other.count
+    guard suffix(suffixCount).elementsEqual(other) else { return nil }
+    return dropLast(suffixCount)
+  }
 }
 
 extension UnsafeMutableRawPointer {

From 61450e875d80c622d5b7adb60cdf6cfc6be56439 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:56 +0000
Subject: [PATCH 7/8] Add lexing heuristic to handle single quotes in re'...'

If a single quote is immediately preceded by one of
`(?`, `(?(`, `\k`, `\g` or `(?C`, treat it as regex
syntax and scan ahead to the matching closing `'`.
Such prefixes would not be valid endings for a regex
literal anyway, and this lets us handle the single
quote variant of their syntax.

For the group name cases, further refine this
skipping behavior by only skipping over characters
that could possibly appear in that case. This
improves diagnostic behavior by ensuring we don't
go wandering off into Swift code.
---
 .../Regex/Parse/DelimiterLexing.swift         |  92 +++++++++++++++
 Tests/RegexTests/ParseTests.swift             | 106 ++++++++++++++++--
 2 files changed, 191 insertions(+), 7 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index c4be948ac..f1d3d5607 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer {
     slice(at: cursor, count)
   }
 
+  /// Return the slice of `count` bytes preceding the current cursor, or `nil`
+  /// if there are fewer than `count` bytes before the cursor.
+  func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
+    let priorCursor = cursor - count
+    guard priorCursor >= start else { return nil }
+    return slice(at: priorCursor, count)
+  }
+
   /// Advance the cursor `n` bytes.
   mutating func advanceCursor(_ n: Int = 1) {
     cursor += n
@@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer {
     return true
   }
 
+  /// Attempt to skip over a closing delimiter character that is unlikely to be
+  /// the actual closing delimiter.
+  mutating func trySkipDelimiter(_ delimiter: Delimiter) {
+    // Only the closing `'` for re'...' can potentially be skipped over.
+    switch delimiter {
+    case .traditional, .experimental:
+      return
+    case .reSingleQuote:
+      break
+    }
+    guard load() == ascii("'") else { return }
+
+    /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
+    /// are the cases that could use single quotes. Note that none of these
+    /// would be valid regex endings anyway.
+    let calloutPrefix = "(?C"
+    let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
+      guard let priorSlice = sliceBehind(prior.utf8.count),
+            priorSlice.elementsEqual(prior.utf8)
+      else { return false }
+
+      // Make sure the slice isn't preceded by a '\', as that invalidates this
+      // analysis.
+      if let prior = sliceBehind(priorSlice.count + 1) {
+        return prior[0] != ascii("\\")
+      }
+      return true
+    }
+    guard let prefix = prefix else { return }
+    let isCallout = prefix == calloutPrefix
+
+    func isPossiblyGroupReference(_ c: UInt8) -> Bool {
+      // If this is an ASCII character, make sure it's for a group name. Leave
+      // other UTF-8 encoded scalars alone, this should at least catch cases
+      // where we run into a symbol such as `{`, `.`, `;` that would indicate
+      // we've likely advanced out of the bounds of the regex.
+      let scalar = UnicodeScalar(c)
+      guard scalar.isASCII else { return true }
+      switch scalar {
+      // Include '-' and '+' which may be used in recursion levels and relative
+      // references.
+      case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
+        return true
+      default:
+        return false
+      }
+    }
+
+    // Make a note of the current lexing position, as we may need to revert
+    // back to it.
+    let originalCursor = cursor
+    advanceCursor()
+
+    // Try to skip what would be the contents of a group identifier/reference.
+    while let next = load() {
+      // Found the ending, we're done. Return so we can continue to lex to the
+      // real delimiter.
+      if next == ascii("'") {
+        advanceCursor()
+        return
+      }
+
+      // If this isn't a callout, make sure we have something that could be a
+      // group reference. We limit the character set here to improve diagnostic
+      // behavior in the case where the literal is actually unterminated. We
+      // ideally don't want to go wandering off into Swift source code. We can't
+      // do the same for callouts, as they take arbitrary strings.
+      guard isCallout || isPossiblyGroupReference(next) else { break }
+      do {
+        try advance()
+      } catch {
+        break
+      }
+    }
+    // We bailed out, either because we ran into something that didn't look like
+    // an identifier, or we reached the end of the line. Revert back to the
+    // original guess of delimiter.
+    cursor = originalCursor
+  }
+
   /// Attempt to eat a particular closing delimiter, returning the contents of
   /// the literal, and ending pointer, or `nil` if this is not a delimiter
   /// ending.
@@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer {
 
     let contentsStart = cursor
     while true {
+      // Check to see if we're at a character that looks like a delimiter, but
+      // likely isn't. In such a case, we can attempt to skip over it.
+      trySkipDelimiter(delimiter)
+
       // Try to lex the closing delimiter.
       if let (contents, end) = try tryEatEnding(delimiter,
                                                 contentsStart: contentsStart) {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index b0b2e5309..b499c0b98 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -107,28 +107,46 @@ func parseTest(
   serializedCaptures.deallocate()
 }
 
+/// Test delimiter lexing. Takes an input string that starts with a regex
+/// literal. If `ignoreTrailing` is true, there may be additional characters
+/// that follow the literal that are not considered part of it.
+@discardableResult
 func delimiterLexingTest(
-  _ input: String, file: StaticString = #file, line: UInt = #line
-) {
+  _ input: String, ignoreTrailing: Bool = false,
+  file: StaticString = #file, line: UInt = #line
+) -> String {
   input.withCString(encodedAs: UTF8.self) { ptr in
     let endPtr = ptr + input.utf8.count
     let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
-    XCTAssertEqual(end, endPtr, file: file, line: line)
+    if ignoreTrailing {
+      XCTAssertNotEqual(end, endPtr, file: file, line: line)
+    } else {
+      XCTAssertEqual(end, endPtr, file: file, line: line)
+    }
 
-    let (parseContents, parseDelim) = droppingRegexDelimiters(input)
+    let rawPtr = UnsafeRawPointer(ptr)
+    let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr)
+    let literal = String(decoding: buffer, as: UTF8.self)
+
+    let (parseContents, parseDelim) = droppingRegexDelimiters(literal)
     XCTAssertEqual(contents, parseContents, file: file, line: line)
     XCTAssertEqual(delim, parseDelim, file: file, line: line)
+    return literal
   }
 }
 
+/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is
+/// true, there may be additional characters that follow the literal that are
+/// not considered part of it.
 func parseWithDelimitersTest(
-  _ input: String, _ expecting: AST.Node,
+  _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false,
   file: StaticString = #file, line: UInt = #line
 ) {
   // First try lexing.
-  delimiterLexingTest(input, file: file, line: line)
+  let literal = delimiterLexingTest(
+    input, ignoreTrailing: ignoreTrailing, file: file, line: line)
 
-  let orig = try! parseWithDelimiters(input)
+  let orig = try! parseWithDelimiters(literal)
   let ast = orig.root
   guard ast == expecting
           || ast._dump() == expecting._dump() // EQ workaround
@@ -1509,6 +1527,63 @@ extension RegexTests {
 
     // Printable ASCII characters.
     delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
+
+    // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
+    // if it's clear that it's part of the regex syntax.
+
+    parseWithDelimitersTest(
+      #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
+    parseWithDelimitersTest(
+      #"re'(?'a_bcA0-c1A'x*)'"#,
+      balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
+
+    parseWithDelimitersTest(
+      #"re'(?('a_bcA0')x|y)'"#, conditional(
+        .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))
+    parseWithDelimitersTest(
+      #"re'(?('+20')\')'"#, conditional(
+        .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()))
+
+    parseWithDelimitersTest(
+      #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))))
+    parseWithDelimitersTest(
+      #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1))
+
+    parseWithDelimitersTest(
+      #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))))
+    parseWithDelimitersTest(
+      #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"))
+
+    parseWithDelimitersTest(
+      #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)))
+
+    // Fine, because we don't end up skipping.
+    delimiterLexingTest(#"re'(?'"#)
+    delimiterLexingTest(#"re'(?('"#)
+    delimiterLexingTest(#"re'\k'"#)
+    delimiterLexingTest(#"re'\g'"#)
+    delimiterLexingTest(#"re'(?C'"#)
+
+    // Not a valid group name, but we can still skip over it.
+    delimiterLexingTest(#"re'(?'🔥')'"#)
+
+    // Escaped, so don't skip. These will ignore the ending `'` as we've already
+    // closed the literal.
+    parseWithDelimitersTest(
+      #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
+    )
+    delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
+    delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
+
     // MARK: Parse not-equal
 
     // Make sure dumping output correctly reflects differences in AST.
@@ -1815,6 +1890,12 @@ extension RegexTests {
     diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName))
     diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName))
 
+    // TODO: It might be better if we tried to consume up to the closing `'` and
+    // diagnosed an invalid group name based on that.
+    diagnosticTest(#"(?'abc ')"#, .expected("'"))
+
+    diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName))
+
     diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName))
     diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName))
     diagnosticTest(#"(?'a-b-c')"#, .expected("'"))
@@ -1928,6 +2009,9 @@ extension RegexTests {
   }
 
   func testDelimiterLexingErrors() {
+
+    // MARK: Printable ASCII
+
     delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
     for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
       delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
@@ -1935,6 +2019,14 @@ extension RegexTests {
     delimiterLexingDiagnosticTest("re'\n'", .endOfString)
     delimiterLexingDiagnosticTest("re'\r'", .endOfString)
     delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
+
+    // MARK: Delimiter skipping
+
+    delimiterLexingDiagnosticTest("re'(?''", .endOfString)
+    delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
+    delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
+    delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
+    delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
   }
 
   func testlibswiftDiagnostics() {

From 2325cef781477a4b51deeafcdae9c528a486e682 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Wed, 2 Mar 2022 16:53:57 +0000
Subject: [PATCH 8/8] Add support for rx'...' for experimental syntax

---
 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 +++++---
 Tests/RegexTests/ParseTests.swift                         | 6 ++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
index f1d3d5607..1227ade1f 100644
--- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -15,12 +15,14 @@ enum Delimiter: Hashable, CaseIterable {
   case traditional
   case experimental
   case reSingleQuote
+  case rxSingleQuote
 
   var openingAndClosing: (opening: String, closing: String) {
     switch self {
     case .traditional: return ("#/", "/#")
     case .experimental: return ("#|", "|#")
     case .reSingleQuote: return ("re'", "'")
+    case .rxSingleQuote: return ("rx'", "'")
     }
   }
   var opening: String { openingAndClosing.opening }
@@ -31,7 +33,7 @@ enum Delimiter: Hashable, CaseIterable {
     switch self {
     case .traditional, .reSingleQuote:
       return .traditional
-    case .experimental:
+    case .experimental, .rxSingleQuote:
       return .experimental
     }
   }
@@ -134,11 +136,11 @@ fileprivate struct DelimiterLexer {
   /// Attempt to skip over a closing delimiter character that is unlikely to be
   /// the actual closing delimiter.
   mutating func trySkipDelimiter(_ delimiter: Delimiter) {
-    // Only the closing `'` for re'...' can potentially be skipped over.
+    // Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
     switch delimiter {
     case .traditional, .experimental:
       return
-    case .reSingleQuote:
+    case .reSingleQuote, .rxSingleQuote:
       break
     }
     guard load() == ascii("'") else { return }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index b499c0b98..2ee76b682 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -1497,6 +1497,9 @@ extension RegexTests {
     parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))
     parseWithDelimitersTest("#|a b|#", concat("a", "b"))
 
+    parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
+    parseWithDelimitersTest("rx'a b'", concat("a", "b"))
+
     parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
     parseWithDelimitersTest(
       "#|(?-x)[a b]|#", changeMatchingOptions(
@@ -1537,6 +1540,9 @@ extension RegexTests {
       #"re'(?'a_bcA0-c1A'x*)'"#,
       balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
 
+    parseWithDelimitersTest(
+      #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b"))))
+
     parseWithDelimitersTest(
       #"re'(?('a_bcA0')x|y)'"#, conditional(
         .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))