Add lexing heuristic to handle single quotes in re'...'

hamishknight · hamishknight · commit b1172a1b9101 · 2022-03-02T15:18:03.000Z
If a single quote is encountered with a prefix of
either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue
to scan ahead to a closing `'`. Such prefixes would
not be valid endings for a regex literal anyway,
and this lets us handle the single quote variant
of their syntax.

For the group name cases, further refine this
skipping behavior by only skipping over characters
that could possibly appear in that case. This
improves diagnostic behavior by ensuring we don't
go wandering off into Swift code.
diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer {
     slice(at: cursor, count)
   }
 
+  /// Return the slice of `count` bytes preceding the current cursor, or `nil`
+  /// if there are fewer than `count` bytes before the cursor.
+  func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
+    let priorCursor = cursor - count
+    guard priorCursor >= start else { return nil }
+    return slice(at: priorCursor, count)
+  }
+
   /// Advance the cursor `n` bytes.
   mutating func advanceCursor(_ n: Int = 1) {
     cursor += n
@@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer {
     return true
   }
 
+  /// Attempt to skip over a closing delimiter character that is unlikely to be
+  /// the actual closing delimiter.
+  mutating func trySkipDelimiter(_ delimiter: Delimiter) {
+    // Only the closing `'` for re'...' can potentially be skipped over.
+    switch delimiter {
+    case .traditional, .experimental:
+      return
+    case .reSingleQuote:
+      break
+    }
+    guard load() == ascii("'") else { return }
+
+    // Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
+    // are the cases that could use single quotes. Note that none of these
+    // would be valid regex endings anyway.
+    let calloutPrefix = "(?C"
+    let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
+      guard let priorSlice = sliceBehind(prior.utf8.count),
+            priorSlice.elementsEqual(prior.utf8)
+      else { return false }
+
+      // Make sure the slice isn't preceded by a '\', as that invalidates this
+      // analysis.
+      if let prior = sliceBehind(priorSlice.count + 1) {
+        return prior[0] != ascii("\\")
+      }
+      return true
+    }
+    guard let prefix = prefix else { return }
+    let isCallout = prefix == calloutPrefix
+
+    func isPossiblyGroupReference(_ c: UInt8) -> Bool {
+      // If this is an ASCII character, make sure it's for a group name. Leave
+      // other UTF-8 encoded scalars alone; this should at least catch cases
+      // where we run into a symbol such as `{`, `.`, `;` that would indicate
+      // we've likely advanced out of the bounds of the regex.
+      let scalar = UnicodeScalar(c)
+      guard scalar.isASCII else { return true }
+      switch scalar {
+      // Include '-' and '+' which may be used in recursion levels and relative
+      // references.
+      case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
+        return true
+      default:
+        return false
+      }
+    }
+
+    // Make a note of the current lexing position, as we may need to revert
+    // back to it.
+    let originalCursor = cursor
+    advanceCursor()
+
+    // Try to skip over what would be the contents of a group identifier/reference.
+    while let next = load() {
+      // Found the ending, we're done. Return so we can continue to lex to the
+      // real delimiter.
+      if next == ascii("'") {
+        advanceCursor()
+        return
+      }
+
+      // If this isn't a callout, make sure we have something that could be a
+      // group reference. We limit the character set here to improve diagnostic
+      // behavior in the case where the literal is actually unterminated. We
+      // ideally don't want to go wandering off into Swift source code. We can't
+      // do the same for callouts, as they take arbitrary strings.
+      guard isCallout || isPossiblyGroupReference(next) else { break }
+      do {
+        try advance()
+      } catch {
+        break
+      }
+    }
+    // We bailed out, either because we ran into something that didn't look like
+    // an identifier, or we reached the end of the line. Revert back to the
+    // original guess of delimiter.
+    cursor = originalCursor
+  }
+
   /// Attempt to eat a particular closing delimiter, returning the contents of
   /// the literal, and ending pointer, or `nil` if this is not a delimiter
   /// ending.
@@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer {
 
     let contentsStart = cursor
     while true {
+      // Check to see if we're at a character that looks like a delimiter, but
+      // likely isn't. In such a case, we can attempt to skip over it.
+      trySkipDelimiter(delimiter)
+
       // Try to lex the closing delimiter.
       if let (contents, end) = try tryEatEnding(delimiter,
                                                 contentsStart: contentsStart) {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -107,28 +107,46 @@ func parseTest(
   serializedCaptures.deallocate()
 }
 
+/// Test delimiter lexing. Takes an input string that starts with a regex
+/// literal and returns the lexed literal. If `ignoreTrailing` is true, the
+/// input must have trailing characters that are not part of the literal.
+@discardableResult
 func delimiterLexingTest(
-  _ input: String, file: StaticString = #file, line: UInt = #line
-) {
+  _ input: String, ignoreTrailing: Bool = false,
+  file: StaticString = #file, line: UInt = #line
+) -> String {
   input.withCString(encodedAs: UTF8.self) { ptr in
     let endPtr = ptr + input.utf8.count
     let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
-    XCTAssertEqual(end, endPtr, file: file, line: line)
+    if ignoreTrailing {
+      XCTAssertNotEqual(end, endPtr, file: file, line: line)
+    } else {
+      XCTAssertEqual(end, endPtr, file: file, line: line)
+    }
 
-    let (parseContents, parseDelim) = droppingRegexDelimiters(input)
+    let rawPtr = UnsafeRawPointer(ptr)
+    let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr)
+    let literal = String(decoding: buffer, as: UTF8.self)
+
+    let (parseContents, parseDelim) = droppingRegexDelimiters(literal)
     XCTAssertEqual(contents, parseContents, file: file, line: line)
     XCTAssertEqual(delim, parseDelim, file: file, line: line)
+    return literal
   }
 }
 
+/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is
+/// true, the input must have trailing characters that follow the literal and
+/// are not considered part of it.
 func parseWithDelimitersTest(
-  _ input: String, _ expecting: AST.Node,
+  _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false,
   file: StaticString = #file, line: UInt = #line
 ) {
   // First try lexing.
-  delimiterLexingTest(input, file: file, line: line)
+  let literal = delimiterLexingTest(
+    input, ignoreTrailing: ignoreTrailing, file: file, line: line)
 
-  let orig = try! parseWithDelimiters(input)
+  let orig = try! parseWithDelimiters(literal)
   let ast = orig.root
   guard ast == expecting
           || ast._dump() == expecting._dump() // EQ workaround
@@ -1505,6 +1523,63 @@ extension RegexTests {
 
     // Printable ASCII characters.
     delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
+
+    // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
+    // if it's clear that it's part of the regex syntax.
+
+    parseWithDelimitersTest(
+      #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
+    parseWithDelimitersTest(
+      #"re'(?'a_bcA0-c1A'x*)'"#,
+      balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
+
+    parseWithDelimitersTest(
+      #"re'(?('a_bcA0')x|y)'"#, conditional(
+        .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))
+    parseWithDelimitersTest(
+      #"re'(?('+20')\')'"#, conditional(
+        .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()))
+
+    parseWithDelimitersTest(
+      #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))))
+    parseWithDelimitersTest(
+      #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1))
+
+    parseWithDelimitersTest(
+      #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))))
+    parseWithDelimitersTest(
+      #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"))
+
+    parseWithDelimitersTest(
+      #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)))
+
+    // Fine, because we don't end up skipping.
+    delimiterLexingTest(#"re'(?'"#)
+    delimiterLexingTest(#"re'(?('"#)
+    delimiterLexingTest(#"re'\k'"#)
+    delimiterLexingTest(#"re'\g'"#)
+    delimiterLexingTest(#"re'(?C'"#)
+
+    // Not a valid group name, but we can still skip over it.
+    delimiterLexingTest(#"re'(?'🔥')'"#)
+
+    // Escaped, so don't skip. These will ignore the ending `'` as we've already
+    // closed the literal.
+    parseWithDelimitersTest(
+      #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
+    )
+    parseWithDelimitersTest(
+      #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
+    )
+    delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
+    delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
+
     // MARK: Parse not-equal
 
     // Make sure dumping output correctly reflects differences in AST.
@@ -1811,6 +1886,12 @@ extension RegexTests {
     diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName))
     diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName))
 
+    // TODO: It might be better if we tried to consume up to the closing `'`
+    // and diagnosed an invalid group name based on that.
+    diagnosticTest(#"(?'abc ')"#, .expected("'"))
+
+    diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName))
+
     diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName))
     diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName))
     diagnosticTest(#"(?'a-b-c')"#, .expected("'"))
@@ -1924,13 +2005,24 @@ extension RegexTests {
   }
 
   func testDelimiterLexingErrors() {
+
+    // MARK: Printable ASCII
+
     delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
     for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
       delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
     }
     delimiterLexingDiagnosticTest("re'\n'", .endOfString)
     delimiterLexingDiagnosticTest("re'\r'", .endOfString)
     delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
+
+    // MARK: Delimiter skipping
+
+    delimiterLexingDiagnosticTest("re'(?''", .endOfString)
+    delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
+    delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
+    delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
+    delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
   }
 
   func testlibswiftDiagnostics() {