diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift new file mode 100644 index 000000000..1227ade1f --- /dev/null +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -0,0 +1,332 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// TODO: mock up multi-line soon + +enum Delimiter: Hashable, CaseIterable { + case traditional + case experimental + case reSingleQuote + case rxSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .traditional: return ("#/", "/#") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// The default set of syntax options that the delimiter indicates. + var defaultSyntaxOptions: SyntaxOptions { + switch self { + case .traditional, .reSingleQuote: + return .traditional + case .experimental, .rxSingleQuote: + return .experimental + } + } +} + +struct DelimiterLexError: Error, CustomStringConvertible { + enum Kind: Hashable { + case endOfString + case invalidUTF8 // TODO: better range reporting + case unknownDelimiter + case unprintableASCII + } + + var kind: Kind + + /// The pointer at which to resume lexing. + var resumePtr: UnsafeRawPointer + + init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { + self.kind = kind + self.resumePtr = resumePtr + } + + var description: String { + switch kind { + case .endOfString: return "unterminated regex literal" + case .invalidUTF8: return "invalid UTF-8 found in source file" + case .unknownDelimiter: return "unknown regex literal delimiter" + case .unprintableASCII: return "unprintable ASCII character found in source file" + } + } +} + +fileprivate struct DelimiterLexer { + let start: UnsafeRawPointer + var cursor: UnsafeRawPointer + let end: UnsafeRawPointer + + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + precondition(start <= end) + self.start = start + self.cursor = start + self.end = end + } + + func ascii(_ s: Unicode.Scalar) -> UInt8 { + assert(s.value <= 0x7F) + return UInt8(asserting: s.value) + } + + /// Return the byte at the current cursor, or `nil` if the end of the buffer + /// has been reached. + func load() -> UInt8? { + guard cursor < end else { return nil } + return cursor.load(as: UInt8.self) + } + + /// Return the slice of `count` bytes from a specified cursor position, or + /// `nil` if there are fewer than `count` bytes until the end of the buffer. + func slice( + at cursor: UnsafeRawPointer, _ count: Int + ) -> UnsafeRawBufferPointer? { + guard cursor + count <= end else { return nil } + return UnsafeRawBufferPointer(start: cursor, count: count) + } + + /// Return the slice of `count` bytes from the current cursor, or `nil` if + /// there are fewer than `count` bytes until the end of the buffer. + func slice(_ count: Int) -> UnsafeRawBufferPointer? 
{
+    slice(at: cursor, count)
+  }
+
+  /// Return the slice of `count` bytes preceding the current cursor, or `nil`
+  /// if there are fewer than `count` bytes before the cursor.
+  func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
+    let priorCursor = cursor - count
+    guard priorCursor >= start else { return nil }
+    return slice(at: priorCursor, count)
+  }
+
+  /// Advance the cursor `n` bytes.
+  mutating func advanceCursor(_ n: Int = 1) {
+    cursor += n
+    precondition(cursor <= end, "Cannot advance past end")
+  }
+
+  /// Check to see if a UTF-8 sequence can be eaten from the current cursor.
+  func canEat(_ utf8: String.UTF8View) -> Bool {
+    guard let slice = slice(utf8.count) else { return false }
+    return slice.elementsEqual(utf8)
+  }
+
+  /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
+  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
+    guard canEat(utf8) else { return false }
+    advanceCursor(utf8.count)
+    return true
+  }
+
+  /// Attempt to skip over a closing delimiter character that is unlikely to be
+  /// the actual closing delimiter.
+  mutating func trySkipDelimiter(_ delimiter: Delimiter) {
+    // Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
+    switch delimiter {
+    case .traditional, .experimental:
+      return
+    case .reSingleQuote, .rxSingleQuote:
+      break
+    }
+    guard load() == ascii("'") else { return }
+
+    /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
+    /// are the cases that could use single quotes. Note that none of these
+    /// would be valid regex endings anyway.
+    let calloutPrefix = "(?C"
+    let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
+      guard let priorSlice = sliceBehind(prior.utf8.count),
+            priorSlice.elementsEqual(prior.utf8)
+      else { return false }
+
+      // Make sure the slice isn't preceded by a '\', as that invalidates this
+      // analysis.
+      if let prior = sliceBehind(priorSlice.count + 1) {
+        return prior[0] != ascii("\\")
+      }
+      return true
+    }
+    guard let prefix = prefix else { return }
+    let isCallout = prefix == calloutPrefix
+
+    func isPossiblyGroupReference(_ c: UInt8) -> Bool {
+      // If this is an ASCII character, make sure it's for a group name. Leave
+      // other UTF-8 encoded scalars alone; this should at least catch cases
+      // where we run into a symbol such as `{`, `.`, `;` that would indicate
+      // we've likely advanced out of the bounds of the regex.
+      let scalar = UnicodeScalar(c)
+      guard scalar.isASCII else { return true }
+      switch scalar {
+      // Include '-' and '+', which may be used in recursion levels and relative
+      // references.
+      case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
+        return true
+      default:
+        return false
+      }
+    }
+
+    // Make a note of the current lexing position, as we may need to revert
+    // back to it.
+    let originalCursor = cursor
+    advanceCursor()
+
+    // Try to skip over what would be the contents of a group identifier/reference.
+    while let next = load() {
+      // Found the ending; we're done. Return so we can continue to lex to the
+      // real delimiter.
+      if next == ascii("'") {
+        advanceCursor()
+        return
+      }
+
+      // If this isn't a callout, make sure we have something that could be a
+      // group reference. We limit the character set here to improve diagnostic
+      // behavior in the case where the literal is actually unterminated. We
+      // ideally don't want to go wandering off into Swift source code. We can't
+      // do the same for callouts, as they take arbitrary strings.
+      guard isCallout || isPossiblyGroupReference(next) else { break }
+      do {
+        try advance()
+      } catch {
+        break
+      }
+    }
+    // We bailed out, either because we ran into something that didn't look
+    // like an identifier, or because we reached the end of the line. Revert
+    // to the original delimiter guess.
+    cursor = originalCursor
+  }
+
+  /// Attempt to eat a particular closing delimiter, returning the contents of
+  /// the literal and the ending pointer, or `nil` if this is not a delimiter
+  /// ending.
+  mutating func tryEatEnding(
+    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
+  ) throws -> (contents: String, end: UnsafeRawPointer)? {
+    let contentsEnd = cursor
+    guard tryEat(delimiter.closing.utf8) else { return nil }
+
+    // Form a string from the contents and make sure it's valid UTF-8.
+    let count = contentsEnd - contentsStart
+    let contents = UnsafeRawBufferPointer(
+      start: contentsStart, count: count)
+    let s = String(decoding: contents, as: UTF8.self)
+
+    guard s.utf8.elementsEqual(contents) else {
+      throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
+    }
+    return (contents: s, end: cursor)
+  }
+
+  /// Attempt to advance the lexer, throwing an error if the end of a line or
+  /// the end of the buffer is reached.
+  mutating func advance(escaped: Bool = false) throws {
+    guard let next = load() else {
+      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+    }
+    switch UnicodeScalar(next) {
+    case let next where !next.isASCII:
+      // Just advance into a UTF-8 sequence. It shouldn't matter that we'll
+      // iterate through each byte, as we only match against ASCII, and we
+      // validate it at the end. This case is separated out so we can just deal
+      // with the ASCII cases below.
+      advanceCursor()
+
+    case "\n", "\r":
+      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+
+    case "\0":
+      // TODO: Warn to match the behavior of the String literal lexer? Or
+      // should we error as unprintable?
+      advanceCursor()
+
+    case "\\" where !escaped:
+      // Advance again for an escape sequence.
+      advanceCursor()
+      try advance(escaped: true)
+
+    case let next where !next.isPrintableASCII:
+      // Diagnose unprintable ASCII.
+      // TODO: Ideally we would recover and continue to lex until the ending
+      // delimiter.
+      throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
+
+    default:
+      advanceCursor()
+    }
+  }
+
+  /*consuming*/ mutating func lex(
+  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+
+    // Try to lex the opening delimiter.
+    guard let delimiter = Delimiter.allCases.first(
+      where: { tryEat($0.opening.utf8) }
+    ) else {
+      throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
+    }
+
+    let contentsStart = cursor
+    while true {
+      // Check to see if we're at a character that looks like a delimiter, but
+      // likely isn't. In such a case, we can attempt to skip over it.
+      trySkipDelimiter(delimiter)
+
+      // Try to lex the closing delimiter.
+      if let (contents, end) = try tryEatEnding(delimiter,
+                                                contentsStart: contentsStart) {
+        return (contents, delimiter, end)
+      }
+      // Try to advance the lexer.
+      try advance()
+    }
+  }
+}
+
+/// Drop a set of regex delimiters from the input string, returning the contents
+/// and the delimiter used. The input string must have valid delimiters.
+func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
+  func stripDelimiter(_ delim: Delimiter) -> String? {
+    // The opening delimiter must match.
+ guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + else { return nil } + + // The closing delimiter may optionally match, as it may not be present in + // invalid code. + if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { + slice = newSlice + } + return String(slice) + } + for d in Delimiter.allCases { + if let contents = stripDelimiter(d) { + return (contents, d) + } + } + fatalError("No valid delimiters") +} + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. +func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + var lexer = DelimiterLexer(start: start, end: end) + return try lexer.lex() +} diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index dd785f12d..cfab75312 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -1472,7 +1472,9 @@ extension Source { return ref } - let char = src.eat() + guard let char = src.tryEat() else { + throw ParseError.expectedEscape + } // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index e3a178a15..5994a4f52 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -9,150 +9,6 @@ // //===----------------------------------------------------------------------===// - -// TODO: mock up multi-line soon - -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - } - } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } - - /// The default set of syntax options that the delimiter indicates. - var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: - return .traditional - case .experimental: - return .experimental - } - } -} - -struct LexError: Error, CustomStringConvertible { - enum Kind: Hashable { - case endOfString - case invalidUTF8 // TODO: better range reporting - case unknownDelimiter - } - - var kind: Kind - - /// The pointer at which to resume lexing. - var resumePtr: UnsafeRawPointer - - init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { - self.kind = kind - self.resumePtr = resumePtr - } - - var description: String { - switch kind { - case .endOfString: return "unterminated regex literal" - case .invalidUTF8: return "invalid UTF-8 found in source file" - case .unknownDelimiter: return "unknown regex literal delimiter" - } - } -} - -/// Drop a set of regex delimiters from the input string, returning the contents -/// and the delimiter used. The input string must have valid delimiters. -func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 - func stripDelimiter(_ delim: Delimiter) -> String? 
{ - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) - } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { - return (contents, d) - } - } - fatalError("No valid delimiters") -} - -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start - - func ascii(_ s: Unicode.Scalar) -> UInt8 { - assert(s.value <= 0x7F) - return UInt8(asserting: s.value) - } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) - } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) - } - - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) - return true - } - - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) - } - - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) - - case ascii("\\"): - // Skip next byte. - advance(2) - - default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } - - // Form a string from the contents and make sure it's valid UTF-8. - let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) - - guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) - } - return (contents: s, delimiter, end: current) - } - } -} - private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { @@ -196,7 +52,7 @@ func libswiftLexRegexLiteral( let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) return false - } catch let error as LexError { + } catch let error as DelimiterLexError { if error.kind == .unknownDelimiter { // An unknown delimiter should be recovered from, as we may want to try // lex something else. @@ -205,12 +61,18 @@ func libswiftLexRegexLiteral( errOut.pointee = copyCString("\(error)") curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - // For now, treat every error as unrecoverable. - // TODO: We should ideally be able to recover from a regex with missing - // closing delimiters, which would help with code completion. - return true + switch error.kind { + case .endOfString: + // Missing closing delimiter can be recovered from. + return false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. 
+ return true + case .unknownDelimiter: + fatalError("Already handled") + } } catch { - fatalError("Should be a LexError") + fatalError("Should be a DelimiterLexError") } } diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 11bd8152f..ddf0475f3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -86,6 +86,12 @@ extension Source { tryEat(anyOf: set) } + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? { + guard !isEmpty else { return nil } + return eat() + } + mutating func eat(asserting c: Char) { assert(peek() == c) advance() diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift index bd1e395b5..55d3d3adc 100644 --- a/Sources/_MatchingEngine/Utility/Misc.swift +++ b/Sources/_MatchingEngine/Utility/Misc.swift @@ -108,7 +108,28 @@ extension Collection { >(_ idx: Index, in c: C) -> C.Index { c.index(atOffset: offset(of: idx)) } +} +extension Collection where Element: Equatable { + /// Attempt to drop a given prefix from the collection, returning the + /// resulting subsequence, or `nil` if the prefix does not match. + public func tryDropPrefix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let prefixCount = other.count + guard prefix(prefixCount).elementsEqual(other) else { return nil } + return dropFirst(prefixCount) + } + + /// Attempt to drop a given suffix from the collection, returning the + /// resulting subsequence, or `nil` if the suffix does not match. + public func tryDropSuffix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let suffixCount = other.count + guard suffix(suffixCount).elementsEqual(other) else { return nil } + return dropLast(suffixCount) + } } extension UnsafeMutableRawPointer { diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift index a6aae0b82..dccba3286 100644 --- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift +++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift @@ -661,3 +661,11 @@ extension Character { public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } } + +extension UnicodeScalar { + public var isPrintableASCII: Bool { + // Exclude non-printables before the space character U+20, and anything + // including and above the DEL character U+7F. + value >= 0x20 && value < 0x7F + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..2ee76b682 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,22 +107,46 @@ func parseTest( serializedCaptures.deallocate() } -func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, +/// Test delimiter lexing. Takes an input string that starts with a regex +/// literal. If `ignoreTrailing` is true, there may be additional characters +/// that follow the literal that are not considered part of it. +@discardableResult +func delimiterLexingTest( + _ input: String, ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line -) { - // First try lexing. - input.withCString { ptr in - let (contents, delim, end) = try! 
lexRegex(start: ptr, - end: ptr + input.count) - XCTAssertEqual(end, ptr + input.count, file: file, line: line) +) -> String { + input.withCString(encodedAs: UTF8.self) { ptr in + let endPtr = ptr + input.utf8.count + let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) + if ignoreTrailing { + XCTAssertNotEqual(end, endPtr, file: file, line: line) + } else { + XCTAssertEqual(end, endPtr, file: file, line: line) + } + + let rawPtr = UnsafeRawPointer(ptr) + let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr) + let literal = String(decoding: buffer, as: UTF8.self) - let (parseContents, parseDelim) = droppingRegexDelimiters(input) + let (parseContents, parseDelim) = droppingRegexDelimiters(literal) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) + return literal } +} + +/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is +/// true, there may be additional characters that follow the literal that are +/// not considered part of it. +func parseWithDelimitersTest( + _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(input) + let orig = try! parseWithDelimiters(literal) let ast = orig.root guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround @@ -199,6 +223,32 @@ func diagnosticTest( } } +func delimiterLexingDiagnosticTest( + _ input: String, _ expected: DelimiterLexError.Kind, + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, line: UInt = #line +) { + do { + _ = try input.withCString { ptr in + try lexRegex(start: ptr, end: ptr + input.count) + } + XCTFail(""" + Passed, but expected error: \(expected) + """, file: file, line: line) + } catch let e as DelimiterLexError { + guard e.kind == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.kind) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Unexpected error type: \(e)", file: file, line: line) + } +} + func libswiftDiagnosticMessageTest( _ input: String, _ expectedErr: String, file: StaticString = #file, line: UInt = #line @@ -1447,6 +1497,9 @@ extension RegexTests { parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) + parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) + parseWithDelimitersTest("rx'a b'", concat("a", "b")) + parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", changeMatchingOptions( @@ -1472,6 +1525,71 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰")) + parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅")) + + // Printable ASCII characters. + delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter + // if it's clear that it's part of the regex syntax. 
+ + parseWithDelimitersTest( + #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + parseWithDelimitersTest( + #"re'(?'a_bcA0-c1A'x*)'"#, + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + + parseWithDelimitersTest( + #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + + parseWithDelimitersTest( + #"re'(?('a_bcA0')x|y)'"#, conditional( + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + parseWithDelimitersTest( + #"re'(?('+20')\')'"#, conditional( + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) + + parseWithDelimitersTest( + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + parseWithDelimitersTest( + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + + parseWithDelimitersTest( + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + parseWithDelimitersTest( + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + + parseWithDelimitersTest( + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + + // Fine, because we don't end up skipping. + delimiterLexingTest(#"re'(?'"#) + delimiterLexingTest(#"re'(?('"#) + delimiterLexingTest(#"re'\k'"#) + delimiterLexingTest(#"re'\g'"#) + delimiterLexingTest(#"re'(?C'"#) + + // Not a valid group name, but we can still skip over it. + delimiterLexingTest(#"re'(?'🔥')'"#) + + // Escaped, so don't skip. These will ignore the ending `'` as we've already + // closed the literal. + parseWithDelimitersTest( + #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true + ) + delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) + delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1753,6 +1871,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // MARK: Bad escapes + + diagnosticTest("\\", .expectedEscape) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) @@ -1774,6 +1896,12 @@ extension RegexTests { diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) + // TODO: It might be better if tried to consume up to the closing `'` and + // diagnosed an invalid group name based on that. + diagnosticTest(#"(?'abc ')"#, .expected("'")) + + diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) @@ -1886,6 +2014,27 @@ extension RegexTests { diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) } + func testDelimiterLexingErrors() { + + // MARK: Printable ASCII + + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. 
+ delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + } + delimiterLexingDiagnosticTest("re'\n'", .endOfString) + delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + + // MARK: Delimiter skipping + + delimiterLexingDiagnosticTest("re'(?''", .endOfString) + delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) + delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) + } + func testlibswiftDiagnostics() { libswiftDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'")