From 1b7779a70402a3ef100436895ca415a06a739924 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 2 Aug 2022 20:36:00 +0100 Subject: [PATCH] Remove `re'...'` and `rx'...'` delimiters We didn't end up choosing these, so remove their lexing code. `#|...|#` remains to test the experimental syntax. --- .../Regex/Parse/DelimiterLexing.swift | 100 +----------------- Sources/_RegexParser/Regex/Parse/Parse.swift | 4 +- Tests/RegexTests/LexTests.swift | 3 - Tests/RegexTests/ParseTests.swift | 89 ++++++---------- 4 files changed, 37 insertions(+), 159 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index dd142f016..4d86f9d93 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -31,7 +31,7 @@ public struct Delimiter: Hashable { switch kind { case .forwardSlash: return poundCount > 0 - case .experimental, .reSingleQuote, .rxSingleQuote: + case .experimental: return false } } @@ -47,15 +47,11 @@ extension Delimiter { enum Kind: Hashable, CaseIterable { case forwardSlash case experimental - case reSingleQuote - case rxSingleQuote var openingAndClosing: (opening: String, closing: String) { switch self { case .forwardSlash: return ("/", "/") case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - case .rxSingleQuote: return ("rx'", "'") } } var opening: String { openingAndClosing.opening } @@ -67,7 +63,7 @@ extension Delimiter { switch self { case .forwardSlash: return true - case .experimental, .reSingleQuote, .rxSingleQuote: + case .experimental: return false } } @@ -150,14 +146,6 @@ fileprivate struct DelimiterLexer { slice(at: cursor, count) } - /// Return the slice of `count` bytes preceding the current cursor, or `nil` - /// if there are fewer than `count` bytes before the cursor. - func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? 
{ - let priorCursor = cursor - count - guard priorCursor >= start else { return nil } - return slice(at: priorCursor, count) - } - /// Advance the cursor `n` bytes. mutating func advanceCursor(_ n: Int = 1) { cursor += n @@ -186,86 +174,6 @@ fileprivate struct DelimiterLexer { return true } - /// Attempt to skip over a closing delimiter character that is unlikely to be - /// the actual closing delimiter. - mutating func trySkipDelimiter(_ delimiter: Delimiter) { - // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. - switch delimiter.kind { - case .forwardSlash, .experimental: - return - case .reSingleQuote, .rxSingleQuote: - break - } - guard load() == ascii("'") else { return } - - /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those - /// are the cases that could use single quotes. Note that none of these - /// would be valid regex endings anyway. - let calloutPrefix = "(?C" - let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in - guard let priorSlice = sliceBehind(prior.utf8.count), - priorSlice.elementsEqual(prior.utf8) - else { return false } - - // Make sure the slice isn't preceded by a '\', as that invalidates this - // analysis. - if let prior = sliceBehind(priorSlice.count + 1) { - return prior[0] != ascii("\\") - } - return true - } - guard let prefix = prefix else { return } - let isCallout = prefix == calloutPrefix - - func isPossiblyGroupReference(_ c: UInt8) -> Bool { - // If this is an ASCII character, make sure it's for a group name. Leave - // other UTF-8 encoded scalars alone, this should at least catch cases - // where we run into a symbol such as `{`, `.`, `;` that would indicate - // we've likely advanced out of the bounds of the regex. - let scalar = UnicodeScalar(c) - guard scalar.isASCII else { return true } - switch scalar { - // Include '-' and '+' which may be used in recursion levels and relative - // references. 
- case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+": - return true - default: - return false - } - } - - // Make a note of the current lexing position, as we may need to revert - // back to it. - let originalCursor = cursor - advanceCursor() - - // Try skip over what would be the contents of a group identifier/reference. - while let next = load() { - // Found the ending, we're done. Return so we can continue to lex to the - // real delimiter. - if next == ascii("'") { - advanceCursor() - return - } - - // If this isn't a callout, make sure we have something that could be a - // group reference. We limit the character set here to improve diagnostic - // behavior in the case where the literal is actually unterminated. We - // ideally don't want to go wandering off into Swift source code. We can't - // do the same for callouts, as they take arbitrary strings. - guard isCallout || isPossiblyGroupReference(next) else { break } - do { - try advance() - } catch { - break - } - } - // We bailed out, either because we ran into something that didn't look like - // an identifier, or we reached the end of the line. Revert back to the - // original guess of delimiter. - cursor = originalCursor - } - /// Attempt to eat a particular closing delimiter, returning the contents of /// the literal, and ending pointer, or `nil` if this is not a delimiter /// ending. @@ -401,10 +309,6 @@ fileprivate struct DelimiterLexer { } } while true { - // Check to see if we're at a character that looks like a delimiter, but - // likely isn't. In such a case, we can attempt to skip over it. - trySkipDelimiter(delimiter) - // Try to lex the closing delimiter. 
if let (contents, end) = try tryEatEnding(delimiter, contentsStart: contentsStart) { diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 0aae031d5..d9b6f23a0 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -672,9 +672,7 @@ fileprivate func defaultSyntaxOptions( return [.multilineCompilerLiteral, .extendedSyntax] } return .traditional - case .reSingleQuote: - return .traditional - case .experimental, .rxSingleQuote: + case .experimental: return .experimental } } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 49184deb3..53775e66e 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -96,9 +96,6 @@ extension RegexTests { ("#|abc/#def#", nil), ("#/abc\n/#", nil), ("#/abc\r/#", nil), - - (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))), - (#"re'\'"#, nil) ] for (input, expected) in testCases { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 84ce361f3..0e7d41eed 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2151,9 +2151,6 @@ extension RegexTests { parseWithDelimitersTest("##/a b/##", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) - parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) - parseWithDelimitersTest("rx'a b'", concat("a", "b")) - parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", concat( @@ -2176,13 +2173,13 @@ extension RegexTests { parseWithDelimitersTest("#||||#", alt(empty(), empty(), empty())) parseWithDelimitersTest("#|a||#", alt("a", empty())) - parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest("/x*/", zeroOrMore(of: "x")) - parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰")) - parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅")) + 
parseWithDelimitersTest(#"/🔥🇩🇰/"#, concat("🔥", "🇩🇰")) + parseWithDelimitersTest(#"/🔥✅/"#, concat("🔥", "✅")) // Printable ASCII characters. - delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + delimiterLexingTest(##"#/ !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~/#"##) // Make sure we can handle a combining accent as first character. parseWithDelimitersTest("/\u{301}/", "\u{301}") @@ -2294,72 +2291,61 @@ extension RegexTests { /# """#, charClass(range_m("a", "b"))) - - // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter - // if it's clear that it's part of the regex syntax. - parseWithDelimitersTest( - #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + #"/(?'a_bcA0'\')/"#, namedCapture("a_bcA0", "'")) parseWithDelimitersTest( - #"re'(?'a_bcA0-c1A'x*)'"#, + #"/(?'a_bcA0-c1A'x*)/"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), unsupported: true) parseWithDelimitersTest( - #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + #"/ (?'a_bcA0' a b)/"#, concat(" ", namedCapture("a_bcA0", concat(" ", "a", " ", "b")))) parseWithDelimitersTest( - #"re'(?('a_bcA0')x|y)'"#, conditional( + #"/(?('a_bcA0')x|y)/"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), unsupported: true ) parseWithDelimitersTest( - #"re'(?('+20')\')'"#, conditional( + #"/(?('+20')\')/"#, conditional( .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), unsupported: true ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A")) + #"/a\k'b0A'/"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A")) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1), + #"/\k'+2-1'/"#, backreference(ref(plus: 2), 
recursionLevel: -1), unsupported: true ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true) + #"/a\g'b0A'/"#, concat("a", subpattern(.named("b0A"))), unsupported: true) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true) + #"/\g'-1'\'/"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#), + #"/(?C'a*b\c 🔥_ ;')/"#, pcreCallout(string: #"a*b\c 🔥_ ;"#), unsupported: true) - // Fine, because we don't end up skipping. - delimiterLexingTest(#"re'(?'"#) - delimiterLexingTest(#"re'(?('"#) - delimiterLexingTest(#"re'\k'"#) - delimiterLexingTest(#"re'\g'"#) - delimiterLexingTest(#"re'(?C'"#) + delimiterLexingTest(#"/(?/"#) + delimiterLexingTest(#"/(?(/"#) + delimiterLexingTest(#"/\k/"#) + delimiterLexingTest(#"/\g/"#) + delimiterLexingTest(#"/(?C/"#) - // Not a valid group name, but we can still skip over it. - delimiterLexingTest(#"re'(?'🔥')'"#) + delimiterLexingTest(#"/(?'🔥')/"#) - // Escaped, so don't skip. These will ignore the ending `'` as we've already - // closed the literal. 
parseWithDelimitersTest( - #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true - ) + #"/\(?/"#, zeroOrOne(of: "(")) parseWithDelimitersTest( - #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true - ) + #"/\\k/"#, concat("\\", "k")) parseWithDelimitersTest( - #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true - ) + #"/\\g/"#, concat("\\", "g")) parseWithDelimitersTest( - #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true - ) - delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) - delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + #"/\(?C/"#, concat(zeroOrOne(of: "("), "C")) + + delimiterLexingTest(#"/(\?/"#) + delimiterLexingTest(#"/\(?(/"#) // MARK: Parse not-equal @@ -3322,21 +3308,17 @@ extension RegexTests { // MARK: Printable ASCII - delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. - delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + delimiterLexingDiagnosticTest("/\(UnicodeScalar(i))/", .unprintableASCII) } - delimiterLexingDiagnosticTest("re'\n'", .unterminated) - delimiterLexingDiagnosticTest("re'\r'", .unterminated) - delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) - // MARK: Delimiter skipping + // Can only be done if pound signs are used. 
+ delimiterLexingDiagnosticTest("/\n/", .unterminated) + delimiterLexingDiagnosticTest("/\r/", .unterminated) + delimiterLexingDiagnosticTest("/\u{7F}/", .unprintableASCII) - delimiterLexingDiagnosticTest("re'(?''", .unterminated) - delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated) - delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) - delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) - delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) + delimiterLexingDiagnosticTest("/", .unterminated) + delimiterLexingDiagnosticTest("/x", .unterminated) // MARK: Unbalanced extended syntax delimiterLexingDiagnosticTest("#/a/", .unterminated) @@ -3344,9 +3326,6 @@ extension RegexTests { // MARK: Multiline - // Can only be done if pound signs are used. - delimiterLexingDiagnosticTest("/\n/", .unterminated) - // Opening and closing delimiters must be on a newline. delimiterLexingDiagnosticTest("#/a\n/#", .unterminated) delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline)