diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 727727ce1..dd785f12d 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -279,7 +279,7 @@ extension Source { /// | 'x' HexDigit{0...2} /// | 'U' HexDigit{8} /// | 'o{' OctalDigit{1...} '}' - /// | OctalDigit{1...3} + /// | '0' OctalDigit{0...3} /// mutating func expectUnicodeScalar( escapedCharacter base: Character @@ -313,13 +313,14 @@ extension Source { let str = try src.lexUntil(eating: "}").value return try Source.validateUnicodeScalar(str, .octal) - case let c where c.isOctalDigit: - // We can read *up to* 2 more octal digits per PCRE. - // FIXME: ICU can read up to 3 octal digits if the leading digit is 0, - // we should have a parser mode to switch. - let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit) - let str = String(c) + (nextDigits?.string ?? "") - return try Source.validateUnicodeScalar(str, .octal) + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { + return Unicode.Scalar(0) + } + return try Source.validateUnicodeScalar(digits.string, .octal) default: fatalError("Unexpected scalar start") @@ -1341,26 +1342,10 @@ extension Source { return nil } - // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE - // it is treated as a backreference if its first digit is not 0 (as that - // is always octal) and one of the following holds: - // - // - It's 0 < n < 10 (as octal would be pointless here) - // - Its first digit is 8 or 9 (as not valid octal) - // - There have been as many prior groups as the reference. - // - // Oniguruma follows the same rules except the second one. e.g \81 and - // \91 are instead treated as literal 81 and 91 respectively. - // TODO: If we want a strict Oniguruma mode, we'll need to add a check - // here. + // Backslash followed by a non-0 digit character is a backreference. if firstChar != "0", let numAndLoc = try src.lexNumber() { - let num = numAndLoc.value - let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location) - if num < 10 || firstChar == "8" || firstChar == "9" || - context.isPriorGroupRef(ref.kind) { - return .backreference(ref) - } - return nil + return .backreference(.init( + .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) } return nil } @@ -1497,10 +1482,8 @@ extension Source { } switch char { - // Hexadecimal and octal unicode scalars. This must be done after - // backreference lexing due to the ambiguity with \nnn. - case let c where c.isOctalDigit: fallthrough - case "u", "x", "U", "o": + // Hexadecimal and octal unicode scalars. + case "u", "x", "U", "o", "0": return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..2c07f4c79 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -261,7 +261,7 @@ extension RegexTests { firstMatchTest(#"\070"#, input: "1238xyz", match: "8") firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A") firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08") - firstMatchTest(#"\0707"#, input: "12387xyz", match: "87") + firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) @@ -1021,9 +1021,6 @@ extension RegexTests { firstMatchTest( #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#, input: "aaaaaaaaabbc", match: "aaaaaaaaabb") - firstMatchTest( - #"(.)\10"#, - input: "a\u{8}b", match: "a\u{8}") firstMatchTest( #"(.)\g001"#, diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4872e256e..ce070cc38 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -325,7 +325,7 @@ extension RegexTests { parseTest(#"\070"#, scalar("\u{38}")) parseTest(#"\07A"#, concat(scalar("\u{7}"), "A")) parseTest(#"\08"#, concat(scalar("\u{0}"), "8")) - parseTest(#"\0707"#, concat(scalar("\u{38}"), "7")) + parseTest(#"\0707"#, scalar("\u{1C7}")) parseTest(#"[\0]"#, charClass(scalar_m("\u{0}"))) parseTest(#"[\01]"#, charClass(scalar_m("\u{1}"))) @@ -333,13 +333,15 @@ extension RegexTests { parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A")) parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) - parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7")) + parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - parseTest(#"[\1]"#, charClass(scalar_m("\u{1}"))) - parseTest(#"[\123]"#, charClass(scalar_m("\u{53}"))) - parseTest(#"[\101]"#, charClass(scalar_m("\u{41}"))) - parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7")) - parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1")) + // TODO: These are treated as octal sequences by PCRE, we should warn and + // suggest user prefix with 0. + parseTest(#"[\1]"#, charClass("1")) + parseTest(#"[\123]"#, charClass("1", "2", "3")) + parseTest(#"[\101]"#, charClass("1", "0", "1")) + parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) + parseTest(#"[\181]"#, charClass("1", "8", "1")) // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) @@ -488,6 +490,10 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) + // These follow the PCRE behavior. + parseTest(#"\Q\\E"#, quote("\\")) + parseTest(#"\E"#, "E") + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), @@ -793,11 +799,9 @@ extension RegexTests { ) } - // TODO: Some of these behaviors are unintuitive, we should likely warn on - // some of them. - parseTest(#"\10"#, scalar("\u{8}")) - parseTest(#"\18"#, concat(scalar("\u{1}"), "8")) - parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7")) + parseTest(#"\10"#, backreference(.absolute(10))) + parseTest(#"\18"#, backreference(.absolute(18))) + parseTest(#"\7777"#, backreference(.absolute(7777))) parseTest(#"\91"#, backreference(.absolute(91))) parseTest( @@ -809,12 +813,13 @@ extension RegexTests { parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [scalar("\u{8}"), capture(empty())]), + + [backreference(.absolute(10)), capture(empty())]), captures: .tuple(Array(repeating: .atom(), count: 10)) ) - parseTest(#"()()\10"#, - concat(capture(empty()), capture(empty()), scalar("\u{8}")), - captures: .tuple(.atom(), .atom())) + parseTest(#"()()\10"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(10))), + captures: .tuple(.atom(), .atom()) + ) // A capture of three empty captures. let fourCaptures = capture( @@ -822,8 +827,8 @@ extension RegexTests { ) parseTest( // There are 9 capture groups in total here. - #"((()()())(()()()))\10"#, - concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")), + #"((()()())(()()()))\10"#, concat(capture(concat( + fourCaptures, fourCaptures)), backreference(.absolute(10))), captures: .tuple(Array(repeating: .atom(), count: 9)) ) parseTest( @@ -848,7 +853,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .tuple(Array(repeating: .atom(), count: 40)) ) - parseTest(#"\40"#, scalar(" ")) + parseTest(#"\40"#, backreference(.absolute(40))) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -858,7 +863,7 @@ extension RegexTests { parseTest(#"\7"#, backreference(.absolute(7))) - parseTest(#"\11"#, scalar("\u{9}")) + parseTest(#"\11"#, backreference(.absolute(11))) parseTest( String(repeating: "()", count: 11) + #"\11"#, concat(Array(repeating: capture(empty()), count: 11) @@ -872,12 +877,11 @@ extension RegexTests { captures: .tuple(Array(repeating: .atom(), count: 11)) ) - parseTest(#"\0113"#, concat(scalar("\u{9}"), "3")) - parseTest(#"\113"#, scalar("\u{4B}")) - parseTest(#"\377"#, scalar("\u{FF}")) + parseTest(#"\0113"#, scalar("\u{4B}")) + parseTest(#"\113"#, backreference(.absolute(113))) + parseTest(#"\377"#, backreference(.absolute(377))) parseTest(#"\81"#, backreference(.absolute(81))) - parseTest(#"\g1"#, backreference(.absolute(1))) parseTest(#"\g001"#, backreference(.absolute(1))) parseTest(#"\g52"#, backreference(.absolute(52)))