Skip to content

Update \DDD parsing #169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 14 additions & 31 deletions Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ extension Source {
/// | 'x' HexDigit{0...2}
/// | 'U' HexDigit{8}
/// | 'o{' OctalDigit{1...} '}'
/// | OctalDigit{1...3}
/// | '0' OctalDigit{0...3}
///
mutating func expectUnicodeScalar(
escapedCharacter base: Character
Expand Down Expand Up @@ -313,13 +313,14 @@ extension Source {
let str = try src.lexUntil(eating: "}").value
return try Source.validateUnicodeScalar(str, .octal)

case let c where c.isOctalDigit:
// We can read *up to* 2 more octal digits per PCRE.
// FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
// we should have a parser mode to switch.
let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
let str = String(c) + (nextDigits?.string ?? "")
return try Source.validateUnicodeScalar(str, .octal)
case "0":
// We can read *up to* 3 more octal digits.
// FIXME: PCRE can only read up to 2 octal digits; if we get a strict
// PCRE mode, we should limit it here.
guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
return Unicode.Scalar(0)
}
return try Source.validateUnicodeScalar(digits.string, .octal)

default:
fatalError("Unexpected scalar start")
Expand Down Expand Up @@ -1341,26 +1342,10 @@ extension Source {
return nil
}

// Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
// it is treated as a backreference if its first digit is not 0 (as that
// is always octal) and one of the following holds:
//
// - It's 0 < n < 10 (as octal would be pointless here)
// - Its first digit is 8 or 9 (as not valid octal)
// - There have been as many prior groups as the reference.
//
// Oniguruma follows the same rules except the second one. e.g \81 and
// \91 are instead treated as literal 81 and 91 respectively.
// TODO: If we want a strict Oniguruma mode, we'll need to add a check
// here.
// Backslash followed by a non-0 digit character is a backreference.
if firstChar != "0", let numAndLoc = try src.lexNumber() {
let num = numAndLoc.value
let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
if num < 10 || firstChar == "8" || firstChar == "9" ||
context.isPriorGroupRef(ref.kind) {
return .backreference(ref)
}
return nil
return .backreference(.init(
.absolute(numAndLoc.value), innerLoc: numAndLoc.location))
}
return nil
}
Expand Down Expand Up @@ -1497,10 +1482,8 @@ extension Source {
}

switch char {
// Hexadecimal and octal unicode scalars. This must be done after
// backreference lexing due to the ambiguity with \nnn.
case let c where c.isOctalDigit: fallthrough
case "u", "x", "U", "o":
// Hexadecimal and octal unicode scalars.
case "u", "x", "U", "o", "0":
return try .scalar(
src.expectUnicodeScalar(escapedCharacter: char).value)
default:
Expand Down
5 changes: 1 addition & 4 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ extension RegexTests {
firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")

// code point sequence
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
Expand Down Expand Up @@ -1021,9 +1021,6 @@ extension RegexTests {
firstMatchTest(
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
firstMatchTest(
#"(.)\10"#,
input: "a\u{8}b", match: "a\u{8}")

firstMatchTest(
#"(.)\g001"#,
Expand Down
52 changes: 28 additions & 24 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -325,21 +325,23 @@ extension RegexTests {
parseTest(#"\070"#, scalar("\u{38}"))
parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
parseTest(#"\0707"#, scalar("\u{1C7}"))

parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
parseTest(#"[\070]"#, charClass(scalar_m("\u{38}")))

parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))

parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
// TODO: These are treated as octal sequences by PCRE; we should warn and
// suggest that the user prefix them with 0.
parseTest(#"[\1]"#, charClass("1"))
parseTest(#"[\123]"#, charClass("1", "2", "3"))
parseTest(#"[\101]"#, charClass("1", "0", "1"))
parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
parseTest(#"[\181]"#, charClass("1", "8", "1"))

// We take *up to* the first two valid digits for \x. No valid digits is 0.
parseTest(#"\x"#, scalar("\u{0}"))
Expand Down Expand Up @@ -488,6 +490,10 @@ extension RegexTests {
#"a\Q \Q \\.\Eb"#,
concat("a", quote(#" \Q \\."#), "b"))

// These follow the PCRE behavior.
parseTest(#"\Q\\E"#, quote("\\"))
parseTest(#"\E"#, "E")

parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
syntax: .experimental)
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
Expand Down Expand Up @@ -793,11 +799,9 @@ extension RegexTests {
)
}

// TODO: Some of these behaviors are unintuitive, we should likely warn on
// some of them.
parseTest(#"\10"#, scalar("\u{8}"))
parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
parseTest(#"\10"#, backreference(.absolute(10)))
parseTest(#"\18"#, backreference(.absolute(18)))
parseTest(#"\7777"#, backreference(.absolute(7777)))
parseTest(#"\91"#, backreference(.absolute(91)))

parseTest(
Expand All @@ -809,21 +813,22 @@ extension RegexTests {
parseTest(
#"()()()()()()()()()\10()"#,
concat(Array(repeating: capture(empty()), count: 9)
+ [scalar("\u{8}"), capture(empty())]),
+ [backreference(.absolute(10)), capture(empty())]),
captures: .tuple(Array(repeating: .atom(), count: 10))
)
parseTest(#"()()\10"#,
concat(capture(empty()), capture(empty()), scalar("\u{8}")),
captures: .tuple(.atom(), .atom()))
parseTest(#"()()\10"#, concat(
capture(empty()), capture(empty()), backreference(.absolute(10))),
captures: .tuple(.atom(), .atom())
)

// A capture of three empty captures.
let fourCaptures = capture(
concat(capture(empty()), capture(empty()), capture(empty()))
)
parseTest(
// There are 9 capture groups in total here.
#"((()()())(()()()))\10"#,
concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
#"((()()())(()()()))\10"#, concat(capture(concat(
fourCaptures, fourCaptures)), backreference(.absolute(10))),
captures: .tuple(Array(repeating: .atom(), count: 9))
)
parseTest(
Expand All @@ -848,7 +853,7 @@ extension RegexTests {
concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
captures: .tuple(Array(repeating: .atom(), count: 40))
)
parseTest(#"\40"#, scalar(" "))
parseTest(#"\40"#, backreference(.absolute(40)))
parseTest(
String(repeating: "()", count: 40) + #"\40"#,
concat(Array(repeating: capture(empty()), count: 40)
Expand All @@ -858,7 +863,7 @@ extension RegexTests {

parseTest(#"\7"#, backreference(.absolute(7)))

parseTest(#"\11"#, scalar("\u{9}"))
parseTest(#"\11"#, backreference(.absolute(11)))
parseTest(
String(repeating: "()", count: 11) + #"\11"#,
concat(Array(repeating: capture(empty()), count: 11)
Expand All @@ -872,12 +877,11 @@ extension RegexTests {
captures: .tuple(Array(repeating: .atom(), count: 11))
)

parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
parseTest(#"\113"#, scalar("\u{4B}"))
parseTest(#"\377"#, scalar("\u{FF}"))
parseTest(#"\0113"#, scalar("\u{4B}"))
parseTest(#"\113"#, backreference(.absolute(113)))
parseTest(#"\377"#, backreference(.absolute(377)))
parseTest(#"\81"#, backreference(.absolute(81)))


parseTest(#"\g1"#, backreference(.absolute(1)))
parseTest(#"\g001"#, backreference(.absolute(1)))
parseTest(#"\g52"#, backreference(.absolute(52)))
Expand Down