Skip to content

Treat range quantifiers as literal if invalid #104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 85 additions & 62 deletions Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,35 @@ extension Source {
}
}

mutating func tryEatNonEmpty(_ c: Char) throws -> Bool {
guard !isEmpty else { throw ParseError.expected(String(c)) }
return tryEat(c)
}

mutating func tryEatNonEmpty<C: Collection>(sequence c: C) throws -> Bool
where C.Element == Char
{
guard !isEmpty else { throw ParseError.expected(String(c)) }
_ = try recordLoc { src in
guard !src.isEmpty else { throw ParseError.expected(String(c)) }
}
return tryEat(sequence: c)
}

mutating func tryEatNonEmpty(_ c: Char) throws -> Bool {
try tryEatNonEmpty(sequence: String(c))
}

/// Attempt to make a series of lexing steps in `body`, returning `nil` if
/// unsuccesful, which will revert the source back to its previous state. If
/// an error is thrown, the source will not be reverted.
mutating func tryEating<T>(
_ body: (inout Source) throws -> T?
) rethrows -> T? {
// We don't revert the source if an error is thrown, as it's useful to
// maintain the source location in that case.
let current = self
guard let result = try body(&self) else {
self = current
return nil
}
return result
}

/// Throws an expected ASCII character error if not matched
mutating func expectASCII() throws -> Located<Character> {
try recordLoc { src in
Expand Down Expand Up @@ -277,7 +294,7 @@ extension Source {
return try Source.validateUnicodeScalar(str, .octal)

default:
throw ParseError.misc("TODO: Or is this an assert?")
fatalError("Unexpected scalar start")
}
}
}
Expand All @@ -295,16 +312,11 @@ extension Source {
if src.tryEat("+") { return .oneOrMore }
if src.tryEat("?") { return .zeroOrOne }

// FIXME: Actually, PCRE treats empty as literal `{}`...
// But Java 8 errors out?
if src.tryEat("{") {
// FIXME: Erm, PCRE parses as literal if no lowerbound...
let amt = try src.expectRange()
try src.expect("}")
return amt.value // FIXME: track actual range...
return try src.tryEating { src in
guard src.tryEat("{"), let range = try src.lexRange(), src.tryEat("}")
else { return nil }
return range.value
}

return nil
}
guard let amt = amt else { return nil }

Expand All @@ -317,56 +329,56 @@ extension Source {
return (amt, kind)
}

/// Consume a range
/// Try to consume a range, returning `nil` if unsuccessful.
///
/// Range -> ',' <Int> | <Int> ',' <Int>? | <Int>
/// | ExpRange
/// ExpRange -> '..<' <Int> | '...' <Int>
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
mutating func expectRange() throws -> Located<Quant.Amount> {
mutating func lexRange() throws -> Located<Quant.Amount>? {
try recordLoc { src in
// TODO: lex positive numbers, more specifically...

let lowerOpt = try src.lexNumber()

// ',' or '...' or '..<' or nothing
let closedRange: Bool?
if src.tryEat(",") {
closedRange = true
} else if src.experimentalRanges && src.tryEat(".") {
try src.expect(".")
if src.tryEat(".") {
try src.tryEating { src in
let lowerOpt = try src.lexNumber()

// ',' or '...' or '..<' or nothing
// TODO: We ought to try and consume whitespace here and emit a
// diagnostic for the user warning them that it would cause the range to
// be treated as literal.
let closedRange: Bool?
if src.tryEat(",") {
closedRange = true
} else if src.experimentalRanges && src.tryEat(".") {
try src.expect(".")
if src.tryEat(".") {
closedRange = true
} else {
try src.expect("<")
closedRange = false
}
} else {
try src.expect("<")
closedRange = false
closedRange = nil
}
} else {
closedRange = nil
}
// FIXME: wait, why `try!` ?
let upperOpt: Located<Int>?
if let u = try! src.lexNumber() {
upperOpt = (closedRange == true) ? u : Located(u.value-1, u.location)
} else {
upperOpt = nil
}

switch (lowerOpt, closedRange, upperOpt) {
case let (l?, nil, nil):
return .exactly(l)
case let (l?, true, nil):
return .nOrMore(l)
case let (nil, _, u?):
return .upToN(u)
case let (l?, _, u?):
// FIXME: source location tracking
return .range(l, u)

case let (nil, nil, u) where u != nil:
fatalError("Not possible")
default:
throw ParseError.misc("Invalid range")
let upperOpt = try src.lexNumber()?.map { upper in
// If we have an open range, the upper bound should be adjusted down.
closedRange == true ? upper : upper - 1
}

switch (lowerOpt, closedRange, upperOpt) {
case let (l?, nil, nil):
return .exactly(l)
case let (l?, true, nil):
return .nOrMore(l)
case let (nil, _?, u?):
return .upToN(u)
case let (l?, _?, u?):
return .range(l, u)

case (nil, nil, _?):
fatalError("Didn't lex lower bound, but lexed upper bound?")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At some point, we should do a pass over the codebase and convert unreachable fatalErrors to unreachable throws, differentiating them from diagnostics.

default:
return nil
}
}
}
}
Expand All @@ -393,12 +405,24 @@ extension Source {
try lexUntil(eating: String(end))
}

/// Expect a linear run of non-nested non-empty content
/// Expect a linear run of non-nested non-empty content ending with a given
/// delimiter. If `ignoreEscaped` is true, escaped characters will not be
/// considered for the ending delimiter.
private mutating func expectQuoted(
endingWith end: String
endingWith end: String, ignoreEscaped: Bool = false
) throws -> Located<String> {
try recordLoc { src in
let result = try src.lexUntil(eating: end).value
let result = try src.lexUntil { src in
if try src.tryEatNonEmpty(sequence: end) {
return true
}
// Ignore escapes if we're allowed to. lexUntil will consume the next
// character.
if ignoreEscaped, src.tryEat("\\") {
try src.expectNonEmpty()
}
return false
}.value
guard !result.isEmpty else {
throw ParseError.misc("Expected non-empty contents")
}
Expand All @@ -412,7 +436,7 @@ extension Source {
///
/// With `SyntaxOptions.experimentalQuotes`, also accepts
///
/// ExpQuote -> '"' [^"]* '"'
/// ExpQuote -> '"' ('\"' | [^"])* '"'
///
/// Future: Experimental quotes are full fledged Swift string literals
///
Expand All @@ -424,8 +448,7 @@ extension Source {
return try src.expectQuoted(endingWith: #"\E"#).value
}
if src.experimentalQuotes, src.tryEat("\"") {
// TODO: escaped `"`, etc...
return try src.expectQuoted(endingWith: "\"").value
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
}
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ extension Source {
// externally?
self.init(v, .fake)
}

public func map<U>(_ fn: (T) throws -> U) rethrows -> Located<U> {
Located<U>(try fn(value), location)
}
}
}
extension AST {
Expand Down
6 changes: 6 additions & 0 deletions Tests/RegexTests/LexTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ extension RegexTests {
_ = try $0.lexGroupStart()
}

diagnose(#"\Qab"#, expecting: .expected("\\E")) { _ = try $0.lexQuote() }
diagnose(#"\Qab\"#, expecting: .expected("\\E")) { _ = try $0.lexQuote() }
diagnose(#""ab"#, expecting: .expected("\""), .experimental) { _ = try $0.lexQuote() }
diagnose(#""ab\""#, expecting: .expected("\""), .experimental) { _ = try $0.lexQuote() }
diagnose(#""ab\"#, expecting: .unexpectedEndOfInput, .experimental) { _ = try $0.lexQuote() }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, these kinds of tests are coming along nicely!


// TODO: want to dummy print out source ranges, etc, test that.
}

Expand Down
4 changes: 4 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ extension RegexTests {
#"a{1,2}?"#, input: "123aaaxyz", match: "a")
matchTest(
#"a{1,2}?x"#, input: "123aaaxyz", match: "aax")
matchTest(
#"xa{0}y"#, input: "123aaaxyz", match: "xy")
matchTest(
#"xa{0,0}y"#, input: "123aaaxyz", match: "xy")

matchTest("a.*", input: "dcba", match: "a")

Expand Down
36 changes: 36 additions & 0 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,14 @@ extension RegexTests {
#"a\Q \Q \\.\Eb"#,
concat("a", quote(#" \Q \\."#), "b"))

parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
syntax: .experimental)
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
syntax: .experimental)
parseTest(#"a" .\"\"b""#, concat("a", quote(" .\"\"b")),
syntax: .experimental)
parseTest(#""\"""#, quote("\""), syntax: .experimental)

// MARK: Comments

parseTest(
Expand All @@ -440,6 +448,34 @@ extension RegexTests {
parseTest(
#"a{1,2}?"#,
quantRange(.reluctant, 1...2, "a"))
parseTest(
#"a{0}"#,
exactly(.eager, 0, "a"))
parseTest(
#"a{0,0}"#,
quantRange(.eager, 0...0, "a"))

// Make sure ranges get treated as literal if invalid.
parseTest("{", "{")
parseTest("{,", concat("{", ","))
parseTest("{}", concat("{", "}"))
parseTest("{,}", concat("{", ",", "}"))
parseTest("{,6", concat("{", ",", "6"))
parseTest("{6", concat("{", "6"))
parseTest("{6,", concat("{", "6", ","))
parseTest("{+", oneOrMore(.eager, "{"))
parseTest("{6,+", concat("{", "6", oneOrMore(.eager, ",")))
parseTest("x{", concat("x", "{"))
parseTest("x{}", concat("x", "{", "}"))
parseTest("x{,}", concat("x", "{", ",", "}"))
parseTest("x{,6", concat("x", "{", ",", "6"))
parseTest("x{6", concat("x", "{", "6"))
parseTest("x{6,", concat("x", "{", "6", ","))
parseTest("x{+", concat("x", oneOrMore(.eager, "{")))
parseTest("x{6,+", concat("x", "{", "6", oneOrMore(.eager, ",")))

// TODO: We should emit a diagnostic for this.
parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}"))

// MARK: Groups

Expand Down