-
Notifications
You must be signed in to change notification settings - Fork 50
Parse matching options #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,12 @@ extension AST { | |
// (*asr:...) | ||
case atomicScriptRun | ||
|
||
// (?iJmnsUxxxwDPSWy{..}-iJmnsUxxxwDPSW:) | ||
// If hasImplicitScope is true, it was written as e.g. (?i), and implicitly | ||
// forms a group containing all the following elements of the current | ||
// group. | ||
case changeMatchingOptions(MatchingOptionSequence, hasImplicitScope: Bool) | ||
|
||
// NOTE: Comments appear to be groups, but are not parsed | ||
// the same. They parse more like quotes, so are not | ||
// listed here. | ||
|
@@ -68,21 +74,38 @@ extension AST.Group.Kind: _ASTPrintable { | |
} | ||
} | ||
|
||
/// Whether this is a group with an implicit scope, e.g. matching options | ||
/// written as (?i) implicitly become parent groups for the rest of the | ||
/// elements in the current group: | ||
/// | ||
/// (a(?i)bc)de -> (a(?i:bc))de | ||
/// | ||
public var hasImplicitScope: Bool { | ||
switch self { | ||
case .changeMatchingOptions(_, let hasImplicitScope): | ||
return hasImplicitScope | ||
default: | ||
return false | ||
} | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What would be an example use of this information? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used to determine whether we need to eat the closing |
||
public var _dumpBase: String { | ||
switch self { | ||
case .capture: return "capture" | ||
case .namedCapture(let s): return "capture<\(s.value)>" | ||
case .nonCapture: return "nonCapture" | ||
case .nonCaptureReset: return "nonCaptureReset" | ||
case .atomicNonCapturing: return "atomicNonCapturing" | ||
case .lookahead: return "lookahead" | ||
case .negativeLookahead: return "negativeLookahead" | ||
case .nonAtomicLookahead: return "nonAtomicLookahead" | ||
case .lookbehind: return "lookbehind" | ||
case .negativeLookbehind: return "negativeLookbehind" | ||
case .nonAtomicLookbehind: return "nonAtomicLookbehind" | ||
case .scriptRun: return "scriptRun" | ||
case .atomicScriptRun: return "atomicScriptRun" | ||
case .capture: return "capture" | ||
case .namedCapture(let s): return "capture<\(s.value)>" | ||
case .nonCapture: return "nonCapture" | ||
case .nonCaptureReset: return "nonCaptureReset" | ||
case .atomicNonCapturing: return "atomicNonCapturing" | ||
case .lookahead: return "lookahead" | ||
case .negativeLookahead: return "negativeLookahead" | ||
case .nonAtomicLookahead: return "nonAtomicLookahead" | ||
case .lookbehind: return "lookbehind" | ||
case .negativeLookbehind: return "negativeLookbehind" | ||
case .nonAtomicLookbehind: return "nonAtomicLookbehind" | ||
case .scriptRun: return "scriptRun" | ||
case .atomicScriptRun: return "atomicScriptRun" | ||
case .changeMatchingOptions(let seq, let hasImplicitScope): | ||
return "changeMatchingOptions<\(seq), \(hasImplicitScope)>" | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
extension AST { | ||
/// An option written in source that changes matching semantics. | ||
public struct MatchingOption: Hashable { | ||
public enum Kind { | ||
// PCRE options | ||
case caseInsensitive // i | ||
case allowDuplicateGroupNames // J | ||
case multiline // m | ||
case noAutoCapture // n | ||
case singleLine // s | ||
case reluctantByDefault // U | ||
case extended // x | ||
case extraExtended // xx | ||
|
||
// ICU options | ||
case unicodeWordBoundaries // w | ||
|
||
// Oniguruma options | ||
case asciiOnlyDigit // D | ||
case asciiOnlyPOSIXProps // P | ||
case asciiOnlySpace // S | ||
case asciiOnlyWord // W | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @natecook1000 Should we include the |
||
// Oniguruma text segment options (these are mutually exclusive and cannot | ||
// be unset, only flipped between) | ||
case textSegmentGraphemeMode // y{g} | ||
case textSegmentWordMode // y{w} | ||
} | ||
public var kind: Kind | ||
public var location: SourceLocation | ||
|
||
public init(_ kind: Kind, location: SourceLocation) { | ||
self.kind = kind | ||
self.location = location | ||
} | ||
|
||
public var isTextSegmentMode: Bool { | ||
switch kind { | ||
case .textSegmentGraphemeMode, .textSegmentWordMode: | ||
return true | ||
default: | ||
return false | ||
} | ||
} | ||
} | ||
|
||
/// A sequence of matching options written in source. | ||
public struct MatchingOptionSequence: Hashable { | ||
/// If the sequence starts with a caret '^', its source location, or nil | ||
/// otherwise. If this is set, it indicates that all the matching options | ||
/// are unset, except the ones in `adding`. | ||
public var caretLoc: SourceLocation? | ||
|
||
/// The options to add. | ||
public var adding: [MatchingOption] | ||
|
||
/// The location of the '-' between the options to add and options to | ||
/// remove. | ||
public var minusLoc: SourceLocation? | ||
|
||
/// The options to remove. | ||
public var removing: [MatchingOption] | ||
|
||
public init(caretLoc: SourceLocation?, adding: [MatchingOption], | ||
minusLoc: SourceLocation?, removing: [MatchingOption]) { | ||
self.caretLoc = caretLoc | ||
self.adding = adding | ||
self.minusLoc = minusLoc | ||
self.removing = removing | ||
} | ||
} | ||
} | ||
|
||
extension AST.MatchingOption: _ASTPrintable { | ||
public var _dumpBase: String { "\(kind)" } | ||
} | ||
|
||
extension AST.MatchingOptionSequence: _ASTPrintable { | ||
public var _dumpBase: String { | ||
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)" | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -456,11 +456,108 @@ extension Source { | |
return AST.Trivia(trivia) | ||
} | ||
|
||
/// Try to lex a matching option. | ||
/// | ||
/// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | ||
/// | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' | ||
/// | ||
mutating func lexMatchingOption() throws -> AST.MatchingOption? { | ||
typealias OptKind = AST.MatchingOption.Kind | ||
|
||
let locOpt = try recordLoc { src -> OptKind? in | ||
func advanceAndReturn(_ o: OptKind) -> OptKind { | ||
src.advance() | ||
return o | ||
} | ||
guard let c = src.peek() else { return nil } | ||
switch c { | ||
// PCRE options. | ||
case "i": return advanceAndReturn(.caseInsensitive) | ||
case "J": return advanceAndReturn(.allowDuplicateGroupNames) | ||
case "m": return advanceAndReturn(.multiline) | ||
case "n": return advanceAndReturn(.noAutoCapture) | ||
case "s": return advanceAndReturn(.singleLine) | ||
case "U": return advanceAndReturn(.reluctantByDefault) | ||
case "x": | ||
src.advance() | ||
return src.tryEat("x") ? .extraExtended : .extended | ||
|
||
// ICU options. | ||
case "w": return advanceAndReturn(.unicodeWordBoundaries) | ||
|
||
// Oniguruma options. | ||
case "D": return advanceAndReturn(.asciiOnlyDigit) | ||
case "P": return advanceAndReturn(.asciiOnlyPOSIXProps) | ||
case "S": return advanceAndReturn(.asciiOnlySpace) | ||
case "W": return advanceAndReturn(.asciiOnlyWord) | ||
case "y": | ||
src.advance() | ||
try src.expect("{") | ||
let opt: OptKind | ||
if src.tryEat("w") { | ||
opt = .textSegmentWordMode | ||
} else { | ||
try src.expect("g") | ||
opt = .textSegmentGraphemeMode | ||
} | ||
try src.expect("}") | ||
return opt | ||
|
||
default: | ||
return nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have a known terminator ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm it's tricky because there's also other valid group syntax that starts with |
||
} | ||
} | ||
guard let locOpt = locOpt else { return nil } | ||
return .init(locOpt.value, location: locOpt.location) | ||
} | ||
|
||
/// Try to lex a sequence of matching options. | ||
/// | ||
/// MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+ | ||
/// | MatchingOption* '-' MatchingOption+ | ||
/// | ||
mutating func lexMatchingOptionSequence( | ||
) throws -> AST.MatchingOptionSequence? { | ||
let ateCaret = recordLoc { $0.tryEat("^") } | ||
|
||
// TODO: Warn on duplicate options, and options appearing in both adding | ||
// and removing lists? | ||
var adding: [AST.MatchingOption] = [] | ||
while let opt = try lexMatchingOption() { | ||
adding.append(opt) | ||
} | ||
|
||
// If the sequence began with a caret '^', options can only be added, so | ||
// we're done. | ||
if ateCaret.value { | ||
return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil, | ||
removing: []) | ||
} | ||
|
||
// Try to lex options to remove. | ||
let ateMinus = recordLoc { $0.tryEat("-") } | ||
if ateMinus.value { | ||
var removing: [AST.MatchingOption] = [] | ||
while let opt = try lexMatchingOption() { | ||
// Text segment options can only be added, they cannot be removed | ||
// with (?-), they should instead be set to a different mode. | ||
if opt.isTextSegmentMode { | ||
throw ParseError.cannotRemoveTextSegmentOptions | ||
} | ||
removing.append(opt) | ||
} | ||
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, | ||
removing: removing) | ||
} | ||
guard !adding.isEmpty else { return nil } | ||
return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) | ||
} | ||
|
||
/// Try to consume the start of a group | ||
/// | ||
/// GroupStart -> '(?' GroupKind | '(' | ||
/// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' | '<=' | '<!' | ||
/// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '<!' | ||
/// | '<*' | MatchingOptionSeq (':' | ')') | ||
/// Named -> '<' [^'>']+ '>' | 'P<' [^'>']+ '>' | ||
/// | '\'' [^'\'']+ '\'' | ||
/// | ||
|
@@ -502,8 +599,25 @@ extension Source { | |
return .namedCapture(name) | ||
} | ||
|
||
throw ParseError.misc( | ||
"Unknown group kind '(?\(src.peek()!)'") | ||
// Matching option changing group (?iJmnsUxxxwDPSWy{..}-iJmnsUxxxwDPSW:). | ||
if let seq = try src.lexMatchingOptionSequence() { | ||
if src.tryEat(":") { | ||
return .changeMatchingOptions(seq, hasImplicitScope: false) | ||
} | ||
// If this isn't start of an explicit group, we should have an | ||
// implicit group that covers the remaining elements of the current | ||
// group. | ||
// TODO: This implicit scoping behavior matches Oniguruma, but PCRE | ||
// also does it across alternations, which will require additional | ||
// handling. | ||
try src.expect(")") | ||
return .changeMatchingOptions(seq, hasImplicitScope: true) | ||
} | ||
|
||
guard let next = src.peek() else { | ||
throw ParseError.expectedGroupSpecifier | ||
} | ||
throw ParseError.misc("Unknown group kind '(?\(next)'") | ||
} | ||
|
||
// Explicitly spelled out PCRE2 syntax for some groups. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe just `isolated: Bool`, or is there something important about the scope concept?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently it's used to check if we need to eat a closing `)` for a group, so the implicitness of the scope is relevant there, but I'm also fine with renaming it to `isolated` as we'd still have the `hasImplicitScope` property. I think at the time I just couldn't think of a good descriptive name for the syntax :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, so it’s purely a syntactic detail. Yeah, I’d prefer “isolated” as that’s what it is from a syntactic perspective.