Skip to content

Parse Oniguruma callout and absent function syntax #129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 58 additions & 11 deletions Sources/_MatchingEngine/Regex/AST/AST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public indirect enum AST:

case customCharacterClass(CustomCharacterClass)

case absentFunction(AbsentFunction)

case empty(Empty)

// FIXME: Move off the regex literal AST
Expand All @@ -55,16 +57,17 @@ extension AST {
// over `self` _everywhere_ we want to do anything.
var _associatedValue: _ASTNode {
switch self {
case let .alternation(v): return v
case let .concatenation(v): return v
case let .group(v): return v
case let .conditional(v): return v
case let .quantification(v): return v
case let .quote(v): return v
case let .trivia(v): return v
case let .atom(v): return v
case let .customCharacterClass(v): return v
case let .empty(v): return v
case let .alternation(v): return v
case let .concatenation(v): return v
case let .group(v): return v
case let .conditional(v): return v
case let .quantification(v): return v
case let .quote(v): return v
case let .trivia(v): return v
case let .atom(v): return v
case let .customCharacterClass(v): return v
case let .empty(v): return v
case let .absentFunction(v): return v

case let .groupTransform(g, _):
return g // FIXME: get this out of here
Expand Down Expand Up @@ -110,7 +113,7 @@ extension AST {
switch self {
case .atom(let a):
return a.isQuantifiable
case .group, .conditional, .customCharacterClass:
case .group, .conditional, .customCharacterClass, .absentFunction:
return true
case .alternation, .concatenation, .quantification, .quote, .trivia,
.empty, .groupTransform:
Expand Down Expand Up @@ -185,6 +188,50 @@ extension AST {
}
}

/// An Oniguruma absent function. This is used to model a pattern which should
/// not be matched against across varying scopes.
public struct AbsentFunction: Hashable, _ASTNode {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know Oniguruma uses this "absent function" name, but is there a better or more descriptive name? What does this do?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, its primary role is to match against an inverse of a pattern (though the .expression variant also specifies what it can match). So maybe something like MatchInverse or Exclusion?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. This is Oniguruma's approach to the regex-inversion problem (i.e. inverting each component of a concatenation is surprising and not the same as inverting the regex). I'm fine keeping this name with a comment for now, as a term-of-art or something to be renamed later.

@rxwei any thoughts on how to model pattern inversions in the DSL?

/// The two spellings that can open an absent function: with or without a
/// `|` immediately following `(?~`.
///
/// NOTE(review): no stored property visible in this struct uses this type
/// (`start` is a plain `SourceLocation`); presumably the parser consumes it
/// — confirm.
public enum Start: Hashable {
/// `(?~|`
case withPipe

/// `(?~`
case withoutPipe
}
/// The variants of absent function, distinguished by their syntax and by
/// the child patterns they carry.
public enum Kind: Hashable {
/// An absent repeater `(?~absent)`. This is equivalent to `(?~|absent|.*)`
/// and therefore matches as long as the pattern `absent` is not matched.
case repeater(AST)

/// An absent expression `(?~|absent|expr)`, which defines an `absent`
/// pattern which must not be matched against while the pattern `expr` is
/// matched.
///
/// NOTE(review): `pipe` is presumably the location of the `|` separating
/// `absent` from `expr` — confirm against the parser.
case expression(absentee: AST, pipe: SourceLocation, expr: AST)

/// An absent stopper `(?~|absent)`, which prevents matching against
/// `absent` until the end of the regex, or until it is cleared.
case stopper(AST)

/// An absent clearer `(?~|)` which cancels the effect of an absent
/// stopper.
case clearer
}
/// The location of the opening `(?~` or `(?~|`.
public var start: SourceLocation

/// Which variant of absent function this node represents.
public var kind: Kind

/// The source location recorded for this node.
public var location: SourceLocation

/// Creates an absent function node from its variant and source locations.
///
/// - Parameters:
///   - kind: The variant of absent function, including any child patterns.
///   - start: The location of the opening `(?~` or `(?~|`.
///   - location: The source location to record for the node.
public init(
  _ kind: Kind,
  start: SourceLocation,
  location: SourceLocation
) {
  self.start = start
  self.location = location
  self.kind = kind
}
}

public struct Reference: Hashable {
@frozen
public enum Kind: Hashable {
Expand Down
9 changes: 9 additions & 0 deletions Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,12 @@ extension AST.Group: _ASTParent {
extension AST.Quantification: _ASTParent {
var children: [AST] { [child] }
}
extension AST.AbsentFunction: _ASTParent {
  /// The child patterns carried by this absent function.
  ///
  /// A clearer has no children; a repeater or stopper has its single absent
  /// pattern; an expression yields its absent pattern followed by the
  /// expression it applies to.
  var children: [AST] {
    switch kind {
    case .clearer:
      return []
    case let .repeater(absentee), let .stopper(absentee):
      return [absentee]
    case let .expression(absentee, _, expr):
      return [absentee, expr]
    }
  }
}
119 changes: 111 additions & 8 deletions Sources/_MatchingEngine/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -476,14 +476,117 @@ extension AST.Atom {
}

extension AST.Atom {
public struct Callout: Hashable {
public enum Argument: Hashable {
case number(Int)
case string(String)
public enum Callout: Hashable {
/// A PCRE callout written `(?C...)`
public struct PCRE: Hashable {
  /// The forms the argument of a PCRE callout can take.
  public enum Argument: Hashable {
    case number(Int)
    case string(String)
  }

  /// The callout argument, paired with its location in the source.
  public var arg: AST.Located<Argument>

  /// Whether the argument isn't written explicitly in the source, e.g.
  /// `(?C)` which is implicitly `(?C0)`. This is determined by the
  /// argument's source location being empty.
  public var isImplicit: Bool {
    return arg.location.isEmpty
  }

  /// Creates a PCRE callout with the given located argument.
  public init(_ arg: AST.Located<Argument>) {
    self.arg = arg
  }
}
public var arg: AST.Located<Argument>
public init(_ arg: AST.Located<Argument>) {
self.arg = arg

/// A named Oniguruma callout written `(*name[tag]{args, ...})`
public struct OnigurumaNamed: Hashable {
  /// The brace-delimited argument list of a named callout, recording the
  /// location of each brace alongside the located arguments.
  public struct ArgList: Hashable {
    public var leftBrace: SourceLocation
    public var args: [AST.Located<String>]
    public var rightBrace: SourceLocation

    /// Creates an argument list from its opening brace location, its
    /// arguments, and its closing brace location.
    public init(
      _ leftBrace: SourceLocation,
      _ args: [AST.Located<String>],
      _ rightBrace: SourceLocation
    ) {
      self.args = args
      self.leftBrace = leftBrace
      self.rightBrace = rightBrace
    }
  }

  /// The callout's name, with its source location.
  public var name: AST.Located<String>

  /// The optional `[tag]` specifier.
  public var tag: OnigurumaTag?

  /// The optional `{args, ...}` argument list.
  public var args: ArgList?

  /// Creates a named callout from its name and optional tag and arguments.
  public init(
    _ name: AST.Located<String>,
    tag: OnigurumaTag?,
    args: ArgList?
  ) {
    self.args = args
    self.tag = tag
    self.name = name
  }
}

/// An Oniguruma callout 'of contents', written `(?{...}[tag]D)`
public struct OnigurumaOfContents: Hashable {
  /// The direction flag `D` of the callout.
  public enum Direction: Hashable {
    /// Written `>` (the default).
    case inProgress
    /// Written `<`.
    case inRetraction
    /// Written `X`.
    case both
  }

  public var openBraces: SourceLocation
  public var contents: AST.Located<String>
  public var closeBraces: SourceLocation
  public var tag: OnigurumaTag?
  public var direction: AST.Located<Direction>

  /// Whether the direction flag isn't written explicitly in the
  /// source, e.g. `(?{x})` which is implicitly `(?{x}>)`. This is
  /// determined by the direction's source location being empty.
  public var isDirectionImplicit: Bool {
    return direction.location.isEmpty
  }

  /// Creates a callout of contents from its brace locations, contents,
  /// optional tag, and located direction flag.
  public init(
    _ openBraces: SourceLocation,
    _ contents: AST.Located<String>,
    _ closeBraces: SourceLocation,
    tag: OnigurumaTag?,
    direction: AST.Located<Direction>
  ) {
    self.contents = contents
    self.openBraces = openBraces
    self.closeBraces = closeBraces
    self.direction = direction
    self.tag = tag
  }
}
case pcre(PCRE)
case onigurumaNamed(OnigurumaNamed)
case onigurumaOfContents(OnigurumaOfContents)

/// The payload of whichever case this callout is, erased to `Any`.
private var _associatedValue: Any {
  switch self {
  case let .pcre(payload):
    return payload
  case let .onigurumaNamed(payload):
    return payload
  case let .onigurumaOfContents(payload):
    return payload
  }
}

/// Returns this callout's associated value as a `T`, or `nil` if the
/// associated value cannot be cast to `T`.
///
/// - Parameter t: The type to cast to; defaults to the inferred `T`.
func `as`<T>(_ t: T.Type = T.self) -> T? {
  return _associatedValue as? T
}
}
}

extension AST.Atom.Callout {
  /// A tag specifier `[...]` which may appear in an Oniguruma callout.
  public struct OnigurumaTag: Hashable {
    /// The location of the opening bracket.
    public var leftBracket: SourceLocation

    /// The tag's name, with its source location.
    public var name: AST.Located<String>

    /// The location of the closing bracket.
    public var rightBracket: SourceLocation

    /// Creates a tag from its opening bracket location, its located name,
    /// and its closing bracket location.
    public init(
      _ leftBracket: SourceLocation,
      _ name: AST.Located<String>,
      _ rightBracket: SourceLocation
    ) {
      self.name = name
      self.leftBracket = leftBracket
      self.rightBracket = rightBracket
    }
  }
}
Expand Down Expand Up @@ -594,7 +697,7 @@ extension AST {
case .alternation, .concatenation, .group,
.conditional, .quantification, .quote,
.trivia, .customCharacterClass, .empty,
.groupTransform:
.groupTransform, .absentFunction:
return nil
}
}
Expand Down
9 changes: 9 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ extension AST {
quantification.amount.value == .zeroOrOne
? CaptureStructure.optional
: CaptureStructure.array)
case .absentFunction(let abs):
// Only the child of an expression absent function is relevant, as the
// other expressions don't actually get matched against.
switch abs.kind {
case .expression(_, _, let child):
return child.captureStructure
case .clearer, .repeater, .stopper:
return .empty
}
case .quote, .trivia, .atom, .customCharacterClass, .empty:
return .empty
}
Expand Down
41 changes: 32 additions & 9 deletions Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ enum ParseError: Error, Hashable {
case tooManyBranchesInConditional(Int)
case unsupportedCondition(String)

case tooManyAbsentExpressionChildren(Int)

case expectedASCII(Character)

case expectedNonEmptyContents
Expand All @@ -55,10 +57,25 @@ enum ParseError: Error, Hashable {
case emptyProperty

case expectedGroupSpecifier
case expectedGroupName
case groupNameMustBeAlphaNumeric
case groupNameCannotStartWithNumber
case unbalancedEndOfGroup

// Identifier diagnostics.
case expectedIdentifier(IdentifierKind)
case identifierMustBeAlphaNumeric(IdentifierKind)
case identifierCannotStartWithNumber(IdentifierKind)

case cannotRemoveTextSegmentOptions
case expectedCalloutArgument
}

extension IdentifierKind {
  /// A short human-readable noun phrase naming this kind of identifier,
  /// suitable for interpolation into a diagnostic message.
  fileprivate var diagDescription: String {
    let noun: String
    switch self {
    case .groupName:
      noun = "group name"
    case .onigurumaCalloutName:
      noun = "callout name"
    case .onigurumaCalloutTag:
      noun = "callout tag"
    }
    return noun
  }
}

extension ParseError: CustomStringConvertible {
Expand Down Expand Up @@ -96,6 +113,8 @@ extension ParseError: CustomStringConvertible {
return "expected 2 branches in conditional, have \(i)"
case let .unsupportedCondition(str):
return "\(str) cannot be used as condition"
case let .tooManyAbsentExpressionChildren(i):
return "expected 2 expressions in absent expression, have \(i)"
case let .unknownGroupKind(str):
return "unknown group kind '(\(str)'"
case let .unknownCalloutKind(str):
Expand All @@ -116,14 +135,18 @@ extension ParseError: CustomStringConvertible {
return "empty property"
case .expectedGroupSpecifier:
return "expected group specifier"
case .expectedGroupName:
return "expected group name"
case .groupNameMustBeAlphaNumeric:
return "group name must only contain alphanumeric characters"
case .groupNameCannotStartWithNumber:
return "group name must not start with number"
case .unbalancedEndOfGroup:
return "closing ')' does not balance any groups openings"
case .expectedIdentifier(let i):
return "expected \(i.diagDescription)"
case .identifierMustBeAlphaNumeric(let i):
return "\(i.diagDescription) must only contain alphanumeric characters"
case .identifierCannotStartWithNumber(let i):
return "\(i.diagDescription) must not start with number"
case .cannotRemoveTextSegmentOptions:
return "text segment mode cannot be unset, only changed"
case .expectedCalloutArgument:
return "expected argument to callout"
}
}
}
Expand Down
Loading