Skip to content

[swift/main] Optimize search for start-anchored regexes #683

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Sources/RegexBenchmark/Suite/NotFound.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ extension BenchmarkRunner {
baseName: "AnchoredNotFound",
regex: "^ +a",
input: input,
isWhole: true)
includeFirst: true)
anchoredNotFound.register(&self)
}
}
1 change: 1 addition & 0 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ extension Compiler.ByteCodeGen {
// The whole match (`.0` element of output) is equivalent to an implicit
// capture over the entire regex.
try emitNode(.capture(name: nil, reference: nil, root))
builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart()
builder.buildAccept()
return try builder.assemble()
}
Expand Down
6 changes: 5 additions & 1 deletion Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ extension MEProgram {
var captureList = CaptureList()
var initialOptions = MatchingOptions()

// Starting constraint
var canOnlyMatchAtStart = false

// Symbolic reference resolution
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
Expand Down Expand Up @@ -404,7 +407,8 @@ extension MEProgram.Builder {
enableMetrics: enableMetrics,
captureList: captureList,
referencedCaptureOffsets: referencedCaptureOffsets,
initialOptions: initialOptions)
initialOptions: initialOptions,
canOnlyMatchAtStart: canOnlyMatchAtStart)
}

/// Throws away everything accumulated so far by overwriting this builder
/// with a newly initialized one.
mutating func reset() {
  self = .init()
}
Expand Down
1 change: 1 addition & 0 deletions Sources/_StringProcessing/Engine/MEProgram.swift
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct MEProgram {
let referencedCaptureOffsets: [ReferenceID: Int]

var initialOptions: MatchingOptions
var canOnlyMatchAtStart: Bool
}

extension MEProgram: CustomStringConvertible {
Expand Down
110 changes: 110 additions & 0 deletions Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,105 @@ extension DSLTree.Node {
}
}

extension DSLTree.Node {
  /// Implementation for `canOnlyMatchAtStart`, which maintains the option
  /// state.
  ///
  /// For a given specific node, this method can return one of three values:
  ///
  /// - `true`: This node is guaranteed to match only at the start of a subject.
  /// - `false`: This node can match anywhere in the subject.
  /// - `nil`: This node is inconclusive about where it can match.
  ///
  /// In particular, non-required groups, option-setting groups, negative
  /// lookarounds, and lookbehinds are inconclusive about where they can match.
  ///
  /// - Parameter options: The matching options in effect when this node is
  ///   reached. Option-changing atoms update it in place; groups and captures
  ///   open a scope before descending and close it when they return.
  /// - Returns: `true`, `false`, or `nil` as described above.
  private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? {
    switch self {
    // Defining cases
    case .atom(.assertion(.startOfSubject)):
      return true
    case .atom(.assertion(.caretAnchor)):
      // `^` is only a start-of-subject anchor while anchors don't match at
      // newlines (i.e. outside of `(?m)`).
      return !options.anchorsMatchNewlines

    // Changing options doesn't determine `true`/`false`.
    case .atom(.changeMatchingOptions(let sequence)):
      options.apply(sequence.ast)
      return nil

    // Any other atom or consuming node returns `false`.
    case .atom, .customCharacterClass, .quotedLiteral:
      return false

    // Trivia/empty have no effect.
    case .trivia, .empty:
      return nil

    // In an alternation, all of its children must match only at start.
    case .orderedChoice(let children):
      return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true }

    // In a concatenation, the first definitive child provides the answer.
    case .concatenation(let children):
      for child in children {
        if let result = child._canOnlyMatchAtStartImpl(&options) {
          return result
        }
      }
      return false

    // Groups (and other parent nodes) generally defer to the child.
    case .nonCapturingGroup(let kind, let child):
      options.beginScope()
      defer { options.endScope() }
      switch kind.ast {
      case .negativeLookahead, .negativeLookbehind:
        // A negative lookaround succeeds exactly where its child *fails*, so
        // an anchor inside it doesn't pin the regex to the start
        // (e.g. `/(?!^)foo/` matches everywhere except the start).
        return nil
      case .lookbehind, .nonAtomicLookbehind:
        // A lookbehind's child ends at the current position rather than
        // beginning there, so an anchored child doesn't pin the current
        // position to the start (e.g. `/(?<=^a)foo/` matches at offset 1).
        return nil
      case .changeMatchingOptions(let sequence):
        options.apply(sequence)
      default:
        break
      }
      return child._canOnlyMatchAtStartImpl(&options)
    case .capture(_, _, let child, _):
      options.beginScope()
      defer { options.endScope() }
      return child._canOnlyMatchAtStartImpl(&options)
    case .ignoreCapturesInTypedOutput(let child),
        .convertedRegexLiteral(let child, _):
      return child._canOnlyMatchAtStartImpl(&options)

    // A quantification that doesn't require its child to exist can still
    // allow a start-only match. (e.g. `/(foo)?^bar/`)
    case .quantification(let amount, _, let child):
      return amount.requiresAtLeastOne
        ? child._canOnlyMatchAtStartImpl(&options)
        : nil

    // For conditional nodes, both sides must require matching at start.
    case .conditional(_, let child1, let child2):
      return child1._canOnlyMatchAtStartImpl(&options) == true
        && child2._canOnlyMatchAtStartImpl(&options) == true

    // Extended behavior isn't known, so we return `false` for safety.
    case .consumer, .matcher, .characterPredicate, .absentFunction:
      return false
    }
  }

  /// Returns a Boolean value indicating whether the regex with this node as
  /// the root can _only_ match at the start of a subject.
  ///
  /// For example, these regexes can only match at the start of a subject:
  ///
  /// - `/^foo/`
  /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`)
  ///
  /// These can match other places in a subject:
  ///
  /// - `/(^foo)?bar/` (`^` is in an optional group)
  /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`)
  /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`)
  /// - `/(?!^)foo/` (`^` is inside a negative lookahead)
  /// - `/(?<=^a)foo/` (`^` is inside a lookbehind)
  ///
  /// An inconclusive result for the root is reported as `false`.
  internal func canOnlyMatchAtStart() -> Bool {
    var options = MatchingOptions()
    return _canOnlyMatchAtStartImpl(&options) ?? false
  }
}

// MARK: AST wrapper types
//
// These wrapper types are required because even @_spi-marked public APIs can't
Expand Down Expand Up @@ -818,6 +917,17 @@ extension DSLTree {
public static func range(_ lower: Int, _ upper: Int) -> Self {
.init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake)))
}

/// Whether this amount forces the quantified node to match at least once.
///
/// Amounts whose lower bound is zero, or whose lower bound has no value,
/// report `false`.
internal var requiresAtLeastOne: Bool {
  switch ast {
  case .oneOrMore:
    return true
  case .exactly(let minimum), .nOrMore(let minimum), .range(let minimum, _):
    // The bound may carry no value; treat that as "not required".
    guard let count = minimum.value else { return false }
    return count > 0
  case .zeroOrOne, .zeroOrMore, .upToN:
    return false
  }
}
}

@_spi(RegexBuilder)
Expand Down
4 changes: 3 additions & 1 deletion Sources/_StringProcessing/Regex/Match.swift
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,9 @@ extension Regex {
_ input: String,
in subjectBounds: Range<String.Index>
) throws -> Regex<Output>.Match? {
try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
try regex.program.loweredProgram.canOnlyMatchAtStart
? _match(input, in: subjectBounds, mode: .partialFromFront)
: _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
}

func _firstMatch(
Expand Down
51 changes: 50 additions & 1 deletion Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//

import XCTest
import _StringProcessing
@testable import _StringProcessing
import RegexBuilder
import TestSupport

Expand Down Expand Up @@ -973,6 +973,55 @@ class RegexDSLTests: XCTestCase {
}
}

func testCanOnlyMatchAtStart() throws {
  /// Builds a regex from `content` and asserts that the start-anchored flag
  /// of its lowered program equals `expected`.
  func assertStartAnchored(
    _ expected: Bool,
    file: StaticString = #file, line: UInt = #line,
    @RegexComponentBuilder _ content: () -> some RegexComponent
  ) {
    let regex = content().regex
    let isStartAnchored = regex.program.loweredProgram.canOnlyMatchAtStart
    XCTAssertEqual(isStartAnchored, expected, file: file, line: line)
  }

  // A leading start-of-subject anchor.
  assertStartAnchored(true) {
    Anchor.startOfSubject
    "foo"
  }
  // No anchor at all.
  assertStartAnchored(false) {
    "foo"
  }
  // An optional prefix followed by the anchor.
  assertStartAnchored(true) {
    Optionally { "foo" }
    Anchor.startOfSubject
    "bar"
  }

  // Every branch of the choice is anchored to the start of the subject.
  assertStartAnchored(true) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfSubject
        "bar"
      }
    }
  }
  // One branch uses a start-of-line anchor instead.
  assertStartAnchored(false) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfLine
        "bar"
      }
    }
  }
}

func testNestedGroups() throws {
return;

Expand Down
38 changes: 38 additions & 0 deletions Tests/RegexTests/CompileTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -484,4 +484,42 @@ extension RegexTests {
expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
}

func testCanOnlyMatchAtStart() throws {
  // Each row pairs a pattern with the expected start-anchored flag of its
  // compiled program. `#line` is recorded per row so that a failure is
  // reported against the row that produced it.
  let expectations: [(pattern: String, expected: Bool, line: UInt)] = [
    ("^foo", true, #line),          // anchor
    ("\\Afoo", true, #line),        // more specific anchor
    ("foo", false, #line),          // no anchor

    ("(?i)^foo", true, #line),      // unrelated option
    ("(?m)^foo", false, #line),     // anchors match newlines
    ("(?i:^foo)", true, #line),     // unrelated option
    ("(?m:^foo)", false, #line),    // anchors match newlines

    ("(^foo|bar)", false, #line),   // one side of alternation
    ("(foo|^bar)", false, #line),   // other side of alternation
    ("(^foo|^bar)", true, #line),   // both sides of alternation

    // Quantifiers that include the anchor
    ("(^foo)?bar", false, #line),
    ("(^foo)*bar", false, #line),
    ("(^foo)+bar", true, #line),
    ("(?:^foo)+bar", true, #line),

    // Quantifiers before the anchor
    ("(foo)?^bar", true, #line),    // The initial group must match ""
    ("(?:foo)?^bar", true, #line),
    ("(foo)+^bar", false, #line),   // This can't actually match anywhere
  ]

  for (pattern, expected, line) in expectations {
    let regex = try Regex(pattern)
    XCTAssertEqual(
      regex.program.loweredProgram.canOnlyMatchAtStart, expected,
      file: #file, line: line)
  }
}
}