From c2b93abd63366e35140ae20b704aee3acc33afb6 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 11 Dec 2023 13:45:22 -0700 Subject: [PATCH 01/16] Speed up quantification fast paths by unswitching the loop --- .../_StringProcessing/ConsumerInterface.swift | 11 +- .../_StringProcessing/Engine/MEQuantify.swift | 186 +++++++++++++++--- .../_StringProcessing/Engine/Processor.swift | 20 +- 3 files changed, 162 insertions(+), 55 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 808a1e498..c19996d44 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -391,9 +391,8 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in let curIdx = bounds.lowerBound - let nextIndex = isCharacterSemantic - ? input.index(after: curIdx) - : input.unicodeScalars.index(after: curIdx) + let nextIndex = input.index( + after: curIdx, isScalarSemantics: !isCharacterSemantic) // Under grapheme semantics, we compare based on single NFC scalars. If // such a character is not single scalar under NFC, the match fails. In @@ -603,9 +602,9 @@ extension AST.Atom.CharacterProperty { if p(input, bounds) != nil { return nil } // TODO: bounds check - return opts.semanticLevel == .graphemeCluster - ? input.index(after: bounds.lowerBound) - : input.unicodeScalars.index(after: bounds.lowerBound) + return input.index( + after: bounds.lowerBound, + isScalarSemantics: opts.semanticLevel == .unicodeScalar) } } diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index a0480cde6..7e2a1097a 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,5 +1,49 @@ +private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset + extension Processor { - func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { + func _doASCIIBitsetMatch( + _: AsciiBitsetRegister + ) -> Input.Index? { + fatalError() + } +} + + +extension String { + func index(after idx: Index, isScalarSemantics: Bool) -> Index { + if isScalarSemantics { + return unicodeScalars.index(after: idx) + } else { + return index(after: idx) + } + } +} + + +extension Processor { + + internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { + let matched: Bool + switch (payload.quantKind, payload.minTrips, payload.maxExtraTrips) { + case (.reluctant, _, _): + assertionFailure(".reluctant is not supported by .quantify") + // TODO: this was pre-refactoring behavior, should we fatal error + // instead? + return false + case (.eager, 0, nil): + runEagerZeroOrMoreQuantify(payload) + return true + case (.eager, 1, nil): + return runEagerOneOrMoreQuantify(payload) + case (_, 0, 1): + runZeroOrOneQuantify(payload) + return true + default: + return runGeneralQuantify(payload) + } + } + + private func doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { let isScalarSemantics = payload.isScalarSemantics switch payload.type { @@ -31,10 +75,8 @@ extension Processor { guard currentPosition < end else { return nil } if payload.anyMatchesNewline { - if isScalarSemantics { - return input.unicodeScalars.index(after: currentPosition) - } - return input.index(after: currentPosition) + return input.index( + after: currentPosition, isScalarSemantics: isScalarSemantics) } return input.matchAnyNonNewline( @@ -47,14 +89,14 @@ extension Processor { /// Generic quantify instruction interpreter /// - Handles .eager and .posessive /// - Handles arbitrary minTrips and maxExtraTrips - mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { + private mutating func runGeneralQuantify(_ payload: QuantifyPayload) -> Bool { assert(payload.quantKind != .reluctant) var trips = 0 var maxExtraTrips = payload.maxExtraTrips while trips < payload.minTrips { - guard let next = _doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch(payload) else { signalFailure() return false } @@ -67,7 +109,7 @@ extension Processor { return true } - guard let next = _doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch(payload) else { return true } maxExtraTrips = maxExtraTrips.map { $0 - 1 } @@ -81,7 +123,7 @@ extension Processor { while true { if maxExtraTrips == 0 { break } - guard let next = _doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch(payload) else { break } maxExtraTrips = maxExtraTrips.map({$0 - 1}) @@ -100,67 +142,147 @@ extension Processor { } /// Specialized quantify instruction interpreter for `*`, always succeeds - mutating func runEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) { + private mutating func runEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.maxExtraTrips == nil) - _doRunEagerZeroOrMoreQuantify(payload) + _ = doRunEagerZeroOrMoreQuantify(payload) } - // NOTE: So-as to inline into one-or-more call, which makes a significant - // performance difference + // Returns whether it matched at least once + // + // NOTE: inline-always so-as to inline into one-or-more call, which makes a + // significant performance difference @inline(__always) - mutating func _doRunEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) { - guard let next = _doQuantifyMatch(payload) else { - // Consumed no input, no point saved - return - } - + private mutating func doRunEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { // Create a quantified save point for every part of the input matched up // to the final position. + let isScalarSemantics = payload.isScalarSemantics let rangeStart = currentPosition var rangeEnd = currentPosition - currentPosition = next - while true { - guard let next = _doQuantifyMatch(payload) else { break } - rangeEnd = currentPosition - currentPosition = next + var matchedOnce = false + + switch payload.type { + case .asciiBitset: + let bitset = registers[payload.bitset] + while true { + guard let next = input.matchASCIIBitset( + bitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .asciiChar: + let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) + while true { + guard let next = input.matchScalar( + asciiScalar, + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .builtin: + let builtin = payload.builtin + let isInverted = payload.builtinIsInverted + let isStrictASCII = payload.builtinIsStrict + while true { + guard let next = input.matchBuiltinCC( + builtin, + at: currentPosition, + limitedBy: end, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .any: + while true { + guard currentPosition < end else { break } + let next: String.Index? + if payload.anyMatchesNewline { + next = input.index( + after: currentPosition, isScalarSemantics: isScalarSemantics) + } else { + next = input.matchAnyNonNewline( + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + } + + guard let next else { break } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + } + + guard matchedOnce else { + // Consumed no input, no point saved + return false } - savePoints.append(makeQuantifiedSavePoint(rangeStart.. Bool { + private mutating func runEagerOneOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { assert(payload.quantKind == .eager && payload.minTrips == 1 && payload.maxExtraTrips == nil) // Match at least once - guard let next = _doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch(payload) else { signalFailure() return false } // Run `a+` as `aa*` currentPosition = next - _doRunEagerZeroOrMoreQuantify(payload) + doRunEagerZeroOrMoreQuantify(payload) return true } /// Specialized quantify instruction interpreter for ? - mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) -> Bool { + private mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) { assert(payload.minTrips == 0 && payload.maxExtraTrips == 1) - let next = _doQuantifyMatch(payload) + let next = doQuantifyMatch(payload) guard let idx = next else { - return true // matched zero times + return // matched zero times } if payload.quantKind != .possessive { // Save the zero match savePoints.append(makeSavePoint(resumingAt: currentPC+1)) } currentPosition = idx - return true + return } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 86365322b..eccbcff64 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -515,26 +515,12 @@ extension Processor { controller.step() } case .quantify: - let quantPayload = payload.quantify - let matched: Bool - switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.maxExtraTrips) { - case (.reluctant, _, _): - assertionFailure(".reluctant is not supported by .quantify") - return - case (.eager, 0, nil): - runEagerZeroOrMoreQuantify(quantPayload) - matched = true - case (.eager, 1, nil): - matched = runEagerOneOrMoreQuantify(quantPayload) - case (_, 0, 1): - matched = runZeroOrOneQuantify(quantPayload) - default: - matched = runQuantify(quantPayload) - } - if matched { + if runQuantify(payload.quantify) { controller.step() } + + case .consumeBy: let reg = payload.consumer let consumer = registers[reg] From b48f09e1130f9315695e1f177c6a2f8751503ead Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 11 Dec 2023 16:09:19 -0700 Subject: [PATCH 02/16] squash me: cleanup --- .../_StringProcessing/Engine/MEBuiltins.swift | 19 +++++++ .../_StringProcessing/Engine/MEQuantify.swift | 53 ++++--------------- Sources/_StringProcessing/Utility/Misc.swift | 12 +++++ 3 files changed, 41 insertions(+), 43 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 0dafd6720..33b13178b 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -223,6 +223,25 @@ extension String { else { return nil } return next } + + internal func matchRegexDot( + at currentPosition: Index, + limitedBy end: Index, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> Index? { + guard currentPosition < end else { return nil } + + if anyMatchesNewline { + return index( + after: currentPosition, isScalarSemantics: isScalarSemantics) + } + + return matchAnyNonNewline( + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + } } // MARK: - Built-in character class matching diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 7e2a1097a..dfab9e17b 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,27 +1,6 @@ private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset extension Processor { - func _doASCIIBitsetMatch( - _: AsciiBitsetRegister - ) -> Input.Index? { - fatalError() - } -} - - -extension String { - func index(after idx: Index, isScalarSemantics: Bool) -> Index { - if isScalarSemantics { - return unicodeScalars.index(after: idx) - } else { - return index(after: idx) - } - } -} - - -extension Processor { - internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { let matched: Bool switch (payload.quantKind, payload.minTrips, payload.maxExtraTrips) { @@ -61,8 +40,6 @@ extension Processor { boundaryCheck: !isScalarSemantics, isCaseInsensitive: false) case .builtin: - guard currentPosition < end else { return nil } - // We only emit .quantify if it consumes a single character return input.matchBuiltinCC( payload.builtin, @@ -72,16 +49,10 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: isScalarSemantics) case .any: - guard currentPosition < end else { return nil } - - if payload.anyMatchesNewline { - return input.index( - after: currentPosition, isScalarSemantics: isScalarSemantics) - } - - return input.matchAnyNonNewline( + return input.matchRegexDot( at: currentPosition, limitedBy: end, + anyMatchesNewline: payload.anyMatchesNewline, isScalarSemantics: isScalarSemantics) } } @@ -217,20 +188,16 @@ extension Processor { assert(currentPosition > rangeEnd) } case .any: + let anyMatchesNewline = payload.anyMatchesNewline while true { - guard currentPosition < end else { break } - let next: String.Index? - if payload.anyMatchesNewline { - next = input.index( - after: currentPosition, isScalarSemantics: isScalarSemantics) - } else { - next = input.matchAnyNonNewline( - at: currentPosition, - limitedBy: end, - isScalarSemantics: isScalarSemantics) + guard let next = input.matchRegexDot( + at: currentPosition, + limitedBy: end, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + else { + break } - - guard let next else { break } matchedOnce = true rangeEnd = currentPosition currentPosition = next diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 8555ec85c..d63370b55 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -65,3 +65,15 @@ enum QuickResult { case unknown } +extension String { + /// Index after in either grapheme or scalar view + func index(after idx: Index, isScalarSemantics: Bool) -> Index { + if isScalarSemantics { + return unicodeScalars.index(after: idx) + } else { + return index(after: idx) + } + } +} + + From efd4a1dee774dd90462cbec72ffaa93a54962f97 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 11 Dec 2023 16:12:35 -0700 Subject: [PATCH 03/16] comments --- Sources/_StringProcessing/Engine/MEQuantify.swift | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index dfab9e17b..2dcdc2ecb 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -226,6 +226,11 @@ extension Processor { && payload.maxExtraTrips == nil) // Match at least once + // + // NOTE: Due to newline-sequence in scalar-semantic mode advancing two + // positions, we can't just have doRunEagerZeroOrMoreQuantify return the + // range-end and advance the range-start ourselves. Instead, we do one + // call before looping. guard let next = doQuantifyMatch(payload) else { signalFailure() return false From 45b4da39e9a1bbdcb6f381ce9e0aa9b9143ffe71 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 11 Dec 2023 17:38:05 -0700 Subject: [PATCH 04/16] Refactor off of mutating methods Refactors mutating methods into string methods for easier unit testing and parity-checking via assertions. Prepares for more efficient implementations. --- .../_StringProcessing/Engine/MEQuantify.swift | 253 ++++++++++++++---- .../_StringProcessing/Engine/Processor.swift | 2 - 2 files changed, 198 insertions(+), 57 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 2dcdc2ecb..09702f7b4 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,8 +1,20 @@ private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset extension Processor { + private func maybeASCIIBitset( + _ payload: QuantifyPayload + ) -> ASCIIBitset? { + guard payload.type == .asciiBitset else { return nil } + return registers[payload.bitset] + } + internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { - let matched: Bool + let asciiBitset = maybeASCIIBitset(payload) + + // TODO: Refactor below called functions to be non-mutating. + // They might need to communicate save-point info upwards in addition to + // a new (optional) currentPosition. Then, we can assert in testing that the + // specialized functions produce the same answer as `runGeneralQuantify`. switch (payload.quantKind, payload.minTrips, payload.maxExtraTrips) { case (.reluctant, _, _): assertionFailure(".reluctant is not supported by .quantify") @@ -10,30 +22,111 @@ extension Processor { // instead? return false case (.eager, 0, nil): - runEagerZeroOrMoreQuantify(payload) + let (next, savePointRange) = input.runEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next return true case (.eager, 1, nil): - return runEagerOneOrMoreQuantify(payload) + guard let (next, savePointRange) = input.runEagerOneOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + assert(nil == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)) + signalFailure() + return false + } + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + return true case (_, 0, 1): - runZeroOrOneQuantify(payload) + // FIXME: Is this correct for lazy zero-or-one? + let (next, save) = input.runZeroOrOneQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) + // Also, we should assert same answer as runGeneralQuantify... + if save { + savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + } + currentPosition = next return true default: - return runGeneralQuantify(payload) + guard let (next, savePointRange) = input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + assert(nil == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)) + signalFailure() + return false + } + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + + return true } } +} - private func doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { +extension String { + fileprivate func doQuantifyMatch( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> Index? { let isScalarSemantics = payload.isScalarSemantics switch payload.type { case .asciiBitset: - return input.matchASCIIBitset( - registers[payload.bitset], + assert(asciiBitset != nil, "Invariant: needs to be passed in") + return matchASCIIBitset( + asciiBitset!, at: currentPosition, limitedBy: end, isScalarSemantics: isScalarSemantics) case .asciiChar: - return input.matchScalar( + return matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, @@ -41,7 +134,7 @@ extension Processor { isCaseInsensitive: false) case .builtin: // We only emit .quantify if it consumes a single character - return input.matchBuiltinCC( + return matchBuiltinCC( payload.builtin, at: currentPosition, limitedBy: end, @@ -49,7 +142,7 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: isScalarSemantics) case .any: - return input.matchRegexDot( + return matchRegexDot( at: currentPosition, limitedBy: end, anyMatchesNewline: payload.anyMatchesNewline, @@ -60,16 +153,29 @@ extension Processor { /// Generic quantify instruction interpreter /// - Handles .eager and .posessive /// - Handles arbitrary minTrips and maxExtraTrips - private mutating func runGeneralQuantify(_ payload: QuantifyPayload) -> Bool { + fileprivate func runGeneralQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, + at currentPosition: Index, + limitedBy end: Index + ) -> ( + nextPosition: Index, + savePointRange: Range? + )? { assert(payload.quantKind != .reluctant) var trips = 0 var maxExtraTrips = payload.maxExtraTrips + var currentPosition = currentPosition while trips < payload.minTrips { - guard let next = doQuantifyMatch(payload) else { - signalFailure() - return false + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return nil } currentPosition = next trips += 1 @@ -77,11 +183,16 @@ extension Processor { if maxExtraTrips == 0 { // We're done - return true + return (currentPosition, nil) } - guard let next = doQuantifyMatch(payload) else { - return true + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return (currentPosition, nil) } maxExtraTrips = maxExtraTrips.map { $0 - 1 } @@ -94,7 +205,12 @@ extension Processor { while true { if maxExtraTrips == 0 { break } - guard let next = doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { break } maxExtraTrips = maxExtraTrips.map({$0 - 1}) @@ -103,31 +219,43 @@ extension Processor { } if payload.quantKind == .eager { - savePoints.append(makeQuantifiedSavePoint( - rangeStart.. (Index, savePointRange: Range?) { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.maxExtraTrips == nil) - _ = doRunEagerZeroOrMoreQuantify(payload) + return doRunEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) } - // Returns whether it matched at least once - // // NOTE: inline-always so-as to inline into one-or-more call, which makes a // significant performance difference @inline(__always) - private mutating func doRunEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { + private func doRunEagerZeroOrMoreQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, savePointRange: Range?) { // Create a quantified save point for every part of the input matched up // to the final position. + var currentPosition = currentPosition let isScalarSemantics = payload.isScalarSemantics let rangeStart = currentPosition var rangeEnd = currentPosition @@ -135,10 +263,10 @@ extension Processor { switch payload.type { case .asciiBitset: - let bitset = registers[payload.bitset] while true { - guard let next = input.matchASCIIBitset( - bitset, + assert(asciiBitset != nil, "Invariant: needs to be passed in") + guard let next = matchASCIIBitset( + asciiBitset!, at: currentPosition, limitedBy: end, isScalarSemantics: isScalarSemantics) @@ -153,7 +281,7 @@ extension Processor { case .asciiChar: let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) while true { - guard let next = input.matchScalar( + guard let next = matchScalar( asciiScalar, at: currentPosition, limitedBy: end, @@ -172,7 +300,7 @@ extension Processor { let isInverted = payload.builtinIsInverted let isStrictASCII = payload.builtinIsStrict while true { - guard let next = input.matchBuiltinCC( + guard let next = matchBuiltinCC( builtin, at: currentPosition, limitedBy: end, @@ -190,7 +318,7 @@ extension Processor { case .any: let anyMatchesNewline = payload.anyMatchesNewline while true { - guard let next = input.matchRegexDot( + guard let next = matchRegexDot( at: currentPosition, limitedBy: end, anyMatchesNewline: anyMatchesNewline, @@ -207,20 +335,23 @@ extension Processor { guard matchedOnce else { // Consumed no input, no point saved - return false + return (currentPosition, nil) } // NOTE: We can't assert that rangeEnd trails currentPosition by one // position, because newline-sequence in scalar semantic mode still // matches two scalars - savePoints.append(makeQuantifiedSavePoint( - rangeStart.. Bool { + fileprivate func runEagerOneOrMoreQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, savePointRange: Range?)? { assert(payload.quantKind == .eager && payload.minTrips == 1 && payload.maxExtraTrips == nil) @@ -231,30 +362,42 @@ extension Processor { // positions, we can't just have doRunEagerZeroOrMoreQuantify return the // range-end and advance the range-start ourselves. Instead, we do one // call before looping. - guard let next = doQuantifyMatch(payload) else { - signalFailure() - return false + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return nil } // Run `a+` as `aa*` - currentPosition = next - doRunEagerZeroOrMoreQuantify(payload) - return true + return doRunEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: next, + limitedBy: end) } /// Specialized quantify instruction interpreter for ? - private mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) { + fileprivate func runZeroOrOneQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, makeSavePoint: Bool) { assert(payload.minTrips == 0 && payload.maxExtraTrips == 1) - let next = doQuantifyMatch(payload) - guard let idx = next else { - return // matched zero times - } - if payload.quantKind != .possessive { - // Save the zero match - savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return (currentPosition, false) } - currentPosition = idx - return - } + return (next, payload.quantKind != .possessive) + } } + + diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index eccbcff64..310b5d932 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -519,8 +519,6 @@ extension Processor { controller.step() } - - case .consumeBy: let reg = payload.consumer let consumer = registers[reg] From eb6e0e0050cc68e31175d3360f458aa91a014cfc Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 12 Dec 2023 14:05:21 -0700 Subject: [PATCH 05/16] wip: trying to generalize and make fast enough --- .../_StringProcessing/Engine/MEQuantify.swift | 127 ++++++++++++------ 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 09702f7b4..f6813ff61 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -64,6 +64,32 @@ extension Processor { } currentPosition = next return true + case (.eager, _, nil): + guard let (next, savePointRange) = input.runEagerNOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + assert(nil == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)) + signalFailure() + return false + } + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + return true case (_, 0, 1): // FIXME: Is this correct for lazy zero-or-one? let (next, save) = input.runZeroOrOneQuantify( @@ -84,19 +110,9 @@ extension Processor { at: currentPosition, limitedBy: end ) else { - assert(nil == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)) signalFailure() return false } - assert((next, savePointRange) == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)!) if let savePointRange { savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) @@ -237,29 +253,42 @@ extension String { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.maxExtraTrips == nil) - return doRunEagerZeroOrMoreQuantify( + guard let res = _runEagerNOrMoreQuantify( payload, + minTrips: 0, asciiBitset: asciiBitset, at: currentPosition, - limitedBy: end) + limitedBy: end + ) else { + fatalError("Unreachable: zero-or-more always succeeds") + } + + return res } - // NOTE: inline-always so-as to inline into one-or-more call, which makes a - // significant performance difference + /// Specialized n-or-more eager quantification interpreter + /// + /// NOTE: inline always makes a huge perf difference for zero-or-more case @inline(__always) - private func doRunEagerZeroOrMoreQuantify( + fileprivate func _runEagerNOrMoreQuantify( _ payload: QuantifyPayload, + minTrips: UInt64, asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index - ) -> (Index, savePointRange: Range?) { + ) -> (Index, savePointRange: Range?)? { + assert(payload.quantKind == .eager) + assert(payload.maxExtraTrips == nil) + assert(minTrips == payload.minTrips) + // Create a quantified save point for every part of the input matched up // to the final position. var currentPosition = currentPosition let isScalarSemantics = payload.isScalarSemantics - let rangeStart = currentPosition + var rangeStart = currentPosition var rangeEnd = currentPosition - var matchedOnce = false + + var numMatches = 0 switch payload.type { case .asciiBitset: @@ -273,7 +302,10 @@ extension String { else { break } - matchedOnce = true + numMatches += 1 + if numMatches == minTrips { + rangeStart = next + } rangeEnd = currentPosition currentPosition = next assert(currentPosition > rangeEnd) @@ -290,7 +322,10 @@ extension String { else { break } - matchedOnce = true + numMatches += 1 + if numMatches == minTrips { + rangeStart = next + } rangeEnd = currentPosition currentPosition = next assert(currentPosition > rangeEnd) @@ -310,7 +345,10 @@ extension String { else { break } - matchedOnce = true + numMatches += 1 + if numMatches == minTrips { + rangeStart = next + } rangeEnd = currentPosition currentPosition = next assert(currentPosition > rangeEnd) @@ -326,17 +364,25 @@ extension String { else { break } - matchedOnce = true + numMatches += 1 + if numMatches == minTrips { + rangeStart = next + } rangeEnd = currentPosition currentPosition = next assert(currentPosition > rangeEnd) } } - guard matchedOnce else { + guard numMatches >= minTrips else { + return nil + } + + guard numMatches > minTrips else { // Consumed no input, no point saved return (currentPosition, nil) } + assert(rangeStart <= rangeEnd) // NOTE: We can't assert that rangeEnd trails currentPosition by one // position, because newline-sequence in scalar semantic mode still @@ -346,36 +392,39 @@ extension String { } /// Specialized quantify instruction interpreter for `+` - fileprivate func runEagerOneOrMoreQuantify( + fileprivate func runEagerNOrMoreQuantify( _ payload: QuantifyPayload, asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index ) -> (Index, savePointRange: Range?)? { assert(payload.quantKind == .eager - && payload.minTrips == 1 && payload.maxExtraTrips == nil) - // Match at least once - // - // NOTE: Due to newline-sequence in scalar-semantic mode advancing two - // positions, we can't just have doRunEagerZeroOrMoreQuantify return the - // range-end and advance the range-start ourselves. Instead, we do one - // call before looping. - guard let next = doQuantifyMatch( + return _runEagerNOrMoreQuantify( payload, + minTrips: payload.minTrips, asciiBitset: asciiBitset, at: currentPosition, - limitedBy: end - ) else { - return nil - } + limitedBy: end) + } + + /// Specialized quantify instruction interpreter for `+` + fileprivate func runEagerOneOrMoreQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, savePointRange: Range?)? { + assert(payload.quantKind == .eager + && payload.minTrips == 1 + && payload.maxExtraTrips == nil) - // Run `a+` as `aa*` - return doRunEagerZeroOrMoreQuantify( + return _runEagerNOrMoreQuantify( payload, + minTrips: 1, asciiBitset: asciiBitset, - at: next, + at: currentPosition, limitedBy: end) } From 3fe6fe273112445b6be5b735b3daf29193dd9dbf Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 12 Dec 2023 15:10:09 -0700 Subject: [PATCH 06/16] wip: on the way to total un-switching while converting to non-mutating --- .../_StringProcessing/Engine/MEQuantify.swift | 129 ++++-------------- 1 file changed, 29 insertions(+), 100 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index f6813ff61..2446a1801 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -21,69 +21,44 @@ extension Processor { // TODO: this was pre-refactoring behavior, should we fatal error // instead? return false - case (.eager, 0, nil): - let (next, savePointRange) = input.runEagerZeroOrMoreQuantify( + case (_, 0, nil): + let (next, savePointRange) = input.runZeroOrMoreQuantify( payload, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end) - assert((next, savePointRange) == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)!) if let savePointRange { savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) } currentPosition = next return true - case (.eager, 1, nil): + case (_, 1, nil): guard let (next, savePointRange) = input.runEagerOneOrMoreQuantify( payload, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end ) else { - assert(nil == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)) signalFailure() return false } - assert((next, savePointRange) == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)!) if let savePointRange { savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) } currentPosition = next return true - case (.eager, _, nil): - guard let (next, savePointRange) = input.runEagerNOrMoreQuantify( + case (_, _, nil): + guard let (next, savePointRange) = input.runNOrMoreQuantify( payload, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end ) else { - assert(nil == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)) signalFailure() return false } - assert((next, savePointRange) == input.runGeneralQuantify( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end)!) if let savePointRange { savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) @@ -180,82 +155,35 @@ extension String { )? { assert(payload.quantKind != .reluctant) - var trips = 0 - var maxExtraTrips = payload.maxExtraTrips - var currentPosition = currentPosition - - while trips < payload.minTrips { - guard let next = doQuantifyMatch( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end - ) else { - return nil - } - currentPosition = next - trips += 1 - } - - if maxExtraTrips == 0 { - // We're done - return (currentPosition, nil) + let minTrips = payload.minTrips + let maxTrips: UInt64 + if let maxExtraTrips = payload.maxExtraTrips { + maxTrips = payload.minTrips + maxExtraTrips + } else { + maxTrips = UInt64.max } - guard let next = doQuantifyMatch( + return _runEagerNOrMoreQuantify( payload, + minTrips: minTrips, + maxTrips: maxTrips, asciiBitset: asciiBitset, at: currentPosition, - limitedBy: end - ) else { - return (currentPosition, nil) - } - maxExtraTrips = maxExtraTrips.map { $0 - 1 } - - // Remember the range of valid positions in case we can create a quantified - // save point - let rangeStart = currentPosition - var rangeEnd = currentPosition - currentPosition = next - - while true { - if maxExtraTrips == 0 { break } - - guard let next = doQuantifyMatch( - payload, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end - ) else { - break - } - maxExtraTrips = maxExtraTrips.map({$0 - 1}) - rangeEnd = currentPosition - currentPosition = next - } - - if payload.quantKind == .eager { - return (currentPosition, rangeStart.. (Index, savePointRange: Range?) { - assert(payload.quantKind == .eager - && payload.minTrips == 0 - && payload.maxExtraTrips == nil) + assert(payload.minTrips == 0 && payload.maxExtraTrips == nil) guard let res = _runEagerNOrMoreQuantify( payload, minTrips: 0, + maxTrips: UInt64.max, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end @@ -273,13 +201,13 @@ extension String { fileprivate func _runEagerNOrMoreQuantify( _ payload: QuantifyPayload, minTrips: UInt64, + maxTrips: UInt64, asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index ) -> (Index, savePointRange: Range?)? { - assert(payload.quantKind == .eager) - assert(payload.maxExtraTrips == nil) assert(minTrips == payload.minTrips) + assert(minTrips + (payload.maxExtraTrips ?? UInt64.max - minTrips) == maxTrips) // Create a quantified save point for every part of the input matched up // to the final position. @@ -292,7 +220,7 @@ extension String { switch payload.type { case .asciiBitset: - while true { + while numMatches < maxTrips { assert(asciiBitset != nil, "Invariant: needs to be passed in") guard let next = matchASCIIBitset( asciiBitset!, @@ -312,7 +240,7 @@ extension String { } case .asciiChar: let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) - while true { + while numMatches < maxTrips { guard let next = matchScalar( asciiScalar, at: currentPosition, @@ -334,7 +262,7 @@ extension String { let builtin = payload.builtin let isInverted = payload.builtinIsInverted let isStrictASCII = payload.builtinIsStrict - while true { + while numMatches < maxTrips { guard let next = matchBuiltinCC( builtin, at: currentPosition, @@ -355,7 +283,7 @@ extension String { } case .any: let anyMatchesNewline = payload.anyMatchesNewline - while true { + while numMatches < maxTrips { guard let next = matchRegexDot( at: currentPosition, limitedBy: end, @@ -378,7 +306,7 @@ extension String { return nil } - guard numMatches > minTrips else { + guard payload.quantKind == .eager && numMatches > minTrips else { // Consumed no input, no point saved return (currentPosition, nil) } @@ -392,18 +320,18 @@ extension String { } /// Specialized quantify instruction interpreter for `+` - fileprivate func runEagerNOrMoreQuantify( + fileprivate func runNOrMoreQuantify( _ payload: QuantifyPayload, asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index ) -> (Index, savePointRange: Range?)? { - assert(payload.quantKind == .eager - && payload.maxExtraTrips == nil) + assert(payload.maxExtraTrips == nil) return _runEagerNOrMoreQuantify( payload, minTrips: payload.minTrips, + maxTrips: UInt64.max, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end) @@ -423,6 +351,7 @@ extension String { return _runEagerNOrMoreQuantify( payload, minTrips: 1, + maxTrips: UInt64.max, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end) From 1687315eaaf4f3ccedbb48d2e0bec90c4308c382 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 12 Dec 2023 15:21:12 -0700 Subject: [PATCH 07/16] wip: getting closer --- .../_StringProcessing/Engine/MEQuantify.swift | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 2446a1801..f6a97884b 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -34,7 +34,7 @@ extension Processor { currentPosition = next return true case (_, 1, nil): - guard let (next, savePointRange) = input.runEagerOneOrMoreQuantify( + guard let (next, savePointRange) = input.runOneOrMoreQuantify( payload, asciiBitset: asciiBitset, at: currentPosition, @@ -67,14 +67,14 @@ extension Processor { return true case (_, 0, 1): // FIXME: Is this correct for lazy zero-or-one? - let (next, save) = input.runZeroOrOneQuantify( + let (next, savePointRange) = input.runZeroOrOneQuantify( payload, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end) - // Also, we should assert same answer as runGeneralQuantify... - if save { - savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) } currentPosition = next return true @@ -163,7 +163,7 @@ extension String { maxTrips = UInt64.max } - return _runEagerNOrMoreQuantify( + return _runNOrMoreQuantify( payload, minTrips: minTrips, maxTrips: maxTrips, @@ -180,7 +180,7 @@ extension String { limitedBy end: Index ) -> (Index, savePointRange: Range?) { assert(payload.minTrips == 0 && payload.maxExtraTrips == nil) - guard let res = _runEagerNOrMoreQuantify( + guard let res = _runNOrMoreQuantify( payload, minTrips: 0, maxTrips: UInt64.max, @@ -198,7 +198,7 @@ extension String { /// /// NOTE: inline always makes a huge perf difference for zero-or-more case @inline(__always) - fileprivate func _runEagerNOrMoreQuantify( + fileprivate func _runNOrMoreQuantify( _ payload: QuantifyPayload, minTrips: UInt64, maxTrips: UInt64, @@ -207,7 +207,7 @@ extension String { limitedBy end: Index ) -> (Index, savePointRange: Range?)? { assert(minTrips == payload.minTrips) - assert(minTrips + (payload.maxExtraTrips ?? UInt64.max - minTrips) == maxTrips) + assert(minTrips + (payload.maxExtraTrips ?? (UInt64.max - minTrips)) == maxTrips) // Create a quantified save point for every part of the input matched up // to the final position. @@ -328,7 +328,7 @@ extension String { ) -> (Index, savePointRange: Range?)? { assert(payload.maxExtraTrips == nil) - return _runEagerNOrMoreQuantify( + return _runNOrMoreQuantify( payload, minTrips: payload.minTrips, maxTrips: UInt64.max, @@ -338,17 +338,15 @@ extension String { } /// Specialized quantify instruction interpreter for `+` - fileprivate func runEagerOneOrMoreQuantify( + fileprivate func runOneOrMoreQuantify( _ payload: QuantifyPayload, asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index ) -> (Index, savePointRange: Range?)? { - assert(payload.quantKind == .eager - && payload.minTrips == 1 - && payload.maxExtraTrips == nil) + assert(payload.minTrips == 1 && payload.maxExtraTrips == nil) - return _runEagerNOrMoreQuantify( + return _runNOrMoreQuantify( payload, minTrips: 1, maxTrips: UInt64.max, @@ -363,18 +361,21 @@ extension String { asciiBitset: ASCIIBitset?, // Necessary ugliness... at currentPosition: Index, limitedBy end: Index - ) -> (Index, makeSavePoint: Bool) { - assert(payload.minTrips == 0 - && payload.maxExtraTrips == 1) - guard let next = doQuantifyMatch( + ) -> (Index, savePointRange: Range?) { + assert(payload.minTrips == 0 && payload.maxExtraTrips == 1) + + guard let res = _runNOrMoreQuantify( payload, + minTrips: 0, + maxTrips: 1, asciiBitset: asciiBitset, at: currentPosition, limitedBy: end ) else { - return (currentPosition, false) + fatalError("Unreachable: zero-or-more always succeeds") } - return (next, payload.quantKind != .possessive) + + return res } } From 15706a4a7751b6e7952d71ab29197b3344503e35 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 13 Dec 2023 12:04:19 -0700 Subject: [PATCH 08/16] wip --- .../_StringProcessing/Engine/MEQuantify.swift | 90 +++++++++++-------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index f6a97884b..1f2aa63b5 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,3 +1,4 @@ +@_implementationOnly import _RegexParser private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset extension Processor { @@ -100,45 +101,56 @@ extension Processor { } extension String { - fileprivate func doQuantifyMatch( - _ payload: QuantifyPayload, - asciiBitset: ASCIIBitset?, // Necessary ugliness... + fileprivate func matchQuantifiedASCIIBitset( + _ asciiBitset: ASCIIBitset, at currentPosition: Index, - limitedBy end: Index - ) -> Index? { - let isScalarSemantics = payload.isScalarSemantics + limitedBy end: Index, + minMatches: UInt64, + maxMatches: UInt64, + quantificationKind: AST.Quantification.Kind, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + // Create a quantified save point for every part of the input + // (after minTrips) matched up to the final position. + var currentPosition = currentPosition + var rangeStart = currentPosition + var rangeEnd = currentPosition - switch payload.type { - case .asciiBitset: - assert(asciiBitset != nil, "Invariant: needs to be passed in") - return matchASCIIBitset( - asciiBitset!, - at: currentPosition, - limitedBy: end, - isScalarSemantics: isScalarSemantics) - case .asciiChar: - return matchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), - at: currentPosition, - limitedBy: end, - boundaryCheck: !isScalarSemantics, - isCaseInsensitive: false) - case .builtin: - // We only emit .quantify if it consumes a single character - return matchBuiltinCC( - payload.builtin, - at: currentPosition, - limitedBy: end, - isInverted: payload.builtinIsInverted, - isStrictASCII: payload.builtinIsStrict, - isScalarSemantics: isScalarSemantics) - case .any: - return matchRegexDot( + var numMatches = 0 + + while numMatches < maxMatches { + guard let next = matchASCIIBitset( + asciiBitset, at: currentPosition, limitedBy: end, - anyMatchesNewline: payload.anyMatchesNewline, isScalarSemantics: isScalarSemantics) + else { + break + } + numMatches &+= 1 + if numMatches == minMatches { + rangeStart = next + } + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) } + + guard numMatches >= minMatches else { + return nil + } + + guard quantificationKind == .eager && numMatches > minMatches else { + // Consumed no input, no point saved + return (currentPosition, nil) + } + assert(rangeStart <= rangeEnd) + + // NOTE: We can't assert that rangeEnd trails currentPosition by one + // position, because newline-sequence in scalar semantic mode still + // matches two scalars + + return (currentPosition, rangeStart.. Date: Wed, 13 Dec 2023 12:18:30 -0700 Subject: [PATCH 09/16] wip --- .../Engine/InstPayload.swift | 8 +++ .../_StringProcessing/Engine/MEQuantify.swift | 57 +++++++++---------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 78baf9ce1..c4522a219 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -408,6 +408,14 @@ struct QuantifyPayload: RawRepresentable { var typeMask: UInt64 { 7 } var payloadMask: UInt64 { 0xFF_FF } + // Calculate the maximum number of trips, else UInt64.max if unbounded + var maxTrips: UInt64 { + guard let maxExtraTrips else { + return UInt64.max + } + return minTrips + maxExtraTrips + } + static func packInfoValues( _ kind: AST.Quantification.Kind, _ minTrips: Int, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 1f2aa63b5..28019dbc8 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -2,15 +2,27 @@ private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset extension Processor { - private func maybeASCIIBitset( - _ payload: QuantifyPayload - ) -> ASCIIBitset? { - guard payload.type == .asciiBitset else { return nil } - return registers[payload.bitset] - } - internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { - let asciiBitset = maybeASCIIBitset(payload) + if payload.type == .asciiBitset { + guard let (next, savePointRange) = input.matchQuantifiedASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + minMatches: payload.minTrips, + maxMatches: payload.maxTrips, + quantificationKind: payload.quantKind, + isScalarSemantics: payload.isScalarSemantics + ) else { + signalFailure() + return false + } + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + return true + } // TODO: Refactor below called functions to be non-mutating. // They might need to communicate save-point info upwards in addition to @@ -25,7 +37,7 @@ extension Processor { case (_, 0, nil): let (next, savePointRange) = input.runZeroOrMoreQuantify( payload, - asciiBitset: asciiBitset, + asciiBitset: nil, at: currentPosition, limitedBy: end) if let savePointRange { @@ -37,7 +49,7 @@ extension Processor { case (_, 1, nil): guard let (next, savePointRange) = input.runOneOrMoreQuantify( payload, - asciiBitset: asciiBitset, + asciiBitset: nil, at: currentPosition, limitedBy: end ) else { @@ -53,7 +65,7 @@ extension Processor { case (_, _, nil): guard let (next, savePointRange) = input.runNOrMoreQuantify( payload, - asciiBitset: asciiBitset, + asciiBitset: nil, at: currentPosition, limitedBy: end ) else { @@ -70,7 +82,7 @@ extension Processor { // FIXME: Is this correct for lazy zero-or-one? let (next, savePointRange) = input.runZeroOrOneQuantify( payload, - asciiBitset: asciiBitset, + asciiBitset: nil, at: currentPosition, limitedBy: end) if let savePointRange { @@ -82,7 +94,7 @@ extension Processor { default: guard let (next, savePointRange) = input.runGeneralQuantify( payload, - asciiBitset: asciiBitset, + asciiBitset: nil, at: currentPosition, limitedBy: end ) else { @@ -232,24 +244,7 @@ extension String { switch payload.type { case .asciiBitset: - while numMatches < maxTrips { - assert(asciiBitset != nil, "Invariant: needs to be passed in") - guard let next = matchASCIIBitset( - asciiBitset!, - at: currentPosition, - limitedBy: end, - isScalarSemantics: isScalarSemantics) - else { - break - } - numMatches &+= 1 - if numMatches == minTrips { - rangeStart = next - } - rangeEnd = currentPosition - currentPosition = next - assert(currentPosition > rangeEnd) - } + fatalError("handled above") case .asciiChar: let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) while numMatches < maxTrips { From e5ee8459bcad288f7e2506b35b214ba4632fb8a2 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 13 Dec 2023 19:28:07 -0700 Subject: [PATCH 10/16] all converted to string methods --- .../_StringProcessing/Engine/MEQuantify.swift | 473 ++++++------------ 1 file changed, 159 insertions(+), 314 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 28019dbc8..d8b39f6eb 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,129 +1,91 @@ -@_implementationOnly import _RegexParser private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset extension Processor { internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { - if payload.type == .asciiBitset { - guard let (next, savePointRange) = input.matchQuantifiedASCIIBitset( + assert(payload.quantKind != .reluctant, ".reluctant is not supported by .quantify") + + let minMatches = payload.minTrips + let maxMatches = payload.maxTrips + let produceSavePointRange = payload.quantKind == .eager + let isScalarSemantics = payload.isScalarSemantics + + let matchResult: (next: String.Index, savePointRange: Range?)? + switch payload.type { + case .asciiBitset: + matchResult = input.matchQuantifiedASCIIBitset( registers[payload.bitset], at: currentPosition, limitedBy: end, - minMatches: payload.minTrips, - maxMatches: payload.maxTrips, - quantificationKind: payload.quantKind, - isScalarSemantics: payload.isScalarSemantics - ) else { - signalFailure() - return false - } - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next - return true - } + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) - // TODO: Refactor below called functions to be non-mutating. - // They might need to communicate save-point info upwards in addition to - // a new (optional) currentPosition. Then, we can assert in testing that the - // specialized functions produce the same answer as `runGeneralQuantify`. - switch (payload.quantKind, payload.minTrips, payload.maxExtraTrips) { - case (.reluctant, _, _): - assertionFailure(".reluctant is not supported by .quantify") - // TODO: this was pre-refactoring behavior, should we fatal error - // instead? - return false - case (_, 0, nil): - let (next, savePointRange) = input.runZeroOrMoreQuantify( - payload, - asciiBitset: nil, - at: currentPosition, - limitedBy: end) - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next - return true - case (_, 1, nil): - guard let (next, savePointRange) = input.runOneOrMoreQuantify( - payload, - asciiBitset: nil, - at: currentPosition, - limitedBy: end - ) else { - signalFailure() - return false - } - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next - return true - case (_, _, nil): - guard let (next, savePointRange) = input.runNOrMoreQuantify( - payload, - asciiBitset: nil, + case .asciiChar: + matchResult = input.matchQuantifiedScalar( + Unicode.Scalar(payload.asciiChar), at: currentPosition, - limitedBy: end - ) else { - signalFailure() - return false - } - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next - return true - case (_, 0, 1): - // FIXME: Is this correct for lazy zero-or-one? - let (next, savePointRange) = input.runZeroOrOneQuantify( - payload, - asciiBitset: nil, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + + case .any: + matchResult = input.matchQuantifiedRegexDot( at: currentPosition, - limitedBy: end) - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next - return true - default: - guard let (next, savePointRange) = input.runGeneralQuantify( - payload, - asciiBitset: nil, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics, + anyMatchesNewline: payload.anyMatchesNewline) + + case .builtin: + matchResult = input.matchQuantifiedBuiltinCC( + payload.builtin, at: currentPosition, - limitedBy: end - ) else { - signalFailure() - return false - } - if let savePointRange { - savePoints.append(makeQuantifiedSavePoint( - savePointRange, isScalarSemantics: payload.isScalarSemantics)) - } - currentPosition = next + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + } - return true + guard let (next, savePointRange) = matchResult else { + signalFailure() + return false + } + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) } + currentPosition = next + return true } } +/// MARK: - Non-reluctant quantification operations on String + extension String { - fileprivate func matchQuantifiedASCIIBitset( - _ asciiBitset: ASCIIBitset, + /// Run the quant loop, using the supplied matching closure + /// + /// NOTE: inline-always to help elimiate the closure overhead, + /// simplify some of the looping structure, etc. + @inline(__always) + fileprivate func _runQuantLoop( at currentPosition: Index, limitedBy end: Index, minMatches: UInt64, maxMatches: UInt64, - quantificationKind: AST.Quantification.Kind, - isScalarSemantics: Bool + produceSavePointRange: Bool, + isScalarSemantics: Bool, + _ doMatch: ( + _ currentPosition: Index, _ limitedBy: Index, _ isScalarSemantics: Bool + ) -> Index? ) -> (next: Index, savePointRange: Range?)? { - // Create a quantified save point for every part of the input - // (after minTrips) matched up to the final position. var currentPosition = currentPosition var rangeStart = currentPosition var rangeEnd = currentPosition @@ -131,12 +93,9 @@ extension String { var numMatches = 0 while numMatches < maxMatches { - guard let next = matchASCIIBitset( - asciiBitset, - at: currentPosition, - limitedBy: end, - isScalarSemantics: isScalarSemantics) - else { + guard let next = doMatch( + currentPosition, end, isScalarSemantics + ) else { break } numMatches &+= 1 @@ -152,7 +111,7 @@ extension String { return nil } - guard quantificationKind == .eager && numMatches > minMatches else { + guard produceSavePointRange && numMatches > minMatches else { // Consumed no input, no point saved return (currentPosition, nil) } @@ -162,228 +121,114 @@ extension String { // position, because newline-sequence in scalar semantic mode still // matches two scalars - return (currentPosition, rangeStart.. ( - nextPosition: Index, - savePointRange: Range? - )? { - assert(payload.quantKind != .reluctant) - - let minTrips = payload.minTrips - let maxTrips: UInt64 - if let maxExtraTrips = payload.maxExtraTrips { - maxTrips = payload.minTrips + maxExtraTrips - } else { - maxTrips = UInt64.max - } - - return _runNOrMoreQuantify( - payload, - minTrips: minTrips, - maxTrips: maxTrips, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end) + return (currentPosition, rangeStart.. (Index, savePointRange: Range?) { - assert(payload.minTrips == 0 && payload.maxExtraTrips == nil) - guard let res = _runNOrMoreQuantify( - payload, - minTrips: 0, - maxTrips: UInt64.max, - asciiBitset: asciiBitset, + limitedBy end: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( at: currentPosition, - limitedBy: end - ) else { - fatalError("Unreachable: zero-or-more always succeeds") + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) } - - return res } - /// Specialized n-or-more eager quantification interpreter - /// - /// NOTE: inline always makes a huge perf difference for zero-or-more case - @inline(__always) - fileprivate func _runNOrMoreQuantify( - _ payload: QuantifyPayload, - minTrips: UInt64, - maxTrips: UInt64, - asciiBitset: ASCIIBitset?, // Necessary ugliness... + fileprivate func matchQuantifiedScalar( + _ scalar: Unicode.Scalar, at currentPosition: Index, - limitedBy end: Index - ) -> (Index, savePointRange: Range?)? { - assert(minTrips == payload.minTrips) - assert(minTrips + (payload.maxExtraTrips ?? (UInt64.max - minTrips)) == maxTrips) - - // Create a quantified save point for every part of the input - // (after minTrips) matched up to the final position. - var currentPosition = currentPosition - let isScalarSemantics = payload.isScalarSemantics - var rangeStart = currentPosition - var rangeEnd = currentPosition - - var numMatches = 0 - - switch payload.type { - case .asciiBitset: - fatalError("handled above") - case .asciiChar: - let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) - while numMatches < maxTrips { - guard let next = matchScalar( - asciiScalar, - at: currentPosition, - limitedBy: end, - boundaryCheck: !isScalarSemantics, - isCaseInsensitive: false) - else { - break - } - numMatches &+= 1 - if numMatches == minTrips { - rangeStart = next - } - rangeEnd = currentPosition - currentPosition = next - assert(currentPosition > rangeEnd) - } - case .builtin: - let builtin = payload.builtin - let isInverted = payload.builtinIsInverted - let isStrictASCII = payload.builtinIsStrict - while numMatches < maxTrips { - guard let next = matchBuiltinCC( - builtin, - at: currentPosition, - limitedBy: end, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics) - else { - break - } - numMatches &+= 1 - if numMatches == minTrips { - rangeStart = next - } - rangeEnd = currentPosition - currentPosition = next - assert(currentPosition > rangeEnd) - } - case .any: - let anyMatchesNewline = payload.anyMatchesNewline - while numMatches < maxTrips { - guard let next = matchRegexDot( - at: currentPosition, - limitedBy: end, - anyMatchesNewline: anyMatchesNewline, - isScalarSemantics: isScalarSemantics) - else { - break - } - numMatches &+= 1 - if numMatches == minTrips { - rangeStart = next - } - rangeEnd = currentPosition - currentPosition = next - assert(currentPosition > rangeEnd) - } - } + limitedBy end: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchScalar( + scalar, + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) - guard numMatches >= minTrips else { - return nil } - - guard payload.quantKind == .eager && numMatches > minTrips else { - // Consumed no input, no point saved - return (currentPosition, nil) - } - assert(rangeStart <= rangeEnd) - - // NOTE: We can't assert that rangeEnd trails currentPosition by one - // position, because newline-sequence in scalar semantic mode still - // matches two scalars - - return (currentPosition, rangeStart.. (Index, savePointRange: Range?)? { - assert(payload.maxExtraTrips == nil) - - return _runNOrMoreQuantify( - payload, - minTrips: payload.minTrips, - maxTrips: UInt64.max, - asciiBitset: asciiBitset, - at: currentPosition, - limitedBy: end) - } - - /// Specialized quantify instruction interpreter for `+` - fileprivate func runOneOrMoreQuantify( - _ payload: QuantifyPayload, - asciiBitset: ASCIIBitset?, // Necessary ugliness... - at currentPosition: Index, - limitedBy end: Index - ) -> (Index, savePointRange: Range?)? { - assert(payload.minTrips == 1 && payload.maxExtraTrips == nil) - - return _runNOrMoreQuantify( - payload, - minTrips: 1, - maxTrips: UInt64.max, - asciiBitset: asciiBitset, + limitedBy end: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( at: currentPosition, - limitedBy: end) + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: end, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } } - /// Specialized quantify instruction interpreter for ? - fileprivate func runZeroOrOneQuantify( - _ payload: QuantifyPayload, - asciiBitset: ASCIIBitset?, // Necessary ugliness... + fileprivate func matchQuantifiedRegexDot( at currentPosition: Index, - limitedBy end: Index - ) -> (Index, savePointRange: Range?) { - assert(payload.minTrips == 0 && payload.maxExtraTrips == 1) - - guard let res = _runNOrMoreQuantify( - payload, - minTrips: 0, - maxTrips: 1, - asciiBitset: asciiBitset, + limitedBy end: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool, + anyMatchesNewline: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( at: currentPosition, - limitedBy: end - ) else { - fatalError("Unreachable: zero-or-more always succeeds") + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchRegexDot( + at: currentPosition, + limitedBy: end, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) } - - return res - } + } } From 2d7a3914acd744e64ed65929a6b6fba985b815a4 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 13 Dec 2023 20:07:40 -0700 Subject: [PATCH 11/16] wip --- .../_StringProcessing/Engine/MEQuantify.swift | 339 ++++++++++++++++-- 1 file changed, 302 insertions(+), 37 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index d8b39f6eb..8d62215ba 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,49 +9,120 @@ extension Processor { let produceSavePointRange = payload.quantKind == .eager let isScalarSemantics = payload.isScalarSemantics + let isZeroOrMore = payload.minTrips == 0 && payload.maxExtraTrips == nil + let isOneOrMore = payload.minTrips == 1 && payload.maxExtraTrips == nil + let matchResult: (next: String.Index, savePointRange: Range?)? switch payload.type { case .asciiBitset: - matchResult = input.matchQuantifiedASCIIBitset( - registers[payload.bitset], - at: currentPosition, - limitedBy: end, - minMatches: minMatches, - maxMatches: maxMatches, - produceSavePointRange: produceSavePointRange, - isScalarSemantics: isScalarSemantics) + if isZeroOrMore { + matchResult = input.matchZeroOrMoreASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.matchOneOrMoreASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.matchQuantifiedASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } case .asciiChar: - matchResult = input.matchQuantifiedScalar( - Unicode.Scalar(payload.asciiChar), - at: currentPosition, - limitedBy: end, - minMatches: minMatches, - maxMatches: maxMatches, - produceSavePointRange: produceSavePointRange, - isScalarSemantics: isScalarSemantics) + if isZeroOrMore { + matchResult = input.matchZeroOrMoreScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.matchOneOrMoreScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.matchQuantifiedScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } case .any: - matchResult = input.matchQuantifiedRegexDot( - at: currentPosition, - limitedBy: end, - minMatches: minMatches, - maxMatches: maxMatches, - produceSavePointRange: produceSavePointRange, - isScalarSemantics: isScalarSemantics, - anyMatchesNewline: payload.anyMatchesNewline) + if isZeroOrMore { + matchResult = input.matchZeroOrMoreRegexDot( + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.matchOneOrMoreRegexDot( + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.matchQuantifiedRegexDot( + at: currentPosition, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } case .builtin: - matchResult = input.matchQuantifiedBuiltinCC( - payload.builtin, - at: currentPosition, - limitedBy: end, - minMatches: minMatches, - maxMatches: maxMatches, - produceSavePointRange: produceSavePointRange, - isInverted: payload.builtinIsInverted, - isStrictASCII: payload.builtinIsStrict, - isScalarSemantics: isScalarSemantics) + if isZeroOrMore { + matchResult = input.matchZeroOrMoreBuiltinCC( + payload.builtin, + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.matchOneOrMoreBuiltinCC( + payload.builtin, + at: currentPosition, + limitedBy: end, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.matchQuantifiedBuiltinCC( + payload.builtin, + at: currentPosition, + limitedBy: end, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + } } guard let (next, savePointRange) = matchResult else { @@ -121,7 +192,55 @@ extension String { // position, because newline-sequence in scalar semantic mode still // matches two scalars - return (currentPosition, rangeStart.. (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func matchOneOrMoreASCIIBitset( + _ asciiBitset: ASCIIBitset, + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + } } fileprivate func matchQuantifiedASCIIBitset( @@ -149,6 +268,54 @@ extension String { } } + fileprivate func matchZeroOrMoreScalar( + _ scalar: Unicode.Scalar, + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchScalar( + scalar, + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + } + } + fileprivate func matchOneOrMoreScalar( + _ scalar: Unicode.Scalar, + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchScalar( + scalar, + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + + } + } + fileprivate func matchQuantifiedScalar( _ scalar: Unicode.Scalar, at currentPosition: Index, @@ -176,6 +343,59 @@ extension String { } } + fileprivate func matchZeroOrMoreBuiltinCC( + _ builtinCC: _CharacterClassModel.Representation, + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: end, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func matchOneOrMoreBuiltinCC( + _ builtinCC: _CharacterClassModel.Representation, + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: end, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func matchQuantifiedBuiltinCC( _ builtinCC: _CharacterClassModel.Representation, at currentPosition: Index, @@ -205,14 +425,59 @@ extension String { } } + fileprivate func matchZeroOrMoreRegexDot( + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchRegexDot( + at: currentPosition, + limitedBy: end, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func matchOneOrMoreRegexDot( + at currentPosition: Index, + limitedBy end: Index, + produceSavePointRange: Bool, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> (next: Index, savePointRange: Range?)? { + _runQuantLoop( + at: currentPosition, + limitedBy: end, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, end, isScalarSemantics in + matchRegexDot( + at: currentPosition, + limitedBy: end, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func matchQuantifiedRegexDot( at currentPosition: Index, limitedBy end: Index, minMatches: UInt64, maxMatches: UInt64, produceSavePointRange: Bool, - isScalarSemantics: Bool, - anyMatchesNewline: Bool + anyMatchesNewline: Bool, + isScalarSemantics: Bool ) -> (next: Index, savePointRange: Range?)? { _runQuantLoop( at: currentPosition, From 4c3ebf0d47aea9483daa756f1e22f842b2702ecf Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 14 Dec 2023 13:00:38 -0700 Subject: [PATCH 12/16] fix findo/replaco --- Tests/RegexTests/MatchTests.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index ea59cbc5c..47f8f4f9a 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -2659,11 +2659,11 @@ extension RegexTests { func testQuantifyOptimization() throws { // test that the maximum values for minTrips and maxExtraTrips are handled correctly let maxStorable = Int(QuantifyPayload.maxStorableTrips) - let maxmaxExtraTrips = "a{,\(maxStorable)}" - expectProgram(for: maxmaxExtraTrips, contains: [.quantify]) - firstMatchTest(maxmaxExtraTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable)) - firstMatchTest(maxmaxExtraTrips, input: String(repeating: "a", count: maxStorable + 1), match: String(repeating: "a", count: maxStorable)) - XCTAssertNil(try Regex(maxmaxExtraTrips).wholeMatch(in: String(repeating: "a", count: maxStorable + 1))) + let maxExtraTrips = "a{,\(maxStorable)}" + expectProgram(for: maxExtraTrips, contains: [.quantify]) + firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable)) + firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable + 1), match: String(repeating: "a", count: maxStorable)) + XCTAssertNil(try Regex(maxExtraTrips).wholeMatch(in: String(repeating: "a", count: maxStorable + 1))) let maxMinTrips = "a{\(maxStorable),}" expectProgram(for: maxMinTrips, contains: [.quantify]) From 3c5dea053795184e9e1b07833c7c68f083a989b3 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 15 Dec 2023 10:34:48 -0700 Subject: [PATCH 13/16] better names and comments --- .../Engine/InstPayload.swift | 6 ++-- .../_StringProcessing/Engine/MEQuantify.swift | 31 +++++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c4522a219..0476b882b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -381,7 +381,7 @@ struct QuantifyPayload: RawRepresentable { case asciiBitset = 0 case asciiChar = 1 case any = 2 - case builtin = 4 + case builtinCC = 4 } // TODO: figure out how to better organize this... @@ -493,7 +493,7 @@ struct QuantifyPayload: RawRepresentable { + (model.isInverted ? 1 << 9 : 0) + (model.isStrictASCII ? 1 << 10 : 0) self.rawValue = packedModel - + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .builtin, isScalarSemantics: isScalarSemantics) + + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .builtinCC, isScalarSemantics: isScalarSemantics) } var type: PayloadType { @@ -539,7 +539,7 @@ struct QuantifyPayload: RawRepresentable { (self.rawValue & 1) == 1 } - var builtin: _CharacterClassModel.Representation { + var builtinCC: _CharacterClassModel.Representation { _CharacterClassModel.Representation(rawValue: self.rawValue & 0xFF)! } var builtinIsInverted: Bool { diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 8d62215ba..0652e4559 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -92,10 +92,10 @@ extension Processor { isScalarSemantics: isScalarSemantics) } - case .builtin: + case .builtinCC: if isZeroOrMore { matchResult = input.matchZeroOrMoreBuiltinCC( - payload.builtin, + payload.builtinCC, at: currentPosition, limitedBy: end, produceSavePointRange: produceSavePointRange, @@ -104,7 +104,7 @@ extension Processor { isScalarSemantics: isScalarSemantics) } else if isOneOrMore { matchResult = input.matchOneOrMoreBuiltinCC( - payload.builtin, + payload.builtinCC, at: currentPosition, limitedBy: end, produceSavePointRange: produceSavePointRange, @@ -113,7 +113,7 @@ extension Processor { isScalarSemantics: isScalarSemantics) } else { matchResult = input.matchQuantifiedBuiltinCC( - payload.builtin, + payload.builtinCC, at: currentPosition, limitedBy: end, minMatches: minMatches, @@ -158,6 +158,11 @@ extension String { ) -> Index? ) -> (next: Index, savePointRange: Range?)? { var currentPosition = currentPosition + + // The range of backtracking positions to try. For zero-or-more, starts + // before any match happens. Always ends before the final match, since + // the final match is what is tried without backtracking. An empty range + // is valid and means a single backtracking position at rangeStart. var rangeStart = currentPosition var rangeEnd = currentPosition @@ -171,6 +176,12 @@ extension String { } numMatches &+= 1 if numMatches == minMatches { + // For this loop iteration, rangeEnd will actually trail rangeStart by + // a single match position. Next iteration, they will be equal + // (empty range denoting a single backtracking point). Note that we + // only ever return a range if we have exceeded `minMatches`; if we + // exactly mach `minMatches` there is no backtracking positions to + // remember. rangeStart = next } rangeEnd = currentPosition @@ -183,20 +194,22 @@ extension String { } guard produceSavePointRange && numMatches > minMatches else { - // Consumed no input, no point saved + // No backtracking positions to try return (currentPosition, nil) } assert(rangeStart <= rangeEnd) - // NOTE: We can't assert that rangeEnd trails currentPosition by one - // position, because newline-sequence in scalar semantic mode still + // NOTE: We can't assert that rangeEnd trails currentPosition by exactly + // one position, because newline-sequence in scalar semantic mode still // matches two scalars return (currentPosition, rangeStart.. Date: Fri, 15 Dec 2023 10:38:18 -0700 Subject: [PATCH 14/16] wip: typo --- Sources/_StringProcessing/Engine/MEQuantify.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0652e4559..ebaf337cb 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -180,7 +180,7 @@ extension String { // a single match position. Next iteration, they will be equal // (empty range denoting a single backtracking point). Note that we // only ever return a range if we have exceeded `minMatches`; if we - // exactly mach `minMatches` there is no backtracking positions to + // exactly match `minMatches` there is no backtracking positions to // remember. rangeStart = next } From 93a220bff3de4ee2883fac1d5e0a1b024d0ca73a Mon Sep 17 00:00:00 2001 From: David Smith Date: Thu, 1 Feb 2024 14:44:08 -0800 Subject: [PATCH 15/16] Address review comments --- Sources/_StringProcessing/Engine/MEQuantify.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index ebaf337cb..47ba45221 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -130,6 +130,7 @@ extension Processor { return false } if let savePointRange { + assert(!produceSavePointRange) savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) } @@ -203,7 +204,10 @@ extension String { // one position, because newline-sequence in scalar semantic mode still // matches two scalars - return (currentPosition, rangeStart.. Date: Tue, 6 Feb 2024 14:05:08 -0800 Subject: [PATCH 16/16] Fix assertion --- Sources/_StringProcessing/Engine/MEQuantify.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 47ba45221..b3d4818b0 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -130,7 +130,7 @@ extension Processor { return false } if let savePointRange { - assert(!produceSavePointRange) + assert(produceSavePointRange) savePoints.append(makeQuantifiedSavePoint( savePointRange, isScalarSemantics: payload.isScalarSemantics)) }