diff --git a/Documentation/ProgrammersManual.md b/Documentation/ProgrammersManual.md new file mode 100644 index 000000000..67021f44c --- /dev/null +++ b/Documentation/ProgrammersManual.md @@ -0,0 +1,30 @@ +# Programmer's Manual + +## Programming patterns + +### Engine quick checks and fast paths + +In the engine nomenclature, a quick-check results in a yes/no/maybe while a thorough check always results in a definite answer. + +The nature of quick checks and fast paths is that they bifurcate testing coverage. One easy way to prevent this in simple cases is to assert that a definite quick result matches the thorough result. + +One example of this pattern is matching against a builtin character class. The engine has a `_matchBuiltinCC` function, structured as follows: + +```swift + func _matchBuiltinCC(...) -> Input.Index? { + // Calls _quickMatchBuiltinCC, if that gives a definite result + // asserts that it is the same as the result of + // _thoroughMatchBuiltinCC and returns it. Otherwise returns the + // result of _thoroughMatchBuiltinCC + } + + @inline(__always) + func _quickMatchBuiltinCC(...) -> QuickResult<Input.Index?> + + @inline(never) + func _thoroughMatchBuiltinCC(...) -> Input.Index? +``` + +The thorough check is never inlined, as it is a lot of cold code. Note that quick and thorough functions should be pure, that is they shouldn't update processor state. 
+ + diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ba3e2e03c..ef4aeb6ef 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -47,6 +47,7 @@ add_library(_StringProcessing Regex/DSLTree.swift Regex/Match.swift Regex/Options.swift + Unicode/ASCII.swift Unicode/CaseConversion.swift Unicode/CharacterProps.swift Unicode/Comparison.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 6f99058fd..fa4f8fa1a 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -9,17 +9,18 @@ extension Character { } extension Processor { - mutating func matchBuiltin( + mutating func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBuiltin( + guard let next = input._matchBuiltinCC( cc, - isInverted, - isStrictASCII, - isScalarSemantics + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -27,96 +28,7 @@ extension Processor { currentPosition = next return true } - - func _doMatchBuiltin( - _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool - ) -> Input.Index? 
{ - guard let char = load(), let scalar = loadScalar() else { - return nil - } - - let asciiCheck = !isStrictASCII - || (scalar.isASCII && isScalarSemantics) - || char.isASCII - - var matched: Bool - var next: Input.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - - switch cc { - case .any, .anyGrapheme: - matched = true - case .anyScalar: - if isScalarSemantics { - matched = true - } else { - matched = input.isOnGraphemeClusterBoundary(next) - } - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar semantics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - - if isInverted { - matched.toggle() - } - guard matched else { - return nil - 
} - return next - } - func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { @@ -126,7 +38,7 @@ extension Processor { return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline } } - + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { @@ -169,7 +81,7 @@ extension Processor { return isAtStartOfLine(payload) case .endOfLine: return isAtEndOfLine(payload) - + case .caretAnchor: if payload.anchorsMatchNewlines { return isAtStartOfLine(payload) @@ -202,3 +114,152 @@ extension Processor { } } } + +// MARK: Built-in character class matching + +extension String { + + // Mentioned in ProgrammersManual.md, update docs if redesigned + func _matchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? 
{ + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(__always) + func _quickMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult<String.Index?> { + assert(currentPosition < endIndex) + guard let (next, result) = _quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : next) + } + + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(never) + func _thoroughMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? 
{ + assert(currentPosition < endIndex) + let char = self[currentPosition] + let scalar = unicodeScalars[currentPosition] + + let asciiCheck = !isStrictASCII + || (scalar.isASCII && isScalarSemantics) + || char.isASCII + + var matched: Bool + var next: String.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = index(after: currentPosition) + case (_, .anyScalar): + next = unicodeScalars.index(after: currentPosition) + case (true, _): + next = unicodeScalars.index(after: currentPosition) + case (false, _): + next = index(after: currentPosition) + } + + switch cc { + case .any, .anyGrapheme: + matched = true + case .anyScalar: + if isScalarSemantics { + matched = true + } else { + matched = isOnGraphemeClusterBoundary(next) + } + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != endIndex && unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } + + if isInverted { + matched.toggle() + } + + guard matched else { + return nil + } 
+ return next + } +} diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 9d17dc9bd..fa68b8b76 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,11 +9,12 @@ extension Processor { UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it consumes a single character - next = _doMatchBuiltin( + next = input._matchBuiltinCC( payload.builtin, - payload.builtinIsInverted, - payload.builtinIsStrict, - false) + at: currentPosition, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: false) case .any: let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 18e355fb5..25cad5c8c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -583,11 +583,11 @@ extension Processor { case .matchBuiltin: let payload = payload.characterClassPayload - if matchBuiltin( + if matchBuiltinCC( payload.cc, - payload.isInverted, - payload.isStrictASCII, - payload.isScalarSemantics + isInverted: payload.isInverted, + isStrictASCII: payload.isStrictASCII, + isScalarSemantics: payload.isScalarSemantics ) { controller.step() } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift new file mode 100644 index 000000000..f27584893 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -0,0 +1,170 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2023 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +private var _lineFeed: UInt8 { 0x0A } +private var _carriageReturn: UInt8 { 0x0D } +private var _lineTab: UInt8 { 0x0B } +private var _formFeed: UInt8 { 0x0C } +private var _space: UInt8 { 0x20 } +private var _tab: UInt8 { 0x09 } + +private var _0: UInt8 { 0x30 } +private var _9: UInt8 { 0x39 } +private func _isASCIINumber(_ x: UInt8) -> Bool { + return (_0..._9).contains(x) +} + +private var _a: UInt8 { 0x61 } +private var _z: UInt8 { 0x7A } +private var _A: UInt8 { 0x41 } +private var _Z: UInt8 { 0x5A } + +private var _underscore: UInt8 { 0x5F } + +extension UInt8 { + var _isASCII: Bool { self < 0x80 } + + // TODO: Bitvectors for the below + + /// Assuming we're ASCII, whether we match `\d` + var _asciiIsDigit: Bool { + assert(_isASCII) + return(_0..._9).contains(self) + } + + /// Assuming we're ASCII, whether we match `\h` + var _asciiIsHorizontalWhitespace: Bool { + assert(_isASCII) + return self == _space || self == _tab + } + + /// Assuming we're ASCII, whether we match `\v` + var _asciiIsVerticalWhitespace: Bool { + assert(_isASCII) + switch self { + case _lineFeed, _carriageReturn, _lineTab, _formFeed: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `\s` + var _asciiIsWhitespace: Bool { + assert(_isASCII) + switch self { + case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `[a-zA-Z]` + var _asciiIsLetter: Bool { + assert(_isASCII) + return (_a..._z).contains(self) || (_A..._Z).contains(self) + } + + /// Assuming we're ASCII, whether we match `\w` + var _asciiIsWord: Bool { + assert(_isASCII) + return _asciiIsDigit || _asciiIsLetter || self == 
_underscore + } +} + +extension String { + /// TODO: detailed description of nuanced semantics + func _quickASCIICharacter( + at idx: Index + ) -> (first: UInt8, next: Index, crLF: Bool)? { + // TODO: fastUTF8 version + + if idx == endIndex { + return nil + } + let base = utf8[idx] + guard base._isASCII else { + assert(!self[idx].isASCII) + return nil + } + + var next = utf8.index(after: idx) + if next == utf8.endIndex { + assert(self[idx].isASCII) + return (first: base, next: next, crLF: false) + } + + let tail = utf8[next] + guard tail._isSub300StartingByte else { return nil } + + // Handle CR-LF: + if base == _carriageReturn && tail == _lineFeed { + utf8.formIndex(after: &next) + guard next == endIndex || utf8[next]._isSub300StartingByte else { + return nil + } + assert(self[idx] == "\r\n") + return (first: base, next: next, crLF: true) + } + + assert(self[idx].isASCII && self[idx] != "\r\n") + return (first: base, next: next, crLF: false) + } + + func _quickMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + isScalarSemantics: Bool + ) -> (next: Index, matchResult: Bool)? { + /// ASCII fast-paths + guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( + at: idx + ) else { + return nil + } + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme, .anyScalar: + // TODO: should any scalar not consume CR-LF in scalar semantic mode? 
+ return (next, true) + + case .digit: + return (next, asciiValue._asciiIsDigit) + + case .horizontalWhitespace: + return (next, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace, .newlineSequence: + if asciiValue._asciiIsVerticalWhitespace { + if isScalarSemantics && isCRLF && cc == .verticalWhitespace { + return (utf8.index(before: next), true) + } + return (next, true) + } + return (next, false) + + case .whitespace: + if asciiValue._asciiIsWhitespace { + if isScalarSemantics && isCRLF { + return (utf8.index(before: next), true) + } + return (next, true) + } + return (next, false) + + case .word: + return (next, asciiValue._asciiIsWord) + } + } +} + diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift index 5c2c4aa48..59d195bb6 100644 --- a/Sources/_StringProcessing/Unicode/NFC.swift +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -12,6 +12,11 @@ @_spi(_Unicode) import Swift +extension UInt8 { + /// Whether this is the starting byte of a sub-300 (i.e. pre-combining) scalar + var _isSub300StartingByte: Bool { self < 0xCC } +} + extension UnicodeScalar { /// Checks whether the scalar is in NFC form. var isNFC: Bool { Character(self).singleNFCScalar == self } diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 8a9cbe325..8555ec85c 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -57,3 +57,11 @@ extension Array { with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) } } + +/// An enum for quick-check functions, which can return either a definite answer or an unknown (indefinite) result. +/// We use a separate type because often the answer itself is optional. 
+enum QuickResult<R> { + case definite(_ r: R) + case unknown +} + diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 548a9eea0..35a11a7ef 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -18,10 +18,10 @@ struct _CharacterClassModel: Hashable { /// The actual character class to match. let cc: Representation - + /// The level (character or Unicode scalar) at which to match. let matchLevel: MatchingOptions.SemanticLevel - + /// If this character character class only matches ascii characters let isStrictASCII: Bool @@ -61,7 +61,7 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - + /// Returns the end of the match of this character class in the string. /// /// - Parameter str: The string to match against. @@ -78,82 +78,15 @@ struct _CharacterClassModel: Hashable { guard currentPosition != input.endIndex else { return nil } - let char = input[currentPosition] - let scalar = input.unicodeScalars[currentPosition] - let isScalarSemantics = matchLevel == .unicodeScalar - let asciiCheck = !isStrictASCII - || (scalar.isASCII && isScalarSemantics) - || char.isASCII - - var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? 
- next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } + let isScalarSemantics = matchLevel == .unicodeScalar - switch cc { - case .any, .anyGrapheme, .anyScalar: - matched = true - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - if isInverted { - matched.toggle() - } - if matched { - return next - } else { - return nil - } + return input._matchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) } } @@ -257,3 +190,5 @@ extension DSLTree.Atom.CharacterClass { return _CharacterClassModel(cc: cc, options: options, isInverted: inverted) } } + +