From bf06ee1fbb2c3ab96d56700e3427cbac0de92a85 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 31 Mar 2023 09:29:01 -0600 Subject: [PATCH 1/7] wip: General ASCII fast-paths for builtin character classes --- .../_StringProcessing/Engine/MEBuiltins.swift | 8 ++ Sources/_StringProcessing/StringExtras.swift | 130 ++++++++++++++++++ .../_CharacterClassModel.swift | 11 +- 3 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 Sources/_StringProcessing/StringExtras.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 6f99058fd..4bebc6b0d 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -34,6 +34,14 @@ extension Processor { _ isStrictASCII: Bool, _ isScalarSemantics: Bool ) -> Input.Index? { + + // ASCII fast-path + if let (next, result) = input._quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) { + return result == isInverted ? nil : next + } + guard let char = load(), let scalar = loadScalar() else { return nil } diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift new file mode 100644 index 000000000..d2837ea51 --- /dev/null +++ b/Sources/_StringProcessing/StringExtras.swift @@ -0,0 +1,130 @@ + +/// TODO: better description +/// Whether this is the starting byte of a sub-300 (i.e. pre-combining scalar) scalars +private func _isSub300StartingByte(_ x: UInt8) -> Bool { + x < 0xCC +} +private func _isASCII(_ x: UInt8) -> Bool { + x < 0x80 +} + +private var _lineFeed: UInt8 { 0x0A } +private var _carriageReturn: UInt8 { 0x0D } +private var _lineTab: UInt8 { 0x0B } +private var _formFeed: UInt8 { 0x0C } +private var _space: UInt8 { 0x20 } +private var _tab: UInt8 { 0x09 } + + + +private var _0: UInt8 { 0x30 } +private var _9: UInt8 { 0x39 } +private func _isASCIINumber(_ x: UInt8) -> Bool { + return (_0..._9).contains(x) +} + +private var _a: UInt8 { 0x61 } +private var _z: UInt8 { 0x7A } +private var _A: UInt8 { 0x41 } +private var _Z: UInt8 { 0x5A } +private func _isASCIILetter(_ x: UInt8) -> Bool { + return (_a..._z).contains(x) || (_A..._Z).contains(x) +} + +private var _underscore: UInt8 { 0x5F } + + +extension String { + + + /// TODO: detailed description of nuanced semantics + func _asciiCharacter( + at idx: Index + ) -> (first: UInt8, next: Index, crLF: Bool)? { + // TODO: fastUTF8 version + + if idx == endIndex { + return nil + } + let base = utf8[idx] + guard _isASCII(base) else { return nil } + + var next = utf8.index(after: idx) + if next == utf8.endIndex { + return (first: base, next: next, crLF: false) + } + + let tail = utf8[next] + guard _isSub300StartingByte(tail) else { return nil } + + // Handle CR-LF: + if base == _carriageReturn && tail == _lineFeed { + utf8.formIndex(after: &next) + guard next == endIndex || _isSub300StartingByte(utf8[next]) else { + return nil + } + return (first: base, next: next, crLF: true) + } + + return (first: base, next: next, crLF: false) + } + + func _quickMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + isScalarSemantics: Bool + ) -> (next: Index, matchResult: Bool)? { + /// ASCII fast-paths + guard let (asciiValue, next, isCRLF) = _asciiCharacter( + at: idx + ) else { + return nil + } + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme, .anyScalar: + // TODO: should any scalar not consume CR-LF in scalar semantic mode? + return (next, true) + + case .digit: + if _isASCIINumber(asciiValue) { + return (next, true) + } + return (next, false) + + case .horizontalWhitespace: + switch asciiValue { + case _space, _tab: return (next, true) + default: return (next, false) + } + + case .verticalWhitespace, .newlineSequence: + switch asciiValue { + case _lineFeed, _carriageReturn, _lineTab, _formFeed: + // Scalar semantics: For `\v`, only advance past the CR instead of CR-LF + if isScalarSemantics && isCRLF && cc == .verticalWhitespace { + return (utf8.index(before: next), true) + } + return (next, true) + + default: + return (next, false) + } + + case .whitespace: + switch asciiValue { + case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + return (next, true) + default: + return (next, false) + } + + case .word: + let matches = _isASCIINumber(asciiValue) || _isASCIILetter(asciiValue) || asciiValue == _underscore + return (next, matches) + } + } + +} + diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 548a9eea0..d9a32a8e4 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -78,9 +78,18 @@ struct _CharacterClassModel: Hashable { guard currentPosition != input.endIndex else { return nil } + + let isScalarSemantics = matchLevel == .unicodeScalar + + // ASCII fast-path + if let (next, result) = input._quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) { + return result == isInverted ? nil : next + } + let char = input[currentPosition] let scalar = input.unicodeScalars[currentPosition] - let isScalarSemantics = matchLevel == .unicodeScalar let asciiCheck = !isStrictASCII || (scalar.isASCII && isScalarSemantics) From 88283b712dde0092290f65fa8946cc9d0766e36a Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 3 Apr 2023 14:35:02 -0600 Subject: [PATCH 2/7] wip --- .../_StringProcessing/Engine/MEBuiltins.swift | 73 ++++++++++++++----- .../_StringProcessing/Engine/MEQuantify.swift | 8 +- .../_StringProcessing/Engine/Processor.swift | 8 +- Sources/_StringProcessing/StringExtras.swift | 16 ++++ 4 files changed, 80 insertions(+), 25 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4bebc6b0d..bcaf9d990 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -9,17 +9,17 @@ extension Character { } extension Processor { - mutating func matchBuiltin( + mutating func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBuiltin( + guard let next = _doMatchBuiltinCC( cc, - isInverted, - isStrictASCII, - isScalarSemantics + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -27,21 +27,60 @@ extension Processor { currentPosition = next return true } - - func _doMatchBuiltin( + + func _doMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool ) -> Input.Index? { + switch _quickMatchBuiltinCC( + cc, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + case .no: + return nil + case .yes(let next): + return next + case .maybe: + return _slowMatchBuiltinCC( + cc, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + } - // ASCII fast-path - if let (next, result) = input._quickMatch( + @inline(__always) + func _quickMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult { + guard let (next, result) = input._quickMatch( cc, at: currentPosition, isScalarSemantics: isScalarSemantics - ) { - return result == isInverted ? nil : next + ) else { + return .maybe } + let idx = result == isInverted ? nil : next + assert(idx == _slowMatchBuiltinCC( + cc, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return .definite(idx) + } + @inline(never) + func _slowMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> Input.Index? { guard let char = load(), let scalar = loadScalar() else { return nil } diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 9d17dc9bd..5b471941a 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,11 +9,11 @@ extension Processor { UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it consumes a single character - next = _doMatchBuiltin( + next = _doMatchBuiltinCC( payload.builtin, - payload.builtinIsInverted, - payload.builtinIsStrict, - false) + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: false) case .any: let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 18e355fb5..25cad5c8c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -583,11 +583,11 @@ extension Processor { case .matchBuiltin: let payload = payload.characterClassPayload - if matchBuiltin( + if matchBuiltinCC( payload.cc, - payload.isInverted, - payload.isStrictASCII, - payload.isScalarSemantics + isInverted: payload.isInverted, + isStrictASCII: payload.isStrictASCII, + isScalarSemantics: payload.isScalarSemantics ) { controller.step() } diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift index d2837ea51..abe4dbd97 100644 --- a/Sources/_StringProcessing/StringExtras.swift +++ b/Sources/_StringProcessing/StringExtras.swift @@ -128,3 +128,19 @@ extension String { } +/// An enum for quick-check functions, which could return a yes, no, or maybe +/// result. +enum QuickResult { + case yes(_ r: R) + case no + case maybe + + static func definite(_ r: R?) -> QuickResult { + if let r = r { + return .yes(r) + } + return .no + } +} + + From 38cf10e8b8e35434c675bf142141d39cd5b91450 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 3 Apr 2023 14:49:40 -0600 Subject: [PATCH 3/7] wip: shorter --- .../_StringProcessing/Engine/MEBuiltins.swift | 37 ++++++++----------- Sources/_StringProcessing/StringExtras.swift | 16 ++------ 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index bcaf9d990..81a4b65c3 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -34,23 +34,24 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Input.Index? { - switch _quickMatchBuiltinCC( + if case .definite(let result) = _quickMatchBuiltinCC( cc, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics ) { - case .no: - return nil - case .yes(let next): - return next - case .maybe: - return _slowMatchBuiltinCC( - cc, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics) + assert(result == _thoroughMatchBuiltinCC( + cc, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result } + return _thoroughMatchBuiltinCC( + cc, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) } @inline(__always) @@ -59,23 +60,17 @@ extension Processor { isInverted: Bool, isStrictASCII: Bool, isScalarSemantics: Bool - ) -> QuickResult { + ) -> QuickResult { guard let (next, result) = input._quickMatch( cc, at: currentPosition, isScalarSemantics: isScalarSemantics ) else { - return .maybe + return .unknown } - let idx = result == isInverted ? nil : next - assert(idx == _slowMatchBuiltinCC( - cc, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics)) - return .definite(idx) + return .definite(result == isInverted ? nil : next) } @inline(never) - func _slowMatchBuiltinCC( + func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, isInverted: Bool, isStrictASCII: Bool, diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift index abe4dbd97..b7f663212 100644 --- a/Sources/_StringProcessing/StringExtras.swift +++ b/Sources/_StringProcessing/StringExtras.swift @@ -128,19 +128,11 @@ extension String { } -/// An enum for quick-check functions, which could return a yes, no, or maybe -/// result. +/// An enum for quick-check functions, which could return an answer or indefinite. +/// We use a separate type because often the answer itself is optional. enum QuickResult { - case yes(_ r: R) - case no - case maybe - - static func definite(_ r: R?) -> QuickResult { - if let r = r { - return .yes(r) - } - return .no - } + case definite(_ r: R) + case unknown } From 21b0b429ff520d79389f27618b8bcc0a8f1590b9 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 3 Apr 2023 15:08:10 -0600 Subject: [PATCH 4/7] Add programmers manual --- Documentation/ProgrammersManual.md | 30 +++++++++++++++++++ .../_StringProcessing/Engine/MEBuiltins.swift | 3 ++ 2 files changed, 33 insertions(+) create mode 100644 Documentation/ProgrammersManual.md diff --git a/Documentation/ProgrammersManual.md b/Documentation/ProgrammersManual.md new file mode 100644 index 000000000..43378b00b --- /dev/null +++ b/Documentation/ProgrammersManual.md @@ -0,0 +1,30 @@ +# Programmer's Manual + +## Programming patterns + +### Engine quick checks and fast paths + +In the engine nomenclature, a quick-check results in a yes/no/maybe while a thorough check always results in a definite answer. + +The nature of quick checks and fast paths is that they bifurcate testing coverage. One easy way to prevent this in simple cases is to assert that a definite quick result matches the thorough result. + +One example of this pattern is matching against a builtin character class. The engine has a `_doMatchBuiltinCC` + +```swift + func _doMatchBuiltinCC(...) -> Input.Index? { + // Calls _quickMatchBuiltinCC, if that gives a definite result + // asserts that it is the same as the result of + // _thoroughMatchBuiltinCC and returns it. Otherwise returns the + // result of _thoroughMatchBuiltinCC + } + + @inline(__always) + func _quickMatchBuiltinCC(...) -> QuickResult + + @inline(never) + func _thoroughMatchBuiltinCC(...) -> Input.Index? +``` + +The thorough check is never inlined, as it is a lot of cold code. Note that quick and thorough functions should be pure, that is they shouldn't update processor state. + + diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 81a4b65c3..cab76ff01 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -28,6 +28,7 @@ extension Processor { return true } + // Mentioned in ProgrammersManual.md, update docs if redesigned func _doMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, isInverted: Bool, @@ -54,6 +55,7 @@ extension Processor { isScalarSemantics: isScalarSemantics) } + // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, @@ -69,6 +71,7 @@ extension Processor { return .definite(result == isInverted ? nil : next) } + // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, From 9aca0457dc9c74508a51c4b38db2526be9188c15 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 3 Apr 2023 18:22:04 -0600 Subject: [PATCH 5/7] unify code paths --- Documentation/ProgrammersManual.md | 8 +- .../_StringProcessing/Engine/MEBuiltins.swift | 298 ++++++++++-------- .../_StringProcessing/Engine/MEQuantify.swift | 4 +- Sources/_StringProcessing/StringExtras.swift | 2 + .../_CharacterClassModel.swift | 97 +----- 5 files changed, 182 insertions(+), 227 deletions(-) diff --git a/Documentation/ProgrammersManual.md b/Documentation/ProgrammersManual.md index 43378b00b..67021f44c 100644 --- a/Documentation/ProgrammersManual.md +++ b/Documentation/ProgrammersManual.md @@ -8,12 +8,12 @@ In the engine nomenclature, a quick-check results in a yes/no/maybe while a thor The nature of quick checks and fast paths is that they bifurcate testing coverage. One easy way to prevent this in simple cases is to assert that a definite quick result matches the thorough result. -One example of this pattern is matching against a builtin character class. The engine has a `_doMatchBuiltinCC` +One example of this pattern is matching against a builtin character class. The engine has a `_matchBuiltinCC` ```swift - func _doMatchBuiltinCC(...) -> Input.Index? { - // Calls _quickMatchBuiltinCC, if that gives a definite result - // asserts that it is the same as the result of + func _matchBuiltinCC(...) -> Input.Index? { + // Calls _quickMatchBuiltinCC, if that gives a definite result + // asserts that it is the same as the result of // _thoroughMatchBuiltinCC and returns it. Otherwise returns the // result of _thoroughMatchBuiltinCC } diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index cab76ff01..bb2228dcb 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,8 +15,10 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBuiltinCC( + guard let next = _matchBuiltinCC( cc, + in: input, + at: currentPosition, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics @@ -28,140 +30,6 @@ extension Processor { return true } - // Mentioned in ProgrammersManual.md, update docs if redesigned - func _doMatchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool - ) -> Input.Index? { - if case .definite(let result) = _quickMatchBuiltinCC( - cc, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics - ) { - assert(result == _thoroughMatchBuiltinCC( - cc, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics)) - return result - } - return _thoroughMatchBuiltinCC( - cc, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics) - } - - // Mentioned in ProgrammersManual.md, update docs if redesigned - @inline(__always) - func _quickMatchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool - ) -> QuickResult { - guard let (next, result) = input._quickMatch( - cc, at: currentPosition, isScalarSemantics: isScalarSemantics - ) else { - return .unknown - } - return .definite(result == isInverted ? nil : next) - } - - // Mentioned in ProgrammersManual.md, update docs if redesigned - @inline(never) - func _thoroughMatchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool - ) -> Input.Index? { - guard let char = load(), let scalar = loadScalar() else { - return nil - } - - let asciiCheck = !isStrictASCII - || (scalar.isASCII && isScalarSemantics) - || char.isASCII - - var matched: Bool - var next: Input.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - - switch cc { - case .any, .anyGrapheme: - matched = true - case .anyScalar: - if isScalarSemantics { - matched = true - } else { - matched = input.isOnGraphemeClusterBoundary(next) - } - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar semantics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - - if isInverted { - matched.toggle() - } - - guard matched else { - return nil - } - return next - } - func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { @@ -171,7 +39,7 @@ extension Processor { return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline } } - + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { @@ -214,7 +82,7 @@ extension Processor { return isAtStartOfLine(payload) case .endOfLine: return isAtEndOfLine(payload) - + case .caretAnchor: if payload.anchorsMatchNewlines { return isAtStartOfLine(payload) @@ -247,3 +115,159 @@ extension Processor { } } } + +// MARK: Built-in character class matching + +// Mentioned in ProgrammersManual.md, update docs if redesigned +@_effects(releasenone) +func _matchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + in input: String, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool +) -> String.Index? { + guard currentPosition < input.endIndex else { + return nil + } + if case .definite(let result) = _quickMatchBuiltinCC( + cc, + in: input, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchBuiltinCC( + cc, + in: input, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchBuiltinCC( + cc, + in: input, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) +} + +// Mentioned in ProgrammersManual.md, update docs if redesigned +@_effects(releasenone) +@inline(__always) +func _quickMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + in input: String, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool +) -> QuickResult { + assert(currentPosition < input.endIndex) + guard let (next, result) = input._quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : next) +} + +// Mentioned in ProgrammersManual.md, update docs if redesigned +@_effects(releasenone) +@inline(never) +func _thoroughMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + in input: String, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool +) -> String.Index? { + assert(currentPosition < input.endIndex) + let char = input[currentPosition] + let scalar = input.unicodeScalars[currentPosition] + + let asciiCheck = !isStrictASCII + || (scalar.isASCII && isScalarSemantics) + || char.isASCII + + var matched: Bool + var next: String.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = input.index(after: currentPosition) + case (_, .anyScalar): + next = input.unicodeScalars.index(after: currentPosition) + case (true, _): + next = input.unicodeScalars.index(after: currentPosition) + case (false, _): + next = input.index(after: currentPosition) + } + + switch cc { + case .any, .anyGrapheme: + matched = true + case .anyScalar: + if isScalarSemantics { + matched = true + } else { + matched = input.isOnGraphemeClusterBoundary(next) + } + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + input.unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } + + if isInverted { + matched.toggle() + } + + guard matched else { + return nil + } + return next +} + diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 5b471941a..32fcfff48 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,8 +9,10 @@ extension Processor { UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it consumes a single character - next = _doMatchBuiltinCC( + next = _matchBuiltinCC( payload.builtin, + in: input, + at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift index b7f663212..14ae8a04d 100644 --- a/Sources/_StringProcessing/StringExtras.swift +++ b/Sources/_StringProcessing/StringExtras.swift @@ -63,9 +63,11 @@ extension String { guard next == endIndex || _isSub300StartingByte(utf8[next]) else { return nil } + assert(self[idx] == "\r\n") return (first: base, next: next, crLF: true) } + assert(self[idx].isASCII && self[idx] != "\r\n") return (first: base, next: next, crLF: false) } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index d9a32a8e4..7c09b3f58 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -18,10 +18,10 @@ struct _CharacterClassModel: Hashable { /// The actual character class to match. let cc: Representation - + /// The level (character or Unicode scalar) at which to match. let matchLevel: MatchingOptions.SemanticLevel - + /// If this character character class only matches ascii characters let isStrictASCII: Bool @@ -61,7 +61,7 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - + /// Returns the end of the match of this character class in the string. /// /// - Parameter str: The string to match against. @@ -81,88 +81,13 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - // ASCII fast-path - if let (next, result) = input._quickMatch( - cc, at: currentPosition, isScalarSemantics: isScalarSemantics - ) { - return result == isInverted ? nil : next - } - - let char = input[currentPosition] - let scalar = input.unicodeScalars[currentPosition] - - let asciiCheck = !isStrictASCII - || (scalar.isASCII && isScalarSemantics) - || char.isASCII - - var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - - switch cc { - case .any, .anyGrapheme, .anyScalar: - matched = true - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - if isInverted { - matched.toggle() - } - if matched { - return next - } else { - return nil - } + return _matchBuiltinCC( + cc, + in: input, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) } } @@ -266,3 +191,5 @@ extension DSLTree.Atom.CharacterClass { return _CharacterClassModel(cc: cc, options: options, isInverted: inverted) } } + + From cf8fd526f3d740c5662b0b0c6c5dc4527fc24a0e Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 3 Apr 2023 18:33:40 -0600 Subject: [PATCH 6/7] fix for scalar semantic cr-lf --- Sources/_StringProcessing/StringExtras.swift | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift index 14ae8a04d..938096f17 100644 --- a/Sources/_StringProcessing/StringExtras.swift +++ b/Sources/_StringProcessing/StringExtras.swift @@ -97,29 +97,32 @@ extension String { case .horizontalWhitespace: switch asciiValue { - case _space, _tab: return (next, true) - default: return (next, false) + case _space, _tab: return (next, true) + default: return (next, false) } case .verticalWhitespace, .newlineSequence: switch asciiValue { - case _lineFeed, _carriageReturn, _lineTab, _formFeed: + case _lineFeed, _carriageReturn, _lineTab, _formFeed: // Scalar semantics: For `\v`, only advance past the CR instead of CR-LF if isScalarSemantics && isCRLF && cc == .verticalWhitespace { return (utf8.index(before: next), true) } return (next, true) - default: - return (next, false) + default: + return (next, false) } case .whitespace: switch asciiValue { - case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: - return (next, true) - default: - return (next, false) + case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + if isScalarSemantics && isCRLF { + return (utf8.index(before: next), true) + } + return (next, true) + default: + return (next, false) } case .word: From c35b846ed9f3b3b0e2be48540ad09bb99e18153b Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 4 Apr 2023 08:41:56 -0600 Subject: [PATCH 7/7] refactor and reorganize --- Sources/_StringProcessing/CMakeLists.txt | 1 + .../_StringProcessing/Engine/MEBuiltins.swift | 262 +++++++++--------- .../_StringProcessing/Engine/MEQuantify.swift | 3 +- Sources/_StringProcessing/StringExtras.swift | 143 ---------- Sources/_StringProcessing/Unicode/ASCII.swift | 170 ++++++++++++ Sources/_StringProcessing/Unicode/NFC.swift | 5 + Sources/_StringProcessing/Utility/Misc.swift | 8 + .../_CharacterClassModel.swift | 3 +- 8 files changed, 313 insertions(+), 282 deletions(-) delete mode 100644 Sources/_StringProcessing/StringExtras.swift create mode 100644 Sources/_StringProcessing/Unicode/ASCII.swift diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ba3e2e03c..ef4aeb6ef 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -47,6 +47,7 @@ add_library(_StringProcessing Regex/DSLTree.swift Regex/Match.swift Regex/Options.swift + Unicode/ASCII.swift Unicode/CaseConversion.swift Unicode/CharacterProps.swift Unicode/Comparison.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index bb2228dcb..fa4f8fa1a 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,9 +15,8 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = _matchBuiltinCC( + guard let next = input._matchBuiltinCC( cc, - in: input, at: currentPosition, isInverted: isInverted, isStrictASCII: isStrictASCII, @@ -118,156 +117,149 @@ extension Processor { // MARK: Built-in character class matching -// Mentioned in ProgrammersManual.md, update docs if redesigned -@_effects(releasenone) -func _matchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - in input: String, - at currentPosition: String.Index, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool -) -> String.Index? { - guard currentPosition < input.endIndex else { - return nil - } - if case .definite(let result) = _quickMatchBuiltinCC( - cc, - in: input, - at: currentPosition, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics - ) { - assert(result == _thoroughMatchBuiltinCC( +extension String { + + // Mentioned in ProgrammersManual.md, update docs if redesigned + func _matchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchBuiltinCC( cc, - in: input, at: currentPosition, isInverted: isInverted, isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics)) - return result + isScalarSemantics: isScalarSemantics) } - return _thoroughMatchBuiltinCC( - cc, - in: input, - at: currentPosition, - isInverted: isInverted, - isStrictASCII: isStrictASCII, - isScalarSemantics: isScalarSemantics) -} -// Mentioned in ProgrammersManual.md, update docs if redesigned -@_effects(releasenone) -@inline(__always) -func _quickMatchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - in input: String, - at currentPosition: String.Index, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool -) -> QuickResult { - assert(currentPosition < input.endIndex) - guard let (next, result) = input._quickMatch( - cc, at: currentPosition, isScalarSemantics: isScalarSemantics - ) else { - return .unknown + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(__always) + func _quickMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition < endIndex) + guard let (next, result) = _quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : next) } - return .definite(result == isInverted ? nil : next) -} -// Mentioned in ProgrammersManual.md, update docs if redesigned -@_effects(releasenone) -@inline(never) -func _thoroughMatchBuiltinCC( - _ cc: _CharacterClassModel.Representation, - in input: String, - at currentPosition: String.Index, - isInverted: Bool, - isStrictASCII: Bool, - isScalarSemantics: Bool -) -> String.Index? { - assert(currentPosition < input.endIndex) - let char = input[currentPosition] - let scalar = input.unicodeScalars[currentPosition] + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(never) + func _thoroughMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? { + assert(currentPosition < endIndex) + let char = self[currentPosition] + let scalar = unicodeScalars[currentPosition] - let asciiCheck = !isStrictASCII + let asciiCheck = !isStrictASCII || (scalar.isASCII && isScalarSemantics) || char.isASCII - var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } + var matched: Bool + var next: String.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = index(after: currentPosition) + case (_, .anyScalar): + next = unicodeScalars.index(after: currentPosition) + case (true, _): + next = unicodeScalars.index(after: currentPosition) + case (false, _): + next = index(after: currentPosition) + } - switch cc { - case .any, .anyGrapheme: - matched = true - case .anyScalar: - if isScalarSemantics { + switch cc { + case .any, .anyGrapheme: matched = true - } else { - matched = input.isOnGraphemeClusterBoundary(next) - } - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar semantics - input.unicodeScalars.formIndex(after: &next) + case .anyScalar: + if isScalarSemantics { + matched = true + } else { + matched = isOnGraphemeClusterBoundary(next) + } + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != endIndex && unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck } - } - if isInverted { - matched.toggle() - } + if isInverted { + matched.toggle() + } - guard matched else { - return nil + guard matched else { + return nil + } + return next } - return next } - diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 32fcfff48..fa68b8b76 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,9 +9,8 @@ extension Processor { UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it consumes a single character - next = _matchBuiltinCC( + next = input._matchBuiltinCC( payload.builtin, - in: input, at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, diff --git a/Sources/_StringProcessing/StringExtras.swift b/Sources/_StringProcessing/StringExtras.swift deleted file mode 100644 index 938096f17..000000000 --- a/Sources/_StringProcessing/StringExtras.swift +++ /dev/null @@ -1,143 +0,0 @@ - -/// TODO: better description -/// Whether this is the starting byte of a sub-300 (i.e. pre-combining scalar) scalars -private func _isSub300StartingByte(_ x: UInt8) -> Bool { - x < 0xCC -} -private func _isASCII(_ x: UInt8) -> Bool { - x < 0x80 -} - -private var _lineFeed: UInt8 { 0x0A } -private var _carriageReturn: UInt8 { 0x0D } -private var _lineTab: UInt8 { 0x0B } -private var _formFeed: UInt8 { 0x0C } -private var _space: UInt8 { 0x20 } -private var _tab: UInt8 { 0x09 } - - - -private var _0: UInt8 { 0x30 } -private var _9: UInt8 { 0x39 } -private func _isASCIINumber(_ x: UInt8) -> Bool { - return (_0..._9).contains(x) -} - -private var _a: UInt8 { 0x61 } -private var _z: UInt8 { 0x7A } -private var _A: UInt8 { 0x41 } -private var _Z: UInt8 { 0x5A } -private func _isASCIILetter(_ x: UInt8) -> Bool { - return (_a..._z).contains(x) || (_A..._Z).contains(x) -} - -private var _underscore: UInt8 { 0x5F } - - -extension String { - - - /// TODO: detailed description of nuanced semantics - func _asciiCharacter( - at idx: Index - ) -> (first: UInt8, next: Index, crLF: Bool)? { - // TODO: fastUTF8 version - - if idx == endIndex { - return nil - } - let base = utf8[idx] - guard _isASCII(base) else { return nil } - - var next = utf8.index(after: idx) - if next == utf8.endIndex { - return (first: base, next: next, crLF: false) - } - - let tail = utf8[next] - guard _isSub300StartingByte(tail) else { return nil } - - // Handle CR-LF: - if base == _carriageReturn && tail == _lineFeed { - utf8.formIndex(after: &next) - guard next == endIndex || _isSub300StartingByte(utf8[next]) else { - return nil - } - assert(self[idx] == "\r\n") - return (first: base, next: next, crLF: true) - } - - assert(self[idx].isASCII && self[idx] != "\r\n") - return (first: base, next: next, crLF: false) - } - - func _quickMatch( - _ cc: _CharacterClassModel.Representation, - at idx: Index, - isScalarSemantics: Bool - ) -> (next: Index, matchResult: Bool)? { - /// ASCII fast-paths - guard let (asciiValue, next, isCRLF) = _asciiCharacter( - at: idx - ) else { - return nil - } - - // TODO: bitvectors - switch cc { - case .any, .anyGrapheme, .anyScalar: - // TODO: should any scalar not consume CR-LF in scalar semantic mode? - return (next, true) - - case .digit: - if _isASCIINumber(asciiValue) { - return (next, true) - } - return (next, false) - - case .horizontalWhitespace: - switch asciiValue { - case _space, _tab: return (next, true) - default: return (next, false) - } - - case .verticalWhitespace, .newlineSequence: - switch asciiValue { - case _lineFeed, _carriageReturn, _lineTab, _formFeed: - // Scalar semantics: For `\v`, only advance past the CR instead of CR-LF - if isScalarSemantics && isCRLF && cc == .verticalWhitespace { - return (utf8.index(before: next), true) - } - return (next, true) - - default: - return (next, false) - } - - case .whitespace: - switch asciiValue { - case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: - if isScalarSemantics && isCRLF { - return (utf8.index(before: next), true) - } - return (next, true) - default: - return (next, false) - } - - case .word: - let matches = _isASCIINumber(asciiValue) || _isASCIILetter(asciiValue) || asciiValue == _underscore - return (next, matches) - } - } - -} - -/// An enum for quick-check functions, which could return an answer or indefinite. -/// We use a separate type because often the answer itself is optional. -enum QuickResult { - case definite(_ r: R) - case unknown -} - - diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift new file mode 100644 index 000000000..f27584893 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -0,0 +1,170 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2023 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +private var _lineFeed: UInt8 { 0x0A } +private var _carriageReturn: UInt8 { 0x0D } +private var _lineTab: UInt8 { 0x0B } +private var _formFeed: UInt8 { 0x0C } +private var _space: UInt8 { 0x20 } +private var _tab: UInt8 { 0x09 } + +private var _0: UInt8 { 0x30 } +private var _9: UInt8 { 0x39 } +private func _isASCIINumber(_ x: UInt8) -> Bool { + return (_0..._9).contains(x) +} + +private var _a: UInt8 { 0x61 } +private var _z: UInt8 { 0x7A } +private var _A: UInt8 { 0x41 } +private var _Z: UInt8 { 0x5A } + +private var _underscore: UInt8 { 0x5F } + +extension UInt8 { + var _isASCII: Bool { self < 0x80 } + + // TODO: Bitvectors for the below + + /// Assuming we're ASCII, whether we match `\d` + var _asciiIsDigit: Bool { + assert(_isASCII) + return(_0..._9).contains(self) + } + + /// Assuming we're ASCII, whether we match `\h` + var _asciiIsHorizontalWhitespace: Bool { + assert(_isASCII) + return self == _space || self == _tab + } + + /// Assuming we're ASCII, whether we match `\v` + var _asciiIsVerticalWhitespace: Bool { + assert(_isASCII) + switch self { + case _lineFeed, _carriageReturn, _lineTab, _formFeed: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `\s` + var _asciiIsWhitespace: Bool { + assert(_isASCII) + switch self { + case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `[a-zA-Z]` + var _asciiIsLetter: Bool { + assert(_isASCII) + return (_a..._z).contains(self) || (_A..._Z).contains(self) + } + + /// Assuming we're ASCII, whether we match `\w` + var _asciiIsWord: Bool { + assert(_isASCII) + return _asciiIsDigit || _asciiIsLetter || self == _underscore + } +} + +extension String { + /// TODO: detailed description of nuanced semantics + func _quickASCIICharacter( + at idx: Index + ) -> (first: UInt8, next: Index, crLF: Bool)? { + // TODO: fastUTF8 version + + if idx == endIndex { + return nil + } + let base = utf8[idx] + guard base._isASCII else { + assert(!self[idx].isASCII) + return nil + } + + var next = utf8.index(after: idx) + if next == utf8.endIndex { + assert(self[idx].isASCII) + return (first: base, next: next, crLF: false) + } + + let tail = utf8[next] + guard tail._isSub300StartingByte else { return nil } + + // Handle CR-LF: + if base == _carriageReturn && tail == _lineFeed { + utf8.formIndex(after: &next) + guard next == endIndex || utf8[next]._isSub300StartingByte else { + return nil + } + assert(self[idx] == "\r\n") + return (first: base, next: next, crLF: true) + } + + assert(self[idx].isASCII && self[idx] != "\r\n") + return (first: base, next: next, crLF: false) + } + + func _quickMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + isScalarSemantics: Bool + ) -> (next: Index, matchResult: Bool)? { + /// ASCII fast-paths + guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( + at: idx + ) else { + return nil + } + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme, .anyScalar: + // TODO: should any scalar not consume CR-LF in scalar semantic mode? + return (next, true) + + case .digit: + return (next, asciiValue._asciiIsDigit) + + case .horizontalWhitespace: + return (next, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace, .newlineSequence: + if asciiValue._asciiIsVerticalWhitespace { + if isScalarSemantics && isCRLF && cc == .verticalWhitespace { + return (utf8.index(before: next), true) + } + return (next, true) + } + return (next, false) + + case .whitespace: + if asciiValue._asciiIsWhitespace { + if isScalarSemantics && isCRLF { + return (utf8.index(before: next), true) + } + return (next, true) + } + return (next, false) + + case .word: + return (next, asciiValue._asciiIsWord) + } + } +} + diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift index 5c2c4aa48..59d195bb6 100644 --- a/Sources/_StringProcessing/Unicode/NFC.swift +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -12,6 +12,11 @@ @_spi(_Unicode) import Swift +extension UInt8 { + /// Whether this is the starting byte of a sub-300 (i.e. pre-combining scalar) scalars + var _isSub300StartingByte: Bool { self < 0xCC } +} + extension UnicodeScalar { /// Checks whether the scalar is in NFC form. var isNFC: Bool { Character(self).singleNFCScalar == self } diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 8a9cbe325..8555ec85c 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -57,3 +57,11 @@ extension Array { with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) } } + +/// An enum for quick-check functions, which could return an answer or indefinite. +/// We use a separate type because often the answer itself is optional. +enum QuickResult { + case definite(_ r: R) + case unknown +} + diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 7c09b3f58..35a11a7ef 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -81,9 +81,8 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - return _matchBuiltinCC( + return input._matchBuiltinCC( cc, - in: input, at: currentPosition, isInverted: isInverted, isStrictASCII: isStrictASCII,