diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 705b354fb..808a1e498 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -68,14 +68,16 @@ extension Character { switch opts.semanticLevel { case .graphemeCluster: return { input, bounds in - let low = bounds.lowerBound + guard let (char, next) = input.characterAndEnd( + at: bounds.lowerBound, limitedBy: bounds.upperBound) + else { return nil } if isCaseInsensitive && isCased { - return input[low].lowercased() == lowercased() - ? input.index(after: low) + return char.lowercased() == self.lowercased() + ? next : nil } else { - return input[low] == self - ? input.index(after: low) + return char == self + ? next : nil } } @@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass { func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { let model = asRuntimeModel(opts) return { input, bounds in - model.matches(in: input, at: bounds.lowerBound) + model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound) } } } @@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass { } if isInverted { return opts.semanticLevel == .graphemeCluster - ? input.index(after: bounds.lowerBound) + ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound) : input.unicodeScalars.index(after: bounds.lowerBound) } return nil diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 5e446b472..0dafd6720 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,9 +15,10 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = input.matchBuiltinCC( + guard currentPosition < end, let next = input.matchBuiltinCC( cc, at: currentPosition, + limitedBy: end, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics @@ -102,18 +103,16 @@ extension Processor { case .wordBoundary: if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) } else { - return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) } case .notWordBoundary: if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) } else { - return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) } } } @@ -121,37 +120,79 @@ extension Processor { // MARK: Matching `.` extension String { - // TODO: Should the below have a `limitedBy` parameter? + /// Returns the character at `pos`, bounded by `end`, as well as the upper + /// boundary of the returned character. + /// + /// This function handles loading a character from a string while respecting + /// an end boundary, even if that end boundary is sub-character or sub-scalar. + /// + /// - If `pos` is at or past `end`, this function returns `nil`. + /// - If `end` is between `pos` and the next grapheme cluster boundary (i.e., + /// `end` is before `self.index(after: pos)`, then the returned character + /// is smaller than the one that would be produced by `self[pos]` and the + /// returned index is at the end of that character. + /// - If `end` is between `pos` and the next grapheme cluster boundary, and + /// is not on a Unicode scalar boundary, the partial scalar is dropped. This + /// can result in a `nil` return or a character that includes only part of + /// the `self[pos]` character. + /// + /// - Parameters: + /// - pos: The position to load a character from. + /// - end: The limit for the character at `pos`. + /// - Returns: The character at `pos`, bounded by `end`, if it exists, along + /// with the upper bound of that character. The upper bound is always + /// scalar-aligned. + func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? { + // FIXME: Sink into the stdlib to avoid multiple boundary calculations + guard pos < end else { return nil } + let next = index(after: pos) + if next <= end { + return (self[pos], next) + } + // `end` must be a sub-character position that is between `pos` and the + // next grapheme boundary. This is okay if `end` is on a Unicode scalar + // boundary, but if it's in the middle of a scalar's code units, there + // may not be a character to return at all after rounding down. Use + // `Substring`'s rounding to determine what we can return. + let substr = self[pos.. String.Index? { - guard currentPosition < endIndex else { - return nil - } + guard currentPosition < end else { return nil } if case .definite(let result) = _quickMatchAnyNonNewline( at: currentPosition, + limitedBy: end, isScalarSemantics: isScalarSemantics ) { assert(result == _thoroughMatchAnyNonNewline( at: currentPosition, + limitedBy: end, isScalarSemantics: isScalarSemantics)) return result } return _thoroughMatchAnyNonNewline( at: currentPosition, + limitedBy: end, isScalarSemantics: isScalarSemantics) } @inline(__always) private func _quickMatchAnyNonNewline( at currentPosition: String.Index, + limitedBy end: String.Index, isScalarSemantics: Bool ) -> QuickResult { - assert(currentPosition < endIndex) + assert(currentPosition < end) guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( - at: currentPosition + at: currentPosition, limitedBy: end ) else { return .unknown } @@ -167,39 +208,39 @@ extension String { @inline(never) private func _thoroughMatchAnyNonNewline( at currentPosition: String.Index, + limitedBy end: String.Index, isScalarSemantics: Bool ) -> String.Index? { - assert(currentPosition < endIndex) if isScalarSemantics { + guard currentPosition < end else { return nil } let scalar = unicodeScalars[currentPosition] guard !scalar.isNewline else { return nil } return unicodeScalars.index(after: currentPosition) } - let char = self[currentPosition] - guard !char.isNewline else { return nil } - return index(after: currentPosition) + guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end), + !char.isNewline + else { return nil } + return next } } // MARK: - Built-in character class matching extension String { - // TODO: Should the below have a `limitedBy` parameter? - // Mentioned in ProgrammersManual.md, update docs if redesigned func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, + limitedBy end: String.Index, isInverted: Bool, isStrictASCII: Bool, isScalarSemantics: Bool ) -> String.Index? { - guard currentPosition < endIndex else { - return nil - } + guard currentPosition < end else { return nil } if case .definite(let result) = _quickMatchBuiltinCC( cc, at: currentPosition, + limitedBy: end, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics @@ -207,6 +248,7 @@ extension String { assert(result == _thoroughMatchBuiltinCC( cc, at: currentPosition, + limitedBy: end, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics)) @@ -215,6 +257,7 @@ extension String { return _thoroughMatchBuiltinCC( cc, at: currentPosition, + limitedBy: end, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics) @@ -225,13 +268,17 @@ extension String { private func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, + limitedBy end: String.Index, isInverted: Bool, isStrictASCII: Bool, isScalarSemantics: Bool ) -> QuickResult { - assert(currentPosition < endIndex) + assert(currentPosition < end) guard let (next, result) = _quickMatch( - cc, at: currentPosition, isScalarSemantics: isScalarSemantics + cc, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { return .unknown } @@ -243,12 +290,16 @@ extension String { private func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, + limitedBy end: String.Index, isInverted: Bool, isStrictASCII: Bool, isScalarSemantics: Bool ) -> String.Index? { - assert(currentPosition < endIndex) - let char = self[currentPosition] + // TODO: Branch here on scalar semantics + // Don't want to pay character cost if unnecessary + guard var (char, next) = + characterAndEnd(at: currentPosition, limitedBy: end) + else { return nil } let scalar = unicodeScalars[currentPosition] let asciiCheck = !isStrictASCII @@ -256,14 +307,8 @@ extension String { || char.isASCII var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = index(after: currentPosition) - case (true, _): + if isScalarSemantics && cc != .anyGrapheme { next = unicodeScalars.index(after: currentPosition) - case (false, _): - next = index(after: currentPosition) } switch cc { @@ -291,7 +336,7 @@ extension String { if isScalarSemantics { matched = scalar.isNewline && asciiCheck if matched && scalar == "\r" - && next != endIndex && unicodeScalars[next] == "\n" { + && next < end && unicodeScalars[next] == "\n" { // Match a full CR-LF sequence even in scalar semantics unicodeScalars.formIndex(after: &next) } diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 1ff734ccd..c6f55ee34 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -17,18 +17,18 @@ extension Processor { boundaryCheck: !isScalarSemantics, isCaseInsensitive: false) case .builtin: - // FIXME: bounds check? endIndex or end? + guard currentPosition < end else { return nil } // We only emit .quantify if it consumes a single character return input.matchBuiltinCC( payload.builtin, at: currentPosition, + limitedBy: end, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, isScalarSemantics: isScalarSemantics) case .any: - // FIXME: endIndex or end? - guard currentPosition < input.endIndex else { return nil } + guard currentPosition < end else { return nil } if payload.anyMatchesNewline { if isScalarSemantics { @@ -38,7 +38,9 @@ extension Processor { } return input.matchAnyNonNewline( - at: currentPosition, isScalarSemantics: isScalarSemantics) + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 0350a37db..6ecc49df7 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -310,6 +310,7 @@ extension Processor { ) -> Bool { guard let next = input.matchAnyNonNewline( at: currentPosition, + limitedBy: end, isScalarSemantics: isScalarSemantics ) else { signalFailure() @@ -536,9 +537,10 @@ extension Processor { case .consumeBy: let reg = payload.consumer + let consumer = registers[reg] guard currentPosition < searchBounds.upperBound, - let nextIndex = registers[reg]( - input, currentPosition.. Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able - assert(end <= endIndex) - - guard pos < end else { return nil } + guard let (stringChar, next) = characterAndEnd(at: pos, limitedBy: end) + else { return nil } if isCaseInsensitive { - guard self[pos].lowercased() == char.lowercased() else { return nil } + guard stringChar.lowercased() == char.lowercased() else { return nil } } else { - guard self[pos] == char else { return nil } + guard stringChar == char else { return nil } } - let idx = index(after: pos) - guard idx <= end else { return nil } - - return idx + return next } func matchSeq( @@ -676,8 +674,6 @@ extension String { ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able - assert(end <= endIndex) - var cur = pos if isScalarSemantics { @@ -687,8 +683,10 @@ extension String { } } else { for e in seq { - guard cur < end, self[cur] == e else { return nil } - self.formIndex(after: &cur) + guard let (char, next) = characterAndEnd(at: cur, limitedBy: end), + char == e + else { return nil } + cur = next } } @@ -705,8 +703,6 @@ extension String { ) -> Index? { // TODO: extremely quick-check-able // TODO: can be sped up with string internals - assert(end <= endIndex) - guard pos < end else { return nil } let curScalar = unicodeScalars[pos] @@ -720,7 +716,7 @@ extension String { } let idx = unicodeScalars.index(after: pos) - guard idx <= end else { return nil } + assert(idx <= end, "Input is a substring with a sub-scalar endIndex.") if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { return nil @@ -737,22 +733,14 @@ extension String { ) -> Index? { // TODO: extremely quick-check-able // TODO: can be sped up with string internals - assert(end <= endIndex) - - guard pos < end else { return nil } - - let idx: String.Index if isScalarSemantics { + guard pos < end else { return nil } guard bitset.matches(unicodeScalars[pos]) else { return nil } - idx = unicodeScalars.index(after: pos) + return unicodeScalars.index(after: pos) } else { - guard bitset.matches(self[pos]) else { return nil } - idx = index(after: pos) + guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), + bitset.matches(char) else { return nil } + return next } - - guard idx <= end else { return nil } - return idx } - - } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 0453fcd80..92fd91b7d 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -43,7 +43,8 @@ struct Executor { } if low >= high { return nil } if graphemeSemantic { - input.formIndex(after: &low) + low = input.index( + low, offsetBy: 1, limitedBy: searchBounds.upperBound) ?? searchBounds.upperBound } else { input.unicodeScalars.formIndex(after: &low) } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index de13e340a..53dfe652d 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -85,11 +85,14 @@ extension String { /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics func _quickASCIICharacter( - at idx: Index + at idx: Index, + limitedBy end: Index ) -> (first: UInt8, next: Index, crLF: Bool)? { // TODO: fastUTF8 version - - if idx == endIndex { + assert(String.Index(idx, within: unicodeScalars) != nil) + assert(idx <= end) + + if idx == end { return nil } let base = utf8[idx] @@ -99,8 +102,7 @@ extension String { } var next = utf8.index(after: idx) - if next == utf8.endIndex { - assert(self[idx].isASCII) + if next == end { return (first: base, next: next, crLF: false) } @@ -110,10 +112,9 @@ extension String { // Handle CR-LF: if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) - guard next == endIndex || utf8[next]._isSub300StartingByte else { + guard next == end || utf8[next]._isSub300StartingByte else { return nil } - assert(self[idx] == "\r\n") return (first: base, next: next, crLF: true) } @@ -124,11 +125,12 @@ extension String { func _quickMatch( _ cc: _CharacterClassModel.Representation, at idx: Index, + limitedBy end: Index, isScalarSemantics: Bool ) -> (next: Index, matchResult: Bool)? { /// ASCII fast-paths guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( - at: idx + at: idx, limitedBy: end ) else { return nil } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index f1f9573c1..10aadde32 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -37,7 +37,9 @@ extension Processor { if currentPosition == subjectBounds.lowerBound { return matchesWord(at: currentPosition) } - let priorIdx = input.index(before: currentPosition) + let priorIdx = semanticLevel == .graphemeCluster + ? input.index(before: currentPosition) + : input.unicodeScalars.index(before: currentPosition) if currentPosition == subjectBounds.upperBound { return matchesWord(at: priorIdx) } @@ -51,14 +53,16 @@ extension Processor { extension String { func isOnWordBoundary( at i: String.Index, + in range: Range, using cache: inout Set?, _ maxIndex: inout String.Index? ) -> Bool { // TODO: needs benchmark coverage - guard i != startIndex, i != endIndex else { + guard i != range.lowerBound, i != range.upperBound else { return true } - + assert(range.contains(i)) + // If our index is already in our cache, then this is obviously on a // boundary. if let cache = cache, cache.contains(i) { @@ -76,9 +80,9 @@ extension String { if #available(SwiftStdlib 5.7, *) { var indices: Set = [] - var j = maxIndex ?? startIndex + var j = maxIndex ?? range.lowerBound - while j < endIndex, j <= i { + while j < range.upperBound, j <= i { indices.insert(j) j = _wordIndex(after: j) } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c053b31e4..53e7b2e65 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -68,12 +68,13 @@ struct _CharacterClassModel: Hashable { /// - Returns: The index of the end of the match, or `nil` if there is no match. func matches( in input: String, - at currentPosition: String.Index + at currentPosition: String.Index, + limitedBy end: String.Index ) -> String.Index? { // FIXME: This is only called in custom character classes that contain builtin // character classes as members (ie: [a\w] or set operations), is there // any way to avoid that? Can we remove this somehow? - guard currentPosition != input.endIndex else { + guard currentPosition < end else { return nil } @@ -82,6 +83,7 @@ struct _CharacterClassModel: Hashable { return input.matchBuiltinCC( cc, at: currentPosition, + limitedBy: end, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3fc547e34..e5bb574e6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -30,6 +30,68 @@ func _firstMatch( ) throws -> (String, [String?])? { var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) + + func validateSubstring(_ substringInput: Substring) throws { + // Sometimes the characters we add to a substring merge with existing + // string members. This messes up cross-validation, so skip the test. + guard input == substringInput else { return } + + let substringResult = try regex.firstMatch(in: substringInput) + switch (result, substringResult) { + case (nil, nil): + break + case let (result?, substringResult?): + if substringResult.range.upperBound > substringInput.endIndex { + throw MatchError("Range exceeded substring upper bound for \(input) and \(regexStr)") + } + let stringMatch = input[result.range] + let substringMatch = substringInput[substringResult.range] + if stringMatch != substringMatch { + throw MatchError(""" + Pattern: '\(regexStr)' + String match returned: '\(stringMatch)' + Substring match returned: '\(substringMatch)' + """) + } + case (.some(let result), nil): + throw MatchError(""" + Pattern: '\(regexStr)' + Input: '\(input)' + Substring '\(substringInput)' ('\(substringInput.base)') + String match returned: '\(input[result.range])' + Substring match returned: nil + """) + case (nil, .some(let substringResult)): + throw MatchError(""" + Pattern: '\(regexStr)' + Input: '\(input)' + Substring '\(substringInput)' ('\(substringInput.base)') + String match returned: nil + Substring match returned: '\(substringInput[substringResult.range])' + """) + } + } + + if !input.isEmpty { + try validateSubstring("\(input)\(input.last!)".dropLast()) + } + try validateSubstring("\(input)\n".dropLast()) + try validateSubstring("A\(input)Z".dropFirst().dropLast()) + do { + // Test sub-character slicing + let str = input + "\n" + let prevIndex = str.unicodeScalars.index(str.endIndex, offsetBy: -1) + try validateSubstring(str[..