Skip to content

Commit 9024254

Browse files
committed
Clean up, test mid-scalar substring boundaries
1 parent e694693 commit 9024254

File tree

3 files changed

+34
-5
lines changed

3 files changed

+34
-5
lines changed

Sources/_StringProcessing/Engine/MEBuiltins.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,27 @@ extension Processor {
120120

121121
// MARK: Matching `.`
122122
extension String {
123+
/// Returns the character at `pos`, bounded by `end`, as well as the upper
124+
/// boundary of the returned character.
125+
///
126+
/// This function handles loading a character from a string while respecting
127+
/// an end boundary, even if that end boundary is sub-character or sub-scalar.
128+
///
129+
/// - Parameters:
130+
/// - pos: The position to load a character from.
131+
/// - end: The limit for the character at `pos`.
132+
/// - Returns:
133+
/// - If `pos` is at or past `end`, this function returns `nil`.
134+
/// - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
135+
/// `end` is before `self.index(after: pos)`), then the returned character
136+
/// is smaller than the one that would be produced by `self[pos]` and the
137+
/// returned index is at the end of that character.
138+
/// - If `end` is between `pos` and the next grapheme cluster boundary, and
139+
/// is not on a Unicode scalar boundary, the partial scalar is dropped. This
140+
/// can result in a `nil` return or a character that includes only part of
141+
/// the `self[pos]` character.
123142
func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
143+
// FIXME: Sink into the stdlib to avoid multiple boundary calculations
124144
guard pos < end else { return nil }
125145
let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
126146
return self[pos..<next].first.map { ($0, next) }

Sources/_StringProcessing/Unicode/WordBreaking.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ extension String {
6161
guard i != range.lowerBound, i != range.upperBound else {
6262
return true
6363
}
64-
64+
assert(range.contains(i))
65+
6566
// If our index is already in our cache, then this is obviously on a
6667
// boundary.
6768
if let cache = cache, cache.contains(i) {

Tests/RegexTests/MatchTests.swift

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,18 @@ func _firstMatch(
7979
try validateSubstring("A\(input)Z".dropFirst().dropLast())
8080
do {
8181
// Test sub-character slicing
82-
// TODO: Test sub-scalar slicing
83-
let substr = input + "\n"
84-
let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
85-
try validateSubstring(substr[..<prevIndex])
82+
let str = input + "\n"
83+
let prevIndex = str.unicodeScalars.index(str.endIndex, offsetBy: -1)
84+
try validateSubstring(str[..<prevIndex])
85+
}
86+
do {
87+
// Validate that we don't crash when sub-scalar slicing is used
88+
// Actual matching behavior is untested here
89+
let str = "\u{e9}\(input)e\u{e9}"
90+
let upper = str.utf8.index(before: str.endIndex)
91+
_ = try regex.firstMatch(in: str[..<upper])
92+
let lower = str.utf8.index(after: str.startIndex)
93+
_ = try regex.firstMatch(in: str[lower...])
8694
}
8795

8896
if validateOptimizations {

0 commit comments

Comments
 (0)