From 5b157cf61cad8f6e1b89c5edb28fe8fed88215b0 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Mon, 22 May 2023 11:40:01 -0500
Subject: [PATCH 1/8] Handle boundaries when matching in substrings

Some of our existing matching routines use the start/endIndex
of the input, which is basically never the right thing to do.

This change revises those checks to use the search bounds, by
either moving the boundary check out of the matching method, or
if the boundary is a part of what needs to be matched (e.g.
word boundaries have different behavior at the start/end than
in the middle of a string) the search bounds are passed into
the matching method.

Testing is currently handled by piggy-backing on the existing
match tests; we should add more tests to handle substring-
specific edge cases.
---
 .../_StringProcessing/Engine/MEBuiltins.swift | 16 ++----
 .../_StringProcessing/Engine/MEQuantify.swift |  5 +-
 .../_StringProcessing/Engine/Processor.swift  | 15 ++---
 .../Unicode/WordBreaking.swift                | 11 ++--
 Tests/RegexTests/MatchTests.swift             | 56 +++++++++++++++++++
 5 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index 5e446b472..649ee9720 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -15,7 +15,7 @@ extension Processor {
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchBuiltinCC(
+    guard currentPosition < end, let next = input.matchBuiltinCC(
       cc,
       at: currentPosition,
       isInverted: isInverted,
@@ -102,18 +102,16 @@ extension Processor {
 
     case .wordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
 
     case .notWordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
     }
   }
@@ -127,9 +125,7 @@ extension String {
     at currentPosition: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    assert(currentPosition < endIndex)
     if case .definite(let result) = _quickMatchAnyNonNewline(
       at: currentPosition,
       isScalarSemantics: isScalarSemantics
@@ -194,9 +190,7 @@ extension String {
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    assert(currentPosition < endIndex)
     if case .definite(let result) = _quickMatchBuiltinCC(
       cc,
       at: currentPosition,
diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
index 1ff734ccd..2baa3d7ab 100644
--- a/Sources/_StringProcessing/Engine/MEQuantify.swift
+++ b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -17,7 +17,7 @@ extension Processor {
         boundaryCheck: !isScalarSemantics,
         isCaseInsensitive: false)
     case .builtin:
-      // FIXME: bounds check? endIndex or end?
+      guard currentPosition < end else { return nil }
 
       // We only emit .quantify if it consumes a single character
       return input.matchBuiltinCC(
@@ -27,8 +27,7 @@ extension Processor {
         isStrictASCII: payload.builtinIsStrict,
         isScalarSemantics: isScalarSemantics)
     case .any:
-      // FIXME: endIndex or end?
-      guard currentPosition < input.endIndex else { return nil }
+      guard currentPosition < end else { return nil }
 
       if payload.anyMatchesNewline {
         if isScalarSemantics {
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 0350a37db..07dda212e 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -232,7 +232,7 @@ extension Processor {
   mutating func match(
     _ e: Element, isCaseInsensitive: Bool
   ) -> Bool {
-    guard let next = input.match(
+    guard currentPosition < end, let next = input.match(
       e,
       at: currentPosition,
       limitedBy: end,
@@ -251,7 +251,7 @@ extension Processor {
     _ seq: Substring,
     isScalarSemantics: Bool
   ) -> Bool  {
-    guard let next = input.matchSeq(
+    guard currentPosition < end, let next = input.matchSeq(
       seq,
       at: currentPosition,
       limitedBy: end,
@@ -270,7 +270,7 @@ extension Processor {
     boundaryCheck: Bool,
     isCaseInsensitive: Bool
   ) -> Bool {
-    guard let next = input.matchScalar(
+    guard currentPosition < end, let next = input.matchScalar(
       s,
       at: currentPosition,
       limitedBy: end,
@@ -291,7 +291,7 @@ extension Processor {
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchBitset(
+    guard currentPosition < end, let next = input.matchBitset(
       bitset,
       at: currentPosition,
       limitedBy: end,
@@ -308,7 +308,7 @@ extension Processor {
   mutating func matchAnyNonNewline(
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchAnyNonNewline(
+    guard currentPosition < end, let next = input.matchAnyNonNewline(
       at: currentPosition,
       isScalarSemantics: isScalarSemantics
     ) else {
@@ -538,7 +538,8 @@ extension Processor {
       let reg = payload.consumer
       guard currentPosition < searchBounds.upperBound,
             let nextIndex = registers[reg](
-              input, currentPosition..<searchBounds.upperBound)
+              input, currentPosition..<searchBounds.upperBound),
+            nextIndex <= end
       else {
         signalFailure()
         return
@@ -565,7 +566,7 @@ extension Processor {
       do {
         guard let (nextIdx, val) = try matcher(
           input, currentPosition, searchBounds
-        ) else {
+        ), nextIdx <= end else {
           signalFailure()
           return
         }
diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift
index f1f9573c1..02ec150cb 100644
--- a/Sources/_StringProcessing/Unicode/WordBreaking.swift
+++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift
@@ -37,7 +37,9 @@ extension Processor {
     if currentPosition == subjectBounds.lowerBound {
       return matchesWord(at: currentPosition)
     }
-    let priorIdx = input.index(before: currentPosition)
+    let priorIdx = semanticLevel == .graphemeCluster
+      ? input.index(before: currentPosition)
+      : input.unicodeScalars.index(before: currentPosition)
     if currentPosition == subjectBounds.upperBound {
       return matchesWord(at: priorIdx)
     }
@@ -51,11 +53,12 @@ extension Processor {
 extension String {
   func isOnWordBoundary(
     at i: String.Index,
+    in range: Range<String.Index>,
     using cache: inout Set<String.Index>?,
     _ maxIndex: inout String.Index?
   ) -> Bool {
     // TODO: needs benchmark coverage
-    guard i != startIndex, i != endIndex else {
+    guard i != range.lowerBound, i != range.upperBound else {
       return true
     }
     
@@ -76,9 +79,9 @@ extension String {
     
     if #available(SwiftStdlib 5.7, *) {
       var indices: Set<String.Index> = []
-      var j = maxIndex ?? startIndex
+      var j = maxIndex ?? range.lowerBound
       
-      while j < endIndex, j <= i {
+      while j < range.upperBound, j <= i {
         indices.insert(j)
         j = _wordIndex(after: j)
       }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 3fc547e34..8d825aecc 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -30,6 +30,61 @@ func _firstMatch(
 ) throws -> (String, [String?])? {
   var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel)
   let result = try regex.firstMatch(in: input)
+  
+  func validateSubstring(_ substringInput: Substring) throws {
+    // Sometimes the characters we add to a substring merge with existing
+    // string members. This messes up cross-validation, so skip the test.
+    guard input == substringInput else { return }
+    
+    let substringResult = try regex.firstMatch(in: substringInput)
+    switch (result, substringResult) {
+    case (nil, nil):
+      break
+    case let (result?, substringResult?):
+      if substringResult.range.upperBound > substringInput.endIndex {
+        throw MatchError("Range exceeded substring upper bound for \(input) and \(regexStr)")
+      }
+      let stringMatch = input[result.range]
+      let substringMatch = substringInput[substringResult.range]
+      if stringMatch != substringMatch {
+        throw MatchError("""
+        Pattern: '\(regexStr)'
+        String match returned: '\(stringMatch)'
+        Substring match returned: '\(substringMatch)'
+        """)
+      }
+    case (.some(let result), nil):
+      throw MatchError("""
+        Pattern: '\(regexStr)'
+        Input: '\(input)'
+        Substring '\(substringInput)' ('\(substringInput.base)')
+        String match returned: '\(input[result.range])'
+        Substring match returned: nil
+        """)
+    case (nil, .some(let substringResult)):
+      throw MatchError("""
+        Pattern: '\(regexStr)'
+        Input: '\(input)'
+        Substring '\(substringInput)' ('\(substringInput.base)')
+        String match returned: nil
+        Substring match returned: '\(substringInput[substringResult.range])'
+        """)
+    }
+  }
+  
+  if !input.isEmpty {
+    try validateSubstring("\(input)\(input.last!)".dropLast())
+  }
+  try validateSubstring("\(input)\n".dropLast())
+  try validateSubstring("A\(input)Z".dropFirst().dropLast())
+
+// This is causing out-of-bounds indexing errors:
+//  do {
+//    // Test sub-character slicing
+//    let substr = input + "\n"
+//    let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
+//    try validateSubstring(substr[..<prevIndex])
+//  }
 
   if validateOptimizations {
     assert(regex._forceAction(.addOptions(.disableOptimizations)))
@@ -52,6 +107,7 @@ func _firstMatch(
       }
     }
   }
+  
   guard let result = result else { return nil }
   let caps = result.output.slices(from: input)
   return (String(input[result.range]), caps.map { $0.map(String.init) })

From e6946933b7a07a26e731fc602ba6f265082043ae Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 25 May 2023 14:34:25 -0500
Subject: [PATCH 2/8] Handle sub-character substring boundaries

This change passes the end boundary down into matching methods, and
uses it to find the actual character that is part of the input
substring, even if the substring's end boundary is in the middle of
a grapheme cluster.
---
 .../_StringProcessing/ConsumerInterface.swift | 16 +++--
 .../_StringProcessing/Engine/MEBuiltins.swift | 64 ++++++++++++-------
 .../_StringProcessing/Engine/MEQuantify.swift |  5 +-
 .../_StringProcessing/Engine/Processor.swift  | 59 +++++++----------
 Sources/_StringProcessing/Executor.swift      |  3 +-
 Sources/_StringProcessing/Unicode/ASCII.swift | 20 +++---
 .../_CharacterClassModel.swift                |  6 +-
 Tests/RegexTests/MatchTests.swift             | 15 ++---
 8 files changed, 102 insertions(+), 86 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 705b354fb..808a1e498 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -68,14 +68,16 @@ extension Character {
     switch opts.semanticLevel {
     case .graphemeCluster:
       return { input, bounds in
-        let low = bounds.lowerBound
+        guard let (char, next) = input.characterAndEnd(
+          at: bounds.lowerBound, limitedBy: bounds.upperBound)
+        else { return nil }
         if isCaseInsensitive && isCased {
-          return input[low].lowercased() == lowercased()
-            ? input.index(after: low)
+          return char.lowercased() == self.lowercased()
+            ? next
             : nil
         } else {
-          return input[low] == self
-            ? input.index(after: low)
+          return char == self
+            ? next
             : nil
         }
       }
@@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass {
   func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
     let model = asRuntimeModel(opts)
     return { input, bounds in
-      model.matches(in: input, at: bounds.lowerBound)
+      model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound)
     }
   }
 }
@@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass {
       }
       if isInverted {
         return opts.semanticLevel == .graphemeCluster
-          ? input.index(after: bounds.lowerBound)
+          ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
           : input.unicodeScalars.index(after: bounds.lowerBound)
       }
       return nil
diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index 649ee9720..049557053 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -18,6 +18,7 @@ extension Processor {
     guard currentPosition < end, let next = input.matchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
@@ -119,35 +120,44 @@ extension Processor {
 
 // MARK: Matching `.`
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
-
+  func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
+    guard pos < end else { return nil }
+    let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
+    return self[pos..<next].first.map { ($0, next) }
+  }
+  
   func matchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics
     ) {
       assert(result == _thoroughMatchAnyNonNewline(
         at: currentPosition,
+        limitedBy: end,
         isScalarSemantics: isScalarSemantics))
       return result
     }
     return _thoroughMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics)
   }
 
   @inline(__always)
   private func _quickMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    guard currentPosition < end else { return .definite(nil) }
     guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: currentPosition
+      at: currentPosition, limitedBy: end
     ) else {
       return .unknown
     }
@@ -163,37 +173,39 @@ extension String {
   @inline(never)
   private func _thoroughMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
     if isScalarSemantics {
+      guard currentPosition < end else { return nil }
       let scalar = unicodeScalars[currentPosition]
       guard !scalar.isNewline else { return nil }
       return unicodeScalars.index(after: currentPosition)
     }
 
-    let char = self[currentPosition]
-    guard !char.isNewline else { return nil }
-    return index(after: currentPosition)
+    guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end),
+          !char.isNewline
+    else { return nil }
+    return next
   }
 }
 
 // MARK: - Built-in character class matching
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
-
   // Mentioned in ProgrammersManual.md, update docs if redesigned
   func matchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
@@ -201,6 +213,7 @@ extension String {
       assert(result == _thoroughMatchBuiltinCC(
         cc,
         at: currentPosition,
+        limitedBy: end,
         isInverted: isInverted,
         isStrictASCII: isStrictASCII,
         isScalarSemantics: isScalarSemantics))
@@ -209,6 +222,7 @@ extension String {
     return _thoroughMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics)
@@ -219,13 +233,17 @@ extension String {
   private func _quickMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    guard currentPosition < end else { return .definite(nil) }
     guard let (next, result) = _quickMatch(
-      cc, at: currentPosition, isScalarSemantics: isScalarSemantics
+      cc,
+      at: currentPosition,
+      limitedBy: end,
+      isScalarSemantics: isScalarSemantics
     ) else {
       return .unknown
     }
@@ -237,12 +255,16 @@ extension String {
   private func _thoroughMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
-    let char = self[currentPosition]
+    // TODO: Branch here on scalar semantics
+    // Don't want to pay character cost if unnecessary
+    guard var (char, next) =
+            characterAndEnd(at: currentPosition, limitedBy: end)
+    else { return nil }
     let scalar = unicodeScalars[currentPosition]
 
     let asciiCheck = !isStrictASCII
@@ -250,14 +272,8 @@ extension String {
     || char.isASCII
 
     var matched: Bool
-    var next: String.Index
-    switch (isScalarSemantics, cc) {
-    case (_, .anyGrapheme):
-      next = index(after: currentPosition)
-    case (true, _):
+    if isScalarSemantics && cc != .anyGrapheme {
       next = unicodeScalars.index(after: currentPosition)
-    case (false, _):
-      next = index(after: currentPosition)
     }
 
     switch cc {
@@ -285,7 +301,7 @@ extension String {
       if isScalarSemantics {
         matched = scalar.isNewline && asciiCheck
         if matched && scalar == "\r"
-            && next != endIndex && unicodeScalars[next] == "\n" {
+            && next != end && unicodeScalars[next] == "\n" {
           // Match a full CR-LF sequence even in scalar semantics
           unicodeScalars.formIndex(after: &next)
         }
diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
index 2baa3d7ab..c6f55ee34 100644
--- a/Sources/_StringProcessing/Engine/MEQuantify.swift
+++ b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -23,6 +23,7 @@ extension Processor {
       return input.matchBuiltinCC(
         payload.builtin,
         at: currentPosition,
+        limitedBy: end,
         isInverted: payload.builtinIsInverted,
         isStrictASCII: payload.builtinIsStrict,
         isScalarSemantics: isScalarSemantics)
@@ -37,7 +38,9 @@ extension Processor {
       }
 
       return input.matchAnyNonNewline(
-        at: currentPosition, isScalarSemantics: isScalarSemantics)
+        at: currentPosition,
+        limitedBy: end,
+        isScalarSemantics: isScalarSemantics)
     }
   }
 
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 07dda212e..976f41f7f 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -232,7 +232,7 @@ extension Processor {
   mutating func match(
     _ e: Element, isCaseInsensitive: Bool
   ) -> Bool {
-    guard currentPosition < end, let next = input.match(
+    guard let next = input.match(
       e,
       at: currentPosition,
       limitedBy: end,
@@ -251,7 +251,7 @@ extension Processor {
     _ seq: Substring,
     isScalarSemantics: Bool
   ) -> Bool  {
-    guard currentPosition < end, let next = input.matchSeq(
+    guard let next = input.matchSeq(
       seq,
       at: currentPosition,
       limitedBy: end,
@@ -270,7 +270,7 @@ extension Processor {
     boundaryCheck: Bool,
     isCaseInsensitive: Bool
   ) -> Bool {
-    guard currentPosition < end, let next = input.matchScalar(
+    guard let next = input.matchScalar(
       s,
       at: currentPosition,
       limitedBy: end,
@@ -291,7 +291,7 @@ extension Processor {
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard currentPosition < end, let next = input.matchBitset(
+    guard let next = input.matchBitset(
       bitset,
       at: currentPosition,
       limitedBy: end,
@@ -308,8 +308,9 @@ extension Processor {
   mutating func matchAnyNonNewline(
     isScalarSemantics: Bool
   ) -> Bool {
-    guard currentPosition < end, let next = input.matchAnyNonNewline(
+    guard let next = input.matchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics
     ) else {
       signalFailure()
@@ -536,9 +537,9 @@ extension Processor {
 
     case .consumeBy:
       let reg = payload.consumer
+      let consumer = registers[reg]
       guard currentPosition < searchBounds.upperBound,
-            let nextIndex = registers[reg](
-              input, currentPosition..<searchBounds.upperBound),
+            let nextIndex = consumer(input, currentPosition..<searchBounds.upperBound),
             nextIndex <= end
       else {
         signalFailure()
@@ -653,20 +654,16 @@ extension String {
   ) -> Index? {
     // TODO: This can be greatly sped up with string internals
     // TODO: This is also very much quick-check-able
-    assert(end <= endIndex)
-
-    guard pos < end else { return nil }
+    guard let (stringChar, next) = characterAndEnd(at: pos, limitedBy: end)
+    else { return nil }
 
     if isCaseInsensitive {
-      guard self[pos].lowercased() == char.lowercased() else { return nil }
+      guard stringChar.lowercased() == char.lowercased() else { return nil }
     } else {
-      guard self[pos] == char else { return nil }
+      guard stringChar == char else { return nil }
     }
 
-    let idx = index(after: pos)
-    guard idx <= end else { return nil }
-
-    return idx
+    return next
   }
 
   func matchSeq(
@@ -677,8 +674,6 @@ extension String {
   ) -> Index? {
     // TODO: This can be greatly sped up with string internals
     // TODO: This is also very much quick-check-able
-    assert(end <= endIndex)
-
     var cur = pos
 
     if isScalarSemantics {
@@ -688,8 +683,10 @@ extension String {
       }
     } else {
       for e in seq {
-        guard cur < end, self[cur] == e else { return nil }
-        self.formIndex(after: &cur)
+        guard let (char, next) = characterAndEnd(at: cur, limitedBy: end),
+              char == e
+        else { return nil }
+        cur = next
       }
     }
 
@@ -706,8 +703,6 @@ extension String {
   ) -> Index? {
     // TODO: extremely quick-check-able
     // TODO: can be sped up with string internals
-    assert(end <= endIndex)
-
     guard pos < end else { return nil }
     let curScalar = unicodeScalars[pos]
 
@@ -720,6 +715,8 @@ extension String {
       guard curScalar == scalar else { return nil }
     }
 
+    // TODO: Is this the correct behavior for a sub-scalar substring end?
+    // Can a malformed Unicode scalar match anything?
     let idx = unicodeScalars.index(after: pos)
     guard idx <= end else { return nil }
 
@@ -738,22 +735,14 @@ extension String {
   ) -> Index? {
     // TODO: extremely quick-check-able
     // TODO: can be sped up with string internals
-    assert(end <= endIndex)
-
-    guard pos < end else { return nil }
-
-    let idx: String.Index
     if isScalarSemantics {
+      guard pos < end else { return nil }
       guard bitset.matches(unicodeScalars[pos]) else { return nil }
-      idx = unicodeScalars.index(after: pos)
+      return unicodeScalars.index(after: pos)
     } else {
-      guard bitset.matches(self[pos]) else { return nil }
-      idx = index(after: pos)
+      guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
+            bitset.matches(char) else { return nil }
+      return next
     }
-
-    guard idx <= end else { return nil }
-    return idx
   }
-
-
 }
diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift
index 0453fcd80..92fd91b7d 100644
--- a/Sources/_StringProcessing/Executor.swift
+++ b/Sources/_StringProcessing/Executor.swift
@@ -43,7 +43,8 @@ struct Executor {
       }
       if low >= high { return nil }
       if graphemeSemantic {
-        input.formIndex(after: &low)
+        low = input.index(
+          low, offsetBy: 1, limitedBy: searchBounds.upperBound) ?? searchBounds.upperBound
       } else {
         input.unicodeScalars.formIndex(after: &low)
       }
diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift
index de13e340a..c13f95213 100644
--- a/Sources/_StringProcessing/Unicode/ASCII.swift
+++ b/Sources/_StringProcessing/Unicode/ASCII.swift
@@ -85,11 +85,14 @@ extension String {
   /// and we can give the right `next` index, not requiring the caller to re-adjust it
   /// TODO: detailed description of nuanced semantics
   func _quickASCIICharacter(
-    at idx: Index
+    at idx: Index,
+    limitedBy end: Index
   ) -> (first: UInt8, next: Index, crLF: Bool)? {
     // TODO: fastUTF8 version
-
-    if idx == endIndex {
+    assert(String.Index(idx, within: unicodeScalars) != nil)
+    assert(idx <= end)
+    
+    if idx == end {
       return nil
     }
     let base = utf8[idx]
@@ -99,8 +102,8 @@ extension String {
     }
 
     var next = utf8.index(after: idx)
-    if next == utf8.endIndex {
-      assert(self[idx].isASCII)
+    if next == end {
+      // assert(self[idx].isASCII)
       return (first: base, next: next, crLF: false)
     }
 
@@ -110,10 +113,10 @@ extension String {
     // Handle CR-LF:
     if base == ._carriageReturn && tail == ._lineFeed {
       utf8.formIndex(after: &next)
-      guard next == endIndex || utf8[next]._isSub300StartingByte else {
+      guard next == end || utf8[next]._isSub300StartingByte else {
         return nil
       }
-      assert(self[idx] == "\r\n")
+      // assert(self[idx] == "\r\n")
       return (first: base, next: next, crLF: true)
     }
 
@@ -124,11 +127,12 @@ extension String {
   func _quickMatch(
     _ cc: _CharacterClassModel.Representation,
     at idx: Index,
+    limitedBy end: Index,
     isScalarSemantics: Bool
   ) -> (next: Index, matchResult: Bool)? {
     /// ASCII fast-paths
     guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: idx
+      at: idx, limitedBy: end
     ) else {
       return nil
     }
diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift
index c053b31e4..53e7b2e65 100644
--- a/Sources/_StringProcessing/_CharacterClassModel.swift
+++ b/Sources/_StringProcessing/_CharacterClassModel.swift
@@ -68,12 +68,13 @@ struct _CharacterClassModel: Hashable {
   /// - Returns: The index of the end of the match, or `nil` if there is no match.
   func matches(
     in input: String,
-    at currentPosition: String.Index
+    at currentPosition: String.Index,
+    limitedBy end: String.Index
   ) -> String.Index? {
     // FIXME: This is only called in custom character classes that contain builtin
     // character classes as members (ie: [a\w] or set operations), is there
     // any way to avoid that? Can we remove this somehow?
-    guard currentPosition != input.endIndex else {
+    guard currentPosition < end else {
       return nil
     }
 
@@ -82,6 +83,7 @@ struct _CharacterClassModel: Hashable {
     return input.matchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics)
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 8d825aecc..f548ffb13 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -77,14 +77,13 @@ func _firstMatch(
   }
   try validateSubstring("\(input)\n".dropLast())
   try validateSubstring("A\(input)Z".dropFirst().dropLast())
-
-// This is causing out-of-bounds indexing errors:
-//  do {
-//    // Test sub-character slicing
-//    let substr = input + "\n"
-//    let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
-//    try validateSubstring(substr[..<prevIndex])
-//  }
+  do {
+    // Test sub-character slicing
+    // TODO: Test sub-scalar slicing
+    let substr = input + "\n"
+    let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
+    try validateSubstring(substr[..<prevIndex])
+  }
 
   if validateOptimizations {
     assert(regex._forceAction(.addOptions(.disableOptimizations)))

From 90242540c79ecc855fe36cebc7a378cc6ed84575 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Fri, 26 May 2023 10:30:43 -0500
Subject: [PATCH 3/8] Clean up, test mid-scalar substring boundaries

---
 .../_StringProcessing/Engine/MEBuiltins.swift | 20 +++++++++++++++++++
 .../Unicode/WordBreaking.swift                |  3 ++-
 Tests/RegexTests/MatchTests.swift             | 16 +++++++++++----
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index 049557053..e117e260d 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -120,7 +120,27 @@ extension Processor {
 
 // MARK: Matching `.`
 extension String {
+  /// Returns the character at `pos`, bounded by `end`, as well as the upper
+  /// boundary of the returned character.
+  ///
+  /// This function handles loading a character from a string while respecting
+  /// an end boundary, even if that end boundary is sub-character or sub-scalar.
+  ///
+  /// - Parameters:
+  ///   - pos: The position to load a character from.
+  ///   - end: The limit for the character at `pos`.
+  /// - Returns:
+  ///   - If `pos` is at or past `end`, this function returns `nil`.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
+  ///     `end` is before `self.index(after: pos)`, then the returned character
+  ///     is smaller than the one that would be produced by `self[pos]` and the
+  ///     returned index is at the end of that character.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary, and
+  ///     is not on a Unicode scalar boundary, the partial scalar is dropped. This
+  ///     can result in a `nil` return or a character that includes only part of
+  ///     the `self[pos]` character.
   func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
+    // FIXME: Sink into the stdlib to avoid multiple boundary calculations
     guard pos < end else { return nil }
     let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
     return self[pos..<next].first.map { ($0, next) }
diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift
index 02ec150cb..10aadde32 100644
--- a/Sources/_StringProcessing/Unicode/WordBreaking.swift
+++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift
@@ -61,7 +61,8 @@ extension String {
     guard i != range.lowerBound, i != range.upperBound else {
       return true
     }
-    
+    assert(range.contains(i))
+
     // If our index is already in our cache, then this is obviously on a
     // boundary.
     if let cache = cache, cache.contains(i) {
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index f548ffb13..e5bb574e6 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -79,10 +79,18 @@ func _firstMatch(
   try validateSubstring("A\(input)Z".dropFirst().dropLast())
   do {
     // Test sub-character slicing
-    // TODO: Test sub-scalar slicing
-    let substr = input + "\n"
-    let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
-    try validateSubstring(substr[..<prevIndex])
+    let str = input + "\n"
+    let prevIndex = str.unicodeScalars.index(str.endIndex, offsetBy: -1)
+    try validateSubstring(str[..<prevIndex])
+  }
+  do {
+    // Validate that we don't crash when sub-scalar slicing is used
+    // Actual matching behavior is untested here
+    let str = "\u{e9}\(input)e\u{e9}"
+    let upper = str.utf8.index(before: str.endIndex)
+    _ = try regex.firstMatch(in: str[..<upper])
+    let lower = str.utf8.index(after: str.startIndex)
+    _ = try regex.firstMatch(in: str[lower...])
   }
 
   if validateOptimizations {

From 616022042cc679ce9151a74ecac1e307f6161b3a Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Fri, 26 May 2023 13:34:54 -0500
Subject: [PATCH 4/8] Use substring end to handle unaligned indices

---
 .../_StringProcessing/Engine/MEBuiltins.swift   | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index e117e260d..258bce1f7 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -125,11 +125,7 @@ extension String {
   ///
   /// This function handles loading a character from a string while respecting
   /// an end boundary, even if that end boundary is sub-character or sub-scalar.
-  ///
-  /// - Parameters:
-  ///   - pos: The position to load a character from.
-  ///   - end: The limit for the character at `pos`.
-  /// - Returns:
+
   ///   - If `pos` is at or past `end`, this function returns `nil`.
   ///   - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
   ///     `end` is before `self.index(after: pos)`, then the returned character
@@ -139,11 +135,20 @@ extension String {
   ///     is not on a Unicode scalar boundary, the partial scalar is dropped. This
   ///     can result in a `nil` return or a character that includes only part of
   ///     the `self[pos]` character.
+  ///
+  /// - Parameters:
+  ///   - pos: The position to load a character from.
+  ///   - end: The limit for the character at `pos`.
+  /// - Returns: The character at `pos`, bounded by `end`, if it exists, along
+  ///   with the upper bound of that character. The upper bound is always
+  ///   scalar-aligned.
   func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
     // FIXME: Sink into the stdlib to avoid multiple boundary calculations
     guard pos < end else { return nil }
     let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
-    return self[pos..<next].first.map { ($0, next) }
+    // Substring will round down non-scalar aligned indices
+    let substr = self[pos..<next]
+    return substr.first.map { ($0, substr.endIndex) }
   }
   
   func matchAnyNonNewline(

From b331bc073f4bcfe84c7af05eebbc6736820dd887 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Mon, 31 Jul 2023 10:32:36 -0500
Subject: [PATCH 5/8] Remove unused asserts

---
 Sources/_StringProcessing/Unicode/ASCII.swift | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift
index c13f95213..53dfe652d 100644
--- a/Sources/_StringProcessing/Unicode/ASCII.swift
+++ b/Sources/_StringProcessing/Unicode/ASCII.swift
@@ -103,7 +103,6 @@ extension String {
 
     var next = utf8.index(after: idx)
     if next == end {
-      // assert(self[idx].isASCII)
       return (first: base, next: next, crLF: false)
     }
 
@@ -116,7 +115,6 @@ extension String {
       guard next == end || utf8[next]._isSub300StartingByte else {
         return nil
       }
-      // assert(self[idx] == "\r\n")
       return (first: base, next: next, crLF: true)
     }
 

From 5114ea45d0b8f0680318e19ccf792f0eb2257143 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Wed, 2 Aug 2023 12:38:14 -0500
Subject: [PATCH 6/8] Improve characterAndEnd algorithm a bit

---
 .../_StringProcessing/Engine/MEBuiltins.swift | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index 258bce1f7..f0de423a7 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -125,7 +125,7 @@ extension String {
   ///
   /// This function handles loading a character from a string while respecting
   /// an end boundary, even if that end boundary is sub-character or sub-scalar.
-
+  ///
   ///   - If `pos` is at or past `end`, this function returns `nil`.
   ///   - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
   ///     `end` is before `self.index(after: pos)`, then the returned character
@@ -145,10 +145,20 @@ extension String {
   func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
     // FIXME: Sink into the stdlib to avoid multiple boundary calculations
     guard pos < end else { return nil }
-    let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
-    // Substring will round down non-scalar aligned indices
-    let substr = self[pos..<next]
-    return substr.first.map { ($0, substr.endIndex) }
+    let next = index(after: pos)
+    if next <= end {
+      return (self[pos], next)
+    }
+
+    // `end` must be a sub-character position that is between `pos` and the
+    // next grapheme boundary. This is okay if `end` is on a Unicode scalar
+    // boundary, but if it's in the middle of a scalar's code units, there
+    // may not be a character to return at all after rounding down. Use
+    // `Substring`'s rounding to determine what we can return.
+    let substr = self[pos..<end]
+    return substr.isEmpty
+      ? nil
+      : (substr.first!, substr.endIndex)
   }
   
   func matchAnyNonNewline(

From 656c388ddd003348cc6d69ef67c67f0693caac4f Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 3 Aug 2023 15:51:46 -0500
Subject: [PATCH 7/8] Remove an unnecessary end check

---
 Sources/_StringProcessing/Engine/MEBuiltins.swift | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index f0de423a7..0dafd6720 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -190,7 +190,7 @@ extension String {
     limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    guard currentPosition < end else { return .definite(nil) }
+    assert(currentPosition < end)
     guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
       at: currentPosition, limitedBy: end
     ) else {
@@ -273,7 +273,7 @@ extension String {
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    guard currentPosition < end else { return .definite(nil) }
+    assert(currentPosition < end)
     guard let (next, result) = _quickMatch(
       cc,
       at: currentPosition,
@@ -336,7 +336,7 @@ extension String {
       if isScalarSemantics {
         matched = scalar.isNewline && asciiCheck
         if matched && scalar == "\r"
-            && next != end && unicodeScalars[next] == "\n" {
+            && next < end && unicodeScalars[next] == "\n" {
           // Match a full CR-LF sequence even in scalar semantics
           unicodeScalars.formIndex(after: &next)
         }

From 0f4f00585c0b0e68ab286c250a17df1190136b12 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 3 Aug 2023 16:21:00 -0500
Subject: [PATCH 8/8] Remove an unnecessary check for sub-scalar upper bound

Substrings cannot have sub-Unicode scalar boundaries as of Swift
5.7; we can remove a check for this when matching an individual
scalar.
---
 Sources/_StringProcessing/Engine/Processor.swift | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 976f41f7f..6ecc49df7 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -715,10 +715,8 @@ extension String {
       guard curScalar == scalar else { return nil }
     }
 
-    // TODO: Is this the correct behavior for a sub-scalar substring end?
-    // Can a malformed Unicode scalar match anything?
     let idx = unicodeScalars.index(after: pos)
-    guard idx <= end else { return nil }
+    assert(idx <= end, "Input is a substring with a sub-scalar endIndex.")
 
     if boundaryCheck && !isOnGraphemeClusterBoundary(idx) {
       return nil