From 3146746120785e464dc44728a9462b2d78ef5094 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Mon, 16 May 2022 13:45:42 -0500
Subject: [PATCH 1/3] Limit CC ranges to single-scalar bounds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Character class ranges don't work well with multi-scalar inputs,
in either the range or the matched character. This change limits
range endpoints to single-scalar characters and matches only
characters that are themselves a single scalar.

Fixes issue #407, which now displays this behavior:

```
try /[1-2]/.wholeMatch(in: "1️⃣")     // nil
try /[12]/.wholeMatch(in: "1️⃣")      // nil
try /(?U)[\d]/.wholeMatch(in: "1️⃣")  // nil
```
---
 Sources/_RegexParser/Regex/AST/Atom.swift              | 2 +-
 Sources/_RegexParser/Utility/Misc.swift                | 7 +++++++
 Sources/_StringProcessing/ConsumerInterface.swift      | 6 ++++--
 Sources/_StringProcessing/Unicode/CharacterProps.swift | 7 -------
 Sources/_StringProcessing/_CharacterClassModel.swift   | 5 +++++
 Tests/RegexTests/ParseTests.swift                      | 2 ++
 6 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index eba720f9b..ffeb7c8e1 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -771,7 +771,7 @@ extension AST.Atom {
   /// range.
   public var isValidCharacterClassRangeBound: Bool {
     // If we have a literal character value for this, it can be used as a bound.
-    if literalCharacterValue != nil { return true }
+    if literalCharacterValue?.hasExactlyOneScalar == true { return true }
     switch kind {
     // \cx, \C-x, \M-x, \M-\C-x, \N{...}
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift
index d37dfbd4a..77275776c 100644
--- a/Sources/_RegexParser/Utility/Misc.swift
+++ b/Sources/_RegexParser/Utility/Misc.swift
@@ -19,6 +19,13 @@ extension Substring {
   var string: String { String(self) }
 }
 
+extension Character {
+  /// Whether this character is made up of exactly one Unicode scalar value.
+  public var hasExactlyOneScalar: Bool {
+    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
+  }
+}
+
 extension CustomStringConvertible {
   @_alwaysEmitIntoClient
   public var halfWidthCornerQuoted: String {
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index b6bbfd83e..3e436cd48 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -248,10 +248,10 @@ extension DSLTree.CustomCharacterClass.Member {
       return c
     case let .range(low, high):
       // TODO:
-      guard let lhs = low.literalCharacterValue else {
+      guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else {
         throw Unsupported("\(low) in range")
       }
-      guard let rhs = high.literalCharacterValue else {
+      guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else {
         throw Unsupported("\(high) in range")
       }
 
@@ -262,6 +262,7 @@ extension DSLTree.CustomCharacterClass.Member {
         return { input, bounds in
           // TODO: check for out of bounds?
           let curIdx = bounds.lowerBound
+          guard input[curIdx].hasExactlyOneScalar else { return nil }
           if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
             // TODO: semantic level
             return input.index(after: curIdx)
@@ -273,6 +274,7 @@ extension DSLTree.CustomCharacterClass.Member {
         return { input, bounds in
           // TODO: check for out of bounds?
           let curIdx = bounds.lowerBound
+          guard input[curIdx].hasExactlyOneScalar else { return nil }
           if (lhs...rhs).contains(input[curIdx]) {
             // TODO: semantic level
             return input.index(after: curIdx)
diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift
index 80f6819a6..e0be4e386 100644
--- a/Sources/_StringProcessing/Unicode/CharacterProps.swift
+++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift
@@ -11,10 +11,3 @@
 
 
 // TODO
-
-extension Character {
-  /// Whether this character is made up of exactly one Unicode scalar value.
-  var hasExactlyOneScalar: Bool {
-    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
-  }
-}
diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift
index 85dd1ca37..b53ac8576 100644
--- a/Sources/_StringProcessing/_CharacterClassModel.swift
+++ b/Sources/_StringProcessing/_CharacterClassModel.swift
@@ -103,6 +103,11 @@ public struct _CharacterClassModel: Hashable {
           return c == character
         }
       case .range(let range):
+        // Ranges can be formed with single-scalar characters, and can only
+        // match as such.
+        // FIXME: Convert to canonical composed version before testing?
+        guard character.hasExactlyOneScalar else { return false }
+        
         if options.isCaseInsensitive {
           let newLower = range.lowerBound.lowercased()
           let newUpper = range.upperBound.lowercased()
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index e36d84f60..60cddd4a0 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -2534,6 +2534,8 @@ extension RegexTests {
     diagnosticTest("[[:=:]]", .emptyProperty)
 
     diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand)
+    diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand)
+    diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand)
 
     diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
     diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))

From bae64011f9e50437b489c66148a5aaccabc22f03 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Wed, 18 May 2022 18:19:52 -0500
Subject: [PATCH 2/3] Fix matching semantics for custom character classes

This applies the current matching semantics for character classes,
matching either characters or Unicode scalars depending on the
current options.
---
 .../_StringProcessing/ConsumerInterface.swift | 77 ++++++++++++++-----
 Tests/RegexTests/MatchTests.swift             | 62 +++++++--------
 2 files changed, 85 insertions(+), 54 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 3e436cd48..73d9f7cd7 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -60,24 +60,53 @@ extension DSLTree.Atom {
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction? {
     let isCaseInsensitive = opts.isCaseInsensitive
-    
+    let isCharacterSemantics = opts.semanticLevel == .graphemeCluster
+
     switch self {
     case let .char(c):
-      // TODO: Match level?
       return { input, bounds in
-        let low = bounds.lowerBound
+        let nextIndex = isCharacterSemantics
+          ? input.index(after: bounds.lowerBound)
+          : input.unicodeScalars.index(after: bounds.lowerBound)
+
+        var curIdx = bounds.lowerBound
         if isCaseInsensitive && c.isCased {
-          return input[low].lowercased() == c.lowercased()
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx].lowercased() == c.lowercased()
+              ? nextIndex
+              : nil
+          } else {
+            // FIXME: How do multi-scalar characters match in case insensitive mode?
+            return input.unicodeScalars[curIdx].properties.lowercaseMapping == c.lowercased()
+              ? nextIndex
+              : nil
+          }
         } else {
-          return input[low] == c
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx] == c
+              ? nextIndex
+              : nil
+          } else {
+            // Try to match the sequence of unicodeScalars in `input` and `c`
+            var patternIndex = c.unicodeScalars.startIndex
+            while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
+              if input.unicodeScalars[curIdx] != c.unicodeScalars[patternIndex] {
+                return nil
+              }
+              input.unicodeScalars.formIndex(after: &curIdx)
+              c.unicodeScalars.formIndex(after: &patternIndex)
+            }
+            
+            // Match succeeded if all scalars in `c.unicodeScalars` matched
+            return patternIndex == c.unicodeScalars.endIndex
+              ? curIdx
+              : nil
+          }
         }
       }
     case let .scalar(s):
-      return consumeScalar {
+      let consume = consumeFunction(for: opts)
+      return consume {
         isCaseInsensitive
           ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
           : $0 == s
@@ -255,6 +284,8 @@ extension DSLTree.CustomCharacterClass.Member {
         throw Unsupported("\(high) in range")
       }
 
+      let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
+      
       if opts.isCaseInsensitive {
         let lhsLower = lhs.lowercased()
         let rhsLower = rhs.lowercased()
@@ -262,10 +293,15 @@ extension DSLTree.CustomCharacterClass.Member {
         return { input, bounds in
           // TODO: check for out of bounds?
           let curIdx = bounds.lowerBound
-          guard input[curIdx].hasExactlyOneScalar else { return nil }
-          if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
+          if isCharacterSemantic {
+            guard input[curIdx].hasExactlyOneScalar else { return nil }
+            if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
+              return input.index(after: curIdx)
+            }
+          } else {
+            if (lhsLower...rhsLower).contains(input.unicodeScalars[curIdx].properties.lowercaseMapping) {
+              return input.unicodeScalars.index(after: curIdx)
+            }
           }
           return nil
         }
@@ -274,10 +310,15 @@ extension DSLTree.CustomCharacterClass.Member {
         return { input, bounds in
           // TODO: check for out of bounds?
           let curIdx = bounds.lowerBound
-          guard input[curIdx].hasExactlyOneScalar else { return nil }
-          if (lhs...rhs).contains(input[curIdx]) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
+          if isCharacterSemantic {
+            guard input[curIdx].hasExactlyOneScalar else { return nil }
+            if (lhs...rhs).contains(input[curIdx]) {
+              return input.index(after: curIdx)
+            }
+          } else {
+            if (lhs...rhs).contains(Character(input.unicodeScalars[curIdx])) {
+              return input.unicodeScalars.index(after: curIdx)
+            }
           }
           return nil
         }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 4ae489300..fce627da8 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -673,6 +673,11 @@ extension RegexTests {
     }
     firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    firstMatchTest(#"[12]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
+    firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
+
     // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
@@ -1507,31 +1512,28 @@ extension RegexTests {
   }
   
   func testCanonicalEquivalenceCustomCharacterClass() throws {
-    // Expectation: Concatenations with custom character classes should be able
-    // to match within a grapheme cluster. That is, a regex should be able to
-    // match the scalar values that comprise a grapheme cluster in separate,
-    // or repeated, custom character classes.
+    // Expectation: Custom character class matches do not cross grapheme
+    // character boundaries by default. When matching with Unicode scalar
+    // semantics, grapheme cluster boundaries are ignored, so matching
+    // sequences of custom character classes can succeed.
     
     matchTest(
       #"[áéíóú]$"#,
       (eComposed, true),
       (eDecomposed, true))
 
-    // FIXME: Custom char classes don't use canonical equivalence with composed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed,
-              xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-              xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-              xfail: true)
+    // Unicode scalar semantics
+    firstMatchTest(#"(?u)e[\u{301}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e-e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
 
-    // FIXME: Custom char classes don't match decomposed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
-              xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-              xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-              xfail: true)
+    // Grapheme cluster semantics
+    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
 
     let flag = "🇰🇷"
     firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
@@ -1540,27 +1542,15 @@ extension RegexTests {
     firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
 
     // First Unicode scalar followed by CCC of regional indicators
-    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
-              xfail: true)
-
-    // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
+    firstMatchTest(#"(?u)^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag)
+    // A CCC of regional indicators followed by the second Unicode scalar
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag)
     // A CCC of regional indicators x 2
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
-              xfail: true)
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag)
 
-    // FIXME: A single CCC of regional indicators matches the whole flag character
-    // A CCC of regional indicators followed by the second Unicode scalar
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
-              xfail: true)
     // A single CCC of regional indicators
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
-              xfail: true)
-    
-    // A single CCC of actual flag emojis / combined regional indicators
-    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
-    // This succeeds (correctly) because \u{1F1F0} is lexicographically
-    // within the CCC range
-    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
+    firstMatchTest(#"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
+    firstMatchTest(#"^(?u)[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
   }
   
   func testAnyChar() throws {

From b716d50c504fb1c37888db230584afee1f9f0180 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Wed, 18 May 2022 23:33:15 -0500
Subject: [PATCH 3/3] Handle case insensitivity properly in CCC ranges

The prior implementation didn't make a lot of sense, and couldn't
handle cases like `/(?i)[X-c]/`. This new approach uses simple case
matching to test if the character is within the range, then tests if
the uppercase or lowercase mappings are within the range.

Fixes #395
---
 .../_StringProcessing/ConsumerInterface.swift | 62 +++++++++----------
 Tests/RegexTests/MatchTests.swift             | 27 ++++++++
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 73d9f7cd7..a538be5f3 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -276,54 +276,48 @@ extension DSLTree.CustomCharacterClass.Member {
       }
       return c
     case let .range(low, high):
-      // TODO:
       guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else {
         throw Unsupported("\(low) in range")
       }
       guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else {
         throw Unsupported("\(high) in range")
       }
+      guard lhs <= rhs else {
+        throw Unsupported("Invalid range \(low)-\(high)")
+      }
 
+      let isCaseInsensitive = opts.isCaseInsensitive
       let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
       
-      if opts.isCaseInsensitive {
-        let lhsLower = lhs.lowercased()
-        let rhsLower = rhs.lowercased()
-        guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if isCharacterSemantic {
-            guard input[curIdx].hasExactlyOneScalar else { return nil }
-            if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
-              return input.index(after: curIdx)
-            }
-          } else {
-            if (lhsLower...rhsLower).contains(input.unicodeScalars[curIdx].properties.lowercaseMapping) {
-              return input.unicodeScalars.index(after: curIdx)
-            }
-          }
+      return { input, bounds in
+        // TODO: check for out of bounds?
+        let curIdx = bounds.lowerBound
+        let nextIndex = isCharacterSemantic
+          ? input.index(after: curIdx)
+          : input.unicodeScalars.index(after: curIdx)
+        if isCharacterSemantic && !input[curIdx].hasExactlyOneScalar {
           return nil
         }
-      } else {
-        guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if isCharacterSemantic {
-            guard input[curIdx].hasExactlyOneScalar else { return nil }
-            if (lhs...rhs).contains(input[curIdx]) {
-              return input.index(after: curIdx)
-            }
-          } else {
-            if (lhs...rhs).contains(Character(input.unicodeScalars[curIdx])) {
-              return input.unicodeScalars.index(after: curIdx)
-            }
-          }
+        let scalar = input.unicodeScalars[curIdx]
+        let scalarRange = lhs.unicodeScalars.first! ... rhs.unicodeScalars.first!
+        if scalarRange.contains(scalar) {
+          return nextIndex
+        }
+        if !isCaseInsensitive {
           return nil
         }
+        
+        let stringRange = String(lhs)...String(rhs)
+        if (scalar.properties.changesWhenLowercased
+            && stringRange.contains(scalar.properties.lowercaseMapping))
+          || (scalar.properties.changesWhenUppercased
+            && stringRange.contains(scalar.properties.uppercaseMapping)) {
+          return nextIndex
+        }
+        
+        return nil
       }
-
+      
     case let .custom(ccc):
       return try ccc.generateConsumer(opts)
 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index fce627da8..847a3cefb 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -731,6 +731,33 @@ extension RegexTests {
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc",
                    syntax: .experimental)
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#)
+    
+    // Case sensitivity and ranges.
+    for ch in "abcD" {
+      firstMatchTest("[a-cD]", input: String(ch), match: String(ch))
+    }
+    for ch in "ABCd" {
+      firstMatchTest("[a-cD]", input: String(ch), match: nil)
+    }
+
+    for ch in "abcABCdD" {
+      firstMatchTest("(?i)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?i)[A-CD]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[A-CD]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcd" {
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcxyzABCdD" {
+      firstMatchTest("(?i)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[X-cD]", input: String(ch), match: String(ch))
+    }
   }
 
   func testCharacterProperties() {