From 3146746120785e464dc44728a9462b2d78ef5094 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 16 May 2022 13:45:42 -0500 Subject: [PATCH 1/3] Limit CC ranges to single-scalar bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Character class ranges don't work well with multi-scalar inputs, in either the range or the matched character. This change limits range endpoints to single-scalar characters and matches only characters that are themselves a single scalar. Fixes issue #407, which now displays this behavior: ``` try /[1-2]/.wholeMatch(in: "1️⃣") // nil try /[12]/.wholeMatch(in: "1️⃣") // nil try /(?U)[\d]/.wholeMatch(in: "1️⃣") // nil ``` --- Sources/_RegexParser/Regex/AST/Atom.swift | 2 +- Sources/_RegexParser/Utility/Misc.swift | 7 +++++++ Sources/_StringProcessing/ConsumerInterface.swift | 6 ++++-- Sources/_StringProcessing/Unicode/CharacterProps.swift | 7 ------- Sources/_StringProcessing/_CharacterClassModel.swift | 5 +++++ Tests/RegexTests/ParseTests.swift | 2 ++ 6 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index eba720f9b..ffeb7c8e1 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -771,7 +771,7 @@ extension AST.Atom { /// range. public var isValidCharacterClassRangeBound: Bool { // If we have a literal character value for this, it can be used as a bound. - if literalCharacterValue != nil { return true } + if literalCharacterValue?.hasExactlyOneScalar == true { return true } switch kind { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index d37dfbd4a..77275776c 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -19,6 +19,13 @@ extension Substring { var string: String { String(self) } } +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. + public var hasExactlyOneScalar: Bool { + unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex + } +} + extension CustomStringConvertible { @_alwaysEmitIntoClient public var halfWidthCornerQuoted: String { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b6bbfd83e..3e436cd48 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -248,10 +248,10 @@ extension DSLTree.CustomCharacterClass.Member { return c case let .range(low, high): // TODO: - guard let lhs = low.literalCharacterValue else { + guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue else { + guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else { throw Unsupported("\(high) in range") } @@ -262,6 +262,7 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in // TODO: check for out of bounds? let curIdx = bounds.lowerBound + guard input[curIdx].hasExactlyOneScalar else { return nil } if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { // TODO: semantic level return input.index(after: curIdx) @@ -273,6 +274,7 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in // TODO: check for out of bounds? let curIdx = bounds.lowerBound + guard input[curIdx].hasExactlyOneScalar else { return nil } if (lhs...rhs).contains(input[curIdx]) { // TODO: semantic level return input.index(after: curIdx) diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index 80f6819a6..e0be4e386 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -11,10 +11,3 @@ // TODO - -extension Character { - /// Whether this character is made up of exactly one Unicode scalar value. - var hasExactlyOneScalar: Bool { - unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex - } -} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 85dd1ca37..b53ac8576 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -103,6 +103,11 @@ public struct _CharacterClassModel: Hashable { return c == character } case .range(let range): + // Ranges can be formed with single-scalar characters, and can only + // match as such. + // FIXME: Convert to canonical composed version before testing? + guard character.hasExactlyOneScalar else { return false } + if options.isCaseInsensitive { let newLower = range.lowerBound.lowercased() let newUpper = range.upperBound.lowercased() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e36d84f60..60cddd4a0 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2534,6 +2534,8 @@ extension RegexTests { diagnosticTest("[[:=:]]", .emptyProperty) diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand) diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) From bae64011f9e50437b489c66148a5aaccabc22f03 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 18 May 2022 18:19:52 -0500 Subject: [PATCH 2/3] Fix matching semantics for custom character classes This applies the current matching semantics for character classes, matching either characters or Unicode scalars depending on the current options. --- .../_StringProcessing/ConsumerInterface.swift | 77 ++++++++++++++----- Tests/RegexTests/MatchTests.swift | 62 +++++++-------- 2 files changed, 85 insertions(+), 54 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 3e436cd48..73d9f7cd7 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -60,24 +60,53 @@ extension DSLTree.Atom { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { let isCaseInsensitive = opts.isCaseInsensitive - + let isCharacterSemantics = opts.semanticLevel == .graphemeCluster + switch self { case let .char(c): - // TODO: Match level? return { input, bounds in - let low = bounds.lowerBound + let nextIndex = isCharacterSemantics + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) + + var curIdx = bounds.lowerBound if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil + if isCharacterSemantics { + return input[curIdx].lowercased() == c.lowercased() + ? nextIndex + : nil + } else { + // FIXME: How do multi-scalar characters match in case insensitive mode? + return input.unicodeScalars[curIdx].properties.lowercaseMapping == c.lowercased() + ? nextIndex + : nil + } } else { - return input[low] == c - ? input.index(after: low) - : nil + if isCharacterSemantics { + return input[curIdx] == c + ? nextIndex + : nil + } else { + // Try to match the sequence of unicodeScalars in `input` and `c` + var patternIndex = c.unicodeScalars.startIndex + while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex { + if input.unicodeScalars[curIdx] != c.unicodeScalars[patternIndex] { + return nil + } + input.unicodeScalars.formIndex(after: &curIdx) + c.unicodeScalars.formIndex(after: &patternIndex) + } + + // Match succeeded if all scalars in `c.unicodeScalars` matched + return patternIndex == c.unicodeScalars.endIndex + ? curIdx + : nil + } } } case let .scalar(s): - return consumeScalar { + let consume = consumeFunction(for: opts) + return consume { isCaseInsensitive ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping : $0 == s @@ -255,6 +284,8 @@ extension DSLTree.CustomCharacterClass.Member { throw Unsupported("\(high) in range") } + let isCharacterSemantic = opts.semanticLevel == .graphemeCluster + if opts.isCaseInsensitive { let lhsLower = lhs.lowercased() let rhsLower = rhs.lowercased() @@ -262,10 +293,15 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in // TODO: check for out of bounds? let curIdx = bounds.lowerBound - guard input[curIdx].hasExactlyOneScalar else { return nil } - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - // TODO: semantic level - return input.index(after: curIdx) + if isCharacterSemantic { + guard input[curIdx].hasExactlyOneScalar else { return nil } + if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { + return input.index(after: curIdx) + } + } else { + if (lhsLower...rhsLower).contains(input.unicodeScalars[curIdx].properties.lowercaseMapping) { + return input.unicodeScalars.index(after: curIdx) + } } return nil } @@ -274,10 +310,15 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in // TODO: check for out of bounds? let curIdx = bounds.lowerBound - guard input[curIdx].hasExactlyOneScalar else { return nil } - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + if isCharacterSemantic { + guard input[curIdx].hasExactlyOneScalar else { return nil } + if (lhs...rhs).contains(input[curIdx]) { + return input.index(after: curIdx) + } + } else { + if (lhs...rhs).contains(Character(input.unicodeScalars[curIdx])) { + return input.unicodeScalars.index(after: curIdx) + } } return nil } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4ae489300..fce627da8 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -673,6 +673,11 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + firstMatchTest(#"[12]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣") + firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil) + // Currently not supported in the matching engine. for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", @@ -1507,31 +1512,28 @@ extension RegexTests { } func testCanonicalEquivalenceCustomCharacterClass() throws { - // Expectation: Concatenations with custom character classes should be able - // to match within a grapheme cluster. That is, a regex should be able to - // match the scalar values that comprise a grapheme cluster in separate, - // or repeated, custom character classes. + // Expectation: Custom character class matches do not cross grapheme + // character boundaries by default. When matching with Unicode scalar + // semantics, grapheme cluster boundaries are ignored, so matching + // sequences of custom character classes can succeed. matchTest( #"[áéíóú]$"#, (eComposed, true), (eDecomposed, true)) - // FIXME: Custom char classes don't use canonical equivalence with composed characters - firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) + // Unicode scalar semantics + firstMatchTest(#"(?u)e[\u{301}]$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)[e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)[e-e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed) - // FIXME: Custom char classes don't match decomposed characters - firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + // Grapheme cluster semantics + firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: nil) + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: nil) + firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: eComposed, match: nil) + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: nil) let flag = "🇰🇷" firstMatchTest(#"🇰🇷"#, input: flag, match: flag) @@ -1540,27 +1542,15 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, - xfail: true) - - // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + firstMatchTest(#"(?u)^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag) + // A CCC of regional indicators followed by the second Unicode scalar + firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag) // A CCC of regional indicators x 2 - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, - xfail: true) + firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag) - // FIXME: A single CCC of regional indicators matches the whole flag character - // A CCC of regional indicators followed by the second Unicode scalar - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, - xfail: true) // A single CCC of regional indicators - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, - xfail: true) - - // A single CCC of actual flag emojis / combined regional indicators - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) - // This succeeds (correctly) because \u{1F1F0} is lexicographically - // within the CCC range - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") + firstMatchTest(#"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) + firstMatchTest(#"^(?u)[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) } func testAnyChar() throws { From b716d50c504fb1c37888db230584afee1f9f0180 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 18 May 2022 23:33:15 -0500 Subject: [PATCH 3/3] Handle case insensitivity properly in CCC ranges The prior implementation didn't make a lot of sense, and couldn't handle cases like `/(?i)[X-c]/`. This new approach uses simple case matching to test if the character is within the range, then tests if the uppercase or lowercase mappings are within the range. Fixes #395 --- .../_StringProcessing/ConsumerInterface.swift | 62 +++++++++---------- Tests/RegexTests/MatchTests.swift | 27 ++++++++ 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 73d9f7cd7..a538be5f3 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -276,54 +276,48 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - // TODO: guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else { throw Unsupported("\(low) in range") } guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else { throw Unsupported("\(high) in range") } + guard lhs <= rhs else { + throw Unsupported("Invalid range \(low)-\(high)") + } + let isCaseInsensitive = opts.isCaseInsensitive let isCharacterSemantic = opts.semanticLevel == .graphemeCluster - if opts.isCaseInsensitive { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if isCharacterSemantic { - guard input[curIdx].hasExactlyOneScalar else { return nil } - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - return input.index(after: curIdx) - } - } else { - if (lhsLower...rhsLower).contains(input.unicodeScalars[curIdx].properties.lowercaseMapping) { - return input.unicodeScalars.index(after: curIdx) - } - } + return { input, bounds in + // TODO: check for out of bounds? + let curIdx = bounds.lowerBound + let nextIndex = isCharacterSemantic + ? input.index(after: curIdx) + : input.unicodeScalars.index(after: curIdx) + if isCharacterSemantic && !input[curIdx].hasExactlyOneScalar { return nil } - } else { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if isCharacterSemantic { - guard input[curIdx].hasExactlyOneScalar else { return nil } - if (lhs...rhs).contains(input[curIdx]) { - return input.index(after: curIdx) - } - } else { - if (lhs...rhs).contains(Character(input.unicodeScalars[curIdx])) { - return input.unicodeScalars.index(after: curIdx) - } - } + let scalar = input.unicodeScalars[curIdx] + let scalarRange = lhs.unicodeScalars.first! ... rhs.unicodeScalars.first! + if scalarRange.contains(scalar) { + return nextIndex + } + if !isCaseInsensitive { return nil } + + let stringRange = String(lhs)...String(rhs) + if (scalar.properties.changesWhenLowercased + && stringRange.contains(scalar.properties.lowercaseMapping)) + || (scalar.properties.changesWhenUppercased + && stringRange.contains(scalar.properties.uppercaseMapping)) { + return nextIndex + } + + return nil } - + case let .custom(ccc): return try ccc.generateConsumer(opts) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index fce627da8..847a3cefb 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -731,6 +731,33 @@ extension RegexTests { firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) + + // Case sensitivity and ranges. + for ch in "abcD" { + firstMatchTest("[a-cD]", input: String(ch), match: String(ch)) + } + for ch in "ABCd" { + firstMatchTest("[a-cD]", input: String(ch), match: nil) + } + + for ch in "abcABCdD" { + firstMatchTest("(?i)[a-cd]", input: String(ch), match: String(ch)) + firstMatchTest("(?i)[A-CD]", input: String(ch), match: String(ch)) + firstMatchTest("(?iu)[a-cd]", input: String(ch), match: String(ch)) + firstMatchTest("(?iu)[A-CD]", input: String(ch), match: String(ch)) + } + + for ch in "XYZ[\\]^_`abcd" { + firstMatchTest("[X-cd]", input: String(ch), match: String(ch)) + firstMatchTest("[X-cd]", input: String(ch), match: String(ch)) + firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch)) + firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch)) + } + + for ch in "XYZ[\\]^_`abcxyzABCdD" { + firstMatchTest("(?i)[X-cd]", input: String(ch), match: String(ch)) + firstMatchTest("(?iu)[X-cD]", input: String(ch), match: String(ch)) + } } func testCharacterProperties() {