From 0ddffdfa9e2f7efb719c0867dfee5d71eef2a88c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sun, 9 Jan 2022 18:25:06 -0600 Subject: [PATCH 1/5] Add tests for Character semantics in regex matching Several of these tests are xfail'd, as we haven't implemented this level of character semantics yet. --- Tests/RegexTests/MatchTests.swift | 151 ++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 9aa07a219..1c008beb1 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -831,5 +831,156 @@ extension RegexTests { matchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) matchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) } + + // MARK: Character Semantics + + var eComposed: String { "é" } + var eDecomposed: String { "e\u{301}" } + + func testCanonicalEquivalence() throws { + // Expectation: Matching should use canonical equivalence whenever comparing + // characters, so a user can write characters using any equivalent spelling + // in either a regex literal or the string targeted for matching. + + matchTests( + #"é$"#, + (eComposed, eComposed), + (eDecomposed, eDecomposed)) + + // FIXME: Decomposed character in regex literal doesn't match an equivalent character + matchTests( + #"e\u{301}$"#, + (eComposed, eComposed), + (eDecomposed, eDecomposed), + xfail: true) + + matchTests( + #"e$"#, + (eComposed, nil), + (eDecomposed, nil)) + } + + func testCanonicalEquivalenceCharacterClass() throws { + // Expectation: Character classes should match equivalent characters to the + // same degree, regardless of how they are spelled. Unicode "property + // classes" should match characters when all the code points that comprise + // the character are members of the property class. + + // \w + matchTests( + #"^\w$"#, + (eComposed, eComposed), + (eDecomposed, eDecomposed)) + // \p{Letter} + matchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) + // FIXME: \p{Letter} doesn't match a decomposed character + matchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, + xfail: true) + + // \d + matchTest(#"\d"#, input: "5", match: "5") + // FIXME: \d shouldn't match a digit composed with a non-digit character + matchTest(#"\d"#, input: "5\u{305}", match: nil, + xfail: true) + // \p{Number} + matchTest(#"\p{Number}"#, input: "5", match: "5") + // FIXME: \p{Number} shouldn't match a number composed with a non-number character + matchTest(#"\p{Number}"#, input: "5\u{305}", match: nil, + xfail: true) + + // \s + matchTest(#"\s"#, input: " ", match: " ") + // FIXME: \s shouldn't match a number composed with a non-number character + matchTest(#"\s\u{305}"#, input: " ", match: nil, + xfail: true) + // \p{Whitespace} + matchTest(#"\s"#, input: " ", match: " ") + // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character + matchTest(#"\s\u{305}"#, input: " ", match: nil, + xfail: true) + } + + func testCanonicalEquivalenceCustomCharacterClass() throws { + // Expectation: Concatenations with custom character classes should be able + // to match within a grapheme cluster. That is, a regex should be able to + // match the scalar values that comprise a grapheme cluster in separate, + // or repeated, custom character classes. + + matchTests( + #"[áéíóú]$"#, + (eComposed, eComposed), + (eDecomposed, eDecomposed)) + + // FIXME: Custom char classes don't use canonical equivalence with composed characters + matchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, + xfail: true) + matchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, + xfail: true) + matchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, + xfail: true) + + // FIXME: Custom char classes don't match decomposed characters + matchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, + xfail: true) + matchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, + xfail: true) + matchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, + xfail: true) + + let flag = "🇰🇷" + matchTest(#"🇰🇷"#, input: flag, match: flag) + matchTest(#"[🇰🇷]"#, input: flag, match: flag) + matchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) + + // First Unicode scalar followed by CCC of regional indicators + matchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + + // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + // A CCC of regional indicators x 2 + matchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, + xfail: true) + + // FIXME: A single CCC of regional indicators matches the whole flag character + // A CCC of regional indicators followed by the second Unicode scalar + matchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, + xfail: true) + // A single CCC of regional indicators + matchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, + xfail: true) + } + + func testAnyChar() throws { + // Expectation: \X and, in grapheme cluster mode, `.` should consume an + // entire character, regardless of how it's spelled. \O should consume only + // a single Unicode scalar value, leaving any other grapheme scalar + // components to be matched. + + matchTests( + #".\u{301}"#, + (eComposed, nil), + (eDecomposed, nil)) + matchTests( + #"\X\u{301}"#, + (eComposed, nil), + (eDecomposed, nil)) + + // FIXME: \O is unsupported + matchTest(#"\O\u{301}"#, input: eDecomposed, match: eDecomposed, + xfail: true) + matchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, + xfail: true) + // TODO: Should these two match or not? + matchTest(#"\O\u{301}"#, input: eComposed, match: eComposed, + xfail: true) + matchTest(#"e\O"#, input: eComposed, match: nil, + xfail: true) + + // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.` + matchTests( + #"(?U).\u{301}"#, + (eComposed, eComposed), + (eDecomposed, eDecomposed), + xfail: true) + } } From a8aa4339ac8487791504ac4ad5f998e8e7132aac Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 11 Jan 2022 11:47:00 -0600 Subject: [PATCH 2/5] Add a couple more semantics tests --- Tests/RegexTests/MatchTests.swift | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 1c008beb1..bd0466776 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -888,6 +888,11 @@ extension RegexTests { matchTest(#"\p{Number}"#, input: "5\u{305}", match: nil, xfail: true) + // Should this match the '5' but not the ZWJ, or should it treat '5'+ZWJ + // as one entity and fail to match altogether? + matchTest(#"^\d"#, input: "5\u{200d}0", match: "5", + xfail: true) + // \s matchTest(#"\s"#, input: " ", match: " ") // FIXME: \s shouldn't match a number composed with a non-number character @@ -947,6 +952,12 @@ extension RegexTests { // A single CCC of regional indicators matchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, xfail: true) + + // A single CCC of actual flag emojis / combined regional indicators + matchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) + // This succeeds (correctly) because \u{1F1F0} is lexicographically + // within the CCC range + matchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") } func testAnyChar() throws { From ef371a3f7dc7b9f2f42e5c5d17946156ba0d48d7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 11 Jan 2022 15:06:37 -0600 Subject: [PATCH 3/5] Add tests for individual scalar matching --- Tests/RegexTests/MatchTests.swift | 37 ++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index bd0466776..a2751e548 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -837,6 +837,30 @@ extension RegexTests { var eComposed: String { "é" } var eDecomposed: String { "e\u{301}" } + func testIndividualScalars() { + // Expectation: A standalone Unicode scalar value in a regex literal + // can match either that specific scalar value or participate in matching + // as a character. + + matchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) + // FIXME: Decomposed character in regex literal doesn't match an equivalent character + matchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, + xfail: true) + + matchTest(#"\u{65}"#, input: eDecomposed, match: "e") + matchTest(#"\u{65}$"#, input: eDecomposed, match: nil) + // FIXME: \y is unsupported + matchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, + xfail: true) + + // FIXME: Unicode scalars are only matched at the start of a grapheme cluster + matchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", + xfail: true) + // FIXME: \y is unsupported + matchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, + xfail: true) + } + func testCanonicalEquivalence() throws { // Expectation: Matching should use canonical equivalence whenever comparing // characters, so a user can write characters using any equivalent spelling @@ -846,20 +870,20 @@ extension RegexTests { #"é$"#, (eComposed, eComposed), (eDecomposed, eDecomposed)) - + // FIXME: Decomposed character in regex literal doesn't match an equivalent character matchTests( #"e\u{301}$"#, (eComposed, eComposed), (eDecomposed, eDecomposed), xfail: true) - + matchTests( #"e$"#, (eComposed, nil), (eDecomposed, nil)) } - + func testCanonicalEquivalenceCharacterClass() throws { // Expectation: Character classes should match equivalent characters to the // same degree, regardless of how they are spelled. Unicode "property @@ -980,8 +1004,7 @@ extension RegexTests { xfail: true) matchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, xfail: true) - // TODO: Should these two match or not? - matchTest(#"\O\u{301}"#, input: eComposed, match: eComposed, + matchTest(#"\O\u{301}"#, input: eComposed, match: nil, xfail: true) matchTest(#"e\O"#, input: eComposed, match: nil, xfail: true) @@ -993,5 +1016,9 @@ extension RegexTests { (eDecomposed, eDecomposed), xfail: true) } + + // TODO: Add test for implied grapheme cluster requirement at group boundaries + + // TODO: Add test for grapheme boundaries at start/end of match } From 925b6e9e8e7eaa29169e85fc7a4aba03cb575d9d Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 10:05:53 -0600 Subject: [PATCH 4/5] Add test for Unicode scalar mode `.` --- Tests/RegexTests/MatchTests.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a2751e548..6b1f768a9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -990,6 +990,9 @@ extension RegexTests { // a single Unicode scalar value, leaving any other grapheme scalar // components to be matched. + matchTest(#"(?u:.)"#, input: eDecomposed, match: "e", + xfail: true) + matchTests( #".\u{301}"#, (eComposed, nil), From b4bce2f5a4bdddb74734b51ecd6b80da28d32f72 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Jan 2022 10:38:29 -0600 Subject: [PATCH 5/5] Update for new test methods --- Tests/RegexTests/MatchTests.swift | 124 +++++++++++++++--------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3d9fdf60e..1717b6a7d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1108,22 +1108,22 @@ extension RegexTests { // can match either that specific scalar value or participate in matching // as a character. - matchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) // FIXME: Decomposed character in regex literal doesn't match an equivalent character - matchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, + firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, xfail: true) - matchTest(#"\u{65}"#, input: eDecomposed, match: "e") - matchTest(#"\u{65}$"#, input: eDecomposed, match: nil) + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e") + firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is unsupported - matchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, + firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, xfail: true) // FIXME: Unicode scalars are only matched at the start of a grapheme cluster - matchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", + firstMatchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", xfail: true) // FIXME: \y is unsupported - matchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, + firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, xfail: true) } @@ -1132,22 +1132,22 @@ extension RegexTests { // characters, so a user can write characters using any equivalent spelling // in either a regex literal or the string targeted for matching. - matchTests( + matchTest( #"é$"#, - (eComposed, eComposed), - (eDecomposed, eDecomposed)) + (eComposed, true), + (eDecomposed, true)) // FIXME: Decomposed character in regex literal doesn't match an equivalent character - matchTests( + matchTest( #"e\u{301}$"#, - (eComposed, eComposed), - (eDecomposed, eDecomposed), + (eComposed, true), + (eDecomposed, true), xfail: true) - matchTests( + matchTest( #"e$"#, - (eComposed, nil), - (eDecomposed, nil)) + (eComposed, false), + (eDecomposed, false)) } func testCanonicalEquivalenceCharacterClass() throws { @@ -1157,41 +1157,41 @@ extension RegexTests { // the character are members of the property class. // \w - matchTests( + matchTest( #"^\w$"#, - (eComposed, eComposed), - (eDecomposed, eDecomposed)) + (eComposed, true), + (eDecomposed, true)) // \p{Letter} - matchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) + firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) // FIXME: \p{Letter} doesn't match a decomposed character - matchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, xfail: true) // \d - matchTest(#"\d"#, input: "5", match: "5") + firstMatchTest(#"\d"#, input: "5", match: "5") // FIXME: \d shouldn't match a digit composed with a non-digit character - matchTest(#"\d"#, input: "5\u{305}", match: nil, + firstMatchTest(#"\d"#, input: "5\u{305}", match: nil, xfail: true) // \p{Number} - matchTest(#"\p{Number}"#, input: "5", match: "5") + firstMatchTest(#"\p{Number}"#, input: "5", match: "5") // FIXME: \p{Number} shouldn't match a number composed with a non-number character - matchTest(#"\p{Number}"#, input: "5\u{305}", match: nil, + firstMatchTest(#"\p{Number}"#, input: "5\u{305}", match: nil, xfail: true) // Should this match the '5' but not the ZWJ, or should it treat '5'+ZWJ // as one entity and fail to match altogether? - matchTest(#"^\d"#, input: "5\u{200d}0", match: "5", + firstMatchTest(#"^\d"#, input: "5\u{200d}0", match: "5", xfail: true) // \s - matchTest(#"\s"#, input: " ", match: " ") + firstMatchTest(#"\s"#, input: " ", match: " ") // FIXME: \s shouldn't match a number composed with a non-number character - matchTest(#"\s\u{305}"#, input: " ", match: nil, + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, xfail: true) // \p{Whitespace} - matchTest(#"\s"#, input: " ", match: " ") + firstMatchTest(#"\s"#, input: " ", match: " ") // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character - matchTest(#"\s\u{305}"#, input: " ", match: nil, + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, xfail: true) } @@ -1201,53 +1201,53 @@ extension RegexTests { // match the scalar values that comprise a grapheme cluster in separate, // or repeated, custom character classes. - matchTests( + matchTest( #"[áéíóú]$"#, - (eComposed, eComposed), - (eDecomposed, eDecomposed)) + (eComposed, true), + (eDecomposed, true)) // FIXME: Custom char classes don't use canonical equivalence with composed characters - matchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, + firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, xfail: true) - matchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, xfail: true) - matchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, xfail: true) // FIXME: Custom char classes don't match decomposed characters - matchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, xfail: true) - matchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, xfail: true) - matchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, xfail: true) let flag = "🇰🇷" - matchTest(#"🇰🇷"#, input: flag, match: flag) - matchTest(#"[🇰🇷]"#, input: flag, match: flag) - matchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) + firstMatchTest(#"🇰🇷"#, input: flag, match: flag) + firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - matchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character // A CCC of regional indicators x 2 - matchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, + firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, xfail: true) // FIXME: A single CCC of regional indicators matches the whole flag character // A CCC of regional indicators followed by the second Unicode scalar - matchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, + firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, xfail: true) // A single CCC of regional indicators - matchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, + firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, xfail: true) // A single CCC of actual flag emojis / combined regional indicators - matchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) + firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) // This succeeds (correctly) because \u{1F1F0} is lexicographically // within the CCC range - matchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") + firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") } func testAnyChar() throws { @@ -1256,33 +1256,33 @@ extension RegexTests { // a single Unicode scalar value, leaving any other grapheme scalar // components to be matched. - matchTest(#"(?u:.)"#, input: eDecomposed, match: "e", + firstMatchTest(#"(?u:.)"#, input: eDecomposed, match: "e", xfail: true) - matchTests( + matchTest( #".\u{301}"#, - (eComposed, nil), - (eDecomposed, nil)) - matchTests( + (eComposed, false), + (eDecomposed, false)) + matchTest( #"\X\u{301}"#, - (eComposed, nil), - (eDecomposed, nil)) + (eComposed, false), + (eDecomposed, false)) // FIXME: \O is unsupported - matchTest(#"\O\u{301}"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"\O\u{301}"#, input: eDecomposed, match: eDecomposed, xfail: true) - matchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, + firstMatchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, xfail: true) - matchTest(#"\O\u{301}"#, input: eComposed, match: nil, + firstMatchTest(#"\O\u{301}"#, input: eComposed, match: nil, xfail: true) - matchTest(#"e\O"#, input: eComposed, match: nil, + firstMatchTest(#"e\O"#, input: eComposed, match: nil, xfail: true) // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.` - matchTests( + matchTest( #"(?U).\u{301}"#, - (eComposed, eComposed), - (eDecomposed, eDecomposed), + (eComposed, true), + (eDecomposed, true), xfail: true) }