Skip to content

Commit 3902577

Browse files
authored
Add tests for Character semantics in regex matching (#102)
Several of these tests are xfail'd, as we haven't implemented this level of character semantics yet.
1 parent 32256d6 commit 3902577

File tree

1 file changed

+192
-0
lines changed

1 file changed

+192
-0
lines changed

Tests/RegexTests/MatchTests.swift

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,5 +1097,197 @@ extension RegexTests {
10971097
("-123e1.2", nil)
10981098
)
10991099
}
1100+
1101+
// MARK: Character Semantics
1102+
1103+
/// "é" as a single precomposed scalar, U+00E9 (LATIN SMALL LETTER E WITH ACUTE).
///
/// Spelled with an explicit scalar escape so the value cannot silently become
/// the decomposed form if the source file is ever re-normalized by an editor
/// or tool; the tests rely on this being exactly one Unicode scalar.
var eComposed: String { "\u{E9}" }
1104+
/// "é" spelled as two scalars: U+0065 ("e") followed by U+0301 (combining
/// acute accent). Together they form a single `Character`.
var eDecomposed: String {
  let baseLetterPlusCombiningAcute = "e\u{301}"
  return baseLetterPlusCombiningAcute
}
1105+
1106+
func testIndividualScalars() {
  // Expectation: a Unicode scalar written on its own in a regex literal may
  // either match exactly that scalar, or take part in matching as (part of)
  // a full character.
  let composed = eComposed
  let decomposed = eDecomposed

  // The scalar pair in the pattern matches the same scalar pair in the input.
  firstMatchTest(
    #"\u{65}\u{301}$"#,
    input: decomposed,
    match: decomposed)
  // FIXME: A decomposed character in the regex literal doesn't match the
  // canonically equivalent precomposed character.
  firstMatchTest(
    #"\u{65}\u{301}$"#,
    input: composed,
    match: composed,
    xfail: true)

  // A lone base scalar matches the leading "e" of the decomposed input, but
  // not when it is anchored to the end of the input.
  firstMatchTest(
    #"\u{65}"#,
    input: decomposed,
    match: "e")
  firstMatchTest(
    #"\u{65}$"#,
    input: decomposed,
    match: nil)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\u{65}\y"#,
    input: decomposed,
    match: nil,
    xfail: true)

  // FIXME: Unicode scalars are only matched at the start of a grapheme
  // cluster, so a lone combining scalar is not found.
  firstMatchTest(
    #"\u{301}"#,
    input: decomposed,
    match: "\u{301}",
    xfail: true)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\y\u{301}"#,
    input: decomposed,
    match: nil,
    xfail: true)
}
1129+
1130+
func testCanonicalEquivalence() throws {
  // Expectation: character comparisons during matching honor canonical
  // equivalence, so any equivalent spelling may be used both in the regex
  // literal and in the string being matched.
  let composed = eComposed
  let decomposed = eDecomposed

  // A precomposed character in the pattern matches both spellings.
  matchTest(#"é$"#, (composed, true), (decomposed, true))

  // FIXME: Decomposed character in regex literal doesn't match an equivalent
  // character
  matchTest(#"e\u{301}$"#, (composed, true), (decomposed, true), xfail: true)

  // A bare "e" at the end of input matches neither spelling of "é".
  matchTest(#"e$"#, (composed, false), (decomposed, false))
}
1152+
1153+
func testCanonicalEquivalenceCharacterClass() throws {
  // Expectation: Character classes should match equivalent characters to the
  // same degree, regardless of how they are spelled. Unicode "property
  // classes" should match characters when all the code points that comprise
  // the character are members of the property class.
  //
  // Each class below is exercised the same way: one positive case on a plain
  // character, then a negative case whose *input* is that character combined
  // with U+0305 (combining overline), which is not a member of the class.

  // \w
  matchTest(
    #"^\w$"#,
    (eComposed, true),
    (eDecomposed, true))
  // \p{Letter}
  firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
  // FIXME: \p{Letter} doesn't match a decomposed character
  firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed,
                 xfail: true)

  // \d
  firstMatchTest(#"\d"#, input: "5", match: "5")
  // FIXME: \d shouldn't match a digit composed with a non-digit character
  firstMatchTest(#"\d"#, input: "5\u{305}", match: nil,
                 xfail: true)
  // \p{Number}
  firstMatchTest(#"\p{Number}"#, input: "5", match: "5")
  // FIXME: \p{Number} shouldn't match a number composed with a non-number character
  firstMatchTest(#"\p{Number}"#, input: "5\u{305}", match: nil,
                 xfail: true)

  // Should this match the '5' but not the ZWJ, or should it treat '5'+ZWJ
  // as one entity and fail to match altogether?
  firstMatchTest(#"^\d"#, input: "5\u{200d}0", match: "5",
                 xfail: true)

  // \s
  firstMatchTest(#"\s"#, input: " ", match: " ")
  // FIXME: \s shouldn't match whitespace composed with a non-whitespace character
  // The combining scalar belongs in the input (as in the \d case above), not
  // in the pattern: the question is whether \s accepts the composed character.
  firstMatchTest(#"\s"#, input: " \u{305}", match: nil,
                 xfail: true)
  // \p{Whitespace}
  // Uses the property class itself so this section does not merely repeat
  // the \s cases.
  firstMatchTest(#"\p{Whitespace}"#, input: " ", match: " ")
  // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character
  firstMatchTest(#"\p{Whitespace}"#, input: " \u{305}", match: nil,
                 xfail: true)
}
1197+
1198+
func testCanonicalEquivalenceCustomCharacterClass() throws {
  // Expectation: when custom character classes are concatenated, the regex
  // should be able to match *inside* a grapheme cluster, i.e. the scalars
  // making up one cluster may be matched by separate, or repeated, custom
  // character classes.
  let composed = eComposed
  let decomposed = eDecomposed

  // A class listing precomposed characters matches both spellings.
  matchTest(#"[áéíóú]$"#, (composed, true), (decomposed, true))

  // FIXME: Custom char classes don't use canonical equivalence with composed
  // characters
  firstMatchTest(
    #"e[\u{301}]$"#,
    input: composed, match: composed, xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#,
    input: composed, match: composed, xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#,
    input: composed, match: composed, xfail: true)

  // FIXME: Custom char classes don't match decomposed characters
  firstMatchTest(
    #"e[\u{301}]$"#,
    input: decomposed, match: decomposed, xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#,
    input: decomposed, match: decomposed, xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#,
    input: decomposed, match: decomposed, xfail: true)

  // A flag emoji: one character built from two regional-indicator scalars
  // (U+1F1F0 U+1F1F7).
  let regionalFlag = "🇰🇷"
  firstMatchTest(#"🇰🇷"#, input: regionalFlag, match: regionalFlag)
  firstMatchTest(#"[🇰🇷]"#, input: regionalFlag, match: regionalFlag)
  firstMatchTest(
    #"\u{1F1F0}\u{1F1F7}"#,
    input: regionalFlag, match: regionalFlag)

  // First Unicode scalar followed by CCC of regional indicators
  firstMatchTest(
    #"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#,
    input: regionalFlag, match: regionalFlag)

  // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag
  // character
  // A CCC of regional indicators x 2
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]{2}"#,
    input: regionalFlag, match: regionalFlag, xfail: true)

  // FIXME: A single CCC of regional indicators matches the whole flag
  // character
  // A CCC of regional indicators followed by the second Unicode scalar
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#,
    input: regionalFlag, match: regionalFlag, xfail: true)
  // A single CCC of regional indicators
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]"#,
    input: regionalFlag, match: nil, xfail: true)

  // A single CCC of actual flag emojis / combined regional indicators
  firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: regionalFlag, match: regionalFlag)
  // This succeeds (correctly) because \u{1F1F0} is lexicographically
  // within the CCC range
  firstMatchTest(
    #"[🇦🇫-🇿🇼]"#,
    input: "\u{1F1F0}abc", match: "\u{1F1F0}")
}
1252+
1253+
func testAnyChar() throws {
  // Expectation: \X always consumes a whole character, however it is
  // spelled, and so does `.` when in grapheme cluster mode. \O consumes just
  // one Unicode scalar value, leaving the remaining scalars of the grapheme
  // available to be matched.
  let composed = eComposed
  let decomposed = eDecomposed

  // `.` under (?u:) is expected to yield only the leading "e"; currently
  // marked as an expected failure.
  firstMatchTest(
    #"(?u:.)"#,
    input: decomposed, match: "e", xfail: true)

  // `.` and \X are expected to leave no trailing U+0301 to match, for
  // either spelling of the input.
  matchTest(#".\u{301}"#, (composed, false), (decomposed, false))
  matchTest(#"\X\u{301}"#, (composed, false), (decomposed, false))

  // FIXME: \O is unsupported
  firstMatchTest(
    #"\O\u{301}"#,
    input: decomposed, match: decomposed, xfail: true)
  firstMatchTest(
    #"e\O"#,
    input: decomposed, match: decomposed, xfail: true)
  firstMatchTest(
    #"\O\u{301}"#,
    input: composed, match: nil, xfail: true)
  firstMatchTest(
    #"e\O"#,
    input: composed, match: nil, xfail: true)

  // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.`
  matchTest(
    #"(?U).\u{301}"#,
    (composed, true), (decomposed, true),
    xfail: true)
}
1288+
1289+
// TODO: Add test for implied grapheme cluster requirement at group boundaries
1290+
1291+
// TODO: Add test for grapheme boundaries at start/end of match
11001292
}
11011293

0 commit comments

Comments
 (0)