@@ -1097,5 +1097,197 @@ extension RegexTests {
1097
1097
( " -123e1.2 " , nil )
1098
1098
)
1099
1099
}
1100
+
1101
+ // MARK: Character Semantics
1102
+
1103
/// The letter "é" spelled as a single precomposed scalar (U+00E9).
///
/// Written as an explicit scalar escape so the normalization form cannot be
/// changed silently by an editor or by copy/paste.
var eComposed: String { "\u{E9}" }
1104
/// The letter "é" spelled as two scalars: the base letter "e" (U+0065)
/// immediately followed by COMBINING ACUTE ACCENT (U+0301).
///
/// The two scalars must be adjacent, with no padding, so that together they
/// form one grapheme cluster.
var eDecomposed: String { "e\u{301}" }
1105
+
1106
/// Exercises patterns written as individual Unicode scalar escapes against
/// the decomposed and precomposed spellings of "é".
///
/// Every case that passes `xfail: true` is paired with a FIXME describing
/// the gap it records.
func testIndividualScalars() {
  // Expectation: A standalone Unicode scalar value in a regex literal
  // can match either that specific scalar value or participate in matching
  // as a character.

  firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
  // FIXME: Decomposed character in regex literal doesn't match an equivalent character
  firstMatchTest(
    #"\u{65}\u{301}$"#, input: eComposed, match: eComposed,
    xfail: true)

  // A lone base scalar is expected to match only "e", with no surrounding
  // text, and not to match when anchored to the end of the input, because
  // the combining accent follows it.
  firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e")
  firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\u{65}\y"#, input: eDecomposed, match: nil,
    xfail: true)

  // FIXME: Unicode scalars are only matched at the start of a grapheme cluster
  firstMatchTest(
    #"\u{301}"#, input: eDecomposed, match: "\u{301}",
    xfail: true)
  // FIXME: \y is unsupported
  firstMatchTest(
    #"\y\u{301}"#, input: eDecomposed, match: nil,
    xfail: true)
}
1129
+
1130
/// Checks that a pattern and its input may each spell "é" in either
/// canonically equivalent form, and that a bare "e" matches neither spelling.
func testCanonicalEquivalence() throws {
  // Expectation: Matching should use canonical equivalence whenever comparing
  // characters, so a user can write characters using any equivalent spelling
  // in either a regex literal or the string targeted for matching.

  // Precomposed character in the pattern.
  matchTest(#"é$"#, (eComposed, true), (eDecomposed, true))

  // Base letter plus combining accent in the pattern.
  // FIXME: Decomposed character in regex literal doesn't match an equivalent character
  matchTest(#"e\u{301}$"#, (eComposed, true), (eDecomposed, true), xfail: true)

  // The unaccented letter alone is expected to match neither spelling.
  matchTest(#"e$"#, (eComposed, false), (eDecomposed, false))
}
1152
+
1153
/// Checks built-in classes (`\w`, `\d`, `\s`) and Unicode property classes
/// (`\p{Letter}`, `\p{Number}`, `\p{Whitespace}`) against equivalent
/// spellings of a character, and against a member character combined with a
/// non-member scalar.
func testCanonicalEquivalenceCharacterClass() throws {
  // Expectation: Character classes should match equivalent characters to the
  // same degree, regardless of how they are spelled. Unicode "property
  // classes" should match characters when all the code points that comprise
  // the character are members of the property class.

  // \w
  matchTest(
    #"^\w$"#,
    (eComposed, true),
    (eDecomposed, true))
  // \p{Letter}
  firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
  // FIXME: \p{Letter} doesn't match a decomposed character
  firstMatchTest(
    #"\p{Letter}$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)

  // \d
  firstMatchTest(#"\d"#, input: "5", match: "5")
  // FIXME: \d shouldn't match a digit composed with a non-digit character
  firstMatchTest(
    #"\d"#, input: "5\u{305}", match: nil,
    xfail: true)
  // \p{Number}
  firstMatchTest(#"\p{Number}"#, input: "5", match: "5")
  // FIXME: \p{Number} shouldn't match a number composed with a non-number character
  firstMatchTest(
    #"\p{Number}"#, input: "5\u{305}", match: nil,
    xfail: true)

  // Should this match the '5' but not the ZWJ, or should it treat '5'+ZWJ
  // as one entity and fail to match altogether?
  firstMatchTest(
    #"^\d"#, input: "5\u{200d}0", match: "5",
    xfail: true)

  // \s
  firstMatchTest(#"\s"#, input: " ", match: " ")
  // FIXME: \s shouldn't match whitespace composed with a non-whitespace character
  // NOTE(review): unlike the \d and \p{Number} cases above, U+0305 appears in
  // the pattern here rather than in the input. Confirm that is intended.
  firstMatchTest(
    #"\s\u{305}"#, input: " ", match: nil,
    xfail: true)
  // \p{Whitespace}
  // These two cases use the property class itself so that they do not simply
  // repeat the \s cases above.
  firstMatchTest(#"\p{Whitespace}"#, input: " ", match: " ")
  // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character
  firstMatchTest(
    #"\p{Whitespace}\u{305}"#, input: " ", match: nil,
    xfail: true)
}
1197
+
1198
/// Checks custom character classes against both spellings of "é" and against
/// a flag emoji, which is a single character made of two regional-indicator
/// scalars (U+1F1F0 U+1F1F7, as the scalar-escape case below spells out).
func testCanonicalEquivalenceCustomCharacterClass() throws {
  // Expectation: Concatenations with custom character classes should be able
  // to match within a grapheme cluster. That is, a regex should be able to
  // match the scalar values that comprise a grapheme cluster in separate,
  // or repeated, custom character classes.

  matchTest(
    #"[áéíóú]$"#,
    (eComposed, true),
    (eDecomposed, true))

  // FIXME: Custom char classes don't use canonical equivalence with composed characters
  firstMatchTest(
    #"e[\u{301}]$"#, input: eComposed, match: eComposed,
    xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
    xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
    xfail: true)

  // FIXME: Custom char classes don't match decomposed characters
  firstMatchTest(
    #"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)

  // The flag must be exactly the two regional indicators with no padding,
  // otherwise the whole-input expectations below could never hold.
  let flag = "🇰🇷"
  firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
  firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag)
  firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag)

  // First Unicode scalar followed by CCC of regional indicators
  firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag)

  // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
  // A CCC of regional indicators x 2
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
    xfail: true)

  // FIXME: A single CCC of regional indicators matches the whole flag character
  // A CCC of regional indicators followed by the second Unicode scalar
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
    xfail: true)
  // A single CCC of regional indicators
  firstMatchTest(
    #"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
    xfail: true)

  // A single CCC of actual flag emojis / combined regional indicators
  firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
  // This succeeds (correctly) because \u{1F1F0} is lexicographically
  // within the CCC range
  firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
}
1252
+
1253
/// Checks how much input the "any character" constructs (`.`, `\X`, `\O`)
/// consume when the input is the decomposed or precomposed spelling of "é".
func testAnyChar() throws {
  // Expectation: \X and, in grapheme cluster mode, `.` should consume an
  // entire character, regardless of how it's spelled. \O should consume only
  // a single Unicode scalar value, leaving any other grapheme scalar
  // components to be matched.

  // The expected match is the bare base letter, with no surrounding text.
  firstMatchTest(
    #"(?u:.)"#, input: eDecomposed, match: "e",
    xfail: true)

  // `.` and \X are both expected to consume the whole character, leaving no
  // accent for the trailing \u{301} to match.
  matchTest(
    #".\u{301}"#,
    (eComposed, false),
    (eDecomposed, false))
  matchTest(
    #"\X\u{301}"#,
    (eComposed, false),
    (eDecomposed, false))

  // FIXME: \O is unsupported
  firstMatchTest(
    #"\O\u{301}"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"e\O"#, input: eDecomposed, match: eDecomposed,
    xfail: true)
  firstMatchTest(
    #"\O\u{301}"#, input: eComposed, match: nil,
    xfail: true)
  firstMatchTest(
    #"e\O"#, input: eComposed, match: nil,
    xfail: true)

  // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.`
  // NOTE(review): the first case in this test spells the option as `(?u:`
  // while this one uses `(?U)`. Confirm which spelling is intended.
  matchTest(
    #"(?U).\u{301}"#,
    (eComposed, true),
    (eDecomposed, true),
    xfail: true)
}
1288
+
1289
+ // TODO: Add test for implied grapheme cluster requirement at group boundaries
1290
+
1291
+ // TODO: Add test for grapheme boundaries at start/end of match
1100
1292
}
1101
1293
0 commit comments