Skip to content

Add tests for Character semantics in regex matching #102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 13, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -831,5 +831,156 @@ extension RegexTests {
matchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true)
matchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
}

// MARK: Character Semantics

/// "é" spelled as the single precomposed scalar U+00E9.
///
/// Written as a scalar escape rather than a literal character so the spelling
/// cannot be silently altered by an editor or tool that normalizes source text.
var eComposed: String { "\u{E9}" }
/// "é" spelled as "e" (U+0065) followed by the combining scalar U+0301.
var eDecomposed: String { "e\u{301}" }

/// Checks that literal characters in a pattern are compared with the input
/// using canonical equivalence.
func testCanonicalEquivalence() throws {
  // Expectation: character comparison during matching honors canonical
  // equivalence, so either spelling of a character may be used in the regex
  // literal and in the string being matched.
  let composed = eComposed
  let decomposed = eDecomposed

  // Pattern written with the literal "é" character.
  matchTests(
    #"é$"#,
    (composed, composed),
    (decomposed, decomposed))

  // Pattern written as "e" plus a combining-accent escape.
  // FIXME: Decomposed character in regex literal doesn't match an equivalent character
  matchTests(
    #"e\u{301}$"#,
    (composed, composed),
    (decomposed, decomposed),
    xfail: true)

  // A bare "e" anchored at the end is expected to match neither spelling.
  matchTests(
    #"e$"#,
    (composed, nil),
    (decomposed, nil))
}

/// Checks that built-in and Unicode property character classes treat
/// equivalent spellings of a character the same way.
func testCanonicalEquivalenceCharacterClass() throws {
  // Expectation: Character classes should match equivalent characters to the
  // same degree, regardless of how they are spelled. Unicode "property
  // classes" should match characters when all the code points that comprise
  // the character are members of the property class.

  // \w
  matchTests(
    #"^\w$"#,
    (eComposed, eComposed),
    (eDecomposed, eDecomposed))
  // \p{Letter}
  matchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
  // FIXME: \p{Letter} doesn't match a decomposed character
  matchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed,
    xfail: true)

  // \d
  matchTest(#"\d"#, input: "5", match: "5")
  // FIXME: \d shouldn't match a digit composed with a non-digit character
  matchTest(#"\d"#, input: "5\u{305}", match: nil,
    xfail: true)
  // \p{Number}
  matchTest(#"\p{Number}"#, input: "5", match: "5")
  // FIXME: \p{Number} shouldn't match a number composed with a non-number character
  matchTest(#"\p{Number}"#, input: "5\u{305}", match: nil,
    xfail: true)

  // The whitespace cases mirror the digit cases above: the combining mark
  // belongs in the *input*, attached to the space, not in the pattern.

  // \s
  matchTest(#"\s"#, input: " ", match: " ")
  // FIXME: \s shouldn't match whitespace composed with a non-whitespace character
  matchTest(#"\s"#, input: " \u{305}", match: nil,
    xfail: true)
  // \p{Whitespace}
  matchTest(#"\p{Whitespace}"#, input: " ", match: " ")
  // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character
  matchTest(#"\p{Whitespace}"#, input: " \u{305}", match: nil,
    xfail: true)
}

/// Checks how custom character classes interact with characters made of
/// several Unicode scalars (accented letters and flag emoji).
func testCanonicalEquivalenceCustomCharacterClass() throws {
  // Expectation: a concatenation that involves custom character classes can
  // match *inside* a grapheme cluster, i.e. the scalars making up one
  // character may be matched by separate, or repeated, custom classes.

  matchTests(
    #"[áéíóú]$"#,
    (eComposed, eComposed),
    (eDecomposed, eDecomposed))

  // Patterns that match the base letter and the combining accent separately.
  let splitAccentPatterns = [
    #"e[\u{301}]$"#,
    #"e[\u{300}-\u{320}]$"#,
    #"[a-z][\u{300}-\u{320}]$"#,
  ]
  // FIXME: Custom char classes don't use canonical equivalence with composed characters
  // FIXME: Custom char classes don't match decomposed characters
  // Every pattern is run against the composed spelling first, then against
  // the decomposed spelling; each input is expected to match itself in full.
  for subject in [eComposed, eDecomposed] {
    for pattern in splitAccentPatterns {
      matchTest(pattern, input: subject, match: subject, xfail: true)
    }
  }

  let flag = "🇰🇷"
  let wholeFlagPatterns = [
    #"🇰🇷"#,
    #"[🇰🇷]"#,
    #"\u{1F1F0}\u{1F1F7}"#,
    // First Unicode scalar followed by CCC of regional indicators
    #"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#,
  ]
  for pattern in wholeFlagPatterns {
    matchTest(pattern, input: flag, match: flag)
  }

  // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
  // A CCC of regional indicators x 2
  matchTest(
    #"[\u{1F1E6}-\u{1F1FF}]{2}"#,
    input: flag,
    match: flag,
    xfail: true)

  // FIXME: A single CCC of regional indicators matches the whole flag character
  // A CCC of regional indicators followed by the second Unicode scalar
  matchTest(
    #"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#,
    input: flag,
    match: flag,
    xfail: true)
  // A single CCC of regional indicators
  matchTest(
    #"[\u{1F1E6}-\u{1F1FF}]"#,
    input: flag,
    match: nil,
    xfail: true)
}

/// Checks how the "any character" constructs (`.`, `\X`, `\O`) consume a
/// character that may be spelled with one or two Unicode scalars.
func testAnyChar() throws {
  // Expectation: \X and, in grapheme cluster mode, `.` consume a whole
  // character no matter how it is spelled. \O consumes exactly one Unicode
  // scalar value and leaves the remaining scalars of the grapheme available
  // to the rest of the pattern.
  let composed = eComposed
  let decomposed = eDecomposed

  // After a whole-character match there is no accent left over to match.
  for pattern in [#".\u{301}"#, #"\X\u{301}"#] {
    matchTests(
      pattern,
      (composed, nil),
      (decomposed, nil))
  }

  let singleScalarCases: [(pattern: String, input: String, expected: String?)] = [
    // FIXME: \O is unsupported
    (#"\O\u{301}"#, decomposed, decomposed),
    (#"e\O"#, decomposed, decomposed),
    // TODO: Should these two match or not?
    (#"\O\u{301}"#, composed, composed),
    (#"e\O"#, composed, nil),
  ]
  for testCase in singleScalarCases {
    matchTest(
      testCase.pattern,
      input: testCase.input,
      match: testCase.expected,
      xfail: true)
  }

  // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.`
  matchTests(
    #"(?U).\u{301}"#,
    (composed, composed),
    (decomposed, decomposed),
    xfail: true)
}
}