From 50ee12e6b984fa7de48e2ad676461a48ad805e98 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 21 Feb 2022 18:04:52 -0600 Subject: [PATCH 1/5] Support text segment boundary anchors This enables the `\y` and `\Y` anchors in regex literals and `Anchor.textSegmentBoundary` in the DSL. Note: This also includes `UnicodeScalar` conformance to `RegexProtocol`, which acts like Unicode scalar literals in regex literals. --- Sources/_StringProcessing/ByteCodeGen.swift | 12 ++++++++---- Sources/_StringProcessing/RegexDSL/DSL.swift | 8 ++++++++ Tests/RegexTests/MatchTests.swift | 16 ++++++++++++++-- Tests/RegexTests/RegexDSLTests.swift | 10 ++++++++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5770153da..0c3665200 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -96,12 +96,16 @@ extension Compiler.ByteCodeGen { } case .textSegment: - // This we should be able to do! - throw Unsupported(#"\y (text segment)"#) + builder.buildAssert { (input, pos, _) in + // TODO: Unicode word boundary algorithm? + input.isOnGraphemeClusterBoundary(pos) + } case .notTextSegment: - // This we should be able to do! - throw Unsupported(#"\Y (not text segment)"#) + builder.buildAssert { (input, pos, _) in + // TODO: Unicode word boundary algorithm? + !input.isOnGraphemeClusterBoundary(pos) + } case .startOfLine: builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 51880fdf3..14f875464 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -39,6 +39,14 @@ extension Character: RegexProtocol { } } +extension UnicodeScalar: RegexProtocol { + public typealias Match = Substring + + public var regex: Regex { + .init(ast: atom(.scalar(self))) + } +} + extension CharacterClass: RegexProtocol { public typealias Match = Substring diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 7a9a438c6..47092b6cb 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -892,7 +892,8 @@ extension RegexTests { #"\d+\b"#, ("123", "123"), (" 123", "123"), - ("123 456", "123")) + ("123 456", "123"), + ("123A 456", "456")) firstMatchTests( #"\d+\b\s\b\d+"#, ("123", nil), @@ -908,7 +909,18 @@ extension RegexTests { // TODO: \G and \K // TODO: Oniguruma \y and \Y - + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both: + ("Cafe\u{301}", "e"), // composed and + ("Sol Cafe", "e")) // standalone + firstMatchTests( + #"\u{65}\y"#, // Grapheme boundary assertion + ("Cafe\u{301}", nil), + ("Sol Cafe", "e")) + firstMatchTests( + #"\u{65}\Y"#, // Grapheme non-boundary assertion + ("Cafe\u{301}", "e"), + ("Sol Cafe", nil)) } func testMatchGroups() { diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index d5f5c7e63..52e553f27 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -274,6 +274,16 @@ class RegexDSLTests: XCTestCase { Anchor.endOfLine } + try _testDSLCaptures( + ("Cafe\u{301}", nil), + ("Cafe", "Cafe"), + captureType: Substring.self, ==) + { + oneOrMore(.word) + UnicodeScalar("e") + Anchor.textSegmentBoundary + } + try _testDSLCaptures( ("aaaaa1", "aaaaa1"), ("aaaaa2", nil), From 13524a9f756161a2acf55c0983ff465bd69efea6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 24 Feb 2022 13:15:43 -0600 Subject: [PATCH 2/5] Switch primitive RegexProtocol conformances to DSLTree --- Sources/_StringProcessing/RegexDSL/DSL.swift | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 14f875464..976c8ae78 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -17,8 +17,7 @@ extension String: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(self)) } } @@ -26,8 +25,7 @@ extension Substring: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(String(self))) } } @@ -35,7 +33,7 @@ extension Character: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.char(self))) + .init(node: .atom(.char(self))) } } @@ -43,7 +41,7 @@ extension UnicodeScalar: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.scalar(self))) + .init(node: .atom(.scalar(self))) } } From 6b828750d1dff27a12e6f8c4fb17006ea00cffde Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 00:14:30 -0600 Subject: [PATCH 3/5] Update Sources/_StringProcessing/ByteCodeGen.swift Co-authored-by: Michael Ilseman --- Sources/_StringProcessing/ByteCodeGen.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 0c3665200..00bf20290 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -97,7 +97,7 @@ extension Compiler.ByteCodeGen { case .textSegment: builder.buildAssert { (input, pos, _) in - // TODO: Unicode word boundary algorithm? + // FIXME: Grapheme or word based on options input.isOnGraphemeClusterBoundary(pos) } From b1ca05fb51c303797a33265551d5d026b04c38d7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 00:14:35 -0600 Subject: [PATCH 4/5] Update Sources/_StringProcessing/ByteCodeGen.swift Co-authored-by: Michael Ilseman --- Sources/_StringProcessing/ByteCodeGen.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 00bf20290..5ca490576 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -103,7 +103,7 @@ extension Compiler.ByteCodeGen { case .notTextSegment: builder.buildAssert { (input, pos, _) in - // TODO: Unicode word boundary algorithm? + // FIXME: Grapheme or word based on options !input.isOnGraphemeClusterBoundary(pos) } From 92d1d7f46c00ab7ef0014e8729f247ed515e2f3a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 12:06:37 -0600 Subject: [PATCH 5/5] Update to new test method signature --- Tests/RegexTests/RegexDSLTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 5d30b1442..3ac21c8ca 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -283,7 +283,7 @@ class RegexDSLTests: XCTestCase { try _testDSLCaptures( ("Cafe\u{301}", nil), ("Cafe", "Cafe"), - captureType: Substring.self, ==) + matchType: Substring.self, ==) { oneOrMore(.word) UnicodeScalar("e")