From 05ce1d3345c7bd50cac79cef68309f69bbcf4e93 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool
Date: Sun, 23 Jan 2022 11:40:06 -0800
Subject: [PATCH 01/19] build: add a CMake based build

This is a first approximation of a CMake based build for this repository.
The intent here is to localize the build rules for the repository to allow
it to be consumed during the build of the toolchain. This allows the source
list to be maintained in the repository as the source of truth rather than
be explicitly listed in the swift repository.

For general purpose development, the SPM based build is recommended. Unless
there is a specific need for the tests to be included, testing should be
done via the SPM build.

This change is sufficient to build the content though does not perform the
install or export steps which will be required to consume the results in
the Swift build.

Example invocation:

~~~
cmake -B S:\b\16 ^
  -D CMAKE_BUILD_TYPE=Release ^
  -D ArgumentParser_DIR=S:\b\10\cmake\modules ^
  -G Ninja ^
  -S S:\SourceCache\swift-experimental-string-processing
cmake --build S:\b\16
~~~
---
 CMakeLists.txt                            | 17 +++++++++
 Sources/CMakeLists.txt                    |  6 +++
 Sources/Prototypes/CMakeLists.txt         | 18 +++++++++
 Sources/VariadicsGenerator/CMakeLists.txt |  7 ++++
 Sources/_MatchingEngine/CMakeLists.txt    | 46 +++++++++++++++++++++++
 Sources/_StringProcessing/CMakeLists.txt  | 42 +++++++++++++++++++++
 Sources/_Unicode/CMakeLists.txt           | 16 ++++++++
 7 files changed, 152 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 Sources/CMakeLists.txt
 create mode 100644 Sources/Prototypes/CMakeLists.txt
 create mode 100644 Sources/VariadicsGenerator/CMakeLists.txt
 create mode 100644 Sources/_MatchingEngine/CMakeLists.txt
 create mode 100644 Sources/_StringProcessing/CMakeLists.txt
 create mode 100644 Sources/_Unicode/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..ad640f43a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+cmake_minimum_required(VERSION 3.18)
+project(SwiftExperimentalStringProcessing
+  LANGUAGES Swift)
+
+if(CMAKE_SYSTEM_NAME STREQUAL Windows OR CMAKE_SYSTEM_NAME STREQUAL Darwin)
+  option(BUILD_SHARED_LIBS "Build shared libraries by default" YES)
+endif()
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_Swift_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/swift)
+
+find_package(ArgumentParser CONFIG)
+
+add_subdirectory(Sources)
diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt
new file mode 100644
index 000000000..19feadbd9
--- /dev/null
+++ b/Sources/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+add_subdirectory(_Unicode)
+add_subdirectory(_MatchingEngine)
+add_subdirectory(_StringProcessing)
+add_subdirectory(Prototypes)
+add_subdirectory(VariadicsGenerator)
diff --git a/Sources/Prototypes/CMakeLists.txt b/Sources/Prototypes/CMakeLists.txt
new file mode 100644
index 000000000..60768f5a3
--- /dev/null
+++ b/Sources/Prototypes/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+add_library(Prototypes
+  Combinators/Combinators.swift
+  PEG/PEG.swift
+  PEG/PEGCode.swift
+  PEG/PEGCompile.swift
+  PEG/PEGCore.swift
+  PEG/PEGInterpreter.swift
+  PEG/PEGTranspile.swift
+  PEG/PEGVM.swift
+  PEG/PEGVMExecute.swift
+  PEG/Printing.swift
+  PTCaRet/Interpreter.swift
+  PTCaRet/PTCaRet.swift
+  TourOfTypes/CharacterClass.swift
+  TourOfTypes/Literal.swift)
+target_link_libraries(Prototypes PUBLIC
+  _MatchingEngine)
diff --git a/Sources/VariadicsGenerator/CMakeLists.txt b/Sources/VariadicsGenerator/CMakeLists.txt
new file mode 100644
index 000000000..8ea543970
--- /dev/null
+++ b/Sources/VariadicsGenerator/CMakeLists.txt
@@ -0,0 +1,7 @@
+
+add_executable(VariadicsGenerator
+  VariadicsGenerator.swift)
+target_compile_options(VariadicsGenerator PRIVATE
+  -parse-as-library)
+target_link_libraries(VariadicsGenerator PUBLIC
+  ArgumentParser)
diff --git a/Sources/_MatchingEngine/CMakeLists.txt b/Sources/_MatchingEngine/CMakeLists.txt
new file mode 100644
index 000000000..f7cb97ce3
--- /dev/null
+++ b/Sources/_MatchingEngine/CMakeLists.txt
@@ -0,0 +1,46 @@
+
+add_library(_MatchingEngine
+  Engine/Backtracking.swift
+  Engine/Builder.swift
+  Engine/Capture.swift
+  Engine/Consume.swift
+  Engine/Engine.swift
+  Engine/InstPayload.swift
+  Engine/Instruction.swift
+  Engine/Processor.swift
+  Engine/Program.swift
+  Engine/Registers.swift
+  Engine/Tracing.swift
+  Regex/AST/AST.swift
+  Regex/AST/ASTAction.swift
+  Regex/AST/ASTProtocols.swift
+  Regex/AST/Atom.swift
+  Regex/AST/Conditional.swift
+  Regex/AST/CustomCharClass.swift
+  Regex/AST/Group.swift
+  Regex/AST/MatchingOptions.swift
+  Regex/AST/Quantification.swift
+  Regex/Parse/CaptureStructure.swift
+  Regex/Parse/CharacterPropertyClassification.swift
+  Regex/Parse/Diagnostics.swift
+  Regex/Parse/LexicalAnalysis.swift
+  Regex/Parse/Mocking.swift
+  Regex/Parse/Parse.swift
+  Regex/Parse/Source.swift
+  Regex/Parse/SourceLocation.swift
+  Regex/Parse/SyntaxOptions.swift
+  Regex/Printing/DumpAST.swift
+  Regex/Printing/PrettyPrinter.swift
+  Regex/Printing/PrintAsCanonical.swift
+  Regex/Printing/PrintAsPattern.swift
+  Regex/Printing/RenderRanges.swift
+  Utility/AllScalars.swift
+  Utility/Formatting.swift
+  Utility/Misc.swift
+  Utility/MissingUnicode.swift
+  Utility/Protocols.swift
+  Utility/TypeConstruction.swift
+  Utility/TypedIndex.swift
+  Utility/TypedInt.swift)
+target_compile_options(_MatchingEngine PRIVATE
+  -enable-library-evolution)
diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt
new file mode 100644
index 000000000..c20dcc240
--- /dev/null
+++ b/Sources/_StringProcessing/CMakeLists.txt
@@ -0,0 +1,42 @@
+
+add_library(_StringProcessing
+  Algorithms/Algorithms/Contains.swift
+  Algorithms/Algorithms/FirstRange.swift
+  Algorithms/Algorithms/Ranges.swift
+  Algorithms/Algorithms/Replace.swift
+  Algorithms/Algorithms/Split.swift
+  Algorithms/Algorithms/StartsWith.swift
+  Algorithms/Algorithms/Trim.swift
+  Algorithms/Consumers/CollectionConsumer.swift
+  Algorithms/Consumers/FixedPatternConsumer.swift
+  Algorithms/Consumers/ManyConsumer.swift
+  Algorithms/Consumers/PredicateConsumer.swift
+  Algorithms/Consumers/RegexConsumer.swift
+  Algorithms/Searchers/CollectionSearcher.swift
+  Algorithms/Searchers/ConsumerSearcher.swift
+  Algorithms/Searchers/NaivePatternSearcher.swift
+  Algorithms/Searchers/PatternOrEmpty.swift
+  Algorithms/Searchers/PredicateSearcher.swift
+  Algorithms/Searchers/TwoWaySearcher.swift
+  Algorithms/Searchers/ZSearcher.swift
+  ASTBuilder.swift
+  Capture.swift
+  CharacterClass.swift
+  Compiler.swift
+  ConsumerInterface.swift
+  Executor.swift
+  Legacy/HareVM.swift
+  Legacy/LegacyCompile.swift
+  Legacy/RECode.swift
+  Legacy/TortoiseVM.swift
+  Legacy/VirtualMachine.swift
+  RegexDSL/Builder.swift
+  RegexDSL/Concatenation.swift
+  RegexDSL/Core.swift
+  RegexDSL/DSL.swift
+  RegexDSL/DSLCapture.swift
+  RegexDSL/DynamicCaptures.swift)
+target_compile_options(_StringProcessing PRIVATE
+  -enable-library-evolution)
+target_link_libraries(_StringProcessing PUBLIC
+  _MatchingEngine)
diff --git a/Sources/_Unicode/CMakeLists.txt b/Sources/_Unicode/CMakeLists.txt
new file mode 100644
index 000000000..7fdb44628
--- /dev/null
+++ b/Sources/_Unicode/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+add_library(_Unicode
+  CaseConversion.swift
+  CharacterProps.swift
+  Comparison.swift
+  Decoding.swift
+  Encodings.swift
+  Formatting.swift
+  Graphemes.swift
+  NecessaryEvils.swift
+  Normaliation.swift
+  NumberParsing.swift
+  ScalarProps.swift
+  Transcoding.swift
+  UCD.swift
+  Validation.swift)

From 04ad64ab7e936cb7427f5f00df017ab86472a3b3 Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 17 Feb 2022 17:15:54 +0000
Subject: [PATCH 02/19] Add a couple of quoted test cases

---
 Tests/RegexTests/ParseTests.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 4872e256e..e70939d53 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -488,6 +488,10 @@ extension RegexTests {
       #"a\Q \Q \\.\Eb"#,
       concat("a", quote(#" \Q \\."#), "b"))
 
+    // These follow the PCRE behavior.
+    parseTest(#"\Q\\E"#, quote("\\"))
+    parseTest(#"\E"#, "E")
+
     parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
               syntax: .experimental)
     parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),

From 9d5529ee05858ccb49efeec1ba1a0cf50ee9046f Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 17 Feb 2022 17:15:55 +0000
Subject: [PATCH 03/19] Update \DDD parsing

Previously we followed PCRE's parsing of this syntax such that it may
either be an octal sequence or backreference depending on a list of
heuristics. However this model is complicated and not particularly
intuitive, especially as there are other engines that disambiguate using
subtly different rules.

Instead, always parse `\DDD` as a backreference, unless it begins with
`0`, in which case it is an octal sequence. This matches ICU and Java's
behavior. Once we start validating group references, we can then start
emitting an error on invalid backreferences using this syntax, and
suggest prefixing with 0 if an octal sequence is desired.
---
 .../Regex/Parse/LexicalAnalysis.swift | 45 ++++++-----------
 Tests/RegexTests/MatchTests.swift     |  5 +-
 Tests/RegexTests/ParseTests.swift     | 48 +++++++++----------
 3 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
index 727727ce1..dd785f12d 100644
--- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
   ///     | 'x' HexDigit{0...2}
   ///     | 'U' HexDigit{8}
   ///     | 'o{' OctalDigit{1...} '}'
-  ///     | OctalDigit{1...3}
+  ///     | '0' OctalDigit{0...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
       let str = try src.lexUntil(eating: "}").value
       return try Source.validateUnicodeScalar(str, .octal)
 
-    case let c where c.isOctalDigit:
-      // We can read *up to* 2 more octal digits per PCRE.
-      // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
-      // we should have a parser mode to switch.
-      let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
-      let str = String(c) + (nextDigits?.string ?? "")
-      return try Source.validateUnicodeScalar(str, .octal)
+    case "0":
+      // We can read *up to* 3 more octal digits.
+ // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { + return Unicode.Scalar(0) + } + return try Source.validateUnicodeScalar(digits.string, .octal) default: fatalError("Unexpected scalar start") @@ -1341,26 +1342,10 @@ extension Source { return nil } - // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE - // it is treated as a backreference if its first digit is not 0 (as that - // is always octal) and one of the following holds: - // - // - It's 0 < n < 10 (as octal would be pointless here) - // - Its first digit is 8 or 9 (as not valid octal) - // - There have been as many prior groups as the reference. - // - // Oniguruma follows the same rules except the second one. e.g \81 and - // \91 are instead treated as literal 81 and 91 respectively. - // TODO: If we want a strict Oniguruma mode, we'll need to add a check - // here. + // Backslash followed by a non-0 digit character is a backreference. if firstChar != "0", let numAndLoc = try src.lexNumber() { - let num = numAndLoc.value - let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location) - if num < 10 || firstChar == "8" || firstChar == "9" || - context.isPriorGroupRef(ref.kind) { - return .backreference(ref) - } - return nil + return .backreference(.init( + .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) } return nil } @@ -1497,10 +1482,8 @@ extension Source { } switch char { - // Hexadecimal and octal unicode scalars. This must be done after - // backreference lexing due to the ambiguity with \nnn. - case let c where c.isOctalDigit: fallthrough - case "u", "x", "U", "o": + // Hexadecimal and octal unicode scalars. + case "u", "x", "U", "o", "0": return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..2c07f4c79 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -261,7 +261,7 @@ extension RegexTests { firstMatchTest(#"\070"#, input: "1238xyz", match: "8") firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A") firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08") - firstMatchTest(#"\0707"#, input: "12387xyz", match: "87") + firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) @@ -1021,9 +1021,6 @@ extension RegexTests { firstMatchTest( #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#, input: "aaaaaaaaabbc", match: "aaaaaaaaabb") - firstMatchTest( - #"(.)\10"#, - input: "a\u{8}b", match: "a\u{8}") firstMatchTest( #"(.)\g001"#, diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e70939d53..ce070cc38 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -325,7 +325,7 @@ extension RegexTests { parseTest(#"\070"#, scalar("\u{38}")) parseTest(#"\07A"#, concat(scalar("\u{7}"), "A")) parseTest(#"\08"#, concat(scalar("\u{0}"), "8")) - parseTest(#"\0707"#, concat(scalar("\u{38}"), "7")) + parseTest(#"\0707"#, scalar("\u{1C7}")) parseTest(#"[\0]"#, charClass(scalar_m("\u{0}"))) parseTest(#"[\01]"#, charClass(scalar_m("\u{1}"))) @@ -333,13 +333,15 @@ extension RegexTests { parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A")) parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) - parseTest(#"[\0707]"#, 
charClass(scalar_m("\u{38}"), "7")) + parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - parseTest(#"[\1]"#, charClass(scalar_m("\u{1}"))) - parseTest(#"[\123]"#, charClass(scalar_m("\u{53}"))) - parseTest(#"[\101]"#, charClass(scalar_m("\u{41}"))) - parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7")) - parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1")) + // TODO: These are treated as octal sequences by PCRE, we should warn and + // suggest user prefix with 0. + parseTest(#"[\1]"#, charClass("1")) + parseTest(#"[\123]"#, charClass("1", "2", "3")) + parseTest(#"[\101]"#, charClass("1", "0", "1")) + parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) + parseTest(#"[\181]"#, charClass("1", "8", "1")) // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) @@ -797,11 +799,9 @@ extension RegexTests { ) } - // TODO: Some of these behaviors are unintuitive, we should likely warn on - // some of them. - parseTest(#"\10"#, scalar("\u{8}")) - parseTest(#"\18"#, concat(scalar("\u{1}"), "8")) - parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7")) + parseTest(#"\10"#, backreference(.absolute(10))) + parseTest(#"\18"#, backreference(.absolute(18))) + parseTest(#"\7777"#, backreference(.absolute(7777))) parseTest(#"\91"#, backreference(.absolute(91))) parseTest( @@ -813,12 +813,13 @@ extension RegexTests { parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [scalar("\u{8}"), capture(empty())]), + + [backreference(.absolute(10)), capture(empty())]), captures: .tuple(Array(repeating: .atom(), count: 10)) ) - parseTest(#"()()\10"#, - concat(capture(empty()), capture(empty()), scalar("\u{8}")), - captures: .tuple(.atom(), .atom())) + parseTest(#"()()\10"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(10))), + captures: .tuple(.atom(), .atom()) + ) // A capture of three empty captures. let fourCaptures = capture( @@ -826,8 +827,8 @@ extension RegexTests { ) parseTest( // There are 9 capture groups in total here. 
- #"((()()())(()()()))\10"#, - concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")), + #"((()()())(()()()))\10"#, concat(capture(concat( + fourCaptures, fourCaptures)), backreference(.absolute(10))), captures: .tuple(Array(repeating: .atom(), count: 9)) ) parseTest( @@ -852,7 +853,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .tuple(Array(repeating: .atom(), count: 40)) ) - parseTest(#"\40"#, scalar(" ")) + parseTest(#"\40"#, backreference(.absolute(40))) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -862,7 +863,7 @@ extension RegexTests { parseTest(#"\7"#, backreference(.absolute(7))) - parseTest(#"\11"#, scalar("\u{9}")) + parseTest(#"\11"#, backreference(.absolute(11))) parseTest( String(repeating: "()", count: 11) + #"\11"#, concat(Array(repeating: capture(empty()), count: 11) @@ -876,12 +877,11 @@ extension RegexTests { captures: .tuple(Array(repeating: .atom(), count: 11)) ) - parseTest(#"\0113"#, concat(scalar("\u{9}"), "3")) - parseTest(#"\113"#, scalar("\u{4B}")) - parseTest(#"\377"#, scalar("\u{FF}")) + parseTest(#"\0113"#, scalar("\u{4B}")) + parseTest(#"\113"#, backreference(.absolute(113))) + parseTest(#"\377"#, backreference(.absolute(377))) parseTest(#"\81"#, backreference(.absolute(81))) - parseTest(#"\g1"#, backreference(.absolute(1))) parseTest(#"\g001"#, backreference(.absolute(1))) parseTest(#"\g52"#, backreference(.absolute(52))) From 3f24f5bcb31717d99511c203dbf6bdb7eab033de Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 24 Feb 2022 15:52:00 -0700 Subject: [PATCH 04/19] Internalize declarations Lots of ME stuff was annotated with vestigial public --- Sources/Prototypes/PEG/PEGCode.swift | 2 +- Sources/Prototypes/PEG/PEGCompile.swift | 2 +- Sources/Prototypes/PEG/PEGCore.swift | 2 +- Sources/Prototypes/PEG/PEGTranspile.swift | 3 +- Sources/Prototypes/PEG/PEGVM.swift | 3 +- Sources/Prototypes/PEG/Printing.swift | 2 +- Sources/_StringProcessing/Compiler.swift | 2 +- .../_StringProcessing/Engine/Consume.swift | 4 +- Sources/_StringProcessing/Engine/Engine.swift | 8 +- .../Engine/Instruction.swift | 3 +- .../_StringProcessing/Engine/MEBuilder.swift | 124 +++++++------- .../_StringProcessing/Engine/MECapture.swift | 2 +- .../_StringProcessing/Engine/MEProgram.swift | 12 +- .../_StringProcessing/Engine/Processor.swift | 2 +- .../_StringProcessing/Engine/Tracing.swift | 2 +- Sources/_StringProcessing/Executor.swift | 14 +- Sources/_StringProcessing/RegexDSL/DSL.swift | 4 +- .../_StringProcessing/RegexDSL/DSLTree.swift | 2 +- .../_StringProcessing/Unicode/Decoding.swift | 20 +-- .../Unicode/NecessaryEvils.swift | 2 +- .../_StringProcessing/Utility/Protocols.swift | 12 +- .../_StringProcessing/Utility/Traced.swift | 24 +-- .../Utility/TypedIndex.swift | 59 +++---- .../_StringProcessing/Utility/TypedInt.swift | 159 ++++++++---------- 24 files changed, 215 insertions(+), 254 deletions(-) diff --git a/Sources/Prototypes/PEG/PEGCode.swift b/Sources/Prototypes/PEG/PEGCode.swift index c33f5759c..b12c5bab6 100644 --- a/Sources/Prototypes/PEG/PEGCode.swift +++ b/Sources/Prototypes/PEG/PEGCode.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { struct Code { diff --git a/Sources/Prototypes/PEG/PEGCompile.swift b/Sources/Prototypes/PEG/PEGCompile.swift index 0592cf6a9..0e1b89233 100644 --- 
a/Sources/Prototypes/PEG/PEGCompile.swift +++ b/Sources/Prototypes/PEG/PEGCompile.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { typealias InIndex = Input.Index diff --git a/Sources/Prototypes/PEG/PEGCore.swift b/Sources/Prototypes/PEG/PEGCore.swift index b831cbd0f..5c66dc25a 100644 --- a/Sources/Prototypes/PEG/PEGCore.swift +++ b/Sources/Prototypes/PEG/PEGCore.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing let emitComments = true struct PEGCore< diff --git a/Sources/Prototypes/PEG/PEGTranspile.swift b/Sources/Prototypes/PEG/PEGTranspile.swift index df75cea63..84e220d52 100644 --- a/Sources/Prototypes/PEG/PEGTranspile.swift +++ b/Sources/Prototypes/PEG/PEGTranspile.swift @@ -9,8 +9,7 @@ // //===----------------------------------------------------------------------===// -import _MatchingEngine -import _StringProcessing +@testable import _StringProcessing extension PEG.VM where Input == String { typealias MEProg = MEProgram diff --git a/Sources/Prototypes/PEG/PEGVM.swift b/Sources/Prototypes/PEG/PEGVM.swift index a987b581d..4cf91a5c1 100644 --- a/Sources/Prototypes/PEG/PEGVM.swift +++ b/Sources/Prototypes/PEG/PEGVM.swift @@ -9,7 +9,8 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing + +@testable import _StringProcessing extension PEG { diff --git a/Sources/Prototypes/PEG/Printing.swift b/Sources/Prototypes/PEG/Printing.swift index 978250761..be60e72f5 100644 --- a/Sources/Prototypes/PEG/Printing.swift +++ b/Sources/Prototypes/PEG/Printing.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEGCore.Instruction: InstructionProtocol { var operandPC: InstructionAddress? { self.pc } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5099e187f..1d72a8d27 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -35,7 +35,7 @@ class Compiler { } } -public func _compileRegex( +func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { let ast = try parse(regex, syntax) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index 52f752539..cfb803de8 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,13 +25,13 @@ extension Engine { } extension Engine where Input == String { - public func consume( + func consume( _ input: Input ) -> (Input.Index, CaptureList)? { consume(input, in: input.startIndex ..< input.endIndex) } - public func consume( + func consume( _ input: Input, in range: Range, matchMode: MatchMode = .partialFromFront diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 6c9c2efa5..86952c8b7 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -11,7 +11,7 @@ // Currently, engine binds the type and consume binds an instance. // But, we can play around with this. 
-public struct Engine where Input.Element: Hashable { +struct Engine where Input.Element: Hashable { var program: MEProgram @@ -24,7 +24,7 @@ public struct Engine where Input.Element: Hashab set { program.enableTracing = newValue } } - public init( + init( _ program: MEProgram, enableTracing: Bool? = nil ) { @@ -36,10 +36,10 @@ public struct Engine where Input.Element: Hashab } } -public struct AsyncEngine { /* ... */ } +struct AsyncEngine { /* ... */ } extension Engine: CustomStringConvertible { - public var description: String { + var description: String { // TODO: better description return program.description } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index fcc257302..ff28ee9e2 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -299,8 +299,7 @@ extension Instruction { internal var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } -// TODO: internal after compiler moves in -public var _payloadMask: UInt64 { ~_opcodeMask } +var _payloadMask: UInt64 { ~_opcodeMask } extension Instruction { var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index d81c583a8..78171a001 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -12,7 +12,7 @@ import _MatchingEngine // For errors extension MEProgram where Input.Element: Hashable { - public struct Builder { + struct Builder { var instructions: [Instruction] = [] var elements = TypedSetVector() @@ -50,7 +50,7 @@ extension MEProgram where Input.Element: Hashable { nextCaptureRegister.rawValue } - public init() {} + init() {} } } @@ -71,7 +71,7 @@ extension MEProgram.Builder { // TODO: We want a better strategy for fixups, leaving // the operand in a different form isn't great... - public init(staticElements: S) where S.Element == Input.Element { + init(staticElements: S) where S.Element == Input.Element { staticElements.forEach { elements.store($0) } } @@ -79,21 +79,21 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } - public mutating func buildNop(_ r: StringRegister? = nil) { + mutating func buildNop(_ r: StringRegister? 
= nil) { instructions.append(.init(.nop, .init(optionalString: r))) } - public mutating func buildNop(_ s: String) { + mutating func buildNop(_ s: String) { buildNop(strings.store(s)) } - public mutating func buildDecrement( + mutating func buildDecrement( _ i: IntRegister, nowZero: BoolRegister ) { instructions.append(.init( .decrement, .init(bool: nowZero, int: i))) } - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { instructions.append(.init( @@ -101,25 +101,25 @@ extension MEProgram.Builder { } // TODO: generic - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: Int, into: IntRegister ) { let uint = UInt64(asserting: value) buildMoveImmediate(uint, into: into) } - public mutating func buildMoveCurrentPosition( + mutating func buildMoveCurrentPosition( into: PositionRegister ) { instructions.append(.init( .movePosition, .init(position: into))) } - public mutating func buildBranch(to t: AddressToken) { + mutating func buildBranch(to t: AddressToken) { instructions.append(.init(.branch)) fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( _ condition: BoolRegister, to t: AddressToken ) { instructions.append( @@ -127,7 +127,7 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( to t: AddressToken, ifZeroElseDecrement i: IntRegister ) { instructions.append( @@ -135,56 +135,56 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildSave(_ t: AddressToken) { + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) } - public mutating func buildSaveAddress(_ t: AddressToken) { + mutating func buildSaveAddress(_ t: AddressToken) { instructions.append(.init(.saveAddress)) fixup(to: t) } - public mutating func buildSplit( + mutating func buildSplit( to: AddressToken, saving: AddressToken ) { instructions.append(.init(.splitSaving)) fixup(to: (to, saving)) } - public mutating func buildClear() { + mutating func buildClear() { instructions.append(.init(.clear)) } - public mutating func buildRestore() { + mutating func buildRestore() { instructions.append(.init(.restore)) } - public mutating func buildFail() { + mutating func buildFail() { instructions.append(.init(.fail)) } - public mutating func buildCall(_ t: AddressToken) { + mutating func buildCall(_ t: AddressToken) { instructions.append(.init(.call)) fixup(to: t) } - public mutating func buildRet() { + mutating func buildRet() { instructions.append(.init(.ret)) } - public mutating func buildAbort(_ s: StringRegister? = nil) { + mutating func buildAbort(_ s: StringRegister? 
= nil) { instructions.append(.init( .abort, .init(optionalString: s))) } - public mutating func buildAbort(_ s: String) { + mutating func buildAbort(_ s: String) { buildAbort(strings.store(s)) } - public mutating func buildAdvance(_ n: Distance) { + mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - public mutating func buildMatch(_ e: Input.Element) { + mutating func buildMatch(_ e: Input.Element) { instructions.append(.init( .match, .init(element: elements.store(e)))) } - public mutating func buildMatchSequence( + mutating func buildMatchSequence( _ s: S ) where S.Element == Input.Element { instructions.append(.init( @@ -192,7 +192,7 @@ extension MEProgram.Builder { .init(sequence: sequences.store(.init(s))))) } - public mutating func buildMatchSlice( + mutating func buildMatchSlice( lower: PositionRegister, upper: PositionRegister ) { instructions.append(.init( @@ -200,50 +200,50 @@ extension MEProgram.Builder { .init(pos: lower, pos2: upper))) } - public mutating func buildConsume( + mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction ) { instructions.append(.init( .consumeBy, .init(consumer: makeConsumeFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( by p: @escaping MEProgram.AssertionFunction ) { instructions.append(.init( .assertBy, .init(assertion: makeAssertionFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( _ e: Input.Element, into cond: BoolRegister ) { instructions.append(.init(.assertion, .init( element: elements.store(e), bool: cond))) } - public mutating func buildAccept() { + mutating func buildAccept() { instructions.append(.init(.accept)) } - public mutating func buildPrint(_ s: StringRegister) { + mutating func buildPrint(_ s: StringRegister) { instructions.append(.init(.print, .init(string: s))) } - public mutating func buildBeginCapture( + mutating func buildBeginCapture( _ cap: CaptureRegister ) { instructions.append( .init(.beginCapture, .init(capture: cap))) } - public mutating func buildEndCapture( + mutating func buildEndCapture( _ cap: CaptureRegister ) { instructions.append( .init(.endCapture, .init(capture: cap))) } - public mutating func buildTransformCapture( + mutating func buildTransformCapture( _ cap: CaptureRegister, _ trans: TransformRegister ) { instructions.append(.init( @@ -251,7 +251,7 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } - public mutating func buildMatcher( + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { instructions.append(.init( @@ -259,7 +259,7 @@ extension MEProgram.Builder { .init(matcher: fun, value: reg))) } - public mutating func buildMove( + mutating func buildMove( _ value: ValueRegister, into capture: CaptureRegister ) { instructions.append(.init( @@ -267,21 +267,21 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } - public mutating func buildBackreference( + mutating func buildBackreference( _ cap: CaptureRegister ) { instructions.append( .init(.backreference, .init(capture: cap))) } - public mutating func buildUnresolvedReference(id: ReferenceID) { + mutating func buildUnresolvedReference(id: ReferenceID) { buildBackreference(.init(0)) unresolvedReferences[id, default: []].append(lastInstructionAddress) } // TODO: Mutating because of fail address fixup, drop when // that's removed - public mutating func assemble() throws -> MEProgram { + mutating func assemble() throws -> MEProgram { try 
resolveReferences() // TODO: This will add a fail instruction at the end every @@ -356,22 +356,22 @@ extension MEProgram.Builder { referencedCaptureOffsets: referencedCaptureOffsets) } - public mutating func reset() { self = Self() } + mutating func reset() { self = Self() } } // Address-agnostic interfaces for label-like support extension MEProgram.Builder { - public enum _AddressToken {} - public typealias AddressToken = TypedInt<_AddressToken> + enum _AddressToken {} + typealias AddressToken = TypedInt<_AddressToken> - public mutating func makeAddress() -> AddressToken { + mutating func makeAddress() -> AddressToken { defer { addressTokens.append(nil) } return AddressToken(addressTokens.count) } // Resolves the address token to the most recently added // instruction, updating prior and future address references - public mutating func resolve(_ t: AddressToken) { + mutating func resolve(_ t: AddressToken) { assert(!instructions.isEmpty) addressTokens[t.rawValue] = @@ -380,7 +380,7 @@ extension MEProgram.Builder { // Resolves the address token to the next instruction (one past the most // recently added one), updating prior and future address references. - public mutating func label(_ t: AddressToken) { + mutating func label(_ t: AddressToken) { addressTokens[t.rawValue] = InstructionAddress(instructions.count) } @@ -388,7 +388,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided token, ensuring it is fixed up during // assembly - public mutating func fixup(to t: AddressToken) { + mutating func fixup(to t: AddressToken) { assert(!instructions.isEmpty) addressFixups.append( (InstructionAddress(instructions.endIndex-1), .init(t))) @@ -397,7 +397,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided tokens, ensuring it is fixed up during // assembly - public mutating func fixup( + mutating func fixup( to ts: (AddressToken, AddressToken) ) { assert(!instructions.isEmpty) @@ -412,7 +412,7 @@ extension MEProgram.Builder { // // This is useful for possessive quantification that needs some initial save // point to "ratchet" upon a successful match. - public mutating func pushEmptySavePoint() { + mutating func pushEmptySavePoint() { if failAddressToken == nil { failAddressToken = makeAddress() } @@ -438,7 +438,7 @@ fileprivate extension MEProgram.Builder { // Register helpers extension MEProgram.Builder { - public mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { + mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { defer { nextCaptureRegister.rawValue += 1 } // Register the capture for later lookup via symbolic references. 
if let id = id { @@ -449,25 +449,25 @@ extension MEProgram.Builder { return nextCaptureRegister } - public mutating func makeBoolRegister() -> BoolRegister { + mutating func makeBoolRegister() -> BoolRegister { defer { nextBoolRegister.rawValue += 1 } return nextBoolRegister } - public mutating func makeIntRegister() -> IntRegister { + mutating func makeIntRegister() -> IntRegister { defer { nextIntRegister.rawValue += 1 } return nextIntRegister } - public mutating func makePositionRegister() -> PositionRegister { + mutating func makePositionRegister() -> PositionRegister { defer { nextPositionRegister.rawValue += 1 } return nextPositionRegister } - public mutating func makeValueRegister() -> ValueRegister { + mutating func makeValueRegister() -> ValueRegister { defer { nextValueRegister.rawValue += 1 } return nextValueRegister } // Allocate and initialize a register - public mutating func makeIntRegister( + mutating func makeIntRegister( initialValue: Int ) -> IntRegister { let r = makeIntRegister() @@ -476,7 +476,7 @@ extension MEProgram.Builder { } // Allocate and initialize a register - public mutating func makePositionRegister( + mutating func makePositionRegister( initializingWithCurrentPosition: () ) -> PositionRegister { let r = makePositionRegister() @@ -485,17 +485,17 @@ extension MEProgram.Builder { } // 'kill' or release allocated registers - public mutating func kill(_ r: IntRegister) { + mutating func kill(_ r: IntRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: BoolRegister) { + mutating func kill(_ r: BoolRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: PositionRegister) { + mutating func kill(_ r: PositionRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") @@ -504,25 +504,25 @@ extension MEProgram.Builder { // TODO: A register-mapping helper struct, which could release // registers without monotonicity required - public mutating func makeConsumeFunction( + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - public mutating func makeAssertionFunction( + mutating func makeAssertionFunction( _ f: @escaping MEProgram.AssertionFunction ) -> AssertionFunctionRegister { defer { assertionFunctions.append(f) } return AssertionFunctionRegister(assertionFunctions.count) } - public mutating func makeTransformFunction( + mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { defer { transformFunctions.append(f) } return TransformRegister(transformFunctions.count) } - public mutating func makeMatcherFunction( + mutating func makeMatcherFunction( _ f: @escaping MEProgram.MatcherFunction ) -> MatcherRegister { defer { matcherFunctions.append(f) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 88f912ecb..bac632e9e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } } -public struct CaptureList { +struct CaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] diff --git 
a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d616657e8..1e58ddf54 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -11,13 +11,13 @@ import _MatchingEngine -public struct MEProgram where Input.Element: Equatable { - public typealias ConsumeFunction = (Input, Range) -> Input.Index? - public typealias AssertionFunction = +struct MEProgram where Input.Element: Equatable { + typealias ConsumeFunction = (Input, Range) -> Input.Index? + typealias AssertionFunction = (Input, Input.Index, Range) -> Bool - public typealias TransformFunction = + typealias TransformFunction = (Input, Range) -> Any? - public typealias MatcherFunction = + typealias MatcherFunction = (Input, Input.Index, Range) -> (Input.Index, Any)? var instructions: InstructionList @@ -39,7 +39,7 @@ public struct MEProgram where Input.Element: Equatable { } extension MEProgram: CustomStringConvertible { - public var description: String { + var description: String { var result = """ Elements: \(staticElements) Strings: \(staticStrings) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 10c3eb781..343b02c92 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -public enum MatchMode { +enum MatchMode { case wholeString case partialFromFront } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 7db740f52..24d00d3d7 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -15,7 +15,7 @@ extension Processor: TracedProcessor { var currentPC: InstructionAddress { controller.pc } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e066a4369..5c098c5c5 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,8 +11,7 @@ import _MatchingEngine - // FIXME: Public for prototype -public struct Executor { +struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine @@ -20,9 +19,8 @@ public struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - // FIXME: Public for prototype - public struct Result { - public var range: Range + struct Result { + var range: Range var captures: [StructuredCapture] var referencedCaptureOffsets: [ReferenceID: Int] @@ -44,7 +42,7 @@ public struct Executor { } } - public func execute( + func execute( input: String, in range: Range, mode: MatchMode = .wholeString @@ -65,7 +63,7 @@ public struct Executor { fatalError(String(describing: error)) } } - public func execute( + func execute( input: Substring, mode: MatchMode = .wholeString ) -> Result? 
{ @@ -75,7 +73,7 @@ public struct Executor { mode: mode) } - public func executeFlat( + func executeFlat( input: String, in range: Range, mode: MatchMode = .wholeString diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 35a4ccb5e..17f006231 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -187,9 +187,7 @@ public func choiceOf( // MARK: - Backreference - -// FIXME: Public for prototypes. -public struct ReferenceID: Hashable, Equatable { +struct ReferenceID: Hashable, Equatable { private static var counter: Int = 0 var base: Int diff --git a/Sources/_StringProcessing/RegexDSL/DSLTree.swift b/Sources/_StringProcessing/RegexDSL/DSLTree.swift index a44220925..43f8aa62f 100644 --- a/Sources/_StringProcessing/RegexDSL/DSLTree.swift +++ b/Sources/_StringProcessing/RegexDSL/DSLTree.swift @@ -249,7 +249,7 @@ extension DSLTree { } } extension DSLTree.Node { - public func _captureStructure( + func _captureStructure( _ constructor: inout CaptureStructure.Constructor ) -> CaptureStructure { switch self { diff --git a/Sources/_StringProcessing/Unicode/Decoding.swift b/Sources/_StringProcessing/Unicode/Decoding.swift index 49eb1f794..68c14f6c1 100644 --- a/Sources/_StringProcessing/Unicode/Decoding.swift +++ b/Sources/_StringProcessing/Unicode/Decoding.swift @@ -33,13 +33,13 @@ enum UnsafeAssumingValidUTF8 { @inlinable @inline(__always) - public static func decode(_ x: UInt8) -> Unicode.Scalar { + static func decode(_ x: UInt8) -> Unicode.Scalar { _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 2) @@ -50,7 +50,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 3) @@ -63,7 +63,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 4) @@ -80,7 +80,7 @@ enum UnsafeAssumingValidUTF8 { // Also, assuming we can load from those bounds... 
@inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, startingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let cu0 = utf8[_unchecked: i] @@ -103,7 +103,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let len = scalarLength(utf8, endingAt: i) @@ -113,7 +113,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength(_ x: UInt8) -> Int { + static func scalarLength(_ x: UInt8) -> Int { _internalInvariant(!UTF8.isContinuation(x)) if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen @@ -121,7 +121,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength( + static func scalarLength( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> Int { var len = 1 @@ -133,12 +133,12 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func continuationPayload(_ x: UInt8) -> UInt32 { + static func continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } @inlinable - public static func scalarAlign( + static func scalarAlign( _ utf8: UnsafeByteBuffer, _ idx: Int ) -> Int { guard _fastPath(idx != utf8.count) else { return idx } diff --git a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift index ef846c14e..a9ae24429 100644 --- a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift +++ b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift @@ -40,7 +40,7 @@ extension Optional { } // Don't use UnsafeRawBufferPointer for anything important -public struct UnsafeByteBuffer { +struct UnsafeByteBuffer { var pointer: UnsafeRawPointer var count: Int diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 9c196c18c..7542a17dd 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -13,11 +13,11 @@ // These currently only drive tracing/formatting, but could drive // more -public protocol InstructionProtocol { +protocol InstructionProtocol { var operandPC: InstructionAddress? 
{ get } } -public protocol ProcessorProtocol { +protocol ProcessorProtocol { associatedtype Input: Collection associatedtype Instruction: InstructionProtocol associatedtype SavePoint = () @@ -45,12 +45,12 @@ public protocol ProcessorProtocol { } extension ProcessorProtocol { - public func fetch() -> Instruction { + func fetch() -> Instruction { instructions[currentPC] } - public var callStack: Array { [] } -// public var savePoints: Array { [] } - public var registers: Array { [] } + var callStack: Array { [] } +// var savePoints: Array { [] } + var registers: Array { [] } } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index c270aba23..5ae7cd245 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -12,11 +12,11 @@ // TODO: Place shared formatting and trace infrastructure here -public protocol Traced { +protocol Traced { var isTracingEnabled: Bool { get set } } -public protocol TracedProcessor: ProcessorProtocol, Traced { +protocol TracedProcessor: ProcessorProtocol, Traced { // Empty defaulted func formatCallStack() -> String // empty default func formatSavePoints() -> String // empty default @@ -36,7 +36,7 @@ func lineNumber(_ pc: InstructionAddress) -> String { } extension TracedProcessor where Registers: Collection{ - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if !registers.isEmpty { return "\(registers)\n" @@ -48,19 +48,19 @@ extension TracedProcessor where Registers: Collection{ extension TracedProcessor { func printTrace() { print(formatTrace()) } - public func trace() { + func trace() { if isTracingEnabled { printTrace() } } // Helpers for the conformers - public func formatCallStack() -> String { + func formatCallStack() -> String { if !callStack.isEmpty { return "call stack: \(callStack)\n" } return "" } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { @@ -71,7 +71,7 @@ extension TracedProcessor { return "" } - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if Registers.self == E.self { return "" @@ -79,7 +79,7 @@ extension TracedProcessor { return "\(registers)\n" } - public func formatInput() -> String { + func formatInput() -> String { // String override for printing sub-character information. 
if !input.indices.contains(currentPosition) { // Format unicode scalars as: @@ -115,7 +115,7 @@ extension TracedProcessor { """ } - public func formatInstructionWindow( + func formatInstructionWindow( windowSize: Int = 12 ) -> String { if isAcceptState { return "ACCEPT" } @@ -139,7 +139,7 @@ extension TracedProcessor { return result } - public func formatTrace() -> String { + func formatTrace() -> String { var result = "\n--- cycle \(cycleCount) ---\n" result += formatCallStack() result += formatSavePoints() @@ -150,7 +150,7 @@ extension TracedProcessor { return result } - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, depth: Int = 5 ) -> String { @@ -160,7 +160,7 @@ extension TracedProcessor { } extension Collection where Element: InstructionProtocol, Index == InstructionAddress { - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, atCurrent: Bool, depth: Int diff --git a/Sources/_StringProcessing/Utility/TypedIndex.swift b/Sources/_StringProcessing/Utility/TypedIndex.swift index 3bddcadfd..adde06a3e 100644 --- a/Sources/_StringProcessing/Utility/TypedIndex.swift +++ b/Sources/_StringProcessing/Utility/TypedIndex.swift @@ -12,55 +12,43 @@ /// Forwarding wrapper around Int-index collections that provide a /// strongly (phantom) typed index. -@frozen -public struct TypedIndex: RawRepresentable where C.Index == Int { - @_alwaysEmitIntoClient - public var rawValue: C +struct TypedIndex: RawRepresentable where C.Index == Int { + var rawValue: C - @_alwaysEmitIntoClient - public init(rawValue: C) { self.rawValue = rawValue } + init(rawValue: C) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: C) { self.init(rawValue: rawValue) } + init(_ rawValue: C) { self.init(rawValue: rawValue) } } extension TypedIndex: Collection { - public typealias Index = TypedInt<šŸ‘»> - public typealias Element = C.Element + typealias Index = TypedInt<šŸ‘»> + typealias Element = C.Element - @_alwaysEmitIntoClient - public var startIndex: Index { Index(rawValue.startIndex) } + var startIndex: Index { Index(rawValue.startIndex) } - @_alwaysEmitIntoClient - public var endIndex: Index { Index(rawValue.endIndex )} + var endIndex: Index { Index(rawValue.endIndex )} - @_alwaysEmitIntoClient - public var count: Int { rawValue.count } + var count: Int { rawValue.count } - @_alwaysEmitIntoClient - public func index(after: Index) -> Index { + func index(after: Index) -> Index { Index(rawValue.index(after: after.rawValue)) } - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { rawValue[position.rawValue] } - @_alwaysEmitIntoClient - public func distance( + func distance( from start: Index, to end: Index ) -> Int { rawValue.distance(from: start.rawValue, to: end.rawValue) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int) -> Index { + func index(_ i: Index, offsetBy distance: Int) -> Index { Index(rawValue.index(i.rawValue, offsetBy: distance)) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? { + func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? 
{ guard let idx = rawValue.index(i.rawValue, offsetBy: distance, limitedBy: limit.rawValue) else { return nil } @@ -71,8 +59,7 @@ extension TypedIndex: Collection { extension TypedIndex: RandomAccessCollection where C: RandomAccessCollection { } extension TypedIndex: MutableCollection where C: MutableCollection { - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { _read { yield rawValue[position.rawValue] } @@ -82,8 +69,7 @@ extension TypedIndex: MutableCollection where C: MutableCollection { } } extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { - @_alwaysEmitIntoClient - public func index(before: Index) -> Index { + func index(before: Index) -> Index { Index(rawValue.index(before: before.rawValue)) } } @@ -92,11 +78,9 @@ extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { // failure in the Swift repo. #if false extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init() { rawValue = C() } + init() { rawValue = C() } - @_alwaysEmitIntoClient - public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { + mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { let rawRange = subrange.lowerBound.rawValue ..< subrange.upperBound.rawValue rawValue.replaceSubrange(rawRange, with: newElements) } @@ -107,14 +91,13 @@ extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollec // Workaround for #73 extension TypedIndex where C: RangeReplaceableCollection { - public mutating func append(_ newElement: Element) { + mutating func append(_ newElement: Element) { rawValue.append(newElement) } } extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiteral & RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init(arrayLiteral elements: Element...) { + init(arrayLiteral elements: Element...) { // TODO: any way around the RRC copying init? self.init(C(elements)) } @@ -122,5 +105,5 @@ extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiter // MARK: - Strongly typed wrappers -public typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> +typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index caff7f64e..249717b68 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -11,86 +11,71 @@ // Just a phantom-typed Int wrapper. 
-@frozen -public struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { - @_alwaysEmitIntoClient - public var rawValue: Int +struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { + var rawValue: Int - @_alwaysEmitIntoClient - public init(rawValue: Int) { + init(rawValue: Int) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: Int) { + init(_ rawValue: Int) { self.init(rawValue: rawValue) } - @_alwaysEmitIntoClient - public init(_ uint: UInt64) { + init(_ uint: UInt64) { assert(uint.leadingZeroBitCount > 0) self.init(Int(asserting: uint)) } } extension TypedInt: Comparable { - @_alwaysEmitIntoClient - public static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { + static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { return lhs.rawValue < rhs.rawValue } } extension TypedInt: CustomStringConvertible { - @_alwaysEmitIntoClient - public var description: String { return "#\(rawValue)" } + var description: String { return "#\(rawValue)" } } extension TypedInt: ExpressibleByIntegerLiteral { - @_alwaysEmitIntoClient - public init(integerLiteral value: Int) { + init(integerLiteral value: Int) { self.init(rawValue: value) } } -public protocol TypedIntProtocol { +protocol TypedIntProtocol { associatedtype šŸ‘» } extension TypedInt: TypedIntProtocol { } // A placeholder type for when we must supply a type. // When the phantom type appears, it says boo -public enum _Boo {} +enum _Boo {} // Easier for clients to just have their own typealias -public typealias TypedInt_ = TypedInt +typealias TypedInt_ = TypedInt // TODO: BinaryInteger, etc. extension TypedInt { - @_alwaysEmitIntoClient - public static func +(lhs: TypedInt, rhs: Int) -> TypedInt { + static func +(lhs: TypedInt, rhs: Int) -> TypedInt { return TypedInt(lhs.rawValue + rhs) } - @_alwaysEmitIntoClient - public var bits: UInt64 { + var bits: UInt64 { UInt64(asserting: self.rawValue) } } -@frozen -public struct TypedSetVector { - public typealias Idx = TypedInt<šŸ‘»> +struct TypedSetVector { + typealias Idx = TypedInt<šŸ‘»> // TODO: Replace with real set vector - @_alwaysEmitIntoClient - public var lookup: Dictionary = [:] + var lookup: Dictionary = [:] - @_alwaysEmitIntoClient - public var stored: Array = [] + var stored: Array = [] - @_alwaysEmitIntoClient - public func load(_ idx: Idx) -> Element { stored[idx.rawValue] } + func load(_ idx: Idx) -> Element { stored[idx.rawValue] } - @_alwaysEmitIntoClient @discardableResult - public mutating func store(_ e: Element) -> Idx { + mutating func store(_ e: Element) -> Idx { if let reg = lookup[e] { return reg } let reg = Idx(stored.count) stored.append(e) @@ -98,34 +83,32 @@ public struct TypedSetVector { return reg } - @_alwaysEmitIntoClient - public var count: Int { stored.count } + var count: Int { stored.count } - @_alwaysEmitIntoClient - public init() {} + init() {} } // MARK: - Strongly typed int wrappers /// A distance in the Input, e.g. `n` in consume(n) -public typealias Distance = TypedInt<_Distance> -public enum _Distance {} +typealias Distance = TypedInt<_Distance> +enum _Distance {} /// An instruction address, i.e. the index into our instruction list -public typealias InstructionAddress = TypedInt<_InstructionAddress> -public enum _InstructionAddress {} +typealias InstructionAddress = TypedInt<_InstructionAddress> +enum _InstructionAddress {} /// A position in the call stack, i.e. 
for save point restores -public typealias CallStackAddress = TypedInt<_CallStackAddress> -public enum _CallStackAddress {} +typealias CallStackAddress = TypedInt<_CallStackAddress> +enum _CallStackAddress {} /// A position in a position stack, i.e. for NFA simulation -public typealias PositionStackAddress = TypedInt<_PositionStackAddress> -public enum _PositionStackAddress {} +typealias PositionStackAddress = TypedInt<_PositionStackAddress> +enum _PositionStackAddress {} /// A position in the save point stack, i.e. for backtracking -public typealias SavePointStackAddress = TypedInt<_SavePointAddress> -public enum _SavePointAddress {} +typealias SavePointStackAddress = TypedInt<_SavePointAddress> +enum _SavePointAddress {} // MARK: - Registers @@ -135,85 +118,85 @@ public enum _SavePointAddress {} /// NOTE: Currently just used for static data, but e.g. could be /// used to save the most recently seen element satisfying some /// property -public typealias ElementRegister = TypedInt<_ElementRegister> -public enum _ElementRegister {} +typealias ElementRegister = TypedInt<_ElementRegister> +enum _ElementRegister {} -public typealias SequenceRegister = TypedInt<_SequenceRegister> -public enum _SequenceRegister {} +typealias SequenceRegister = TypedInt<_SequenceRegister> +enum _SequenceRegister {} /// The register number for a stored boolean value /// /// E.g. used for conditional branches -public typealias BoolRegister = TypedInt<_BoolRegister> -public enum _BoolRegister {} +typealias BoolRegister = TypedInt<_BoolRegister> +enum _BoolRegister {} /// The register number for a string (e.g. comment, failure reason) -public typealias StringRegister = TypedInt<_StringRegister> -public enum _StringRegister {} +typealias StringRegister = TypedInt<_StringRegister> +enum _StringRegister {} /// Used for consume functions, e.g. character classes -public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> -public enum _ConsumeFunctionRegister {} +typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> +enum _ConsumeFunctionRegister {} /// Used for assertion functions, e.g. 
anchors etc -public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -public enum _AssertionFunctionRegister {} +typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> +enum _AssertionFunctionRegister {} /// Used for capture transforms, etc -public typealias TransformRegister = TypedInt<_TransformRegister> -public enum _TransformRegister {} +typealias TransformRegister = TypedInt<_TransformRegister> +enum _TransformRegister {} /// Used for value-producing matchers -public typealias MatcherRegister = TypedInt<_MatcherRegister> -public enum _MatcherRegister {} +typealias MatcherRegister = TypedInt<_MatcherRegister> +enum _MatcherRegister {} /// UNIMPLEMENTED -public typealias IntRegister = TypedInt<_IntRegister> -public enum _IntRegister {} +typealias IntRegister = TypedInt<_IntRegister> +enum _IntRegister {} /// UNIMPLEMENTED -public typealias FloatRegister = TypedInt<_FloatRegister> -public enum _FloatRegister {} +typealias FloatRegister = TypedInt<_FloatRegister> +enum _FloatRegister {} /// UNIMPLEMENTED /// /// NOTE: This, along with a position stack, might /// serve NFA-simulation style execution models -public typealias PositionRegister = TypedInt<_PositionRegister> -public enum _PositionRegister {} +typealias PositionRegister = TypedInt<_PositionRegister> +enum _PositionRegister {} -public typealias ValueRegister = TypedInt<_ValueRegister> -public enum _ValueRegister {} +typealias ValueRegister = TypedInt<_ValueRegister> +enum _ValueRegister {} -public typealias CaptureRegister = TypedInt<_CaptureRegister> -public enum _CaptureRegister {} +typealias CaptureRegister = TypedInt<_CaptureRegister> +enum _CaptureRegister {} /// UNIMPLEMENTED -public typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> -public enum _InstructionAddressRegister {} +typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> +enum _InstructionAddressRegister {} /// UNIMPLEMENTED -public typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> -public enum _CallStackAddressRegister {} +typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> +enum _CallStackAddressRegister {} /// UNIMPLEMENTED -public typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> -public enum _PositionStackAddressRegister {} +typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> +enum _PositionStackAddressRegister {} /// UNIMPLEMENTED -public typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> -public enum _SavePointAddressRegister {} +typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> +enum _SavePointAddressRegister {} /// A numbered label -public typealias LabelId = TypedInt<_LabelId> -public enum _LabelId {} +typealias LabelId = TypedInt<_LabelId> +enum _LabelId {} /// A numbered function -public typealias FunctionId = TypedInt<_FunctionId> -public enum _FunctionId {} +typealias FunctionId = TypedInt<_FunctionId> +enum _FunctionId {} /// A numbered capture -public typealias CaptureId = TypedInt<_CaptureId> -public enum _CaptureId {} +typealias CaptureId = TypedInt<_CaptureId> +enum _CaptureId {} From 75d49319abf7120c34eab3d443c102e7d957e91a Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 22 Feb 2022 07:41:03 -0700 Subject: [PATCH 05/19] Initial custom-component test infrastructure --- Tests/RegexTests/CustomTests.swift | 82 ++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 28 deletions(-) diff 
--git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 73a50b108..692ad6c37 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -37,39 +37,65 @@ private struct Asciibbler: Nibbler { } } -extension RegexTests { - - // TODO: Refactor below into more exhaustive, declarative - // tests. - func testMatchingConsumers() { - - let regex = Regex { - Numbler() - Asciibbler() - } +enum MatchCall { + case match + case firstMatch +} - guard let result = "4t".match(regex) else { - XCTFail() - return +func customTest( + _ regex: Regex, + _ tests: (input: String, call: MatchCall, match: Match?)... +) { + for (input, call, match) in tests { + let result: Match? + switch call { + case .match: + result = input.match(regex)?.match + case .firstMatch: + result = input.firstMatch(of: regex)?.result } - XCTAssert(result.match == "4t") + XCTAssertEqual(result, match) + } +} - XCTAssertNil("4".match(regex)) - XCTAssertNil("t".match(regex)) - XCTAssertNil("t4".match(regex)) +extension RegexTests { - let regex2 = Regex { - oneOrMore { + // TODO: Refactor below into more exhaustive, declarative + // tests. + func testCustomRegexComponents() { + customTest( + Regex { Numbler() - } - } - - guard let res2 = "ab123c".firstMatch(of: regex2) else { - XCTFail() - return - } - - XCTAssertEqual(res2.match, "123") + Asciibbler() + }, + ("4t", .match, "4t"), + ("4", .match, nil), + ("t", .match, nil), + ("t x1y z", .firstMatch, "1y"), + ("t4", .match, nil)) + + customTest( + Regex { + oneOrMore { Numbler() } + }, + ("ab123c", .firstMatch, "123"), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, "55")) + + // FIXME: Requires we return a value instead of a range +// customTest( +// Regex { +// Numbler() +// }, +// ("ab123c", .firstMatch, 1), +// ("abc", .firstMatch, nil), +// ("55z", .match, nil), +// ("55z", .firstMatch, 5)) + + // TODO: Convert below tests to better infra. Right now + // it's hard because `Match` is constrained to be + // `Equatable` which tuples cannot be. let regex3 = Regex { capture { From 2eb7e4e1563a0dbedc8915dc68e3446dd051d716 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sat, 26 Feb 2022 14:20:51 -0700 Subject: [PATCH 06/19] Cleanup/simplify Executor codepaths (#190) --- Sources/_StringProcessing/Capture.swift | 10 + .../_StringProcessing/Engine/Consume.swift | 8 +- Sources/_StringProcessing/Executor.swift | 76 ++--- .../_StringProcessing/RegexDSL/Match.swift | 13 +- .../MatchingEngineTests.swift | 288 +----------------- Tests/RegexTests/CaptureTests.swift | 12 +- Tests/RegexTests/MatchTests.swift | 10 +- 7 files changed, 49 insertions(+), 368 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 915c4c5d7..ecfc558fe 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -71,6 +71,11 @@ extension StructuredCapture { value: storedCapture?.value, optionalCount: optionalCount) } + + func slice(from input: String) -> Substring? { + guard let r = storedCapture?.range else { return nil } + return input[r] + } } extension Sequence where Element == StructuredCapture { @@ -86,5 +91,10 @@ extension Sequence where Element == StructuredCapture { }) return TypeConstruction.tuple(of: caps) } + + func slices(from input: String) -> [Substring?] 
{ + self.map { $0.slice(from: input) } + } } + diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index cfb803de8..a4a3bf26c 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,16 +25,10 @@ extension Engine { } extension Engine where Input == String { - func consume( - _ input: Input - ) -> (Input.Index, CaptureList)? { - consume(input, in: input.startIndex ..< input.endIndex) - } - func consume( _ input: Input, in range: Range, - matchMode: MatchMode = .partialFromFront + matchMode: MatchMode ) -> (Input.Index, CaptureList)? { if enableTracing { print("Consume: \(input)") diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 5c098c5c5..9de2b0b3d 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -19,69 +19,33 @@ struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - struct Result { - var range: Range - var captures: [StructuredCapture] - var referencedCaptureOffsets: [ReferenceID: Int] - - var destructure: ( - matched: Range, - captures: [StructuredCapture], - referencedCaptureOffsets: [ReferenceID: Int] - ) { - (range, captures, referencedCaptureOffsets) - } - - init( - _ matched: Range, _ captures: [StructuredCapture], - _ referencedCaptureOffsets: [ReferenceID: Int] - ) { - self.range = matched - self.captures = captures - self.referencedCaptureOffsets = referencedCaptureOffsets - } - } - - func execute( - input: String, - in range: Range, - mode: MatchMode = .wholeString - ) -> Result? { + func match( + _ input: String, + in inputRange: Range, + _ mode: MatchMode + ) throws -> RegexMatch? { guard let (endIdx, capList) = engine.consume( - input, in: range, matchMode: mode + input, in: inputRange, matchMode: mode ) else { return nil } let capStruct = engine.program.captureStructure - do { - let range = range.lowerBound.. Result? { - self.execute( - input: input.base, - in: input.startIndex.., - mode: MatchMode = .wholeString - ) -> (Range, CaptureList)? { - engine.consume( - input, in: range, matchMode: mode - ).map { endIndex, capture in - (range.lowerBound.., + _ mode: MatchMode + ) throws -> RegexMatch<(Substring, DynamicCaptures)>? { + try match(input, in: inputRange, mode) } } diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index 2dd31c379..c2593b22a 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -69,16 +69,11 @@ extension RegexProtocol { mode: MatchMode = .wholeString ) -> RegexMatch? 
{ let executor = Executor(program: regex.program.loweredProgram) - guard let (range, captures, captureOffsets) = executor.execute( - input: input, in: inputRange, mode: mode - )?.destructure else { - return nil + do { + return try executor.match(input, in: inputRange, mode) + } catch { + fatalError(String(describing: error)) } - return RegexMatch( - input: input, - range: range, - rawCaptures: captures, - referencedCaptureOffsets: captureOffsets) } } diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index b7c89661d..ccfe85ec7 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -13,289 +13,5 @@ import XCTest @testable import _StringProcessing -/// Hold context and run variety of ad-hoc tests -/// -/// TODO: Use these to demonstrate first-order approximation of what -/// overhead such an engine imposes -fileprivate struct Test: ExpressibleByStringLiteral { - var input: String - var aEater: String - var manyAEater: String - var eatUntilA: String - var eatThroughA: String - - // TODO: Have tests explicitly show each step of type binding, - // input binding, etc. - var enableTracing: Bool? = nil - - /* - - until first A - through first A - until / through last A - etc - - */ - - var file: String - var line: UInt - - init( - _ s: String, - enableTracing: Bool? = nil, - file: String = #file, - line: UInt = #line - ) { - self.input = s - self.aEater = s.first == "A" ? String(s.dropFirst()) : s - self.manyAEater = String(s.drop(while: { $0 == "A" })) - - if let firstIdx = s.firstIndex(of: "A") { - self.eatUntilA = String(s[firstIdx...]) - self.eatThroughA = String(eatUntilA.dropFirst()) - } else { - self.eatUntilA = s - self.eatThroughA = s - } - - self.enableTracing = enableTracing - -// self.untilFirstAEater = String( -// s[(s.firstIndex(where: { $0 == "A" }) ?? s.startIndex)...]) - - - self.file = file - self.line = line - } - init( - stringLiteral: String, - file: String = #file, - line: UInt = #line - ) { - self.init(stringLiteral, file: file, line: line) - } - init(stringLiteral: String) { - // NOTE: Can't get source location of a literal... - self.init(stringLiteral) - } - - var slicedInput: (String, Range) { - let prefix = "aAa prefix āš ļø" - let suffix = "āš ļø aAa suffix" - let outer = prefix + input + suffix - let range = outer.mapOffsets( - (lower: prefix.count, upper: -suffix.count)) - return (outer, range) - } - - func check(_ engine: Engine, expected: String) { - var engine = engine - if let t = enableTracing { - engine.enableTracing = t - } - let output: String - let outputFromSlice: String - - if let (idx, _) = engine.consume(input) { - output = String(input[idx...]) - } else { - output = input - } - - let (outerInput, range) = slicedInput - if let (idx, _) = engine.consume(outerInput, in: range) { - outputFromSlice = String(outerInput[idx..? = nil, - manyAEater: Engine? = nil, - eatUntilA: Engine? = nil, - eatThroughA: Engine? 
= nil - ) { - if let engine = aEater { - check(engine, expected: self.aEater) - } - if let engine = manyAEater { - check(engine, expected: self.manyAEater) - } - if let engine = eatUntilA { - check(engine, expected: self.eatUntilA) - } - if let engine = eatThroughA { - check(engine, expected: self.eatThroughA) - } - } -} - -var doPrint = false -func show(_ s: CustomStringConvertible) { - if doPrint { print(s) } -} - -func makeEngine( - _ constructor: (inout Program.Builder) -> () -) -> Engine { - var builder = Program.Builder() - constructor(&builder) - let program = try! builder.assemble() - let engine = Engine(program) - show(engine) - return engine -} - -// Eat an A off the front -// -// [0] match "A" -// [1] accept -// -let aEater: Engine = { - makeEngine { builder in - builder.buildMatch("A") - builder.buildAccept() - } -}() - -// Eat many "A"s off the input -// -// [0] saveAddress [3] // .accept -// [1] match "A" -// [2] goto [1] // match "A" -// [3] accept -// -// NOTE: a save would restore input position, which we -// actually don't want to do. -// -// NOTE: We should compare with a more sophisticated match -// instruction that can take at least or at most, etc. -// -let manyAEater: Engine = { - makeEngine { builder in - let accTok = builder.makeAddress() - let matchTok = builder.makeAddress() - - builder.buildSaveAddress(accTok) - builder.buildMatch("A") - builder.resolve(matchTok) - builder.buildBranch(to: matchTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat until you find an A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] condBranch #0 [x] // accept -// [2] advance(1) -// [3] goto 0 -// [4] accept -// -// NOTE: This check-consume-else-branch pattern -// could be pretty common and might be worth a dedicated -// instruction. 
-let eatUntilA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildCondBranch(reg, to: accTok) - builder.buildAdvance(1) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat through the first A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] advance(1) -// [2] condBranch #0 [x] // accept -// [3] goto 0 -// [4] accept -let eatThroughA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildAdvance(1) - builder.buildCondBranch(reg, to: accTok) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - - - -class MatchingEngineTests: XCTestCase { - - func testAEaters() { - let tests: Array = [ - Test("abc"), - Test("Abc"), - Test("AAbc"), - Test(""), - Test("A"), - Test("b"), - Test("bbbA"), - Test("bbAbA"), - ] - - for test in tests { - test.check(aEater: aEater) - test.check(manyAEater: manyAEater) - test.check(eatUntilA: eatUntilA) - test.check(eatThroughA: eatThroughA) - } - } - - func testThreeLetterRepeat() { - // Check for a repeated 3-letter sequence, such as in - // `(...)\1` - // - // [0] movePosition(into: %low) - // [1] advance(3) - // [2] movePosition(into: %high) - // [3] matchSlice(%low, %high) - // [4] accept - let threeLetterRepeat: Engine = { - makeEngine { builder in - let low = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildAdvance(3) - let high = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildMatchSlice(lower: low, upper: high) - builder.buildAccept() - } - }() - - let tests: Array<(String, Bool)> = [ - ("abcabc", true), - ("abcabc_____", true), - ("dddddd_____", true), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™€ļøc", true), - ("abccba", false), - ("abcabb", false), - ("abcbac", false), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™‚ļøc", false), - ] - - for (test, expect) in tests { - let match = threeLetterRepeat.consume(test) != nil - XCTAssertEqual(expect, match) - } - } -} +// TODO: Unit tests for the engine itself. Functional testing +// is handled by regex tests. diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 9f3cc313b..cc3568c1d 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -142,13 +142,15 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex.. searcher algorithm var start = input.startIndex while true { - if let (range, caps) = self.executeFlat( - input: input, + if let result = try! self.dynamicMatch( + input, in: start.. 
Date: Sun, 27 Feb 2022 07:39:15 -0700 Subject: [PATCH 07/19] Quick bug fix / workaround for whole-match values (#191) --- Sources/_StringProcessing/Capture.swift | 2 - .../_StringProcessing/Engine/Consume.swift | 42 ++++--------------- Sources/_StringProcessing/Executor.swift | 28 +++++++++++-- .../_StringProcessing/RegexDSL/Match.swift | 10 +++++ Tests/RegexTests/CustomTests.swift | 17 ++++---- 5 files changed, 51 insertions(+), 48 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index ecfc558fe..5b43da870 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -96,5 +96,3 @@ extension Sequence where Element == StructuredCapture { self.map { $0.slice(from: input) } } } - - diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index a4a3bf26c..4e00a34b4 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -24,41 +24,17 @@ extension Engine { } } -extension Engine where Input == String { - func consume( - _ input: Input, - in range: Range, - matchMode: MatchMode - ) -> (Input.Index, CaptureList)? { - if enableTracing { - print("Consume: \(input)") - } - - var cpu = makeProcessor(input: input, bounds: range, matchMode: matchMode) - let result: Input.Index? = { - while true { - switch cpu.state { - case .accept: - return cpu.currentPosition - case .fail: - return nil - case .inProgress: cpu.cycle() - } - } - }() - - if enableTracing { - if let idx = result { - print("Result: \(input[.. Input.Index? { + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + return nil + case .inProgress: self.cycle() } } - guard let result = result else { return nil } - - let capList = cpu.storedCaptures - return (result, CaptureList( - values: capList, referencedCaptureOffsets: program.referencedCaptureOffsets)) } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 9de2b0b3d..c044cbf24 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -24,21 +24,41 @@ struct Executor { in inputRange: Range, _ mode: MatchMode ) throws -> RegexMatch? { - guard let (endIdx, capList) = engine.consume( - input, in: inputRange, matchMode: mode - ) else { + var cpu = engine.makeProcessor( + input: input, bounds: inputRange, matchMode: mode) + + guard let endIdx = cpu.consume() else { return nil } + + let capList = CaptureList( + values: cpu.storedCaptures, + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + let capStruct = engine.program.captureStructure let range = inputRange.lowerBound.. { let rawCaptures: [StructuredCapture] let referencedCaptureOffsets: [ReferenceID: Int] + let value: Any? + public var match: Match { if Match.self == (Substring, DynamicCaptures).self { // FIXME(rdar://89449323): Compiler assertion @@ -25,7 +27,15 @@ public struct RegexMatch { } else if Match.self == Substring.self { // FIXME: Plumb whole match (`.0`) through the matching engine. return input[range] as! Match + } else if rawCaptures.isEmpty, value != nil { + // FIXME: This is a workaround for whole-match values not + // being modeled as part of captures. We might want to + // switch to a model where results are alongside captures + return value! as! 
Match } else { + guard value == nil else { + fatalError("FIXME: what would this mean?") + } let typeErasedMatch = rawCaptures.existentialMatch(from: input[range]) return typeErasedMatch as! Match } diff --git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 692ad6c37..0ebfe4652 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -83,15 +83,14 @@ extension RegexTests { ("55z", .match, nil), ("55z", .firstMatch, "55")) - // FIXME: Requires we return a value instead of a range -// customTest( -// Regex { -// Numbler() -// }, -// ("ab123c", .firstMatch, 1), -// ("abc", .firstMatch, nil), -// ("55z", .match, nil), -// ("55z", .firstMatch, 5)) + customTest( + Regex { + Numbler() + }, + ("ab123c", .firstMatch, 1), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, 5)) // TODO: Convert below tests to better infra. Right now // it's hard because `Match` is constrained to be From 3ab3b179188bc9d59e4c125fd132f2ddc530fc4a Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Thu, 17 Feb 2022 15:20:47 -0800 Subject: [PATCH 08/19] Document specially integrated modules and integration process. --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/README.md b/README.md index e6f94377c..941231b24 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,65 @@ See [Declarative String Processing Overview][decl-string] ## Requirements - [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-02-03 or later. + +## Integration with Swift + +`_MatchingEngine`, `_CUnicode` and `_StringProcessing` are specially integrated modules that are built as part of apple/swift. + +Specifically, `_MatchingEngine` contains the parser for regular expression literals and is built both as part of the compiler and as a core library. `_CUnicode` and `_StringProcessing` are built together as a core library named `_StringProcessing`. + +| Module | Swift toolchain component | +| ------------------- | ------------------------------------------------------------------------------------ | +| `_MatchingEngine` | `SwiftCompilerSources/Sources/ExperimentalRegex` and `stdlib/public/_MatchingEngine` | +| `_CUnicode` | `stdlib/public/_StringProcessing` | +| `_StringProcessing` | `stdlib/public/_StringProcessing` | + +### Branching scheme + +#### Development branch + +The `main` branch is the branch for day-to-day development. Generally, you should create PRs against this branch. + +#### Swift integration branches + +Branches whose name starts with `swift/` are Swift integration branches similar to those in [apple/llvm-project](https://github.com/apple/llvm-project). For each branch, dropping the `swift/` prefix is the corresponding branch in [apple/swift](https://github.com/apple/swift). + +| apple/swift branch | apple/swift-experimental-string-processing branch | +| ------------------- | ----------------------------------------------------- | +| main | swift/main | +| release/5.7 | swift/release/5.7 | +| ... | swift/... | + +A pair of corresponding branches are expected to build successfully together and pass all tests. + +### Integration workflow + +To integrate the latest changes in apple/swift-experimental-string-processing to apple/swift, carefully follow the workflow: + +- Create pull requests. + - Create a pull request in apple/swift-experimental-string-processing from `main` to `swift/main`, e.g. "[Integration] main -> swift/main". 
+ - If apple/swift needs to be modified to work with the latest `main` in apple/swift-experimental-string-processing, create a pull request in apple/swift. +- Trigger CI. + - In the apple/swift-experimental-string-processing pull request, trigger CI using the following command (replacing `` with the apple/swift pull request number, if any): + ``` + apple/swift# # use this line only if there is an corresponding apple/swift PR + @swift-ci please test + ``` + - In the apple/swift pull request (if any), trigger CI using the following command (replacing `` with the apple/swift-experimental-string-processing pull request number): + ``` + apple/swift-experimental-string-processing# + @swift-ci please test + ``` +- Merge when approved. + - Merge the pull request in apple/swift-experimental-string-processing as a **merge commit**. + - Merge the pull request in apple/swift (if any). + +### Development notes + +Compiler integration can be tricky. Use special caution when developing `_MatchingEngine`, `_CUnicode` and `_StringProcessing` modules. + +- Do not change the names of these modules without due approval from compiler and infrastructure teams. +- Do not modify the existing ABI (e.g. C API, serialization format) between the regular expression parser and the Swift compiler unless absolutely necessary. +- Always minimize the number of lockstep integrations, i.e. when apple/swift-experimental-string-processing and apple/swift have to change together. Whenever possible, introduce new API first, migrate Swift compiler onto it, and then deprecate old API. Use versioning if helpful. +- In `_StringProcessing`, do not write fully qualified references to symbols in `_CUnicode`, and always wrap `import _CUnicode` in a `#if canImport(_CUnicode)`. This is because `_CUnicode` is built as part of `_StringProcessing` with CMake. +- In `_MatchingEngine`, do not write fully qualified references to `_MatchingEngine` itself. This is because `_MatchingEngine` is built as `ExperimentalRegex` in `SwiftCompilerSources/` with CMake. From cebf4a6a20fde052287358c6706c27084aa4d27e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 09/19] Fix crash on lone backslash --- Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift | 4 +++- Sources/_MatchingEngine/Regex/Parse/Source.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index dd785f12d..cfab75312 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -1472,7 +1472,9 @@ extension Source { return ref } - let char = src.eat() + guard let char = src.tryEat() else { + throw ParseError.expectedEscape + } // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 11bd8152f..ddf0475f3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -86,6 +86,12 @@ extension Source { tryEat(anyOf: set) } + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? 
{ + guard !isEmpty else { return nil } + return eat() + } + mutating func eat(asserting c: Char) { assert(peek() == c) advance() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..23a3b910f 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1753,6 +1753,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // MARK: Bad escapes + + diagnosticTest("\\", .expectedEscape) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From c48fb1cbdeeb37ba55ff449a1046bbefe48e7d24 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 10/19] Separate out DelimiterLexing.swift --- .../Regex/Parse/DelimiterLexing.swift | 153 ++++++++++++++++++ .../_MatchingEngine/Regex/Parse/Mocking.swift | 144 ----------------- 2 files changed, 153 insertions(+), 144 deletions(-) create mode 100644 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift new file mode 100644 index 000000000..70532f9e7 --- /dev/null +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// TODO: mock up multi-line soon + +enum Delimiter: Hashable, CaseIterable { + case traditional + case experimental + case reSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .traditional: return ("#/", "/#") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// The default set of syntax options that the delimiter indicates. + var defaultSyntaxOptions: SyntaxOptions { + switch self { + case .traditional, .reSingleQuote: + return .traditional + case .experimental: + return .experimental + } + } +} + +struct LexError: Error, CustomStringConvertible { + enum Kind: Hashable { + case endOfString + case invalidUTF8 // TODO: better range reporting + case unknownDelimiter + } + + var kind: Kind + + /// The pointer at which to resume lexing. + var resumePtr: UnsafeRawPointer + + init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { + self.kind = kind + self.resumePtr = resumePtr + } + + var description: String { + switch kind { + case .endOfString: return "unterminated regex literal" + case .invalidUTF8: return "invalid UTF-8 found in source file" + case .unknownDelimiter: return "unknown regex literal delimiter" + } + } +} + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + precondition(start <= end) + var current = start + + func ascii(_ s: Unicode.Scalar) -> UInt8 { + assert(s.value <= 0x7F) + return UInt8(asserting: s.value) + } + func load(offset: Int) -> UInt8? { + guard current + offset < end else { return nil } + return current.load(fromByteOffset: offset, as: UInt8.self) + } + func load() -> UInt8? { load(offset: 0) } + func advance(_ n: Int = 1) { + precondition(current + n <= end, "Cannot advance past end") + current = current.advanced(by: n) + } + + func tryEat(_ utf8: String.UTF8View) -> Bool { + for (i, idx) in utf8.indices.enumerated() { + guard load(offset: i) == utf8[idx] else { return false } + } + advance(utf8.count) + return true + } + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw LexError(.unknownDelimiter, resumeAt: current.successor()) + } + + let contentsStart = current + while true { + switch load() { + case nil, ascii("\n"), ascii("\r"): + throw LexError(.endOfString, resumeAt: current) + + case ascii("\\"): + // Skip next byte. + advance(2) + + default: + // Try to lex the closing delimiter. + let contentsEnd = current + guard tryEat(delimiter.closing.utf8) else { + advance() + continue + } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw LexError(.invalidUTF8, resumeAt: current) + } + return (contents: s, delimiter, end: current) + } + } +} + +/// Drop a set of regex delimiters from the input string, returning the contents +/// and the delimiter used. The input string must have valid delimiters. +func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { + let utf8 = str.utf8 + func stripDelimiter(_ delim: Delimiter) -> String? { + let prefix = delim.opening.utf8 + let suffix = delim.closing.utf8 + guard utf8.prefix(prefix.count).elementsEqual(prefix), + utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } + + return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + } + for d in Delimiter.allCases { + if let contents = stripDelimiter(d) { + return (contents, d) + } + } + fatalError("No valid delimiters") +} diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index e3a178a15..dfba4757e 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -9,150 +9,6 @@ // //===----------------------------------------------------------------------===// - -// TODO: mock up multi-line soon - -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - } - } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } - - /// The default set of syntax options that the delimiter indicates. 
- var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: - return .traditional - case .experimental: - return .experimental - } - } -} - -struct LexError: Error, CustomStringConvertible { - enum Kind: Hashable { - case endOfString - case invalidUTF8 // TODO: better range reporting - case unknownDelimiter - } - - var kind: Kind - - /// The pointer at which to resume lexing. - var resumePtr: UnsafeRawPointer - - init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { - self.kind = kind - self.resumePtr = resumePtr - } - - var description: String { - switch kind { - case .endOfString: return "unterminated regex literal" - case .invalidUTF8: return "invalid UTF-8 found in source file" - case .unknownDelimiter: return "unknown regex literal delimiter" - } - } -} - -/// Drop a set of regex delimiters from the input string, returning the contents -/// and the delimiter used. The input string must have valid delimiters. -func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 - func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) - } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { - return (contents, d) - } - } - fatalError("No valid delimiters") -} - -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start - - func ascii(_ s: Unicode.Scalar) -> UInt8 { - assert(s.value <= 0x7F) - return UInt8(asserting: s.value) - } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) - } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) - } - - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) - return true - } - - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) - } - - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) - - case ascii("\\"): - // Skip next byte. - advance(2) - - default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } - - // Form a string from the contents and make sure it's valid UTF-8. 
- let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) - - guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) - } - return (contents: s, delimiter, end: current) - } - } -} - private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { From 0cbb9af76935b22a32c5ce5a8d02b1e6ad285700 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 11/19] Rename LexError -> DelimiterLexError To avoid confusion with more general regex lexical analysis. --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_MatchingEngine/Regex/Parse/Mocking.swift | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 70532f9e7..c023a069c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -37,7 +37,7 @@ enum Delimiter: Hashable, CaseIterable { } } -struct LexError: Error, CustomStringConvertible { +struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case endOfString case invalidUTF8 // TODO: better range reporting @@ -97,14 +97,14 @@ func lexRegex( guard let delimiter = Delimiter.allCases.first( where: { tryEat($0.opening.utf8) } ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) + throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) } let contentsStart = current while true { switch load() { case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) + throw DelimiterLexError(.endOfString, resumeAt: current) case ascii("\\"): // Skip next byte. @@ -125,7 +125,7 @@ func lexRegex( let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) + throw DelimiterLexError(.invalidUTF8, resumeAt: current) } return (contents: s, delimiter, end: current) } diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index dfba4757e..b535edf1b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -52,7 +52,7 @@ func libswiftLexRegexLiteral( let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) return false - } catch let error as LexError { + } catch let error as DelimiterLexError { if error.kind == .unknownDelimiter { // An unknown delimiter should be recovered from, as we may want to try // lex something else. @@ -66,7 +66,7 @@ func libswiftLexRegexLiteral( // closing delimiters, which would help with code completion. return true } catch { - fatalError("Should be a LexError") + fatalError("Should be a DelimiterLexError") } } From 7e820821f296436b504bb0d9cd782a46eda11f04 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 12/19] Refactor delimiter lexing logic Introduce a DelimiterLexer type to perform the lexing. 
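For orientation, the refactor moves the lexing state (start, cursor, end) out of nested helper closures and into a struct that mutates its cursor in place. The following is a minimal sketch of that shape only, assuming an array-of-bytes input instead of the patch's raw-pointer bounds; `MiniDelimiterLexer` is an illustrative name, not the patch's API.

~~~swift
// Sketch: stateful cursor-based lexer, simplified from the DelimiterLexer
// introduced in this patch (which works over UnsafeRawPointer bounds and
// throws DelimiterLexError; neither is modeled here).
struct MiniDelimiterLexer {
  let bytes: [UInt8]
  var cursor: Int = 0

  // The byte at the cursor, or nil once the input is exhausted.
  func peek() -> UInt8? {
    cursor < bytes.count ? bytes[cursor] : nil
  }

  // Advance the cursor by n bytes; must stay within bounds.
  mutating func advance(_ n: Int = 1) {
    precondition(cursor + n <= bytes.count, "Cannot advance past end")
    cursor += n
  }

  // Attempt to eat a UTF-8 byte sequence, returning true on success.
  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
    let count = utf8.count
    guard cursor + count <= bytes.count,
          bytes[cursor ..< cursor + count].elementsEqual(utf8)
    else { return false }
    advance(count)
    return true
  }
}

// Usage: eat the opening delimiter, scan to the closing quote, eat it.
var lexer = MiniDelimiterLexer(bytes: Array("re'abc'".utf8))
assert(lexer.tryEat("re'".utf8))
while let b = lexer.peek(), b != UInt8(ascii: "'") { lexer.advance() }
assert(lexer.tryEat("'".utf8))
~~~

The actual `DelimiterLexer` in the diff below additionally validates UTF-8 in `tryEatEnding` and reports unterminated or malformed input via `DelimiterLexError`.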
--- .../Regex/Parse/DelimiterLexing.swift | 167 +++++++++++++----- 1 file changed, 121 insertions(+), 46 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c023a069c..e49a442e7 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible { } } -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start +fileprivate struct DelimiterLexer { + let start: UnsafeRawPointer + var cursor: UnsafeRawPointer + let end: UnsafeRawPointer + + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + precondition(start <= end) + self.start = start + self.cursor = start + self.end = end + } func ascii(_ s: Unicode.Scalar) -> UInt8 { assert(s.value <= 0x7F) return UInt8(asserting: s.value) } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) + + /// Return the byte at the current cursor, or `nil` if the end of the buffer + /// has been reached. + func load() -> UInt8? { + guard cursor < end else { return nil } + return cursor.load(as: UInt8.self) } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) + + /// Return the slice of `count` bytes from a specified cursor position, or + /// `nil` if there are fewer than `count` bytes until the end of the buffer. + func slice( + at cursor: UnsafeRawPointer, _ count: Int + ) -> UnsafeRawBufferPointer? { + guard cursor + count <= end else { return nil } + return UnsafeRawBufferPointer(start: cursor, count: count) } - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) + /// Return the slice of `count` bytes from the current cursor, or `nil` if + /// there are fewer than `count` bytes until the end of the buffer. + func slice(_ count: Int) -> UnsafeRawBufferPointer? { + slice(at: cursor, count) + } + + /// Advance the cursor `n` bytes. + mutating func advanceCursor(_ n: Int = 1) { + cursor += n + precondition(cursor <= end, "Cannot advance past end") + } + + /// Check to see if a UTF-8 sequence can be eaten from the current cursor. + func canEat(_ utf8: String.UTF8View) -> Bool { + guard let slice = slice(utf8.count) else { return false } + return slice.elementsEqual(utf8) + } + + /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. + mutating func tryEat(_ utf8: String.UTF8View) -> Bool { + guard canEat(utf8) else { return false } + advanceCursor(utf8.count) return true } - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) + /// Attempt to eat a particular closing delimiter, returning the contents of + /// the literal, and ending pointer, or `nil` if this is not a delimiter + /// ending. 
+ mutating func tryEatEnding( + _ delimiter: Delimiter, contentsStart: UnsafeRawPointer + ) throws -> (contents: String, end: UnsafeRawPointer)? { + let contentsEnd = cursor + guard tryEat(delimiter.closing.utf8) else { return nil } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) + } + return (contents: s, end: cursor) } - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw DelimiterLexError(.endOfString, resumeAt: current) + /// Attempt to advance the lexer, throwing an error if the end of a line or + /// the end of the buffer is reached. + mutating func advance(escaped: Bool = false) throws { + guard let next = load() else { + throw DelimiterLexError(.endOfString, resumeAt: cursor) + } + switch UnicodeScalar(next) { + case let next where !next.isASCII: + // Just advance into a UTF-8 sequence. It shouldn't matter that we'll + // iterate through each byte as we only match against ASCII, and we + // validate it at the end. This case is separated out so we can just deal + // with the ASCII cases below. + advanceCursor() + + case "\n", "\r": + throw DelimiterLexError(.endOfString, resumeAt: cursor) + + case "\0": + // TODO: Warn to match the behavior of String literal lexer? Or should + // we error as unprintable? + advanceCursor() + + case "\\" where !escaped: + // Advance again for an escape sequence. + advanceCursor() + try advance(escaped: true) - case ascii("\\"): - // Skip next byte. - advance(2) default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } + advanceCursor() + } + } - // Form a string from the contents and make sure it's valid UTF-8. - let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + /*consuming*/ mutating func lex( + ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) + } - guard s.utf8.elementsEqual(contents) else { - throw DelimiterLexError(.invalidUTF8, resumeAt: current) + let contentsStart = cursor + while true { + // Try to lex the closing delimiter. + if let (contents, end) = try tryEatEnding(delimiter, + contentsStart: contentsStart) { + return (contents, delimiter, end) } - return (contents: s, delimiter, end: current) + // Try to advance the lexer. + try advance() } } } @@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { } fatalError("No valid delimiters") } + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + var lexer = DelimiterLexer(start: start, end: end) + return try lexer.lex() +} From 8b3e2ef4bfd748159171332d3f8dc94d7bbb8ce0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 13/19] Diagnose unprintable ASCII characters This matches the behavior of the C++ lexer for string literals. --- .../Regex/Parse/DelimiterLexing.swift | 7 +++ .../Utility/MissingUnicode.swift | 8 +++ Tests/RegexTests/ParseTests.swift | 63 ++++++++++++++++--- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index e49a442e7..4b4618318 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -42,6 +42,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case endOfString case invalidUTF8 // TODO: better range reporting case unknownDelimiter + case unprintableASCII } var kind: Kind @@ -59,6 +60,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .endOfString: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" + case .unprintableASCII: return "unprintable ASCII character found in source file" } } } @@ -169,6 +171,11 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) + case let next where !next.isPrintableASCII: + // Diagnose unprintable ASCII. + // TODO: Ideally we would recover and continue to lex until the ending + // delimiter. + throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) default: advanceCursor() diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift index a6aae0b82..dccba3286 100644 --- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift +++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift @@ -661,3 +661,11 @@ extension Character { public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } } + +extension UnicodeScalar { + public var isPrintableASCII: Bool { + // Exclude non-printables before the space character U+20, and anything + // including and above the DEL character U+7F. + value >= 0x20 && value < 0x7F + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 23a3b910f..b0b2e5309 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,20 +107,26 @@ func parseTest( serializedCaptures.deallocate() } -func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, - file: StaticString = #file, line: UInt = #line +func delimiterLexingTest( + _ input: String, file: StaticString = #file, line: UInt = #line ) { - // First try lexing. - input.withCString { ptr in - let (contents, delim, end) = try! lexRegex(start: ptr, - end: ptr + input.count) - XCTAssertEqual(end, ptr + input.count, file: file, line: line) + input.withCString(encodedAs: UTF8.self) { ptr in + let endPtr = ptr + input.utf8.count + let (contents, delim, end) = try! 
lexRegex(start: ptr, end: endPtr) + XCTAssertEqual(end, endPtr, file: file, line: line) let (parseContents, parseDelim) = droppingRegexDelimiters(input) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) } +} + +func parseWithDelimitersTest( + _ input: String, _ expecting: AST.Node, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + delimiterLexingTest(input, file: file, line: line) let orig = try! parseWithDelimiters(input) let ast = orig.root @@ -199,6 +205,32 @@ func diagnosticTest( } } +func delimiterLexingDiagnosticTest( + _ input: String, _ expected: DelimiterLexError.Kind, + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, line: UInt = #line +) { + do { + _ = try input.withCString { ptr in + try lexRegex(start: ptr, end: ptr + input.count) + } + XCTFail(""" + Passed, but expected error: \(expected) + """, file: file, line: line) + } catch let e as DelimiterLexError { + guard e.kind == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.kind) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Unexpected error type: \(e)", file: file, line: line) + } +} + func libswiftDiagnosticMessageTest( _ input: String, _ expectedErr: String, file: StaticString = #file, line: UInt = #line @@ -1472,6 +1504,11 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) + parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + + // Printable ASCII characters. + delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1890,6 +1927,16 @@ extension RegexTests { diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) } + func testDelimiterLexingErrors() { + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. + delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + } + delimiterLexingDiagnosticTest("re'\n'", .endOfString) + delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + } + func testlibswiftDiagnostics() { libswiftDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") From 56414b8afd3cacaeb31ed3bf7b7a1186b889b624 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 14/19] Allow lexer recovery for missing closing delimiter Allow the C++ lexer to form a tok::regex_literal. This avoids generic fallback behavior, and better allows for things like code completion. The test case for this will be in the C++ repo. 
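The recovery hinges on treating the closing delimiter as optional when stripping delimiters, so an unterminated literal still yields usable contents. Below is a condensed sketch of that behavior, using a hypothetical `dropDelimiters` helper over `String` rather than the patch's `tryDropPrefix`/`tryDropSuffix` collection extensions.

~~~swift
// Sketch: the opening delimiter is mandatory, the closing delimiter is
// tolerated as missing (as in invalid or still-being-typed source).
func dropDelimiters(_ str: String, opening: String, closing: String) -> String? {
  // The opening delimiter must match.
  guard str.hasPrefix(opening) else { return nil }
  var contents = str.dropFirst(opening.count)
  // The closing delimiter may be absent; keep whatever tail is there.
  if contents.hasSuffix(closing) {
    contents = contents.dropLast(closing.count)
  }
  return String(contents)
}

// Usage: terminated and unterminated literals both yield their contents.
assert(dropDelimiters("re'[0-9]+'", opening: "re'", closing: "'") == "[0-9]+")
assert(dropDelimiters("re'[0-9]+", opening: "re'", closing: "'") == "[0-9]+")
~~~

In the patch itself, `droppingRegexDelimiters` applies this per `Delimiter` case, and `libswiftLexRegexLiteral` reports `.endOfString` as recoverable so the C++ lexer can still form the token.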
--- .../Regex/Parse/DelimiterLexing.swift | 17 ++++++++------- .../_MatchingEngine/Regex/Parse/Mocking.swift | 14 +++++++++---- Sources/_MatchingEngine/Utility/Misc.swift | 21 +++++++++++++++++++ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 4b4618318..c4be948ac 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -208,14 +208,17 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + // The opening delimiter must match. + guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + else { return nil } + + // The closing delimiter may optionally match, as it may not be present in + // invalid code. + if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { + slice = newSlice + } + return String(slice) } for d in Delimiter.allCases { if let contents = stripDelimiter(d) { diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index b535edf1b..5994a4f52 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -61,10 +61,16 @@ func libswiftLexRegexLiteral( errOut.pointee = copyCString("\(error)") curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - // For now, treat every error as unrecoverable. - // TODO: We should ideally be able to recover from a regex with missing - // closing delimiters, which would help with code completion. - return true + switch error.kind { + case .endOfString: + // Missing closing delimiter can be recovered from. + return false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. + return true + case .unknownDelimiter: + fatalError("Already handled") + } } catch { fatalError("Should be a DelimiterLexError") } diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift index bd1e395b5..55d3d3adc 100644 --- a/Sources/_MatchingEngine/Utility/Misc.swift +++ b/Sources/_MatchingEngine/Utility/Misc.swift @@ -108,7 +108,28 @@ extension Collection { >(_ idx: Index, in c: C) -> C.Index { c.index(atOffset: offset(of: idx)) } +} +extension Collection where Element: Equatable { + /// Attempt to drop a given prefix from the collection, returning the + /// resulting subsequence, or `nil` if the prefix does not match. + public func tryDropPrefix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let prefixCount = other.count + guard prefix(prefixCount).elementsEqual(other) else { return nil } + return dropFirst(prefixCount) + } + + /// Attempt to drop a given suffix from the collection, returning the + /// resulting subsequence, or `nil` if the suffix does not match. + public func tryDropSuffix( + _ other: C + ) -> SubSequence? 
where C.Element == Element { + let suffixCount = other.count + guard suffix(suffixCount).elementsEqual(other) else { return nil } + return dropLast(suffixCount) + } } extension UnsafeMutableRawPointer { From 61450e875d80c622d5b7adb60cdf6cfc6be56439 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 15/19] Add lexing heuristic to handle single quotes in re'...' If a single quote is encountered with a prefix of either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue to scan ahead to a closing `'`. Such prefixes would not be valid endings for a regex literal anyway, and this lets us handle the single quote variant of their syntax. For the group name cases, further refine this skipping behavior by only skipping over characters that could possibly appear in that case. This improves diagnostic behavior by ensuring we don't go wandering off into Swift code. --- .../Regex/Parse/DelimiterLexing.swift | 92 +++++++++++++++ Tests/RegexTests/ParseTests.swift | 106 ++++++++++++++++-- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c4be948ac..f1d3d5607 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer { slice(at: cursor, count) } + /// Return the slice of `count` bytes preceding the current cursor, or `nil` + /// if there are fewer than `count` bytes before the cursor. + func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? { + let priorCursor = cursor - count + guard priorCursor >= start else { return nil } + return slice(at: priorCursor, count) + } + /// Advance the cursor `n` bytes. mutating func advanceCursor(_ n: Int = 1) { cursor += n @@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer { return true } + /// Attempt to skip over a closing delimiter character that is unlikely to be + /// the actual closing delimiter. + mutating func trySkipDelimiter(_ delimiter: Delimiter) { + // Only the closing `'` for re'...' can potentially be skipped over. + switch delimiter { + case .traditional, .experimental: + return + case .reSingleQuote: + break + } + guard load() == ascii("'") else { return } + + /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those + /// are the cases that could use single quotes. Note that none of these + /// would be valid regex endings anyway. + let calloutPrefix = "(?C" + let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in + guard let priorSlice = sliceBehind(prior.utf8.count), + priorSlice.elementsEqual(prior.utf8) + else { return false } + + // Make sure the slice isn't preceded by a '\', as that invalidates this + // analysis. + if let prior = sliceBehind(priorSlice.count + 1) { + return prior[0] != ascii("\\") + } + return true + } + guard let prefix = prefix else { return } + let isCallout = prefix == calloutPrefix + + func isPossiblyGroupReference(_ c: UInt8) -> Bool { + // If this is an ASCII character, make sure it's for a group name. Leave + // other UTF-8 encoded scalars alone, this should at least catch cases + // where we run into a symbol such as `{`, `.`, `;` that would indicate + // we've likely advanced out of the bounds of the regex. 
+ let scalar = UnicodeScalar(c) + guard scalar.isASCII else { return true } + switch scalar { + // Include '-' and '+' which may be used in recursion levels and relative + // references. + case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+": + return true + default: + return false + } + } + + // Make a note of the current lexing position, as we may need to revert + // back to it. + let originalCursor = cursor + advanceCursor() + + // Try skip over what would be the contents of a group identifier/reference. + while let next = load() { + // Found the ending, we're done. Return so we can continue to lex to the + // real delimiter. + if next == ascii("'") { + advanceCursor() + return + } + + // If this isn't a callout, make sure we have something that could be a + // group reference. We limit the character set here to improve diagnostic + // behavior in the case where the literal is actually unterminated. We + // ideally don't want to go wandering off into Swift source code. We can't + // do the same for callouts, as they take arbitrary strings. + guard isCallout || isPossiblyGroupReference(next) else { break } + do { + try advance() + } catch { + break + } + } + // We bailed out, either because we ran into something that didn't look like + // an identifier, or we reached the end of the line. Revert back to the + // original guess of delimiter. + cursor = originalCursor + } + /// Attempt to eat a particular closing delimiter, returning the contents of /// the literal, and ending pointer, or `nil` if this is not a delimiter /// ending. @@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer { let contentsStart = cursor while true { + // Check to see if we're at a character that looks like a delimiter, but + // likely isn't. In such a case, we can attempt to skip over it. + trySkipDelimiter(delimiter) + // Try to lex the closing delimiter. if let (contents, end) = try tryEatEnding(delimiter, contentsStart: contentsStart) { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b0b2e5309..b499c0b98 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,28 +107,46 @@ func parseTest( serializedCaptures.deallocate() } +/// Test delimiter lexing. Takes an input string that starts with a regex +/// literal. If `ignoreTrailing` is true, there may be additional characters +/// that follow the literal that are not considered part of it. +@discardableResult func delimiterLexingTest( - _ input: String, file: StaticString = #file, line: UInt = #line -) { + _ input: String, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) - XCTAssertEqual(end, endPtr, file: file, line: line) + if ignoreTrailing { + XCTAssertNotEqual(end, endPtr, file: file, line: line) + } else { + XCTAssertEqual(end, endPtr, file: file, line: line) + } - let (parseContents, parseDelim) = droppingRegexDelimiters(input) + let rawPtr = UnsafeRawPointer(ptr) + let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr) + let literal = String(decoding: buffer, as: UTF8.self) + + let (parseContents, parseDelim) = droppingRegexDelimiters(literal) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) + return literal } } +/// Test parsing an input string with regex delimiters. 
If `ignoreTrailing` is +/// true, there may be additional characters that follow the literal that are +/// not considered part of it. func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, + _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. - delimiterLexingTest(input, file: file, line: line) + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(input) + let orig = try! parseWithDelimiters(literal) let ast = orig.root guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround @@ -1509,6 +1527,63 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter + // if it's clear that it's part of the regex syntax. + + parseWithDelimitersTest( + #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + parseWithDelimitersTest( + #"re'(?'a_bcA0-c1A'x*)'"#, + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + + parseWithDelimitersTest( + #"re'(?('a_bcA0')x|y)'"#, conditional( + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + parseWithDelimitersTest( + #"re'(?('+20')\')'"#, conditional( + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) + + parseWithDelimitersTest( + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + parseWithDelimitersTest( + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + + parseWithDelimitersTest( + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + parseWithDelimitersTest( + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + + parseWithDelimitersTest( + #"re'(?C'a*b\c šŸ”„_ ;')'"#, pcreCallout(.string(#"a*b\c šŸ”„_ ;"#))) + + // Fine, because we don't end up skipping. + delimiterLexingTest(#"re'(?'"#) + delimiterLexingTest(#"re'(?('"#) + delimiterLexingTest(#"re'\k'"#) + delimiterLexingTest(#"re'\g'"#) + delimiterLexingTest(#"re'(?C'"#) + + // Not a valid group name, but we can still skip over it. + delimiterLexingTest(#"re'(?'šŸ”„')'"#) + + // Escaped, so don't skip. These will ignore the ending `'` as we've already + // closed the literal. + parseWithDelimitersTest( + #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true + ) + delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) + delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1815,6 +1890,12 @@ extension RegexTests { diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) + // TODO: It might be better if tried to consume up to the closing `'` and + // diagnosed an invalid group name based on that. 
+ diagnosticTest(#"(?'abc ')"#, .expected("'")) + + diagnosticTest("(?'šŸ”„')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) @@ -1928,6 +2009,9 @@ extension RegexTests { } func testDelimiterLexingErrors() { + + // MARK: Printable ASCII + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) @@ -1935,6 +2019,14 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'\n'", .endOfString) delimiterLexingDiagnosticTest("re'\r'", .endOfString) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + + // MARK: Delimiter skipping + + delimiterLexingDiagnosticTest("re'(?''", .endOfString) + delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) + delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) } func testlibswiftDiagnostics() { From 2325cef781477a4b51deeafcdae9c528a486e682 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:57 +0000 Subject: [PATCH 16/19] Add support for rx'...' for experimental syntax --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 +++++--- Tests/RegexTests/ParseTests.swift | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index f1d3d5607..1227ade1f 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -15,12 +15,14 @@ enum Delimiter: Hashable, CaseIterable { case traditional case experimental case reSingleQuote + case rxSingleQuote var openingAndClosing: (opening: String, closing: String) { switch self { case .traditional: return ("#/", "/#") case .experimental: return ("#|", "|#") case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") } } var opening: String { openingAndClosing.opening } @@ -31,7 +33,7 @@ enum Delimiter: Hashable, CaseIterable { switch self { case .traditional, .reSingleQuote: return .traditional - case .experimental: + case .experimental, .rxSingleQuote: return .experimental } } @@ -134,11 +136,11 @@ fileprivate struct DelimiterLexer { /// Attempt to skip over a closing delimiter character that is unlikely to be /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { - // Only the closing `'` for re'...' can potentially be skipped over. + // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. 
switch delimiter { case .traditional, .experimental: return - case .reSingleQuote: + case .reSingleQuote, .rxSingleQuote: break } guard load() == ascii("'") else { return } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b499c0b98..2ee76b682 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1497,6 +1497,9 @@ extension RegexTests { parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) + parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) + parseWithDelimitersTest("rx'a b'", concat("a", "b")) + parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", changeMatchingOptions( @@ -1537,6 +1540,9 @@ extension RegexTests { #"re'(?'a_bcA0-c1A'x*)'"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + parseWithDelimitersTest( + #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) From 688f1d82a50615a46018279270a9323065ba12b0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 17:09:47 +0000 Subject: [PATCH 17/19] Change script property default to use Script Extension Change the default script property behavior for an unqualified value e.g `\p{Greek}` from `\p{Script=Greek}` to `\p{Script_Extension=Greek}`. This is arguably the more intuitive behavior, and matches what Perl does. --- .../Regex/Parse/CharacterPropertyClassification.swift | 2 +- Tests/RegexTests/MatchTests.swift | 1 + Tests/RegexTests/ParseTests.swift | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift index 6a5740aa1..e5b65a46c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift @@ -381,7 +381,7 @@ extension Source { return .generalCategory(cat) } if let script = classifyScriptProperty(value) { - return .script(script) + return .scriptExtension(script) } if let posix = classifyPOSIX(value) { return .posix(posix) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3cd2df585..6ee55e414 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -695,6 +695,7 @@ extension RegexTests { firstMatchTest(#"\p{ISBAMUM}"#, input: "123ꚠꚔꚢxyz", match: "ꚠ") firstMatchTest(#"\p{Script=Unknown}"#, input: "\u{10FFFF}", match: "\u{10FFFF}") firstMatchTest(#"\p{scx=Gujr}"#, input: "\u{a839}", match: "\u{a839}") + firstMatchTest(#"\p{Gujr}"#, input: "\u{a839}", match: "\u{a839}") firstMatchTest(#"\p{alpha}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\P{alpha}"#, input: "123abcXYZ", match: "1") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..e911ee449 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1003,13 +1003,13 @@ extension RegexTests { parseTest(#"\p{sc=grek}"#, prop(.script(.greek))) parseTest(#"\p{sc=isGreek}"#, prop(.script(.greek))) - parseTest(#"\p{Greek}"#, prop(.script(.greek))) - parseTest(#"\p{isGreek}"#, prop(.script(.greek))) + parseTest(#"\p{Greek}"#, prop(.scriptExtension(.greek))) + parseTest(#"\p{isGreek}"#, 
prop(.scriptExtension(.greek))) parseTest(#"\P{Script=Latn}"#, prop(.script(.latin), inverted: true)) parseTest(#"\p{script=zzzz}"#, prop(.script(.unknown))) parseTest(#"\p{ISscript=iszzzz}"#, prop(.script(.unknown))) parseTest(#"\p{scx=bamum}"#, prop(.scriptExtension(.bamum))) - parseTest(#"\p{ISBAMUM}"#, prop(.script(.bamum))) + parseTest(#"\p{ISBAMUM}"#, prop(.scriptExtension(.bamum))) parseTest(#"\p{alpha}"#, prop(.binary(.alphabetic))) parseTest(#"\p{DEP}"#, prop(.binary(.deprecated))) From e8c84a19714041eb8544c492a2aa148b2868240d Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 13:52:19 -0600 Subject: [PATCH 18/19] Add CustomRegexComponent example (#196) Adds a test with a `SemanticVersion` type that conforms to `CustomRegexComponent`. --- Tests/RegexTests/RegexDSLTests.swift | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 554ef905f..d599400c6 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -642,6 +642,59 @@ class RegexDSLTests: XCTestCase { } } } + + func testSemanticVersionExample() { + struct SemanticVersion: Equatable { + var major: Int + var minor: Int + var patch: Int + var dev: String? + } + struct SemanticVersionParser: CustomRegexComponent { + typealias Match = SemanticVersion + func match( + _ input: String, + startingAt index: String.Index, + in bounds: Range + ) -> (upperBound: String.Index, match: SemanticVersion)? { + let regex = Regex { + tryCapture(oneOrMore(.digit)) { Int($0) } + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + optionally { + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + } + optionally { + "-" + capture(oneOrMore(.word)) + } + } + + guard let match = input[index.. Date: Thu, 3 Mar 2022 08:52:06 -0600 Subject: [PATCH 19/19] Support text segment boundary anchors (#178) This enables the `\y` and `\Y` anchors in regex literals and `Anchor.textSegmentBoundary` in the DSL. Note: This also includes `UnicodeScalar` conformance to `RegexProtocol`, which acts like Unicode scalar literals in regex literals. --- Sources/_StringProcessing/ByteCodeGen.swift | 12 ++++++++---- Sources/_StringProcessing/RegexDSL/DSL.swift | 16 +++++++++++----- Tests/RegexTests/MatchTests.swift | 16 ++++++++++++++-- Tests/RegexTests/RegexDSLTests.swift | 10 ++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 93dca17a8..d6389c1f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen { } case .textSegment: - // This we should be able to do! - throw Unsupported(#"\y (text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + input.isOnGraphemeClusterBoundary(pos) + } case .notTextSegment: - // This we should be able to do! 
- throw Unsupported(#"\Y (not text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + !input.isOnGraphemeClusterBoundary(pos) + } case .startOfLine: builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 17f006231..a21dce82d 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -17,8 +17,7 @@ extension String: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(self)) } } @@ -26,8 +25,7 @@ extension Substring: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(String(self))) } } @@ -35,7 +33,15 @@ extension Character: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.char(self))) + .init(node: .atom(.char(self))) + } +} + +extension UnicodeScalar: RegexProtocol { + public typealias Match = Substring + + public var regex: Regex { + .init(node: .atom(.scalar(self))) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 6ee55e414..dba72820f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -877,7 +877,8 @@ extension RegexTests { #"\d+\b"#, ("123", "123"), (" 123", "123"), - ("123 456", "123")) + ("123 456", "123"), + ("123A 456", "456")) firstMatchTests( #"\d+\b\s\b\d+"#, ("123", nil), @@ -893,7 +894,18 @@ extension RegexTests { // TODO: \G and \K // TODO: Oniguruma \y and \Y - + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both: + ("Cafe\u{301}", "e"), // composed and + ("Sol Cafe", "e")) // standalone + firstMatchTests( + #"\u{65}\y"#, // Grapheme boundary assertion + ("Cafe\u{301}", nil), + ("Sol Cafe", "e")) + firstMatchTests( + #"\u{65}\Y"#, // Grapheme non-boundary assertion + ("Cafe\u{301}", "e"), + ("Sol Cafe", nil)) } func testMatchGroups() { diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index d599400c6..d78ff04e5 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase { Anchor.endOfLine } + try _testDSLCaptures( + ("Cafe\u{301}", nil), + ("Cafe", "Cafe"), + matchType: Substring.self, ==) + { + oneOrMore(.word) + UnicodeScalar("e") + Anchor.textSegmentBoundary + } + try _testDSLCaptures( ("aaaaa1", "aaaaa1"), ("aaaaa2", nil),