diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..ad640f43a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+cmake_minimum_required(VERSION 3.18)
+project(SwiftExperimentalStringProcessing
+ LANGUAGES Swift)
+
+if(CMAKE_SYSTEM_NAME STREQUAL Windows OR CMAKE_SYSTEM_NAME STREQUAL Darwin)
+ option(BUILD_SHARED_LIBS "Build shared libraries by default" YES)
+endif()
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_Swift_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/swift)
+
+find_package(ArgumentParser CONFIG)
+
+add_subdirectory(Sources)
diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md
new file mode 100644
index 000000000..5bfdd5e8a
--- /dev/null
+++ b/Documentation/Evolution/RegexSyntax.md
@@ -0,0 +1,845 @@
+
+
+# Regex Syntax
+
+- Authors: Hamish Knight, Michael Ilseman
+
+## Introduction
+
+A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Regexes can be created from a string at run time or from a literal at compile time. The contents of that run-time string, or the contents in-between the compile-time literal's delimiters, uses regex syntax. We present a detailed and comprehensive treatment of regex syntax.
+
+This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. This proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status.
+
+
+## Motivation
+
+Swift aims to be a pragmatic programming language, striking a balance between familiarity, interoperability, and advancing the art. Swift's `String` presents a uniquely Unicode-forward model of string, but currently suffers from limited processing facilities.
+
+The full string processing effort includes a regex type with strongly typed captures, the ability to create a regex from a string at runtime, a compile-time literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, and a slew of regex-powered algorithms over strings.
+
+This proposal specifically hones in on the _familiarity_ aspect by providing a best-in-class treatment of familiar regex syntax.
+
+## Proposed Solution
+
+We propose accepting a syntactic "superset" of the following existing regular expression engines:
+
+- [PCRE 2][pcre2-syntax], an "industry standard" and a rough superset of Perl, Python, etc.
+- [Oniguruma][oniguruma-syntax], a modern engine with additional features.
+- [ICU][icu-syntax], used by NSRegularExpression, a Unicode-focused engine.
+- [.NET][.net-syntax], which adds delimiter-balancing and some interesting minor details around conditional patterns.
+
+To our knowledge, all other popular regex engines support a subset of the above syntaxes.
+
+We also support [UTS#18][uts18]'s full set of character class operators (to our knowledge no other engine does). Beyond that, UTS#18 deals with semantics rather than syntax, and what syntax it uses is covered by the above list. We also parse Java's properties (e.g. `\p{javaLowerCase}`), meaning we support a superset of Java 8 as well.
+
+Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below.
+
+Regex syntax will be part of Swift's source-compatibility story as well as its binary-compatibility story. Thus, we present a detailed and comprehensive design.
+
+## Detailed Design
+
+We propose the following syntax for regex.
+
+Grammar Notation
+
+For the grammar sections, we use a modified PEG-like notation, in which the grammar also describes an unambiguous top-down parsing algorithm.
+
+- `Element -> Definition` gives the definition of `Element`
+- The `|` operator specifies a choice of alternatives
+- `'x'` is the literal character `x`, otherwise it's a reference to x
+ + A literal `'` is spelled `"'"`
+- Postfix `*` `+` and `?` denote zero-or-more, one-or-more, and zero-or-one
+- Range quantifiers, like `{1...4}`, use Swift range syntax as convention.
+- Basic custom character classes are written like `[0-9a-zA-Z]`
+- Prefix `!` operator means the next element must not appear (a zero-width assertion)
+- Parenthesis group for the purposes of quantification
+- Builtins use angle brackets:
+ - `<Int>` refers to an integer, `<Char>` a character, etc.
+ - `<Space>` is any whitespace character
+ - `<EOL>` is the end-of-line anchor (e.g. `$` in regex).
+
+For example, `(!'|' !')' ConcatComponent)*` means any number (zero or more) occurrences of `ConcatComponent` so long as the initial character is neither a literal `|` nor a literal `)`.
+
+
+
+### Top-level regular expression
+
+```
+Regex -> GlobalMatchingOptionSequence? RegexNode
+RegexNode -> '' | Alternation
+Alternation -> Concatenation ('|' Concatenation)*
+Concatenation -> (!'|' !')' ConcatComponent)*
+```
+
+A regex may be prefixed with a sequence of [global matching options](#pcre-global-matching-options). Its contents can be empty or a sequence of alternatives separated by `|`.
+
+Alternatives are a series of expressions concatenated together. The concatenation ends with either a `|` denoting the end of the alternative or a `)` denoting the end of a recursively parsed group.
+
+Alternation has a lower precedence than concatenation or other operations, so e.g `abc|def` matches against `abc` or `def`.
+
+### Concatenated subexpressions
+
+```
+ConcatComponent -> Trivia | Quote | Quantification
+
+Trivia -> Comment | NonSemanticWhitespace
+Comment -> '(?#' (!')')* ')' | EndOfLineComment
+
+(extended syntax only) EndOfLineComment -> '#' (!<EOL> .)*
+(extended syntax only) NonSemanticWhitespace -> <Space>+
+
+Quote -> '\Q' (!'\E' .)* '\E'
+
+```
+
+Each component of a concatenation may be "trivia" (comments and non-semantic whitespace, if applicable), a quoted run of literal content, or a potentially-quantified subexpression.
+
+In-line comments, similarly to C, are lexical and are not recursively nested like normal groups are. A closing `)` cannot be escaped. Quotes are similarly lexical, non-nested, and the `\` before a `\E` cannot be escaped.
+
+For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. `\Q\\E` is a literal `\`.
+
+### Quantified subexpressions
+
+```
+Quantification -> QuantOperand Quantifier?
+Quantifier -> QuantAmount QuantKind?
+QuantAmount -> '?' | '*' | '+' | '{' Range '}'
+QuantKind -> '?' | '+'
+Range -> ',' <Int> | <Int> ',' <Int>? | <Int>
+
+QuantOperand -> AbsentFunction | Atom | Conditional | CustomCharClass | Group
+```
+
+Subexpressions can be quantified, meaning they will be repeated some number of times:
+
+- `?`: 0 or 1 times.
+- `*`: 0 or more times.
+- `+`: 1 or more times.
+- `{n,m}`: Between `n` and `m` (inclusive) times.
+- `{n,}`: `n` or more times.
+- `{,m}`: Up to `m` times.
+- `{n}`: Exactly `n` times.
+
+Behavior can further be refined by a subsequent `?` or `+`:
+
+- `x*` _eager_: consume as much of input as possible.
+- `x*?` _reluctant_: consume as little of the input as possible.
+- `x*+`: _possessive_: eager and never relinquishes any input consumed.
+
+### Atoms
+
+```
+Atom -> Anchor
+ | Backreference
+ | BacktrackingDirective
+ | BuiltinCharacterClass
+ | Callout
+ | CharacterProperty
+ | EscapeSequence
+ | NamedScalar
+ | Subpattern
+ | UnicodeScalar
+ | '\K'
+ | '\'? <Char>
+```
+
+Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`.
+
+#### Anchors
+
+```
+Anchor -> '^' | '$' | '\A' | '\b' | '\B' | '\G' | '\y' | '\Y' | '\z' | '\Z'
+```
+
+Anchors match against a certain position in the input rather than on a particular character of the input.
+
+- `^`: Matches at the very start of the input string, or the start of a line when in multi-line mode.
+- `$`: Matches at the very end of the input string, or the end of a line when in multi-line mode.
+- `\A`: Matches at the very start of the input string.
+- `\Z`: Matches at the very end of the input string, in addition to before a newline at the very end of the input string.
+- `\z`: Like `\Z`, but only matches at the very end of the input string.
+- `\G`: Like `\A`, but also matches against the start position of where matching resumes in global matching mode (e.g `\Gab` matches twice in `abab`, `\Aab` would only match once).
+- `\b` matches a boundary between a word character and a non-word character. The definitions of which vary depending on matching engine.
+- `\B` matches a non-word-boundary.
+- `\y` matches a text segment boundary, the definition of which varies based on the `y{w}` and `y{g}` matching option.
+- `\Y` matches a non-text-segment-boundary.
+
+#### Escape sequences
+
+```
+EscapeSequence -> '\a' | '\b' | '\c' | '\e' | '\f' | '\n' | '\r' | '\t'
+```
+
+These escape sequences each denote a specific scalar value.
+
+- `\a`: The alert (bell) character `U+7`.
+- `\b`: The backspace character `U+8`. Note this may only be used in a custom character class, otherwise it represents a word boundary.
+- `\c <Char>`: A control character sequence, which denotes a scalar from `U+00` - `U+7F` depending on the ASCII character provided.
+- `\e`: The escape character `U+1B`.
+- `\f`: The form-feed character `U+C`.
+- `\n`: The newline character `U+A`.
+- `\r`: The carriage return character `U+D`.
+- `\t`: The tab character `U+9`.
+
+#### Builtin character classes
+
+```
+BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X'
+```
+
+- `.`: Any character excluding newlines.
+- `\C`: A single UTF code unit.
+- `\d`: Digit character.
+- `\D`: Non-digit character.
+- `\h`: Horizontal space character.
+- `\H`: Non-horizontal-space character.
+- `\N`: Non-newline character.
+- `\O`: Any character (including newlines). This is syntax from Oniguruma.
+- `\R`: Newline sequence.
+- `\s`: Whitespace character.
+- `\S`: Non-whitespace character.
+- `\v`: Vertical space character.
+- `\V`: Non-vertical-space character.
+- `\w`: Word character.
+- `\W`: Non-word character.
+- `\X`: Any extended grapheme cluster.
+
+Precise definitions of character classes is discussed in [Character Classes for String Processing](https://forums.swift.org/t/pitch-character-classes-for-string-processing/52920).
+
+#### Unicode scalars
+
+```
+UnicodeScalar -> '\u{' HexDigit{1...} '}'
+ | '\u' HexDigit{4}
+ | '\x{' HexDigit{1...} '}'
+ | '\x' HexDigit{0...2}
+ | '\U' HexDigit{8}
+ | '\o{' OctalDigit{1...} '}'
+ | '\0' OctalDigit{0...3}
+
+HexDigit -> [0-9a-fA-F]
+OctalDigit -> [0-7]
+
+NamedScalar -> '\N{' ScalarName '}'
+ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+
+```
+
+These sequences define a unicode scalar value using hexadecimal or octal notation.
+
+`\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior.
+
+`\N{...}` allows a specific Unicode scalar to be specified by name or hexadecimal code point.
+
+#### Character properties
+
+```
+CharacterProperty -> '\' ('p' | 'P') '{' PropertyContents '}'
+POSIXCharacterProperty -> '[:' PropertyContents ':]'
+
+PropertyContents -> PropertyName ('=' PropertyName)?
+PropertyName -> [\s\w-]+
+```
+
+A character property specifies a particular Unicode, POSIX, or PCRE property to match against. We propose supporting:
+
+- The full range of Unicode character properties.
+- The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties).
+- The UTS#18 special properties `any`, `assigned`, `ascii`.
+- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`.
+- The special Java properties `javaLowerCase`, `javaUpperCase`, `javaWhitespace`, `javaMirrored`.
+
+We follow [UTS#18][uts18]'s guidance for character properties, including fuzzy matching for property name parsing, according to rules set out by [UAX44-LM3]. The following property names are equivalent:
+
+- `whitespace`
+- `isWhitespace`
+- `is-White_Space`
+- `iSwHiTeSpaCe`
+- `i s w h i t e s p a c e`
+
+Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. Each component follows the fuzzy matching rule, and additionally may have an alternative alias spelling, as defined by Unicode in [PropertyAliases.txt][unicode-prop-key-aliases] and [PropertyValueAliases.txt][unicode-prop-value-aliases].
+
+There are some Unicode properties where the key or value may be inferred. These include:
+
+- General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`.
+- Script properties e.g `\p{Greek}` is inferred as `\p{Script_Extensions=Greek}`.
+- Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`.
+- Block properties that begin with the prefix `in`, e.g `\p{inBasicLatin}` is inferred to be `\p{Block=Basic_Latin}`.
+
+Other Unicode properties however must specify both a key and value.
+
+For non-Unicode properties, only a value is required. These include:
+
+- The UTS#18 special properties `any`, `assigned`, `ascii`.
+- The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings.
+- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`.
+- The special Java properties `javaLowerCase`, `javaUpperCase`, `javaWhitespace`, `javaMirrored`.
+
+Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`.
+
+#### `\K`
+
+The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not affect captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`.
+
+### Groups
+
+```
+Group -> GroupStart RegexNode ')'
+GroupStart -> '(' GroupKind | '('
+GroupKind -> '' | '?' BasicGroupKind | '*' PCRE2GroupKind ':'
+
+BasicGroupKind -> ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '<!' | '<*'
+ | NamedGroup
+ | MatchingOptionSeq (':' | ')')
+
+PCRE2GroupKind -> 'atomic'
+ | 'pla' | 'positive_lookahead'
+ | 'nla' | 'negative_lookahead'
+ | 'plb' | 'positive_lookbehind'
+ | 'nlb' | 'negative_lookbehind'
+ | 'napla' | 'non_atomic_positive_lookahead'
+ | 'naplb' | 'non_atomic_positive_lookbehind'
+ | 'sr' | 'script_run'
+ | 'asr' | 'atomic_script_run'
+
+NamedGroup -> 'P<' GroupNameBody '>'
+ | '<' GroupNameBody '>'
+ | "'" GroupNameBody "'"
+
+GroupNameBody -> Identifier | BalancingGroupBody
+
+Identifier -> [\w--\d] \w*
+```
+
+Groups define a new scope that contains a recursively nested regex. Groups have different semantics depending on how they are introduced.
+
+Note there are additional constructs that may syntactically appear similar to groups, such as backreferences and PCRE backtracking directives, but are distinct.
+
+#### Basic group kinds
+
+- `()`: A capturing group.
+- `(?:)`: A non-capturing group.
+- `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See [Group Numbering](#group-numbering).
+
+Capturing groups produce captures, which remember the range of input matched for the scope of that group.
+
+A capturing group may be named using any of the `NamedGroup` syntax. The characters of the group name may be any letter or number characters or the character `_`. However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references.
+
+#### Atomic groups
+
+An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This has the same semantics as a possessive quantifier, but applies more generally to any regex pattern.
+
+#### Lookahead and lookbehind
+
+These groups evaluate the input ahead or behind the current matching position, without advancing the input.
+
+- `(?=`: A lookahead, which matches against the input following the current matching position.
+- `(?!`: A negative lookahead, which ensures a negative match against the input following the current matching position.
+- `(?<=`: A lookbehind, which matches against the input prior to the current matching position.
+- `(?<!`: A negative lookbehind, which ensures a negative match against the input prior to the current matching position.
+
+#### Balancing groups
+
+```
+BalancingGroupBody -> Identifier? '-' Identifier
+```
+
+Introduced by .NET, [balancing groups][balancing-groups] extend the `GroupNameBody` syntax to support the ability to refer to a prior group. Upon matching, the prior group is deleted, and any intermediate matched input becomes the capture of the current group.
+
+#### Group numbering
+
+Capturing groups are implicitly numbered according to the position of their opening `(` in the regex. For example:
+
+```
+(a((?:b)(?<name>c)d)(e)f)
+^ ^ ^ ^
+1 2 3 4
+```
+
+Non-capturing groups are skipped over when counting.
+
+Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child. Outside the alternation, numbering resumes at the next available number not used in one of the branches. For example:
+
+```
+(a()(?|(b)(c)|(?:d)|(e)))(f)
+^ ^ ^ ^ ^ ^
+1 2 3 4 3 5
+```
+
+### Custom character classes
+
+```
+CustomCharClass -> Start Set (SetOp Set)* ']'
+Start -> '[' '^'?
+Set -> Member+
+Member -> CustomCharClass | Quote | Range | Atom
+Range -> RangeElt `-` RangeElt
+RangeElt -> <Char> | UnicodeScalar | EscapeSequence
+SetOp -> '&&' | '--' | '~~' | '-'
+```
+
+Custom characters classes introduce their own sublanguage, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only some atoms are considered valid:
+
+- Builtin character classes, except for `.`, `\R`, `\O`, `\X`, `\C`, and `\N`.
+- Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary).
+- Unicode scalars.
+- Named scalars.
+- Character properties.
+- Plain literal characters.
+
+Atoms may be used to compose other character class members, including ranges, quoted sequences, and even nested custom character classes `[[ab]c\d]`. Adjacent members form an implicit union of character classes, e.g `[[ab]c\d]` is the union of the characters `a`, `b`, `c`, and digit characters.
+
+Custom character classes may not be empty, e.g `[]` is forbidden. A custom character class may begin with the `]` character, in which case it is treated as literal, e.g `[]a]` is the custom character class of `]` and `a`.
+
+Quoted sequences may be used to escape the contained characters, e.g `[a\Q]\E]` is also the character class of `]` and `a`.
+
+Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` cannot be used to form a range, it is interpreted as literal, e.g `[-a-]` is the character class of `-` and `a`. `[a-c-d]` is the character class of `a`...`c`, `-`, and `d`.
+
+Operators may be used to apply set operations to character class members. The operators supported are:
+
+- `&&`: Intersection of the LHS and RHS.
+- `--`: Subtraction of the RHS from the LHS.
+- `~~`: Symmetric difference of the RHS and LHS.
+- `-`: .NET's spelling of subtracting the RHS from the LHS.
+
+These operators have a lower precedence than the implicit union of members, e.g `[ac-d&&a[d]]` is an intersection of the character classes `[ac-d]` and `[ad]`.
+
+To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We propose following this behavior.
+
+
+### Matching options
+
+```
+MatchingOptionSeq -> '^' MatchingOption*
+ | MatchingOption+
+ | MatchingOption* '-' MatchingOption*
+
+MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}'
+```
+
+A matching option sequence may be used as a group specifier, and denotes a change in matching options for the scope of that group. For example `(?x:a b c)` enables extended syntax for `a b c`. A matching option sequence may be part of an "isolated group" which has an implicit scope that wraps the remaining elements of the current group. For example, `(?x)a b c` also enables extended syntax for `a b c`.
+
+If used in the branch of an alternation, an isolated group affects all the following branches of that alternation. For example, `a(?i)b|c|d` is treated as `a(?i:b)|(?i:c)|(?i:d)`.
+
+We support all the matching options accepted by PCRE, ICU, and Oniguruma. In addition, we accept some matching options unique to our matching engine.
+
+#### PCRE options
+
+- `i`: Case insensitive matching.
+- `J`: Allows multiple groups to share the same name, which is otherwise forbidden.
+- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string.
+- `n`: Disables the capturing behavior of `(...)` groups. Named capture groups must be used instead.
+- `s`: Changes `.` to match any character, including newlines.
+- `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy.
+- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See [Extended Syntax Modes](#extended-syntax-modes) for more info.
+
+#### ICU options
+
+- `w`: Enables the Unicode interpretation of word boundaries `\b`.
+
+#### Oniguruma options
+
+- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]`.
+- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]`.
+- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b`.
+- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`).
+- `y{g}`, `y{w}`: Changes the meaning of `\X`, `\y`, `\Y`. These are mutually exclusive options, with `y{g}` specifying extended grapheme cluster mode, and `y{w}` specifying word mode.
+
+#### Swift options
+
+These options are specific to the Swift regex matching engine and control the semantic level at which matching takes place.
+
+- `X`: Grapheme cluster matching.
+- `u`: Unicode scalar matching.
+- `b`: Byte matching.
+
+Further details on these are TBD and outside the scope of this pitch.
+
+### References
+
+```
+NamedOrNumberRef -> NamedRef | NumberRef
+NamedRef -> Identifier RecursionLevel?
+NumberRef -> ('+' | '-')? <Decimal> RecursionLevel?
+RecursionLevel -> '+' <Int> | '-' <Int>
+```
+
+A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. If a referenced capture does not exist anywhere in the regular expression, the reference is diagnosed as invalid.
+
+A backreference may optionally include a recursion level in certain cases, which is a syntactic element inherited [from Oniguruma][oniguruma-syntax] that allows the reference to specify a capture relative to a given recursion level.
+
+#### Backreferences
+
+```
+Backreference -> '\g{' NamedOrNumberRef '}'
+ | '\g' NumberRef
+ | '\k<' NamedOrNumberRef '>'
+ | "\k'" NamedOrNumberRef "'"
+ | '\k{' NamedRef '}'
+ | '\' [1-9] [0-9]*
+ | '(?P=' NamedRef ')'
+```
+
+A backreference evaluates to the value last captured by the referenced capturing group. If the referenced capture has not been evaluated yet, the match fails.
+
+#### Subpatterns
+
+```
+Subpattern -> '\g<' NamedOrNumberRef '>'
+ | "\g'" NamedOrNumberRef "'"
+ | '(?' GroupLikeSubpatternBody ')'
+
+GroupLikeSubpatternBody -> 'P>' NamedRef
+ | '&' NamedRef
+ | 'R'
+ | NumberRef
+```
+
+A subpattern causes the referenced capture group to be re-evaluated at the current position. The syntax `(?R)` is equivalent to `(?0)`, and causes the entire pattern to be recursed.
+
+### Conditionals
+
+```
+Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')'
+ConditionalStart -> KnownConditionalStart | GroupConditionalStart
+
+KnownConditionalStart -> '(?(' KnownCondition ')'
+GroupConditionalStart -> '(?' GroupStart
+
+KnownCondition -> 'R'
+ | 'R' NumberRef
+ | 'R&' NamedRef
+ | '<' NamedOrNumberRef '>'
+ | "'" NamedOrNumberRef "'"
+ | 'DEFINE'
+ | 'VERSION' VersionCheck
+ | NumberRef
+
+PCREVersionCheck -> '>'? '=' PCREVersionNumber
+PCREVersionNumber -> <Int> '.' <Int>
+```
+
+A conditional evaluates a particular condition, and chooses a branch to match against accordingly. 1 or 2 branches may be specified. If 1 branch is specified e.g `(?(...)x)`, it is treated as the true branch. Note this includes an empty true branch, e.g `(?(...))` which is the null pattern as described in the [Top-Level Regular Expression](#top-level-regular-expression) section. If 2 branches are specified, e.g `(?(...)x|y)`, the first is treated as the true branch, the second being the false branch.
+
+A condition may be:
+
+- A numeric or delimited named reference to a capture group, which checks whether the group matched successfully.
+- A recursion check on either a particular group or the entire regex. In the former case, this checks to see if the last recursive call is through that group. In the latter case, it checks if the match is currently taking place in any kind of recursive call.
+- A PCRE version check.
+
+If the condition does not syntactically match any of the above, it is treated as an arbitrary recursive regular expression. This will be matched against, and evaluates to true if the match is successful. It may contain capture groups that add captures to the match.
+
+The `DEFINE` keyword is not used as a condition, but rather a way in which to define a group which is not evaluated, but may be referenced by a subpattern.
+
+### PCRE backtracking directives
+
+```
+BacktrackingDirective -> '(*' BacktrackingDirectiveKind (':' <Tag>)? ')'
+BacktrackingDirectiveKind -> 'ACCEPT' | 'FAIL' | 'F' | 'MARK' | '' | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN'
+```
+
+This is syntax specific to PCRE, and is used to control backtracking behavior. Any of the directives may include an optional tag, however `MARK` must have a tag. The empty directive is treated as `MARK`. Only the `ACCEPT` directive may be quantified, as it can use the backtracking behavior of the engine to be evaluated only if needed by a reluctant quantification.
+
+- `ACCEPT`: Causes matching to terminate immediately as a successful match. If used within a subpattern, only that level of recursion is terminated.
+- `FAIL`, `F`: Causes matching to fail, forcing backtracking to occur if possible.
+- `MARK`: Assigns a label to the current matching path, which is passed back to the caller on success. Subsequent `MARK` directives overwrite the label assigned, so only the last is passed back.
+- `COMMIT`: Prevents backtracking from reaching any point prior to this directive, causing the match to fail. This does not allow advancing the input to try a different starting match position.
+- `PRUNE`: Similar to `COMMIT`, but allows advancing the input to try and find a different starting match position.
+- `SKIP`: Similar to `PRUNE`, but skips ahead to the position of `SKIP` to try again as the starting position.
+- `THEN`: Similar to `PRUNE`, but when used inside an alternation will try to match in the subsequent branch before attempting to advance the input to find a different starting position.
+
+### PCRE global matching options
+
+```
+GlobalMatchingOptionSequence -> GlobalMatchingOption+
+GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')'
+
+GlobalMatchingOptionKind -> LimitOptionKind '=' <Int>
+ | NewlineKind | NewlineSequenceKind
+ | 'NOTEMPTY_ATSTART' | 'NOTEMPTY'
+ | 'NO_AUTO_POSSESS' | 'NO_DOTSTAR_ANCHOR'
+ | 'NO_JIT' | 'NO_START_OPT' | 'UTF' | 'UCP'
+
+LimitOptionKind -> 'LIMIT_DEPTH' | 'LIMIT_HEAP' | 'LIMIT_MATCH'
+NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL'
+NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE'
+```
+
+This is syntax specific to PCRE, and allows a set of global options to appear at the start of a regular expression. They may not appear at any other position.
+
+- `LIMIT_DEPTH`, `LIMIT_HEAP`, `LIMIT_MATCH`: These place certain limits on the resources the matching engine may consume, and matches it may make.
+- `CRLF`, `CR`, `ANYCRLF`, `ANY`, `LF`, `NUL`: These control the definition of a newline character, which is used when matching e.g the `.` character class, and evaluating where a line ends in multi-line mode.
+- `BSR_ANYCRLF`, `BSR_UNICODE`: These change the definition of `\R`.
+- `NOTEMPTY`: Does not consider the empty string to be a valid match.
+- `NOTEMPTY_ATSTART`: Like `NOTEMPTY`, but only applies to the first matching position in the input.
+- `NO_AUTO_POSSESS`: Disables an optimization that treats a quantifier as possessive if the following construct clearly cannot be part of the match. In other words, disables the short-circuiting of backtracks in cases where the engine knows it will not produce a match. This is useful for debugging, or for ensuring a callout gets invoked.
+- `NO_DOTSTAR_ANCHOR`: Disables an optimization that tries to automatically anchor `.*` at the start of a regex. Like `NO_AUTO_POSSESS`, this is mainly used for debugging or ensuring a callout gets invoked.
+- `NO_JIT`: Disables JIT compilation.
+- `NO_START_OPT`: Disables various optimizations performed at the start of matching. Like `NO_DOTSTAR_ANCHOR`, is mainly used for debugging or ensuring a callout gets invoked.
+- `UTF`: Enables UTF pattern support.
+- `UCP`: Enables Unicode property support.
+
+### Callouts
+
+```
+Callout -> PCRECallout | NamedCallout | InterpolatedCallout
+
+PCRECallout -> '(?C' CalloutBody ')'
+PCRECalloutBody -> '' | <Number>
+ | '`' <String> '`'
+ | "'" <String> "'"
+ | '"' <String> '"'
+ | '^' <String> '^'
+ | '%' <String> '%'
+ | '#' <String> '#'
+ | '$' <String> '$'
+ | '{' <String> '}'
+
+NamedCallout -> '(*' Identifier CalloutTag? CalloutArgs? ')'
+CalloutArgs -> '{' CalloutArgList '}'
+CalloutArgList -> CalloutArg (',' CalloutArgList)*
+CalloutArg -> [^,}]+
+CalloutTag -> '[' Identifier ']'
+
+InterpolatedCallout -> '(?' '{' Interpolation '}' CalloutTag? CalloutDirection? ')'
+Interpolation -> <String> | '{' Interpolation '}'
+CalloutDirection -> 'X' | '<' | '>'
+```
+
+A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We support parsing 3 types of callout:
+
+- PCRE callout syntax, which accepts a string or numeric argument that is passed to the function.
+- Oniguruma named callout syntax, which accepts an identifier with an optional tag and argument list.
+- Interpolated callout syntax, which is equivalent to Oniguruma's "callout of contents". This callout accepts an arbitrary interpolated program. This is an expanded version of Perl's interpolation syntax, and allows an arbitrary nesting of delimiters in addition to an optional tag and direction.
+
+While we propose parsing these for the purposes of issuing helpful diagnostics, we are deferring full support for the interpolated syntax for the future.
+
+### Absent functions
+
+```
+AbsentFunction -> '(?~' RegexNode ')'
+ | '(?~|' Concatenation '|' Concatenation ')'
+ | '(?~|' Concatenation ')'
+ | '(?~|)'
+```
+
+An absent function is an [Oniguruma][oniguruma-syntax] feature that allows for the easy inversion of a given pattern. There are 4 variants of the syntax:
+
+- `(?~|absent|expr)`: Absent expression, which attempts to match against `expr`, but is limited by the range that is not matched by `absent`.
+- `(?~absent)`: Absent repeater, which matches against any input not matched by `absent`. Equivalent to `(?~|absent|\O*)`.
+- `(?~|absent)`: Absent stopper, which limits any subsequent matching to not include `absent`.
+- `(?~|)`: Absent clearer, which undoes the effects of the absent stopper.
+
+
+## Syntactic differences between engines
+
+The proposed "syntactic superset" introduces some minor ambiguities, as each engine supports a slightly different set of features. When a particular engine's parser sees a feature it doesn't support, it typically has a fall-back behavior, such as treating the unknown feature as literal contents.
+
+Explicit compatibility modes, i.e. precisely mimicking emergent behavior from a specific engine's parser, is deferred as future work from this proposal. Conversion from this "syntactic superset" to a particular engine's syntax (e.g. as an AST "pretty printer") is deferred as future work from this proposal.
+
+Below is an exhaustive treatment of every syntactic ambiguity we have encountered.
+
+### Character class set operations
+
+In a custom character class, some engines allow for binary set operations that take two character class inputs, and produce a new character class output. However which set operations are supported and the spellings used differ by engine.
+
+| PCRE | ICU | UTS#18 | Oniguruma | .NET | Java |
+|------|-----|--------|-----------|------|------|
+| ❌ | Intersection `&&`, Subtraction `--` | Intersection, Subtraction | Intersection `&&` | Subtraction via `-` | Intersection `&&` |
+
+
+[UTS#18][uts18] requires intersection and subtraction, and uses the operation spellings `&&` and `--` in its examples, though it doesn't mandate a particular spelling. In particular, conforming implementations could spell the subtraction `[[x]--[y]]` as `[[x]&&[^y]]`. UTS#18 also suggests a symmetric difference operator `~~`, and uses an explicit `||` operator in examples, though doesn't require either.
+
+Engines that don't support a particular operator fall back to treating it as literal, e.g `[x&&y]` in PCRE is the character class of `["x", "&", "y"]` rather than an intersection.
+
+Unlike other engines, .NET supports the use of `-` to denote both a range as well as a set subtraction. .NET disambiguates this by only permitting its use as a subtraction if the right hand operand is a nested custom character class, otherwise it is a range operator. This conflicts with e.g ICU, where in `[x-[y]]` the `-` is treated as literal.
+
+We propose supporting the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant.
+
+In the interests of compatibility, we also propose supporting the `-` operator, though we will likely want to emit a warning and encourage users to switch to `--`.
+
+### Nested custom character classes
+
+This allows e.g `[[a]b[c]]`, which is interpreted the same as `[abc]`. It also allows for more complex set operations with custom character classes as the operands.
+
+| PCRE | ICU | UTS#18 | Oniguruma | .NET | Java |
+|------|-----|--------|-----------|------|------|
+| ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ |
+
+
+UTS#18 doesn't require this, though it does suggest it as a way to clarify precedence for chains of character class set operations e.g `[\w--\d&&\s]`, which the user could write as `[[\w--\d]&&\s]`.
+
+PCRE does not support this feature, and as such treats `]` as the closing character of the custom character class. Therefore `[[a]b[c]]` is interpreted as the character class `["[", "a"]`, followed by literal `b`, and then the character class `["c"]`, followed by literal `]`.
+
+.NET does not support nested character classes in general, although allows them as the right-hand side of a subtraction operation.
+
+We propose allowing nested custom character classes.
+
+### `\U`
+
+In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` matches literal `U`. However in ICU, `\Uhhhhhhhh` matches a hex sequence. We propose following the ICU behavior.
+
+### `{,n}`
+
+This quantifier is supported by Oniguruma, but in PCRE it matches the literal characters `{`, `,`, `n`, and `}` in sequence. We propose supporting it as a quantifier.
+
+### `\DDD`
+
+This syntax is implemented in a variety of different ways depending on the engine. In ICU and Java, it is always a backreference unless prefixed with `0`, in which case it is an octal sequence.
+
+In PCRE, Oniguruma, and .NET, it is also always an octal sequence if prefixed with `0`, however there are other cases where it may be treated as octal. These cases vary slightly between the engines. In PCRE, it will be treated as backreference if any of the following hold:
+
+- Its value is `0 < n < 10`.
+- Its first digit is `8` or `9`.
+- Its value corresponds to a valid *prior* group number.
+
+Otherwise it is treated as an octal sequence.
+
+Oniguruma follows all of these except the second. If the first digit is `8` or `9`, it is instead treated as the literal number, e.g `\81` is `81`. .NET also follows this behavior, but additionally has the last condition consider *all* groups, not just prior ones (as backreferences can refer to future groups in recursive cases).
+
+We propose a simpler behavior more in line with ICU and Java. A `\DDD` sequence that does not start with a `0` will be treated as a backreference, otherwise it will be treated as an octal sequence. If an invalid backreference is formed with this syntax, we will suggest prefixing with a `0` if an octal sequence is desired.
+
+One further difference exists between engines in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. We will follow the ICU behavior, as it is necessary when requiring a `0` prefix.
+
+### `\x`
+
+In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denotes literal `x`. We propose following the PCRE behavior.
+
+### Whitespace in ranges
+
+In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces e.g `x{2, 4}`, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore propose parsing any intermixed whitespace in the range.
+
+### Implicitly-scoped matching option scopes
+
+PCRE and Oniguruma both support changing the active matching options through an isolated group e.g `(?i)`. However, they have differing semantics when it comes to their scoping. In Oniguruma, it is treated as an implicit new scope that wraps everything until the end of the current group. In PCRE, it is treated as changing the matching option for all the following expressions until the end of the group.
+
+These sound similar, but have different semantics around alternations, e.g for `a(?i)b|c|d`, in Oniguruma this becomes `a(?i:b|c|d)`, where `a` is no longer part of the alternation. However in PCRE it becomes `a(?i:b)|(?i:c)|(?i:d)`, where `a` remains a child of the alternation.
+
+We propose matching the PCRE behavior.
+
+### Backreference condition kinds
+
+PCRE and .NET allow for conditional patterns to reference a group by its name without any form of delimiter, e.g:
+
+```
+(?<group1>x)?(?(group1)y)
+```
+
+where `y` will only be matched if `(?<group1>x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. Oniguruma on the other hand will always treat `group1` as a regex condition to match against.
+
+We propose parsing such conditions as an arbitrary regular expression condition, as long as they do not conflict with other known condition spellings such as `R&name`. If the condition has a name that matches a named group in the regex, we will emit a warning asking users to explicitly use the syntax `(?(<group1>)y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma.
+
+### `\N`
+
+PCRE supports `\N` meaning "not a newline", however there are engines that treat it as a literal `N`. We propose supporting the PCRE behavior.
+
+### Extended character property syntax
+
+ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We propose supporting this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax.
+
+### Script properties
+
+Shorthand script property syntax e.g `\p{Latin}` is treated as `\p{Script=Latin}` by PCRE, ICU, Oniguruma, and Java. These use [the Unicode Script property][unicode-scripts], which assigns each scalar a particular script value. However, there are scalars that may appear in multiple scripts, e.g U+3003 DITTO MARK. These are often assigned to the `Common` script to reflect this fact, which is not particularly useful for matching purposes. To provide more fine-grained script matching, Unicode provides [the Script Extension property][unicode-script-extensions], which exposes the set of scripts that a scalar appears in.
+
+As such we feel that the more desirable default behavior of shorthand script property syntax e.g `\p{Latin}` is for it to be treated as `\p{Script_Extension=Latin}`. This matches Perl's default behavior. Plain script properties may still be written using the more explicit syntax e.g `\p{Script=Latin}` and `\p{sc=Latin}`.
+
+### Extended syntax modes
+
+Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In both PCRE and Perl, this is enabled through the `(?x)`, and in later versions, `(?xx)` matching options. The former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes.
+
+Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore propose following this behavior, with `(?x)` and `(?xx)` being treated the same.
+
+Different regex engines also have different rules around what characters are considered non-semantic whitespace. When compiled with Unicode support, PCRE considers the following whitespace:
+
+- The space character `U+20`
+- Whitespace characters `U+9...U+D`
+- Next line `U+85`
+- Left-to-right mark `U+200E`
+- Right-to-left mark `U+200F`
+- Line separator `U+2028`
+- Paragraph separator `U+2029`
+
+This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we propose supporting exactly the characters in this list for the purposes of non-semantic whitespace parsing.
+
+### Group numbering
+
+In PCRE, groups are numbered according to the position of their opening parenthesis. .NET also follows this rule, with the exception that named groups are numbered after unnamed groups. For example:
+
+```
+(a(?<x>x)b)(?<y>y)(z)
+^ ^        ^      ^
+1 3        4      2
+```
+
+The `(z)` group gets numbered before the named groups get numbered.
+
+We propose matching the PCRE behavior where groups are numbered purely based on order.
+
+
+## Swift canonical syntax
+
+The proposed syntactic superset means there will be multiple ways to write the same thing. Below we discuss what Swift's preferred spelling could be, a "Swift canonical syntax".
+
+We are not formally proposing this as a distinct syntax or concept, rather it is useful for considering compiler features such as fixits, pretty-printing, and refactoring actions. We're hoping for further discussion with the community here. Useful criteria include how well the choice fits in with the rest of Swift, whether there's an existing common practice, and whether one choice is less confusing in the context of others.
+
+[Unicode scalar literals](#unicode-scalars) can be spelled in many ways. We propose treating Swift's string literal syntax of `\u{HexDigit{1...}}` as the preferred spelling.
+
+Character properties can be spelled `\p{...}` or `[:...:]`. We recommend preferring `\p{...}` as the bracket syntax historically meant POSIX-defined character classes, and still has that connotation in some engines. The [spelling of properties themselves can be fuzzy](#character-properties) and we (weakly) recommend the shortest spelling (no opinion on casing yet). For script extensions, we (weakly) recommend e.g. `\p{Greek}` instead of `\p{Script_Extensions=Greek}`. We would like more discussion with the community here.
+
+[Lookaround assertions](#lookahead-and-lookbehind) have common shorthand spellings, while PCRE2 introduced longer more explicit spellings. We are (very weakly) recommending the common short-hand syntax of e.g. `(?=...)` as that's wider spread. We are interested in more discussion with the community here.
+
+Named groups may be specified with a few different delimiters: `(?<name>...)`, `(?P<name>...)`, `(?'name'...)`. We (weakly) recommend `(?<name>...)`, but the final preference may be influenced by choice of delimiter for the regex itself. We'd appreciate any insight from the community.
+
+[Backreferences](#backreferences) have multiple spellings. For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, either `\k<...>` or `\k'...'` seem like the better choice, depending on the syntax chosen for named groups. This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreferences and subpatterns respectively, as well as any confusion with group syntax.
+
+For [subpatterns](#subpatterns), we recommend either `\g<...>` or `\g'...'` depending on the choice for named group syntax. We're unsure if we should prefer `(?R)` as a spelling for e.g. `\g<0>` or not, as it is more widely used and understood, but less consistent with other subpatterns.
+
+[Conditional references](#conditionals) have a choice between `(?('name'))` and `(?(<name>))`. The preferred syntax in this case would likely reflect the syntax chosen for named groups.
+
+We are deferring runtime support for callouts from regex literals as future work, though we will correctly parse their contents. We have no current recommendation for a preference of PCRE-style [callout syntax](#callouts), and would like to discuss with the community whether we should have one.
+
+## Alternatives Considered
+
+
+### Skip the syntax
+
+The top alternative is to just skip regex syntax altogether by only shipping the result builder DSL and forbidding run-time regex construction from strings. However, doing so would miss out on the familiarity benefits of existing regex syntax. Additionally, without support for run-time strings containing regex syntax, important domains would be closed off from better string processing, such as command-line tools and user-input searches. This would land us in a confusing world where NSRegularExpression, even though it operates over a fundamentally different model of string than Swift's `String` and exhibits different behavior than Swift regexes, is still used for these purposes.
+
+We consider our proposed direction to be more compelling, especially when coupled with refactoring actions to convert literals into regex DSLs.
+
+### Introduce a novel regex syntax
+
+Another alternative is to invent a new syntax for regex. This would similarly lose out on the familiarity benefit, though a few simple adjustments could aid readability.
+
+We are prototyping an "experimental" Swift extended syntax, which is future work and outside the scope of this proposal. Every syntactic extension, while individually compelling, does introduce incompatibilities and can lead to an "uncanny valley" effect. Further investigation is needed and such support can be built on top of what is presented here.
+
+### Support a minimal syntactic subset
+
+Regex syntax will become part of Swift's source and binary-compatibility story, so a reasonable alternative is to support the absolute minimal syntactic subset available. However, we would need to ensure that such a minimal approach is extensible far into the future. Because syntax decisions can impact each other, we would want to consider the ramifications of this full syntactic superset ahead of time anyways.
+
+Even though it is more work up-front and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex syntax.
+
+
+[pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html
+[oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE
+[icu-syntax]: https://unicode-org.github.io/icu/userguide/strings/regexp.html
+[uts18]: https://www.unicode.org/reports/tr18/
+[.net-syntax]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions
+[UAX44-LM3]: https://www.unicode.org/reports/tr44/#UAX44-LM3
+[unicode-prop-key-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
+[unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
+[unicode-scripts]: https://www.unicode.org/reports/tr24/#Script
+[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions
+[balancing-groups]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/grouping-constructs-in-regular-expressions#balancing-group-definitions
diff --git a/README.md b/README.md
index e6f94377c..941231b24 100644
--- a/README.md
+++ b/README.md
@@ -9,3 +9,65 @@ See [Declarative String Processing Overview][decl-string]
## Requirements
- [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-02-03 or later.
+
+## Integration with Swift
+
+`_MatchingEngine`, `_CUnicode` and `_StringProcessing` are specially integrated modules that are built as part of apple/swift.
+
+Specifically, `_MatchingEngine` contains the parser for regular expression literals and is built both as part of the compiler and as a core library. `_CUnicode` and `_StringProcessing` are built together as a core library named `_StringProcessing`.
+
+| Module | Swift toolchain component |
+| ------------------- | ------------------------------------------------------------------------------------ |
+| `_MatchingEngine` | `SwiftCompilerSources/Sources/ExperimentalRegex` and `stdlib/public/_MatchingEngine` |
+| `_CUnicode` | `stdlib/public/_StringProcessing` |
+| `_StringProcessing` | `stdlib/public/_StringProcessing` |
+
+### Branching scheme
+
+#### Development branch
+
+The `main` branch is the branch for day-to-day development. Generally, you should create PRs against this branch.
+
+#### Swift integration branches
+
+Branches whose name starts with `swift/` are Swift integration branches similar to those in [apple/llvm-project](https://github.com/apple/llvm-project). For each branch, dropping the `swift/` prefix is the corresponding branch in [apple/swift](https://github.com/apple/swift).
+
+| apple/swift branch | apple/swift-experimental-string-processing branch |
+| ------------------- | ----------------------------------------------------- |
+| main | swift/main |
+| release/5.7 | swift/release/5.7 |
+| ... | swift/... |
+
+A pair of corresponding branches are expected to build successfully together and pass all tests.
+
+### Integration workflow
+
+To integrate the latest changes in apple/swift-experimental-string-processing to apple/swift, carefully follow the workflow:
+
+- Create pull requests.
+ - Create a pull request in apple/swift-experimental-string-processing from `main` to `swift/main`, e.g. "[Integration] main -> swift/main".
+ - If apple/swift needs to be modified to work with the latest `main` in apple/swift-experimental-string-processing, create a pull request in apple/swift.
+- Trigger CI.
+ - In the apple/swift-experimental-string-processing pull request, trigger CI using the following command (replacing `<apple/swift PR number>` with the apple/swift pull request number, if any):
+ ```
+ apple/swift#<apple/swift PR number> # use this line only if there is a corresponding apple/swift PR
+ @swift-ci please test
+ ```
+ - In the apple/swift pull request (if any), trigger CI using the following command (replacing `<this PR number>` with the apple/swift-experimental-string-processing pull request number):
+ ```
+ apple/swift-experimental-string-processing#<this PR number>
+ @swift-ci please test
+ ```
+- Merge when approved.
+ - Merge the pull request in apple/swift-experimental-string-processing as a **merge commit**.
+ - Merge the pull request in apple/swift (if any).
+
+### Development notes
+
+Compiler integration can be tricky. Use special caution when developing `_MatchingEngine`, `_CUnicode` and `_StringProcessing` modules.
+
+- Do not change the names of these modules without due approval from compiler and infrastructure teams.
+- Do not modify the existing ABI (e.g. C API, serialization format) between the regular expression parser and the Swift compiler unless absolutely necessary.
+- Always minimize the number of lockstep integrations, i.e. when apple/swift-experimental-string-processing and apple/swift have to change together. Whenever possible, introduce new API first, migrate Swift compiler onto it, and then deprecate old API. Use versioning if helpful.
+- In `_StringProcessing`, do not write fully qualified references to symbols in `_CUnicode`, and always wrap `import _CUnicode` in a `#if canImport(_CUnicode)`. This is because `_CUnicode` is built as part of `_StringProcessing` with CMake.
+- In `_MatchingEngine`, do not write fully qualified references to `_MatchingEngine` itself. This is because `_MatchingEngine` is built as `ExperimentalRegex` in `SwiftCompilerSources/` with CMake.
diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt
new file mode 100644
index 000000000..19feadbd9
--- /dev/null
+++ b/Sources/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+add_subdirectory(_Unicode)
+add_subdirectory(_MatchingEngine)
+add_subdirectory(_StringProcessing)
+add_subdirectory(Prototypes)
+add_subdirectory(VariadicsGenerator)
diff --git a/Sources/Prototypes/CMakeLists.txt b/Sources/Prototypes/CMakeLists.txt
new file mode 100644
index 000000000..60768f5a3
--- /dev/null
+++ b/Sources/Prototypes/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+add_library(Prototypes
+ Combinators/Combinators.swift
+ PEG/PEG.swift
+ PEG/PEGCode.swift
+ PEG/PEGCompile.swift
+ PEG/PEGCore.swift
+ PEG/PEGInterpreter.swift
+ PEG/PEGTranspile.swift
+ PEG/PEGVM.swift
+ PEG/PEGVMExecute.swift
+ PEG/Printing.swift
+ PTCaRet/Interpreter.swift
+ PTCaRet/PTCaRet.swift
+ TourOfTypes/CharacterClass.swift
+ TourOfTypes/Literal.swift)
+target_link_libraries(Prototypes PUBLIC
+ _MatchingEngine)
diff --git a/Sources/Prototypes/PEG/PEGCode.swift b/Sources/Prototypes/PEG/PEGCode.swift
index c33f5759c..b12c5bab6 100644
--- a/Sources/Prototypes/PEG/PEGCode.swift
+++ b/Sources/Prototypes/PEG/PEGCode.swift
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-import _StringProcessing
+@testable import _StringProcessing
extension PEG.VM {
struct Code {
diff --git a/Sources/Prototypes/PEG/PEGCompile.swift b/Sources/Prototypes/PEG/PEGCompile.swift
index 0592cf6a9..0e1b89233 100644
--- a/Sources/Prototypes/PEG/PEGCompile.swift
+++ b/Sources/Prototypes/PEG/PEGCompile.swift
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-import _StringProcessing
+@testable import _StringProcessing
extension PEG.VM {
typealias InIndex = Input.Index
diff --git a/Sources/Prototypes/PEG/PEGCore.swift b/Sources/Prototypes/PEG/PEGCore.swift
index b831cbd0f..5c66dc25a 100644
--- a/Sources/Prototypes/PEG/PEGCore.swift
+++ b/Sources/Prototypes/PEG/PEGCore.swift
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-import _StringProcessing
+@testable import _StringProcessing
let emitComments = true
struct PEGCore<
diff --git a/Sources/Prototypes/PEG/PEGTranspile.swift b/Sources/Prototypes/PEG/PEGTranspile.swift
index df75cea63..84e220d52 100644
--- a/Sources/Prototypes/PEG/PEGTranspile.swift
+++ b/Sources/Prototypes/PEG/PEGTranspile.swift
@@ -9,8 +9,7 @@
//
//===----------------------------------------------------------------------===//
-import _MatchingEngine
-import _StringProcessing
+@testable import _StringProcessing
extension PEG.VM where Input == String {
typealias MEProg = MEProgram
diff --git a/Sources/Prototypes/PEG/PEGVM.swift b/Sources/Prototypes/PEG/PEGVM.swift
index a987b581d..4cf91a5c1 100644
--- a/Sources/Prototypes/PEG/PEGVM.swift
+++ b/Sources/Prototypes/PEG/PEGVM.swift
@@ -9,7 +9,8 @@
//
//===----------------------------------------------------------------------===//
-import _StringProcessing
+
+@testable import _StringProcessing
extension PEG {
diff --git a/Sources/Prototypes/PEG/Printing.swift b/Sources/Prototypes/PEG/Printing.swift
index 978250761..be60e72f5 100644
--- a/Sources/Prototypes/PEG/Printing.swift
+++ b/Sources/Prototypes/PEG/Printing.swift
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-import _StringProcessing
+@testable import _StringProcessing
extension PEGCore.Instruction: InstructionProtocol {
var operandPC: InstructionAddress? { self.pc }
diff --git a/Sources/VariadicsGenerator/CMakeLists.txt b/Sources/VariadicsGenerator/CMakeLists.txt
new file mode 100644
index 000000000..8ea543970
--- /dev/null
+++ b/Sources/VariadicsGenerator/CMakeLists.txt
@@ -0,0 +1,7 @@
+
+add_executable(VariadicsGenerator
+ VariadicsGenerator.swift)
+target_compile_options(VariadicsGenerator PRIVATE
+ -parse-as-library)
+target_link_libraries(VariadicsGenerator PUBLIC
+ ArgumentParser)
diff --git a/Sources/_MatchingEngine/CMakeLists.txt b/Sources/_MatchingEngine/CMakeLists.txt
new file mode 100644
index 000000000..f7cb97ce3
--- /dev/null
+++ b/Sources/_MatchingEngine/CMakeLists.txt
@@ -0,0 +1,46 @@
+
+add_library(_MatchingEngine
+ Engine/Backtracking.swift
+ Engine/Builder.swift
+ Engine/Capture.swift
+ Engine/Consume.swift
+ Engine/Engine.swift
+ Engine/InstPayload.swift
+ Engine/Instruction.swift
+ Engine/Processor.swift
+ Engine/Program.swift
+ Engine/Registers.swift
+ Engine/Tracing.swift
+ Regex/AST/AST.swift
+ Regex/AST/ASTAction.swift
+ Regex/AST/ASTProtocols.swift
+ Regex/AST/Atom.swift
+ Regex/AST/Conditional.swift
+ Regex/AST/CustomCharClass.swift
+ Regex/AST/Group.swift
+ Regex/AST/MatchingOptions.swift
+ Regex/AST/Quantification.swift
+ Regex/Parse/CaptureStructure.swift
+ Regex/Parse/CharacterPropertyClassification.swift
+ Regex/Parse/Diagnostics.swift
+ Regex/Parse/LexicalAnalysis.swift
+ Regex/Parse/Mocking.swift
+ Regex/Parse/Parse.swift
+ Regex/Parse/Source.swift
+ Regex/Parse/SourceLocation.swift
+ Regex/Parse/SyntaxOptions.swift
+ Regex/Printing/DumpAST.swift
+ Regex/Printing/PrettyPrinter.swift
+ Regex/Printing/PrintAsCanonical.swift
+ Regex/Printing/PrintAsPattern.swift
+ Regex/Printing/RenderRanges.swift
+ Utility/AllScalars.swift
+ Utility/Formatting.swift
+ Utility/Misc.swift
+ Utility/MissingUnicode.swift
+ Utility/Protocols.swift
+ Utility/TypeConstruction.swift
+ Utility/TypedIndex.swift
+ Utility/TypedInt.swift)
+target_compile_options(_MatchingEngine PRIVATE
+ -enable-library-evolution)
diff --git a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift
index 6a5740aa1..e5b65a46c 100644
--- a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift
@@ -381,7 +381,7 @@ extension Source {
return .generalCategory(cat)
}
if let script = classifyScriptProperty(value) {
- return .script(script)
+ return .scriptExtension(script)
}
if let posix = classifyPOSIX(value) {
return .posix(posix)
diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
new file mode 100644
index 000000000..1227ade1f
--- /dev/null
+++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift
@@ -0,0 +1,332 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: mock up multi-line soon
+
+enum Delimiter: Hashable, CaseIterable {
+ case traditional
+ case experimental
+ case reSingleQuote
+ case rxSingleQuote
+
+ var openingAndClosing: (opening: String, closing: String) {
+ switch self {
+ case .traditional: return ("#/", "/#")
+ case .experimental: return ("#|", "|#")
+ case .reSingleQuote: return ("re'", "'")
+ case .rxSingleQuote: return ("rx'", "'")
+ }
+ }
+ var opening: String { openingAndClosing.opening }
+ var closing: String { openingAndClosing.closing }
+
+ /// The default set of syntax options that the delimiter indicates.
+ var defaultSyntaxOptions: SyntaxOptions {
+ switch self {
+ case .traditional, .reSingleQuote:
+ return .traditional
+ case .experimental, .rxSingleQuote:
+ return .experimental
+ }
+ }
+}
+
+struct DelimiterLexError: Error, CustomStringConvertible {
+ enum Kind: Hashable {
+ case endOfString
+ case invalidUTF8 // TODO: better range reporting
+ case unknownDelimiter
+ case unprintableASCII
+ }
+
+ var kind: Kind
+
+ /// The pointer at which to resume lexing.
+ var resumePtr: UnsafeRawPointer
+
+ init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
+ self.kind = kind
+ self.resumePtr = resumePtr
+ }
+
+ var description: String {
+ switch kind {
+ case .endOfString: return "unterminated regex literal"
+ case .invalidUTF8: return "invalid UTF-8 found in source file"
+ case .unknownDelimiter: return "unknown regex literal delimiter"
+ case .unprintableASCII: return "unprintable ASCII character found in source file"
+ }
+ }
+}
+
+fileprivate struct DelimiterLexer {
+ let start: UnsafeRawPointer
+ var cursor: UnsafeRawPointer
+ let end: UnsafeRawPointer
+
+ init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
+ precondition(start <= end)
+ self.start = start
+ self.cursor = start
+ self.end = end
+ }
+
+ func ascii(_ s: Unicode.Scalar) -> UInt8 {
+ assert(s.value <= 0x7F)
+ return UInt8(asserting: s.value)
+ }
+
+ /// Return the byte at the current cursor, or `nil` if the end of the buffer
+ /// has been reached.
+ func load() -> UInt8? {
+ guard cursor < end else { return nil }
+ return cursor.load(as: UInt8.self)
+ }
+
+ /// Return the slice of `count` bytes from a specified cursor position, or
+ /// `nil` if there are fewer than `count` bytes until the end of the buffer.
+ func slice(
+ at cursor: UnsafeRawPointer, _ count: Int
+ ) -> UnsafeRawBufferPointer? {
+ guard cursor + count <= end else { return nil }
+ return UnsafeRawBufferPointer(start: cursor, count: count)
+ }
+
+ /// Return the slice of `count` bytes from the current cursor, or `nil` if
+ /// there are fewer than `count` bytes until the end of the buffer.
+ func slice(_ count: Int) -> UnsafeRawBufferPointer? {
+ slice(at: cursor, count)
+ }
+
+ /// Return the slice of `count` bytes preceding the current cursor, or `nil`
+ /// if there are fewer than `count` bytes before the cursor.
+ func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
+ let priorCursor = cursor - count
+ guard priorCursor >= start else { return nil }
+ return slice(at: priorCursor, count)
+ }
+
+ /// Advance the cursor `n` bytes.
+ mutating func advanceCursor(_ n: Int = 1) {
+ cursor += n
+ precondition(cursor <= end, "Cannot advance past end")
+ }
+
+ /// Check to see if a UTF-8 sequence can be eaten from the current cursor.
+ func canEat(_ utf8: String.UTF8View) -> Bool {
+ guard let slice = slice(utf8.count) else { return false }
+ return slice.elementsEqual(utf8)
+ }
+
+ /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
+ mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
+ guard canEat(utf8) else { return false }
+ advanceCursor(utf8.count)
+ return true
+ }
+
+ /// Attempt to skip over a closing delimiter character that is unlikely to be
+ /// the actual closing delimiter.
+ mutating func trySkipDelimiter(_ delimiter: Delimiter) {
+ // Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
+ switch delimiter {
+ case .traditional, .experimental:
+ return
+ case .reSingleQuote, .rxSingleQuote:
+ break
+ }
+ guard load() == ascii("'") else { return }
+
+ /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
+ /// are the cases that could use single quotes. Note that none of these
+ /// would be valid regex endings anyway.
+ let calloutPrefix = "(?C"
+ let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
+ guard let priorSlice = sliceBehind(prior.utf8.count),
+ priorSlice.elementsEqual(prior.utf8)
+ else { return false }
+
+ // Make sure the slice isn't preceded by a '\', as that invalidates this
+ // analysis.
+ if let prior = sliceBehind(priorSlice.count + 1) {
+ return prior[0] != ascii("\\")
+ }
+ return true
+ }
+ guard let prefix = prefix else { return }
+ let isCallout = prefix == calloutPrefix
+
+ func isPossiblyGroupReference(_ c: UInt8) -> Bool {
+ // If this is an ASCII character, make sure it's for a group name. Leave
+ // other UTF-8 encoded scalars alone, this should at least catch cases
+ // where we run into a symbol such as `{`, `.`, `;` that would indicate
+ // we've likely advanced out of the bounds of the regex.
+ let scalar = UnicodeScalar(c)
+ guard scalar.isASCII else { return true }
+ switch scalar {
+ // Include '-' and '+' which may be used in recursion levels and relative
+ // references.
+ case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
+ return true
+ default:
+ return false
+ }
+ }
+
+ // Make a note of the current lexing position, as we may need to revert
+ // back to it.
+ let originalCursor = cursor
+ advanceCursor()
+
+    // Try to skip over what would be the contents of a group identifier/reference.
+ while let next = load() {
+ // Found the ending, we're done. Return so we can continue to lex to the
+ // real delimiter.
+ if next == ascii("'") {
+ advanceCursor()
+ return
+ }
+
+ // If this isn't a callout, make sure we have something that could be a
+ // group reference. We limit the character set here to improve diagnostic
+ // behavior in the case where the literal is actually unterminated. We
+ // ideally don't want to go wandering off into Swift source code. We can't
+ // do the same for callouts, as they take arbitrary strings.
+ guard isCallout || isPossiblyGroupReference(next) else { break }
+ do {
+ try advance()
+ } catch {
+ break
+ }
+ }
+ // We bailed out, either because we ran into something that didn't look like
+ // an identifier, or we reached the end of the line. Revert back to the
+ // original guess of delimiter.
+ cursor = originalCursor
+ }
+
+ /// Attempt to eat a particular closing delimiter, returning the contents of
+ /// the literal, and ending pointer, or `nil` if this is not a delimiter
+ /// ending.
+ mutating func tryEatEnding(
+ _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
+ ) throws -> (contents: String, end: UnsafeRawPointer)? {
+ let contentsEnd = cursor
+ guard tryEat(delimiter.closing.utf8) else { return nil }
+
+ // Form a string from the contents and make sure it's valid UTF-8.
+ let count = contentsEnd - contentsStart
+ let contents = UnsafeRawBufferPointer(
+ start: contentsStart, count: count)
+ let s = String(decoding: contents, as: UTF8.self)
+
+ guard s.utf8.elementsEqual(contents) else {
+ throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
+ }
+ return (contents: s, end: cursor)
+ }
+
+ /// Attempt to advance the lexer, throwing an error if the end of a line or
+ /// the end of the buffer is reached.
+ mutating func advance(escaped: Bool = false) throws {
+ guard let next = load() else {
+ throw DelimiterLexError(.endOfString, resumeAt: cursor)
+ }
+ switch UnicodeScalar(next) {
+ case let next where !next.isASCII:
+ // Just advance into a UTF-8 sequence. It shouldn't matter that we'll
+ // iterate through each byte as we only match against ASCII, and we
+ // validate it at the end. This case is separated out so we can just deal
+ // with the ASCII cases below.
+ advanceCursor()
+
+ case "\n", "\r":
+ throw DelimiterLexError(.endOfString, resumeAt: cursor)
+
+ case "\0":
+ // TODO: Warn to match the behavior of String literal lexer? Or should
+ // we error as unprintable?
+ advanceCursor()
+
+ case "\\" where !escaped:
+ // Advance again for an escape sequence.
+ advanceCursor()
+ try advance(escaped: true)
+
+ case let next where !next.isPrintableASCII:
+ // Diagnose unprintable ASCII.
+ // TODO: Ideally we would recover and continue to lex until the ending
+ // delimiter.
+ throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
+
+ default:
+ advanceCursor()
+ }
+ }
+
+ /*consuming*/ mutating func lex(
+ ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+
+ // Try to lex the opening delimiter.
+ guard let delimiter = Delimiter.allCases.first(
+ where: { tryEat($0.opening.utf8) }
+ ) else {
+ throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
+ }
+
+ let contentsStart = cursor
+ while true {
+ // Check to see if we're at a character that looks like a delimiter, but
+ // likely isn't. In such a case, we can attempt to skip over it.
+ trySkipDelimiter(delimiter)
+
+ // Try to lex the closing delimiter.
+ if let (contents, end) = try tryEatEnding(delimiter,
+ contentsStart: contentsStart) {
+ return (contents, delimiter, end)
+ }
+ // Try to advance the lexer.
+ try advance()
+ }
+ }
+}
+
+/// Drop a set of regex delimiters from the input string, returning the contents
+/// and the delimiter used. The input string must have valid delimiters.
+func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
+ func stripDelimiter(_ delim: Delimiter) -> String? {
+ // The opening delimiter must match.
+ guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
+ else { return nil }
+
+ // The closing delimiter may optionally match, as it may not be present in
+ // invalid code.
+ if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
+ slice = newSlice
+ }
+ return String(slice)
+ }
+ for d in Delimiter.allCases {
+ if let contents = stripDelimiter(d) {
+ return (contents, d)
+ }
+ }
+ fatalError("No valid delimiters")
+}
+
+/// Attempt to lex a regex literal between `start` and `end`, returning either
+/// the contents and pointer from which to resume lexing, or an error.
+func lexRegex(
+ start: UnsafeRawPointer, end: UnsafeRawPointer
+) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+ var lexer = DelimiterLexer(start: start, end: end)
+ return try lexer.lex()
+}
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
index 727727ce1..cfab75312 100644
--- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
/// | 'x' HexDigit{0...2}
/// | 'U' HexDigit{8}
/// | 'o{' OctalDigit{1...} '}'
- /// | OctalDigit{1...3}
+ /// | '0' OctalDigit{0...3}
///
mutating func expectUnicodeScalar(
escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
let str = try src.lexUntil(eating: "}").value
return try Source.validateUnicodeScalar(str, .octal)
- case let c where c.isOctalDigit:
- // We can read *up to* 2 more octal digits per PCRE.
- // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
- // we should have a parser mode to switch.
- let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
- let str = String(c) + (nextDigits?.string ?? "")
- return try Source.validateUnicodeScalar(str, .octal)
+ case "0":
+ // We can read *up to* 3 more octal digits.
+ // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+ // PCRE mode, we should limit it here.
+ guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
+ return Unicode.Scalar(0)
+ }
+ return try Source.validateUnicodeScalar(digits.string, .octal)
default:
fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
return nil
}
- // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
- // it is treated as a backreference if its first digit is not 0 (as that
- // is always octal) and one of the following holds:
- //
- // - It's 0 < n < 10 (as octal would be pointless here)
- // - Its first digit is 8 or 9 (as not valid octal)
- // - There have been as many prior groups as the reference.
- //
- // Oniguruma follows the same rules except the second one. e.g \81 and
- // \91 are instead treated as literal 81 and 91 respectively.
- // TODO: If we want a strict Oniguruma mode, we'll need to add a check
- // here.
+ // Backslash followed by a non-0 digit character is a backreference.
if firstChar != "0", let numAndLoc = try src.lexNumber() {
- let num = numAndLoc.value
- let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
- if num < 10 || firstChar == "8" || firstChar == "9" ||
- context.isPriorGroupRef(ref.kind) {
- return .backreference(ref)
- }
- return nil
+ return .backreference(.init(
+ .absolute(numAndLoc.value), innerLoc: numAndLoc.location))
}
return nil
}
@@ -1487,7 +1472,9 @@ extension Source {
return ref
}
- let char = src.eat()
+ guard let char = src.tryEat() else {
+ throw ParseError.expectedEscape
+ }
// Single-character builtins.
if let builtin = AST.Atom.EscapedBuiltin(
@@ -1497,10 +1484,8 @@ extension Source {
}
switch char {
- // Hexadecimal and octal unicode scalars. This must be done after
- // backreference lexing due to the ambiguity with \nnn.
- case let c where c.isOctalDigit: fallthrough
- case "u", "x", "U", "o":
+ // Hexadecimal and octal unicode scalars.
+ case "u", "x", "U", "o", "0":
return try .scalar(
src.expectUnicodeScalar(escapedCharacter: char).value)
default:
diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
index e3a178a15..5994a4f52 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift
@@ -9,150 +9,6 @@
//
//===----------------------------------------------------------------------===//
-
-// TODO: mock up multi-line soon
-
-enum Delimiter: Hashable, CaseIterable {
- case traditional
- case experimental
- case reSingleQuote
-
- var openingAndClosing: (opening: String, closing: String) {
- switch self {
- case .traditional: return ("#/", "/#")
- case .experimental: return ("#|", "|#")
- case .reSingleQuote: return ("re'", "'")
- }
- }
- var opening: String { openingAndClosing.opening }
- var closing: String { openingAndClosing.closing }
-
- /// The default set of syntax options that the delimiter indicates.
- var defaultSyntaxOptions: SyntaxOptions {
- switch self {
- case .traditional, .reSingleQuote:
- return .traditional
- case .experimental:
- return .experimental
- }
- }
-}
-
-struct LexError: Error, CustomStringConvertible {
- enum Kind: Hashable {
- case endOfString
- case invalidUTF8 // TODO: better range reporting
- case unknownDelimiter
- }
-
- var kind: Kind
-
- /// The pointer at which to resume lexing.
- var resumePtr: UnsafeRawPointer
-
- init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
- self.kind = kind
- self.resumePtr = resumePtr
- }
-
- var description: String {
- switch kind {
- case .endOfString: return "unterminated regex literal"
- case .invalidUTF8: return "invalid UTF-8 found in source file"
- case .unknownDelimiter: return "unknown regex literal delimiter"
- }
- }
-}
-
-/// Drop a set of regex delimiters from the input string, returning the contents
-/// and the delimiter used. The input string must have valid delimiters.
-func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
- let utf8 = str.utf8
- func stripDelimiter(_ delim: Delimiter) -> String? {
- let prefix = delim.opening.utf8
- let suffix = delim.closing.utf8
- guard utf8.prefix(prefix.count).elementsEqual(prefix),
- utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }
-
- return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
- }
- for d in Delimiter.allCases {
- if let contents = stripDelimiter(d) {
- return (contents, d)
- }
- }
- fatalError("No valid delimiters")
-}
-
-/// Attempt to lex a regex literal between `start` and `end`, returning either
-/// the contents and pointer from which to resume lexing, or an error.
-func lexRegex(
- start: UnsafeRawPointer, end: UnsafeRawPointer
-) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
- precondition(start <= end)
- var current = start
-
- func ascii(_ s: Unicode.Scalar) -> UInt8 {
- assert(s.value <= 0x7F)
- return UInt8(asserting: s.value)
- }
- func load(offset: Int) -> UInt8? {
- guard current + offset < end else { return nil }
- return current.load(fromByteOffset: offset, as: UInt8.self)
- }
- func load() -> UInt8? { load(offset: 0) }
- func advance(_ n: Int = 1) {
- precondition(current + n <= end, "Cannot advance past end")
- current = current.advanced(by: n)
- }
-
- func tryEat(_ utf8: String.UTF8View) -> Bool {
- for (i, idx) in utf8.indices.enumerated() {
- guard load(offset: i) == utf8[idx] else { return false }
- }
- advance(utf8.count)
- return true
- }
-
- // Try to lex the opening delimiter.
- guard let delimiter = Delimiter.allCases.first(
- where: { tryEat($0.opening.utf8) }
- ) else {
- throw LexError(.unknownDelimiter, resumeAt: current.successor())
- }
-
- let contentsStart = current
- while true {
- switch load() {
- case nil, ascii("\n"), ascii("\r"):
- throw LexError(.endOfString, resumeAt: current)
-
- case ascii("\\"):
- // Skip next byte.
- advance(2)
-
- default:
- // Try to lex the closing delimiter.
- let contentsEnd = current
- guard tryEat(delimiter.closing.utf8) else {
- advance()
- continue
- }
-
- // Form a string from the contents and make sure it's valid UTF-8.
- let count = contentsEnd - contentsStart
- let contents = UnsafeRawBufferPointer(
- start: contentsStart, count: count)
- let s = String(decoding: contents, as: UTF8.self)
-
- guard s.utf8.elementsEqual(contents) else {
- throw LexError(.invalidUTF8, resumeAt: current)
- }
- return (contents: s, delimiter, end: current)
- }
- }
-}
-
 private func copyCString(_ str: String) -> UnsafePointer<CChar> {
let count = str.utf8.count + 1
return str.withCString {
@@ -196,7 +52,7 @@ func libswiftLexRegexLiteral(
let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr)
curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self)
return false
- } catch let error as LexError {
+ } catch let error as DelimiterLexError {
if error.kind == .unknownDelimiter {
// An unknown delimiter should be recovered from, as we may want to try
// lex something else.
@@ -205,12 +61,18 @@ func libswiftLexRegexLiteral(
errOut.pointee = copyCString("\(error)")
curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self)
- // For now, treat every error as unrecoverable.
- // TODO: We should ideally be able to recover from a regex with missing
- // closing delimiters, which would help with code completion.
- return true
+ switch error.kind {
+ case .endOfString:
+ // Missing closing delimiter can be recovered from.
+ return false
+ case .unprintableASCII, .invalidUTF8:
+ // We don't currently have good recovery behavior for these.
+ return true
+ case .unknownDelimiter:
+ fatalError("Already handled")
+ }
} catch {
- fatalError("Should be a LexError")
+ fatalError("Should be a DelimiterLexError")
}
}
diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift
index 11bd8152f..ddf0475f3 100644
--- a/Sources/_MatchingEngine/Regex/Parse/Source.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift
@@ -86,6 +86,12 @@ extension Source {
tryEat(anyOf: set)
}
+ /// Try to eat any character, returning `nil` if the input has been exhausted.
+ mutating func tryEat() -> Char? {
+ guard !isEmpty else { return nil }
+ return eat()
+ }
+
mutating func eat(asserting c: Char) {
assert(peek() == c)
advance()
diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift
index bd1e395b5..55d3d3adc 100644
--- a/Sources/_MatchingEngine/Utility/Misc.swift
+++ b/Sources/_MatchingEngine/Utility/Misc.swift
@@ -108,7 +108,28 @@ extension Collection {
>(_ idx: Index, in c: C) -> C.Index {
c.index(atOffset: offset(of: idx))
}
+}
+extension Collection where Element: Equatable {
+ /// Attempt to drop a given prefix from the collection, returning the
+ /// resulting subsequence, or `nil` if the prefix does not match.
+  public func tryDropPrefix<C: Collection>(
+    _ other: C
+  ) -> SubSequence? where C.Element == Element {
+ let prefixCount = other.count
+ guard prefix(prefixCount).elementsEqual(other) else { return nil }
+ return dropFirst(prefixCount)
+ }
+
+ /// Attempt to drop a given suffix from the collection, returning the
+ /// resulting subsequence, or `nil` if the suffix does not match.
+  public func tryDropSuffix<C: Collection>(
+    _ other: C
+  ) -> SubSequence? where C.Element == Element {
+ let suffixCount = other.count
+ guard suffix(suffixCount).elementsEqual(other) else { return nil }
+ return dropLast(suffixCount)
+ }
}
extension UnsafeMutableRawPointer {
diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift
index a6aae0b82..dccba3286 100644
--- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift
+++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift
@@ -661,3 +661,11 @@ extension Character {
public var isWordCharacter: Bool { isLetter || isNumber || self == "_" }
}
+
+extension UnicodeScalar {
+ public var isPrintableASCII: Bool {
+ // Exclude non-printables before the space character U+20, and anything
+ // including and above the DEL character U+7F.
+ value >= 0x20 && value < 0x7F
+ }
+}
diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift
index 36a28b381..6a8c97a6a 100644
--- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift
+++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift
@@ -154,7 +154,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
with replacement: Replacement,
 subrange: Range<Index>,
maxReplacements: Int = .max
- ) -> Self where Replacement.Element == Element {
+ ) -> Self where Replacement.Element == Character {
replacing(
RegexConsumer(regex),
with: replacement,
@@ -166,7 +166,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
_ regex: R,
with replacement: Replacement,
maxReplacements: Int = .max
- ) -> Self where Replacement.Element == Element {
+ ) -> Self where Replacement.Element == Character {
replacing(
regex,
with: replacement,
@@ -178,7 +178,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
_ regex: R,
with replacement: Replacement,
maxReplacements: Int = .max
- ) where Replacement.Element == Element {
+ ) where Replacement.Element == Character {
self = replacing(
regex,
with: replacement,
diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift
index 2feb09df0..281b568f7 100644
--- a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift
+++ b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift
@@ -80,7 +80,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
with replacement: (_MatchResult>) throws -> Replacement,
subrange: Range,
maxReplacements: Int = .max
- ) rethrows -> Self where Replacement.Element == Element {
+ ) rethrows -> Self where Replacement.Element == Character {
try replacing(
RegexConsumer(regex),
with: replacement,
@@ -92,7 +92,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
_ regex: R,
with replacement: (_MatchResult>) throws -> Replacement,
maxReplacements: Int = .max
- ) rethrows -> Self where Replacement.Element == Element {
+ ) rethrows -> Self where Replacement.Element == Character {
try replacing(
regex,
with: replacement,
@@ -104,7 +104,7 @@ extension RangeReplaceableCollection where SubSequence == Substring {
_ regex: R,
with replacement: (_MatchResult>) throws -> Replacement,
maxReplacements: Int = .max
- ) rethrows where Replacement.Element == Element {
+ ) rethrows where Replacement.Element == Character {
self = try replacing(
regex,
with: replacement,
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index 93dca17a8..d6389c1f6 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen {
}
case .textSegment:
- // This we should be able to do!
- throw Unsupported(#"\y (text segment)"#)
+ builder.buildAssert { (input, pos, _) in
+ // FIXME: Grapheme or word based on options
+ input.isOnGraphemeClusterBoundary(pos)
+ }
case .notTextSegment:
- // This we should be able to do!
- throw Unsupported(#"\Y (not text segment)"#)
+ builder.buildAssert { (input, pos, _) in
+ // FIXME: Grapheme or word based on options
+ !input.isOnGraphemeClusterBoundary(pos)
+ }
case .startOfLine:
builder.buildAssert { (input, pos, bounds) in
diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt
new file mode 100644
index 000000000..c20dcc240
--- /dev/null
+++ b/Sources/_StringProcessing/CMakeLists.txt
@@ -0,0 +1,42 @@
+
+add_library(_StringProcessing
+ Algorithms/Algorithms/Contains.swift
+ Algorithms/Algorithms/FirstRange.swift
+ Algorithms/Algorithms/Ranges.swift
+ Algorithms/Algorithms/Replace.swift
+ Algorithms/Algorithms/Split.swift
+ Algorithms/Algorithms/StartsWith.swift
+ Algorithms/Algorithms/Trim.swift
+ Algorithms/Consumers/CollectionConsumer.swift
+ Algorithms/Consumers/FixedPatternConsumer.swift
+ Algorithms/Consumers/ManyConsumer.swift
+ Algorithms/Consumers/PredicateConsumer.swift
+ Algorithms/Consumers/RegexConsumer.swift
+ Algorithms/Searchers/CollectionSearcher.swift
+ Algorithms/Searchers/ConsumerSearcher.swift
+ Algorithms/Searchers/NaivePatternSearcher.swift
+ Algorithms/Searchers/PatternOrEmpty.swift
+ Algorithms/Searchers/PredicateSearcher.swift
+ Algorithms/Searchers/TwoWaySearcher.swift
+ Algorithms/Searchers/ZSearcher.swift
+ ASTBuilder.swift
+ Capture.swift
+ CharacterClass.swift
+ Compiler.swift
+ ConsumerInterface.swift
+ Executor.swift
+ Legacy/HareVM.swift
+ Legacy/LegacyCompile.swift
+ Legacy/RECode.swift
+ Legacy/TortoiseVM.swift
+ Legacy/VirtualMachine.swift
+ RegexDSL/Builder.swift
+ RegexDSL/Concatenation.swift
+ RegexDSL/Core.swift
+ RegexDSL/DSL.swift
+ RegexDSL/DSLCapture.swift
+ RegexDSL/DynamicCaptures.swift)
+target_compile_options(_StringProcessing PRIVATE
+ -enable-library-evolution)
+target_link_libraries(_StringProcessing PUBLIC
+ _MatchingEngine)
diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift
index 915c4c5d7..5b43da870 100644
--- a/Sources/_StringProcessing/Capture.swift
+++ b/Sources/_StringProcessing/Capture.swift
@@ -71,6 +71,11 @@ extension StructuredCapture {
value: storedCapture?.value,
optionalCount: optionalCount)
}
+
+ func slice(from input: String) -> Substring? {
+ guard let r = storedCapture?.range else { return nil }
+ return input[r]
+ }
}
extension Sequence where Element == StructuredCapture {
@@ -86,5 +91,8 @@ extension Sequence where Element == StructuredCapture {
})
return TypeConstruction.tuple(of: caps)
}
-}
+ func slices(from input: String) -> [Substring?] {
+ self.map { $0.slice(from: input) }
+ }
+}
diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift
index 5099e187f..1d72a8d27 100644
--- a/Sources/_StringProcessing/Compiler.swift
+++ b/Sources/_StringProcessing/Compiler.swift
@@ -35,7 +35,7 @@ class Compiler {
}
}
-public func _compileRegex(
+func _compileRegex(
_ regex: String, _ syntax: SyntaxOptions = .traditional
) throws -> Executor {
let ast = try parse(regex, syntax)
diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift
index 52f752539..4e00a34b4 100644
--- a/Sources/_StringProcessing/Engine/Consume.swift
+++ b/Sources/_StringProcessing/Engine/Consume.swift
@@ -24,47 +24,17 @@ extension Engine {
}
}
-extension Engine where Input == String {
- public func consume(
- _ input: Input
- ) -> (Input.Index, CaptureList)? {
- consume(input, in: input.startIndex ..< input.endIndex)
- }
-
- public func consume(
- _ input: Input,
- in range: Range,
- matchMode: MatchMode = .partialFromFront
- ) -> (Input.Index, CaptureList)? {
- if enableTracing {
- print("Consume: \(input)")
- }
-
- var cpu = makeProcessor(input: input, bounds: range, matchMode: matchMode)
- let result: Input.Index? = {
- while true {
- switch cpu.state {
- case .accept:
- return cpu.currentPosition
- case .fail:
- return nil
- case .inProgress: cpu.cycle()
- }
- }
- }()
-
- if enableTracing {
- if let idx = result {
- print("Result: \(input[.. Input.Index? {
+ while true {
+ switch self.state {
+ case .accept:
+ return self.currentPosition
+ case .fail:
+ return nil
+ case .inProgress: self.cycle()
}
}
- guard let result = result else { return nil }
-
- let capList = cpu.storedCaptures
- return (result, CaptureList(
- values: capList, referencedCaptureOffsets: program.referencedCaptureOffsets))
}
}
diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift
index 6c9c2efa5..86952c8b7 100644
--- a/Sources/_StringProcessing/Engine/Engine.swift
+++ b/Sources/_StringProcessing/Engine/Engine.swift
@@ -11,7 +11,7 @@
// Currently, engine binds the type and consume binds an instance.
// But, we can play around with this.
-public struct Engine where Input.Element: Hashable {
+struct Engine where Input.Element: Hashable {
var program: MEProgram
@@ -24,7 +24,7 @@ public struct Engine where Input.Element: Hashab
set { program.enableTracing = newValue }
}
- public init(
+ init(
_ program: MEProgram,
enableTracing: Bool? = nil
) {
@@ -36,10 +36,10 @@ public struct Engine where Input.Element: Hashab
}
}
-public struct AsyncEngine { /* ... */ }
+struct AsyncEngine { /* ... */ }
extension Engine: CustomStringConvertible {
- public var description: String {
+ var description: String {
// TODO: better description
return program.description
}
diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
index fcc257302..ff28ee9e2 100644
--- a/Sources/_StringProcessing/Engine/Instruction.swift
+++ b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -299,8 +299,7 @@ extension Instruction {
internal var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 }
-// TODO: internal after compiler moves in
-public var _payloadMask: UInt64 { ~_opcodeMask }
+var _payloadMask: UInt64 { ~_opcodeMask }
extension Instruction {
var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 }
diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
index d81c583a8..78171a001 100644
--- a/Sources/_StringProcessing/Engine/MEBuilder.swift
+++ b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -12,7 +12,7 @@
import _MatchingEngine // For errors
extension MEProgram where Input.Element: Hashable {
- public struct Builder {
+ struct Builder {
var instructions: [Instruction] = []
var elements = TypedSetVector()
@@ -50,7 +50,7 @@ extension MEProgram where Input.Element: Hashable {
nextCaptureRegister.rawValue
}
- public init() {}
+ init() {}
}
}
@@ -71,7 +71,7 @@ extension MEProgram.Builder {
// TODO: We want a better strategy for fixups, leaving
// the operand in a different form isn't great...
- public init(staticElements: S) where S.Element == Input.Element {
+ init(staticElements: S) where S.Element == Input.Element {
staticElements.forEach { elements.store($0) }
}
@@ -79,21 +79,21 @@ extension MEProgram.Builder {
.init(instructions.endIndex - 1)
}
- public mutating func buildNop(_ r: StringRegister? = nil) {
+ mutating func buildNop(_ r: StringRegister? = nil) {
instructions.append(.init(.nop, .init(optionalString: r)))
}
- public mutating func buildNop(_ s: String) {
+ mutating func buildNop(_ s: String) {
buildNop(strings.store(s))
}
- public mutating func buildDecrement(
+ mutating func buildDecrement(
_ i: IntRegister, nowZero: BoolRegister
) {
instructions.append(.init(
.decrement, .init(bool: nowZero, int: i)))
}
- public mutating func buildMoveImmediate(
+ mutating func buildMoveImmediate(
_ value: UInt64, into: IntRegister
) {
instructions.append(.init(
@@ -101,25 +101,25 @@ extension MEProgram.Builder {
}
// TODO: generic
- public mutating func buildMoveImmediate(
+ mutating func buildMoveImmediate(
_ value: Int, into: IntRegister
) {
let uint = UInt64(asserting: value)
buildMoveImmediate(uint, into: into)
}
- public mutating func buildMoveCurrentPosition(
+ mutating func buildMoveCurrentPosition(
into: PositionRegister
) {
instructions.append(.init(
.movePosition, .init(position: into)))
}
- public mutating func buildBranch(to t: AddressToken) {
+ mutating func buildBranch(to t: AddressToken) {
instructions.append(.init(.branch))
fixup(to: t)
}
- public mutating func buildCondBranch(
+ mutating func buildCondBranch(
_ condition: BoolRegister, to t: AddressToken
) {
instructions.append(
@@ -127,7 +127,7 @@ extension MEProgram.Builder {
fixup(to: t)
}
- public mutating func buildCondBranch(
+ mutating func buildCondBranch(
to t: AddressToken, ifZeroElseDecrement i: IntRegister
) {
instructions.append(
@@ -135,56 +135,56 @@ extension MEProgram.Builder {
fixup(to: t)
}
- public mutating func buildSave(_ t: AddressToken) {
+ mutating func buildSave(_ t: AddressToken) {
instructions.append(.init(.save))
fixup(to: t)
}
- public mutating func buildSaveAddress(_ t: AddressToken) {
+ mutating func buildSaveAddress(_ t: AddressToken) {
instructions.append(.init(.saveAddress))
fixup(to: t)
}
- public mutating func buildSplit(
+ mutating func buildSplit(
to: AddressToken, saving: AddressToken
) {
instructions.append(.init(.splitSaving))
fixup(to: (to, saving))
}
- public mutating func buildClear() {
+ mutating func buildClear() {
instructions.append(.init(.clear))
}
- public mutating func buildRestore() {
+ mutating func buildRestore() {
instructions.append(.init(.restore))
}
- public mutating func buildFail() {
+ mutating func buildFail() {
instructions.append(.init(.fail))
}
- public mutating func buildCall(_ t: AddressToken) {
+ mutating func buildCall(_ t: AddressToken) {
instructions.append(.init(.call))
fixup(to: t)
}
- public mutating func buildRet() {
+ mutating func buildRet() {
instructions.append(.init(.ret))
}
- public mutating func buildAbort(_ s: StringRegister? = nil) {
+ mutating func buildAbort(_ s: StringRegister? = nil) {
instructions.append(.init(
.abort, .init(optionalString: s)))
}
- public mutating func buildAbort(_ s: String) {
+ mutating func buildAbort(_ s: String) {
buildAbort(strings.store(s))
}
- public mutating func buildAdvance(_ n: Distance) {
+ mutating func buildAdvance(_ n: Distance) {
instructions.append(.init(.advance, .init(distance: n)))
}
- public mutating func buildMatch(_ e: Input.Element) {
+ mutating func buildMatch(_ e: Input.Element) {
instructions.append(.init(
.match, .init(element: elements.store(e))))
}
- public mutating func buildMatchSequence(
+ mutating func buildMatchSequence(
_ s: S
) where S.Element == Input.Element {
instructions.append(.init(
@@ -192,7 +192,7 @@ extension MEProgram.Builder {
.init(sequence: sequences.store(.init(s)))))
}
- public mutating func buildMatchSlice(
+ mutating func buildMatchSlice(
lower: PositionRegister, upper: PositionRegister
) {
instructions.append(.init(
@@ -200,50 +200,50 @@ extension MEProgram.Builder {
.init(pos: lower, pos2: upper)))
}
- public mutating func buildConsume(
+ mutating func buildConsume(
by p: @escaping MEProgram.ConsumeFunction
) {
instructions.append(.init(
.consumeBy, .init(consumer: makeConsumeFunction(p))))
}
- public mutating func buildAssert(
+ mutating func buildAssert(
by p: @escaping MEProgram.AssertionFunction
) {
instructions.append(.init(
.assertBy, .init(assertion: makeAssertionFunction(p))))
}
- public mutating func buildAssert(
+ mutating func buildAssert(
_ e: Input.Element, into cond: BoolRegister
) {
instructions.append(.init(.assertion, .init(
element: elements.store(e), bool: cond)))
}
- public mutating func buildAccept() {
+ mutating func buildAccept() {
instructions.append(.init(.accept))
}
- public mutating func buildPrint(_ s: StringRegister) {
+ mutating func buildPrint(_ s: StringRegister) {
instructions.append(.init(.print, .init(string: s)))
}
- public mutating func buildBeginCapture(
+ mutating func buildBeginCapture(
_ cap: CaptureRegister
) {
instructions.append(
.init(.beginCapture, .init(capture: cap)))
}
- public mutating func buildEndCapture(
+ mutating func buildEndCapture(
_ cap: CaptureRegister
) {
instructions.append(
.init(.endCapture, .init(capture: cap)))
}
- public mutating func buildTransformCapture(
+ mutating func buildTransformCapture(
_ cap: CaptureRegister, _ trans: TransformRegister
) {
instructions.append(.init(
@@ -251,7 +251,7 @@ extension MEProgram.Builder {
.init(capture: cap, transform: trans)))
}
- public mutating func buildMatcher(
+ mutating func buildMatcher(
_ fun: MatcherRegister, into reg: ValueRegister
) {
instructions.append(.init(
@@ -259,7 +259,7 @@ extension MEProgram.Builder {
.init(matcher: fun, value: reg)))
}
- public mutating func buildMove(
+ mutating func buildMove(
_ value: ValueRegister, into capture: CaptureRegister
) {
instructions.append(.init(
@@ -267,21 +267,21 @@ extension MEProgram.Builder {
.init(value: value, capture: capture)))
}
- public mutating func buildBackreference(
+ mutating func buildBackreference(
_ cap: CaptureRegister
) {
instructions.append(
.init(.backreference, .init(capture: cap)))
}
- public mutating func buildUnresolvedReference(id: ReferenceID) {
+ mutating func buildUnresolvedReference(id: ReferenceID) {
buildBackreference(.init(0))
unresolvedReferences[id, default: []].append(lastInstructionAddress)
}
// TODO: Mutating because of fail address fixup, drop when
// that's removed
- public mutating func assemble() throws -> MEProgram {
+ mutating func assemble() throws -> MEProgram {
try resolveReferences()
// TODO: This will add a fail instruction at the end every
@@ -356,22 +356,22 @@ extension MEProgram.Builder {
referencedCaptureOffsets: referencedCaptureOffsets)
}
- public mutating func reset() { self = Self() }
+ mutating func reset() { self = Self() }
}
// Address-agnostic interfaces for label-like support
extension MEProgram.Builder {
- public enum _AddressToken {}
- public typealias AddressToken = TypedInt<_AddressToken>
+ enum _AddressToken {}
+ typealias AddressToken = TypedInt<_AddressToken>
- public mutating func makeAddress() -> AddressToken {
+ mutating func makeAddress() -> AddressToken {
defer { addressTokens.append(nil) }
return AddressToken(addressTokens.count)
}
// Resolves the address token to the most recently added
// instruction, updating prior and future address references
- public mutating func resolve(_ t: AddressToken) {
+ mutating func resolve(_ t: AddressToken) {
assert(!instructions.isEmpty)
addressTokens[t.rawValue] =
@@ -380,7 +380,7 @@ extension MEProgram.Builder {
// Resolves the address token to the next instruction (one past the most
// recently added one), updating prior and future address references.
- public mutating func label(_ t: AddressToken) {
+ mutating func label(_ t: AddressToken) {
addressTokens[t.rawValue] =
InstructionAddress(instructions.count)
}
@@ -388,7 +388,7 @@ extension MEProgram.Builder {
// Associate the most recently added instruction with
// the provided token, ensuring it is fixed up during
// assembly
- public mutating func fixup(to t: AddressToken) {
+ mutating func fixup(to t: AddressToken) {
assert(!instructions.isEmpty)
addressFixups.append(
(InstructionAddress(instructions.endIndex-1), .init(t)))
@@ -397,7 +397,7 @@ extension MEProgram.Builder {
// Associate the most recently added instruction with
// the provided tokens, ensuring it is fixed up during
// assembly
- public mutating func fixup(
+ mutating func fixup(
to ts: (AddressToken, AddressToken)
) {
assert(!instructions.isEmpty)
@@ -412,7 +412,7 @@ extension MEProgram.Builder {
//
// This is useful for possessive quantification that needs some initial save
// point to "ratchet" upon a successful match.
- public mutating func pushEmptySavePoint() {
+ mutating func pushEmptySavePoint() {
if failAddressToken == nil {
failAddressToken = makeAddress()
}
@@ -438,7 +438,7 @@ fileprivate extension MEProgram.Builder {
// Register helpers
extension MEProgram.Builder {
- public mutating func makeCapture(id: ReferenceID?) -> CaptureRegister {
+ mutating func makeCapture(id: ReferenceID?) -> CaptureRegister {
defer { nextCaptureRegister.rawValue += 1 }
// Register the capture for later lookup via symbolic references.
if let id = id {
@@ -449,25 +449,25 @@ extension MEProgram.Builder {
return nextCaptureRegister
}
- public mutating func makeBoolRegister() -> BoolRegister {
+ mutating func makeBoolRegister() -> BoolRegister {
defer { nextBoolRegister.rawValue += 1 }
return nextBoolRegister
}
- public mutating func makeIntRegister() -> IntRegister {
+ mutating func makeIntRegister() -> IntRegister {
defer { nextIntRegister.rawValue += 1 }
return nextIntRegister
}
- public mutating func makePositionRegister() -> PositionRegister {
+ mutating func makePositionRegister() -> PositionRegister {
defer { nextPositionRegister.rawValue += 1 }
return nextPositionRegister
}
- public mutating func makeValueRegister() -> ValueRegister {
+ mutating func makeValueRegister() -> ValueRegister {
defer { nextValueRegister.rawValue += 1 }
return nextValueRegister
}
// Allocate and initialize a register
- public mutating func makeIntRegister(
+ mutating func makeIntRegister(
initialValue: Int
) -> IntRegister {
let r = makeIntRegister()
@@ -476,7 +476,7 @@ extension MEProgram.Builder {
}
// Allocate and initialize a register
- public mutating func makePositionRegister(
+ mutating func makePositionRegister(
initializingWithCurrentPosition: ()
) -> PositionRegister {
let r = makePositionRegister()
@@ -485,17 +485,17 @@ extension MEProgram.Builder {
}
// 'kill' or release allocated registers
- public mutating func kill(_ r: IntRegister) {
+ mutating func kill(_ r: IntRegister) {
// TODO: Release/reuse registers, for now nop makes
// reading the code easier
buildNop("kill \(r)")
}
- public mutating func kill(_ r: BoolRegister) {
+ mutating func kill(_ r: BoolRegister) {
// TODO: Release/reuse registers, for now nop makes
// reading the code easier
buildNop("kill \(r)")
}
- public mutating func kill(_ r: PositionRegister) {
+ mutating func kill(_ r: PositionRegister) {
// TODO: Release/reuse registers, for now nop makes
// reading the code easier
buildNop("kill \(r)")
@@ -504,25 +504,25 @@ extension MEProgram.Builder {
// TODO: A register-mapping helper struct, which could release
// registers without monotonicity required
- public mutating func makeConsumeFunction(
+ mutating func makeConsumeFunction(
_ f: @escaping MEProgram.ConsumeFunction
) -> ConsumeFunctionRegister {
defer { consumeFunctions.append(f) }
return ConsumeFunctionRegister(consumeFunctions.count)
}
- public mutating func makeAssertionFunction(
+ mutating func makeAssertionFunction(
_ f: @escaping MEProgram.AssertionFunction
) -> AssertionFunctionRegister {
defer { assertionFunctions.append(f) }
return AssertionFunctionRegister(assertionFunctions.count)
}
- public mutating func makeTransformFunction(
+ mutating func makeTransformFunction(
_ f: @escaping MEProgram.TransformFunction
) -> TransformRegister {
defer { transformFunctions.append(f) }
return TransformRegister(transformFunctions.count)
}
- public mutating func makeMatcherFunction(
+ mutating func makeMatcherFunction(
_ f: @escaping MEProgram.MatcherFunction
) -> MatcherRegister {
defer { matcherFunctions.append(f) }
diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift
index 88f912ecb..bac632e9e 100644
--- a/Sources/_StringProcessing/Engine/MECapture.swift
+++ b/Sources/_StringProcessing/Engine/MECapture.swift
@@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible {
}
}
-public struct CaptureList {
+struct CaptureList {
var values: Array._StoredCapture>
var referencedCaptureOffsets: [ReferenceID: Int]
diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift
index d616657e8..1e58ddf54 100644
--- a/Sources/_StringProcessing/Engine/MEProgram.swift
+++ b/Sources/_StringProcessing/Engine/MEProgram.swift
@@ -11,13 +11,13 @@
import _MatchingEngine
-public struct MEProgram where Input.Element: Equatable {
- public typealias ConsumeFunction = (Input, Range) -> Input.Index?
- public typealias AssertionFunction =
+struct MEProgram where Input.Element: Equatable {
+ typealias ConsumeFunction = (Input, Range) -> Input.Index?
+ typealias AssertionFunction =
(Input, Input.Index, Range) -> Bool
- public typealias TransformFunction =
+ typealias TransformFunction =
(Input, Range) -> Any?
- public typealias MatcherFunction =
+ typealias MatcherFunction =
(Input, Input.Index, Range) -> (Input.Index, Any)?
var instructions: InstructionList
@@ -39,7 +39,7 @@ public struct MEProgram where Input.Element: Equatable {
}
extension MEProgram: CustomStringConvertible {
- public var description: String {
+ var description: String {
var result = """
Elements: \(staticElements)
Strings: \(staticStrings)
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 10c3eb781..343b02c92 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-public enum MatchMode {
+enum MatchMode {
case wholeString
case partialFromFront
}
diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift
index 7db740f52..24d00d3d7 100644
--- a/Sources/_StringProcessing/Engine/Tracing.swift
+++ b/Sources/_StringProcessing/Engine/Tracing.swift
@@ -15,7 +15,7 @@ extension Processor: TracedProcessor {
var currentPC: InstructionAddress { controller.pc }
- public func formatSavePoints() -> String {
+ func formatSavePoints() -> String {
if !savePoints.isEmpty {
var result = "save points:\n"
for point in savePoints {
diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift
index e066a4369..c044cbf24 100644
--- a/Sources/_StringProcessing/Executor.swift
+++ b/Sources/_StringProcessing/Executor.swift
@@ -11,8 +11,7 @@
import _MatchingEngine
- // FIXME: Public for prototype
-public struct Executor {
+struct Executor {
// TODO: consider let, for now lets us toggle tracing
var engine: Engine
@@ -20,70 +19,53 @@ public struct Executor {
self.engine = Engine(program, enableTracing: enablesTracing)
}
- // FIXME: Public for prototype
- public struct Result {
- public var range: Range
- var captures: [StructuredCapture]
- var referencedCaptureOffsets: [ReferenceID: Int]
+ func match(
+ _ input: String,
+ in inputRange: Range,
+ _ mode: MatchMode
+ ) throws -> RegexMatch? {
+ var cpu = engine.makeProcessor(
+ input: input, bounds: inputRange, matchMode: mode)
- var destructure: (
- matched: Range,
- captures: [StructuredCapture],
- referencedCaptureOffsets: [ReferenceID: Int]
- ) {
- (range, captures, referencedCaptureOffsets)
+ guard let endIdx = cpu.consume() else {
+ return nil
}
- init(
- _ matched: Range, _ captures: [StructuredCapture],
- _ referencedCaptureOffsets: [ReferenceID: Int]
- ) {
- self.range = matched
- self.captures = captures
- self.referencedCaptureOffsets = referencedCaptureOffsets
- }
- }
+ let capList = CaptureList(
+ values: cpu.storedCaptures,
+ referencedCaptureOffsets: engine.program.referencedCaptureOffsets)
- public func execute(
- input: String,
- in range: Range,
- mode: MatchMode = .wholeString
- ) -> Result? {
- guard let (endIdx, capList) = engine.consume(
- input, in: range, matchMode: mode
- ) else {
- return nil
- }
let capStruct = engine.program.captureStructure
- do {
- let range = range.lowerBound.. Result? {
- self.execute(
- input: input.base,
- in: input.startIndex..,
- mode: MatchMode = .wholeString
- ) -> (Range, CaptureList)? {
- engine.consume(
- input, in: range, matchMode: mode
- ).map { endIndex, capture in
- (range.lowerBound..,
+ _ mode: MatchMode
+ ) throws -> RegexMatch<(Substring, DynamicCaptures)>? {
+ try match(input, in: inputRange, mode)
}
}
diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift
index 35a4ccb5e..a21dce82d 100644
--- a/Sources/_StringProcessing/RegexDSL/DSL.swift
+++ b/Sources/_StringProcessing/RegexDSL/DSL.swift
@@ -17,8 +17,7 @@ extension String: RegexProtocol {
public typealias Match = Substring
public var regex: Regex {
- let atoms = self.map { atom(.char($0)) }
- return .init(ast: concat(atoms))
+ .init(node: .quotedLiteral(self))
}
}
@@ -26,8 +25,7 @@ extension Substring: RegexProtocol {
public typealias Match = Substring
public var regex: Regex {
- let atoms = self.map { atom(.char($0)) }
- return .init(ast: concat(atoms))
+ .init(node: .quotedLiteral(String(self)))
}
}
@@ -35,7 +33,15 @@ extension Character: RegexProtocol {
public typealias Match = Substring
public var regex: Regex {
- .init(ast: atom(.char(self)))
+ .init(node: .atom(.char(self)))
+ }
+}
+
+extension UnicodeScalar: RegexProtocol {
+ public typealias Match = Substring
+
+ public var regex: Regex {
+ .init(node: .atom(.scalar(self)))
}
}
@@ -187,9 +193,7 @@ public func choiceOf(
// MARK: - Backreference
-
-// FIXME: Public for prototypes.
-public struct ReferenceID: Hashable, Equatable {
+struct ReferenceID: Hashable, Equatable {
private static var counter: Int = 0
var base: Int
diff --git a/Sources/_StringProcessing/RegexDSL/DSLTree.swift b/Sources/_StringProcessing/RegexDSL/DSLTree.swift
index a44220925..43f8aa62f 100644
--- a/Sources/_StringProcessing/RegexDSL/DSLTree.swift
+++ b/Sources/_StringProcessing/RegexDSL/DSLTree.swift
@@ -249,7 +249,7 @@ extension DSLTree {
}
}
extension DSLTree.Node {
- public func _captureStructure(
+ func _captureStructure(
_ constructor: inout CaptureStructure.Constructor
) -> CaptureStructure {
switch self {
diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift
index 2dd31c379..c5ada0c9d 100644
--- a/Sources/_StringProcessing/RegexDSL/Match.swift
+++ b/Sources/_StringProcessing/RegexDSL/Match.swift
@@ -16,6 +16,8 @@ public struct RegexMatch {
let rawCaptures: [StructuredCapture]
let referencedCaptureOffsets: [ReferenceID: Int]
+ let value: Any?
+
public var match: Match {
if Match.self == (Substring, DynamicCaptures).self {
// FIXME(rdar://89449323): Compiler assertion
@@ -25,7 +27,15 @@ public struct RegexMatch {
} else if Match.self == Substring.self {
// FIXME: Plumb whole match (`.0`) through the matching engine.
return input[range] as! Match
+ } else if rawCaptures.isEmpty, value != nil {
+ // FIXME: This is a workaround for whole-match values not
+ // being modeled as part of captures. We might want to
+ // switch to a model where results are alongside captures
+ return value! as! Match
} else {
+ guard value == nil else {
+ fatalError("FIXME: what would this mean?")
+ }
let typeErasedMatch = rawCaptures.existentialMatch(from: input[range])
return typeErasedMatch as! Match
}
@@ -69,16 +79,11 @@ extension RegexProtocol {
mode: MatchMode = .wholeString
) -> RegexMatch? {
let executor = Executor(program: regex.program.loweredProgram)
- guard let (range, captures, captureOffsets) = executor.execute(
- input: input, in: inputRange, mode: mode
- )?.destructure else {
- return nil
+ do {
+ return try executor.match(input, in: inputRange, mode)
+ } catch {
+ fatalError(String(describing: error))
}
- return RegexMatch(
- input: input,
- range: range,
- rawCaptures: captures,
- referencedCaptureOffsets: captureOffsets)
}
}
diff --git a/Sources/_StringProcessing/Unicode/Decoding.swift b/Sources/_StringProcessing/Unicode/Decoding.swift
index 49eb1f794..68c14f6c1 100644
--- a/Sources/_StringProcessing/Unicode/Decoding.swift
+++ b/Sources/_StringProcessing/Unicode/Decoding.swift
@@ -33,13 +33,13 @@
enum UnsafeAssumingValidUTF8 {
@inlinable @inline(__always)
- public static func decode(_ x: UInt8) -> Unicode.Scalar {
+ static func decode(_ x: UInt8) -> Unicode.Scalar {
_internalInvariant(UTF8.isASCII(x))
return Unicode.Scalar(_unchecked: UInt32(x))
}
@inlinable @inline(__always)
- public static func decode(
+ static func decode(
_ x: UInt8, _ y: UInt8
) -> Unicode.Scalar {
_internalInvariant(scalarLength(x) == 2)
@@ -50,7 +50,7 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable @inline(__always)
- public static func decode(
+ static func decode(
_ x: UInt8, _ y: UInt8, _ z: UInt8
) -> Unicode.Scalar {
_internalInvariant(scalarLength(x) == 3)
@@ -63,7 +63,7 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable @inline(__always)
- public static func decode(
+ static func decode(
_ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
) -> Unicode.Scalar {
_internalInvariant(scalarLength(x) == 4)
@@ -80,7 +80,7 @@ enum UnsafeAssumingValidUTF8 {
// Also, assuming we can load from those bounds...
@inlinable
- public static func decode(
+ static func decode(
_ utf8: UnsafeByteBuffer, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
let cu0 = utf8[_unchecked: i]
@@ -103,7 +103,7 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable
- public static func decode(
+ static func decode(
_ utf8: UnsafeByteBuffer, endingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
let len = scalarLength(utf8, endingAt: i)
@@ -113,7 +113,7 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable @inline(__always)
- public static func scalarLength(_ x: UInt8) -> Int {
+ static func scalarLength(_ x: UInt8) -> Int {
_internalInvariant(!UTF8.isContinuation(x))
if UTF8.isASCII(x) { return 1 }
// TODO(String micro-performance): check codegen
@@ -121,7 +121,7 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable @inline(__always)
- public static func scalarLength(
+ static func scalarLength(
_ utf8: UnsafeByteBuffer, endingAt i: Int
) -> Int {
var len = 1
@@ -133,12 +133,12 @@ enum UnsafeAssumingValidUTF8 {
}
@inlinable @inline(__always)
- public static func continuationPayload(_ x: UInt8) -> UInt32 {
+ static func continuationPayload(_ x: UInt8) -> UInt32 {
return UInt32(x & 0x3F)
}
@inlinable
- public static func scalarAlign(
+ static func scalarAlign(
_ utf8: UnsafeByteBuffer, _ idx: Int
) -> Int {
guard _fastPath(idx != utf8.count) else { return idx }
diff --git a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift
index ef846c14e..a9ae24429 100644
--- a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift
+++ b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift
@@ -40,7 +40,7 @@ extension Optional {
}
// Don't use UnsafeRawBufferPointer for anything important
-public struct UnsafeByteBuffer {
+struct UnsafeByteBuffer {
var pointer: UnsafeRawPointer
var count: Int
diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift
index 9c196c18c..7542a17dd 100644
--- a/Sources/_StringProcessing/Utility/Protocols.swift
+++ b/Sources/_StringProcessing/Utility/Protocols.swift
@@ -13,11 +13,11 @@
// These currently only drive tracing/formatting, but could drive
// more
-public protocol InstructionProtocol {
+protocol InstructionProtocol {
var operandPC: InstructionAddress? { get }
}
-public protocol ProcessorProtocol {
+protocol ProcessorProtocol {
associatedtype Input: Collection
associatedtype Instruction: InstructionProtocol
associatedtype SavePoint = ()
@@ -45,12 +45,12 @@ public protocol ProcessorProtocol {
}
extension ProcessorProtocol {
- public func fetch() -> Instruction {
+ func fetch() -> Instruction {
instructions[currentPC]
}
- public var callStack: Array { [] }
-// public var savePoints: Array { [] }
- public var registers: Array { [] }
+ var callStack: Array { [] }
+// var savePoints: Array { [] }
+ var registers: Array { [] }
}
diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift
index c270aba23..5ae7cd245 100644
--- a/Sources/_StringProcessing/Utility/Traced.swift
+++ b/Sources/_StringProcessing/Utility/Traced.swift
@@ -12,11 +12,11 @@
// TODO: Place shared formatting and trace infrastructure here
-public protocol Traced {
+protocol Traced {
var isTracingEnabled: Bool { get set }
}
-public protocol TracedProcessor: ProcessorProtocol, Traced {
+protocol TracedProcessor: ProcessorProtocol, Traced {
// Empty defaulted
func formatCallStack() -> String // empty default
func formatSavePoints() -> String // empty default
@@ -36,7 +36,7 @@ func lineNumber(_ pc: InstructionAddress) -> String {
}
extension TracedProcessor where Registers: Collection{
- public func formatRegisters() -> String {
+ func formatRegisters() -> String {
typealias E = ()
if !registers.isEmpty {
return "\(registers)\n"
@@ -48,19 +48,19 @@ extension TracedProcessor where Registers: Collection{
extension TracedProcessor {
func printTrace() { print(formatTrace()) }
- public func trace() {
+ func trace() {
if isTracingEnabled { printTrace() }
}
// Helpers for the conformers
- public func formatCallStack() -> String {
+ func formatCallStack() -> String {
if !callStack.isEmpty {
return "call stack: \(callStack)\n"
}
return ""
}
- public func formatSavePoints() -> String {
+ func formatSavePoints() -> String {
if !savePoints.isEmpty {
var result = "save points:\n"
for point in savePoints {
@@ -71,7 +71,7 @@ extension TracedProcessor {
return ""
}
- public func formatRegisters() -> String {
+ func formatRegisters() -> String {
typealias E = ()
if Registers.self == E.self {
return ""
@@ -79,7 +79,7 @@ extension TracedProcessor {
return "\(registers)\n"
}
- public func formatInput() -> String {
+ func formatInput() -> String {
// String override for printing sub-character information.
if !input.indices.contains(currentPosition) {
// Format unicode scalars as:
@@ -115,7 +115,7 @@ extension TracedProcessor {
"""
}
- public func formatInstructionWindow(
+ func formatInstructionWindow(
windowSize: Int = 12
) -> String {
if isAcceptState { return "ACCEPT" }
@@ -139,7 +139,7 @@ extension TracedProcessor {
return result
}
- public func formatTrace() -> String {
+ func formatTrace() -> String {
var result = "\n--- cycle \(cycleCount) ---\n"
result += formatCallStack()
result += formatSavePoints()
@@ -150,7 +150,7 @@ extension TracedProcessor {
return result
}
- public func formatInstruction(
+ func formatInstruction(
_ pc: InstructionAddress,
depth: Int = 5
) -> String {
@@ -160,7 +160,7 @@ extension TracedProcessor {
}
extension Collection where Element: InstructionProtocol, Index == InstructionAddress {
- public func formatInstruction(
+ func formatInstruction(
_ pc: InstructionAddress,
atCurrent: Bool,
depth: Int
diff --git a/Sources/_StringProcessing/Utility/TypedIndex.swift b/Sources/_StringProcessing/Utility/TypedIndex.swift
index 3bddcadfd..adde06a3e 100644
--- a/Sources/_StringProcessing/Utility/TypedIndex.swift
+++ b/Sources/_StringProcessing/Utility/TypedIndex.swift
@@ -12,55 +12,43 @@
/// Forwarding wrapper around Int-index collections that provide a
/// strongly (phantom) typed index.
-@frozen
-public struct TypedIndex: RawRepresentable where C.Index == Int {
- @_alwaysEmitIntoClient
- public var rawValue: C
+struct TypedIndex: RawRepresentable where C.Index == Int {
+ var rawValue: C
- @_alwaysEmitIntoClient
- public init(rawValue: C) { self.rawValue = rawValue }
+ init(rawValue: C) { self.rawValue = rawValue }
- @_alwaysEmitIntoClient
- public init(_ rawValue: C) { self.init(rawValue: rawValue) }
+ init(_ rawValue: C) { self.init(rawValue: rawValue) }
}
extension TypedIndex: Collection {
- public typealias Index = TypedInt<đť>
- public typealias Element = C.Element
+ typealias Index = TypedInt<đť>
+ typealias Element = C.Element
- @_alwaysEmitIntoClient
- public var startIndex: Index { Index(rawValue.startIndex) }
+ var startIndex: Index { Index(rawValue.startIndex) }
- @_alwaysEmitIntoClient
- public var endIndex: Index { Index(rawValue.endIndex )}
+ var endIndex: Index { Index(rawValue.endIndex )}
- @_alwaysEmitIntoClient
- public var count: Int { rawValue.count }
+ var count: Int { rawValue.count }
- @_alwaysEmitIntoClient
- public func index(after: Index) -> Index {
+ func index(after: Index) -> Index {
Index(rawValue.index(after: after.rawValue))
}
- @_alwaysEmitIntoClient
- public subscript(position: Index) -> Element {
+ subscript(position: Index) -> Element {
rawValue[position.rawValue]
}
- @_alwaysEmitIntoClient
- public func distance(
+ func distance(
from start: Index, to end: Index
) -> Int {
rawValue.distance(from: start.rawValue, to: end.rawValue)
}
- @_alwaysEmitIntoClient
- public func index(_ i: Index, offsetBy distance: Int) -> Index {
+ func index(_ i: Index, offsetBy distance: Int) -> Index {
Index(rawValue.index(i.rawValue, offsetBy: distance))
}
- @_alwaysEmitIntoClient
- public func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? {
+ func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? {
guard let idx = rawValue.index(i.rawValue, offsetBy: distance, limitedBy: limit.rawValue) else {
return nil
}
@@ -71,8 +59,7 @@ extension TypedIndex: Collection {
extension TypedIndex: RandomAccessCollection where C: RandomAccessCollection {
}
extension TypedIndex: MutableCollection where C: MutableCollection {
- @_alwaysEmitIntoClient
- public subscript(position: Index) -> Element {
+ subscript(position: Index) -> Element {
_read {
yield rawValue[position.rawValue]
}
@@ -82,8 +69,7 @@ extension TypedIndex: MutableCollection where C: MutableCollection {
}
}
extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection {
- @_alwaysEmitIntoClient
- public func index(before: Index) -> Index {
+ func index(before: Index) -> Index {
Index(rawValue.index(before: before.rawValue))
}
}
@@ -92,11 +78,9 @@ extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection {
// failure in the Swift repo.
#if false
extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollection {
- @_alwaysEmitIntoClient
- public init() { rawValue = C() }
+ init() { rawValue = C() }
- @_alwaysEmitIntoClient
- public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element {
+ mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element {
let rawRange = subrange.lowerBound.rawValue ..< subrange.upperBound.rawValue
rawValue.replaceSubrange(rawRange, with: newElements)
}
@@ -107,14 +91,13 @@ extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollec
// Workaround for #73
extension TypedIndex where C: RangeReplaceableCollection {
- public mutating func append(_ newElement: Element) {
+ mutating func append(_ newElement: Element) {
rawValue.append(newElement)
}
}
extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiteral & RangeReplaceableCollection {
- @_alwaysEmitIntoClient
- public init(arrayLiteral elements: Element...) {
+ init(arrayLiteral elements: Element...) {
// TODO: any way around the RRC copying init?
self.init(C(elements))
}
@@ -122,5 +105,5 @@ extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiter
// MARK: - Strongly typed wrappers
-public typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress>
+typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress>
diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift
index caff7f64e..249717b68 100644
--- a/Sources/_StringProcessing/Utility/TypedInt.swift
+++ b/Sources/_StringProcessing/Utility/TypedInt.swift
@@ -11,86 +11,71 @@
// Just a phantom-typed Int wrapper.
-@frozen
-public struct TypedInt<đť>: RawRepresentable, Hashable {
- @_alwaysEmitIntoClient
- public var rawValue: Int
+struct TypedInt<đť>: RawRepresentable, Hashable {
+ var rawValue: Int
- @_alwaysEmitIntoClient
- public init(rawValue: Int) {
+ init(rawValue: Int) {
self.rawValue = rawValue
}
- @_alwaysEmitIntoClient
- public init(_ rawValue: Int) {
+ init(_ rawValue: Int) {
self.init(rawValue: rawValue)
}
- @_alwaysEmitIntoClient
- public init(_ uint: UInt64) {
+ init(_ uint: UInt64) {
assert(uint.leadingZeroBitCount > 0)
self.init(Int(asserting: uint))
}
}
extension TypedInt: Comparable {
- @_alwaysEmitIntoClient
- public static func <(lhs: TypedInt, rhs: TypedInt) -> Bool {
+ static func <(lhs: TypedInt, rhs: TypedInt) -> Bool {
return lhs.rawValue < rhs.rawValue
}
}
extension TypedInt: CustomStringConvertible {
- @_alwaysEmitIntoClient
- public var description: String { return "#\(rawValue)" }
+ var description: String { return "#\(rawValue)" }
}
extension TypedInt: ExpressibleByIntegerLiteral {
- @_alwaysEmitIntoClient
- public init(integerLiteral value: Int) {
+ init(integerLiteral value: Int) {
self.init(rawValue: value)
}
}
-public protocol TypedIntProtocol {
+protocol TypedIntProtocol {
associatedtype đť
}
extension TypedInt: TypedIntProtocol { }
// A placeholder type for when we must supply a type.
// When the phantom type appears, it says boo
-public enum _Boo {}
+enum _Boo {}
// Easier for clients to just have their own typealias
-public typealias TypedInt_ = TypedInt
+typealias TypedInt_ = TypedInt
// TODO: BinaryInteger, etc.
extension TypedInt {
- @_alwaysEmitIntoClient
- public static func +(lhs: TypedInt, rhs: Int) -> TypedInt {
+ static func +(lhs: TypedInt, rhs: Int) -> TypedInt {
return TypedInt(lhs.rawValue + rhs)
}
- @_alwaysEmitIntoClient
- public var bits: UInt64 {
+ var bits: UInt64 {
UInt64(asserting: self.rawValue)
}
}
-@frozen
-public struct TypedSetVector {
- public typealias Idx = TypedInt<đť>
+struct TypedSetVector {
+ typealias Idx = TypedInt<đť>
// TODO: Replace with real set vector
- @_alwaysEmitIntoClient
- public var lookup: Dictionary = [:]
+ var lookup: Dictionary = [:]
- @_alwaysEmitIntoClient
- public var stored: Array = []
+ var stored: Array = []
- @_alwaysEmitIntoClient
- public func load(_ idx: Idx) -> Element { stored[idx.rawValue] }
+ func load(_ idx: Idx) -> Element { stored[idx.rawValue] }
- @_alwaysEmitIntoClient
@discardableResult
- public mutating func store(_ e: Element) -> Idx {
+ mutating func store(_ e: Element) -> Idx {
if let reg = lookup[e] { return reg }
let reg = Idx(stored.count)
stored.append(e)
@@ -98,34 +83,32 @@ public struct TypedSetVector {
return reg
}
- @_alwaysEmitIntoClient
- public var count: Int { stored.count }
+ var count: Int { stored.count }
- @_alwaysEmitIntoClient
- public init() {}
+ init() {}
}
// MARK: - Strongly typed int wrappers
/// A distance in the Input, e.g. `n` in consume(n)
-public typealias Distance = TypedInt<_Distance>
-public enum _Distance {}
+typealias Distance = TypedInt<_Distance>
+enum _Distance {}
/// An instruction address, i.e. the index into our instruction list
-public typealias InstructionAddress = TypedInt<_InstructionAddress>
-public enum _InstructionAddress {}
+typealias InstructionAddress = TypedInt<_InstructionAddress>
+enum _InstructionAddress {}
/// A position in the call stack, i.e. for save point restores
-public typealias CallStackAddress = TypedInt<_CallStackAddress>
-public enum _CallStackAddress {}
+typealias CallStackAddress = TypedInt<_CallStackAddress>
+enum _CallStackAddress {}
/// A position in a position stack, i.e. for NFA simulation
-public typealias PositionStackAddress = TypedInt<_PositionStackAddress>
-public enum _PositionStackAddress {}
+typealias PositionStackAddress = TypedInt<_PositionStackAddress>
+enum _PositionStackAddress {}
/// A position in the save point stack, i.e. for backtracking
-public typealias SavePointStackAddress = TypedInt<_SavePointAddress>
-public enum _SavePointAddress {}
+typealias SavePointStackAddress = TypedInt<_SavePointAddress>
+enum _SavePointAddress {}
// MARK: - Registers
@@ -135,85 +118,85 @@ public enum _SavePointAddress {}
/// NOTE: Currently just used for static data, but e.g. could be
/// used to save the most recently seen element satisfying some
/// property
-public typealias ElementRegister = TypedInt<_ElementRegister>
-public enum _ElementRegister {}
+typealias ElementRegister = TypedInt<_ElementRegister>
+enum _ElementRegister {}
-public typealias SequenceRegister = TypedInt<_SequenceRegister>
-public enum _SequenceRegister {}
+typealias SequenceRegister = TypedInt<_SequenceRegister>
+enum _SequenceRegister {}
/// The register number for a stored boolean value
///
/// E.g. used for conditional branches
-public typealias BoolRegister = TypedInt<_BoolRegister>
-public enum _BoolRegister {}
+typealias BoolRegister = TypedInt<_BoolRegister>
+enum _BoolRegister {}
/// The register number for a string (e.g. comment, failure reason)
-public typealias StringRegister = TypedInt<_StringRegister>
-public enum _StringRegister {}
+typealias StringRegister = TypedInt<_StringRegister>
+enum _StringRegister {}
/// Used for consume functions, e.g. character classes
-public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister>
-public enum _ConsumeFunctionRegister {}
+typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister>
+enum _ConsumeFunctionRegister {}
/// Used for assertion functions, e.g. anchors etc
-public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister>
-public enum _AssertionFunctionRegister {}
+typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister>
+enum _AssertionFunctionRegister {}
/// Used for capture transforms, etc
-public typealias TransformRegister = TypedInt<_TransformRegister>
-public enum _TransformRegister {}
+typealias TransformRegister = TypedInt<_TransformRegister>
+enum _TransformRegister {}
/// Used for value-producing matchers
-public typealias MatcherRegister = TypedInt<_MatcherRegister>
-public enum _MatcherRegister {}
+typealias MatcherRegister = TypedInt<_MatcherRegister>
+enum _MatcherRegister {}
/// UNIMPLEMENTED
-public typealias IntRegister = TypedInt<_IntRegister>
-public enum _IntRegister {}
+typealias IntRegister = TypedInt<_IntRegister>
+enum _IntRegister {}
/// UNIMPLEMENTED
-public typealias FloatRegister = TypedInt<_FloatRegister>
-public enum _FloatRegister {}
+typealias FloatRegister = TypedInt<_FloatRegister>
+enum _FloatRegister {}
/// UNIMPLEMENTED
///
/// NOTE: This, along with a position stack, might
/// serve NFA-simulation style execution models
-public typealias PositionRegister = TypedInt<_PositionRegister>
-public enum _PositionRegister {}
+typealias PositionRegister = TypedInt<_PositionRegister>
+enum _PositionRegister {}
-public typealias ValueRegister = TypedInt<_ValueRegister>
-public enum _ValueRegister {}
+typealias ValueRegister = TypedInt<_ValueRegister>
+enum _ValueRegister {}
-public typealias CaptureRegister = TypedInt<_CaptureRegister>
-public enum _CaptureRegister {}
+typealias CaptureRegister = TypedInt<_CaptureRegister>
+enum _CaptureRegister {}
/// UNIMPLEMENTED
-public typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister>
-public enum _InstructionAddressRegister {}
+typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister>
+enum _InstructionAddressRegister {}
/// UNIMPLEMENTED
-public typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister>
-public enum _CallStackAddressRegister {}
+typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister>
+enum _CallStackAddressRegister {}
/// UNIMPLEMENTED
-public typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister>
-public enum _PositionStackAddressRegister {}
+typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister>
+enum _PositionStackAddressRegister {}
/// UNIMPLEMENTED
-public typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister>
-public enum _SavePointAddressRegister {}
+typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister>
+enum _SavePointAddressRegister {}
/// A numbered label
-public typealias LabelId = TypedInt<_LabelId>
-public enum _LabelId {}
+typealias LabelId = TypedInt<_LabelId>
+enum _LabelId {}
/// A numbered function
-public typealias FunctionId = TypedInt<_FunctionId>
-public enum _FunctionId {}
+typealias FunctionId = TypedInt<_FunctionId>
+enum _FunctionId {}
/// A numbered capture
-public typealias CaptureId = TypedInt<_CaptureId>
-public enum _CaptureId {}
+typealias CaptureId = TypedInt<_CaptureId>
+enum _CaptureId {}
diff --git a/Sources/_Unicode/CMakeLists.txt b/Sources/_Unicode/CMakeLists.txt
new file mode 100644
index 000000000..7fdb44628
--- /dev/null
+++ b/Sources/_Unicode/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+add_library(_Unicode
+ CaseConversion.swift
+ CharacterProps.swift
+ Comparison.swift
+ Decoding.swift
+ Encodings.swift
+ Formatting.swift
+ Graphemes.swift
+ NecessaryEvils.swift
+ Normaliation.swift
+ NumberParsing.swift
+ ScalarProps.swift
+ Transcoding.swift
+ UCD.swift
+ Validation.swift)
diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift
index b7c89661d..ccfe85ec7 100644
--- a/Tests/MatchingEngineTests/MatchingEngineTests.swift
+++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift
@@ -13,289 +13,5 @@ import XCTest
@testable import _StringProcessing
-/// Hold context and run variety of ad-hoc tests
-///
-/// TODO: Use these to demonstrate first-order approximation of what
-/// overhead such an engine imposes
-fileprivate struct Test: ExpressibleByStringLiteral {
- var input: String
- var aEater: String
- var manyAEater: String
- var eatUntilA: String
- var eatThroughA: String
-
- // TODO: Have tests explicitly show each step of type binding,
- // input binding, etc.
- var enableTracing: Bool? = nil
-
- /*
-
- until first A
- through first A
- until / through last A
- etc
-
- */
-
- var file: String
- var line: UInt
-
- init(
- _ s: String,
- enableTracing: Bool? = nil,
- file: String = #file,
- line: UInt = #line
- ) {
- self.input = s
- self.aEater = s.first == "A" ? String(s.dropFirst()) : s
- self.manyAEater = String(s.drop(while: { $0 == "A" }))
-
- if let firstIdx = s.firstIndex(of: "A") {
- self.eatUntilA = String(s[firstIdx...])
- self.eatThroughA = String(eatUntilA.dropFirst())
- } else {
- self.eatUntilA = s
- self.eatThroughA = s
- }
-
- self.enableTracing = enableTracing
-
-// self.untilFirstAEater = String(
-// s[(s.firstIndex(where: { $0 == "A" }) ?? s.startIndex)...])
-
-
- self.file = file
- self.line = line
- }
- init(
- stringLiteral: String,
- file: String = #file,
- line: UInt = #line
- ) {
- self.init(stringLiteral, file: file, line: line)
- }
- init(stringLiteral: String) {
- // NOTE: Can't get source location of a literal...
- self.init(stringLiteral)
- }
-
- var slicedInput: (String, Range) {
- let prefix = "aAa prefix â ď¸"
- let suffix = "â ď¸ aAa suffix"
- let outer = prefix + input + suffix
- let range = outer.mapOffsets(
- (lower: prefix.count, upper: -suffix.count))
- return (outer, range)
- }
-
- func check(_ engine: Engine, expected: String) {
- var engine = engine
- if let t = enableTracing {
- engine.enableTracing = t
- }
- let output: String
- let outputFromSlice: String
-
- if let (idx, _) = engine.consume(input) {
- output = String(input[idx...])
- } else {
- output = input
- }
-
- let (outerInput, range) = slicedInput
- if let (idx, _) = engine.consume(outerInput, in: range) {
- outputFromSlice = String(outerInput[idx..? = nil,
- manyAEater: Engine? = nil,
- eatUntilA: Engine? = nil,
- eatThroughA: Engine? = nil
- ) {
- if let engine = aEater {
- check(engine, expected: self.aEater)
- }
- if let engine = manyAEater {
- check(engine, expected: self.manyAEater)
- }
- if let engine = eatUntilA {
- check(engine, expected: self.eatUntilA)
- }
- if let engine = eatThroughA {
- check(engine, expected: self.eatThroughA)
- }
- }
-}
-
-var doPrint = false
-func show(_ s: CustomStringConvertible) {
- if doPrint { print(s) }
-}
-
-func makeEngine(
- _ constructor: (inout Program.Builder) -> ()
-) -> Engine {
- var builder = Program.Builder()
- constructor(&builder)
- let program = try! builder.assemble()
- let engine = Engine(program)
- show(engine)
- return engine
-}
-
-// Eat an A off the front
-//
-// [0] match "A"
-// [1] accept
-//
-let aEater: Engine = {
- makeEngine { builder in
- builder.buildMatch("A")
- builder.buildAccept()
- }
-}()
-
-// Eat many "A"s off the input
-//
-// [0] saveAddress [3] // .accept
-// [1] match "A"
-// [2] goto [1] // match "A"
-// [3] accept
-//
-// NOTE: a save would restore input position, which we
-// actually don't want to do.
-//
-// NOTE: We should compare with a more sophisticated match
-// instruction that can take at least or at most, etc.
-//
-let manyAEater: Engine = {
- makeEngine { builder in
- let accTok = builder.makeAddress()
- let matchTok = builder.makeAddress()
-
- builder.buildSaveAddress(accTok)
- builder.buildMatch("A")
- builder.resolve(matchTok)
- builder.buildBranch(to: matchTok)
- builder.buildAccept()
- builder.resolve(accTok)
- }
-}()
-
-// Eat until you find an A (FAIL if no A)
-//
-// [0] assert #0 #0
-// [1] condBranch #0 [x] // accept
-// [2] advance(1)
-// [3] goto 0
-// [4] accept
-//
-// NOTE: This check-consume-else-branch pattern
-// could be pretty common and might be worth a dedicated
-// instruction.
-let eatUntilA: Engine = {
- makeEngine { builder in
- let reg = builder.makeBoolRegister()
- let accTok = builder.makeAddress()
- let assertTok = builder.makeAddress()
- builder.buildAssert("A", into: reg)
- builder.resolve(assertTok)
- builder.buildCondBranch(reg, to: accTok)
- builder.buildAdvance(1)
- builder.buildBranch(to: assertTok)
- builder.buildAccept()
- builder.resolve(accTok)
- }
-}()
-
-// Eat through the first A (FAIL if no A)
-//
-// [0] assert #0 #0
-// [1] advance(1)
-// [2] condBranch #0 [x] // accept
-// [3] goto 0
-// [4] accept
-let eatThroughA: Engine = {
- makeEngine { builder in
- let reg = builder.makeBoolRegister()
- let accTok = builder.makeAddress()
- let assertTok = builder.makeAddress()
- builder.buildAssert("A", into: reg)
- builder.resolve(assertTok)
- builder.buildAdvance(1)
- builder.buildCondBranch(reg, to: accTok)
- builder.buildBranch(to: assertTok)
- builder.buildAccept()
- builder.resolve(accTok)
- }
-}()
-
-
-
-class MatchingEngineTests: XCTestCase {
-
- func testAEaters() {
- let tests: Array = [
- Test("abc"),
- Test("Abc"),
- Test("AAbc"),
- Test(""),
- Test("A"),
- Test("b"),
- Test("bbbA"),
- Test("bbAbA"),
- ]
-
- for test in tests {
- test.check(aEater: aEater)
- test.check(manyAEater: manyAEater)
- test.check(eatUntilA: eatUntilA)
- test.check(eatThroughA: eatThroughA)
- }
- }
-
- func testThreeLetterRepeat() {
- // Check for a repeated 3-letter sequence, such as in
- // `(...)\1`
- //
- // [0] movePosition(into: %low)
- // [1] advance(3)
- // [2] movePosition(into: %high)
- // [3] matchSlice(%low, %high)
- // [4] accept
- let threeLetterRepeat: Engine = {
- makeEngine { builder in
- let low = builder.makePositionRegister(
- initializingWithCurrentPosition: ())
- builder.buildAdvance(3)
- let high = builder.makePositionRegister(
- initializingWithCurrentPosition: ())
- builder.buildMatchSlice(lower: low, upper: high)
- builder.buildAccept()
- }
- }()
-
- let tests: Array<(String, Bool)> = [
- ("abcabc", true),
- ("abcabc_____", true),
- ("dddddd_____", true),
- ("đĽłđ§ââď¸cđĽłđ§ââď¸c", true),
- ("abccba", false),
- ("abcabb", false),
- ("abcbac", false),
- ("đĽłđ§ââď¸cđĽłđ§ââď¸c", false),
- ]
-
- for (test, expect) in tests {
- let match = threeLetterRepeat.consume(test) != nil
- XCTAssertEqual(expect, match)
- }
- }
-}
+// TODO: Unit tests for the engine itself. Functional testing
+// is handled by regex tests.
diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift
index 9f3cc313b..cc3568c1d 100644
--- a/Tests/RegexTests/CaptureTests.swift
+++ b/Tests/RegexTests/CaptureTests.swift
@@ -142,13 +142,15 @@ func captureTest(
for (input, output) in tests {
let inputRange = input.startIndex..(
+ _ regex: Regex,
+ _ tests: (input: String, call: MatchCall, match: Match?)...
+) {
+ for (input, call, match) in tests {
+ let result: Match?
+ switch call {
+ case .match:
+ result = input.match(regex)?.match
+ case .firstMatch:
+ result = input.firstMatch(of: regex)?.result
}
- XCTAssert(result.match == "4t")
+ XCTAssertEqual(result, match)
+ }
+}
- XCTAssertNil("4".match(regex))
- XCTAssertNil("t".match(regex))
- XCTAssertNil("t4".match(regex))
+extension RegexTests {
- let regex2 = Regex {
- oneOrMore {
+ // TODO: Refactor below into more exhaustive, declarative
+ // tests.
+ func testCustomRegexComponents() {
+ customTest(
+ Regex {
Numbler()
- }
- }
-
- guard let res2 = "ab123c".firstMatch(of: regex2) else {
- XCTFail()
- return
- }
-
- XCTAssertEqual(res2.match, "123")
+ Asciibbler()
+ },
+ ("4t", .match, "4t"),
+ ("4", .match, nil),
+ ("t", .match, nil),
+ ("t x1y z", .firstMatch, "1y"),
+ ("t4", .match, nil))
+
+ customTest(
+ Regex {
+ oneOrMore { Numbler() }
+ },
+ ("ab123c", .firstMatch, "123"),
+ ("abc", .firstMatch, nil),
+ ("55z", .match, nil),
+ ("55z", .firstMatch, "55"))
+
+ customTest(
+ Regex {
+ Numbler()
+ },
+ ("ab123c", .firstMatch, 1),
+ ("abc", .firstMatch, nil),
+ ("55z", .match, nil),
+ ("55z", .firstMatch, 5))
+
+ // TODO: Convert below tests to better infra. Right now
+ // it's hard because `Match` is constrained to be
+ // `Equatable` which tuples cannot be.
let regex3 = Regex {
capture {
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 4dd6392f7..dba72820f 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -23,13 +23,13 @@ extension Executor {
// Consumer -> searcher algorithm
var start = input.startIndex
while true {
- if let (range, caps) = self.executeFlat(
- input: input,
+ if let result = try! self.dynamicMatch(
+ input,
in: start.. String {
+ input.withCString(encodedAs: UTF8.self) { ptr in
+ let endPtr = ptr + input.utf8.count
+ let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
+ if ignoreTrailing {
+ XCTAssertNotEqual(end, endPtr, file: file, line: line)
+ } else {
+ XCTAssertEqual(end, endPtr, file: file, line: line)
+ }
+
+ let rawPtr = UnsafeRawPointer(ptr)
+ let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr)
+ let literal = String(decoding: buffer, as: UTF8.self)
- let (parseContents, parseDelim) = droppingRegexDelimiters(input)
+ let (parseContents, parseDelim) = droppingRegexDelimiters(literal)
XCTAssertEqual(contents, parseContents, file: file, line: line)
XCTAssertEqual(delim, parseDelim, file: file, line: line)
+ return literal
}
+}
- let orig = try! parseWithDelimiters(input)
+/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is
+/// true, there may be additional characters that follow the literal that are
+/// not considered part of it.
+func parseWithDelimitersTest(
+ _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false,
+ file: StaticString = #file, line: UInt = #line
+) {
+ // First try lexing.
+ let literal = delimiterLexingTest(
+ input, ignoreTrailing: ignoreTrailing, file: file, line: line)
+
+ let orig = try! parseWithDelimiters(literal)
let ast = orig.root
guard ast == expecting
|| ast._dump() == expecting._dump() // EQ workaround
@@ -199,6 +223,32 @@ func diagnosticTest(
}
}
+func delimiterLexingDiagnosticTest(
+ _ input: String, _ expected: DelimiterLexError.Kind,
+ syntax: SyntaxOptions = .traditional,
+ file: StaticString = #file, line: UInt = #line
+) {
+ do {
+ _ = try input.withCString { ptr in
+ try lexRegex(start: ptr, end: ptr + input.count)
+ }
+ XCTFail("""
+ Passed, but expected error: \(expected)
+ """, file: file, line: line)
+ } catch let e as DelimiterLexError {
+ guard e.kind == expected else {
+ XCTFail("""
+
+ Expected: \(expected)
+ Actual: \(e.kind)
+ """, file: file, line: line)
+ return
+ }
+ } catch let e {
+ XCTFail("Unexpected error type: \(e)", file: file, line: line)
+ }
+}
+
func libswiftDiagnosticMessageTest(
_ input: String, _ expectedErr: String, file: StaticString = #file,
line: UInt = #line
@@ -329,7 +379,7 @@ extension RegexTests {
parseTest(#"\070"#, scalar("\u{38}"))
parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
- parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
+ parseTest(#"\0707"#, scalar("\u{1C7}"))
parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
@@ -337,13 +387,15 @@ extension RegexTests {
parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
- parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
+ parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
- parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
- parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
- parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
- parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
- parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
+ // TODO: These are treated as octal sequences by PCRE, we should warn and
+ // suggest user prefix with 0.
+ parseTest(#"[\1]"#, charClass("1"))
+ parseTest(#"[\123]"#, charClass("1", "2", "3"))
+ parseTest(#"[\101]"#, charClass("1", "0", "1"))
+ parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
+ parseTest(#"[\181]"#, charClass("1", "8", "1"))
// We take *up to* the first two valid digits for \x. No valid digits is 0.
parseTest(#"\x"#, scalar("\u{0}"))
@@ -492,6 +544,10 @@ extension RegexTests {
#"a\Q \Q \\.\Eb"#,
concat("a", quote(#" \Q \\."#), "b"))
+ // These follow the PCRE behavior.
+ parseTest(#"\Q\\E"#, quote("\\"))
+ parseTest(#"\E"#, "E")
+
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
syntax: .experimental)
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
@@ -797,11 +853,9 @@ extension RegexTests {
)
}
- // TODO: Some of these behaviors are unintuitive, we should likely warn on
- // some of them.
- parseTest(#"\10"#, scalar("\u{8}"))
- parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
- parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
+ parseTest(#"\10"#, backreference(.absolute(10)))
+ parseTest(#"\18"#, backreference(.absolute(18)))
+ parseTest(#"\7777"#, backreference(.absolute(7777)))
parseTest(#"\91"#, backreference(.absolute(91)))
parseTest(
@@ -813,12 +867,13 @@ extension RegexTests {
parseTest(
#"()()()()()()()()()\10()"#,
concat(Array(repeating: capture(empty()), count: 9)
- + [scalar("\u{8}"), capture(empty())]),
+ + [backreference(.absolute(10)), capture(empty())]),
captures: .tuple(Array(repeating: .atom(), count: 10))
)
- parseTest(#"()()\10"#,
- concat(capture(empty()), capture(empty()), scalar("\u{8}")),
- captures: .tuple(.atom(), .atom()))
+ parseTest(#"()()\10"#, concat(
+ capture(empty()), capture(empty()), backreference(.absolute(10))),
+ captures: .tuple(.atom(), .atom())
+ )
// A capture of three empty captures.
let fourCaptures = capture(
@@ -826,8 +881,8 @@ extension RegexTests {
)
parseTest(
// There are 9 capture groups in total here.
- #"((()()())(()()()))\10"#,
- concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
+ #"((()()())(()()()))\10"#, concat(capture(concat(
+ fourCaptures, fourCaptures)), backreference(.absolute(10))),
captures: .tuple(Array(repeating: .atom(), count: 9))
)
parseTest(
@@ -852,7 +907,7 @@ extension RegexTests {
concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
captures: .tuple(Array(repeating: .atom(), count: 40))
)
- parseTest(#"\40"#, scalar(" "))
+ parseTest(#"\40"#, backreference(.absolute(40)))
parseTest(
String(repeating: "()", count: 40) + #"\40"#,
concat(Array(repeating: capture(empty()), count: 40)
@@ -862,7 +917,7 @@ extension RegexTests {
parseTest(#"\7"#, backreference(.absolute(7)))
- parseTest(#"\11"#, scalar("\u{9}"))
+ parseTest(#"\11"#, backreference(.absolute(11)))
parseTest(
String(repeating: "()", count: 11) + #"\11"#,
concat(Array(repeating: capture(empty()), count: 11)
@@ -876,12 +931,11 @@ extension RegexTests {
captures: .tuple(Array(repeating: .atom(), count: 11))
)
- parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
- parseTest(#"\113"#, scalar("\u{4B}"))
- parseTest(#"\377"#, scalar("\u{FF}"))
+ parseTest(#"\0113"#, scalar("\u{4B}"))
+ parseTest(#"\113"#, backreference(.absolute(113)))
+ parseTest(#"\377"#, backreference(.absolute(377)))
parseTest(#"\81"#, backreference(.absolute(81)))
-
parseTest(#"\g1"#, backreference(.absolute(1)))
parseTest(#"\g001"#, backreference(.absolute(1)))
parseTest(#"\g52"#, backreference(.absolute(52)))
@@ -999,13 +1053,13 @@ extension RegexTests {
parseTest(#"\p{sc=grek}"#, prop(.script(.greek)))
parseTest(#"\p{sc=isGreek}"#, prop(.script(.greek)))
- parseTest(#"\p{Greek}"#, prop(.script(.greek)))
- parseTest(#"\p{isGreek}"#, prop(.script(.greek)))
+ parseTest(#"\p{Greek}"#, prop(.scriptExtension(.greek)))
+ parseTest(#"\p{isGreek}"#, prop(.scriptExtension(.greek)))
parseTest(#"\P{Script=Latn}"#, prop(.script(.latin), inverted: true))
parseTest(#"\p{script=zzzz}"#, prop(.script(.unknown)))
parseTest(#"\p{ISscript=iszzzz}"#, prop(.script(.unknown)))
parseTest(#"\p{scx=bamum}"#, prop(.scriptExtension(.bamum)))
- parseTest(#"\p{ISBAMUM}"#, prop(.script(.bamum)))
+ parseTest(#"\p{ISBAMUM}"#, prop(.scriptExtension(.bamum)))
parseTest(#"\p{alpha}"#, prop(.binary(.alphabetic)))
parseTest(#"\p{DEP}"#, prop(.binary(.deprecated)))
@@ -1443,6 +1497,9 @@ extension RegexTests {
parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))
parseWithDelimitersTest("#|a b|#", concat("a", "b"))
+ parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
+ parseWithDelimitersTest("rx'a b'", concat("a", "b"))
+
parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
parseWithDelimitersTest(
"#|(?-x)[a b]|#", changeMatchingOptions(
@@ -1468,6 +1525,71 @@ extension RegexTests {
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
+ parseWithDelimitersTest(#"re'đĽđŠđ°'"#, concat("đĽ", "đŠđ°"))
+ parseWithDelimitersTest(#"re'\đĽâ '"#, concat("đĽ", "â "))
+
+ // Printable ASCII characters.
+ delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
+
+ // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
+ // if it's clear that it's part of the regex syntax.
+
+ parseWithDelimitersTest(
+ #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
+ parseWithDelimitersTest(
+ #"re'(?'a_bcA0-c1A'x*)'"#,
+ balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
+
+ parseWithDelimitersTest(
+ #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b"))))
+
+ parseWithDelimitersTest(
+ #"re'(?('a_bcA0')x|y)'"#, conditional(
+ .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))
+ parseWithDelimitersTest(
+ #"re'(?('+20')\')'"#, conditional(
+ .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()))
+
+ parseWithDelimitersTest(
+ #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))))
+ parseWithDelimitersTest(
+ #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1))
+
+ parseWithDelimitersTest(
+ #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))))
+ parseWithDelimitersTest(
+ #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"))
+
+ parseWithDelimitersTest(
+ #"re'(?C'a*b\c đĽ_ ;')'"#, pcreCallout(.string(#"a*b\c đĽ_ ;"#)))
+
+ // Fine, because we don't end up skipping.
+ delimiterLexingTest(#"re'(?'"#)
+ delimiterLexingTest(#"re'(?('"#)
+ delimiterLexingTest(#"re'\k'"#)
+ delimiterLexingTest(#"re'\g'"#)
+ delimiterLexingTest(#"re'(?C'"#)
+
+ // Not a valid group name, but we can still skip over it.
+ delimiterLexingTest(#"re'(?'đĽ')'"#)
+
+ // Escaped, so don't skip. These will ignore the ending `'` as we've already
+ // closed the literal.
+ parseWithDelimitersTest(
+ #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
+ )
+ parseWithDelimitersTest(
+ #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
+ )
+ parseWithDelimitersTest(
+ #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
+ )
+ parseWithDelimitersTest(
+ #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
+ )
+ delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
+ delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
+
// MARK: Parse not-equal
// Make sure dumping output correctly reflects differences in AST.
@@ -1749,6 +1871,10 @@ extension RegexTests {
diagnosticTest("(?"))
diagnosticTest("(?", .expected(")"))
+ // MARK: Bad escapes
+
+ diagnosticTest("\\", .expectedEscape)
+
// MARK: Text Segment options
diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)
@@ -1770,6 +1896,12 @@ extension RegexTests {
diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName))
diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName))
+ // TODO: It might be better if tried to consume up to the closing `'` and
+ // diagnosed an invalid group name based on that.
+ diagnosticTest(#"(?'abc ')"#, .expected("'"))
+
+ diagnosticTest("(?'đĽ')", .identifierMustBeAlphaNumeric(.groupName))
+
diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName))
diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName))
diagnosticTest(#"(?'a-b-c')"#, .expected("'"))
@@ -1882,6 +2014,27 @@ extension RegexTests {
diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal))
}
+ func testDelimiterLexingErrors() {
+
+ // MARK: Printable ASCII
+
+ delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
+ for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
+ delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
+ }
+ delimiterLexingDiagnosticTest("re'\n'", .endOfString)
+ delimiterLexingDiagnosticTest("re'\r'", .endOfString)
+ delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
+
+ // MARK: Delimiter skipping
+
+ delimiterLexingDiagnosticTest("re'(?''", .endOfString)
+ delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
+ delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
+ delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
+ delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
+ }
+
func testlibswiftDiagnostics() {
libswiftDiagnosticMessageTest(
"#/[x*/#", "cannot parse regular expression: expected ']'")
diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift
index 554ef905f..d78ff04e5 100644
--- a/Tests/RegexTests/RegexDSLTests.swift
+++ b/Tests/RegexTests/RegexDSLTests.swift
@@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase {
Anchor.endOfLine
}
+ try _testDSLCaptures(
+ ("Cafe\u{301}", nil),
+ ("Cafe", "Cafe"),
+ matchType: Substring.self, ==)
+ {
+ oneOrMore(.word)
+ UnicodeScalar("e")
+ Anchor.textSegmentBoundary
+ }
+
try _testDSLCaptures(
("aaaaa1", "aaaaa1"),
("aaaaa2", nil),
@@ -642,6 +652,59 @@ class RegexDSLTests: XCTestCase {
}
}
}
+
+ func testSemanticVersionExample() {
+ struct SemanticVersion: Equatable {
+ var major: Int
+ var minor: Int
+ var patch: Int
+ var dev: String?
+ }
+ struct SemanticVersionParser: CustomRegexComponent {
+ typealias Match = SemanticVersion
+ func match(
+ _ input: String,
+ startingAt index: String.Index,
+ in bounds: Range
+ ) -> (upperBound: String.Index, match: SemanticVersion)? {
+ let regex = Regex {
+ tryCapture(oneOrMore(.digit)) { Int($0) }
+ "."
+ tryCapture(oneOrMore(.digit)) { Int($0) }
+ optionally {
+ "."
+ tryCapture(oneOrMore(.digit)) { Int($0) }
+ }
+ optionally {
+ "-"
+ capture(oneOrMore(.word))
+ }
+ }
+
+ guard let match = input[index..