From 05ce1d3345c7bd50cac79cef68309f69bbcf4e93 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Sun, 23 Jan 2022 11:40:06 -0800 Subject: [PATCH 01/51] build: add a CMake based build This is a first approximation of a CMake based build for this repository. The intent here is to localize the build rules for the repository to allow it to be consumed during the build of the toolchain. This allows the source list to be maintained in the repository as the source of truth rather than be explicitly listed in the swift repository. For general purpose development, the SPM based build is recommended. Unless there is a specific need for the tests to be included, testing should be done via the SPM build. This change is sufficient to build the content though does not perform the install or export steps which will be required to consume the results in the Swift build. Example invocation: ~~~ cmake -B S:\b\16 ^ -D CMAKE_BUILD_TYPE=Release ^ -D ArgumentParser_DIR=S:\b\10\cmake\modules ^ -G Ninja ^ -S S:\SourceCache\swift-experimental-string-processing cmake --build S:\b\16 ~~~ --- CMakeLists.txt | 17 +++++++++ Sources/CMakeLists.txt | 6 +++ Sources/Prototypes/CMakeLists.txt | 18 +++++++++ Sources/VariadicsGenerator/CMakeLists.txt | 7 ++++ Sources/_MatchingEngine/CMakeLists.txt | 46 +++++++++++++++++++++++ Sources/_StringProcessing/CMakeLists.txt | 42 +++++++++++++++++++++ Sources/_Unicode/CMakeLists.txt | 16 ++++++++ 7 files changed, 152 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 Sources/CMakeLists.txt create mode 100644 Sources/Prototypes/CMakeLists.txt create mode 100644 Sources/VariadicsGenerator/CMakeLists.txt create mode 100644 Sources/_MatchingEngine/CMakeLists.txt create mode 100644 Sources/_StringProcessing/CMakeLists.txt create mode 100644 Sources/_Unicode/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..ad640f43a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,17 @@ + +cmake_minimum_required(VERSION 3.18) +project(SwiftExperimentalStringProcessing + LANGUAGES Swift) + +if(CMAKE_SYSTEM_NAME STREQUAL Windows OR CMAKE_SYSTEM_NAME STREQUAL Darwin) + option(BUILD_SHARED_LIBS "Build shared libraries by default" YES) +endif() + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_Swift_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/swift) + +find_package(ArgumentParser CONFIG) + +add_subdirectory(Sources) diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt new file mode 100644 index 000000000..19feadbd9 --- /dev/null +++ b/Sources/CMakeLists.txt @@ -0,0 +1,6 @@ + +add_subdirectory(_Unicode) +add_subdirectory(_MatchingEngine) +add_subdirectory(_StringProcessing) +add_subdirectory(Prototypes) +add_subdirectory(VariadicsGenerator) diff --git a/Sources/Prototypes/CMakeLists.txt b/Sources/Prototypes/CMakeLists.txt new file mode 100644 index 000000000..60768f5a3 --- /dev/null +++ b/Sources/Prototypes/CMakeLists.txt @@ -0,0 +1,18 @@ + +add_library(Prototypes + Combinators/Combinators.swift + PEG/PEG.swift + PEG/PEGCode.swift + PEG/PEGCompile.swift + PEG/PEGCore.swift + PEG/PEGInterpreter.swift + PEG/PEGTranspile.swift + PEG/PEGVM.swift + PEG/PEGVMExecute.swift + PEG/Printing.swift + PTCaRet/Interpreter.swift + PTCaRet/PTCaRet.swift + TourOfTypes/CharacterClass.swift + TourOfTypes/Literal.swift) +target_link_libraries(Prototypes PUBLIC + _MatchingEngine) diff --git 
a/Sources/VariadicsGenerator/CMakeLists.txt b/Sources/VariadicsGenerator/CMakeLists.txt new file mode 100644 index 000000000..8ea543970 --- /dev/null +++ b/Sources/VariadicsGenerator/CMakeLists.txt @@ -0,0 +1,7 @@ + +add_executable(VariadicsGenerator + VariadicsGenerator.swift) +target_compile_options(VariadicsGenerator PRIVATE + -parse-as-library) +target_link_libraries(VariadicsGenerator PUBLIC + ArgumentParser) diff --git a/Sources/_MatchingEngine/CMakeLists.txt b/Sources/_MatchingEngine/CMakeLists.txt new file mode 100644 index 000000000..f7cb97ce3 --- /dev/null +++ b/Sources/_MatchingEngine/CMakeLists.txt @@ -0,0 +1,46 @@ + +add_library(_MatchingEngine + Engine/Backtracking.swift + Engine/Builder.swift + Engine/Capture.swift + Engine/Consume.swift + Engine/Engine.swift + Engine/InstPayload.swift + Engine/Instruction.swift + Engine/Processor.swift + Engine/Program.swift + Engine/Registers.swift + Engine/Tracing.swift + Regex/AST/AST.swift + Regex/AST/ASTAction.swift + Regex/AST/ASTProtocols.swift + Regex/AST/Atom.swift + Regex/AST/Conditional.swift + Regex/AST/CustomCharClass.swift + Regex/AST/Group.swift + Regex/AST/MatchingOptions.swift + Regex/AST/Quantification.swift + Regex/Parse/CaptureStructure.swift + Regex/Parse/CharacterPropertyClassification.swift + Regex/Parse/Diagnostics.swift + Regex/Parse/LexicalAnalysis.swift + Regex/Parse/Mocking.swift + Regex/Parse/Parse.swift + Regex/Parse/Source.swift + Regex/Parse/SourceLocation.swift + Regex/Parse/SyntaxOptions.swift + Regex/Printing/DumpAST.swift + Regex/Printing/PrettyPrinter.swift + Regex/Printing/PrintAsCanonical.swift + Regex/Printing/PrintAsPattern.swift + Regex/Printing/RenderRanges.swift + Utility/AllScalars.swift + Utility/Formatting.swift + Utility/Misc.swift + Utility/MissingUnicode.swift + Utility/Protocols.swift + Utility/TypeConstruction.swift + Utility/TypedIndex.swift + Utility/TypedInt.swift) +target_compile_options(_MatchingEngine PRIVATE + -enable-library-evolution) diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt new file mode 100644 index 000000000..c20dcc240 --- /dev/null +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -0,0 +1,42 @@ + +add_library(_StringProcessing + Algorithms/Algorithms/Contains.swift + Algorithms/Algorithms/FirstRange.swift + Algorithms/Algorithms/Ranges.swift + Algorithms/Algorithms/Replace.swift + Algorithms/Algorithms/Split.swift + Algorithms/Algorithms/StartsWith.swift + Algorithms/Algorithms/Trim.swift + Algorithms/Consumers/CollectionConsumer.swift + Algorithms/Consumers/FixedPatternConsumer.swift + Algorithms/Consumers/ManyConsumer.swift + Algorithms/Consumers/PredicateConsumer.swift + Algorithms/Consumers/RegexConsumer.swift + Algorithms/Searchers/CollectionSearcher.swift + Algorithms/Searchers/ConsumerSearcher.swift + Algorithms/Searchers/NaivePatternSearcher.swift + Algorithms/Searchers/PatternOrEmpty.swift + Algorithms/Searchers/PredicateSearcher.swift + Algorithms/Searchers/TwoWaySearcher.swift + Algorithms/Searchers/ZSearcher.swift + ASTBuilder.swift + Capture.swift + CharacterClass.swift + Compiler.swift + ConsumerInterface.swift + Executor.swift + Legacy/HareVM.swift + Legacy/LegacyCompile.swift + Legacy/RECode.swift + Legacy/TortoiseVM.swift + Legacy/VirtualMachine.swift + RegexDSL/Builder.swift + RegexDSL/Concatenation.swift + RegexDSL/Core.swift + RegexDSL/DSL.swift + RegexDSL/DSLCapture.swift + RegexDSL/DynamicCaptures.swift) +target_compile_options(_StringProcessing PRIVATE + -enable-library-evolution) 
+target_link_libraries(_StringProcessing PUBLIC + _MatchingEngine) diff --git a/Sources/_Unicode/CMakeLists.txt b/Sources/_Unicode/CMakeLists.txt new file mode 100644 index 000000000..7fdb44628 --- /dev/null +++ b/Sources/_Unicode/CMakeLists.txt @@ -0,0 +1,16 @@ + +add_library(_Unicode + CaseConversion.swift + CharacterProps.swift + Comparison.swift + Decoding.swift + Encodings.swift + Formatting.swift + Graphemes.swift + NecessaryEvils.swift + Normaliation.swift + NumberParsing.swift + ScalarProps.swift + Transcoding.swift + UCD.swift + Validation.swift) From 04ad64ab7e936cb7427f5f00df017ab86472a3b3 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 17 Feb 2022 17:15:54 +0000 Subject: [PATCH 02/51] Add a couple of quoted test cases --- Tests/RegexTests/ParseTests.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4872e256e..e70939d53 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -488,6 +488,10 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) + // These follow the PCRE behavior. + parseTest(#"\Q\\E"#, quote("\\")) + parseTest(#"\E"#, "E") + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), From 9d5529ee05858ccb49efeec1ba1a0cf50ee9046f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 17 Feb 2022 17:15:55 +0000 Subject: [PATCH 03/51] Update \DDD parsing Previously we followed PCRE's parsing of this syntax such that it may either be an octal sequence or backreference depending on a list of heuristics. However this model is complicated and not particularly intuitive, especially as there are other engines that disambiguate using subtly different rules. Instead, always parse `\DDD` as a backreference, unless it begins with `0`, in which case it is an octal sequence. This matches ICU and Java's behavior. Once we start validating group references, we can then start emitting an error on invalid backreferences using this syntax, and suggest prefixing with 0 if an octal sequence is desired. --- .../Regex/Parse/LexicalAnalysis.swift | 45 ++++++----------- Tests/RegexTests/MatchTests.swift | 5 +- Tests/RegexTests/ParseTests.swift | 48 +++++++++---------- 3 files changed, 39 insertions(+), 59 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 727727ce1..dd785f12d 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -279,7 +279,7 @@ extension Source { /// | 'x' HexDigit{0...2} /// | 'U' HexDigit{8} /// | 'o{' OctalDigit{1...} '}' - /// | OctalDigit{1...3} + /// | '0' OctalDigit{0...3} /// mutating func expectUnicodeScalar( escapedCharacter base: Character @@ -313,13 +313,14 @@ extension Source { let str = try src.lexUntil(eating: "}").value return try Source.validateUnicodeScalar(str, .octal) - case let c where c.isOctalDigit: - // We can read *up to* 2 more octal digits per PCRE. - // FIXME: ICU can read up to 3 octal digits if the leading digit is 0, - // we should have a parser mode to switch. - let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit) - let str = String(c) + (nextDigits?.string ?? "") - return try Source.validateUnicodeScalar(str, .octal) + case "0": + // We can read *up to* 3 more octal digits. 
+ // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { + return Unicode.Scalar(0) + } + return try Source.validateUnicodeScalar(digits.string, .octal) default: fatalError("Unexpected scalar start") @@ -1341,26 +1342,10 @@ extension Source { return nil } - // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE - // it is treated as a backreference if its first digit is not 0 (as that - // is always octal) and one of the following holds: - // - // - It's 0 < n < 10 (as octal would be pointless here) - // - Its first digit is 8 or 9 (as not valid octal) - // - There have been as many prior groups as the reference. - // - // Oniguruma follows the same rules except the second one. e.g \81 and - // \91 are instead treated as literal 81 and 91 respectively. - // TODO: If we want a strict Oniguruma mode, we'll need to add a check - // here. + // Backslash followed by a non-0 digit character is a backreference. if firstChar != "0", let numAndLoc = try src.lexNumber() { - let num = numAndLoc.value - let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location) - if num < 10 || firstChar == "8" || firstChar == "9" || - context.isPriorGroupRef(ref.kind) { - return .backreference(ref) - } - return nil + return .backreference(.init( + .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) } return nil } @@ -1497,10 +1482,8 @@ extension Source { } switch char { - // Hexadecimal and octal unicode scalars. This must be done after - // backreference lexing due to the ambiguity with \nnn. - case let c where c.isOctalDigit: fallthrough - case "u", "x", "U", "o": + // Hexadecimal and octal unicode scalars. + case "u", "x", "U", "o", "0": return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..2c07f4c79 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -261,7 +261,7 @@ extension RegexTests { firstMatchTest(#"\070"#, input: "1238xyz", match: "8") firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A") firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08") - firstMatchTest(#"\0707"#, input: "12387xyz", match: "87") + firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) @@ -1021,9 +1021,6 @@ extension RegexTests { firstMatchTest( #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#, input: "aaaaaaaaabbc", match: "aaaaaaaaabb") - firstMatchTest( - #"(.)\10"#, - input: "a\u{8}b", match: "a\u{8}") firstMatchTest( #"(.)\g001"#, diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e70939d53..ce070cc38 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -325,7 +325,7 @@ extension RegexTests { parseTest(#"\070"#, scalar("\u{38}")) parseTest(#"\07A"#, concat(scalar("\u{7}"), "A")) parseTest(#"\08"#, concat(scalar("\u{0}"), "8")) - parseTest(#"\0707"#, concat(scalar("\u{38}"), "7")) + parseTest(#"\0707"#, scalar("\u{1C7}")) parseTest(#"[\0]"#, charClass(scalar_m("\u{0}"))) parseTest(#"[\01]"#, charClass(scalar_m("\u{1}"))) @@ -333,13 +333,15 @@ extension RegexTests { parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A")) parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) - parseTest(#"[\0707]"#, 
charClass(scalar_m("\u{38}"), "7")) + parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - parseTest(#"[\1]"#, charClass(scalar_m("\u{1}"))) - parseTest(#"[\123]"#, charClass(scalar_m("\u{53}"))) - parseTest(#"[\101]"#, charClass(scalar_m("\u{41}"))) - parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7")) - parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1")) + // TODO: These are treated as octal sequences by PCRE, we should warn and + // suggest user prefix with 0. + parseTest(#"[\1]"#, charClass("1")) + parseTest(#"[\123]"#, charClass("1", "2", "3")) + parseTest(#"[\101]"#, charClass("1", "0", "1")) + parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) + parseTest(#"[\181]"#, charClass("1", "8", "1")) // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) @@ -797,11 +799,9 @@ extension RegexTests { ) } - // TODO: Some of these behaviors are unintuitive, we should likely warn on - // some of them. - parseTest(#"\10"#, scalar("\u{8}")) - parseTest(#"\18"#, concat(scalar("\u{1}"), "8")) - parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7")) + parseTest(#"\10"#, backreference(.absolute(10))) + parseTest(#"\18"#, backreference(.absolute(18))) + parseTest(#"\7777"#, backreference(.absolute(7777))) parseTest(#"\91"#, backreference(.absolute(91))) parseTest( @@ -813,12 +813,13 @@ extension RegexTests { parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [scalar("\u{8}"), capture(empty())]), + + [backreference(.absolute(10)), capture(empty())]), captures: .tuple(Array(repeating: .atom(), count: 10)) ) - parseTest(#"()()\10"#, - concat(capture(empty()), capture(empty()), scalar("\u{8}")), - captures: .tuple(.atom(), .atom())) + parseTest(#"()()\10"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(10))), + captures: .tuple(.atom(), .atom()) + ) // A capture of three empty captures. let fourCaptures = capture( @@ -826,8 +827,8 @@ extension RegexTests { ) parseTest( // There are 9 capture groups in total here. 
- #"((()()())(()()()))\10"#, - concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")), + #"((()()())(()()()))\10"#, concat(capture(concat( + fourCaptures, fourCaptures)), backreference(.absolute(10))), captures: .tuple(Array(repeating: .atom(), count: 9)) ) parseTest( @@ -852,7 +853,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .tuple(Array(repeating: .atom(), count: 40)) ) - parseTest(#"\40"#, scalar(" ")) + parseTest(#"\40"#, backreference(.absolute(40))) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -862,7 +863,7 @@ extension RegexTests { parseTest(#"\7"#, backreference(.absolute(7))) - parseTest(#"\11"#, scalar("\u{9}")) + parseTest(#"\11"#, backreference(.absolute(11))) parseTest( String(repeating: "()", count: 11) + #"\11"#, concat(Array(repeating: capture(empty()), count: 11) @@ -876,12 +877,11 @@ extension RegexTests { captures: .tuple(Array(repeating: .atom(), count: 11)) ) - parseTest(#"\0113"#, concat(scalar("\u{9}"), "3")) - parseTest(#"\113"#, scalar("\u{4B}")) - parseTest(#"\377"#, scalar("\u{FF}")) + parseTest(#"\0113"#, scalar("\u{4B}")) + parseTest(#"\113"#, backreference(.absolute(113))) + parseTest(#"\377"#, backreference(.absolute(377))) parseTest(#"\81"#, backreference(.absolute(81))) - parseTest(#"\g1"#, backreference(.absolute(1))) parseTest(#"\g001"#, backreference(.absolute(1))) parseTest(#"\g52"#, backreference(.absolute(52))) From 1beb1f02655a1d1b5faa0654e29a1c3fd34ee083 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 18 Feb 2022 15:23:48 +0000 Subject: [PATCH 04/51] Start of Regex Syntax Pitch --- Documentation/Evolution/RegexSyntax.md | 698 +++++++++++++++++++++++++ 1 file changed, 698 insertions(+) create mode 100644 Documentation/Evolution/RegexSyntax.md diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md new file mode 100644 index 000000000..4bd1bd553 --- /dev/null +++ b/Documentation/Evolution/RegexSyntax.md @@ -0,0 +1,698 @@ +# Regular Expression Syntax + +- Authors: Hamish Knight, Michael Ilseman + +## Introduction + +We aim to parse a superset of the syntax accepted by a variety of popular regular expression engines. + +**TODO(Michael): Elaborate** + +## Engines supported + +We aim to implement a syntactic superset of: + +- [PCRE 2][pcre2-syntax], an "industry standard" of sorts, and a rough superset of Perl, Python, etc. +- [Oniguruma][oniguruma-syntax], an internationalization-oriented engine with some modern features +- [ICU][icu-syntax], used by NSRegularExpression, a Unicode-focused engine. +- [.NET][.net-syntax]'s regular expressions, which support delimiter-balancing and some interesting minor details on conditional patterns. +- **TODO: List Java here? It doesn't really add any more syntax than the above other than `\p{javaLowerCase}`** + +We also intend to achieve at least Level 1 (**TODO: do we want to promise Level 2?**) [UTS#18][uts18] conformance, which specifies regular expression matching semantics without mandating any particular syntax. However we can infer syntactic feature sets from its guidance. + +**TODO(Michael): Rework and expand prose** + +## Detailed Design + +We're proposing the following regular expression syntactic superset for Swift. + +### Top-level regular expression + +``` +Regex -> GlobalMatchingOptionSequence? 
RegexNode +RegexNode -> '' | Alternation +``` + +A top-level regular expression may consist of a sequence of global matching options followed by a `RegexNode`, which is the recursive part of the grammar that may be nested within e.g a group. A regex node may be empty, which is the null pattern that always matches, but does not advance the input. + +### Alternation + +``` +Alternation -> Concatenation ('|' Concatenation)* +``` + +The `|` operator denotes what is formally called an alternation, or a choice between alternatives. Any number of alternatives may appear, including empty alternatives. This operator has the lowest precedence of all operators in a regex literal. + +### Concatenation + +``` +Concatenation -> (!'|' !')' ConcatComponent)* +ConcatComponent -> Trivia | Quote | Quantification +``` + +Implicitly denoted by adjacent expressions, a concatenation matches against a sequence of regular expression patterns. This has a higher precedence than an alternation, so e.g `abc|def` matches against `abc` or `def`. The `ConcatComponent` token varies across engine, but at least matches some form of trivia, e.g comments, quoted sequences e.g `\Q...\E`, and a potentially quantified expression. + +### Quantification + +``` +Quantification -> QuantOperand Quantifier? +Quantifier -> QuantAmount QuantKind? +QuantAmount -> '?' | '*' | '+' | '{' Range '}' +QuantKind -> '?' | '+' +Range -> ',' | ',' ? | + +QuantOperand -> AbsentFunction | Atom | Conditional | CustomCharClass | Group +``` + +A quantification consists of an operand optionally followed by a quantifier that specifier how many times it may be matched. An operand without a quantifier is matched once. + +The quantifiers supported are: + +- `?`: 0 or 1 matches +- `*`: 0 or more matches +- `+`: 1 or more matches +- `{n,m}`: Between `n` and `m` (inclusive) matches +- `{n,}`: `n` or more matches +- `{,m}`: Up to `m` matches +- `{n}`: Exactly `n` matches + +A quantifier may optionally be followed by `?` or `+`, which adjust its semantics. If neither are specified, by default the quantification happens *eagerly*, meaning that it will try to maximize the number of matches made. However, if `?` is specified, quantification happens *reluctantly*, meaning that the number of matches will instead be minimized. If `+` is specified, *possessive* matching occurs, which is eager matching with the additional semantic that it may not be backtracked into to try a different number of matches. + +### Atom + +``` +Atom -> Anchor + | Backreference + | BacktrackingDirective + | BuiltinCharClass + | Callout + | CharacterProperty + | EscapeSequence + | NamedCharacter + | Subpattern + | UniScalar + | '\'? +``` + +Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as meta-characters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A meta-character may be treated as literal by preceding it with a backslash. Other characters may also be preceded with a backslash, but it has no effect, e.g `\I` is literal `I`. + +### Groups + +``` +Group -> GroupStart RegexNode ')' +GroupStart -> '(' GroupKind | '(' +GroupKind -> '' | '?' BasicGroupKind | '*' PCRE2GroupKind ':' + +BasicGroupKind -> ':' | '|' | '>' | '=' | '!' 
| '*' | '<=' | ' 'atomic' + | 'pla' | 'positive_lookahead' + | 'nla' | 'negative_lookahead' + | 'plb' | 'positive_lookbehind' + | 'nlb' | 'negative_lookbehind' + | 'napla' | 'non_atomic_positive_lookahead' + | 'naplb' | 'non_atomic_positive_lookbehind' + | 'sr' | 'script_run' + | 'asr' | 'atomic_script_run' + +NamedGroup -> 'P<' GroupNameBody '>' + | '<' GroupNameBody '>' + | "'" GroupNameBody "'" + +GroupNameBody -> Identifier | BalancingGroupBody + +Identifier -> [\w--\d] \w* +``` + +Groups define a new scope within which a recursive regular expression pattern may occur. Groups have different semantics depending on how they are introduced, some may capture the nested match, some may match against the input without advancing, some may change the matching options set in the new scope, etc. + +Groups may be named, the characters of which may be any letter or number characters (or `_`). However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. + +Groups may be used to change the matching options present within their scope, see the *Matching options* section. + +Note there are additional constructs that may syntactically appear similar to groups, but are distinct. See the *group-like atoms* section. + + +#### Lookahead and lookbehind + +- `(?=` specifies a lookahead that attempts to match against the group body, but does not advance. +- `(?!` specifies a negative lookahead that ensures the group body does not match, and does not advance. +- `(?<=` specifies a lookbehind that attempts to match the group body against the input before the current position. Does not advance the input. +- `(?!<` specifies a negative lookbehind that ensures the group body does not match the input before the current position. Does not advance the input. + +PCRE2 defines explicitly spelled out versions of the syntax, e.g `(*negative_lookbehind:)`. + +#### Atomic groups + +An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This the same semantics as a possessive quantifier, but applies more generally to any regex pattern. + +#### Script runs + +A script run e.g `(*script_run:...)` specifies that the contents must match against a sequence of characters from the same Unicode script, e.g Latin or Greek. + +#### Balancing groups + +``` +BalancingGroupBody -> Identifier? '-' Identifier +``` + +Introduced by .NET, balancing groups extend the `GroupNameBody` syntax to support the ability to refer to a prior group. Upon matching, the prior group is deleted, and any intermediate matched input becomes the capture of the current group. + +### Matching options + +``` +MatchingOptionSeq -> '^' MatchingOption* + | MatchingOption+ + | MatchingOption* '-' MatchingOption* + +MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' +``` + +A matching option sequence may be used as a group specifier, and denotes a change in matching options for the scope of that group. For example `(?x:a b c)` enables extended syntax for `a b c`. A matching option sequence may be part of an "isolated group" which has an implicit scope that wraps the remaining elements of the current group. For example, `(?x)a b c` also enables extended syntax for `a b c`. + +We support all the matching options accepted by PCRE, ICU, and Oniguruma. In addition, we accept some matching options unique to our matching engine. 
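For illustration, the difference between a scoped option group and an isolated group can be seen with case-insensitivity. The sketch below uses `NSRegularExpression` (ICU) only because it already accepts the `(?i:...)` and `(?i)` spellings; the `matches` helper is just a local convenience, and nothing here is specific to the proposed engine.

```swift
import Foundation

// `(?i:...)` scopes case-insensitivity to the group; an isolated `(?i)`
// applies to everything that follows it within the enclosing group
// (here, the rest of the pattern).
let scoped   = try NSRegularExpression(pattern: "(?i:swift) regex")
let isolated = try NSRegularExpression(pattern: "swift (?i)regex")

func matches(_ regex: NSRegularExpression, _ input: String) -> Bool {
  let range = NSRange(input.startIndex..., in: input)
  return regex.firstMatch(in: input, range: range) != nil
}

print(matches(scoped,   "SWIFT regex")) // true:  only "swift" is case-insensitive
print(matches(scoped,   "swift REGEX")) // false: " regex" remains case-sensitive
print(matches(isolated, "swift REGEX")) // true:  everything after (?i) is insensitive
print(matches(isolated, "SWIFT regex")) // false: "swift " before (?i) is sensitive
```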
+ +#### PCRE options + +- `i`: Case insensitive matching +- `J`: Allows multiple groups to share the same name, which is otherwise forbidden +- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string +- `n`: Disables capturing of `(...)` groups. Named capture groups must be used instead. +- `s`: Changes `.` to match any character, including newlines. +- `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy. +- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See the *trivia* section for more info. + +#### ICU options + +- `w`: Enables the Unicode interpretation of word boundaries `\b`. + +#### Oniguruma options + +- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]` +- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]` +- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b` +- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`) +- `y{g}`, `y{w}`: Changes the meaning of `\X`, `\y`, `\Y`. These are mutually exclusive options, with `y{g}` specifying extended grapheme cluster mode, and `y{w}` specifying word mode. + +#### Swift options + +These options are specific to the Swift regex matching engine and control the semantic level at which matching takes place. + +- `X`: Grapheme cluster matching +- `u`: Unicode scalar matching +- `b`: Byte matching + + +### Anchors + +``` +Anchor -> '^' | '$' | '\A' | '\b' | '\B' | '\G' | '\y' | '\Y' | '\z' | '\Z' +``` + +Anchors match against a certain position in the input rather than on a particular character of the input. + +- `^`: Matches at the start of a line. +- `$`: Matches at the end of a line. +- `\A`: Matches at the very start of the input string. +- `\Z`: Matches at the very end of the input string, in addition to before a newline at the very end of the input string. +- `\z`: Like `\Z`, but only matches at the very end of the input string. +- `\G`: Like `\A`, but also matches against the start position of where matching resumes in global matching mode (e.g `\Gab` matches twice in `abab`, `\Aab` would only match once). +- `\b` matches a boundary between a word character and a non-word character. The definitions of which vary depending on matching engine. +- `\B` matches a non-word-boundary. +- `\y` matches a text segment boundary, the definition of which varies based on the `y{w}` and `y{g}` matching option. +- `\Y` matches a non-text-segment-boundary. + +### Unicode scalars + +``` +UniScalar -> '\u{' HexDigit{1...} '}' + | '\u' HexDigit{4} + | '\x{' HexDigit{1...} '}' + | '\x' HexDigit{0...2} + | '\U' HexDigit{8} + | '\o{' OctalDigit{1...} '}' + | '\0' OctalDigit{0...3} + +HexDigit -> [0-9a-zA-Z] +OctalDigit -> [0-7] +``` + +These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. + +### Escape sequences + +``` +EscapeSequence -> '\a' | '\b' | '\c' | '\e' | '\f' | '\n' | '\r' | '\t' +``` + +These escape sequences denote a specific character. + +- `\a`: The alert (bell) character `U+7`. +- `\b`: The backspace character `U+8`. Note this may only be used in a custom character class, otherwise it represents a word boundary. +- `\c `: A control character sequence (`U+00` - `U+7F`). +- `\e`: The escape character `U+1B`. +- `\f`: The form-feed character `U+C`. 
+- `\n`: The newline character `U+A`. +- `\r`: The carriage return character `U+D`. +- `\t`: The tab character `U+9` + +### Builtin character classes + +``` +BuiltinCharClass -> '.' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X' +``` + +- `.`: Any character excluding newlines +- `\d`: Digit character +- `\D`: Non-digit character +- `\h`: Horizontal space character +- `\H`: Non-horizontal-space character +- `\N`: Non-newline character +- `\O`: Any character (including newlines). This is syntax from Oniguruma. +- `\R`: Newline sequence +- `\s`: Whitespace character +- `\S`: Non-whitespace character +- `\v`: Vertical space character +- `\V`: Non-vertical-space character +- `\w`: Word character +- `\W`: Non-word character +- `\X`: Any extended grapheme cluster + +### Custom character classes + +``` +CustomCharClass -> Start Set (SetOp Set)* ']' +Start -> '[' '^'? +Set -> Member+ +Member -> CustomCharClass | Quote | Range | Atom +Range -> RangeElt `-` RangeElt +RangeElt -> | UniScalar | EscapeSequence +SetOp -> '&&' | '--' | '~~' +``` + +Custom characters classes introduce their own language, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only a few atoms are considered valid: + +- Builtin character classes, except `.`, `\O`, and `\X` +- Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary) +- Unicode scalars +- Named characters +- Character properties +- Plain literal characters + +Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` does not appear in a valid position, it is interpreted as literal, e.g `[-a]` is the character class of `-` and `a`. **TODO: .NET's use of it for subtraction** + +Custom character classes may be nested within each other, and may be used with set operations. The supported set operations are intersection `&&`, subtraction `--`, and symmetric difference `~~`. + +Quoted sequences may appear with custom character classes, e.g `[\Q]\E]`, and escape the contained characters. + +### Character properties + +``` +CharacterProperty -> '\' ('p' | 'P') '{' PropertyContents '}' +POSIXCharacterProperty -> '[:' PropertyContents ':]' + +PropertyContents -> PropertyName ('=' PropertyName)? +PropertyName -> [\s\w-]+ +``` + +A character property specifies a particular Unicode or POSIX property to match against. Fuzzy matching is used when parsing the property name, and is done according to rules set out by [UAX44-LM3]. This means that the following property names are considered equivalent: + +- `whitespace` +- `isWhitespace` +- `is-White_Space` +- `iSwHiTeSpaCe` +- `i s w h i t e s p a c e` + +Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. However there are some properties where the key or value may be inferred. These include: + +- General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. +- Script properties e.g `\p{Greek}` is inferred as `\p{Script=Greek}`. +- Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`. + +Other Unicode properties however must specify both a key and value. + +For non-Unicode properties, only a value is required. These include: + +- The special properties `any`, `assigned`, `ascii`. 
+- The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings. + +**TODO: Spell out the properties we recognize while parsing vs. those we just parse as String?** + +Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. + +### Named characters + +``` +NamedCharacter -> '\N{' CharName '}' +CharName -> 'U+' HexDigit{1...8} | [\s\w-]+ +``` + +Allows a specific Unicode scalar to be specified by name or code point. + +**TODO: Should this be called "named scalar" or similar?** + +### Trivia + +``` +Trivia -> Comment | Whitespace +Comment -> InlineComment | EndOfLineComment + +InlineComment -> '(?#' (!')')* ')' +EndOfLineComment -> '#' .*$ + +Whitespace -> \s+ +``` + +Trivia is consumed by the regular expression parser, but has no semantic meaning. This includes inline PCRE-style comments e.g `(?#comment)`. It also includes non-semantic whitespace and end-of-line comments which may only occur when either of the extended syntax matching options `(?x)`, `(?xx)` are enabled. + +### Quotes + +``` +Quote -> '\Q' (!'\E' .)* '\E' +``` + +A quoted sequence is delimited by `\Q...\E`, and allows the escaping of metacharacters such that they are interpreted literally. For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. + +The backslash character is also treated as literal within a quoted sequence, and may not be used to escape the closing delimiter, e.g `\Q\\E` is a literal `\`. + +`\E` may appear without a preceding `\Q`, in which case it is a literal `E`. + +### References + +``` +NamedRef -> Identifier +NumberRef -> ('+' | '-')? RecursionLevel? +RecursionLevel -> '+' | '-' +``` + +A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. + +**TODO: Describe how capture groups are numbered? Including nesting & resets?** + +#### Backreferences + +``` +Backreference -> '\g{' NameOrNumberRef '}' + | '\g' NumberRef + | '\k<' Identifier '>' + | "\k'" Identifier "'" + | '\k{' Identifier '}' + | '\' [1-9] [0-9]+ + | '(?P=' Identifier ')' +``` + +A backreference evaluates to the value last captured by the referenced capturing group. + +#### Subpatterns + +``` +Subpattern -> '\g<' NameOrNumberRef '>' + | "\g'" NameOrNumberRef "'" + | '(?' GroupLikeSubpatternBody ')' + +GroupLikeSubpatternBody -> 'P>' + | '&' + | 'R' + | NumberRef +``` + +A subpattern causes the referenced group to be re-evaluated at the current position. The syntax `(?R)` is equivalent to `(?0)`, and causes the entire pattern to be recursed. + +### Conditionals + +``` +Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')' +ConditionalStart -> KnownConditionalStart | GroupConditionalStart + +KnownConditionalStart -> '(?(' KnownCondition ')' +GroupConditionalStart -> '(?' 
GroupStart + +KnownCondition -> 'R' + | 'R' NumberRef + | 'R&' !')' + | '<' NameRef '>' + | "'" NameRef "'" + | 'DEFINE' + | 'VERSION' VersionCheck + | NumberRef + | NameRef + +PCREVersionCheck -> '>'? '=' PCREVersionNumber +PCREVersionNumber -> '.' +``` + +A conditional evaluates a particular condition, and chooses a branch to match against accordingly. 1 or 2 branches may be specified. If 1 branch is specified e.g `(?(...)x)`, it is treated as the true branch. Note this includes an empty true branch, e.g `(?(...))` which is the null pattern as described in *top-level regular expression*. If 2 branches are specified, e.g `(?(...)x|y)`, the first is treated as the true branch, the second being the false branch. + +A condition may be: + +- A reference to a capture group, which checks whether the group matched successfully. +- A recursion check on either a particular group or the entire regex. In the former case, this checks to see if the last recursive call is through that group. In the latter case, it checks if the match is currently taking place in any kind of recursive call. +- An arbitrary recursive regular expression, which is matched against, and evaluates to true if the match is successful. (**TODO: Clarify whether it introduces captures**) +- A PCRE version check. + +The `DEFINE` keyword is not used as a condition, but rather a way in which to define a group which is not evaluated, but may be referenced by a subpattern. + +### PCRE backtracking directives + +``` +BacktrackingDirective -> '(*' BacktrackingDirectiveKind (':' )? ')' +BacktrackingDirectiveKind -> 'ACCEPT' | 'FAIL' | 'F' | 'MARK' | '' | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN' +``` + +This is syntax specific to PCRE, and is used to control backtracking behavior. Any of the directives may include an optional tag, however `MARK` must have a tag. The empty directive is treated as `MARK`. Only the `ACCEPT` directive may be quantified, as it can use the backtracking behavior of the engine to be evaluated only if needed by a reluctant quantification. + +- `ACCEPT`: Causes matching to terminate immediately as a successful match. If used within a subpattern, only that level of recursion is terminated. +- `FAIL`, `F`: Causes matching to fail, forcing backtracking to occur if possible. +- `MARK`: Assigns a label to the current matching path, which is passed back to the caller on success. Subsequent `MARK` directives overwrite the label assigned, so only the last is passed back. +- `COMMIT`: Prevents backtracking from reaching any point prior to this directive, causing the match to fail. This does not allow advancing the input to try a different starting match position. +- `PRUNE`: Similar to `COMMIT`, but allows advancing the input to try and find a different starting match position. +- `SKIP`: Similar to `PRUNE`, but skips ahead to the position of `SKIP` to try again as the starting position. +- `THEN`: Similar to `PRUNE`, but when used inside an alternation will try to match in the subsequent branch before attempting to advance the input to find a different starting position. 
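As a rough illustration of how these directives appear in patterns, the sketch below shows them as Swift string literals only; the comments restate the behavior described above. The patterns are invented for illustration and are PCRE-specific, so they are not expected to be accepted by engines without this feature (e.g. ICU).

```swift
// (*COMMIT): once reached, the match may not backtrack before this point,
// nor retry from a later starting position in the input.
let commit = #"\d+(*COMMIT)-\d+"#

// (*PRUNE): like (*COMMIT), but a later starting position may still be tried.
let prune = #"\d+(*PRUNE)-\d+"#

// (*MARK:tag): labels the current matching path; the last label crossed on a
// successful match is reported back to the caller.
let mark = #"(?:foo(*MARK:saw_foo)|bar(*MARK:saw_bar))"#

// (*ACCEPT): terminates matching immediately as a success (only the current
// level of recursion when used inside a subpattern call).
let accept = #"prefix(*ACCEPT)ignored"#
```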
+ +### PCRE global matching options + +``` +GlobalMatchingOptionSequence -> GlobalMatchingOption+ +GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')' + +GlobalMatchingOptionKind -> LimitOptionKind '=' + | NewlineKind | NewlineSequenceKind + | 'NOTEMPTY_ATSTART' | 'NOTEMPTY' + | 'NO_AUTO_POSSESS' | 'NO_DOTSTAR_ANCHOR' + | 'NO_JIT' | 'NO_START_OPT' | 'UTF' | 'UCP' + +LimitOptionKind -> 'LIMIT_DEPTH' | 'LIMIT_HEAP' | 'LIMIT_MATCH' +NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL' +NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE' +``` + +This is syntax specific to PCRE, and allows a set of global options to appear at the start of a regular expression. They may not appear at any other position. + +- `LIMIT_DEPTH`, `LIMIT_HEAP`, `LIMIT_MATCH`: These place certain limits on the resources the matching engine may consume, and matches it may make. +- `CRLF`, `CR`, `ANYCRLF`, `ANY`, `LF`, `NUL`: These control the definition of a newline character, which is used when matching e.g the `.` character class, and evaluating where a line ends in multi-line mode. +- `BSR_ANYCRLF`, `BSR_UNICODE`: These change the definition of `\R`. +- `NOTEMPTY`: Does not consider the empty string to be a valid match. +- `NOTEMPTY_ATSTART`: Like `NOT_EMPTY`, but only applies to the first matching position in the input. +- `NO_AUTO_POSSESS`: Disables an optimization that treats a quantifier as possessive if the following construct clearly cannot be part of the match. In other words, disables the short-circuiting of backtracks in cases where the engine knows it will not produce a match. This is useful for debugging, or for ensuring a callout gets invoked. +- `NO_DOTSTAR_ANCHOR`: Disables an optimization that tries to automatically anchor `.*` at the start of a regex. Like `NO_AUTO_POSSESS`, this is mainly used for debugging or ensuring a callout gets invoked. +- `NO_JIT`: Disables JIT compilation +- `NO_START_OPT`: Disables various optimizations performed at the start of matching. Like `NO_DOTSTAR_ANCHOR`, is mainly used for debugging or ensuring a callout gets invoked. +- `UTF`: Enables UTF pattern support. +- `UCP`: Enables Unicode property support. + +### Callouts + +``` +Callout -> PCRECallout | OnigurumaCallout + +PCRECallout -> '(?C' CalloutBody ')' +PCRECalloutBody -> '' | + | '`' '`' + | "'" "'" + | '"' '"' + | '^' '^' + | '%' '%' + | '#' '#' + | '$' '$' + | '{' '}' + +OnigurumaCallout -> OnigurumaNamedCallout | OnigurumaCalloutOfContents + +OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? OnigurumaCalloutArgs? ')' +OnigurumaCalloutArgs -> '{' OnigurumaCalloutArgList '}' +OnigurumaCalloutArgList -> OnigurumaCalloutArg (',' OnigurumaCalloutArgList)* +OnigurumaCalloutArg -> [^,}]+ +OnigurumaTag -> '[' Identifier ']' + +OnigurumaCalloutOfContents -> '(?' '{'+ Contents '}'+ OnigurumaTag? Direction? ')' +OnigurumaCalloutContents -> +OnigurumaCalloutDirection -> 'X' | '<' | '>' +``` + +A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We supported parsing both the PCRE and Oniguruma callout syntax. The PCRE syntax accepts a string or numeric argument that is passed to the function. The Oniguruma syntax is more involved, and may accept a tag, argument list, or even an arbitrary program in the 'callout of contents' syntax. 
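The sketch below illustrates the shapes these callouts take, written as Swift string literals; the callout names (`validate`, `tally`) and the tag are made up for the example, as the actual callout functions are registered by the host program rather than the pattern.

```swift
// PCRE numeric and string-argument callouts: the number or string is
// passed to the user-supplied callout function at that point in the match.
let pcreNumbered = #"(?C1)\d+(?C2)"#
let pcreString   = #"\w+(?C'validate')"#

// Oniguruma named callout, with an optional [tag] and an argument list.
let onigNamed = #"a+(*tally[t]{1,arg})"#

// Oniguruma "callout of contents": the braces carry an arbitrary program
// that the host interprets when matching reaches this point.
let onigContents = #"b(?{ contents })"#
```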
+ +### Absent functions + +``` +AbsentFunction -> '(?~' RegexNode ')' + | '(?~|' Concatenation '|' Concatenation ')' + | '(?~|' Concatenation ')' + | '(?~|)' +``` + +An absent function is an Oniguruma feature that allows for the easy inversion of a given pattern. There are 4 variants of the syntax: + +- `(?~|absent|expr)`: Absent expression, which attempts to match against `expr`, but is limited by the range that is not matched by `absent`. +- `(?~absent)`: Absent repeater, which matches against any input not matched by `absent`. Equivalent to `(?~|absent|\O*)`. +- `(?~|absent)`: Absent stopper, which limits any subsequent matching to not include `absent`. +- `(?~|)`: Absent clearer, which undoes the effects of the absent stopper. + +## Syntactic differences between engines + +**TODO: Intro** + +**TODO: Talk about compatibility modes for different engines being a possible future direction?** + +### Character class set operations + +In a custom character class, some engines allow for binary set operations that take two character class inputs, and produce a new character class output. However which set operations are supported and the spellings used differ by engine. + +| PCRE | ICU | UTS#18 | Oniguruma | .NET | Java | +|------|-----|--------|-----------|------|------| +| ❌ | Intersection `&&`, Subtraction `--` | Intersection, Subtraction | Intersection `&&` | Subtraction via `-` | Intersection `&&` | + +[UTS#18][uts18] requires intersection and subtraction, and uses the operation spellings `&&` and `--` in its examples, though it doesn't mandate a particular spelling. In particular, conforming implementations could spell the subtraction `[[x]--[y]]` as `[[x]&&[^y]]`. UTS#18 also suggests a symmetric difference operator `~~`, and uses an explicit `||` operator in examples, though doesn't require either. + +The differing support between engines is conflicting, as engines that don't support a particular operator treat them as literal, e.g `[x&&y]` in PCRE is the character class of `["x", "&", "y"]` rather than an intersection. + +Another conflict arises with .NET's support of using the `-` character in a custom character class to denote both a range as well as a set subtraction. .NET disambiguates this by only permitting its use as a subtraction if the right hand operand is a nested custom character class, otherwise it is a range. This conflicts with e.g ICU where `[x-[y]]`, in which the `-` is treated as literal. + +We intend to support the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant. + +We also intend on supporting the `-` operator (**TODO: Justify**). + +### Nested custom character classes + +This allows e.g `[[a]b[c]]`, which is interpreted the same as `[abc]`. It also allows for more complex set operations with custom character classes as the operands. + +| PCRE | ICU | UTS#18 | Oniguruma | .NET | Java | +|------|-----|--------|-----------|------|------| +| ❌ | ✅ | 💡 | ✅ | ❌ | ✅ | + +UTS#18 doesn't require this, though it does suggest it as a way to clarify precedence for chains of character class set operations e.g `[\w--\d&&\s]`, which the user could write as `[[\w--\d]&&\s]`. 
+ +PCRE does not support this feature, and as such treats `]` as the closing character of the custom character class. Therefore `[[a]b[c]]` is interpreted as the character class `["[", "a"]`, followed by literal `b`, and then the character class `["c"]`, followed by literal `]`. + +.NET does not support nested character classes in general, although allows them as the right-hand side of a subtraction operation. + +### `\U` + +In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` matches literal `U`. However in ICU, `\Uhhhhhhhh` matches a hex sequence. We intend on following the ICU behavior. + +### `{,n}` + +This quantifier is supported by Oniguruma, but in PCRE it matches the literal chars. We intend on supporting it as a quantifier. + +### `\DDD` + +This syntax is implemented in a variety of different ways depending on the engine. In ICU and Java, it is always a backreference unless prefixed with `0`, in which case it is an octal sequence. + +In PCRE, Oniguruma, and .NET, it is also always an octal sequence if prefixed with `0`, however there are also other conditions where it may be treated as octal. These conditions vary slightly been the engines. In PCRE, it will be treated as backreference if any of the following hold: + +- Its `0 < n < 10`. +- Its first digit is `8` or `9`. +- Its value corresponds to a valid *prior* group number. + +Otherwise it is treated as an octal sequence. + +Oniguruma follows all of these except the second. If the first digit is `8` or `9`, it is instead treated as the literal number, e.g `\81` is `81`. .NET also follows this behavior, but additionally has the last condition consider *all* groups, not just prior ones (as backreferences can refer to future groups in recursive cases). + +We intend to implement a simpler behavior more inline with ICU and Java. A `\DDD` sequence that does not start with a `0` will be treated as a backreference, otherwise it will be treated as an octal sequence. If an invalid backreference is formed with this syntax, we will suggest prefixing with a `0` if an octal sequence is desired. + +One further difference between engines exists with this syntax in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. We intend to follow the ICU behavior, as it is necessary when requiring a `0` prefix. + +### `\x` + +In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denotes literal `x`. We intend on following the PCRE behavior. + +### Whitespace in ranges + +In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range, but will emit a warning telling users that we're doing so (**TODO: how would they silence? move to modern syntax?**). + +### Implicitly-scoped matching option scopes + +PCRE and Oniguruma both support changing the active matching options through an isolated group e.g `(?i)`. However, they have differing semantics when it comes to their scoping. In Oniguruma, it is treated as an implicit new scope that wraps everything until the end of the current group. In PCRE, it is treated as changing the matching option for all the following expressions until the end of the group. 
+ +These sound similar, but have different semantics around alternations, e.g for `a(?i)b|c|d`, in Oniguruma this becomes `a(?i:b|c|d)`, where `a` is no longer part of the alternation. However in PCRE it becomes `a(?i:b)|(?i:c)|(?i:d)`, where `a` remains a child of the alternation. + +We aim to support the Oniguruma behavior. **TODO: The PCRE behavior is more complex for the parser, but seems less surprising, maybe that should become the default?** + +### Backreference condition kinds + +PCRE and .NET allow for conditional patterns to reference a group by its name, e.g: + +``` +(?x)?(?(group1)y) +``` + +where `y` will only be matched if `(?x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. + +We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?('group1')y)` if they want a backreference condition. This more explicit syntax is supported by PCRE. + +### `\N` + +PCRE supports `\N` meaning "not a newline", however there are engines that treat it as a literal `N`. We intend on supporting the PCRE behavior. + +### Extended character property syntax + +**TODO: Can this be conflicting?** + +ICU (**TODO: any others?**) unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. + +### Extended syntax modes + +Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this enabled through the `(?x)` and `(?xx)` matching options, where the former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. + +Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore intend to follow this behavior, with `(?x)` and `(?xx)` being treated the same. + +Different regex engines also have different rules around what characters are considered non-semantic whitespace. When compiled with Unicode support, PCRE considers the following whitespace: + +- The space character `U+20` +- Whitespace characters `U+9...U+D` +- Next line `U+85` +- Left-to-right mark `U+200E` +- Right-to-left mark `U+200F` +- Line separator `U+2028` +- Paragraph separator `U+2029` + +This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we intend to support exactly the characters in this list for the purposes of non-semantic whitespace. + +## Canonical representations + +Many engines have different spellings for the same regex features, and as such we need to decide on a preferred canonical syntax. 
+ +### Backreferences + +There are a variety of backreference spellings accepted by different engines + +``` +Backreference -> '\g{' NameOrNumberRef '}' + | '\g' NumberRef + | '\k<' Identifier '>' + | "\k'" Identifier "'" + | '\k{' Identifier '}' + | '\' [1-9] [0-9]+ + | '(?P=' Identifier ')' +``` + +We plan on choosing the canonical spelling **TODO: decide**. + + +[pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html +[oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE +[icu-syntax]: https://unicode-org.github.io/icu/userguide/strings/regexp.html +[uts18]: https://www.unicode.org/reports/tr18/ +[.net-syntax]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions +[UAX44-LM3]: https://www.unicode.org/reports/tr44/#UAX44-LM3 From 93cc5ca7de694a9651f040c65f7e2889dbed0f3b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 18 Feb 2022 19:43:17 +0000 Subject: [PATCH 05/51] Update RegexSyntax.md Co-authored-by: Michael Ilseman --- Documentation/Evolution/RegexSyntax.md | 34 +++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 4bd1bd553..35c0ae8d2 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -1,26 +1,38 @@ -# Regular Expression Syntax + + +# Regex Literal Interior Syntax - Authors: Hamish Knight, Michael Ilseman ## Introduction -We aim to parse a superset of the syntax accepted by a variety of popular regular expression engines. +Regex literals declare a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Formalizing regex literals in Swift requires choosing a delimiter strategy (e.g. `#/.../#` or `re'...'`), detailing the syntax accepted in between the delimiters ("interior syntax"), and specifying actual types and any relevant protocols for the literal itself. + +This proposal-component focuses on the interior syntax, which is large enough for its own targeted discussion ahead of the full proposal. Regex literal interior syntax will be part of Swift's source-compatibility story (and to some extent binary compatibility), so we present a detailed and comprehensive design. + +## Motivation -**TODO(Michael): Elaborate** +Swift aims to be a pragmatic programming language, balancing (TODO: prose). Rather than pursue a novel interior syntax, (TODO: prose). -## Engines supported +Regex interior syntax is part of a larger [proposal](https://forums.swift.org/t/pitch-regular-expression-literals/52820), which in turn is part of a larger [string processing effort](https://forums.swift.org/t/declarative-string-processing-overview/52459). -We aim to implement a syntactic superset of: +## Proposed Solution -- [PCRE 2][pcre2-syntax], an "industry standard" of sorts, and a rough superset of Perl, Python, etc. -- [Oniguruma][oniguruma-syntax], an internationalization-oriented engine with some modern features +We propose accepting a syntactic "superset" of the following existing regular expression engines: + +- [PCRE 2][pcre2-syntax], an "industry standard" and a rough superset of Perl, Python, etc. +- [Oniguruma][oniguruma-syntax], a modern engine with additional features. - [ICU][icu-syntax], used by NSRegularExpression, a Unicode-focused engine. -- [.NET][.net-syntax]'s regular expressions, which support delimiter-balancing and some interesting minor details on conditional patterns. 
-- **TODO: List Java here? It doesn't really add any more syntax than the above other than `\p{javaLowerCase}`** +- [.NET][.net-syntax], which adds delimiter-balancing and some interesting minor details around conditional patterns. + +To our knowledge, all other popular regex engines support a subset of the above syntaxes. + +We also support [UTS#18][uts18]'s full set of character class operators (to our knowledge no other engine does). Beyond that, UTS#18 deals with semantics rather than syntax, and what syntax it uses is covered by the above list. We also parse `\p{javaLowerCase}`, meaning we support a superset of Java 8 as well. -We also intend to achieve at least Level 1 (**TODO: do we want to promise Level 2?**) [UTS#18][uts18] conformance, which specifies regular expression matching semantics without mandating any particular syntax. However we can infer syntactic feature sets from its guidance. +Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below -**TODO(Michael): Rework and expand prose** ## Detailed Design From dbdef62f383566db56adcdd02d54302151d16f1b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 18 Feb 2022 19:44:47 +0000 Subject: [PATCH 06/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 29 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 35c0ae8d2..b613533fd 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -301,23 +301,34 @@ Set -> Member+ Member -> CustomCharClass | Quote | Range | Atom Range -> RangeElt `-` RangeElt RangeElt -> | UniScalar | EscapeSequence -SetOp -> '&&' | '--' | '~~' +SetOp -> '&&' | '--' | '~~' | '-' ``` Custom characters classes introduce their own language, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only a few atoms are considered valid: -- Builtin character classes, except `.`, `\O`, and `\X` +- Builtin character classes, except `.`, `\O`, `\X`, and `\N`. - Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary) - Unicode scalars - Named characters - Character properties - Plain literal characters -Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` does not appear in a valid position, it is interpreted as literal, e.g `[-a]` is the character class of `-` and `a`. **TODO: .NET's use of it for subtraction** +Atoms may be used to compose other character class members, including ranges, quoted sequences, and even nested custom character classes `[[ab]c\d]`. Adjacent members form an implicit union of character classes, e.g `[[ab]c\d]` is the union of the characters `a`, `b`, `c`, and digit characters. -Custom character classes may be nested within each other, and may be used with set operations. The supported set operations are intersection `&&`, subtraction `--`, and symmetric difference `~~`. +Quoted sequences may be used to escape the contained characters, e.g `[\Q]\E]` is the character class of the literal character `[`. -Quoted sequences may appear with custom character classes, e.g `[\Q]\E]`, and escape the contained characters. 
+Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` appears at the start or end of a custom character class, it is interpreted as literal, e.g `[-a-]` is the character class of `-` and `a`. + +Operators may be used to apply set operations to character class members. The operators supported are: + +- `&&`: Intersection of the LHS and RHS +- `--`: Subtraction of the RHS from the LHS +- `~~`: Symmetric difference of the RHS and LHS +- `-`: .NET's spelling of subtracting the RHS from the LHS. + +These operators have a lower precedence than the implicit union of members, e.g `[ac-d&&a[d]]` is an intersection of the character classes `[ac-d]` and `[ad]`. + +To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We intend to follow this behavior. ### Character properties @@ -583,7 +594,7 @@ Another conflict arises with .NET's support of using the `-` character in a cust We intend to support the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant. -We also intend on supporting the `-` operator (**TODO: Justify**). +In the interests of compatibility, we also intend on supporting the `-` operator, though we likely want to emit a warning and encourage users to switch to `--`. ### Nested custom character classes @@ -639,7 +650,7 @@ PCRE and Oniguruma both support changing the active matching options through an These sound similar, but have different semantics around alternations, e.g for `a(?i)b|c|d`, in Oniguruma this becomes `a(?i:b|c|d)`, where `a` is no longer part of the alternation. However in PCRE it becomes `a(?i:b)|(?i:c)|(?i:d)`, where `a` remains a child of the alternation. -We aim to support the Oniguruma behavior. **TODO: The PCRE behavior is more complex for the parser, but seems less surprising, maybe that should become the default?** +We aim to support the PCRE behavior. ### Backreference condition kinds @@ -659,9 +670,7 @@ PCRE supports `\N` meaning "not a newline", however there are engines that treat ### Extended character property syntax -**TODO: Can this be conflicting?** - -ICU (**TODO: any others?**) unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. +ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We intend to support this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. 
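As a purely illustrative sketch of the unified grammar described above (the type and helper names below are hypothetical, not part of any engine), both delimiter forms can feed the same `PropertyContents` parsing step:

```swift
// Hypothetical sketch: \p{...} and [:...:] share one internal grammar, so a
// parser can strip either pair of delimiters and reuse the same key/value
// splitting logic. Names and structure are illustrative only.
struct PropertyContents {
    var key: String?   // e.g. "Script" in \p{Script=Greek}; nil when inferred
    var value: String  // e.g. "Greek", or "alnum" for [:alnum:]
}

func parsePropertyContents(_ text: String) -> PropertyContents? {
    let body: Substring
    if text.hasPrefix("\\p{"), text.hasSuffix("}") {
        body = text.dropFirst(3).dropLast()
    } else if text.hasPrefix("[:"), text.hasSuffix(":]") {
        body = text.dropFirst(2).dropLast(2)
    } else {
        return nil
    }
    let parts = body.split(separator: "=", maxSplits: 1)
    if parts.count == 2 {
        return PropertyContents(key: String(parts[0]), value: String(parts[1]))
    }
    return PropertyContents(key: nil, value: String(body))
}

print(parsePropertyContents(#"\p{Script=Greek}"#)!)  // key: Optional("Script"), value: "Greek"
print(parsePropertyContents("[:alnum:]")!)           // key: nil, value: "alnum"
```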
### Extended syntax modes From b7dfa1eccd9808cf8f7f614a7283098c98f3b829 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 18 Feb 2022 20:13:23 +0000 Subject: [PATCH 07/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 49 ++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index b613533fd..97d8982f2 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -138,7 +138,7 @@ GroupNameBody -> Identifier | BalancingGroupBody Identifier -> [\w--\d] \w* ``` -Groups define a new scope within which a recursive regular expression pattern may occur. Groups have different semantics depending on how they are introduced, some may capture the nested match, some may match against the input without advancing, some may change the matching options set in the new scope, etc. +Groups define a new scope within which a recursive regular expression pattern may occur. Groups have different semantics depending on how they are introduced. Groups may be named, the characters of which may be any letter or number characters (or `_`). However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. @@ -146,6 +146,11 @@ Groups may be used to change the matching options present within their scope, se Note there are additional constructs that may syntactically appear similar to groups, but are distinct. See the *group-like atoms* section. +#### Basic group kinds + +- `()`: A capturing group. +- `(?:)`: A non-capturing group. +- `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See *group numbering*. #### Lookahead and lookbehind @@ -154,6 +159,8 @@ Note there are additional constructs that may syntactically appear similar to gr - `(?<=` specifies a lookbehind that attempts to match the group body against the input before the current position. Does not advance the input. - `(?!<` specifies a negative lookbehind that ensures the group body does not match the input before the current position. Does not advance the input. +**TODO: Non-atomic variants** + PCRE2 defines explicitly spelled out versions of the syntax, e.g `(*negative_lookbehind:)`. #### Atomic groups @@ -340,7 +347,13 @@ PropertyContents -> PropertyName ('=' PropertyName)? PropertyName -> [\s\w-]+ ``` -A character property specifies a particular Unicode or POSIX property to match against. Fuzzy matching is used when parsing the property name, and is done according to rules set out by [UAX44-LM3]. This means that the following property names are considered equivalent: +A character property specifies a particular Unicode or POSIX property to match against. We intend on parsing: + +- The full range of Unicode character properties. +- The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties). +- The UTS#18 special properties `any`, `assigned`, `ascii`. + +We intend on following [UTS#18][uts18]'s guidance for character properties. This includes the use of fuzzy matching for property name parsing. This is done according to rules set out by [UAX44-LM3]. 
This means that the following property names are considered equivalent: - `whitespace` - `isWhitespace` @@ -348,7 +361,9 @@ A character property specifies a particular Unicode or POSIX property to match a - `iSwHiTeSpaCe` - `i s w h i t e s p a c e` -Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. However there are some properties where the key or value may be inferred. These include: +Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. Each component follows the fuzzy matching rule, and additionally may have an alternative alias spelling, as defined by Unicode in [PropertyAliases.txt][unicode-prop-key-aliases] and [PropertyValueAliases.txt][unicode-prop-value-aliases]. + +There are some Unicode properties where the key or value may be inferred. These include: - General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. - Script properties e.g `\p{Greek}` is inferred as `\p{Script=Greek}`. @@ -361,8 +376,6 @@ For non-Unicode properties, only a value is required. These include: - The special properties `any`, `assigned`, `ascii`. - The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings. -**TODO: Spell out the properties we recognize while parsing vs. those we just parse as String?** - Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. ### Named characters @@ -412,7 +425,25 @@ RecursionLevel -> '+' | '-' A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. -**TODO: Describe how capture groups are numbered? Including nesting & resets?** +#### Group numbering + +Capturing groups are implicitly numbered according to the position of their opening `(` in the regex. For example: + +``` +( ((?:a)(?b)c)(d)e) +^ ^ ^ ^ +1 2 3 4 +``` + +Non-capturing groups are skipped over when counting. + +Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child, for example: + +``` +( ()(?|(a)(b)|(?:c)|(d)))() +^ ^ ^ ^ ^ ^ +1 2 3 4 3 5 +``` #### Backreferences @@ -690,6 +721,10 @@ Different regex engines also have different rules around what characters are con This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we intend to support exactly the characters in this list for the purposes of non-semantic whitespace. +### Group numbering + +**TODO: Discuss how .NET numbers its groups differently** + ## Canonical representations Many engines have different spellings for the same regex features, and as such we need to decide on a preferred canonical syntax. 
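The position-based group numbering added above can be illustrated with a small hedged example. It uses Foundation's `NSRegularExpression` (ICU syntax) purely as a stand-in engine, not the engine being proposed here; ICU numbers capture groups by the position of their opening `(` and skips non-capturing `(?:...)` groups, matching the rule described:

```swift
import Foundation

// Illustrative only: group 0 is the whole match; explicit groups are numbered
// by the position of their opening "(", and (?:...) groups are skipped.
let pattern = "(a((?:b)(c)d)(e)f)"
let input = "abcdef"
let regex = try NSRegularExpression(pattern: pattern)
print(regex.numberOfCaptureGroups)  // 4

if let match = regex.firstMatch(in: input, options: [],
                                range: NSRange(input.startIndex..., in: input)) {
    for group in 0...regex.numberOfCaptureGroups {
        if let r = Range(match.range(at: group), in: input) {
            print(group, input[r])  // 0 abcdef, 1 abcdef, 2 bcd, 3 c, 4 e
        }
    }
}
```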
@@ -717,3 +752,5 @@ We plan on choosing the canonical spelling **TODO: decide**. [uts18]: https://www.unicode.org/reports/tr18/ [.net-syntax]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions [UAX44-LM3]: https://www.unicode.org/reports/tr44/#UAX44-LM3 +[unicode-prop-key-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt +[unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt From 8dc2b4b56f058bdcfb9ea5f48836f14c28a2ef6a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Feb 2022 14:07:27 +0000 Subject: [PATCH 08/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 43 +++++++++++++++++--------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 97d8982f2..a1f2ea3e9 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -140,7 +140,7 @@ Identifier -> [\w--\d] \w* Groups define a new scope within which a recursive regular expression pattern may occur. Groups have different semantics depending on how they are introduced. -Groups may be named, the characters of which may be any letter or number characters (or `_`). However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. +Groups may be named, the characters of which may be any letter or number characters or the character `_`. However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. Groups may be used to change the matching options present within their scope, see the *Matching options* section. @@ -152,20 +152,23 @@ Note there are additional constructs that may syntactically appear similar to gr - `(?:)`: A non-capturing group. - `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See *group numbering*. -#### Lookahead and lookbehind +#### Atomic groups -- `(?=` specifies a lookahead that attempts to match against the group body, but does not advance. -- `(?!` specifies a negative lookahead that ensures the group body does not match, and does not advance. -- `(?<=` specifies a lookbehind that attempts to match the group body against the input before the current position. Does not advance the input. -- `(?!<` specifies a negative lookbehind that ensures the group body does not match the input before the current position. Does not advance the input. +An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This the same semantics as a possessive quantifier, but applies more generally to any regex pattern. -**TODO: Non-atomic variants** +#### Lookahead and lookbehind -PCRE2 defines explicitly spelled out versions of the syntax, e.g `(*negative_lookbehind:)`. +- `(?=`: A lookahead that attempts to match against the group body, but does not advance. +- `(?!`: A negative lookahead that ensures the group body does not match, and does not advance. +- `(?<=`: A lookbehind that attempts to match the group body against the input before the current position. Does not advance the input. +- `(?!<`: A negative lookbehind that ensures the group body does not match the input before the current position. Does not advance the input. 
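As a hedged illustration of the lookaround kinds listed above, the following uses Foundation's `NSRegularExpression` (ICU syntax) purely as a stand-in engine. The assertions constrain where a match may occur, but the asserted text is not consumed and is not part of the returned range:

```swift
import Foundation

// Illustrative only (ICU via NSRegularExpression, not the proposed engine).
let input = "price: 42 dollars, 7 cents"
let range = NSRange(input.startIndex..., in: input)

let lookahead = try NSRegularExpression(pattern: #"\d+(?= dollars)"#)
if let m = lookahead.firstMatch(in: input, options: [], range: range),
   let r = Range(m.range, in: input) {
    print(input[r])  // "42" — only digits that are followed by " dollars"
}

let lookbehind = try NSRegularExpression(pattern: #"(?<=price: )\d+"#)
if let m = lookbehind.firstMatch(in: input, options: [], range: range),
   let r = Range(m.range, in: input) {
    print(input[r])  // "42" — only digits that are preceded by "price: "
}
```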
-#### Atomic groups +These groups are all atomic, meaning that they will not be re-evaluated for backtracking. There are however also non-atomic variants: -An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This the same semantics as a possessive quantifier, but applies more generally to any regex pattern. +- `(?*`: A non-atomic lookahead. +- `(?<*`: A non-atomic lookbehind. + +PCRE2 also defines explicitly spelled out versions of the above syntax, e.g `(*non_atomic_positive_lookahead` and `(*negative_lookbehind:)`. #### Script runs @@ -430,17 +433,17 @@ A reference is an abstract identifier for a particular capturing group in a regu Capturing groups are implicitly numbered according to the position of their opening `(` in the regex. For example: ``` -( ((?:a)(?b)c)(d)e) +(a((?:b)(?c)d)(e)f) ^ ^ ^ ^ 1 2 3 4 ``` Non-capturing groups are skipped over when counting. -Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child, for example: +Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child. Outside the alternation, numbering resumes at the next available number not used in one of the branches. For example: ``` -( ()(?|(a)(b)|(?:c)|(d)))() +(a()(?|(b)(c)|(?:d)|(e)))(f) ^ ^ ^ ^ ^ ^ 1 2 3 4 3 5 ``` @@ -503,7 +506,7 @@ A condition may be: - A reference to a capture group, which checks whether the group matched successfully. - A recursion check on either a particular group or the entire regex. In the former case, this checks to see if the last recursive call is through that group. In the latter case, it checks if the match is currently taking place in any kind of recursive call. -- An arbitrary recursive regular expression, which is matched against, and evaluates to true if the match is successful. (**TODO: Clarify whether it introduces captures**) +- An arbitrary recursive regular expression, which is matched against, and evaluates to true if the match is successful. It may contain capture groups that add captures to the match. - A PCRE version check. The `DEFINE` keyword is not used as a condition, but rather a way in which to define a group which is not evaluated, but may be referenced by a subpattern. @@ -723,7 +726,17 @@ This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additio ### Group numbering -**TODO: Discuss how .NET numbers its groups differently** +In PCRE, groups are numbered according to the position of their opening parenthesis. .NET also follows this rule, with the exception that named groups are numbered after unnamed groups. For example: + +``` +(a(?x)b)(?y)(z) +^ ^ ^ ^ +1 3 4 2 +``` + +The `(z)` group gets numbered before the named groups get numbered. + +We intend on matching the PCRE behavior where groups are numbered purely based on order. ## Canonical representations From c913eead74b86962fb127fa71525e2dd9a9328a6 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Feb 2022 20:14:27 +0000 Subject: [PATCH 09/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 44 ++++++++++++++++++++------ 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index a1f2ea3e9..eaf5de239 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -103,10 +103,15 @@ Atom -> Anchor | NamedCharacter | Subpattern | UniScalar + | '\K' | '\'? 
``` -Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as meta-characters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A meta-character may be treated as literal by preceding it with a backslash. Other characters may also be preceded with a backslash, but it has no effect, e.g `\I` is literal `I`. +Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as meta-characters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A meta-character may be treated as literal by preceding it with a backslash. Other characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. + +#### `\K` + +The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not however interfere with captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`. ### Groups @@ -261,7 +266,7 @@ HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] ``` -These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. +These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. Note that `\x` that is not followed by any hexadecimal digit characters is treated as `\0`, which matches PCRE's behavior. ### Escape sequences @@ -283,10 +288,11 @@ These escape sequences denote a specific character. ### Builtin character classes ``` -BuiltinCharClass -> '.' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X' +BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X' ``` - `.`: Any character excluding newlines +- `\C`: A single UTF code unit - `\d`: Digit character - `\D`: Non-digit character - `\h`: Horizontal space character @@ -316,7 +322,7 @@ SetOp -> '&&' | '--' | '~~' | '-' Custom characters classes introduce their own language, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only a few atoms are considered valid: -- Builtin character classes, except `.`, `\O`, `\X`, and `\N`. +- Builtin character classes except `.`, `\R`, `\O`, `\X`, `\C`, and `\N`. - Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary) - Unicode scalars - Named characters @@ -325,9 +331,11 @@ Custom characters classes introduce their own language, in which most regular ex Atoms may be used to compose other character class members, including ranges, quoted sequences, and even nested custom character classes `[[ab]c\d]`. Adjacent members form an implicit union of character classes, e.g `[[ab]c\d]` is the union of the characters `a`, `b`, `c`, and digit characters. +Custom character classes may not be empty, e.g `[]` is forbidden. A custom character class may begin with the `]` character, in which case it is treated as literal, e.g `[]a]` is the custom character class of `]` and `a`. 
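To make the implicit-union reading above concrete, here is a hedged, pure-Swift sketch of how the members of `[[ab]c\d]` combine. The helper below is hypothetical and deliberately simplified (it models `\d` with `Character.isNumber`):

```swift
// Hypothetical model of [[ab]c\d]: a nested class, a literal, and a builtin
// class, whose members union together into one membership test.
let nestedClass: Set<Character> = ["a", "b"]        // [ab]
let literalMember: Character = "c"                  // c
let digitClass = { (c: Character) in c.isNumber }   // \d (simplified)

func isMember(_ c: Character) -> Bool {
    nestedClass.contains(c) || c == literalMember || digitClass(c)
}

print(isMember("a"), isMember("7"), isMember("z"))  // true true false
```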
+ Quoted sequences may be used to escape the contained characters, e.g `[\Q]\E]` is the character class of the literal character `[`. -Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` appears at the start or end of a custom character class, it is interpreted as literal, e.g `[-a-]` is the character class of `-` and `a`. +Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` cannot be used to form a range, it is interpreted as literal, e.g `[-a-]` is the character class of `-` and `a`. `[a-c-d]` is the character class of `a`...`c`, `-`, and `d`. Operators may be used to apply set operations to character class members. The operators supported are: @@ -350,11 +358,12 @@ PropertyContents -> PropertyName ('=' PropertyName)? PropertyName -> [\s\w-]+ ``` -A character property specifies a particular Unicode or POSIX property to match against. We intend on parsing: +A character property specifies a particular Unicode, POSIX, or PCRE property to match against. We intend on parsing: - The full range of Unicode character properties. - The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties). - The UTS#18 special properties `any`, `assigned`, `ascii`. +- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`. We intend on following [UTS#18][uts18]'s guidance for character properties. This includes the use of fuzzy matching for property name parsing. This is done according to rules set out by [UAX44-LM3]. This means that the following property names are considered equivalent: @@ -369,8 +378,9 @@ Unicode properties consist of both a key and a value, e.g `General_Category=Whit There are some Unicode properties where the key or value may be inferred. These include: - General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. -- Script properties e.g `\p{Greek}` is inferred as `\p{Script=Greek}`. +- Script properties e.g `\p{Greek}` is inferred as `\p{Script=Greek}`. **TODO: Infer as `\p{scx=Greek}` instead?** - Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`. +- Block properties that begin with the prefix `in`, e.g `\p{inBasicLatin}` is inferred to be `\p{Block=Basic_Latin}`. Other Unicode properties however must specify both a key and value. @@ -388,7 +398,7 @@ NamedCharacter -> '\N{' CharName '}' CharName -> 'U+' HexDigit{1...8} | [\s\w-]+ ``` -Allows a specific Unicode scalar to be specified by name or code point. +Allows a specific Unicode scalar to be specified by name or hexadecimal code point. **TODO: Should this be called "named scalar" or similar?** @@ -696,7 +706,7 @@ PCRE and .NET allow for conditional patterns to reference a group by its name, e where `y` will only be matched if `(?x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. 
-We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?('group1')y)` if they want a backreference condition. This more explicit syntax is supported by PCRE. +We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by PCRE. **TODO: Is the opposite more common?** ### `\N` @@ -742,6 +752,16 @@ We intend on matching the PCRE behavior where groups are numbered purely based o Many engines have different spellings for the same regex features, and as such we need to decide on a preferred canonical syntax. +### Unicode scalars + +### Character properties + +### Groups + +#### Named + +#### Lookaheads and lookbehinds + ### Backreferences There are a variety of backreference spellings accepted by different engines @@ -758,6 +778,12 @@ Backreference -> '\g{' NameOrNumberRef '}' We plan on choosing the canonical spelling **TODO: decide**. +### Subpattern + +### Conditional references + +### Callouts + [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html [oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE From bbb77569bbcc55d61f5c35242aad98a2bfc49786 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 22 Feb 2022 13:56:03 +0000 Subject: [PATCH 10/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index eaf5de239..5b8c0ccab 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -100,7 +100,7 @@ Atom -> Anchor | Callout | CharacterProperty | EscapeSequence - | NamedCharacter + | NamedScalar | Subpattern | UniScalar | '\K' @@ -266,7 +266,7 @@ HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] ``` -These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. Note that `\x` that is not followed by any hexadecimal digit characters is treated as `\0`, which matches PCRE's behavior. +These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. Note that `\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior. ### Escape sequences @@ -325,7 +325,7 @@ Custom characters classes introduce their own language, in which most regular ex - Builtin character classes except `.`, `\R`, `\O`, `\X`, `\C`, and `\N`. - Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary) - Unicode scalars -- Named characters +- Named scalars - Character properties - Plain literal characters @@ -378,7 +378,7 @@ Unicode properties consist of both a key and a value, e.g `General_Category=Whit There are some Unicode properties where the key or value may be inferred. These include: - General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. -- Script properties e.g `\p{Greek}` is inferred as `\p{Script=Greek}`. **TODO: Infer as `\p{scx=Greek}` instead?** +- Script properties e.g `\p{Greek}` is inferred as `\p{Script_Extensions=Greek}`. 
- Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`. - Block properties that begin with the prefix `in`, e.g `\p{inBasicLatin}` is inferred to be `\p{Block=Basic_Latin}`. @@ -391,17 +391,15 @@ For non-Unicode properties, only a value is required. These include: Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. -### Named characters +### Named scalars ``` -NamedCharacter -> '\N{' CharName '}' -CharName -> 'U+' HexDigit{1...8} | [\s\w-]+ +NamedScalar -> '\N{' ScalarName '}' +ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+ ``` Allows a specific Unicode scalar to be specified by name or hexadecimal code point. -**TODO: Should this be called "named scalar" or similar?** - ### Trivia ``` @@ -686,7 +684,7 @@ In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denote ### Whitespace in ranges -In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range, but will emit a warning telling users that we're doing so (**TODO: how would they silence? move to modern syntax?**). +In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range. ### Implicitly-scoped matching option scopes @@ -716,6 +714,12 @@ PCRE supports `\N` meaning "not a newline", however there are engines that treat ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We intend to support this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. +### Script properties + +Shorthand script property syntax e.g `\p{Latin}` is treated as `\p{Script=Latin}` by PCRE, ICU, Oniguruma, and Java. These use [the Unicode Script property][unicode-scripts], which assigns each scalar a particular script value. However, there are scalars that may appear in multiple scripts, e.g U+3003 DITTO MARK. These often get assigned to the `Common` script to reflect this fact, which is not particularly useful for matching purposes. To provide more fine-grained script matching, Unicode provides [the Script Extension property][unicode-script-extensions], which exposes the set of scripts that a scalar appears in. + +As such we feel that the more desirable default behavior of shorthand script property syntax e.g `\p{Latin}` is for it to be treated as `\p{Script_Extension=Latin}`. This matches Perl's default behavior. Plain script properties may still be written using the more explicit syntax e.g `\p{Script=Latin}` and `\p{sc=Latin}`. 
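A hedged sketch of the value-only inference rules discussed above, including the script-extensions choice just described. The sets below are tiny illustrative stand-ins for the real Unicode property tables, and the function is not the actual parser:

```swift
// Hypothetical, simplified inference for value-only properties like \p{Greek}.
let knownGeneralCategories: Set<String> = ["Whitespace", "Letter", "Lu"]
let knownScripts: Set<String> = ["Greek", "Latin", "Han"]

func inferProperty(_ name: String) -> (key: String, value: String) {
    if knownGeneralCategories.contains(name) {
        return ("General_Category", name)
    }
    if knownScripts.contains(name) {
        // Per the section above, shorthand scripts are treated as
        // Script_Extensions rather than plain Script.
        return ("Script_Extensions", name)
    }
    if name.hasPrefix("in") {
        // Block shorthand; underscore insertion into the block name omitted.
        return ("Block", String(name.dropFirst(2)))
    }
    // Otherwise assume a boolean property set to True.
    return (name, "True")
}

print(inferProperty("Greek"))       // (key: "Script_Extensions", value: "Greek")
print(inferProperty("Whitespace"))  // (key: "General_Category", value: "Whitespace")
print(inferProperty("Lowercase"))   // (key: "Lowercase", value: "True")
```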
+ ### Extended syntax modes Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this enabled through the `(?x)` and `(?xx)` matching options, where the former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. @@ -793,3 +797,5 @@ We plan on choosing the canonical spelling **TODO: decide**. [UAX44-LM3]: https://www.unicode.org/reports/tr44/#UAX44-LM3 [unicode-prop-key-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt [unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt +[unicode-scripts]: https://www.unicode.org/reports/tr24/#Script +[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions From fb7f38063aad472d079330e7aa6fec4c915834d8 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 22 Feb 2022 16:06:43 +0000 Subject: [PATCH 11/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 79 ++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 5b8c0ccab..5c974ce19 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -461,14 +461,14 @@ Branch reset groups can alter this numbering, as they reset the numbering in the ``` Backreference -> '\g{' NameOrNumberRef '}' | '\g' NumberRef - | '\k<' Identifier '>' - | "\k'" Identifier "'" + | '\k<' NameOrNumberRef '>' + | "\k'" NameOrNumberRef "'" | '\k{' Identifier '}' | '\' [1-9] [0-9]+ | '(?P=' Identifier ')' ``` -A backreference evaluates to the value last captured by the referenced capturing group. +A backreference evaluates to the value last captured by the referenced capturing group. If the referenced capture has not been evaluated yet, the match fails. #### Subpatterns @@ -483,7 +483,7 @@ GroupLikeSubpatternBody -> 'P>' | NumberRef ``` -A subpattern causes the referenced group to be re-evaluated at the current position. The syntax `(?R)` is equivalent to `(?0)`, and causes the entire pattern to be recursed. +A subpattern causes the referenced capture group to be re-evaluated at the current position. The syntax `(?R)` is equivalent to `(?0)`, and causes the entire pattern to be recursed. ### Conditionals @@ -754,39 +754,94 @@ We intend on matching the PCRE behavior where groups are numbered purely based o ## Canonical representations -Many engines have different spellings for the same regex features, and as such we need to decide on a preferred canonical syntax. +Many engines have different spellings for the same regex features, we intend to support parsing. However, for the purposes of e.g printing, we need to decide on a canonical syntax for various constructs. ### Unicode scalars +``` +UniScalar -> '\u{' HexDigit{1...} '}' + | '\u' HexDigit{4} + | '\x{' HexDigit{1...} '}' + | '\x' HexDigit{0...2} + | '\U' HexDigit{8} + | '\o{' OctalDigit{1...} '}' + | '\0' OctalDigit{0...3} + +HexDigit -> [0-9a-zA-Z] +OctalDigit -> [0-7] +``` + +For consistency with String escape syntax, we intend on canonicalizing to `\u{...}`. + ### Character properties +**TODO: Should we canonicalize on e.g `\p{Script_Extensions=Greek}`? Or prefer the shorthand where we can? 
Or just avoid canonicalizing?** + ### Groups #### Named +``` +NamedGroup -> 'P<' GroupNameBody '>' + | '<' GroupNameBody '>' + | "'" GroupNameBody "'" +``` + +We intend on canonicalizing to the `(?<...>)` spelling. + #### Lookaheads and lookbehinds -### Backreferences +We intend on canonicalizing to the short-form versions of these group kinds, e.g `(?=`. -There are a variety of backreference spellings accepted by different engines +### Backreferences ``` Backreference -> '\g{' NameOrNumberRef '}' | '\g' NumberRef - | '\k<' Identifier '>' - | "\k'" Identifier "'" + | '\k<' NameOrNumberRef '>' + | "\k'" NameOrNumberRef "'" | '\k{' Identifier '}' | '\' [1-9] [0-9]+ | '(?P=' Identifier ')' ``` -We plan on choosing the canonical spelling **TODO: decide**. +For absolute numeric references, we plan on choosing the canonical spelling `\DDD`, as it is unambiguous with octal sequences. For relative numbered references, as well as named references, we intend on canonicalizing to `\k<...>` to match the group name canonicalization `(?<...>)`. **TODO: How valuable is it to have canonical `\DDD`? Would it be better to just use `\k<...>` for everything?** + +### Subpatterns + +``` +Subpattern -> '\g<' NameOrNumberRef '>' + | "\g'" NameOrNumberRef "'" + | '(?' GroupLikeSubpatternBody ')' + +GroupLikeSubpatternBody -> 'P>' + | '&' + | 'R' + | NumberRef +``` -### Subpattern +We intend on canonicalizing to the `\g<...>` spelling. **TODO: For `(?R)` too?** ### Conditional references -### Callouts +**TODO: Decide** + +### PCRE Callouts + +``` +PCRECallout -> '(?C' CalloutBody ')' +PCRECalloutBody -> '' | + | '`' '`' + | "'" "'" + | '"' '"' + | '^' '^' + | '%' '%' + | '#' '#' + | '$' '$' + | '{' '}' +``` + +PCRE accepts a number of alternative delimiters for callout string arguments. We intend to canonicalize to `(?C"...")`. **TODO: May want to alter if we choose `r"..."`, though lexing should be able to handle it by looking for the `(?C` prefix**. [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html From 5a2517f9ad0d04d01c806b76aaac4f69c1ae9573 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 22 Feb 2022 21:16:05 +0000 Subject: [PATCH 12/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 5c974ce19..b02c4c043 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -14,7 +14,7 @@ This proposal-component focuses on the interior syntax, which is large enough fo ## Motivation -Swift aims to be a pragmatic programming language, balancing (TODO: prose). Rather than pursue a novel interior syntax, (TODO: prose). +Swift aims to be a pragmatic programming language, balancing (**TODO(Michael)**: prose). Rather than pursue a novel interior syntax, (**TODO(Michael)**: prose). Regex interior syntax is part of a larger [proposal](https://forums.swift.org/t/pitch-regular-expression-literals/52820), which in turn is part of a larger [string processing effort](https://forums.swift.org/t/declarative-string-processing-overview/52459). @@ -502,7 +502,6 @@ KnownCondition -> 'R' | 'DEFINE' | 'VERSION' VersionCheck | NumberRef - | NameRef PCREVersionCheck -> '>'? '=' PCREVersionNumber PCREVersionNumber -> '.' 
@@ -512,11 +511,12 @@ A conditional evaluates a particular condition, and chooses a branch to match ag A condition may be: -- A reference to a capture group, which checks whether the group matched successfully. +- A numeric or delimited named reference to a capture group, which checks whether the group matched successfully. - A recursion check on either a particular group or the entire regex. In the former case, this checks to see if the last recursive call is through that group. In the latter case, it checks if the match is currently taking place in any kind of recursive call. -- An arbitrary recursive regular expression, which is matched against, and evaluates to true if the match is successful. It may contain capture groups that add captures to the match. - A PCRE version check. +If the condition does not syntactically match any of the above, it is treated as an arbitrary recursive regular expression. This will be matched against, and evaluates to true if the match is successful. It may contain capture groups that add captures to the match. + The `DEFINE` keyword is not used as a condition, but rather a way in which to define a group which is not evaluated, but may be referenced by a subpattern. ### PCRE backtracking directives @@ -616,9 +616,9 @@ An absent function is an Oniguruma feature that allows for the easy inversion of ## Syntactic differences between engines -**TODO: Intro** +**TODO(Michael, if you want): Intro** -**TODO: Talk about compatibility modes for different engines being a possible future direction?** +**TODO(Michael, if you want): Talk about compatibility modes for different engines being a possible future direction?** ### Character class set operations @@ -696,15 +696,15 @@ We aim to support the PCRE behavior. ### Backreference condition kinds -PCRE and .NET allow for conditional patterns to reference a group by its name, e.g: +PCRE and .NET allow for conditional patterns to reference a group by its name without any form of delimiter, e.g: ``` (?x)?(?(group1)y) ``` -where `y` will only be matched if `(?x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. +where `y` will only be matched if `(?x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. Oniguruma on the other hand will always treat `group1` as an regex condition to match against. -We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by PCRE. **TODO: Is the opposite more common?** +We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma. 
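A hedged sketch of the disambiguation just described: an undelimited condition is parsed as an arbitrary regex condition, while numeric and explicitly delimited names are parsed as group references. The enum and function below are illustrative only, not the actual parser:

```swift
// Hypothetical parse of the text between "(?(" and ")" in a conditional.
enum Condition {
    case groupReference(String)
    case pattern(String)
}

func parseCondition(_ text: String) -> Condition {
    if !text.isEmpty, text.allSatisfy({ $0.isNumber }) {
        return .groupReference(text)                       // (?(1)...)
    }
    if (text.hasPrefix("<") && text.hasSuffix(">")) ||
       (text.hasPrefix("'") && text.hasSuffix("'")) {
        return .groupReference(String(text.dropFirst().dropLast()))
    }
    return .pattern(text)                                  // undelimited: regex condition
}

print(parseCondition("group1"))     // pattern("group1") — matched as a regex, with a warning
print(parseCondition("<group1>"))   // groupReference("group1")
```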
### `\N` @@ -716,7 +716,7 @@ ICU unifies the character property syntax `\p{...}` with the syntax for POSIX ch ### Script properties -Shorthand script property syntax e.g `\p{Latin}` is treated as `\p{Script=Latin}` by PCRE, ICU, Oniguruma, and Java. These use [the Unicode Script property][unicode-scripts], which assigns each scalar a particular script value. However, there are scalars that may appear in multiple scripts, e.g U+3003 DITTO MARK. These often get assigned to the `Common` script to reflect this fact, which is not particularly useful for matching purposes. To provide more fine-grained script matching, Unicode provides [the Script Extension property][unicode-script-extensions], which exposes the set of scripts that a scalar appears in. +Shorthand script property syntax e.g `\p{Latin}` is treated as `\p{Script=Latin}` by PCRE, ICU, Oniguruma, and Java. These use [the Unicode Script property][unicode-scripts], which assigns each scalar a particular script value. However, there are scalars that may appear in multiple scripts, e.g U+3003 DITTO MARK. These are often assigned to the `Common` script to reflect this fact, which is not particularly useful for matching purposes. To provide more fine-grained script matching, Unicode provides [the Script Extension property][unicode-script-extensions], which exposes the set of scripts that a scalar appears in. As such we feel that the more desirable default behavior of shorthand script property syntax e.g `\p{Latin}` is for it to be treated as `\p{Script_Extension=Latin}`. This matches Perl's default behavior. Plain script properties may still be written using the more explicit syntax e.g `\p{Script=Latin}` and `\p{sc=Latin}`. From ff717a3eb1a711f20bcac53a40fdf7c3a510fe07 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 23 Feb 2022 14:37:36 +0000 Subject: [PATCH 13/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 175 ++++++++++++++----------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index b02c4c043..df15c0d3d 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -1,5 +1,5 @@ # Regex Literal Interior Syntax @@ -62,7 +62,7 @@ Concatenation -> (!'|' !')' ConcatComponent)* ConcatComponent -> Trivia | Quote | Quantification ``` -Implicitly denoted by adjacent expressions, a concatenation matches against a sequence of regular expression patterns. This has a higher precedence than an alternation, so e.g `abc|def` matches against `abc` or `def`. The `ConcatComponent` token varies across engine, but at least matches some form of trivia, e.g comments, quoted sequences e.g `\Q...\E`, and a potentially quantified expression. +Implicitly denoted by adjacent expressions, a concatenation matches against a sequence of regular expression nodes. This has a higher precedence than an alternation, so e.g `abc|def` matches against `abc` or `def`. A concatenation may consist of potentially quantified expressions, trivia such as inline comments, and quoted sequences `\Q...\E`. ### Quantification @@ -76,19 +76,19 @@ Range -> ',' | ',' ? | QuantOperand -> AbsentFunction | Atom | Conditional | CustomCharClass | Group ``` -A quantification consists of an operand optionally followed by a quantifier that specifier how many times it may be matched. An operand without a quantifier is matched once. 
+A quantification consists of an operand optionally followed by a quantifier that specifies how many times it may be matched. An operand without a quantifier is matched once. The quantifiers supported are: -- `?`: 0 or 1 matches -- `*`: 0 or more matches -- `+`: 1 or more matches -- `{n,m}`: Between `n` and `m` (inclusive) matches -- `{n,}`: `n` or more matches -- `{,m}`: Up to `m` matches -- `{n}`: Exactly `n` matches +- `?`: 0 or 1 matches. +- `*`: 0 or more matches. +- `+`: 1 or more matches. +- `{n,m}`: Between `n` and `m` (inclusive) matches. +- `{n,}`: `n` or more matches. +- `{,m}`: Up to `m` matches. +- `{n}`: Exactly `n` matches. -A quantifier may optionally be followed by `?` or `+`, which adjust its semantics. If neither are specified, by default the quantification happens *eagerly*, meaning that it will try to maximize the number of matches made. However, if `?` is specified, quantification happens *reluctantly*, meaning that the number of matches will instead be minimized. If `+` is specified, *possessive* matching occurs, which is eager matching with the additional semantic that it may not be backtracked into to try a different number of matches. +A quantifier may optionally be followed by `?` or `+`, which adjusts its semantics. If neither are specified, by default the quantification happens *eagerly*, meaning that it will try to maximize the number of matches made. However, if `?` is specified, quantification happens *reluctantly*, meaning that the number of matches will instead be minimized. If `+` is specified, *possessive* matching occurs, which is eager matching with the additional semantic that it may not be backtracked into to try a different number of matches. ### Atom @@ -107,7 +107,7 @@ Atom -> Anchor | '\'? ``` -Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as meta-characters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A meta-character may be treated as literal by preceding it with a backslash. Other characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. +Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as metacharacters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. #### `\K` @@ -143,11 +143,7 @@ GroupNameBody -> Identifier | BalancingGroupBody Identifier -> [\w--\d] \w* ``` -Groups define a new scope within which a recursive regular expression pattern may occur. Groups have different semantics depending on how they are introduced. - -Groups may be named, the characters of which may be any letter or number characters or the character `_`. However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. - -Groups may be used to change the matching options present within their scope, see the *Matching options* section. 
+Groups define a new scope that contains a recursive regular expression pattern. Groups have different semantics depending on how they are introduced, the details of which are laid out in the following sections. Note there are additional constructs that may syntactically appear similar to groups, but are distinct. See the *group-like atoms* section. @@ -157,18 +153,24 @@ Note there are additional constructs that may syntactically appear similar to gr - `(?:)`: A non-capturing group. - `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See *group numbering*. +Capturing groups produce captures, which remember the range of input matched for the scope of that group. + +A capturing group may be named using any of the `NamedGroup` syntax. The characters of the group name may be any letter or number characters or the character `_`. However the name must not start with a number. This restriction follows the behavior of other regex engines and avoids ambiguities when it comes to named and numeric group references. + #### Atomic groups -An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This the same semantics as a possessive quantifier, but applies more generally to any regex pattern. +An atomic group e.g `(?>...)` specifies that its contents should not be re-evaluated for backtracking. This has the same semantics as a possessive quantifier, but applies more generally to any regex pattern. #### Lookahead and lookbehind -- `(?=`: A lookahead that attempts to match against the group body, but does not advance. -- `(?!`: A negative lookahead that ensures the group body does not match, and does not advance. -- `(?<=`: A lookbehind that attempts to match the group body against the input before the current position. Does not advance the input. -- `(?!<`: A negative lookbehind that ensures the group body does not match the input before the current position. Does not advance the input. +These groups evaluate the input ahead or behind the current matching position, without advancing the input. -These groups are all atomic, meaning that they will not be re-evaluated for backtracking. There are however also non-atomic variants: +- `(?=`: A lookahead, which matches against the input following the current matching position. +- `(?!`: A negative lookahead, which ensures a negative match against the input following the current matching position. +- `(?<=`: A lookbehind, which matches against the input prior to the current matching position. +- `(?!<`: A negative lookbehind, which ensures a negative match against the input prior to the current matching position. + +The above groups are all atomic, meaning that they will not be re-evaluated for backtracking. There are however also non-atomic variants: - `(?*`: A non-atomic lookahead. - `(?<*`: A non-atomic lookbehind. @@ -187,6 +189,26 @@ BalancingGroupBody -> Identifier? '-' Identifier Introduced by .NET, balancing groups extend the `GroupNameBody` syntax to support the ability to refer to a prior group. Upon matching, the prior group is deleted, and any intermediate matched input becomes the capture of the current group. +#### Group numbering + +Capturing groups are implicitly numbered according to the position of their opening `(` in the regex. For example: + +``` +(a((?:b)(?c)d)(e)f) +^ ^ ^ ^ +1 2 3 4 +``` + +Non-capturing groups are skipped over when counting. 
+ +Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child. Outside the alternation, numbering resumes at the next available number not used in one of the branches. For example: + +``` +(a()(?|(b)(c)|(?:d)|(e)))(f) +^ ^ ^ ^ ^ ^ +1 2 3 4 3 5 +``` + ### Matching options ``` @@ -199,14 +221,16 @@ MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | 'D' | ' A matching option sequence may be used as a group specifier, and denotes a change in matching options for the scope of that group. For example `(?x:a b c)` enables extended syntax for `a b c`. A matching option sequence may be part of an "isolated group" which has an implicit scope that wraps the remaining elements of the current group. For example, `(?x)a b c` also enables extended syntax for `a b c`. +If used in the branch of an alternation, an isolated group affects all the following branches of that alternation. For example, `a(?i)b|c|d` is treated as `a(?i:b)|(?i:c)|(?i:d)`. + We support all the matching options accepted by PCRE, ICU, and Oniguruma. In addition, we accept some matching options unique to our matching engine. #### PCRE options -- `i`: Case insensitive matching -- `J`: Allows multiple groups to share the same name, which is otherwise forbidden -- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string -- `n`: Disables capturing of `(...)` groups. Named capture groups must be used instead. +- `i`: Case insensitive matching. +- `J`: Allows multiple groups to share the same name, which is otherwise forbidden. +- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string. +- `n`: Disables the capturing behavior of `(...)` groups. Named capture groups must be used instead. - `s`: Changes `.` to match any character, including newlines. - `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy. - `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See the *trivia* section for more info. @@ -217,10 +241,10 @@ We support all the matching options accepted by PCRE, ICU, and Oniguruma. In add #### Oniguruma options -- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]` -- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]` -- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b` -- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`) +- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]`. +- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]`. +- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b`. +- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`). - `y{g}`, `y{w}`: Changes the meaning of `\X`, `\y`, `\Y`. These are mutually exclusive options, with `y{g}` specifying extended grapheme cluster mode, and `y{w}` specifying word mode. #### Swift options @@ -266,7 +290,7 @@ HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] ``` -These sequences define a unicode scalar value to be matched against. There is both syntax for specifying the scalar value in hex notation, as well as octal notation. Note that `\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior. 
+These sequences define a unicode scalar value to be matched against. There is syntax for both specifying the scalar value in hex notation, as well as octal notation. Note that `\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior. ### Escape sequences @@ -274,11 +298,11 @@ These sequences define a unicode scalar value to be matched against. There is bo EscapeSequence -> '\a' | '\b' | '\c' | '\e' | '\f' | '\n' | '\r' | '\t' ``` -These escape sequences denote a specific character. +These escape sequences each denote a specific scalar value. - `\a`: The alert (bell) character `U+7`. - `\b`: The backspace character `U+8`. Note this may only be used in a custom character class, otherwise it represents a word boundary. -- `\c `: A control character sequence (`U+00` - `U+7F`). +- `\c `: A control character sequence, which denotes a scalar from `U+00` - `U+7F` depending on the ASCII character provided. - `\e`: The escape character `U+1B`. - `\f`: The form-feed character `U+C`. - `\n`: The newline character `U+A`. @@ -291,22 +315,22 @@ These escape sequences denote a specific character. BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X' ``` -- `.`: Any character excluding newlines -- `\C`: A single UTF code unit -- `\d`: Digit character -- `\D`: Non-digit character -- `\h`: Horizontal space character -- `\H`: Non-horizontal-space character -- `\N`: Non-newline character +- `.`: Any character excluding newlines. +- `\C`: A single UTF code unit. +- `\d`: Digit character. +- `\D`: Non-digit character. +- `\h`: Horizontal space character. +- `\H`: Non-horizontal-space character. +- `\N`: Non-newline character. - `\O`: Any character (including newlines). This is syntax from Oniguruma. -- `\R`: Newline sequence -- `\s`: Whitespace character -- `\S`: Non-whitespace character -- `\v`: Vertical space character -- `\V`: Non-vertical-space character -- `\w`: Word character -- `\W`: Non-word character -- `\X`: Any extended grapheme cluster +- `\R`: Newline sequence. +- `\s`: Whitespace character. +- `\S`: Non-whitespace character. +- `\v`: Vertical space character. +- `\V`: Non-vertical-space character. +- `\w`: Word character. +- `\W`: Non-word character. +- `\X`: Any extended grapheme cluster. ### Custom character classes @@ -322,26 +346,26 @@ SetOp -> '&&' | '--' | '~~' | '-' Custom characters classes introduce their own language, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only a few atoms are considered valid: -- Builtin character classes except `.`, `\R`, `\O`, `\X`, `\C`, and `\N`. -- Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary) -- Unicode scalars -- Named scalars -- Character properties -- Plain literal characters +- Builtin character classes, except for `.`, `\R`, `\O`, `\X`, `\C`, and `\N`. +- Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary). +- Unicode scalars. +- Named scalars. +- Character properties. +- Plain literal characters. Atoms may be used to compose other character class members, including ranges, quoted sequences, and even nested custom character classes `[[ab]c\d]`. Adjacent members form an implicit union of character classes, e.g `[[ab]c\d]` is the union of the characters `a`, `b`, `c`, and digit characters. 
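+
+As an illustrative sketch (this specific pattern is not from the proposal text), a custom character class may freely mix these member kinds:
+
+```
+[a-f\p{Greek}_]
+```
+
+This is the implicit union of the range `a`-`f`, the characters matching the `Greek` script property, and the literal character `_`.
+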
Custom character classes may not be empty, e.g `[]` is forbidden. A custom character class may begin with the `]` character, in which case it is treated as literal, e.g `[]a]` is the custom character class of `]` and `a`. -Quoted sequences may be used to escape the contained characters, e.g `[\Q]\E]` is the character class of the literal character `[`. +Quoted sequences may be used to escape the contained characters, e.g `[a\Q]\E]` is also the character class of `[` and `a`. Ranges of characters may be specified with `-`, e.g `[a-z]` matches against the letters from `a` to `z`. Only unicode scalars and literal characters are valid range operands. If `-` cannot be used to form a range, it is interpreted as literal, e.g `[-a-]` is the character class of `-` and `a`. `[a-c-d]` is the character class of `a`...`c`, `-`, and `d`. Operators may be used to apply set operations to character class members. The operators supported are: -- `&&`: Intersection of the LHS and RHS -- `--`: Subtraction of the RHS from the LHS -- `~~`: Symmetric difference of the RHS and LHS +- `&&`: Intersection of the LHS and RHS. +- `--`: Subtraction of the RHS from the LHS. +- `~~`: Symmetric difference of the RHS and LHS. - `-`: .NET's spelling of subtracting the RHS from the LHS. These operators have a lower precedence than the implicit union of members, e.g `[ac-d&&a[d]]` is an intersection of the character classes `[ac-d]` and `[ad]`. @@ -436,26 +460,6 @@ RecursionLevel -> '+' | '-' A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. -#### Group numbering - -Capturing groups are implicitly numbered according to the position of their opening `(` in the regex. For example: - -``` -(a((?:b)(?c)d)(e)f) -^ ^ ^ ^ -1 2 3 4 -``` - -Non-capturing groups are skipped over when counting. - -Branch reset groups can alter this numbering, as they reset the numbering in the branches of an alternation child. Outside the alternation, numbering resumes at the next available number not used in one of the branches. For example: - -``` -(a()(?|(b)(c)|(?:d)|(e)))(f) -^ ^ ^ ^ ^ ^ -1 2 3 4 3 5 -``` - #### Backreferences ``` @@ -824,7 +828,18 @@ We intend on canonicalizing to the `\g<...>` spelling. **TODO: For `(?R)` too?** ### Conditional references -**TODO: Decide** +``` +KnownCondition -> 'R' + | 'R' NumberRef + | 'R&' !')' + | '<' NameRef '>' + | "'" NameRef "'" + | 'DEFINE' + | 'VERSION' VersionCheck + | NumberRef +``` + +For named references in a group condition, there is a choice between `(?('name'))` and `(?())`. We intend on canonicalizing to `(?())` to match the group name canonicalization. 
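+
+For example, the first spelling below would be canonicalized to the second (the group name `opt` and its branches are purely illustrative):
+
+```
+(?('opt')x|y)
+(?(<opt>)x|y)
+```
+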
### PCRE Callouts From 74acaa8d9afedbda1223ddb71da77e50647bce1d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 23 Feb 2022 15:04:13 +0000 Subject: [PATCH 14/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index df15c0d3d..9844c5379 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -620,9 +620,9 @@ An absent function is an Oniguruma feature that allows for the easy inversion of ## Syntactic differences between engines -**TODO(Michael, if you want): Intro** +The above regular expression grammar covers a superset of the syntax accepted by PCRE, ICU, Oniguruma, .NET, and Java. However there are cases where the same syntax is parsed differently by these engines. This section provides a summary of these differences, and specifies the interpretation that will be made by the Swift regex parser. -**TODO(Michael, if you want): Talk about compatibility modes for different engines being a possible future direction?** +These differences inherently mean that our default parser behavior cannot be fully compatible with these other engines. However, this would not preclude the potential future implementation of different compatibility modes for different engines in which we support their parsing behavior of certain syntax. ### Character class set operations From b83c4e77a5f7bf2bd210854bb5deef0a1d6fb73e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 23 Feb 2022 15:18:41 +0000 Subject: [PATCH 15/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 9844c5379..5ff5aa53c 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -620,9 +620,11 @@ An absent function is an Oniguruma feature that allows for the easy inversion of ## Syntactic differences between engines -The above regular expression grammar covers a superset of the syntax accepted by PCRE, ICU, Oniguruma, .NET, and Java. However there are cases where the same syntax is parsed differently by these engines. This section provides a summary of these differences, and specifies the interpretation that will be made by the Swift regex parser. +The proposed "syntactic superset" introduces some minor ambiguities, as each engine supports a slightly different set of features. When a particular engine's parser sees a feature it doesn't support, it typically has a fall-back behavior, such as treating the unknown feature as literal contents. -These differences inherently mean that our default parser behavior cannot be fully compatible with these other engines. However, this would not preclude the potential future implementation of different compatibility modes for different engines in which we support their parsing behavior of certain syntax. +Explicit compatibility modes, i.e. precisely mimicking emergent behavior from a specific engine's parser, is deferred as future work from this proposal. Conversion from this "syntactic superset" to a particular engine's syntax (e.g. as an AST "pretty printer") is deferred as future work from this proposal. + +Below is an exhaustive treatment of every syntactic ambiguity we have encountered. 
### Character class set operations @@ -632,6 +634,7 @@ In a custom character class, some engines allow for binary set operations that t |------|-----|--------|-----------|------|------| | ❌ | Intersection `&&`, Subtraction `--` | Intersection, Subtraction | Intersection `&&` | Subtraction via `-` | Intersection `&&` | + [UTS#18][uts18] requires intersection and subtraction, and uses the operation spellings `&&` and `--` in its examples, though it doesn't mandate a particular spelling. In particular, conforming implementations could spell the subtraction `[[x]--[y]]` as `[[x]&&[^y]]`. UTS#18 also suggests a symmetric difference operator `~~`, and uses an explicit `||` operator in examples, though doesn't require either. The differing support between engines is conflicting, as engines that don't support a particular operator treat them as literal, e.g `[x&&y]` in PCRE is the character class of `["x", "&", "y"]` rather than an intersection. @@ -650,6 +653,7 @@ This allows e.g `[[a]b[c]]`, which is interpreted the same as `[abc]`. It also a |------|-----|--------|-----------|------|------| | ❌ | ✅ | 💡 | ✅ | ❌ | ✅ | + UTS#18 doesn't require this, though it does suggest it as a way to clarify precedence for chains of character class set operations e.g `[\w--\d&&\s]`, which the user could write as `[[\w--\d]&&\s]`. PCRE does not support this feature, and as such treats `]` as the closing character of the custom character class. Therefore `[[a]b[c]]` is interpreted as the character class `["[", "a"]`, followed by literal `b`, and then the character class `["c"]`, followed by literal `]`. From 31c5cf5f69fbcc351adac86f8f1eccfc7739d02b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 23 Feb 2022 16:53:13 +0000 Subject: [PATCH 16/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 5ff5aa53c..9cc536dc4 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -637,13 +637,13 @@ In a custom character class, some engines allow for binary set operations that t [UTS#18][uts18] requires intersection and subtraction, and uses the operation spellings `&&` and `--` in its examples, though it doesn't mandate a particular spelling. In particular, conforming implementations could spell the subtraction `[[x]--[y]]` as `[[x]&&[^y]]`. UTS#18 also suggests a symmetric difference operator `~~`, and uses an explicit `||` operator in examples, though doesn't require either. -The differing support between engines is conflicting, as engines that don't support a particular operator treat them as literal, e.g `[x&&y]` in PCRE is the character class of `["x", "&", "y"]` rather than an intersection. +Engines that don't support a particular operator fallback to treating it as literal, e.g `[x&&y]` in PCRE is the character class of `["x", "&", "y"]` rather than an intersection. -Another conflict arises with .NET's support of using the `-` character in a custom character class to denote both a range as well as a set subtraction. .NET disambiguates this by only permitting its use as a subtraction if the right hand operand is a nested custom character class, otherwise it is a range. This conflicts with e.g ICU where `[x-[y]]`, in which the `-` is treated as literal. +Unlike other engines, .NET supports the use of `-` to denote both a range as well as a set subtraction. 
.NET disambiguates this by only permitting its use as a subtraction if the right hand operand is a nested custom character class, otherwise it is a range operator. This conflicts with e.g ICU where `[x-[y]]`, in which the `-` is treated as literal. We intend to support the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant. -In the interests of compatibility, we also intend on supporting the `-` operator, though we likely want to emit a warning and encourage users to switch to `--`. +In the interests of compatibility, we also intend on supporting the `-` operator, though we will likely want to emit a warning and encourage users to switch to `--`. ### Nested custom character classes @@ -660,6 +660,8 @@ PCRE does not support this feature, and as such treats `]` as the closing charac .NET does not support nested character classes in general, although allows them as the right-hand side of a subtraction operation. +We intend on permitting nested custom character classes. + ### `\U` In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` matches literal `U`. However in ICU, `\Uhhhhhhhh` matches a hex sequence. We intend on following the ICU behavior. @@ -672,9 +674,9 @@ This quantifier is supported by Oniguruma, but in PCRE it matches the literal ch This syntax is implemented in a variety of different ways depending on the engine. In ICU and Java, it is always a backreference unless prefixed with `0`, in which case it is an octal sequence. -In PCRE, Oniguruma, and .NET, it is also always an octal sequence if prefixed with `0`, however there are also other conditions where it may be treated as octal. These conditions vary slightly been the engines. In PCRE, it will be treated as backreference if any of the following hold: +In PCRE, Oniguruma, and .NET, it is also always an octal sequence if prefixed with `0`, however there are other cases where it may be treated as octal. These cases vary slightly between the engines. In PCRE, it will be treated as backreference if any of the following hold: -- Its `0 < n < 10`. +- Its value is `0 < n < 10`. - Its first digit is `8` or `9`. - Its value corresponds to a valid *prior* group number. @@ -684,7 +686,7 @@ Oniguruma follows all of these except the second. If the first digit is `8` or ` We intend to implement a simpler behavior more inline with ICU and Java. A `\DDD` sequence that does not start with a `0` will be treated as a backreference, otherwise it will be treated as an octal sequence. If an invalid backreference is formed with this syntax, we will suggest prefixing with a `0` if an octal sequence is desired. -One further difference between engines exists with this syntax in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. We intend to follow the ICU behavior, as it is necessary when requiring a `0` prefix. +One further difference exists between engines in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. 
We intend to follow the ICU behavior, as it is necessary when requiring a `0` prefix. ### `\x` @@ -692,7 +694,7 @@ In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denote ### Whitespace in ranges -In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range. +In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces e.g `x{2, 4}`, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range. ### Implicitly-scoped matching option scopes @@ -700,7 +702,7 @@ PCRE and Oniguruma both support changing the active matching options through an These sound similar, but have different semantics around alternations, e.g for `a(?i)b|c|d`, in Oniguruma this becomes `a(?i:b|c|d)`, where `a` is no longer part of the alternation. However in PCRE it becomes `a(?i:b)|(?i:c)|(?i:d)`, where `a` remains a child of the alternation. -We aim to support the PCRE behavior. +We intend on matching the PCRE behavior. ### Backreference condition kinds @@ -730,7 +732,7 @@ As such we feel that the more desirable default behavior of shorthand script pro ### Extended syntax modes -Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this enabled through the `(?x)` and `(?xx)` matching options, where the former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. +Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this is enabled through the `(?x)` and `(?xx)` matching options, where the former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore intend to follow this behavior, with `(?x)` and `(?xx)` being treated the same. @@ -744,7 +746,7 @@ Different regex engines also have different rules around what characters are con - Line separator `U+2028` - Paragraph separator `U+2029` -This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we intend to support exactly the characters in this list for the purposes of non-semantic whitespace. +This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. 
Therefore we intend to support exactly the characters in this list for the purposes of non-semantic whitespace parsing. ### Group numbering From 1b123856b6605a0fc2d7c57a389e0f8eb79673ac Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 25 Feb 2022 15:58:44 +0000 Subject: [PATCH 17/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 93 +++++++++++++++----------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 9cc536dc4..1238e9e51 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -36,35 +36,47 @@ Note that there are minor syntactic incompatibilities and ambiguities involved i ## Detailed Design -We're proposing the following regular expression syntactic superset for Swift. +We propose the following syntax for use inside Swift regex literals. + +*TODO:* Disclosure triangle explaining the grammar conventions? ### Top-level regular expression ``` Regex -> GlobalMatchingOptionSequence? RegexNode RegexNode -> '' | Alternation +Alternation -> Concatenation ('|' Concatenation)* +Concatenation -> (!'|' !')' ConcatComponent)* ``` -A top-level regular expression may consist of a sequence of global matching options followed by a `RegexNode`, which is the recursive part of the grammar that may be nested within e.g a group. A regex node may be empty, which is the null pattern that always matches, but does not advance the input. - -### Alternation +A regex literal may be prefixed with a sequence of global matching options(*TODO*: intra-doc link). A literal's contents can be empty or a sequence of alternatives separated by `|`. -``` -Alternation -> Concatenation ('|' Concatenation)* -``` +Alternatives are a series of expressions concatenated together. The concatentation ends with either a `|` denoting the end of the alternative or a `)` denoting the end of a recursively parsed group. -The `|` operator denotes what is formally called an alternation, or a choice between alternatives. Any number of alternatives may appear, including empty alternatives. This operator has the lowest precedence of all operators in a regex literal. +Alternation has a lower precedence than concatenation or other operations, so e.g `abc|def` matches against `abc` or `def`.. -### Concatenation +### Concatenated subexpressions ``` -Concatenation -> (!'|' !')' ConcatComponent)* ConcatComponent -> Trivia | Quote | Quantification + +Trivia -> Comment | NonSemanticWhitespace +Comment -> '(?#' (!')')* ')' | EndOfLineComment + +(extended syntax only) EndOfLineComment -> '#' .*$ +(extended syntax only) NonSemanticWhitespace -> \s+ + +Quote -> '\Q' (!'\E' .)* '\E' + ``` -Implicitly denoted by adjacent expressions, a concatenation matches against a sequence of regular expression nodes. This has a higher precedence than an alternation, so e.g `abc|def` matches against `abc` or `def`. A concatenation may consist of potentially quantified expressions, trivia such as inline comments, and quoted sequences `\Q...\E`. +Each component of a concatenation may be "trivia" (comments and non-semantic whitespace, if applicable), a quoted run of literal content, or a potentially-quantified subexpression. + +In-line comments, similarly to C, are lexical and are not recursively nested like normal groups are. A closing `)` cannot be escaped. Quotes are similarly lexical, non-nested, and the `\` before a `\E` cannot be escaped. 
-### Quantification +For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. `\Q\\E` is a literal `\`. + +### Quantified subexpressions ``` Quantification -> QuantOperand Quantifier? @@ -453,23 +465,26 @@ The backslash character is also treated as literal within a quoted sequence, and ### References ``` -NamedRef -> Identifier -NumberRef -> ('+' | '-')? RecursionLevel? -RecursionLevel -> '+' | '-' +NamedOrNumberRef -> NamedRef | NumberRef +NamedRef -> Identifier RecursionLevel? +NumberRef -> ('+' | '-')? RecursionLevel? +RecursionLevel -> '+' | '-' ``` A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. +A backreference may optionally include a recursion level in certain cases, which is a syntactic element inherited from Oniguruma that allows the reference to specify a capture relative to a given recursion level. + #### Backreferences ``` -Backreference -> '\g{' NameOrNumberRef '}' +Backreference -> '\g{' NamedOrNumberRef '}' | '\g' NumberRef - | '\k<' NameOrNumberRef '>' - | "\k'" NameOrNumberRef "'" - | '\k{' Identifier '}' + | '\k<' NamedOrNumberRef '>' + | "\k'" NamedOrNumberRef "'" + | '\k{' NamedRef '}' | '\' [1-9] [0-9]+ - | '(?P=' Identifier ')' + | '(?P=' NamedRef ')' ``` A backreference evaluates to the value last captured by the referenced capturing group. If the referenced capture has not been evaluated yet, the match fails. @@ -477,12 +492,12 @@ A backreference evaluates to the value last captured by the referenced capturing #### Subpatterns ``` -Subpattern -> '\g<' NameOrNumberRef '>' - | "\g'" NameOrNumberRef "'" +Subpattern -> '\g<' NamedOrNumberRef '>' + | "\g'" NamedOrNumberRef "'" | '(?' GroupLikeSubpatternBody ')' -GroupLikeSubpatternBody -> 'P>' - | '&' +GroupLikeSubpatternBody -> 'P>' NamedRef + | '&' NamedRef | 'R' | NumberRef ``` @@ -500,9 +515,9 @@ GroupConditionalStart -> '(?' GroupStart KnownCondition -> 'R' | 'R' NumberRef - | 'R&' !')' - | '<' NameRef '>' - | "'" NameRef "'" + | 'R&' NamedRef + | '<' NamedOrNumberRef '>' + | "'" NamedOrNumberRef "'" | 'DEFINE' | 'VERSION' VersionCheck | NumberRef @@ -806,13 +821,13 @@ We intend on canonicalizing to the short-form versions of these group kinds, e.g ### Backreferences ``` -Backreference -> '\g{' NameOrNumberRef '}' +Backreference -> '\g{' NamedOrNumberRef '}' | '\g' NumberRef - | '\k<' NameOrNumberRef '>' - | "\k'" NameOrNumberRef "'" - | '\k{' Identifier '}' + | '\k<' NamedOrNumberRef '>' + | "\k'" NamedOrNumberRef "'" + | '\k{' NamedRef '}' | '\' [1-9] [0-9]+ - | '(?P=' Identifier ')' + | '(?P=' NamedRef ')' ``` For absolute numeric references, we plan on choosing the canonical spelling `\DDD`, as it is unambiguous with octal sequences. For relative numbered references, as well as named references, we intend on canonicalizing to `\k<...>` to match the group name canonicalization `(?<...>)`. **TODO: How valuable is it to have canonical `\DDD`? 
Would it be better to just use `\k<...>` for everything?** @@ -820,12 +835,12 @@ For absolute numeric references, we plan on choosing the canonical spelling `\DD ### Subpatterns ``` -Subpattern -> '\g<' NameOrNumberRef '>' - | "\g'" NameOrNumberRef "'" +Subpattern -> '\g<' NamedOrNumberRef '>' + | "\g'" NamedOrNumberRef "'" | '(?' GroupLikeSubpatternBody ')' -GroupLikeSubpatternBody -> 'P>' - | '&' +GroupLikeSubpatternBody -> 'P>' NamedRef + | '&' NamedRef | 'R' | NumberRef ``` @@ -837,9 +852,9 @@ We intend on canonicalizing to the `\g<...>` spelling. **TODO: For `(?R)` too?** ``` KnownCondition -> 'R' | 'R' NumberRef - | 'R&' !')' - | '<' NameRef '>' - | "'" NameRef "'" + | 'R&' NamedRef + | '<' NamedOrNumberRef '>' + | "'" NamedOrNumberRef "'" | 'DEFINE' | 'VERSION' VersionCheck | NumberRef From 3f24f5bcb31717d99511c203dbf6bdb7eab033de Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 24 Feb 2022 15:52:00 -0700 Subject: [PATCH 18/51] Internalize declarations Lots of ME stuff was annotated with vestigial public --- Sources/Prototypes/PEG/PEGCode.swift | 2 +- Sources/Prototypes/PEG/PEGCompile.swift | 2 +- Sources/Prototypes/PEG/PEGCore.swift | 2 +- Sources/Prototypes/PEG/PEGTranspile.swift | 3 +- Sources/Prototypes/PEG/PEGVM.swift | 3 +- Sources/Prototypes/PEG/Printing.swift | 2 +- Sources/_StringProcessing/Compiler.swift | 2 +- .../_StringProcessing/Engine/Consume.swift | 4 +- Sources/_StringProcessing/Engine/Engine.swift | 8 +- .../Engine/Instruction.swift | 3 +- .../_StringProcessing/Engine/MEBuilder.swift | 124 +++++++------- .../_StringProcessing/Engine/MECapture.swift | 2 +- .../_StringProcessing/Engine/MEProgram.swift | 12 +- .../_StringProcessing/Engine/Processor.swift | 2 +- .../_StringProcessing/Engine/Tracing.swift | 2 +- Sources/_StringProcessing/Executor.swift | 14 +- Sources/_StringProcessing/RegexDSL/DSL.swift | 4 +- .../_StringProcessing/RegexDSL/DSLTree.swift | 2 +- .../_StringProcessing/Unicode/Decoding.swift | 20 +-- .../Unicode/NecessaryEvils.swift | 2 +- .../_StringProcessing/Utility/Protocols.swift | 12 +- .../_StringProcessing/Utility/Traced.swift | 24 +-- .../Utility/TypedIndex.swift | 59 +++---- .../_StringProcessing/Utility/TypedInt.swift | 159 ++++++++---------- 24 files changed, 215 insertions(+), 254 deletions(-) diff --git a/Sources/Prototypes/PEG/PEGCode.swift b/Sources/Prototypes/PEG/PEGCode.swift index c33f5759c..b12c5bab6 100644 --- a/Sources/Prototypes/PEG/PEGCode.swift +++ b/Sources/Prototypes/PEG/PEGCode.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { struct Code { diff --git a/Sources/Prototypes/PEG/PEGCompile.swift b/Sources/Prototypes/PEG/PEGCompile.swift index 0592cf6a9..0e1b89233 100644 --- a/Sources/Prototypes/PEG/PEGCompile.swift +++ b/Sources/Prototypes/PEG/PEGCompile.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { typealias InIndex = Input.Index diff --git a/Sources/Prototypes/PEG/PEGCore.swift b/Sources/Prototypes/PEG/PEGCore.swift index b831cbd0f..5c66dc25a 100644 --- a/Sources/Prototypes/PEG/PEGCore.swift +++ b/Sources/Prototypes/PEG/PEGCore.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing let emitComments = true struct PEGCore< diff --git 
a/Sources/Prototypes/PEG/PEGTranspile.swift b/Sources/Prototypes/PEG/PEGTranspile.swift index df75cea63..84e220d52 100644 --- a/Sources/Prototypes/PEG/PEGTranspile.swift +++ b/Sources/Prototypes/PEG/PEGTranspile.swift @@ -9,8 +9,7 @@ // //===----------------------------------------------------------------------===// -import _MatchingEngine -import _StringProcessing +@testable import _StringProcessing extension PEG.VM where Input == String { typealias MEProg = MEProgram diff --git a/Sources/Prototypes/PEG/PEGVM.swift b/Sources/Prototypes/PEG/PEGVM.swift index a987b581d..4cf91a5c1 100644 --- a/Sources/Prototypes/PEG/PEGVM.swift +++ b/Sources/Prototypes/PEG/PEGVM.swift @@ -9,7 +9,8 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing + +@testable import _StringProcessing extension PEG { diff --git a/Sources/Prototypes/PEG/Printing.swift b/Sources/Prototypes/PEG/Printing.swift index 978250761..be60e72f5 100644 --- a/Sources/Prototypes/PEG/Printing.swift +++ b/Sources/Prototypes/PEG/Printing.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEGCore.Instruction: InstructionProtocol { var operandPC: InstructionAddress? { self.pc } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5099e187f..1d72a8d27 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -35,7 +35,7 @@ class Compiler { } } -public func _compileRegex( +func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { let ast = try parse(regex, syntax) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index 52f752539..cfb803de8 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,13 +25,13 @@ extension Engine { } extension Engine where Input == String { - public func consume( + func consume( _ input: Input ) -> (Input.Index, CaptureList)? { consume(input, in: input.startIndex ..< input.endIndex) } - public func consume( + func consume( _ input: Input, in range: Range, matchMode: MatchMode = .partialFromFront diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 6c9c2efa5..86952c8b7 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -11,7 +11,7 @@ // Currently, engine binds the type and consume binds an instance. // But, we can play around with this. -public struct Engine where Input.Element: Hashable { +struct Engine where Input.Element: Hashable { var program: MEProgram @@ -24,7 +24,7 @@ public struct Engine where Input.Element: Hashab set { program.enableTracing = newValue } } - public init( + init( _ program: MEProgram, enableTracing: Bool? = nil ) { @@ -36,10 +36,10 @@ public struct Engine where Input.Element: Hashab } } -public struct AsyncEngine { /* ... */ } +struct AsyncEngine { /* ... 
*/ } extension Engine: CustomStringConvertible { - public var description: String { + var description: String { // TODO: better description return program.description } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index fcc257302..ff28ee9e2 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -299,8 +299,7 @@ extension Instruction { internal var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } -// TODO: internal after compiler moves in -public var _payloadMask: UInt64 { ~_opcodeMask } +var _payloadMask: UInt64 { ~_opcodeMask } extension Instruction { var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index d81c583a8..78171a001 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -12,7 +12,7 @@ import _MatchingEngine // For errors extension MEProgram where Input.Element: Hashable { - public struct Builder { + struct Builder { var instructions: [Instruction] = [] var elements = TypedSetVector() @@ -50,7 +50,7 @@ extension MEProgram where Input.Element: Hashable { nextCaptureRegister.rawValue } - public init() {} + init() {} } } @@ -71,7 +71,7 @@ extension MEProgram.Builder { // TODO: We want a better strategy for fixups, leaving // the operand in a different form isn't great... - public init(staticElements: S) where S.Element == Input.Element { + init(staticElements: S) where S.Element == Input.Element { staticElements.forEach { elements.store($0) } } @@ -79,21 +79,21 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } - public mutating func buildNop(_ r: StringRegister? = nil) { + mutating func buildNop(_ r: StringRegister? 
= nil) { instructions.append(.init(.nop, .init(optionalString: r))) } - public mutating func buildNop(_ s: String) { + mutating func buildNop(_ s: String) { buildNop(strings.store(s)) } - public mutating func buildDecrement( + mutating func buildDecrement( _ i: IntRegister, nowZero: BoolRegister ) { instructions.append(.init( .decrement, .init(bool: nowZero, int: i))) } - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { instructions.append(.init( @@ -101,25 +101,25 @@ extension MEProgram.Builder { } // TODO: generic - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: Int, into: IntRegister ) { let uint = UInt64(asserting: value) buildMoveImmediate(uint, into: into) } - public mutating func buildMoveCurrentPosition( + mutating func buildMoveCurrentPosition( into: PositionRegister ) { instructions.append(.init( .movePosition, .init(position: into))) } - public mutating func buildBranch(to t: AddressToken) { + mutating func buildBranch(to t: AddressToken) { instructions.append(.init(.branch)) fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( _ condition: BoolRegister, to t: AddressToken ) { instructions.append( @@ -127,7 +127,7 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( to t: AddressToken, ifZeroElseDecrement i: IntRegister ) { instructions.append( @@ -135,56 +135,56 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildSave(_ t: AddressToken) { + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) } - public mutating func buildSaveAddress(_ t: AddressToken) { + mutating func buildSaveAddress(_ t: AddressToken) { instructions.append(.init(.saveAddress)) fixup(to: t) } - public mutating func buildSplit( + mutating func buildSplit( to: AddressToken, saving: AddressToken ) { instructions.append(.init(.splitSaving)) fixup(to: (to, saving)) } - public mutating func buildClear() { + mutating func buildClear() { instructions.append(.init(.clear)) } - public mutating func buildRestore() { + mutating func buildRestore() { instructions.append(.init(.restore)) } - public mutating func buildFail() { + mutating func buildFail() { instructions.append(.init(.fail)) } - public mutating func buildCall(_ t: AddressToken) { + mutating func buildCall(_ t: AddressToken) { instructions.append(.init(.call)) fixup(to: t) } - public mutating func buildRet() { + mutating func buildRet() { instructions.append(.init(.ret)) } - public mutating func buildAbort(_ s: StringRegister? = nil) { + mutating func buildAbort(_ s: StringRegister? 
= nil) { instructions.append(.init( .abort, .init(optionalString: s))) } - public mutating func buildAbort(_ s: String) { + mutating func buildAbort(_ s: String) { buildAbort(strings.store(s)) } - public mutating func buildAdvance(_ n: Distance) { + mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - public mutating func buildMatch(_ e: Input.Element) { + mutating func buildMatch(_ e: Input.Element) { instructions.append(.init( .match, .init(element: elements.store(e)))) } - public mutating func buildMatchSequence( + mutating func buildMatchSequence( _ s: S ) where S.Element == Input.Element { instructions.append(.init( @@ -192,7 +192,7 @@ extension MEProgram.Builder { .init(sequence: sequences.store(.init(s))))) } - public mutating func buildMatchSlice( + mutating func buildMatchSlice( lower: PositionRegister, upper: PositionRegister ) { instructions.append(.init( @@ -200,50 +200,50 @@ extension MEProgram.Builder { .init(pos: lower, pos2: upper))) } - public mutating func buildConsume( + mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction ) { instructions.append(.init( .consumeBy, .init(consumer: makeConsumeFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( by p: @escaping MEProgram.AssertionFunction ) { instructions.append(.init( .assertBy, .init(assertion: makeAssertionFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( _ e: Input.Element, into cond: BoolRegister ) { instructions.append(.init(.assertion, .init( element: elements.store(e), bool: cond))) } - public mutating func buildAccept() { + mutating func buildAccept() { instructions.append(.init(.accept)) } - public mutating func buildPrint(_ s: StringRegister) { + mutating func buildPrint(_ s: StringRegister) { instructions.append(.init(.print, .init(string: s))) } - public mutating func buildBeginCapture( + mutating func buildBeginCapture( _ cap: CaptureRegister ) { instructions.append( .init(.beginCapture, .init(capture: cap))) } - public mutating func buildEndCapture( + mutating func buildEndCapture( _ cap: CaptureRegister ) { instructions.append( .init(.endCapture, .init(capture: cap))) } - public mutating func buildTransformCapture( + mutating func buildTransformCapture( _ cap: CaptureRegister, _ trans: TransformRegister ) { instructions.append(.init( @@ -251,7 +251,7 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } - public mutating func buildMatcher( + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { instructions.append(.init( @@ -259,7 +259,7 @@ extension MEProgram.Builder { .init(matcher: fun, value: reg))) } - public mutating func buildMove( + mutating func buildMove( _ value: ValueRegister, into capture: CaptureRegister ) { instructions.append(.init( @@ -267,21 +267,21 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } - public mutating func buildBackreference( + mutating func buildBackreference( _ cap: CaptureRegister ) { instructions.append( .init(.backreference, .init(capture: cap))) } - public mutating func buildUnresolvedReference(id: ReferenceID) { + mutating func buildUnresolvedReference(id: ReferenceID) { buildBackreference(.init(0)) unresolvedReferences[id, default: []].append(lastInstructionAddress) } // TODO: Mutating because of fail address fixup, drop when // that's removed - public mutating func assemble() throws -> MEProgram { + mutating func assemble() throws -> MEProgram { try 
resolveReferences() // TODO: This will add a fail instruction at the end every @@ -356,22 +356,22 @@ extension MEProgram.Builder { referencedCaptureOffsets: referencedCaptureOffsets) } - public mutating func reset() { self = Self() } + mutating func reset() { self = Self() } } // Address-agnostic interfaces for label-like support extension MEProgram.Builder { - public enum _AddressToken {} - public typealias AddressToken = TypedInt<_AddressToken> + enum _AddressToken {} + typealias AddressToken = TypedInt<_AddressToken> - public mutating func makeAddress() -> AddressToken { + mutating func makeAddress() -> AddressToken { defer { addressTokens.append(nil) } return AddressToken(addressTokens.count) } // Resolves the address token to the most recently added // instruction, updating prior and future address references - public mutating func resolve(_ t: AddressToken) { + mutating func resolve(_ t: AddressToken) { assert(!instructions.isEmpty) addressTokens[t.rawValue] = @@ -380,7 +380,7 @@ extension MEProgram.Builder { // Resolves the address token to the next instruction (one past the most // recently added one), updating prior and future address references. - public mutating func label(_ t: AddressToken) { + mutating func label(_ t: AddressToken) { addressTokens[t.rawValue] = InstructionAddress(instructions.count) } @@ -388,7 +388,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided token, ensuring it is fixed up during // assembly - public mutating func fixup(to t: AddressToken) { + mutating func fixup(to t: AddressToken) { assert(!instructions.isEmpty) addressFixups.append( (InstructionAddress(instructions.endIndex-1), .init(t))) @@ -397,7 +397,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided tokens, ensuring it is fixed up during // assembly - public mutating func fixup( + mutating func fixup( to ts: (AddressToken, AddressToken) ) { assert(!instructions.isEmpty) @@ -412,7 +412,7 @@ extension MEProgram.Builder { // // This is useful for possessive quantification that needs some initial save // point to "ratchet" upon a successful match. - public mutating func pushEmptySavePoint() { + mutating func pushEmptySavePoint() { if failAddressToken == nil { failAddressToken = makeAddress() } @@ -438,7 +438,7 @@ fileprivate extension MEProgram.Builder { // Register helpers extension MEProgram.Builder { - public mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { + mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { defer { nextCaptureRegister.rawValue += 1 } // Register the capture for later lookup via symbolic references. 
if let id = id { @@ -449,25 +449,25 @@ extension MEProgram.Builder { return nextCaptureRegister } - public mutating func makeBoolRegister() -> BoolRegister { + mutating func makeBoolRegister() -> BoolRegister { defer { nextBoolRegister.rawValue += 1 } return nextBoolRegister } - public mutating func makeIntRegister() -> IntRegister { + mutating func makeIntRegister() -> IntRegister { defer { nextIntRegister.rawValue += 1 } return nextIntRegister } - public mutating func makePositionRegister() -> PositionRegister { + mutating func makePositionRegister() -> PositionRegister { defer { nextPositionRegister.rawValue += 1 } return nextPositionRegister } - public mutating func makeValueRegister() -> ValueRegister { + mutating func makeValueRegister() -> ValueRegister { defer { nextValueRegister.rawValue += 1 } return nextValueRegister } // Allocate and initialize a register - public mutating func makeIntRegister( + mutating func makeIntRegister( initialValue: Int ) -> IntRegister { let r = makeIntRegister() @@ -476,7 +476,7 @@ extension MEProgram.Builder { } // Allocate and initialize a register - public mutating func makePositionRegister( + mutating func makePositionRegister( initializingWithCurrentPosition: () ) -> PositionRegister { let r = makePositionRegister() @@ -485,17 +485,17 @@ extension MEProgram.Builder { } // 'kill' or release allocated registers - public mutating func kill(_ r: IntRegister) { + mutating func kill(_ r: IntRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: BoolRegister) { + mutating func kill(_ r: BoolRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: PositionRegister) { + mutating func kill(_ r: PositionRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") @@ -504,25 +504,25 @@ extension MEProgram.Builder { // TODO: A register-mapping helper struct, which could release // registers without monotonicity required - public mutating func makeConsumeFunction( + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - public mutating func makeAssertionFunction( + mutating func makeAssertionFunction( _ f: @escaping MEProgram.AssertionFunction ) -> AssertionFunctionRegister { defer { assertionFunctions.append(f) } return AssertionFunctionRegister(assertionFunctions.count) } - public mutating func makeTransformFunction( + mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { defer { transformFunctions.append(f) } return TransformRegister(transformFunctions.count) } - public mutating func makeMatcherFunction( + mutating func makeMatcherFunction( _ f: @escaping MEProgram.MatcherFunction ) -> MatcherRegister { defer { matcherFunctions.append(f) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 88f912ecb..bac632e9e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } } -public struct CaptureList { +struct CaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] diff --git 
a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d616657e8..1e58ddf54 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -11,13 +11,13 @@ import _MatchingEngine -public struct MEProgram where Input.Element: Equatable { - public typealias ConsumeFunction = (Input, Range) -> Input.Index? - public typealias AssertionFunction = +struct MEProgram where Input.Element: Equatable { + typealias ConsumeFunction = (Input, Range) -> Input.Index? + typealias AssertionFunction = (Input, Input.Index, Range) -> Bool - public typealias TransformFunction = + typealias TransformFunction = (Input, Range) -> Any? - public typealias MatcherFunction = + typealias MatcherFunction = (Input, Input.Index, Range) -> (Input.Index, Any)? var instructions: InstructionList @@ -39,7 +39,7 @@ public struct MEProgram where Input.Element: Equatable { } extension MEProgram: CustomStringConvertible { - public var description: String { + var description: String { var result = """ Elements: \(staticElements) Strings: \(staticStrings) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 10c3eb781..343b02c92 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -public enum MatchMode { +enum MatchMode { case wholeString case partialFromFront } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 7db740f52..24d00d3d7 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -15,7 +15,7 @@ extension Processor: TracedProcessor { var currentPC: InstructionAddress { controller.pc } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e066a4369..5c098c5c5 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,8 +11,7 @@ import _MatchingEngine - // FIXME: Public for prototype -public struct Executor { +struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine @@ -20,9 +19,8 @@ public struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - // FIXME: Public for prototype - public struct Result { - public var range: Range + struct Result { + var range: Range var captures: [StructuredCapture] var referencedCaptureOffsets: [ReferenceID: Int] @@ -44,7 +42,7 @@ public struct Executor { } } - public func execute( + func execute( input: String, in range: Range, mode: MatchMode = .wholeString @@ -65,7 +63,7 @@ public struct Executor { fatalError(String(describing: error)) } } - public func execute( + func execute( input: Substring, mode: MatchMode = .wholeString ) -> Result? 
{ @@ -75,7 +73,7 @@ public struct Executor { mode: mode) } - public func executeFlat( + func executeFlat( input: String, in range: Range, mode: MatchMode = .wholeString diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 35a4ccb5e..17f006231 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -187,9 +187,7 @@ public func choiceOf( // MARK: - Backreference - -// FIXME: Public for prototypes. -public struct ReferenceID: Hashable, Equatable { +struct ReferenceID: Hashable, Equatable { private static var counter: Int = 0 var base: Int diff --git a/Sources/_StringProcessing/RegexDSL/DSLTree.swift b/Sources/_StringProcessing/RegexDSL/DSLTree.swift index a44220925..43f8aa62f 100644 --- a/Sources/_StringProcessing/RegexDSL/DSLTree.swift +++ b/Sources/_StringProcessing/RegexDSL/DSLTree.swift @@ -249,7 +249,7 @@ extension DSLTree { } } extension DSLTree.Node { - public func _captureStructure( + func _captureStructure( _ constructor: inout CaptureStructure.Constructor ) -> CaptureStructure { switch self { diff --git a/Sources/_StringProcessing/Unicode/Decoding.swift b/Sources/_StringProcessing/Unicode/Decoding.swift index 49eb1f794..68c14f6c1 100644 --- a/Sources/_StringProcessing/Unicode/Decoding.swift +++ b/Sources/_StringProcessing/Unicode/Decoding.swift @@ -33,13 +33,13 @@ enum UnsafeAssumingValidUTF8 { @inlinable @inline(__always) - public static func decode(_ x: UInt8) -> Unicode.Scalar { + static func decode(_ x: UInt8) -> Unicode.Scalar { _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 2) @@ -50,7 +50,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 3) @@ -63,7 +63,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 4) @@ -80,7 +80,7 @@ enum UnsafeAssumingValidUTF8 { // Also, assuming we can load from those bounds... 
@inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, startingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let cu0 = utf8[_unchecked: i] @@ -103,7 +103,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let len = scalarLength(utf8, endingAt: i) @@ -113,7 +113,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength(_ x: UInt8) -> Int { + static func scalarLength(_ x: UInt8) -> Int { _internalInvariant(!UTF8.isContinuation(x)) if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen @@ -121,7 +121,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength( + static func scalarLength( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> Int { var len = 1 @@ -133,12 +133,12 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func continuationPayload(_ x: UInt8) -> UInt32 { + static func continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } @inlinable - public static func scalarAlign( + static func scalarAlign( _ utf8: UnsafeByteBuffer, _ idx: Int ) -> Int { guard _fastPath(idx != utf8.count) else { return idx } diff --git a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift index ef846c14e..a9ae24429 100644 --- a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift +++ b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift @@ -40,7 +40,7 @@ extension Optional { } // Don't use UnsafeRawBufferPointer for anything important -public struct UnsafeByteBuffer { +struct UnsafeByteBuffer { var pointer: UnsafeRawPointer var count: Int diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 9c196c18c..7542a17dd 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -13,11 +13,11 @@ // These currently only drive tracing/formatting, but could drive // more -public protocol InstructionProtocol { +protocol InstructionProtocol { var operandPC: InstructionAddress? 
{ get } } -public protocol ProcessorProtocol { +protocol ProcessorProtocol { associatedtype Input: Collection associatedtype Instruction: InstructionProtocol associatedtype SavePoint = () @@ -45,12 +45,12 @@ public protocol ProcessorProtocol { } extension ProcessorProtocol { - public func fetch() -> Instruction { + func fetch() -> Instruction { instructions[currentPC] } - public var callStack: Array { [] } -// public var savePoints: Array { [] } - public var registers: Array { [] } + var callStack: Array { [] } +// var savePoints: Array { [] } + var registers: Array { [] } } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index c270aba23..5ae7cd245 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -12,11 +12,11 @@ // TODO: Place shared formatting and trace infrastructure here -public protocol Traced { +protocol Traced { var isTracingEnabled: Bool { get set } } -public protocol TracedProcessor: ProcessorProtocol, Traced { +protocol TracedProcessor: ProcessorProtocol, Traced { // Empty defaulted func formatCallStack() -> String // empty default func formatSavePoints() -> String // empty default @@ -36,7 +36,7 @@ func lineNumber(_ pc: InstructionAddress) -> String { } extension TracedProcessor where Registers: Collection{ - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if !registers.isEmpty { return "\(registers)\n" @@ -48,19 +48,19 @@ extension TracedProcessor where Registers: Collection{ extension TracedProcessor { func printTrace() { print(formatTrace()) } - public func trace() { + func trace() { if isTracingEnabled { printTrace() } } // Helpers for the conformers - public func formatCallStack() -> String { + func formatCallStack() -> String { if !callStack.isEmpty { return "call stack: \(callStack)\n" } return "" } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { @@ -71,7 +71,7 @@ extension TracedProcessor { return "" } - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if Registers.self == E.self { return "" @@ -79,7 +79,7 @@ extension TracedProcessor { return "\(registers)\n" } - public func formatInput() -> String { + func formatInput() -> String { // String override for printing sub-character information. 
if !input.indices.contains(currentPosition) { // Format unicode scalars as: @@ -115,7 +115,7 @@ extension TracedProcessor { """ } - public func formatInstructionWindow( + func formatInstructionWindow( windowSize: Int = 12 ) -> String { if isAcceptState { return "ACCEPT" } @@ -139,7 +139,7 @@ extension TracedProcessor { return result } - public func formatTrace() -> String { + func formatTrace() -> String { var result = "\n--- cycle \(cycleCount) ---\n" result += formatCallStack() result += formatSavePoints() @@ -150,7 +150,7 @@ extension TracedProcessor { return result } - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, depth: Int = 5 ) -> String { @@ -160,7 +160,7 @@ extension TracedProcessor { } extension Collection where Element: InstructionProtocol, Index == InstructionAddress { - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, atCurrent: Bool, depth: Int diff --git a/Sources/_StringProcessing/Utility/TypedIndex.swift b/Sources/_StringProcessing/Utility/TypedIndex.swift index 3bddcadfd..adde06a3e 100644 --- a/Sources/_StringProcessing/Utility/TypedIndex.swift +++ b/Sources/_StringProcessing/Utility/TypedIndex.swift @@ -12,55 +12,43 @@ /// Forwarding wrapper around Int-index collections that provide a /// strongly (phantom) typed index. -@frozen -public struct TypedIndex: RawRepresentable where C.Index == Int { - @_alwaysEmitIntoClient - public var rawValue: C +struct TypedIndex: RawRepresentable where C.Index == Int { + var rawValue: C - @_alwaysEmitIntoClient - public init(rawValue: C) { self.rawValue = rawValue } + init(rawValue: C) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: C) { self.init(rawValue: rawValue) } + init(_ rawValue: C) { self.init(rawValue: rawValue) } } extension TypedIndex: Collection { - public typealias Index = TypedInt<👻> - public typealias Element = C.Element + typealias Index = TypedInt<👻> + typealias Element = C.Element - @_alwaysEmitIntoClient - public var startIndex: Index { Index(rawValue.startIndex) } + var startIndex: Index { Index(rawValue.startIndex) } - @_alwaysEmitIntoClient - public var endIndex: Index { Index(rawValue.endIndex )} + var endIndex: Index { Index(rawValue.endIndex )} - @_alwaysEmitIntoClient - public var count: Int { rawValue.count } + var count: Int { rawValue.count } - @_alwaysEmitIntoClient - public func index(after: Index) -> Index { + func index(after: Index) -> Index { Index(rawValue.index(after: after.rawValue)) } - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { rawValue[position.rawValue] } - @_alwaysEmitIntoClient - public func distance( + func distance( from start: Index, to end: Index ) -> Int { rawValue.distance(from: start.rawValue, to: end.rawValue) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int) -> Index { + func index(_ i: Index, offsetBy distance: Int) -> Index { Index(rawValue.index(i.rawValue, offsetBy: distance)) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? { + func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? 
{ guard let idx = rawValue.index(i.rawValue, offsetBy: distance, limitedBy: limit.rawValue) else { return nil } @@ -71,8 +59,7 @@ extension TypedIndex: Collection { extension TypedIndex: RandomAccessCollection where C: RandomAccessCollection { } extension TypedIndex: MutableCollection where C: MutableCollection { - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { _read { yield rawValue[position.rawValue] } @@ -82,8 +69,7 @@ extension TypedIndex: MutableCollection where C: MutableCollection { } } extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { - @_alwaysEmitIntoClient - public func index(before: Index) -> Index { + func index(before: Index) -> Index { Index(rawValue.index(before: before.rawValue)) } } @@ -92,11 +78,9 @@ extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { // failure in the Swift repo. #if false extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init() { rawValue = C() } + init() { rawValue = C() } - @_alwaysEmitIntoClient - public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { + mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { let rawRange = subrange.lowerBound.rawValue ..< subrange.upperBound.rawValue rawValue.replaceSubrange(rawRange, with: newElements) } @@ -107,14 +91,13 @@ extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollec // Workaround for #73 extension TypedIndex where C: RangeReplaceableCollection { - public mutating func append(_ newElement: Element) { + mutating func append(_ newElement: Element) { rawValue.append(newElement) } } extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiteral & RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init(arrayLiteral elements: Element...) { + init(arrayLiteral elements: Element...) { // TODO: any way around the RRC copying init? self.init(C(elements)) } @@ -122,5 +105,5 @@ extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiter // MARK: - Strongly typed wrappers -public typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> +typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index caff7f64e..249717b68 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -11,86 +11,71 @@ // Just a phantom-typed Int wrapper. 
-@frozen -public struct TypedInt<👻>: RawRepresentable, Hashable { - @_alwaysEmitIntoClient - public var rawValue: Int +struct TypedInt<👻>: RawRepresentable, Hashable { + var rawValue: Int - @_alwaysEmitIntoClient - public init(rawValue: Int) { + init(rawValue: Int) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: Int) { + init(_ rawValue: Int) { self.init(rawValue: rawValue) } - @_alwaysEmitIntoClient - public init(_ uint: UInt64) { + init(_ uint: UInt64) { assert(uint.leadingZeroBitCount > 0) self.init(Int(asserting: uint)) } } extension TypedInt: Comparable { - @_alwaysEmitIntoClient - public static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { + static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { return lhs.rawValue < rhs.rawValue } } extension TypedInt: CustomStringConvertible { - @_alwaysEmitIntoClient - public var description: String { return "#\(rawValue)" } + var description: String { return "#\(rawValue)" } } extension TypedInt: ExpressibleByIntegerLiteral { - @_alwaysEmitIntoClient - public init(integerLiteral value: Int) { + init(integerLiteral value: Int) { self.init(rawValue: value) } } -public protocol TypedIntProtocol { +protocol TypedIntProtocol { associatedtype 👻 } extension TypedInt: TypedIntProtocol { } // A placeholder type for when we must supply a type. // When the phantom type appears, it says boo -public enum _Boo {} +enum _Boo {} // Easier for clients to just have their own typealias -public typealias TypedInt_ = TypedInt +typealias TypedInt_ = TypedInt // TODO: BinaryInteger, etc. extension TypedInt { - @_alwaysEmitIntoClient - public static func +(lhs: TypedInt, rhs: Int) -> TypedInt { + static func +(lhs: TypedInt, rhs: Int) -> TypedInt { return TypedInt(lhs.rawValue + rhs) } - @_alwaysEmitIntoClient - public var bits: UInt64 { + var bits: UInt64 { UInt64(asserting: self.rawValue) } } -@frozen -public struct TypedSetVector { - public typealias Idx = TypedInt<👻> +struct TypedSetVector { + typealias Idx = TypedInt<👻> // TODO: Replace with real set vector - @_alwaysEmitIntoClient - public var lookup: Dictionary = [:] + var lookup: Dictionary = [:] - @_alwaysEmitIntoClient - public var stored: Array = [] + var stored: Array = [] - @_alwaysEmitIntoClient - public func load(_ idx: Idx) -> Element { stored[idx.rawValue] } + func load(_ idx: Idx) -> Element { stored[idx.rawValue] } - @_alwaysEmitIntoClient @discardableResult - public mutating func store(_ e: Element) -> Idx { + mutating func store(_ e: Element) -> Idx { if let reg = lookup[e] { return reg } let reg = Idx(stored.count) stored.append(e) @@ -98,34 +83,32 @@ public struct TypedSetVector { return reg } - @_alwaysEmitIntoClient - public var count: Int { stored.count } + var count: Int { stored.count } - @_alwaysEmitIntoClient - public init() {} + init() {} } // MARK: - Strongly typed int wrappers /// A distance in the Input, e.g. `n` in consume(n) -public typealias Distance = TypedInt<_Distance> -public enum _Distance {} +typealias Distance = TypedInt<_Distance> +enum _Distance {} /// An instruction address, i.e. the index into our instruction list -public typealias InstructionAddress = TypedInt<_InstructionAddress> -public enum _InstructionAddress {} +typealias InstructionAddress = TypedInt<_InstructionAddress> +enum _InstructionAddress {} /// A position in the call stack, i.e. 
for save point restores -public typealias CallStackAddress = TypedInt<_CallStackAddress> -public enum _CallStackAddress {} +typealias CallStackAddress = TypedInt<_CallStackAddress> +enum _CallStackAddress {} /// A position in a position stack, i.e. for NFA simulation -public typealias PositionStackAddress = TypedInt<_PositionStackAddress> -public enum _PositionStackAddress {} +typealias PositionStackAddress = TypedInt<_PositionStackAddress> +enum _PositionStackAddress {} /// A position in the save point stack, i.e. for backtracking -public typealias SavePointStackAddress = TypedInt<_SavePointAddress> -public enum _SavePointAddress {} +typealias SavePointStackAddress = TypedInt<_SavePointAddress> +enum _SavePointAddress {} // MARK: - Registers @@ -135,85 +118,85 @@ public enum _SavePointAddress {} /// NOTE: Currently just used for static data, but e.g. could be /// used to save the most recently seen element satisfying some /// property -public typealias ElementRegister = TypedInt<_ElementRegister> -public enum _ElementRegister {} +typealias ElementRegister = TypedInt<_ElementRegister> +enum _ElementRegister {} -public typealias SequenceRegister = TypedInt<_SequenceRegister> -public enum _SequenceRegister {} +typealias SequenceRegister = TypedInt<_SequenceRegister> +enum _SequenceRegister {} /// The register number for a stored boolean value /// /// E.g. used for conditional branches -public typealias BoolRegister = TypedInt<_BoolRegister> -public enum _BoolRegister {} +typealias BoolRegister = TypedInt<_BoolRegister> +enum _BoolRegister {} /// The register number for a string (e.g. comment, failure reason) -public typealias StringRegister = TypedInt<_StringRegister> -public enum _StringRegister {} +typealias StringRegister = TypedInt<_StringRegister> +enum _StringRegister {} /// Used for consume functions, e.g. character classes -public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> -public enum _ConsumeFunctionRegister {} +typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> +enum _ConsumeFunctionRegister {} /// Used for assertion functions, e.g. 
anchors etc -public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -public enum _AssertionFunctionRegister {} +typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> +enum _AssertionFunctionRegister {} /// Used for capture transforms, etc -public typealias TransformRegister = TypedInt<_TransformRegister> -public enum _TransformRegister {} +typealias TransformRegister = TypedInt<_TransformRegister> +enum _TransformRegister {} /// Used for value-producing matchers -public typealias MatcherRegister = TypedInt<_MatcherRegister> -public enum _MatcherRegister {} +typealias MatcherRegister = TypedInt<_MatcherRegister> +enum _MatcherRegister {} /// UNIMPLEMENTED -public typealias IntRegister = TypedInt<_IntRegister> -public enum _IntRegister {} +typealias IntRegister = TypedInt<_IntRegister> +enum _IntRegister {} /// UNIMPLEMENTED -public typealias FloatRegister = TypedInt<_FloatRegister> -public enum _FloatRegister {} +typealias FloatRegister = TypedInt<_FloatRegister> +enum _FloatRegister {} /// UNIMPLEMENTED /// /// NOTE: This, along with a position stack, might /// serve NFA-simulation style execution models -public typealias PositionRegister = TypedInt<_PositionRegister> -public enum _PositionRegister {} +typealias PositionRegister = TypedInt<_PositionRegister> +enum _PositionRegister {} -public typealias ValueRegister = TypedInt<_ValueRegister> -public enum _ValueRegister {} +typealias ValueRegister = TypedInt<_ValueRegister> +enum _ValueRegister {} -public typealias CaptureRegister = TypedInt<_CaptureRegister> -public enum _CaptureRegister {} +typealias CaptureRegister = TypedInt<_CaptureRegister> +enum _CaptureRegister {} /// UNIMPLEMENTED -public typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> -public enum _InstructionAddressRegister {} +typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> +enum _InstructionAddressRegister {} /// UNIMPLEMENTED -public typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> -public enum _CallStackAddressRegister {} +typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> +enum _CallStackAddressRegister {} /// UNIMPLEMENTED -public typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> -public enum _PositionStackAddressRegister {} +typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> +enum _PositionStackAddressRegister {} /// UNIMPLEMENTED -public typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> -public enum _SavePointAddressRegister {} +typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> +enum _SavePointAddressRegister {} /// A numbered label -public typealias LabelId = TypedInt<_LabelId> -public enum _LabelId {} +typealias LabelId = TypedInt<_LabelId> +enum _LabelId {} /// A numbered function -public typealias FunctionId = TypedInt<_FunctionId> -public enum _FunctionId {} +typealias FunctionId = TypedInt<_FunctionId> +enum _FunctionId {} /// A numbered capture -public typealias CaptureId = TypedInt<_CaptureId> -public enum _CaptureId {} +typealias CaptureId = TypedInt<_CaptureId> +enum _CaptureId {} From 75d49319abf7120c34eab3d443c102e7d957e91a Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 22 Feb 2022 07:41:03 -0700 Subject: [PATCH 19/51] Initial custom-component test infrastructure --- Tests/RegexTests/CustomTests.swift | 82 ++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 28 deletions(-) diff 
--git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 73a50b108..692ad6c37 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -37,39 +37,65 @@ private struct Asciibbler: Nibbler { } } -extension RegexTests { - - // TODO: Refactor below into more exhaustive, declarative - // tests. - func testMatchingConsumers() { - - let regex = Regex { - Numbler() - Asciibbler() - } +enum MatchCall { + case match + case firstMatch +} - guard let result = "4t".match(regex) else { - XCTFail() - return +func customTest( + _ regex: Regex, + _ tests: (input: String, call: MatchCall, match: Match?)... +) { + for (input, call, match) in tests { + let result: Match? + switch call { + case .match: + result = input.match(regex)?.match + case .firstMatch: + result = input.firstMatch(of: regex)?.result } - XCTAssert(result.match == "4t") + XCTAssertEqual(result, match) + } +} - XCTAssertNil("4".match(regex)) - XCTAssertNil("t".match(regex)) - XCTAssertNil("t4".match(regex)) +extension RegexTests { - let regex2 = Regex { - oneOrMore { + // TODO: Refactor below into more exhaustive, declarative + // tests. + func testCustomRegexComponents() { + customTest( + Regex { Numbler() - } - } - - guard let res2 = "ab123c".firstMatch(of: regex2) else { - XCTFail() - return - } - - XCTAssertEqual(res2.match, "123") + Asciibbler() + }, + ("4t", .match, "4t"), + ("4", .match, nil), + ("t", .match, nil), + ("t x1y z", .firstMatch, "1y"), + ("t4", .match, nil)) + + customTest( + Regex { + oneOrMore { Numbler() } + }, + ("ab123c", .firstMatch, "123"), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, "55")) + + // FIXME: Requires we return a value instead of a range +// customTest( +// Regex { +// Numbler() +// }, +// ("ab123c", .firstMatch, 1), +// ("abc", .firstMatch, nil), +// ("55z", .match, nil), +// ("55z", .firstMatch, 5)) + + // TODO: Convert below tests to better infra. Right now + // it's hard because `Match` is constrained to be + // `Equatable` which tuples cannot be. let regex3 = Regex { capture { From 34bd8df9a32ddae538cf980d581bfb06ceb0387e Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 25 Feb 2022 17:45:44 -0700 Subject: [PATCH 20/51] Trim prose slightly and reorganize --- Documentation/Evolution/RegexSyntax.md | 409 ++++++++++++------------- 1 file changed, 193 insertions(+), 216 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 1238e9e51..fcb67c819 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -74,7 +74,7 @@ Each component of a concatenation may be "trivia" (comments and non-semantic whi In-line comments, similarly to C, are lexical and are not recursively nested like normal groups are. A closing `)` cannot be escaped. Quotes are similarly lexical, non-nested, and the `\` before a `\E` cannot be escaped. -For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. `\Q\\E` is a literal `\`. +For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. `\Q\\E` is a literal `\`. ### Quantified subexpressions @@ -88,42 +88,173 @@ Range -> ',' | ',' ? | QuantOperand -> AbsentFunction | Atom | Conditional | CustomCharClass | Group ``` -A quantification consists of an operand optionally followed by a quantifier that specifies how many times it may be matched. 
An operand without a quantifier is matched once. +Subexpressions can be quantified, meaning they will be repeated some number of times: -The quantifiers supported are: +- `?`: 0 or 1 times. +- `*`: 0 or more times. +- `+`: 1 or more times. +- `{n,m}`: Between `n` and `m` (inclusive) times. +- `{n,}`: `n` or more times. +- `{,m}`: Up to `m` times. +- `{n}`: Exactly `n` times. -- `?`: 0 or 1 matches. -- `*`: 0 or more matches. -- `+`: 1 or more matches. -- `{n,m}`: Between `n` and `m` (inclusive) matches. -- `{n,}`: `n` or more matches. -- `{,m}`: Up to `m` matches. -- `{n}`: Exactly `n` matches. +Behavior can further be refined by a subsequent `?` or `+`: -A quantifier may optionally be followed by `?` or `+`, which adjusts its semantics. If neither are specified, by default the quantification happens *eagerly*, meaning that it will try to maximize the number of matches made. However, if `?` is specified, quantification happens *reluctantly*, meaning that the number of matches will instead be minimized. If `+` is specified, *possessive* matching occurs, which is eager matching with the additional semantic that it may not be backtracked into to try a different number of matches. +- `x*` _eager_: consume as much of input as possible +- `x*?` _reluctant_: consume as little of the input as possible +- `x*+`: _possessive_: eager and never relinquishes any input consumed -### Atom +### Atoms ``` Atom -> Anchor | Backreference | BacktrackingDirective - | BuiltinCharClass + | BuiltinCharacterClass | Callout | CharacterProperty | EscapeSequence | NamedScalar | Subpattern - | UniScalar + | UnicodeScalar | '\K' | '\'? ``` -Atoms are the smallest units of regular expression syntax. They include escape sequences e.g `\b`, `\d`, as well as metacharacters such as `.` and `$`. They also include some larger syntactic constructs such as backreferences and callouts. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. +Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. + +#### Anchors + +``` +Anchor -> '^' | '$' | '\A' | '\b' | '\B' | '\G' | '\y' | '\Y' | '\z' | '\Z' +``` + +Anchors match against a certain position in the input rather than on a particular character of the input. + +- `^`: Matches at the start of a line. +- `$`: Matches at the end of a line. +- `\A`: Matches at the very start of the input string. +- `\Z`: Matches at the very end of the input string, in addition to before a newline at the very end of the input string. +- `\z`: Like `\Z`, but only matches at the very end of the input string. +- `\G`: Like `\A`, but also matches against the start position of where matching resumes in global matching mode (e.g `\Gab` matches twice in `abab`, `\Aab` would only match once). +- `\b` matches a boundary between a word character and a non-word character. The definitions of which vary depending on matching engine. +- `\B` matches a non-word-boundary. 
+- `\y` matches a text segment boundary, the definition of which varies based on the `y{w}` and `y{g}` matching option.
+- `\Y` matches a non-text-segment-boundary.
+
+#### Escape sequences
+
+```
+EscapeSequence -> '\a' | '\b' | '\c' | '\e' | '\f' | '\n' | '\r' | '\t'
+```
+
+These escape sequences each denote a specific scalar value.
+
+- `\a`: The alert (bell) character `U+7`.
+- `\b`: The backspace character `U+8`. Note this may only be used in a custom character class, otherwise it represents a word boundary.
+- `\c `: A control character sequence, which denotes a scalar from `U+00` - `U+7F` depending on the ASCII character provided.
+- `\e`: The escape character `U+1B`.
+- `\f`: The form-feed character `U+C`.
+- `\n`: The newline character `U+A`.
+- `\r`: The carriage return character `U+D`.
+- `\t`: The tab character `U+9`.
+
+#### Builtin character classes
+
+```
+BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X'
+```
+
+- `.`: Any character excluding newlines.
+- `\C`: A single UTF code unit.
+- `\d`: Digit character.
+- `\D`: Non-digit character.
+- `\h`: Horizontal space character.
+- `\H`: Non-horizontal-space character.
+- `\N`: Non-newline character.
+- `\O`: Any character (including newlines). This is syntax from Oniguruma.
+- `\R`: Newline sequence.
+- `\s`: Whitespace character.
+- `\S`: Non-whitespace character.
+- `\v`: Vertical space character.
+- `\V`: Non-vertical-space character.
+- `\w`: Word character.
+- `\W`: Non-word character.
+- `\X`: Any extended grapheme cluster.
+
+Precise definitions of character classes are discussed in [Character Classes for String Processing](https://forums.swift.org/t/pitch-character-classes-for-string-processing/52920).
+
+#### Unicode scalars
+
+```
+UnicodeScalar -> '\u{' HexDigit{1...} '}'
+              | '\u' HexDigit{4}
+              | '\x{' HexDigit{1...} '}'
+              | '\x' HexDigit{0...2}
+              | '\U' HexDigit{8}
+              | '\o{' OctalDigit{1...} '}'
+              | '\0' OctalDigit{0...3}
+
+HexDigit -> [0-9a-fA-F]
+OctalDigit -> [0-7]
+
+NamedScalar -> '\N{' ScalarName '}'
+ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+
+```
+
+These sequences define a Unicode scalar value using hexadecimal or octal notation.
+
+`\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior.
+
+`\N{...}` allows a specific Unicode scalar to be specified by name or hexadecimal code point.
+
+#### Character properties
+
+```
+CharacterProperty -> '\' ('p' | 'P') '{' PropertyContents '}'
+POSIXCharacterProperty -> '[:' PropertyContents ':]'
+
+PropertyContents -> PropertyName ('=' PropertyName)?
+PropertyName -> [\s\w-]+
+```
+
+A character property specifies a particular Unicode, POSIX, or PCRE property to match against. We propose supporting:
+
+- The full range of Unicode character properties.
+- The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties).
+- The UTS#18 special properties `any`, `assigned`, `ascii`.
+- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`.
+- The special Java property `javaLowerCase`.
+
+We follow [UTS#18][uts18]'s guidance for character properties, including fuzzy matching for property name parsing, according to rules set out by [UAX44-LM3].
The following property names are equivalent: + +- `whitespace` +- `isWhitespace` +- `is-White_Space` +- `iSwHiTeSpaCe` +- `i s w h i t e s p a c e` + +Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. Each component follows the fuzzy matching rule, and additionally may have an alternative alias spelling, as defined by Unicode in [PropertyAliases.txt][unicode-prop-key-aliases] and [PropertyValueAliases.txt][unicode-prop-value-aliases]. + +There are some Unicode properties where the key or value may be inferred. These include: + +- General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. +- Script properties e.g `\p{Greek}` is inferred as `\p{Script_Extensions=Greek}`. +- Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`. +- Block properties that begin with the prefix `in`, e.g `\p{inBasicLatin}` is inferred to be `\p{Block=Basic_Latin}`. + +Other Unicode properties however must specify both a key and value. + +For non-Unicode properties, only a value is required. These include: + +- The special properties `any`, `assigned`, `ascii`. +- The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings. + +Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. #### `\K` -The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not however interfere with captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`. +The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not affect captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`. ### Groups @@ -133,10 +264,10 @@ GroupStart -> '(' GroupKind | '(' GroupKind -> '' | '?' BasicGroupKind | '*' PCRE2GroupKind ':' BasicGroupKind -> ':' | '|' | '>' | '=' | '!' | '*' | '<=' | ' 'atomic' + +PCRE2GroupKind -> 'atomic' | 'pla' | 'positive_lookahead' | 'nla' | 'negative_lookahead' | 'plb' | 'positive_lookbehind' @@ -155,7 +286,7 @@ GroupNameBody -> Identifier | BalancingGroupBody Identifier -> [\w--\d] \w* ``` -Groups define a new scope that contains a recursive regular expression pattern. Groups have different semantics depending on how they are introduced, the details of which are laid out in the following sections. +Groups define a new scope that contains a recursively nested regex. Groups have different semantics depending on how they are introduced. Note there are additional constructs that may syntactically appear similar to groups, but are distinct. See the *group-like atoms* section. @@ -221,129 +352,6 @@ Branch reset groups can alter this numbering, as they reset the numbering in the 1 2 3 4 3 5 ``` -### Matching options - -``` -MatchingOptionSeq -> '^' MatchingOption* - | MatchingOption+ - | MatchingOption* '-' MatchingOption* - -MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' -``` - -A matching option sequence may be used as a group specifier, and denotes a change in matching options for the scope of that group. For example `(?x:a b c)` enables extended syntax for `a b c`. 
A matching option sequence may be part of an "isolated group" which has an implicit scope that wraps the remaining elements of the current group. For example, `(?x)a b c` also enables extended syntax for `a b c`. - -If used in the branch of an alternation, an isolated group affects all the following branches of that alternation. For example, `a(?i)b|c|d` is treated as `a(?i:b)|(?i:c)|(?i:d)`. - -We support all the matching options accepted by PCRE, ICU, and Oniguruma. In addition, we accept some matching options unique to our matching engine. - -#### PCRE options - -- `i`: Case insensitive matching. -- `J`: Allows multiple groups to share the same name, which is otherwise forbidden. -- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string. -- `n`: Disables the capturing behavior of `(...)` groups. Named capture groups must be used instead. -- `s`: Changes `.` to match any character, including newlines. -- `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy. -- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See the *trivia* section for more info. - -#### ICU options - -- `w`: Enables the Unicode interpretation of word boundaries `\b`. - -#### Oniguruma options - -- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]`. -- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]`. -- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b`. -- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`). -- `y{g}`, `y{w}`: Changes the meaning of `\X`, `\y`, `\Y`. These are mutually exclusive options, with `y{g}` specifying extended grapheme cluster mode, and `y{w}` specifying word mode. - -#### Swift options - -These options are specific to the Swift regex matching engine and control the semantic level at which matching takes place. - -- `X`: Grapheme cluster matching -- `u`: Unicode scalar matching -- `b`: Byte matching - - -### Anchors - -``` -Anchor -> '^' | '$' | '\A' | '\b' | '\B' | '\G' | '\y' | '\Y' | '\z' | '\Z' -``` - -Anchors match against a certain position in the input rather than on a particular character of the input. - -- `^`: Matches at the start of a line. -- `$`: Matches at the end of a line. -- `\A`: Matches at the very start of the input string. -- `\Z`: Matches at the very end of the input string, in addition to before a newline at the very end of the input string. -- `\z`: Like `\Z`, but only matches at the very end of the input string. -- `\G`: Like `\A`, but also matches against the start position of where matching resumes in global matching mode (e.g `\Gab` matches twice in `abab`, `\Aab` would only match once). -- `\b` matches a boundary between a word character and a non-word character. The definitions of which vary depending on matching engine. -- `\B` matches a non-word-boundary. -- `\y` matches a text segment boundary, the definition of which varies based on the `y{w}` and `y{g}` matching option. -- `\Y` matches a non-text-segment-boundary. - -### Unicode scalars - -``` -UniScalar -> '\u{' HexDigit{1...} '}' - | '\u' HexDigit{4} - | '\x{' HexDigit{1...} '}' - | '\x' HexDigit{0...2} - | '\U' HexDigit{8} - | '\o{' OctalDigit{1...} '}' - | '\0' OctalDigit{0...3} - -HexDigit -> [0-9a-zA-Z] -OctalDigit -> [0-7] -``` - -These sequences define a unicode scalar value to be matched against. 
There is syntax for both specifying the scalar value in hex notation, as well as octal notation. Note that `\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior. - -### Escape sequences - -``` -EscapeSequence -> '\a' | '\b' | '\c' | '\e' | '\f' | '\n' | '\r' | '\t' -``` - -These escape sequences each denote a specific scalar value. - -- `\a`: The alert (bell) character `U+7`. -- `\b`: The backspace character `U+8`. Note this may only be used in a custom character class, otherwise it represents a word boundary. -- `\c `: A control character sequence, which denotes a scalar from `U+00` - `U+7F` depending on the ASCII character provided. -- `\e`: The escape character `U+1B`. -- `\f`: The form-feed character `U+C`. -- `\n`: The newline character `U+A`. -- `\r`: The carriage return character `U+D`. -- `\t`: The tab character `U+9` - -### Builtin character classes - -``` -BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' | '\s' | '\S' | '\v' | '\V' | '\w' | '\W' | '\X' -``` - -- `.`: Any character excluding newlines. -- `\C`: A single UTF code unit. -- `\d`: Digit character. -- `\D`: Non-digit character. -- `\h`: Horizontal space character. -- `\H`: Non-horizontal-space character. -- `\N`: Non-newline character. -- `\O`: Any character (including newlines). This is syntax from Oniguruma. -- `\R`: Newline sequence. -- `\s`: Whitespace character. -- `\S`: Non-whitespace character. -- `\v`: Vertical space character. -- `\V`: Non-vertical-space character. -- `\w`: Word character. -- `\W`: Non-word character. -- `\X`: Any extended grapheme cluster. - ### Custom character classes ``` @@ -352,7 +360,7 @@ Start -> '[' '^'? Set -> Member+ Member -> CustomCharClass | Quote | Range | Atom Range -> RangeElt `-` RangeElt -RangeElt -> | UniScalar | EscapeSequence +RangeElt -> | UnicodeScalar | EscapeSequence SetOp -> '&&' | '--' | '~~' | '-' ``` @@ -384,83 +392,52 @@ These operators have a lower precedence than the implicit union of members, e.g To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We intend to follow this behavior. -### Character properties -``` -CharacterProperty -> '\' ('p' | 'P') '{' PropertyContents '}' -POSIXCharacterProperty -> '[:' PropertyContents ':]' +### Matching options -PropertyContents -> PropertyName ('=' PropertyName)? -PropertyName -> [\s\w-]+ ``` +MatchingOptionSeq -> '^' MatchingOption* + | MatchingOption+ + | MatchingOption* '-' MatchingOption* -A character property specifies a particular Unicode, POSIX, or PCRE property to match against. We intend on parsing: - -- The full range of Unicode character properties. -- The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties). -- The UTS#18 special properties `any`, `assigned`, `ascii`. -- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`. - -We intend on following [UTS#18][uts18]'s guidance for character properties. This includes the use of fuzzy matching for property name parsing. This is done according to rules set out by [UAX44-LM3]. 
This means that the following property names are considered equivalent: - -- `whitespace` -- `isWhitespace` -- `is-White_Space` -- `iSwHiTeSpaCe` -- `i s w h i t e s p a c e` - -Unicode properties consist of both a key and a value, e.g `General_Category=Whitespace`. Each component follows the fuzzy matching rule, and additionally may have an alternative alias spelling, as defined by Unicode in [PropertyAliases.txt][unicode-prop-key-aliases] and [PropertyValueAliases.txt][unicode-prop-value-aliases]. - -There are some Unicode properties where the key or value may be inferred. These include: - -- General category properties e.g `\p{Whitespace}` is inferred as `\p{General_Category=Whitespace}`. -- Script properties e.g `\p{Greek}` is inferred as `\p{Script_Extensions=Greek}`. -- Boolean properties that are inferred to have a `True` value, e.g `\p{Lowercase}` is inferred as `\p{Lowercase=True}`. -- Block properties that begin with the prefix `in`, e.g `\p{inBasicLatin}` is inferred to be `\p{Block=Basic_Latin}`. - -Other Unicode properties however must specify both a key and value. - -For non-Unicode properties, only a value is required. These include: - -- The special properties `any`, `assigned`, `ascii`. -- The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings. - -Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. - -### Named scalars - -``` -NamedScalar -> '\N{' ScalarName '}' -ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+ +MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' ``` -Allows a specific Unicode scalar to be specified by name or hexadecimal code point. +A matching option sequence may be used as a group specifier, and denotes a change in matching options for the scope of that group. For example `(?x:a b c)` enables extended syntax for `a b c`. A matching option sequence may be part of an "isolated group" which has an implicit scope that wraps the remaining elements of the current group. For example, `(?x)a b c` also enables extended syntax for `a b c`. -### Trivia +If used in the branch of an alternation, an isolated group affects all the following branches of that alternation. For example, `a(?i)b|c|d` is treated as `a(?i:b)|(?i:c)|(?i:d)`. -``` -Trivia -> Comment | Whitespace -Comment -> InlineComment | EndOfLineComment +We support all the matching options accepted by PCRE, ICU, and Oniguruma. In addition, we accept some matching options unique to our matching engine. -InlineComment -> '(?#' (!')')* ')' -EndOfLineComment -> '#' .*$ +#### PCRE options -Whitespace -> \s+ -``` +- `i`: Case insensitive matching. +- `J`: Allows multiple groups to share the same name, which is otherwise forbidden. +- `m`: Enables `^` and `$` to match against the start and end of a line rather than only the start and end of the entire string. +- `n`: Disables the capturing behavior of `(...)` groups. Named capture groups must be used instead. +- `s`: Changes `.` to match any character, including newlines. +- `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy. +- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See the *trivia* section for more info. 
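Since these options can be hard to visualize from the descriptions alone, here is a minimal sketch of how the inline forms behave, using Foundation's ICU-backed `NSRegularExpression` as a reference engine; the `firstMatch(of:in:)` helper is ad hoc to this sketch, not an API proposed here.

```swift
import Foundation

// Ad-hoc helper for this sketch only: first match of `pattern` in `text`,
// using Foundation's ICU-backed NSRegularExpression.
func firstMatch(of pattern: String, in text: String) -> String? {
  guard let regex = try? NSRegularExpression(pattern: pattern) else { return nil }
  let searchRange = NSRange(text.startIndex..., in: text)
  guard let match = regex.firstMatch(in: text, range: searchRange),
        let range = Range(match.range, in: text) else { return nil }
  return String(text[range])
}

// `(?i)` scopes case-insensitive matching over the rest of the pattern.
print(firstMatch(of: "(?i)swift", in: "SWIFT regex") ?? "no match")  // SWIFT

// `(?x)` enables extended syntax, so the spaces below are non-semantic.
print(firstMatch(of: "(?x) a b c", in: "abc") ?? "no match")         // abc

// Eager `+` vs. reluctant `+?` (the `U` option above flips this default).
print(firstMatch(of: "a.+b", in: "a1b2b") ?? "no match")             // a1b2b
print(firstMatch(of: "a.+?b", in: "a1b2b") ?? "no match")            // a1b
```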
-Trivia is consumed by the regular expression parser, but has no semantic meaning. This includes inline PCRE-style comments e.g `(?#comment)`. It also includes non-semantic whitespace and end-of-line comments which may only occur when either of the extended syntax matching options `(?x)`, `(?xx)` are enabled. +#### ICU options -### Quotes +- `w`: Enables the Unicode interpretation of word boundaries `\b`. -``` -Quote -> '\Q' (!'\E' .)* '\E' -``` +#### Oniguruma options -A quoted sequence is delimited by `\Q...\E`, and allows the escaping of metacharacters such that they are interpreted literally. For example, `\Q^[xy]+$\E`, is treated as the literal characters `^[xy]+$` rather than an anchored quantified character class. +- `D`: Enables ASCII-only digit matching for `\d`, `\p{Digit}`, `[:digit:]`. +- `S`: Enables ASCII-only space matching for `\s`, `\p{Space}`, `[:space:]`. +- `W`: Enables ASCII-only word matching for `\w`, `\p{Word}`, `[:word:]`, and `\b`. +- `P`: Enables ASCII-only for all POSIX properties (including `digit`, `space`, and `word`). +- `y{g}`, `y{w}`: Changes the meaning of `\X`, `\y`, `\Y`. These are mutually exclusive options, with `y{g}` specifying extended grapheme cluster mode, and `y{w}` specifying word mode. -The backslash character is also treated as literal within a quoted sequence, and may not be used to escape the closing delimiter, e.g `\Q\\E` is a literal `\`. +#### Swift options -`\E` may appear without a preceding `\Q`, in which case it is a literal `E`. +These options are specific to the Swift regex matching engine and control the semantic level at which matching takes place. + +- `X`: Grapheme cluster matching +- `u`: Unicode scalar matching +- `b`: Byte matching ### References @@ -492,10 +469,10 @@ A backreference evaluates to the value last captured by the referenced capturing #### Subpatterns ``` -Subpattern -> '\g<' NamedOrNumberRef '>' +Subpattern -> '\g<' NamedOrNumberRef '>' | "\g'" NamedOrNumberRef "'" | '(?' GroupLikeSubpatternBody ')' - + GroupLikeSubpatternBody -> 'P>' NamedRef | '&' NamedRef | 'R' @@ -521,7 +498,7 @@ KnownCondition -> 'R' | 'DEFINE' | 'VERSION' VersionCheck | NumberRef - + PCREVersionCheck -> '>'? '=' PCREVersionNumber PCREVersionNumber -> '.' ``` @@ -566,7 +543,7 @@ GlobalMatchingOptionKind -> LimitOptionKind '=' | 'NOTEMPTY_ATSTART' | 'NOTEMPTY' | 'NO_AUTO_POSSESS' | 'NO_DOTSTAR_ANCHOR' | 'NO_JIT' | 'NO_START_OPT' | 'UTF' | 'UCP' - + LimitOptionKind -> 'LIMIT_DEPTH' | 'LIMIT_HEAP' | 'LIMIT_MATCH' NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL' NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE' @@ -601,7 +578,7 @@ PCRECalloutBody -> '' | | '#' '#' | '$' '$' | '{' '}' - + OnigurumaCallout -> OnigurumaNamedCallout | OnigurumaCalloutOfContents OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? OnigurumaCalloutArgs? ')' @@ -683,7 +660,7 @@ In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` match ### `{,n}` -This quantifier is supported by Oniguruma, but in PCRE it matches the literal chars. We intend on supporting it as a quantifier. +This quantifier is supported by Oniguruma, but in PCRE it matches the literal characters `{`, `,`, `n`, and `}` in sequence. We intend on supporting it as a quantifier. ### `\DDD` @@ -729,7 +706,7 @@ PCRE and .NET allow for conditional patterns to reference a group by its name wi where `y` will only be matched if `(?x)` was matched. 
PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. Oniguruma on the other hand will always treat `group1` as an regex condition to match against. -We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma. +We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma. ### `\N` @@ -784,7 +761,7 @@ Many engines have different spellings for the same regex features, we intend to ### Unicode scalars ``` -UniScalar -> '\u{' HexDigit{1...} '}' +UnicodeScalar -> '\u{' HexDigit{1...} '}' | '\u' HexDigit{4} | '\x{' HexDigit{1...} '}' | '\x' HexDigit{0...2} @@ -835,10 +812,10 @@ For absolute numeric references, we plan on choosing the canonical spelling `\DD ### Subpatterns ``` -Subpattern -> '\g<' NamedOrNumberRef '>' +Subpattern -> '\g<' NamedOrNumberRef '>' | "\g'" NamedOrNumberRef "'" | '(?' GroupLikeSubpatternBody ')' - + GroupLikeSubpatternBody -> 'P>' NamedRef | '&' NamedRef | 'R' @@ -889,4 +866,4 @@ PCRE accepts a number of alternative delimiters for callout string arguments. We [unicode-prop-key-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt [unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt [unicode-scripts]: https://www.unicode.org/reports/tr24/#Script -[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions +[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions \ No newline at end of file From 2eb7e4e1563a0dbedc8915dc68e3446dd051d716 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sat, 26 Feb 2022 14:20:51 -0700 Subject: [PATCH 21/51] Cleanup/simplify Executor codepaths (#190) --- Sources/_StringProcessing/Capture.swift | 10 + .../_StringProcessing/Engine/Consume.swift | 8 +- Sources/_StringProcessing/Executor.swift | 76 ++--- .../_StringProcessing/RegexDSL/Match.swift | 13 +- .../MatchingEngineTests.swift | 288 +----------------- Tests/RegexTests/CaptureTests.swift | 12 +- Tests/RegexTests/MatchTests.swift | 10 +- 7 files changed, 49 insertions(+), 368 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 915c4c5d7..ecfc558fe 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -71,6 +71,11 @@ extension StructuredCapture { value: storedCapture?.value, optionalCount: optionalCount) } + + func slice(from input: String) -> Substring? { + guard let r = storedCapture?.range else { return nil } + return input[r] + } } extension Sequence where Element == StructuredCapture { @@ -86,5 +91,10 @@ extension Sequence where Element == StructuredCapture { }) return TypeConstruction.tuple(of: caps) } + + func slices(from input: String) -> [Substring?] 
{ + self.map { $0.slice(from: input) } + } } + diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index cfb803de8..a4a3bf26c 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,16 +25,10 @@ extension Engine { } extension Engine where Input == String { - func consume( - _ input: Input - ) -> (Input.Index, CaptureList)? { - consume(input, in: input.startIndex ..< input.endIndex) - } - func consume( _ input: Input, in range: Range, - matchMode: MatchMode = .partialFromFront + matchMode: MatchMode ) -> (Input.Index, CaptureList)? { if enableTracing { print("Consume: \(input)") diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 5c098c5c5..9de2b0b3d 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -19,69 +19,33 @@ struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - struct Result { - var range: Range - var captures: [StructuredCapture] - var referencedCaptureOffsets: [ReferenceID: Int] - - var destructure: ( - matched: Range, - captures: [StructuredCapture], - referencedCaptureOffsets: [ReferenceID: Int] - ) { - (range, captures, referencedCaptureOffsets) - } - - init( - _ matched: Range, _ captures: [StructuredCapture], - _ referencedCaptureOffsets: [ReferenceID: Int] - ) { - self.range = matched - self.captures = captures - self.referencedCaptureOffsets = referencedCaptureOffsets - } - } - - func execute( - input: String, - in range: Range, - mode: MatchMode = .wholeString - ) -> Result? { + func match( + _ input: String, + in inputRange: Range, + _ mode: MatchMode + ) throws -> RegexMatch? { guard let (endIdx, capList) = engine.consume( - input, in: range, matchMode: mode + input, in: inputRange, matchMode: mode ) else { return nil } let capStruct = engine.program.captureStructure - do { - let range = range.lowerBound.. Result? { - self.execute( - input: input.base, - in: input.startIndex.., - mode: MatchMode = .wholeString - ) -> (Range, CaptureList)? { - engine.consume( - input, in: range, matchMode: mode - ).map { endIndex, capture in - (range.lowerBound.., + _ mode: MatchMode + ) throws -> RegexMatch<(Substring, DynamicCaptures)>? { + try match(input, in: inputRange, mode) } } diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index 2dd31c379..c2593b22a 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -69,16 +69,11 @@ extension RegexProtocol { mode: MatchMode = .wholeString ) -> RegexMatch? 
{ let executor = Executor(program: regex.program.loweredProgram) - guard let (range, captures, captureOffsets) = executor.execute( - input: input, in: inputRange, mode: mode - )?.destructure else { - return nil + do { + return try executor.match(input, in: inputRange, mode) + } catch { + fatalError(String(describing: error)) } - return RegexMatch( - input: input, - range: range, - rawCaptures: captures, - referencedCaptureOffsets: captureOffsets) } } diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index b7c89661d..ccfe85ec7 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -13,289 +13,5 @@ import XCTest @testable import _StringProcessing -/// Hold context and run variety of ad-hoc tests -/// -/// TODO: Use these to demonstrate first-order approximation of what -/// overhead such an engine imposes -fileprivate struct Test: ExpressibleByStringLiteral { - var input: String - var aEater: String - var manyAEater: String - var eatUntilA: String - var eatThroughA: String - - // TODO: Have tests explicitly show each step of type binding, - // input binding, etc. - var enableTracing: Bool? = nil - - /* - - until first A - through first A - until / through last A - etc - - */ - - var file: String - var line: UInt - - init( - _ s: String, - enableTracing: Bool? = nil, - file: String = #file, - line: UInt = #line - ) { - self.input = s - self.aEater = s.first == "A" ? String(s.dropFirst()) : s - self.manyAEater = String(s.drop(while: { $0 == "A" })) - - if let firstIdx = s.firstIndex(of: "A") { - self.eatUntilA = String(s[firstIdx...]) - self.eatThroughA = String(eatUntilA.dropFirst()) - } else { - self.eatUntilA = s - self.eatThroughA = s - } - - self.enableTracing = enableTracing - -// self.untilFirstAEater = String( -// s[(s.firstIndex(where: { $0 == "A" }) ?? s.startIndex)...]) - - - self.file = file - self.line = line - } - init( - stringLiteral: String, - file: String = #file, - line: UInt = #line - ) { - self.init(stringLiteral, file: file, line: line) - } - init(stringLiteral: String) { - // NOTE: Can't get source location of a literal... - self.init(stringLiteral) - } - - var slicedInput: (String, Range) { - let prefix = "aAa prefix ⚠️" - let suffix = "⚠️ aAa suffix" - let outer = prefix + input + suffix - let range = outer.mapOffsets( - (lower: prefix.count, upper: -suffix.count)) - return (outer, range) - } - - func check(_ engine: Engine, expected: String) { - var engine = engine - if let t = enableTracing { - engine.enableTracing = t - } - let output: String - let outputFromSlice: String - - if let (idx, _) = engine.consume(input) { - output = String(input[idx...]) - } else { - output = input - } - - let (outerInput, range) = slicedInput - if let (idx, _) = engine.consume(outerInput, in: range) { - outputFromSlice = String(outerInput[idx..? = nil, - manyAEater: Engine? = nil, - eatUntilA: Engine? = nil, - eatThroughA: Engine? 
= nil - ) { - if let engine = aEater { - check(engine, expected: self.aEater) - } - if let engine = manyAEater { - check(engine, expected: self.manyAEater) - } - if let engine = eatUntilA { - check(engine, expected: self.eatUntilA) - } - if let engine = eatThroughA { - check(engine, expected: self.eatThroughA) - } - } -} - -var doPrint = false -func show(_ s: CustomStringConvertible) { - if doPrint { print(s) } -} - -func makeEngine( - _ constructor: (inout Program.Builder) -> () -) -> Engine { - var builder = Program.Builder() - constructor(&builder) - let program = try! builder.assemble() - let engine = Engine(program) - show(engine) - return engine -} - -// Eat an A off the front -// -// [0] match "A" -// [1] accept -// -let aEater: Engine = { - makeEngine { builder in - builder.buildMatch("A") - builder.buildAccept() - } -}() - -// Eat many "A"s off the input -// -// [0] saveAddress [3] // .accept -// [1] match "A" -// [2] goto [1] // match "A" -// [3] accept -// -// NOTE: a save would restore input position, which we -// actually don't want to do. -// -// NOTE: We should compare with a more sophisticated match -// instruction that can take at least or at most, etc. -// -let manyAEater: Engine = { - makeEngine { builder in - let accTok = builder.makeAddress() - let matchTok = builder.makeAddress() - - builder.buildSaveAddress(accTok) - builder.buildMatch("A") - builder.resolve(matchTok) - builder.buildBranch(to: matchTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat until you find an A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] condBranch #0 [x] // accept -// [2] advance(1) -// [3] goto 0 -// [4] accept -// -// NOTE: This check-consume-else-branch pattern -// could be pretty common and might be worth a dedicated -// instruction. 
-let eatUntilA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildCondBranch(reg, to: accTok) - builder.buildAdvance(1) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat through the first A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] advance(1) -// [2] condBranch #0 [x] // accept -// [3] goto 0 -// [4] accept -let eatThroughA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildAdvance(1) - builder.buildCondBranch(reg, to: accTok) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - - - -class MatchingEngineTests: XCTestCase { - - func testAEaters() { - let tests: Array = [ - Test("abc"), - Test("Abc"), - Test("AAbc"), - Test(""), - Test("A"), - Test("b"), - Test("bbbA"), - Test("bbAbA"), - ] - - for test in tests { - test.check(aEater: aEater) - test.check(manyAEater: manyAEater) - test.check(eatUntilA: eatUntilA) - test.check(eatThroughA: eatThroughA) - } - } - - func testThreeLetterRepeat() { - // Check for a repeated 3-letter sequence, such as in - // `(...)\1` - // - // [0] movePosition(into: %low) - // [1] advance(3) - // [2] movePosition(into: %high) - // [3] matchSlice(%low, %high) - // [4] accept - let threeLetterRepeat: Engine = { - makeEngine { builder in - let low = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildAdvance(3) - let high = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildMatchSlice(lower: low, upper: high) - builder.buildAccept() - } - }() - - let tests: Array<(String, Bool)> = [ - ("abcabc", true), - ("abcabc_____", true), - ("dddddd_____", true), - ("🥳🧟‍♀️c🥳🧟‍♀️c", true), - ("abccba", false), - ("abcabb", false), - ("abcbac", false), - ("🥳🧟‍♀️c🥳🧟‍♂️c", false), - ] - - for (test, expect) in tests { - let match = threeLetterRepeat.consume(test) != nil - XCTAssertEqual(expect, match) - } - } -} +// TODO: Unit tests for the engine itself. Functional testing +// is handled by regex tests. diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 9f3cc313b..cc3568c1d 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -142,13 +142,15 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex.. searcher algorithm var start = input.startIndex while true { - if let (range, caps) = self.executeFlat( - input: input, + if let result = try! self.dynamicMatch( + input, in: start.. 
Date: Sun, 27 Feb 2022 07:39:15 -0700 Subject: [PATCH 22/51] Quick bug fix / workaround for whole-match values (#191) --- Sources/_StringProcessing/Capture.swift | 2 - .../_StringProcessing/Engine/Consume.swift | 42 ++++--------------- Sources/_StringProcessing/Executor.swift | 28 +++++++++++-- .../_StringProcessing/RegexDSL/Match.swift | 10 +++++ Tests/RegexTests/CustomTests.swift | 17 ++++---- 5 files changed, 51 insertions(+), 48 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index ecfc558fe..5b43da870 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -96,5 +96,3 @@ extension Sequence where Element == StructuredCapture { self.map { $0.slice(from: input) } } } - - diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index a4a3bf26c..4e00a34b4 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -24,41 +24,17 @@ extension Engine { } } -extension Engine where Input == String { - func consume( - _ input: Input, - in range: Range, - matchMode: MatchMode - ) -> (Input.Index, CaptureList)? { - if enableTracing { - print("Consume: \(input)") - } - - var cpu = makeProcessor(input: input, bounds: range, matchMode: matchMode) - let result: Input.Index? = { - while true { - switch cpu.state { - case .accept: - return cpu.currentPosition - case .fail: - return nil - case .inProgress: cpu.cycle() - } - } - }() - - if enableTracing { - if let idx = result { - print("Result: \(input[.. Input.Index? { + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + return nil + case .inProgress: self.cycle() } } - guard let result = result else { return nil } - - let capList = cpu.storedCaptures - return (result, CaptureList( - values: capList, referencedCaptureOffsets: program.referencedCaptureOffsets)) } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 9de2b0b3d..c044cbf24 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -24,21 +24,41 @@ struct Executor { in inputRange: Range, _ mode: MatchMode ) throws -> RegexMatch? { - guard let (endIdx, capList) = engine.consume( - input, in: inputRange, matchMode: mode - ) else { + var cpu = engine.makeProcessor( + input: input, bounds: inputRange, matchMode: mode) + + guard let endIdx = cpu.consume() else { return nil } + + let capList = CaptureList( + values: cpu.storedCaptures, + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + let capStruct = engine.program.captureStructure let range = inputRange.lowerBound.. { let rawCaptures: [StructuredCapture] let referencedCaptureOffsets: [ReferenceID: Int] + let value: Any? + public var match: Match { if Match.self == (Substring, DynamicCaptures).self { // FIXME(rdar://89449323): Compiler assertion @@ -25,7 +27,15 @@ public struct RegexMatch { } else if Match.self == Substring.self { // FIXME: Plumb whole match (`.0`) through the matching engine. return input[range] as! Match + } else if rawCaptures.isEmpty, value != nil { + // FIXME: This is a workaround for whole-match values not + // being modeled as part of captures. We might want to + // switch to a model where results are alongside captures + return value! as! 
Match } else { + guard value == nil else { + fatalError("FIXME: what would this mean?") + } let typeErasedMatch = rawCaptures.existentialMatch(from: input[range]) return typeErasedMatch as! Match } diff --git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 692ad6c37..0ebfe4652 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -83,15 +83,14 @@ extension RegexTests { ("55z", .match, nil), ("55z", .firstMatch, "55")) - // FIXME: Requires we return a value instead of a range -// customTest( -// Regex { -// Numbler() -// }, -// ("ab123c", .firstMatch, 1), -// ("abc", .firstMatch, nil), -// ("55z", .match, nil), -// ("55z", .firstMatch, 5)) + customTest( + Regex { + Numbler() + }, + ("ab123c", .firstMatch, 1), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, 5)) // TODO: Convert below tests to better infra. Right now // it's hard because `Match` is constrained to be From ccc2a7b8eaeec21420733fa4158544f76c5f5aa4 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 27 Feb 2022 18:05:11 -0700 Subject: [PATCH 23/51] Provide alternatives considered --- Documentation/Evolution/RegexSyntax.md | 54 ++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index fcb67c819..16450a037 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -8,15 +8,24 @@ Hello, we want to issue an update to [Regular Expression Literals](https://forum ## Introduction -Regex literals declare a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Formalizing regex literals in Swift requires choosing a delimiter strategy (e.g. `#/.../#` or `re'...'`), detailing the syntax accepted in between the delimiters ("interior syntax"), and specifying actual types and any relevant protocols for the literal itself. +Regex literals declare a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Formalizing regex literals in Swift requires: + +- Choosing a delimiter (e.g. `#/.../#` or `re'...'`) +- Detailing the "interior syntax" accepted in between delimiters +- Specifying actual types and relevant protocols for the literal. + +We present a detailed and comprehensive treatment of regex literal interior syntax. The syntax we're proposing is large enough for its own dedicated discussion ahead of a full regex literal proposal. + +This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. -This proposal-component focuses on the interior syntax, which is large enough for its own targeted discussion ahead of the full proposal. Regex literal interior syntax will be part of Swift's source-compatibility story (and to some extent binary compatibility), so we present a detailed and comprehensive design. ## Motivation -Swift aims to be a pragmatic programming language, balancing (**TODO(Michael)**: prose). Rather than pursue a novel interior syntax, (**TODO(Michael)**: prose). +Swift aims to be a pragmatic programming language, striking a balance between familiarity, interoperability, and advancing the art. 
Swift's `String` presents a uniquely Unicode-forward model of string, but currently suffers from limited processing facilities. -Regex interior syntax is part of a larger [proposal](https://forums.swift.org/t/pitch-regular-expression-literals/52820), which in turn is part of a larger [string processing effort](https://forums.swift.org/t/declarative-string-processing-overview/52459). +The full string processing effort includes a literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, strong types, and a slew of regex-powered algorithms over strings. + +This proposal specifically hones in on the _familiarity_ aspect by providing a best-in-class treatment of familiar regex syntax. ## Proposed Solution @@ -33,6 +42,7 @@ We also support [UTS#18][uts18]'s full set of character class operators (to our Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below +Regex literal interior syntax will be part of Swift's source-compatibility story as well as its binary-compatibility story. Thus, we present a detailed and comprehensive design. ## Detailed Design @@ -610,6 +620,7 @@ An absent function is an Oniguruma feature that allows for the easy inversion of - `(?~|absent)`: Absent stopper, which limits any subsequent matching to not include `absent`. - `(?~|)`: Absent clearer, which undoes the effects of the absent stopper. + ## Syntactic differences between engines The proposed "syntactic superset" introduces some minor ambiguities, as each engine supports a slightly different set of features. When a particular engine's parser sees a feature it doesn't support, it typically has a fall-back behavior, such as treating the unknown feature as literal contents. @@ -724,7 +735,7 @@ As such we feel that the more desirable default behavior of shorthand script pro ### Extended syntax modes -Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this is enabled through the `(?x)` and `(?xx)` matching options, where the former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. +Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this is enabled through the `(?x)`, and in later versions, `(?xx)` matching options. The former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore intend to follow this behavior, with `(?x)` and `(?xx)` being treated the same. @@ -754,9 +765,14 @@ The `(z)` group gets numbered before the named groups get numbered. We intend on matching the PCRE behavior where groups are numbered purely based on order. -## Canonical representations -Many engines have different spellings for the same regex features, we intend to support parsing. However, for the purposes of e.g printing, we need to decide on a canonical syntax for various constructs. +## Swift canonical syntax + +The proposed syntactic superset means there will be multiple ways to write the same thing. 
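For illustration of just how many equivalent spellings the proposed superset admits, each of the following denotes the single scalar U+0061 (`a`):

```
\u{61}    \u0061    \x61    \x{61}    \U00000061    \o{141}    \N{LATIN SMALL LETTER A}
```
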
Below we discuss what Swift's preferred spelling could be, a "Swift canonical syntax". + +We are not formally proposing this as a distinct syntax or concept, rather it is useful for considering compiler features such as fixits, pretty-printing, and refactoring actions. + +*TODO Hamish*: We're not proposing any actual action or language-level representation. So I feel like this section is more of an advisory section and good for discussion. It guides tooling decisions more than it is a formal addition to the Swift programming language. Rather than say, e.g., "we intend on canonicalizing to `\u{...}`", we could say "we consider `\u{...}` to be Swift's preferred spelling, in line with string literals". I think we can be a bit briefer too, perhaps collapsing multiple sub-sections together. ### Unicode scalars @@ -857,6 +873,30 @@ PCRECalloutBody -> '' | PCRE accepts a number of alternative delimiters for callout string arguments. We intend to canonicalize to `(?C"...")`. **TODO: May want to alter if we choose `r"..."`, though lexing should be able to handle it by looking for the `(?C` prefix**. +## Alternatives Considered + +### Skip the literals + +The top alternative is to just skip regex literals and only ship the result builder DSL. However, doing so would miss out on the familiarity benefits of existing regex syntax. + +We consider our proposed direction to be more compelling, especially when coupled with refactoring actions to convert literals into regex DSLs. + +### Introduce a novel regex syntax + +Another alternative is to invent a new syntax for regex. This would similarly lose out on the familiarity benefit, though a few simple adjustments could aid readability. + +We are prototyping an "experimental" Swift extended syntax, which is future work and outside the scope of this proposal. Every syntactic extension, while individually compelling, does introduce incompatibilities and can lead to an "uncanny valley" effect. Further investigation is needed and such support can be built on top of what is presented here. + +### Support a minimal syntactic subset + +Regex literal interior syntax will become part of Swift's source and binary-compatibility story, so a reasonable alternative is to support the absolute minimal syntactic subset available. However, we would need to ensure that such a minimal approach is extensible far into the future. Because syntax decisions can impact each other, we would want to consider the ramifications of this full syntactic superset ahead of time anyways. + +Even though it is more work up-front, and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex literals. + +Note that this proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's run-time in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. 
+ + + [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html [oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE [icu-syntax]: https://unicode-org.github.io/icu/userguide/strings/regexp.html From cc314a246132780466bacde4a9839626db4fc2c1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 28 Feb 2022 13:54:43 +0000 Subject: [PATCH 24/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 16450a037..f5a645fe7 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -10,8 +10,8 @@ Hello, we want to issue an update to [Regular Expression Literals](https://forum Regex literals declare a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Formalizing regex literals in Swift requires: -- Choosing a delimiter (e.g. `#/.../#` or `re'...'`) -- Detailing the "interior syntax" accepted in between delimiters +- Choosing a delimiter (e.g. `#/.../#` or `re'...'`). +- Detailing the "interior syntax" accepted in between delimiters. - Specifying actual types and relevant protocols for the literal. We present a detailed and comprehensive treatment of regex literal interior syntax. The syntax we're proposing is large enough for its own dedicated discussion ahead of a full regex literal proposal. @@ -48,18 +48,32 @@ Regex literal interior syntax will be part of Swift's source-compatibility story We propose the following syntax for use inside Swift regex literals. -*TODO:* Disclosure triangle explaining the grammar conventions? +
Grammar Conventions + +Elements of the grammar are defined using the syntax `Element -> `. + +Quoted characters e.g `'abc'`, `"abc"` in the grammar match against the literal characters. Unquoted names e.g `Concatenation` refer to other definitions in the grammar. + +The `|` operator is used to specify that the grammar can match against either branch of the operator, similar to a regular expression. Similarly, `*`, `+`, and `?` are used to quantify an element of the grammar, with the same meaning as in regular expressions. Range quantifiers `{...}` may also be used, though we adopt a more explicit syntax that uses the Swift `..<` & `...` operators, e.g `{1...4}`. + +Basic custom character classes may appear in the grammar, and have the same meaning as in a regular expression. For example `[0-9a-zA-Z]` expresses the digits `0` to `9` and the letters `a` to `z` (both upper and lowercase). + +The `!` prefix operator is used to specify that the following grammar element must not appear at that position. + +Grammar elements may be surrounded by parentheses for the purposes of quantification. + +
### Top-level regular expression ``` -Regex -> GlobalMatchingOptionSequence? RegexNode -RegexNode -> '' | Alternation -Alternation -> Concatenation ('|' Concatenation)* -Concatenation -> (!'|' !')' ConcatComponent)* +Regex -> GlobalMatchingOptionSequence? RegexNode +RegexNode -> '' | Alternation +Alternation -> Concatenation ('|' Concatenation)* +Concatenation -> (!'|' !')' ConcatComponent)* ``` -A regex literal may be prefixed with a sequence of global matching options(*TODO*: intra-doc link). A literal's contents can be empty or a sequence of alternatives separated by `|`. +A regex literal may be prefixed with a sequence of [global matching options](#pcre-global-matching-options). A literal's contents can be empty or a sequence of alternatives separated by `|`. Alternatives are a series of expressions concatenated together. The concatentation ends with either a `|` denoting the end of the alternative or a `)` denoting the end of a recursively parsed group. @@ -192,7 +206,7 @@ BuiltinCharClass -> '.' | '\C' | '\d' | '\D' | '\h' | '\H' | '\N' | '\O' | '\R' - `\W`: Non-word character. - `\X`: Any extended grapheme cluster. -Precise definitions of character classes is discussed in (Character Classes for String Processing)[https://forums.swift.org/t/pitch-character-classes-for-string-processing/52920]. +Precise definitions of character classes is discussed in [Character Classes for String Processing](https://forums.swift.org/t/pitch-character-classes-for-string-processing/52920). #### Unicode scalars @@ -234,7 +248,7 @@ A character property specifies a particular Unicode, POSIX, or PCRE property to - The POSIX properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit` (note that `alpha`, `lower`, `upper`, `space`, `punct`, `digit`, and `cntrl` are covered by Unicode properties). - The UTS#18 special properties `any`, `assigned`, `ascii`. - The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`. -- The special Java property `javaLowerCase` +- The special Java properties `javaLowerCase`, `javaUpperCase`, `javaWhitespace`, `javaMirrored`. We follow [UTS#18][uts18]'s guidance for character properties, including fuzzy matching for property name parsing, according to rules set out by [UAX44-LM3]. The following property names are equivalent: @@ -257,12 +271,14 @@ Other Unicode properties however must specify both a key and value. For non-Unicode properties, only a value is required. These include: -- The special properties `any`, `assigned`, `ascii`. +- The UTS#18 special properties `any`, `assigned`, `ascii`. - The POSIX compatibility properties `alnum`, `blank`, `graph`, `print`, `word`, `xdigit`. The remaining POSIX properties are already covered by boolean Unicode property spellings. +- The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`. +- The special Java properties `javaLowerCase`, `javaUpperCase`, `javaWhitespace`, `javaMirrored`. Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. -#### `\K` +### `\K` The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not affect captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`. @@ -298,13 +314,13 @@ Identifier -> [\w--\d] \w* Groups define a new scope that contains a recursively nested regex. Groups have different semantics depending on how they are introduced. 
-Note there are additional constructs that may syntactically appear similar to groups, but are distinct. See the *group-like atoms* section. +Note there are additional constructs that may syntactically appear similar to groups, such as backreferences and conditionals, but are distinct. #### Basic group kinds - `()`: A capturing group. - `(?:)`: A non-capturing group. -- `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See *group numbering*. +- `(?|)`: A group that, for a direct child alternation, resets the numbering of groups at each branch of that alternation. See [Group Numbering](#group-numbering). Capturing groups produce captures, which remember the range of input matched for the scope of that group. @@ -427,7 +443,7 @@ We support all the matching options accepted by PCRE, ICU, and Oniguruma. In add - `n`: Disables the capturing behavior of `(...)` groups. Named capture groups must be used instead. - `s`: Changes `.` to match any character, including newlines. - `U`: Changes quantifiers to be reluctant by default, with the `?` specifier changing to mean greedy. -- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See the *trivia* section for more info. +- `x`, `xx`: Enables extended syntax mode, which allows non-semantic whitespace and end-of-line comments. See [Extended Syntax Modes](#extended-syntax-modes) for more info. #### ICU options @@ -513,7 +529,7 @@ PCREVersionCheck -> '>'? '=' PCREVersionNumber PCREVersionNumber -> '.' ``` -A conditional evaluates a particular condition, and chooses a branch to match against accordingly. 1 or 2 branches may be specified. If 1 branch is specified e.g `(?(...)x)`, it is treated as the true branch. Note this includes an empty true branch, e.g `(?(...))` which is the null pattern as described in *top-level regular expression*. If 2 branches are specified, e.g `(?(...)x|y)`, the first is treated as the true branch, the second being the false branch. +A conditional evaluates a particular condition, and chooses a branch to match against accordingly. 1 or 2 branches may be specified. If 1 branch is specified e.g `(?(...)x)`, it is treated as the true branch. Note this includes an empty true branch, e.g `(?(...))` which is the null pattern as described in the [Top-Level Regular Expression](#top-level-regular-expression) section. If 2 branches are specified, e.g `(?(...)x|y)`, the first is treated as the true branch, the second being the false branch. A condition may be: @@ -787,12 +803,17 @@ UnicodeScalar -> '\u{' HexDigit{1...} '}' HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] + +NamedScalar -> '\N{' ScalarName '}' +ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+ ``` -For consistency with String escape syntax, we intend on canonicalizing to `\u{...}`. +There are multiple equivalent ways of spelling the same the Unicode scalar value, in either hex, octal, or by spelling the name explicitly. String literals already provide a `\u{...}` syntax that allow a hex sequence for a Unicode scalar. As this is Swift's existing preferred spelling for such a sequence, we consider it to be the preferred spelling in this case too. There may however be value in preserving scalars that are explicitly spelled by name with `\N{...}` for clarity. ### Character properties +Character properties `\p{...}` have a variety of alternative spellings due to fuzzy matching, Unicode aliases, and shorthand syntax for common Unicode properties. 
They also may be written using POSIX syntax e.g `[:gc=Whitespace:]`. + **TODO: Should we canonicalize on e.g `\p{Script_Extensions=Greek}`? Or prefer the shorthand where we can? Or just avoid canonicalizing?** ### Groups @@ -906,4 +927,4 @@ Note that this proposal regards _syntactic_ support, and does not necessarily me [unicode-prop-key-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt [unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt [unicode-scripts]: https://www.unicode.org/reports/tr24/#Script -[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions \ No newline at end of file +[unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions From 11bb57d95dfd869ca2a8c734580a298c69f6eee4 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 28 Feb 2022 14:42:38 +0000 Subject: [PATCH 25/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index f5a645fe7..9a0c62729 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -814,11 +814,11 @@ There are multiple equivalent ways of spelling the same the Unicode scalar value Character properties `\p{...}` have a variety of alternative spellings due to fuzzy matching, Unicode aliases, and shorthand syntax for common Unicode properties. They also may be written using POSIX syntax e.g `[:gc=Whitespace:]`. -**TODO: Should we canonicalize on e.g `\p{Script_Extensions=Greek}`? Or prefer the shorthand where we can? Or just avoid canonicalizing?** +**TODO: Should we suggest canonicalizing on e.g `\p{Script_Extensions=Greek}`? Or prefer the shorthand where we can? Or just avoid canonicalizing?** ### Groups -#### Named +Named groups may be specified with a few different delimiters: ``` NamedGroup -> 'P<' GroupNameBody '>' @@ -826,11 +826,11 @@ NamedGroup -> 'P<' GroupNameBody '>' | "'" GroupNameBody "'" ``` -We intend on canonicalizing to the `(?<...>)` spelling. +The preferable spelling here will likely be influenced by the regex literal delimiter choice. `(?'...')` seems a reasonable preferred spelling in isolation, however not so much if `re'...'` is chosen as the delimiter. To reduce possible confusion for the parser as well as the user, `(?<...>)` would seem the more preferable syntax in that case. This would also likely affect the preferred syntax for references. #### Lookaheads and lookbehinds -We intend on canonicalizing to the short-form versions of these group kinds, e.g `(?=`. +These have both shorthand spellings as well as more explicit PCRE2 spellings. While the more explicit spellings are definitely clearer, they can feel quite verbose. The short-form spellings e.g `(?=` seem more preferable due to their familiarity. ### Backreferences @@ -844,7 +844,9 @@ Backreference -> '\g{' NamedOrNumberRef '}' | '(?P=' NamedRef ')' ``` -For absolute numeric references, we plan on choosing the canonical spelling `\DDD`, as it is unambiguous with octal sequences. For relative numbered references, as well as named references, we intend on canonicalizing to `\k<...>` to match the group name canonicalization `(?<...>)`. **TODO: How valuable is it to have canonical `\DDD`? 
Would it be better to just use `\k<...>` for everything?** +For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, `\k<...>` or `\k'...'` seem like the ideal choice (depending on the syntax chosen for named groups). This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreference and subpattern respectively. It additionally avoids confusion with group syntax. + +There may be value in choosing `\k` as the single unified syntax for backreferences (instead of `\DDD` for absolute numeric references), though there may be value in preserving the familiarity of `\DDD`. ### Subpatterns @@ -859,7 +861,7 @@ GroupLikeSubpatternBody -> 'P>' NamedRef | NumberRef ``` -We intend on canonicalizing to the `\g<...>` spelling. **TODO: For `(?R)` too?** +To avoid confusion with groups, `\g<...>` or `\g'...'` seem like the ideal preferred spellings (depending on the syntax chosen for named groups). There may however be value in preserving the `(?R)` spelling where it is used, instead of preferring e.g `\g<0>`. ### Conditional references @@ -874,7 +876,7 @@ KnownCondition -> 'R' | NumberRef ``` -For named references in a group condition, there is a choice between `(?('name'))` and `(?())`. We intend on canonicalizing to `(?())` to match the group name canonicalization. +For named references in a group condition, there is a choice between `(?('name'))` and `(?())`. The preferred syntax in this case would likely reflect the syntax chosen for named groups. ### PCRE Callouts @@ -891,8 +893,7 @@ PCRECalloutBody -> '' | | '{' '}' ``` -PCRE accepts a number of alternative delimiters for callout string arguments. We intend to canonicalize to `(?C"...")`. **TODO: May want to alter if we choose `r"..."`, though lexing should be able to handle it by looking for the `(?C` prefix**. - +PCRE accepts a number of alternative delimiters for callout string arguments. The `(?C"...")` syntax seems preferable due to its consistency with string literal syntax. However it may be necessary to prefer `(?C'...')` depending on whether the regex literal delimiter ends up involving double quotes e.g `re"..."`. ## Alternatives Considered From f64e6aebfdf31a7f8a723de3f0354265f64bd61c Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 28 Feb 2022 11:59:12 -0700 Subject: [PATCH 26/51] Update RegexSyntax.md - grammar convention --- Documentation/Evolution/RegexSyntax.md | 44 +++++++++++++++----------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 9a0c62729..6d64a3e9f 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -38,7 +38,7 @@ We propose accepting a syntactic "superset" of the following existing regular ex To our knowledge, all other popular regex engines support a subset of the above syntaxes. -We also support [UTS#18][uts18]'s full set of character class operators (to our knowledge no other engine does). Beyond that, UTS#18 deals with semantics rather than syntax, and what syntax it uses is covered by the above list. We also parse `\p{javaLowerCase}`, meaning we support a superset of Java 8 as well. +We also support [UTS#18][uts18]'s full set of character class operators (to our knowledge no other engine does). Beyond that, UTS#18 deals with semantics rather than syntax, and what syntax it uses is covered by the above list. 
We also parse Java's properties (e.g. `\p{javaLowerCase}`), meaning we support a superset of Java 8 as well. Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below @@ -48,19 +48,25 @@ Regex literal interior syntax will be part of Swift's source-compatibility story We propose the following syntax for use inside Swift regex literals. -
Grammar Conventions +
Grammar Notation -Elements of the grammar are defined using the syntax `Element -> `. +For the grammar sections, we use a modified PEG-like notation, in which the grammar also describes an unambiguous top-down parsing algorithm. -Quoted characters e.g `'abc'`, `"abc"` in the grammar match against the literal characters. Unquoted names e.g `Concatenation` refer to other definitions in the grammar. +- ` -> ` gives the definition of `Element` +- The `|` operator specifies a choice of alternatives +- `'x'` is the literal character `x`, otherwise it's a reference to x + + A literal `'` is spelled `"'"` +- Postfix `*` `+` and `?` denote zero-or-more, one-or-more, and zero-or-one +- Range quantifiers, like `{1...4}`, use Swift range syntax as convention. +- Basic custom character classes are written like `[0-9a-zA-Z]` +- Prefix `!` operator means the next element must not appear (a zero-width assertion) +- Parenthesis group for the purposes of quantification +- Builtins use angle brackets: + - `` refers to an integer, `` a character, etc. + - `` is any whitespace character + - `` is the end-of-line anchor (e.g. `$` in regex). -The `|` operator is used to specify that the grammar can match against either branch of the operator, similar to a regular expression. Similarly, `*`, `+`, and `?` are used to quantify an element of the grammar, with the same meaning as in regular expressions. Range quantifiers `{...}` may also be used, though we adopt a more explicit syntax that uses the Swift `..<` & `...` operators, e.g `{1...4}`. - -Basic custom character classes may appear in the grammar, and have the same meaning as in a regular expression. For example `[0-9a-zA-Z]` expresses the digits `0` to `9` and the letters `a` to `z` (both upper and lowercase). - -The `!` prefix operator is used to specify that the following grammar element must not appear at that position. - -Grammar elements may be surrounded by parentheses for the purposes of quantification. +For example, `(!'|' !')' ConcatComponent)*` means any number (zero or more) occurrences of `ConcatComponent` so long as the initial character is neither a literal `|` nor a literal `)`.
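As a second worked reading of the notation, the production

```
Alternation -> Concatenation ('|' Concatenation)*
```

describes one `Concatenation` followed by zero or more occurrences of a literal `|` and a further `Concatenation`.
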
@@ -87,8 +93,8 @@ ConcatComponent -> Trivia | Quote | Quantification Trivia -> Comment | NonSemanticWhitespace Comment -> '(?#' (!')')* ')' | EndOfLineComment -(extended syntax only) EndOfLineComment -> '#' .*$ -(extended syntax only) NonSemanticWhitespace -> \s+ +(extended syntax only) EndOfLineComment -> '#' (! .)* +(extended syntax only) NonSemanticWhitespace -> + Quote -> '\Q' (!'\E' .)* '\E' @@ -212,12 +218,12 @@ Precise definitions of character classes is discussed in [Character Classes for ``` UnicodeScalar -> '\u{' HexDigit{1...} '}' - | '\u' HexDigit{4} - | '\x{' HexDigit{1...} '}' - | '\x' HexDigit{0...2} - | '\U' HexDigit{8} - | '\o{' OctalDigit{1...} '}' - | '\0' OctalDigit{0...3} + | '\u' HexDigit{4} + | '\x{' HexDigit{1...} '}' + | '\x' HexDigit{0...2} + | '\U' HexDigit{8} + | '\o{' OctalDigit{1...} '}' + | '\0' OctalDigit{0...3} HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] From 3ab3b179188bc9d59e4c125fd132f2ddc530fc4a Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Thu, 17 Feb 2022 15:20:47 -0800 Subject: [PATCH 27/51] Document specially integrated modules and integration process. --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/README.md b/README.md index e6f94377c..941231b24 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,65 @@ See [Declarative String Processing Overview][decl-string] ## Requirements - [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-02-03 or later. + +## Integration with Swift + +`_MatchingEngine`, `_CUnicode` and `_StringProcessing` are specially integrated modules that are built as part of apple/swift. + +Specifically, `_MatchingEngine` contains the parser for regular expression literals and is built both as part of the compiler and as a core library. `_CUnicode` and `_StringProcessing` are built together as a core library named `_StringProcessing`. + +| Module | Swift toolchain component | +| ------------------- | ------------------------------------------------------------------------------------ | +| `_MatchingEngine` | `SwiftCompilerSources/Sources/ExperimentalRegex` and `stdlib/public/_MatchingEngine` | +| `_CUnicode` | `stdlib/public/_StringProcessing` | +| `_StringProcessing` | `stdlib/public/_StringProcessing` | + +### Branching scheme + +#### Development branch + +The `main` branch is the branch for day-to-day development. Generally, you should create PRs against this branch. + +#### Swift integration branches + +Branches whose name starts with `swift/` are Swift integration branches similar to those in [apple/llvm-project](https://github.com/apple/llvm-project). For each branch, dropping the `swift/` prefix is the corresponding branch in [apple/swift](https://github.com/apple/swift). + +| apple/swift branch | apple/swift-experimental-string-processing branch | +| ------------------- | ----------------------------------------------------- | +| main | swift/main | +| release/5.7 | swift/release/5.7 | +| ... | swift/... | + +A pair of corresponding branches are expected to build successfully together and pass all tests. + +### Integration workflow + +To integrate the latest changes in apple/swift-experimental-string-processing to apple/swift, carefully follow the workflow: + +- Create pull requests. + - Create a pull request in apple/swift-experimental-string-processing from `main` to `swift/main`, e.g. "[Integration] main -> swift/main". 
+ - If apple/swift needs to be modified to work with the latest `main` in apple/swift-experimental-string-processing, create a pull request in apple/swift. +- Trigger CI. + - In the apple/swift-experimental-string-processing pull request, trigger CI using the following command (replacing `` with the apple/swift pull request number, if any): + ``` + apple/swift# # use this line only if there is an corresponding apple/swift PR + @swift-ci please test + ``` + - In the apple/swift pull request (if any), trigger CI using the following command (replacing `` with the apple/swift-experimental-string-processing pull request number): + ``` + apple/swift-experimental-string-processing# + @swift-ci please test + ``` +- Merge when approved. + - Merge the pull request in apple/swift-experimental-string-processing as a **merge commit**. + - Merge the pull request in apple/swift (if any). + +### Development notes + +Compiler integration can be tricky. Use special caution when developing `_MatchingEngine`, `_CUnicode` and `_StringProcessing` modules. + +- Do not change the names of these modules without due approval from compiler and infrastructure teams. +- Do not modify the existing ABI (e.g. C API, serialization format) between the regular expression parser and the Swift compiler unless absolutely necessary. +- Always minimize the number of lockstep integrations, i.e. when apple/swift-experimental-string-processing and apple/swift have to change together. Whenever possible, introduce new API first, migrate Swift compiler onto it, and then deprecate old API. Use versioning if helpful. +- In `_StringProcessing`, do not write fully qualified references to symbols in `_CUnicode`, and always wrap `import _CUnicode` in a `#if canImport(_CUnicode)`. This is because `_CUnicode` is built as part of `_StringProcessing` with CMake. +- In `_MatchingEngine`, do not write fully qualified references to `_MatchingEngine` itself. This is because `_MatchingEngine` is built as `ExperimentalRegex` in `SwiftCompilerSources/` with CMake. From a4439677925d062c47a3dd7c3979171d6b03e504 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 28 Feb 2022 21:25:42 +0000 Subject: [PATCH 28/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 6d64a3e9f..eeeb5f0e5 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -284,7 +284,7 @@ For non-Unicode properties, only a value is required. These include: Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. -### `\K` +#### `\K` The `\K` escape sequence is used to drop any previously matched characters from the final matching result. It does not affect captures, e.g `a(b)\Kc` when matching against `abc` will return a match of `c`, but with a capture of `b`. @@ -320,7 +320,7 @@ Identifier -> [\w--\d] \w* Groups define a new scope that contains a recursively nested regex. Groups have different semantics depending on how they are introduced. -Note there are additional constructs that may syntactically appear similar to groups, such as backreferences and conditionals, but are distinct. +Note there are additional constructs that may syntactically appear similar to groups, such as backreferences and PCRE backtracking directives, but are distinct. 
#### Basic group kinds @@ -619,12 +619,12 @@ OnigurumaCalloutArgList -> OnigurumaCalloutArg (',' OnigurumaCalloutArgList)* OnigurumaCalloutArg -> [^,}]+ OnigurumaTag -> '[' Identifier ']' -OnigurumaCalloutOfContents -> '(?' '{'+ Contents '}'+ OnigurumaTag? Direction? ')' -OnigurumaCalloutContents -> +OnigurumaCalloutOfContents -> '(?' '{' OnigurumaCalloutContents '}' OnigurumaTag? Direction? ')' +OnigurumaCalloutContents -> | '{' OnigurumaCalloutContents '}' OnigurumaCalloutDirection -> 'X' | '<' | '>' ``` -A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We supported parsing both the PCRE and Oniguruma callout syntax. The PCRE syntax accepts a string or numeric argument that is passed to the function. The Oniguruma syntax is more involved, and may accept a tag, argument list, or even an arbitrary program in the 'callout of contents' syntax. +A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We supported parsing both the PCRE and Oniguruma callout syntax. The PCRE syntax accepts a string or numeric argument that is passed to the function. The Oniguruma syntax is more involved, and may accept an identifier with an optional tag and argument list. It may also accept an arbitrary program in the 'callout of contents' syntax. This is an expanded version of Perl's interpolation syntax, and allows an arbitrary nesting of delimiters in addition to an optional tag and direction. ### Absent functions @@ -757,7 +757,7 @@ As such we feel that the more desirable default behavior of shorthand script pro ### Extended syntax modes -Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In PCRE, this is enabled through the `(?x)`, and in later versions, `(?xx)` matching options. The former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. +Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In both PCRE and Perl, this is enabled through the `(?x)`, and in later versions, `(?xx)` matching options. The former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore intend to follow this behavior, with `(?x)` and `(?xx)` being treated the same. 
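To make the extended syntax behaviour described above concrete: once `(?x)` is in effect, the following pattern matches the same inputs as `abc`, since the spaces are non-semantic and the trailing `#` comment is ignored:

```
(?x) a b c   # whitespace and this comment are not part of the match
```
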
From 9572f99c0b8922ba0046dd0a094eff59ae6508cc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 1 Mar 2022 12:09:50 +0000 Subject: [PATCH 29/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 28 +++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index eeeb5f0e5..064b2a1a1 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -598,7 +598,7 @@ This is syntax specific to PCRE, and allows a set of global options to appear at ### Callouts ``` -Callout -> PCRECallout | OnigurumaCallout +Callout -> PCRECallout | NamedCallout | InterpolatedCallout PCRECallout -> '(?C' CalloutBody ')' PCRECalloutBody -> '' | @@ -611,20 +611,24 @@ PCRECalloutBody -> '' | | '$' '$' | '{' '}' -OnigurumaCallout -> OnigurumaNamedCallout | OnigurumaCalloutOfContents +NamedCallout -> '(*' Identifier CalloutTag? CalloutArgs? ')' +CalloutArgs -> '{' CalloutArgList '}' +CalloutArgList -> CalloutArg (',' CalloutArgList)* +CalloutArg -> [^,}]+ +CalloutTag -> '[' Identifier ']' -OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? OnigurumaCalloutArgs? ')' -OnigurumaCalloutArgs -> '{' OnigurumaCalloutArgList '}' -OnigurumaCalloutArgList -> OnigurumaCalloutArg (',' OnigurumaCalloutArgList)* -OnigurumaCalloutArg -> [^,}]+ -OnigurumaTag -> '[' Identifier ']' - -OnigurumaCalloutOfContents -> '(?' '{' OnigurumaCalloutContents '}' OnigurumaTag? Direction? ')' -OnigurumaCalloutContents -> | '{' OnigurumaCalloutContents '}' -OnigurumaCalloutDirection -> 'X' | '<' | '>' +InterpolatedCallout -> '(?' '{' Interpolation '}' CalloutTag? CalloutDirection? ')' +Interpolation -> | '{' Interpolation '}' +CalloutDirection -> 'X' | '<' | '>' ``` -A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We supported parsing both the PCRE and Oniguruma callout syntax. The PCRE syntax accepts a string or numeric argument that is passed to the function. The Oniguruma syntax is more involved, and may accept an identifier with an optional tag and argument list. It may also accept an arbitrary program in the 'callout of contents' syntax. This is an expanded version of Perl's interpolation syntax, and allows an arbitrary nesting of delimiters in addition to an optional tag and direction. +A callout is a feature that allows a user-supplied function to be called when matching reaches that point in the pattern. We supported parsing 3 types of callout: + +- PCRE callout syntax, which accepts a string or numeric argument that is passed to the function. +- Oniguruma named callout syntax, which accepts an identifier with an optional tag and argument list. +- Interpolated callout syntax, which is equivalent to Oniguruma's "callout of contents". This callout accepts an arbitrary interpolated program. This is an expanded version of Perl's interpolation syntax, and allows an arbitrary nesting of delimiters in addition to an optional tag and direction. + +While we propose parsing these for the purposes of issuing helpful diagnostics, we are deferring full support for the interpolated syntax for the future. 
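Purely as syntactic illustrations of the three callout forms described above (only parsing is proposed at this stage), a pattern might contain:

```
(?C1)   (?C"progress")            PCRE callouts with a numeric or string argument
(*foo)  (*foo[tag]{arg1,arg2})    named callouts with an optional tag and arguments
(?{ contents }[tag]>)             an interpolated callout ("callout of contents")
```
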
### Absent functions From 4d6244eb830615d5e44c5239b962f00c7a0ed1c8 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 1 Mar 2022 07:46:21 -0700 Subject: [PATCH 30/51] RegexSyntax.md: Shorten canonical syntax --- Documentation/Evolution/RegexSyntax.md | 111 +++---------------------- 1 file changed, 10 insertions(+), 101 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 064b2a1a1..9c45cb239 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -396,7 +396,7 @@ RangeElt -> | UnicodeScalar | EscapeSequence SetOp -> '&&' | '--' | '~~' | '-' ``` -Custom characters classes introduce their own language, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only a few atoms are considered valid: +Custom characters classes introduce their own sublanguage, in which most regular expression metacharacters become literal. The basic element in a custom character class is an `Atom`, though only some atoms are considered valid: - Builtin character classes, except for `.`, `\R`, `\O`, `\X`, `\C`, and `\N`. - Escape sequences, including `\b` which becomes the backspace character (rather than a word boundary). @@ -796,114 +796,23 @@ We intend on matching the PCRE behavior where groups are numbered purely based o The proposed syntactic superset means there will be multiple ways to write the same thing. Below we discuss what Swift's preferred spelling could be, a "Swift canonical syntax". -We are not formally proposing this as a distinct syntax or concept, rather it is useful for considering compiler features such as fixits, pretty-printing, and refactoring actions. +We are not formally proposing this as a distinct syntax or concept, rather it is useful for considering compiler features such as fixits, pretty-printing, and refactoring actions. We're hoping for further discussion with the community here. Useful criteria include how well the choice fits in with the rest of Swift, whether there's an existing common practice, and whether one choice is less confusing in the context of others. -*TODO Hamish*: We're not proposing any actual action or language-level representation. So I feel like this section is more of an advisory section and good for discussion. It guides tooling decisions more than it is a formal addition to the Swift programming language. Rather than say, e.g., "we intend on canonicalizing to `\u{...}`", we could say "we consider `\u{...}` to be Swift's preferred spelling, in line with string literals". I think we can be a bit briefer too, perhaps collapsing multiple sub-sections together. +Unicode scalar literals can be spelled in many ways (*TODO*: intra-doc link). We propose treating Swift's string literal syntax of `\u{HexDigit{1...}}` as the preferred spelling. -### Unicode scalars +Character properties can be spelled `\p{...}` or `[:...:]`. We recommend preferring `\p{...}` as the bracket syntax historically meant POSIX-defined character classes, and still has that connotation in some engines. The spelling of properties themselves can be fuzzy (*TODO*: intra doc link) and we (weakly) recommend the shortest spelling (no opinion on casing yet). For script extensions, we (weakly) recommend e.g. `\p{Greek}` instead of `\p{Script_Extensions=Greek}`. We would like more discussion with the community here. 
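As a concrete illustration of the fuzzy matching and shorthands involved, each line below lists equivalent spellings of a single property, shortest first:

```
\p{Lu}           \p{Uppercase_Letter}          \p{gc=Lu}
\p{Whitespace}   \p{White_Space}               [:Whitespace:]
\p{Greek}        \p{Script_Extensions=Greek}
```
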
-``` -UnicodeScalar -> '\u{' HexDigit{1...} '}' - | '\u' HexDigit{4} - | '\x{' HexDigit{1...} '}' - | '\x' HexDigit{0...2} - | '\U' HexDigit{8} - | '\o{' OctalDigit{1...} '}' - | '\0' OctalDigit{0...3} - -HexDigit -> [0-9a-zA-Z] -OctalDigit -> [0-7] - -NamedScalar -> '\N{' ScalarName '}' -ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+ -``` - -There are multiple equivalent ways of spelling the same the Unicode scalar value, in either hex, octal, or by spelling the name explicitly. String literals already provide a `\u{...}` syntax that allow a hex sequence for a Unicode scalar. As this is Swift's existing preferred spelling for such a sequence, we consider it to be the preferred spelling in this case too. There may however be value in preserving scalars that are explicitly spelled by name with `\N{...}` for clarity. - -### Character properties - -Character properties `\p{...}` have a variety of alternative spellings due to fuzzy matching, Unicode aliases, and shorthand syntax for common Unicode properties. They also may be written using POSIX syntax e.g `[:gc=Whitespace:]`. - -**TODO: Should we suggest canonicalizing on e.g `\p{Script_Extensions=Greek}`? Or prefer the shorthand where we can? Or just avoid canonicalizing?** - -### Groups - -Named groups may be specified with a few different delimiters: - -``` -NamedGroup -> 'P<' GroupNameBody '>' - | '<' GroupNameBody '>' - | "'" GroupNameBody "'" -``` - -The preferable spelling here will likely be influenced by the regex literal delimiter choice. `(?'...')` seems a reasonable preferred spelling in isolation, however not so much if `re'...'` is chosen as the delimiter. To reduce possible confusion for the parser as well as the user, `(?<...>)` would seem the more preferable syntax in that case. This would also likely affect the preferred syntax for references. - -#### Lookaheads and lookbehinds - -These have both shorthand spellings as well as more explicit PCRE2 spellings. While the more explicit spellings are definitely clearer, they can feel quite verbose. The short-form spellings e.g `(?=` seem more preferable due to their familiarity. - -### Backreferences - -``` -Backreference -> '\g{' NamedOrNumberRef '}' - | '\g' NumberRef - | '\k<' NamedOrNumberRef '>' - | "\k'" NamedOrNumberRef "'" - | '\k{' NamedRef '}' - | '\' [1-9] [0-9]+ - | '(?P=' NamedRef ')' -``` - -For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, `\k<...>` or `\k'...'` seem like the ideal choice (depending on the syntax chosen for named groups). This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreference and subpattern respectively. It additionally avoids confusion with group syntax. +Lookaround assertions have common shorthand spellings, while PCRE2 introduced longer more explicit spellings (*TODO*: doc link). We are (very weakly) recommending the common short-hand syntax of e.g. `(?=...)` as that's wider spread. We are interested in more discussion with the community here. -There may be value in choosing `\k` as the single unified syntax for backreferences (instead of `\DDD` for absolute numeric references), though there may be value in preserving the familiarity of `\DDD`. +Named groups may be specified with a few different delimiters: `(?...)`, `(?P...)`, `(?'name'...)`. We (weakly) recommend `(?...)`, but the final preference may be influenced by choice of delimiter for the regex itself. 
We'd appreciate any insight from the community. -### Subpatterns +References and backreferences (*TODO*: intra-doc link) have multiple spellings. For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, either `\k<...>` or `\k'...'` seem like the better choice, depending on the syntax chosen for named groups. This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreferences and subpatterns respectively, as well as any confusion with group syntax. -``` -Subpattern -> '\g<' NamedOrNumberRef '>' - | "\g'" NamedOrNumberRef "'" - | '(?' GroupLikeSubpatternBody ')' - -GroupLikeSubpatternBody -> 'P>' NamedRef - | '&' NamedRef - | 'R' - | NumberRef -``` +For subpatterns, we recommend either `\g<...>` or `\g'...'` depending on the choice for named group syntax. We're unsure if we should prefer `(?R)` as a spelling for e.g. `\g<0>` or not, as it is more widely used and understood, but less consistent with other subpatterns. -To avoid confusion with groups, `\g<...>` or `\g'...'` seem like the ideal preferred spellings (depending on the syntax chosen for named groups). There may however be value in preserving the `(?R)` spelling where it is used, instead of preferring e.g `\g<0>`. - -### Conditional references - -``` -KnownCondition -> 'R' - | 'R' NumberRef - | 'R&' NamedRef - | '<' NamedOrNumberRef '>' - | "'" NamedOrNumberRef "'" - | 'DEFINE' - | 'VERSION' VersionCheck - | NumberRef -``` - -For named references in a group condition, there is a choice between `(?('name'))` and `(?())`. The preferred syntax in this case would likely reflect the syntax chosen for named groups. - -### PCRE Callouts - -``` -PCRECallout -> '(?C' CalloutBody ')' -PCRECalloutBody -> '' | - | '`' '`' - | "'" "'" - | '"' '"' - | '^' '^' - | '%' '%' - | '#' '#' - | '$' '$' - | '{' '}' -``` +Conditional references (*TODO*: intra-doc link) have a choice between `(?('name'))` and `(?())`. The preferred syntax in this case would likely reflect the syntax chosen for named groups. -PCRE accepts a number of alternative delimiters for callout string arguments. The `(?C"...")` syntax seems preferable due to its consistency with string literal syntax. However it may be necessary to prefer `(?C'...')` depending on whether the regex literal delimiter ends up involving double quotes e.g `re"..."`. +We are deferring runtime support for callouts from regex literals as future work, though we will correctly parse their contents. We have no current recommendation for a preference of PCRE-style callout syntax (*TODO*: intra-doc link), and would like to discuss with the community whether we should have one. ## Alternatives Considered From bafe039659ea625ea53c03912e1bf7770d688cd1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 1 Mar 2022 15:12:31 +0000 Subject: [PATCH 31/51] Intra-doc links --- Documentation/Evolution/RegexSyntax.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 9c45cb239..76cc22d08 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -798,21 +798,21 @@ The proposed syntactic superset means there will be multiple ways to write the s We are not formally proposing this as a distinct syntax or concept, rather it is useful for considering compiler features such as fixits, pretty-printing, and refactoring actions. 
We're hoping for further discussion with the community here. Useful criteria include how well the choice fits in with the rest of Swift, whether there's an existing common practice, and whether one choice is less confusing in the context of others. -Unicode scalar literals can be spelled in many ways (*TODO*: intra-doc link). We propose treating Swift's string literal syntax of `\u{HexDigit{1...}}` as the preferred spelling. +[Unicode scalar literals](#unicode-scalars) can be spelled in many ways. We propose treating Swift's string literal syntax of `\u{HexDigit{1...}}` as the preferred spelling. -Character properties can be spelled `\p{...}` or `[:...:]`. We recommend preferring `\p{...}` as the bracket syntax historically meant POSIX-defined character classes, and still has that connotation in some engines. The spelling of properties themselves can be fuzzy (*TODO*: intra doc link) and we (weakly) recommend the shortest spelling (no opinion on casing yet). For script extensions, we (weakly) recommend e.g. `\p{Greek}` instead of `\p{Script_Extensions=Greek}`. We would like more discussion with the community here. +Character properties can be spelled `\p{...}` or `[:...:]`. We recommend preferring `\p{...}` as the bracket syntax historically meant POSIX-defined character classes, and still has that connotation in some engines. The [spelling of properties themselves can be fuzzy](#character-properties) and we (weakly) recommend the shortest spelling (no opinion on casing yet). For script extensions, we (weakly) recommend e.g. `\p{Greek}` instead of `\p{Script_Extensions=Greek}`. We would like more discussion with the community here. -Lookaround assertions have common shorthand spellings, while PCRE2 introduced longer more explicit spellings (*TODO*: doc link). We are (very weakly) recommending the common short-hand syntax of e.g. `(?=...)` as that's wider spread. We are interested in more discussion with the community here. +[Lookaround assertions](#lookahead-and-lookbehind) have common shorthand spellings, while PCRE2 introduced longer more explicit spellings. We are (very weakly) recommending the common short-hand syntax of e.g. `(?=...)` as that's wider spread. We are interested in more discussion with the community here. Named groups may be specified with a few different delimiters: `(?...)`, `(?P...)`, `(?'name'...)`. We (weakly) recommend `(?...)`, but the final preference may be influenced by choice of delimiter for the regex itself. We'd appreciate any insight from the community. -References and backreferences (*TODO*: intra-doc link) have multiple spellings. For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, either `\k<...>` or `\k'...'` seem like the better choice, depending on the syntax chosen for named groups. This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreferences and subpatterns respectively, as well as any confusion with group syntax. +[Backreferences](#backreferences) have multiple spellings. For absolute numeric references, `\DDD` seems to be a strong candidate for the preferred syntax due to its familiarity. For relative numbered references, as well as named references, either `\k<...>` or `\k'...'` seem like the better choice, depending on the syntax chosen for named groups. 
This avoids the confusion between `\g{...}` and `\g<...>` referring to a backreferences and subpatterns respectively, as well as any confusion with group syntax. -For subpatterns, we recommend either `\g<...>` or `\g'...'` depending on the choice for named group syntax. We're unsure if we should prefer `(?R)` as a spelling for e.g. `\g<0>` or not, as it is more widely used and understood, but less consistent with other subpatterns. +For [subpatterns](#subpatterns), we recommend either `\g<...>` or `\g'...'` depending on the choice for named group syntax. We're unsure if we should prefer `(?R)` as a spelling for e.g. `\g<0>` or not, as it is more widely used and understood, but less consistent with other subpatterns. -Conditional references (*TODO*: intra-doc link) have a choice between `(?('name'))` and `(?())`. The preferred syntax in this case would likely reflect the syntax chosen for named groups. +[Conditional references](#conditionals) have a choice between `(?('name'))` and `(?())`. The preferred syntax in this case would likely reflect the syntax chosen for named groups. -We are deferring runtime support for callouts from regex literals as future work, though we will correctly parse their contents. We have no current recommendation for a preference of PCRE-style callout syntax (*TODO*: intra-doc link), and would like to discuss with the community whether we should have one. +We are deferring runtime support for callouts from regex literals as future work, though we will correctly parse their contents. We have no current recommendation for a preference of PCRE-style [callout syntax](#callouts), and would like to discuss with the community whether we should have one. ## Alternatives Considered From a206f993a305e439050cb4f66dce7657ade0f9df Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 1 Mar 2022 20:21:15 +0000 Subject: [PATCH 32/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 37 +++++++++++++------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 76cc22d08..693b0f3a4 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -83,7 +83,7 @@ A regex literal may be prefixed with a sequence of [global matching options](#pc Alternatives are a series of expressions concatenated together. The concatentation ends with either a `|` denoting the end of the alternative or a `)` denoting the end of a recursively parsed group. -Alternation has a lower precedence than concatenation or other operations, so e.g `abc|def` matches against `abc` or `def`.. +Alternation has a lower precedence than concatenation or other operations, so e.g `abc|def` matches against `abc` or `def`. ### Concatenated subexpressions @@ -130,9 +130,9 @@ Subexpressions can be quantified, meaning they will be repeated some number of t Behavior can further be refined by a subsequent `?` or `+`: -- `x*` _eager_: consume as much of input as possible -- `x*?` _reluctant_: consume as little of the input as possible -- `x*+`: _possessive_: eager and never relinquishes any input consumed +- `x*` _eager_: consume as much of input as possible. +- `x*?` _reluctant_: consume as little of the input as possible. +- `x*+`: _possessive_: eager and never relinquishes any input consumed. ### Atoms @@ -187,7 +187,7 @@ These escape sequences each denote a specific scalar value. - `\f`: The form-feed character `U+C`. - `\n`: The newline character `U+A`. 
- `\r`: The carriage return character `U+D`. -- `\t`: The tab character `U+9` +- `\t`: The tab character `U+9`. #### Builtin character classes @@ -218,12 +218,12 @@ Precise definitions of character classes is discussed in [Character Classes for ``` UnicodeScalar -> '\u{' HexDigit{1...} '}' - | '\u' HexDigit{4} - | '\x{' HexDigit{1...} '}' - | '\x' HexDigit{0...2} - | '\U' HexDigit{8} - | '\o{' OctalDigit{1...} '}' - | '\0' OctalDigit{0...3} + | '\u' HexDigit{4} + | '\x{' HexDigit{1...} '}' + | '\x' HexDigit{0...2} + | '\U' HexDigit{8} + | '\o{' OctalDigit{1...} '}' + | '\0' OctalDigit{0...3} HexDigit -> [0-9a-zA-Z] OctalDigit -> [0-7] @@ -232,7 +232,7 @@ NamedScalar -> '\N{' ScalarName '}' ScalarName -> 'U+' HexDigit{1...8} | [\s\w-]+ ``` -These sequences define a unicode scalar value using hexadecimal or octal notation +These sequences define a unicode scalar value using hexadecimal or octal notation. `\x`, when not followed by any hexadecimal digit characters, is treated as `\0`, matching PCRE's behavior. @@ -362,7 +362,7 @@ A script run e.g `(*script_run:...)` specifies that the contents must match agai BalancingGroupBody -> Identifier? '-' Identifier ``` -Introduced by .NET, balancing groups extend the `GroupNameBody` syntax to support the ability to refer to a prior group. Upon matching, the prior group is deleted, and any intermediate matched input becomes the capture of the current group. +Introduced by .NET, [balancing groups][balancing-groups] extend the `GroupNameBody` syntax to support the ability to refer to a prior group. Upon matching, the prior group is deleted, and any intermediate matched input becomes the capture of the current group. #### Group numbering @@ -467,9 +467,9 @@ We support all the matching options accepted by PCRE, ICU, and Oniguruma. In add These options are specific to the Swift regex matching engine and control the semantic level at which matching takes place. -- `X`: Grapheme cluster matching -- `u`: Unicode scalar matching -- `b`: Byte matching +- `X`: Grapheme cluster matching. +- `u`: Unicode scalar matching. +- `b`: Byte matching. ### References @@ -482,7 +482,7 @@ RecursionLevel -> '+' | '-' A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. -A backreference may optionally include a recursion level in certain cases, which is a syntactic element inherited from Oniguruma that allows the reference to specify a capture relative to a given recursion level. +A backreference may optionally include a recursion level in certain cases, which is a syntactic element inherited [from Oniguruma][oniguruma-syntax] that allows the reference to specify a capture relative to a given recursion level. #### Backreferences @@ -639,7 +639,7 @@ AbsentFunction -> '(?~' RegexNode ')' | '(?~|)' ``` -An absent function is an Oniguruma feature that allows for the easy inversion of a given pattern. There are 4 variants of the syntax: +An absent function is an [Oniguruma][oniguruma-syntax] feature that allows for the easy inversion of a given pattern. 
There are 4 variants of the syntax: - `(?~|absent|expr)`: Absent expression, which attempts to match against `expr`, but is limited by the range that is not matched by `absent`. - `(?~absent)`: Absent repeater, which matches against any input not matched by `absent`. Equivalent to `(?~|absent|\O*)`. @@ -848,3 +848,4 @@ Note that this proposal regards _syntactic_ support, and does not necessarily me [unicode-prop-value-aliases]: https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt [unicode-scripts]: https://www.unicode.org/reports/tr24/#Script [unicode-script-extensions]: https://www.unicode.org/reports/tr24/#Script_Extensions +[balancing-groups]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/grouping-constructs-in-regular-expressions#balancing-group-definitions From cebf4a6a20fde052287358c6706c27084aa4d27e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 33/51] Fix crash on lone backslash --- Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift | 4 +++- Sources/_MatchingEngine/Regex/Parse/Source.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index dd785f12d..cfab75312 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -1472,7 +1472,9 @@ extension Source { return ref } - let char = src.eat() + guard let char = src.tryEat() else { + throw ParseError.expectedEscape + } // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 11bd8152f..ddf0475f3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -86,6 +86,12 @@ extension Source { tryEat(anyOf: set) } + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? 
{ + guard !isEmpty else { return nil } + return eat() + } + mutating func eat(asserting c: Char) { assert(peek() == c) advance() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..23a3b910f 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1753,6 +1753,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // MARK: Bad escapes + + diagnosticTest("\\", .expectedEscape) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From c48fb1cbdeeb37ba55ff449a1046bbefe48e7d24 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 34/51] Separate out DelimiterLexing.swift --- .../Regex/Parse/DelimiterLexing.swift | 153 ++++++++++++++++++ .../_MatchingEngine/Regex/Parse/Mocking.swift | 144 ----------------- 2 files changed, 153 insertions(+), 144 deletions(-) create mode 100644 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift new file mode 100644 index 000000000..70532f9e7 --- /dev/null +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// TODO: mock up multi-line soon + +enum Delimiter: Hashable, CaseIterable { + case traditional + case experimental + case reSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .traditional: return ("#/", "/#") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// The default set of syntax options that the delimiter indicates. + var defaultSyntaxOptions: SyntaxOptions { + switch self { + case .traditional, .reSingleQuote: + return .traditional + case .experimental: + return .experimental + } + } +} + +struct LexError: Error, CustomStringConvertible { + enum Kind: Hashable { + case endOfString + case invalidUTF8 // TODO: better range reporting + case unknownDelimiter + } + + var kind: Kind + + /// The pointer at which to resume lexing. + var resumePtr: UnsafeRawPointer + + init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { + self.kind = kind + self.resumePtr = resumePtr + } + + var description: String { + switch kind { + case .endOfString: return "unterminated regex literal" + case .invalidUTF8: return "invalid UTF-8 found in source file" + case .unknownDelimiter: return "unknown regex literal delimiter" + } + } +} + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + precondition(start <= end) + var current = start + + func ascii(_ s: Unicode.Scalar) -> UInt8 { + assert(s.value <= 0x7F) + return UInt8(asserting: s.value) + } + func load(offset: Int) -> UInt8? { + guard current + offset < end else { return nil } + return current.load(fromByteOffset: offset, as: UInt8.self) + } + func load() -> UInt8? { load(offset: 0) } + func advance(_ n: Int = 1) { + precondition(current + n <= end, "Cannot advance past end") + current = current.advanced(by: n) + } + + func tryEat(_ utf8: String.UTF8View) -> Bool { + for (i, idx) in utf8.indices.enumerated() { + guard load(offset: i) == utf8[idx] else { return false } + } + advance(utf8.count) + return true + } + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw LexError(.unknownDelimiter, resumeAt: current.successor()) + } + + let contentsStart = current + while true { + switch load() { + case nil, ascii("\n"), ascii("\r"): + throw LexError(.endOfString, resumeAt: current) + + case ascii("\\"): + // Skip next byte. + advance(2) + + default: + // Try to lex the closing delimiter. + let contentsEnd = current + guard tryEat(delimiter.closing.utf8) else { + advance() + continue + } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw LexError(.invalidUTF8, resumeAt: current) + } + return (contents: s, delimiter, end: current) + } + } +} + +/// Drop a set of regex delimiters from the input string, returning the contents +/// and the delimiter used. The input string must have valid delimiters. +func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { + let utf8 = str.utf8 + func stripDelimiter(_ delim: Delimiter) -> String? { + let prefix = delim.opening.utf8 + let suffix = delim.closing.utf8 + guard utf8.prefix(prefix.count).elementsEqual(prefix), + utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } + + return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + } + for d in Delimiter.allCases { + if let contents = stripDelimiter(d) { + return (contents, d) + } + } + fatalError("No valid delimiters") +} diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index e3a178a15..dfba4757e 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -9,150 +9,6 @@ // //===----------------------------------------------------------------------===// - -// TODO: mock up multi-line soon - -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - } - } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } - - /// The default set of syntax options that the delimiter indicates. 
- var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: - return .traditional - case .experimental: - return .experimental - } - } -} - -struct LexError: Error, CustomStringConvertible { - enum Kind: Hashable { - case endOfString - case invalidUTF8 // TODO: better range reporting - case unknownDelimiter - } - - var kind: Kind - - /// The pointer at which to resume lexing. - var resumePtr: UnsafeRawPointer - - init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { - self.kind = kind - self.resumePtr = resumePtr - } - - var description: String { - switch kind { - case .endOfString: return "unterminated regex literal" - case .invalidUTF8: return "invalid UTF-8 found in source file" - case .unknownDelimiter: return "unknown regex literal delimiter" - } - } -} - -/// Drop a set of regex delimiters from the input string, returning the contents -/// and the delimiter used. The input string must have valid delimiters. -func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 - func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) - } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { - return (contents, d) - } - } - fatalError("No valid delimiters") -} - -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start - - func ascii(_ s: Unicode.Scalar) -> UInt8 { - assert(s.value <= 0x7F) - return UInt8(asserting: s.value) - } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) - } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) - } - - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) - return true - } - - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) - } - - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) - - case ascii("\\"): - // Skip next byte. - advance(2) - - default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } - - // Form a string from the contents and make sure it's valid UTF-8. 
- let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) - - guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) - } - return (contents: s, delimiter, end: current) - } - } -} - private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { From 0cbb9af76935b22a32c5ce5a8d02b1e6ad285700 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 35/51] Rename LexError -> DelimiterLexError To avoid confusion with more general regex lexical analysis. --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_MatchingEngine/Regex/Parse/Mocking.swift | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 70532f9e7..c023a069c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -37,7 +37,7 @@ enum Delimiter: Hashable, CaseIterable { } } -struct LexError: Error, CustomStringConvertible { +struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case endOfString case invalidUTF8 // TODO: better range reporting @@ -97,14 +97,14 @@ func lexRegex( guard let delimiter = Delimiter.allCases.first( where: { tryEat($0.opening.utf8) } ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) + throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) } let contentsStart = current while true { switch load() { case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) + throw DelimiterLexError(.endOfString, resumeAt: current) case ascii("\\"): // Skip next byte. @@ -125,7 +125,7 @@ func lexRegex( let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) + throw DelimiterLexError(.invalidUTF8, resumeAt: current) } return (contents: s, delimiter, end: current) } diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index dfba4757e..b535edf1b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -52,7 +52,7 @@ func libswiftLexRegexLiteral( let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) return false - } catch let error as LexError { + } catch let error as DelimiterLexError { if error.kind == .unknownDelimiter { // An unknown delimiter should be recovered from, as we may want to try // lex something else. @@ -66,7 +66,7 @@ func libswiftLexRegexLiteral( // closing delimiters, which would help with code completion. return true } catch { - fatalError("Should be a LexError") + fatalError("Should be a DelimiterLexError") } } From 7e820821f296436b504bb0d9cd782a46eda11f04 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 36/51] Refactor delimiter lexing logic Introduce a DelimiterLexer type to perform the lexing. 
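As a rough sketch of how the new type is expected to be driven (mirroring the test helpers elsewhere in this series; the wrapper below is illustrative only and not part of this change):

~~~swift
// Illustrative only: drive lexRegex over a Swift String the same way the
// ParseTests helpers do. The wrapper name `lexLiteral` is hypothetical.
func lexLiteral(_ input: String) -> (contents: String, delimiter: Delimiter)? {
  input.withCString(encodedAs: UTF8.self) {
    (ptr: UnsafePointer<UInt8>) -> (contents: String, delimiter: Delimiter)? in
    let endPtr = ptr + input.utf8.count
    guard let (contents, delim, _) = try? lexRegex(start: ptr, end: endPtr) else {
      return nil
    }
    return (contents, delim)
  }
}
// lexLiteral("re'[a-z]+'") would yield ("[a-z]+", .reSingleQuote).
~~~
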
--- .../Regex/Parse/DelimiterLexing.swift | 167 +++++++++++++----- 1 file changed, 121 insertions(+), 46 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c023a069c..e49a442e7 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible { } } -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start +fileprivate struct DelimiterLexer { + let start: UnsafeRawPointer + var cursor: UnsafeRawPointer + let end: UnsafeRawPointer + + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + precondition(start <= end) + self.start = start + self.cursor = start + self.end = end + } func ascii(_ s: Unicode.Scalar) -> UInt8 { assert(s.value <= 0x7F) return UInt8(asserting: s.value) } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) + + /// Return the byte at the current cursor, or `nil` if the end of the buffer + /// has been reached. + func load() -> UInt8? { + guard cursor < end else { return nil } + return cursor.load(as: UInt8.self) } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) + + /// Return the slice of `count` bytes from a specified cursor position, or + /// `nil` if there are fewer than `count` bytes until the end of the buffer. + func slice( + at cursor: UnsafeRawPointer, _ count: Int + ) -> UnsafeRawBufferPointer? { + guard cursor + count <= end else { return nil } + return UnsafeRawBufferPointer(start: cursor, count: count) } - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) + /// Return the slice of `count` bytes from the current cursor, or `nil` if + /// there are fewer than `count` bytes until the end of the buffer. + func slice(_ count: Int) -> UnsafeRawBufferPointer? { + slice(at: cursor, count) + } + + /// Advance the cursor `n` bytes. + mutating func advanceCursor(_ n: Int = 1) { + cursor += n + precondition(cursor <= end, "Cannot advance past end") + } + + /// Check to see if a UTF-8 sequence can be eaten from the current cursor. + func canEat(_ utf8: String.UTF8View) -> Bool { + guard let slice = slice(utf8.count) else { return false } + return slice.elementsEqual(utf8) + } + + /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. + mutating func tryEat(_ utf8: String.UTF8View) -> Bool { + guard canEat(utf8) else { return false } + advanceCursor(utf8.count) return true } - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) + /// Attempt to eat a particular closing delimiter, returning the contents of + /// the literal, and ending pointer, or `nil` if this is not a delimiter + /// ending. 
+ mutating func tryEatEnding( + _ delimiter: Delimiter, contentsStart: UnsafeRawPointer + ) throws -> (contents: String, end: UnsafeRawPointer)? { + let contentsEnd = cursor + guard tryEat(delimiter.closing.utf8) else { return nil } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) + } + return (contents: s, end: cursor) } - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw DelimiterLexError(.endOfString, resumeAt: current) + /// Attempt to advance the lexer, throwing an error if the end of a line or + /// the end of the buffer is reached. + mutating func advance(escaped: Bool = false) throws { + guard let next = load() else { + throw DelimiterLexError(.endOfString, resumeAt: cursor) + } + switch UnicodeScalar(next) { + case let next where !next.isASCII: + // Just advance into a UTF-8 sequence. It shouldn't matter that we'll + // iterate through each byte as we only match against ASCII, and we + // validate it at the end. This case is separated out so we can just deal + // with the ASCII cases below. + advanceCursor() + + case "\n", "\r": + throw DelimiterLexError(.endOfString, resumeAt: cursor) + + case "\0": + // TODO: Warn to match the behavior of String literal lexer? Or should + // we error as unprintable? + advanceCursor() + + case "\\" where !escaped: + // Advance again for an escape sequence. + advanceCursor() + try advance(escaped: true) - case ascii("\\"): - // Skip next byte. - advance(2) default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } + advanceCursor() + } + } - // Form a string from the contents and make sure it's valid UTF-8. - let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + /*consuming*/ mutating func lex( + ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) + } - guard s.utf8.elementsEqual(contents) else { - throw DelimiterLexError(.invalidUTF8, resumeAt: current) + let contentsStart = cursor + while true { + // Try to lex the closing delimiter. + if let (contents, end) = try tryEatEnding(delimiter, + contentsStart: contentsStart) { + return (contents, delimiter, end) } - return (contents: s, delimiter, end: current) + // Try to advance the lexer. + try advance() } } } @@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { } fatalError("No valid delimiters") } + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + var lexer = DelimiterLexer(start: start, end: end) + return try lexer.lex() +} From 8b3e2ef4bfd748159171332d3f8dc94d7bbb8ce0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 37/51] Diagnose unprintable ASCII characters This matches the behavior of the C++ lexer for string literals. --- .../Regex/Parse/DelimiterLexing.swift | 7 +++ .../Utility/MissingUnicode.swift | 8 +++ Tests/RegexTests/ParseTests.swift | 63 ++++++++++++++++--- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index e49a442e7..4b4618318 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -42,6 +42,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case endOfString case invalidUTF8 // TODO: better range reporting case unknownDelimiter + case unprintableASCII } var kind: Kind @@ -59,6 +60,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .endOfString: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" + case .unprintableASCII: return "unprintable ASCII character found in source file" } } } @@ -169,6 +171,11 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) + case let next where !next.isPrintableASCII: + // Diagnose unprintable ASCII. + // TODO: Ideally we would recover and continue to lex until the ending + // delimiter. + throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) default: advanceCursor() diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift index a6aae0b82..dccba3286 100644 --- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift +++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift @@ -661,3 +661,11 @@ extension Character { public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } } + +extension UnicodeScalar { + public var isPrintableASCII: Bool { + // Exclude non-printables before the space character U+20, and anything + // including and above the DEL character U+7F. + value >= 0x20 && value < 0x7F + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 23a3b910f..b0b2e5309 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,20 +107,26 @@ func parseTest( serializedCaptures.deallocate() } -func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, - file: StaticString = #file, line: UInt = #line +func delimiterLexingTest( + _ input: String, file: StaticString = #file, line: UInt = #line ) { - // First try lexing. - input.withCString { ptr in - let (contents, delim, end) = try! lexRegex(start: ptr, - end: ptr + input.count) - XCTAssertEqual(end, ptr + input.count, file: file, line: line) + input.withCString(encodedAs: UTF8.self) { ptr in + let endPtr = ptr + input.utf8.count + let (contents, delim, end) = try! 
lexRegex(start: ptr, end: endPtr) + XCTAssertEqual(end, endPtr, file: file, line: line) let (parseContents, parseDelim) = droppingRegexDelimiters(input) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) } +} + +func parseWithDelimitersTest( + _ input: String, _ expecting: AST.Node, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + delimiterLexingTest(input, file: file, line: line) let orig = try! parseWithDelimiters(input) let ast = orig.root @@ -199,6 +205,32 @@ func diagnosticTest( } } +func delimiterLexingDiagnosticTest( + _ input: String, _ expected: DelimiterLexError.Kind, + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, line: UInt = #line +) { + do { + _ = try input.withCString { ptr in + try lexRegex(start: ptr, end: ptr + input.count) + } + XCTFail(""" + Passed, but expected error: \(expected) + """, file: file, line: line) + } catch let e as DelimiterLexError { + guard e.kind == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.kind) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Unexpected error type: \(e)", file: file, line: line) + } +} + func libswiftDiagnosticMessageTest( _ input: String, _ expectedErr: String, file: StaticString = #file, line: UInt = #line @@ -1472,6 +1504,11 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰")) + parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅")) + + // Printable ASCII characters. + delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1890,6 +1927,16 @@ extension RegexTests { diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) } + func testDelimiterLexingErrors() { + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. + delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + } + delimiterLexingDiagnosticTest("re'\n'", .endOfString) + delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + } + func testlibswiftDiagnostics() { libswiftDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") From 56414b8afd3cacaeb31ed3bf7b7a1186b889b624 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 38/51] Allow lexer recovery for missing closing delimiter Allow the C++ lexer to form a tok::regex_literal. This avoids generic fallback behavior, and better allows for things like code completion. The test case for this will be in the C++ repo. 
--- .../Regex/Parse/DelimiterLexing.swift | 17 ++++++++------- .../_MatchingEngine/Regex/Parse/Mocking.swift | 14 +++++++++---- Sources/_MatchingEngine/Utility/Misc.swift | 21 +++++++++++++++++++ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 4b4618318..c4be948ac 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -208,14 +208,17 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + // The opening delimiter must match. + guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + else { return nil } + + // The closing delimiter may optionally match, as it may not be present in + // invalid code. + if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { + slice = newSlice + } + return String(slice) } for d in Delimiter.allCases { if let contents = stripDelimiter(d) { diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index b535edf1b..5994a4f52 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -61,10 +61,16 @@ func libswiftLexRegexLiteral( errOut.pointee = copyCString("\(error)") curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - // For now, treat every error as unrecoverable. - // TODO: We should ideally be able to recover from a regex with missing - // closing delimiters, which would help with code completion. - return true + switch error.kind { + case .endOfString: + // Missing closing delimiter can be recovered from. + return false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. + return true + case .unknownDelimiter: + fatalError("Already handled") + } } catch { fatalError("Should be a DelimiterLexError") } diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift index bd1e395b5..55d3d3adc 100644 --- a/Sources/_MatchingEngine/Utility/Misc.swift +++ b/Sources/_MatchingEngine/Utility/Misc.swift @@ -108,7 +108,28 @@ extension Collection { >(_ idx: Index, in c: C) -> C.Index { c.index(atOffset: offset(of: idx)) } +} +extension Collection where Element: Equatable { + /// Attempt to drop a given prefix from the collection, returning the + /// resulting subsequence, or `nil` if the prefix does not match. + public func tryDropPrefix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let prefixCount = other.count + guard prefix(prefixCount).elementsEqual(other) else { return nil } + return dropFirst(prefixCount) + } + + /// Attempt to drop a given suffix from the collection, returning the + /// resulting subsequence, or `nil` if the suffix does not match. + public func tryDropSuffix( + _ other: C + ) -> SubSequence? 
where C.Element == Element { + let suffixCount = other.count + guard suffix(suffixCount).elementsEqual(other) else { return nil } + return dropLast(suffixCount) + } } extension UnsafeMutableRawPointer { From 61450e875d80c622d5b7adb60cdf6cfc6be56439 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 39/51] Add lexing heuristic to handle single quotes in re'...' If a single quote is encountered with a prefix of either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue to scan ahead to a closing `'`. Such prefixes would not be valid endings for a regex literal anyway, and this lets us handle the single quote variant of their syntax. For the group name cases, further refine this skipping behavior by only skipping over characters that could possibly appear in that case. This improves diagnostic behavior by ensuring we don't go wandering off into Swift code. --- .../Regex/Parse/DelimiterLexing.swift | 92 +++++++++++++++ Tests/RegexTests/ParseTests.swift | 106 ++++++++++++++++-- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c4be948ac..f1d3d5607 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer { slice(at: cursor, count) } + /// Return the slice of `count` bytes preceding the current cursor, or `nil` + /// if there are fewer than `count` bytes before the cursor. + func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? { + let priorCursor = cursor - count + guard priorCursor >= start else { return nil } + return slice(at: priorCursor, count) + } + /// Advance the cursor `n` bytes. mutating func advanceCursor(_ n: Int = 1) { cursor += n @@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer { return true } + /// Attempt to skip over a closing delimiter character that is unlikely to be + /// the actual closing delimiter. + mutating func trySkipDelimiter(_ delimiter: Delimiter) { + // Only the closing `'` for re'...' can potentially be skipped over. + switch delimiter { + case .traditional, .experimental: + return + case .reSingleQuote: + break + } + guard load() == ascii("'") else { return } + + /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those + /// are the cases that could use single quotes. Note that none of these + /// would be valid regex endings anyway. + let calloutPrefix = "(?C" + let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in + guard let priorSlice = sliceBehind(prior.utf8.count), + priorSlice.elementsEqual(prior.utf8) + else { return false } + + // Make sure the slice isn't preceded by a '\', as that invalidates this + // analysis. + if let prior = sliceBehind(priorSlice.count + 1) { + return prior[0] != ascii("\\") + } + return true + } + guard let prefix = prefix else { return } + let isCallout = prefix == calloutPrefix + + func isPossiblyGroupReference(_ c: UInt8) -> Bool { + // If this is an ASCII character, make sure it's for a group name. Leave + // other UTF-8 encoded scalars alone, this should at least catch cases + // where we run into a symbol such as `{`, `.`, `;` that would indicate + // we've likely advanced out of the bounds of the regex. 
+ let scalar = UnicodeScalar(c) + guard scalar.isASCII else { return true } + switch scalar { + // Include '-' and '+' which may be used in recursion levels and relative + // references. + case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+": + return true + default: + return false + } + } + + // Make a note of the current lexing position, as we may need to revert + // back to it. + let originalCursor = cursor + advanceCursor() + + // Try skip over what would be the contents of a group identifier/reference. + while let next = load() { + // Found the ending, we're done. Return so we can continue to lex to the + // real delimiter. + if next == ascii("'") { + advanceCursor() + return + } + + // If this isn't a callout, make sure we have something that could be a + // group reference. We limit the character set here to improve diagnostic + // behavior in the case where the literal is actually unterminated. We + // ideally don't want to go wandering off into Swift source code. We can't + // do the same for callouts, as they take arbitrary strings. + guard isCallout || isPossiblyGroupReference(next) else { break } + do { + try advance() + } catch { + break + } + } + // We bailed out, either because we ran into something that didn't look like + // an identifier, or we reached the end of the line. Revert back to the + // original guess of delimiter. + cursor = originalCursor + } + /// Attempt to eat a particular closing delimiter, returning the contents of /// the literal, and ending pointer, or `nil` if this is not a delimiter /// ending. @@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer { let contentsStart = cursor while true { + // Check to see if we're at a character that looks like a delimiter, but + // likely isn't. In such a case, we can attempt to skip over it. + trySkipDelimiter(delimiter) + // Try to lex the closing delimiter. if let (contents, end) = try tryEatEnding(delimiter, contentsStart: contentsStart) { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b0b2e5309..b499c0b98 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,28 +107,46 @@ func parseTest( serializedCaptures.deallocate() } +/// Test delimiter lexing. Takes an input string that starts with a regex +/// literal. If `ignoreTrailing` is true, there may be additional characters +/// that follow the literal that are not considered part of it. +@discardableResult func delimiterLexingTest( - _ input: String, file: StaticString = #file, line: UInt = #line -) { + _ input: String, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) - XCTAssertEqual(end, endPtr, file: file, line: line) + if ignoreTrailing { + XCTAssertNotEqual(end, endPtr, file: file, line: line) + } else { + XCTAssertEqual(end, endPtr, file: file, line: line) + } - let (parseContents, parseDelim) = droppingRegexDelimiters(input) + let rawPtr = UnsafeRawPointer(ptr) + let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr) + let literal = String(decoding: buffer, as: UTF8.self) + + let (parseContents, parseDelim) = droppingRegexDelimiters(literal) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) + return literal } } +/// Test parsing an input string with regex delimiters. 
If `ignoreTrailing` is +/// true, there may be additional characters that follow the literal that are +/// not considered part of it. func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, + _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. - delimiterLexingTest(input, file: file, line: line) + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(input) + let orig = try! parseWithDelimiters(literal) let ast = orig.root guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround @@ -1509,6 +1527,63 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter + // if it's clear that it's part of the regex syntax. + + parseWithDelimitersTest( + #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + parseWithDelimitersTest( + #"re'(?'a_bcA0-c1A'x*)'"#, + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + + parseWithDelimitersTest( + #"re'(?('a_bcA0')x|y)'"#, conditional( + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + parseWithDelimitersTest( + #"re'(?('+20')\')'"#, conditional( + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) + + parseWithDelimitersTest( + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + parseWithDelimitersTest( + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + + parseWithDelimitersTest( + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + parseWithDelimitersTest( + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + + parseWithDelimitersTest( + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + + // Fine, because we don't end up skipping. + delimiterLexingTest(#"re'(?'"#) + delimiterLexingTest(#"re'(?('"#) + delimiterLexingTest(#"re'\k'"#) + delimiterLexingTest(#"re'\g'"#) + delimiterLexingTest(#"re'(?C'"#) + + // Not a valid group name, but we can still skip over it. + delimiterLexingTest(#"re'(?'🔥')'"#) + + // Escaped, so don't skip. These will ignore the ending `'` as we've already + // closed the literal. + parseWithDelimitersTest( + #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true + ) + delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) + delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1815,6 +1890,12 @@ extension RegexTests { diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) + // TODO: It might be better if tried to consume up to the closing `'` and + // diagnosed an invalid group name based on that. 
+ diagnosticTest(#"(?'abc ')"#, .expected("'")) + + diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) @@ -1928,6 +2009,9 @@ extension RegexTests { } func testDelimiterLexingErrors() { + + // MARK: Printable ASCII + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) @@ -1935,6 +2019,14 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'\n'", .endOfString) delimiterLexingDiagnosticTest("re'\r'", .endOfString) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + + // MARK: Delimiter skipping + + delimiterLexingDiagnosticTest("re'(?''", .endOfString) + delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) + delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) } func testlibswiftDiagnostics() { From 2325cef781477a4b51deeafcdae9c528a486e682 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:57 +0000 Subject: [PATCH 40/51] Add support for rx'...' for experimental syntax --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 +++++--- Tests/RegexTests/ParseTests.swift | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index f1d3d5607..1227ade1f 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -15,12 +15,14 @@ enum Delimiter: Hashable, CaseIterable { case traditional case experimental case reSingleQuote + case rxSingleQuote var openingAndClosing: (opening: String, closing: String) { switch self { case .traditional: return ("#/", "/#") case .experimental: return ("#|", "|#") case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") } } var opening: String { openingAndClosing.opening } @@ -31,7 +33,7 @@ enum Delimiter: Hashable, CaseIterable { switch self { case .traditional, .reSingleQuote: return .traditional - case .experimental: + case .experimental, .rxSingleQuote: return .experimental } } @@ -134,11 +136,11 @@ fileprivate struct DelimiterLexer { /// Attempt to skip over a closing delimiter character that is unlikely to be /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { - // Only the closing `'` for re'...' can potentially be skipped over. + // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. 
switch delimiter { case .traditional, .experimental: return - case .reSingleQuote: + case .reSingleQuote, .rxSingleQuote: break } guard load() == ascii("'") else { return } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b499c0b98..2ee76b682 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1497,6 +1497,9 @@ extension RegexTests { parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) + parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) + parseWithDelimitersTest("rx'a b'", concat("a", "b")) + parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", changeMatchingOptions( @@ -1537,6 +1540,9 @@ extension RegexTests { #"re'(?'a_bcA0-c1A'x*)'"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + parseWithDelimitersTest( + #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) From 688f1d82a50615a46018279270a9323065ba12b0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 17:09:47 +0000 Subject: [PATCH 41/51] Change script property default to use Script Extension Change the default script property behavior for an unqualified value e.g `\p{Greek}` from `\p{Script=Greek}` to `\p{Script_Extension=Greek}`. This is arguably the more intuitive behavior, and matches what Perl does. --- .../Regex/Parse/CharacterPropertyClassification.swift | 2 +- Tests/RegexTests/MatchTests.swift | 1 + Tests/RegexTests/ParseTests.swift | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift index 6a5740aa1..e5b65a46c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift @@ -381,7 +381,7 @@ extension Source { return .generalCategory(cat) } if let script = classifyScriptProperty(value) { - return .script(script) + return .scriptExtension(script) } if let posix = classifyPOSIX(value) { return .posix(posix) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3cd2df585..6ee55e414 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -695,6 +695,7 @@ extension RegexTests { firstMatchTest(#"\p{ISBAMUM}"#, input: "123ꚠꚡꚢxyz", match: "ꚠ") firstMatchTest(#"\p{Script=Unknown}"#, input: "\u{10FFFF}", match: "\u{10FFFF}") firstMatchTest(#"\p{scx=Gujr}"#, input: "\u{a839}", match: "\u{a839}") + firstMatchTest(#"\p{Gujr}"#, input: "\u{a839}", match: "\u{a839}") firstMatchTest(#"\p{alpha}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\P{alpha}"#, input: "123abcXYZ", match: "1") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..e911ee449 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1003,13 +1003,13 @@ extension RegexTests { parseTest(#"\p{sc=grek}"#, prop(.script(.greek))) parseTest(#"\p{sc=isGreek}"#, prop(.script(.greek))) - parseTest(#"\p{Greek}"#, prop(.script(.greek))) - parseTest(#"\p{isGreek}"#, prop(.script(.greek))) + parseTest(#"\p{Greek}"#, prop(.scriptExtension(.greek))) + parseTest(#"\p{isGreek}"#, 
prop(.scriptExtension(.greek))) parseTest(#"\P{Script=Latn}"#, prop(.script(.latin), inverted: true)) parseTest(#"\p{script=zzzz}"#, prop(.script(.unknown))) parseTest(#"\p{ISscript=iszzzz}"#, prop(.script(.unknown))) parseTest(#"\p{scx=bamum}"#, prop(.scriptExtension(.bamum))) - parseTest(#"\p{ISBAMUM}"#, prop(.script(.bamum))) + parseTest(#"\p{ISBAMUM}"#, prop(.scriptExtension(.bamum))) parseTest(#"\p{alpha}"#, prop(.binary(.alphabetic))) parseTest(#"\p{DEP}"#, prop(.binary(.deprecated))) From 5c64747bf113a5921b05dbe764f4a006305e9f66 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 2 Mar 2022 10:15:14 -0700 Subject: [PATCH 42/51] Rework: syntax for both literals and run time --- Documentation/Evolution/RegexSyntax.md | 33 ++++++++++++-------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 693b0f3a4..cfea740f6 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -1,20 +1,14 @@ -# Regex Literal Interior Syntax +# Regex Syntax - Authors: Hamish Knight, Michael Ilseman ## Introduction -Regex literals declare a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Formalizing regex literals in Swift requires: - -- Choosing a delimiter (e.g. `#/.../#` or `re'...'`). -- Detailing the "interior syntax" accepted in between delimiters. -- Specifying actual types and relevant protocols for the literal. - -We present a detailed and comprehensive treatment of regex literal interior syntax. The syntax we're proposing is large enough for its own dedicated discussion ahead of a full regex literal proposal. +A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Regexes can be created from a string at run time or from a literal at compile time. The contents of that run-time string, or the contents in-between the compile-time literal's delimiters, uses regex syntax. We present a detailed and comprehensive treatment of regex syntax. This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. @@ -23,7 +17,7 @@ This is part of a larger effort in supporting regex literals, which in turn is p Swift aims to be a pragmatic programming language, striking a balance between familiarity, interoperability, and advancing the art. Swift's `String` presents a uniquely Unicode-forward model of string, but currently suffers from limited processing facilities. -The full string processing effort includes a literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, strong types, and a slew of regex-powered algorithms over strings. +The full string processing effort includes a regex type with strongly typed captures, the ability to create a regex from a string at runtime, a compile-time literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, and a slew of regex-powered algorithms over strings. This proposal specifically hones in on the _familiarity_ aspect by providing a best-in-class treatment of familiar regex syntax. 
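For concreteness (the spellings below are placeholders — the literal delimiter and the run-time construction API are pitched separately), the same regex syntax is intended to work in both positions:

```swift
// Placeholder spellings: delimiter choice and run-time API are separate pitches.
let fromLiteral = re'\d+ (?<word>\w+)'                        // compile-time literal
let fromString  = try Regex(compiling: #"\d+ (?<word>\w+)"#)  // run-time string; initializer name is illustrative
```
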
@@ -42,11 +36,11 @@ We also support [UTS#18][uts18]'s full set of character class operators (to our Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below -Regex literal interior syntax will be part of Swift's source-compatibility story as well as its binary-compatibility story. Thus, we present a detailed and comprehensive design. +Regex syntax will be part of Swift's source-compatibility story as well as its binary-compatibility story. Thus, we present a detailed and comprehensive design. ## Detailed Design -We propose the following syntax for use inside Swift regex literals. +We propose the following syntax for regex.
Grammar Notation @@ -79,7 +73,7 @@ Alternation -> Concatenation ('|' Concatenation)* Concatenation -> (!'|' !')' ConcatComponent)* ``` -A regex literal may be prefixed with a sequence of [global matching options](#pcre-global-matching-options). A literal's contents can be empty or a sequence of alternatives separated by `|`. +A regex may be prefixed with a sequence of [global matching options](#pcre-global-matching-options). Its contents can be empty or a sequence of alternatives separated by `|`. Alternatives are a series of expressions concatenated together. The concatentation ends with either a `|` denoting the end of the alternative or a `)` denoting the end of a recursively parsed group. @@ -471,6 +465,8 @@ These options are specific to the Swift regex matching engine and control the se - `u`: Unicode scalar matching. - `b`: Byte matching. +Further details on these are TBD and outside the scope of this pitch. + ### References ``` @@ -816,9 +812,10 @@ We are deferring runtime support for callouts from regex literals as future work ## Alternatives Considered -### Skip the literals -The top alternative is to just skip regex literals and only ship the result builder DSL. However, doing so would miss out on the familiarity benefits of existing regex syntax. +### Skip the syntax + +The top alternative is to just skip regex syntax altogether by only shipping the result builder DSL and forbidding run-time regex construction from strings. However, doing so would miss out on the familiarity benefits of existing regex syntax. Additionally, without support for run-time strings containing regex syntax, important domains would be closed off from better string processing, such as command-line tools and user-input searches. This would land us in a confusing world where NSRegularExpression, even though it operates over a fundamentally different model of string than Swift's `String` and exhibits different behavior than Swift regexes, is still used for these purposes. We consider our proposed direction to be more compelling, especially when coupled with refactoring actions to convert literals into regex DSLs. @@ -830,11 +827,11 @@ We are prototyping an "experimental" Swift extended syntax, which is future work ### Support a minimal syntactic subset -Regex literal interior syntax will become part of Swift's source and binary-compatibility story, so a reasonable alternative is to support the absolute minimal syntactic subset available. However, we would need to ensure that such a minimal approach is extensible far into the future. Because syntax decisions can impact each other, we would want to consider the ramifications of this full syntactic superset ahead of time anyways. +Regex syntax will become part of Swift's source and binary-compatibility story, so a reasonable alternative is to support the absolute minimal syntactic subset available. However, we would need to ensure that such a minimal approach is extensible far into the future. Because syntax decisions can impact each other, we would want to consider the ramifications of this full syntactic superset ahead of time anyways. -Even though it is more work up-front, and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex literals. +Even though it is more work up-front and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex syntax. 
-Note that this proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's run-time in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. +Note that this proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. From e66743608b74924234e47881559e24edb0ea87ae Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 19:00:05 +0000 Subject: [PATCH 43/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index cfea740f6..ef864cc6e 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -416,7 +416,7 @@ Operators may be used to apply set operations to character class members. The op These operators have a lower precedence than the implicit union of members, e.g `[ac-d&&a[d]]` is an intersection of the character classes `[ac-d]` and `[ad]`. -To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We intend to follow this behavior. +To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We propose following this behavior. ### Matching options @@ -666,9 +666,9 @@ Engines that don't support a particular operator fallback to treating it as lite Unlike other engines, .NET supports the use of `-` to denote both a range as well as a set subtraction. .NET disambiguates this by only permitting its use as a subtraction if the right hand operand is a nested custom character class, otherwise it is a range operator. This conflicts with e.g ICU where `[x-[y]]`, in which the `-` is treated as literal. -We intend to support the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant. +We propose supporting the operators `&&`, `--`, and `~~`. This means that any regex literal containing these sequences in a custom character class while being written for an engine not supporting that operation will have a different semantic meaning in our engine. However this ought not to be a common occurrence, as specifying a character multiple times in a custom character class is redundant. -In the interests of compatibility, we also intend on supporting the `-` operator, though we will likely want to emit a warning and encourage users to switch to `--`. 
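For illustration, here is what the operators discussed above look like in plain pattern strings; proposing the syntax is a separate question from when the run-time engine supports each operation.

```swift
// Custom character class set operations discussed above (raw strings keep the
// backslashes literal); syntactic acceptance is what is proposed here.
let consonants = #"[a-z&&[^aeiou]]"#      // intersection: lowercase letters that are not vowels
let nonDigits  = #"[\w--\d]"#             // subtraction: word characters minus digits
let symDiff    = #"[\p{Latin}~~\p{Ll}]"#  // symmetric difference
let dotNetSub  = #"[0-9-[4]]"#            // '-' subtraction with a nested class (accepted, with a warning steering toward '--')
```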
+In the interests of compatibility, we also propose supporting the `-` operator, though we will likely want to emit a warning and encourage users to switch to `--`. ### Nested custom character classes @@ -685,15 +685,15 @@ PCRE does not support this feature, and as such treats `]` as the closing charac .NET does not support nested character classes in general, although allows them as the right-hand side of a subtraction operation. -We intend on permitting nested custom character classes. +We propose allowing nested custom character classes. ### `\U` -In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` matches literal `U`. However in ICU, `\Uhhhhhhhh` matches a hex sequence. We intend on following the ICU behavior. +In PCRE, if `PCRE2_ALT_BSUX` or `PCRE2_EXTRA_ALT_BSUX` are specified, `\U` matches literal `U`. However in ICU, `\Uhhhhhhhh` matches a hex sequence. We propose following the ICU behavior. ### `{,n}` -This quantifier is supported by Oniguruma, but in PCRE it matches the literal characters `{`, `,`, `n`, and `}` in sequence. We intend on supporting it as a quantifier. +This quantifier is supported by Oniguruma, but in PCRE it matches the literal characters `{`, `,`, `n`, and `}` in sequence. We propose supporting it as a quantifier. ### `\DDD` @@ -709,17 +709,17 @@ Otherwise it is treated as an octal sequence. Oniguruma follows all of these except the second. If the first digit is `8` or `9`, it is instead treated as the literal number, e.g `\81` is `81`. .NET also follows this behavior, but additionally has the last condition consider *all* groups, not just prior ones (as backreferences can refer to future groups in recursive cases). -We intend to implement a simpler behavior more inline with ICU and Java. A `\DDD` sequence that does not start with a `0` will be treated as a backreference, otherwise it will be treated as an octal sequence. If an invalid backreference is formed with this syntax, we will suggest prefixing with a `0` if an octal sequence is desired. +We propose a simpler behavior more inline with ICU and Java. A `\DDD` sequence that does not start with a `0` will be treated as a backreference, otherwise it will be treated as an octal sequence. If an invalid backreference is formed with this syntax, we will suggest prefixing with a `0` if an octal sequence is desired. -One further difference exists between engines in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. We intend to follow the ICU behavior, as it is necessary when requiring a `0` prefix. +One further difference exists between engines in the octal sequence case. In ICU, up to 3 additional digits are read after the `0`. In PCRE, only 2 additional digits may be interpreted as octal, the last is literal. We will follow the ICU behavior, as it is necessary when requiring a `0` prefix. ### `\x` -In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denotes literal `x`. We intend on following the PCRE behavior. +In PCRE, a bare `\x` denotes the NUL character (`U+00`). In Oniguruma, it denotes literal `x`. We propose following the PCRE behavior. ### Whitespace in ranges -In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces e.g `x{2, 4}`, it becomes an invalid range and is then treated as the literal characters instead. 
We find this behavior to be unintuitive, and therefore intend to parse any intermixed whitespace in the range. +In PCRE, `x{2,4}` is a range quantifier meaning that `x` can be matched from 2 to 4 times. However if any whitespace is introduced within the braces e.g `x{2, 4}`, it becomes an invalid range and is then treated as the literal characters instead. We find this behavior to be unintuitive, and therefore propose parsing any intermixed whitespace in the range. ### Implicitly-scoped matching option scopes @@ -727,7 +727,7 @@ PCRE and Oniguruma both support changing the active matching options through an These sound similar, but have different semantics around alternations, e.g for `a(?i)b|c|d`, in Oniguruma this becomes `a(?i:b|c|d)`, where `a` is no longer part of the alternation. However in PCRE it becomes `a(?i:b)|(?i:c)|(?i:d)`, where `a` remains a child of the alternation. -We intend on matching the PCRE behavior. +We propose matching the PCRE behavior. ### Backreference condition kinds @@ -739,15 +739,15 @@ PCRE and .NET allow for conditional patterns to reference a group by its name wi where `y` will only be matched if `(?x)` was matched. PCRE will always treat such syntax as a backreference condition, however .NET will only treat it as such if a group with that name exists somewhere in the regex (including after the conditional). Otherwise, .NET interprets `group1` as an arbitrary regular expression condition to try match against. Oniguruma on the other hand will always treat `group1` as an regex condition to match against. -We intend to always parse such conditions as an arbitrary regular expression condition, and will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma. +We propose parsing such conditions as an arbitrary regular expression condition, as long as they do not conflict with other known condition spellings such as `R&name`. If the condition has a name that matches a named group in the regex, we will emit a warning asking users to explicitly use the syntax `(?()y)` if they want a backreference condition. This more explicit syntax is supported by both PCRE and Oniguruma. ### `\N` -PCRE supports `\N` meaning "not a newline", however there are engines that treat it as a literal `N`. We intend on supporting the PCRE behavior. +PCRE supports `\N` meaning "not a newline", however there are engines that treat it as a literal `N`. We propose supporting the PCRE behavior. ### Extended character property syntax -ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We intend to support this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. +ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We propose supporting this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. 
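As an illustration of the unified grammar, the spellings below are expected to be interchangeable; the last example shows the additive part that a plain POSIX engine would not accept.

```swift
// POSIX-style and \p{...} spellings under the unified property grammar
// (plain pattern strings; the equivalences are as described above).
let posixSpelling    = #"[[:alpha:]]+"#
let propertySpelling = #"[\p{alpha}]+"#
let unicodeOnly      = #"[[:script=Greek:]]+"#  // a Unicode property in POSIX-style brackets
```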
### Script properties @@ -759,7 +759,7 @@ As such we feel that the more desirable default behavior of shorthand script pro Various regex engines offer an "extended syntax" where whitespace is treated as non-semantic (e.g `a b c` is equivalent to `abc`), in addition to allowing end-of-line comments `# comment`. In both PCRE and Perl, this is enabled through the `(?x)`, and in later versions, `(?xx)` matching options. The former allows non-semantic whitespace outside of character classes, and the latter also allows non-semantic whitespace in custom character classes. -Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore intend to follow this behavior, with `(?x)` and `(?xx)` being treated the same. +Oniguruma, Java, and ICU however enable the more broad behavior under `(?x)`. We therefore propose following this behavior, with `(?x)` and `(?xx)` being treated the same. Different regex engines also have different rules around what characters are considered non-semantic whitespace. When compiled with Unicode support, PCRE considers the following whitespace: @@ -771,7 +771,7 @@ Different regex engines also have different rules around what characters are con - Line separator `U+2028` - Paragraph separator `U+2029` -This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we intend to support exactly the characters in this list for the purposes of non-semantic whitespace parsing. +This is a subset of the scalars matched by `UnicodeScalar.isWhitespace`. Additionally, in a custom character class, PCRE only considers the space and tab characters as whitespace. Other engines do not differentiate between whitespace characters inside and outside custom character classes, and appear to follow a subset of this list. Therefore we propose supporting exactly the characters in this list for the purposes of non-semantic whitespace parsing. ### Group numbering @@ -785,7 +785,7 @@ In PCRE, groups are numbered according to the position of their opening parenthe The `(z)` group gets numbered before the named groups get numbered. -We intend on matching the PCRE behavior where groups are numbered purely based on order. +We propose matching the PCRE behavior where groups are numbered purely based on order. ## Swift canonical syntax From 9e2828feae7591b2a7186603e5d5aed8879d25a2 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 19:01:15 +0000 Subject: [PATCH 44/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index ef864cc6e..cd809e12c 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -34,7 +34,7 @@ To our knowledge, all other popular regex engines support a subset of the above We also support [UTS#18][uts18]'s full set of character class operators (to our knowledge no other engine does). Beyond that, UTS#18 deals with semantics rather than syntax, and what syntax it uses is covered by the above list. We also parse Java's properties (e.g. `\p{javaLowerCase}`), meaning we support a superset of Java 8 as well. 
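As a quick illustration of that superset, the sketch below mixes standard Unicode property spellings with Java-style ones; all of them are parsed, which is distinct from what the run-time engine matches initially.

```swift
// Property spellings from different lineages, all covered by the parsed superset
// (plain pattern strings for illustration).
let generalCategory = #"\p{Lowercase_Letter}+"#  // standard Unicode property
let javaStyle       = #"\p{javaLowerCase}+"#     // Java 8 property, as noted above
let negatedJava     = #"\P{javaWhitespace}+"#    // negated form
```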
-Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below +Note that there are minor syntactic incompatibilities and ambiguities involved in this approach. Each is addressed in the relevant sections below. Regex syntax will be part of Swift's source-compatibility story as well as its binary-compatibility story. Thus, we present a detailed and comprehensive design. From f79fb923f4e02ce736d97578bad5d363ba7eda96 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 2 Mar 2022 12:40:39 -0700 Subject: [PATCH 45/51] Update into Moved the note about syntactic support not meaning run-time support to intro --- Documentation/Evolution/RegexSyntax.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index cd809e12c..d19366602 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -10,7 +10,7 @@ Hello, we want to issue an update to [Regular Expression Literals](https://forum A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Regexes can be created from a string at run time or from a literal at compile time. The contents of that run-time string, or the contents in-between the compile-time literal's delimiters, uses regex syntax. We present a detailed and comprehensive treatment of regex syntax. -This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. +This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. This proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. ## Motivation @@ -831,9 +831,6 @@ Regex syntax will become part of Swift's source and binary-compatibility story, Even though it is more work up-front and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex syntax. -Note that this proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. 
- - [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html [oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE From e8c84a19714041eb8544c492a2aa148b2868240d Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 13:52:19 -0600 Subject: [PATCH 46/51] Add CustomRegexComponent example (#196) Adds a test with a `SemanticVersion` type that conforms to `CustomRegexComponent`. --- Tests/RegexTests/RegexDSLTests.swift | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 554ef905f..d599400c6 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -642,6 +642,59 @@ class RegexDSLTests: XCTestCase { } } } + + func testSemanticVersionExample() { + struct SemanticVersion: Equatable { + var major: Int + var minor: Int + var patch: Int + var dev: String? + } + struct SemanticVersionParser: CustomRegexComponent { + typealias Match = SemanticVersion + func match( + _ input: String, + startingAt index: String.Index, + in bounds: Range + ) -> (upperBound: String.Index, match: SemanticVersion)? { + let regex = Regex { + tryCapture(oneOrMore(.digit)) { Int($0) } + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + optionally { + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + } + optionally { + "-" + capture(oneOrMore(.word)) + } + } + + guard let match = input[index.. Date: Thu, 3 Mar 2022 13:13:09 +0000 Subject: [PATCH 47/51] Update RegexSyntax.md --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index d19366602..3ff5374f6 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -476,7 +476,7 @@ NumberRef -> ('+' | '-')? RecursionLevel? RecursionLevel -> '+' | '-' ``` -A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. +A reference is an abstract identifier for a particular capturing group in a regular expression. It can either be named or numbered, and in the latter case may be specified relative to the current group. For example `-2` refers to the capture group `N - 2` where `N` is the number of the next capture group. References may refer to groups ahead of the current position e.g `+3`, or the name of a future group. These may be useful in recursive cases where the group being referenced has been matched in a prior iteration. If a referenced capture does not exist anywhere in the regular expression, the reference is diagnosed as invalid. A backreference may optionally include a recursion level in certain cases, which is a syntactic element inherited [from Oniguruma][oniguruma-syntax] that allows the reference to specify a capture relative to a given recursion level. 
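For illustration, here are representative reference spellings behaving as described above; the full set of accepted spellings is given by the backreference grammar elsewhere in this document.

```swift
// Reference spellings as plain pattern strings (representative, not exhaustive).
let numbered = #"(\w+) \1"#               // group 1 must occur again
let named    = #"(?<word>\w+) \k<word>"#  // named reference
let relative = #"(a)(b) \g{-1}"#          // -1 counts back from the next group number, so this is group 2
let invalid  = #"(a) \3"#                 // diagnosed as invalid: no group 3 exists anywhere in the pattern
```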
From 7b26028e1888048825a4acd92a0eb90af21e9b7a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 3 Mar 2022 08:52:06 -0600 Subject: [PATCH 48/51] Support text segment boundary anchors (#178) This enables the `\y` and `\Y` anchors in regex literals and `Anchor.textSegmentBoundary` in the DSL. Note: This also includes `UnicodeScalar` conformance to `RegexProtocol`, which acts like Unicode scalar literals in regex literals. --- Sources/_StringProcessing/ByteCodeGen.swift | 12 ++++++++---- Sources/_StringProcessing/RegexDSL/DSL.swift | 16 +++++++++++----- Tests/RegexTests/MatchTests.swift | 16 ++++++++++++++-- Tests/RegexTests/RegexDSLTests.swift | 10 ++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 93dca17a8..d6389c1f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen { } case .textSegment: - // This we should be able to do! - throw Unsupported(#"\y (text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + input.isOnGraphemeClusterBoundary(pos) + } case .notTextSegment: - // This we should be able to do! - throw Unsupported(#"\Y (not text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + !input.isOnGraphemeClusterBoundary(pos) + } case .startOfLine: builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 17f006231..a21dce82d 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -17,8 +17,7 @@ extension String: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(self)) } } @@ -26,8 +25,7 @@ extension Substring: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(String(self))) } } @@ -35,7 +33,15 @@ extension Character: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.char(self))) + .init(node: .atom(.char(self))) + } +} + +extension UnicodeScalar: RegexProtocol { + public typealias Match = Substring + + public var regex: Regex { + .init(node: .atom(.scalar(self))) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 6ee55e414..dba72820f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -877,7 +877,8 @@ extension RegexTests { #"\d+\b"#, ("123", "123"), (" 123", "123"), - ("123 456", "123")) + ("123 456", "123"), + ("123A 456", "456")) firstMatchTests( #"\d+\b\s\b\d+"#, ("123", nil), @@ -893,7 +894,18 @@ extension RegexTests { // TODO: \G and \K // TODO: Oniguruma \y and \Y - + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both: + ("Cafe\u{301}", "e"), // composed and + ("Sol Cafe", "e")) // standalone + firstMatchTests( + #"\u{65}\y"#, // Grapheme boundary assertion + ("Cafe\u{301}", nil), + ("Sol Cafe", "e")) + firstMatchTests( + #"\u{65}\Y"#, // Grapheme non-boundary assertion + ("Cafe\u{301}", "e"), + ("Sol Cafe", nil)) } func testMatchGroups() { diff --git a/Tests/RegexTests/RegexDSLTests.swift 
b/Tests/RegexTests/RegexDSLTests.swift
index d599400c6..d78ff04e5 100644
--- a/Tests/RegexTests/RegexDSLTests.swift
+++ b/Tests/RegexTests/RegexDSLTests.swift
@@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase {
       Anchor.endOfLine
     }
 
+    try _testDSLCaptures(
+      ("Cafe\u{301}", nil),
+      ("Cafe", "Cafe"),
+      matchType: Substring.self, ==)
+    {
+      oneOrMore(.word)
+      UnicodeScalar("e")
+      Anchor.textSegmentBoundary
+    }
+
     try _testDSLCaptures(
       ("aaaaa1", "aaaaa1"),
       ("aaaaa2", nil),

From 0461fa82986da5cc88068f4848c93d16d5585ea4 Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 3 Mar 2022 19:54:32 +0000
Subject: [PATCH 49/51] Update Documentation/Evolution/RegexSyntax.md

Co-authored-by: Nate Cook
---
 Documentation/Evolution/RegexSyntax.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md
index 3ff5374f6..0132bdbe9 100644
--- a/Documentation/Evolution/RegexSyntax.md
+++ b/Documentation/Evolution/RegexSyntax.md
@@ -155,8 +155,8 @@ Anchor -> '^' | '$' | '\A' | '\b' | '\B' | '\G' | '\y' | '\Y' | '\z' | '\Z'
 
 Anchors match against a certain position in the input rather than on a particular character of the input.
 
-- `^`: Matches at the start of a line.
-- `$`: Matches at the end of a line.
+- `^`: Matches at the very start of the input string, or the start of a line when in multi-line mode.
+- `$`: Matches at the very end of the input string, or the end of a line when in multi-line mode.
 - `\A`: Matches at the very start of the input string.
 - `\Z`: Matches at the very end of the input string, in addition to before a newline at the very end of the input string.
 - `\z`: Like `\Z`, but only matches at the very end of the input string.

From 971cccc00224e9dee3a23ca7893e9706e45d056c Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 3 Mar 2022 21:40:07 +0000
Subject: [PATCH 50/51] Fix negative lookbehind syntax in RegexSyntax.md

---
 Documentation/Evolution/RegexSyntax.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md
index 0132bdbe9..5bfdd5e8a 100644
--- a/Documentation/Evolution/RegexSyntax.md
+++ b/Documentation/Evolution/RegexSyntax.md
@@ -337,7 +337,7 @@ These groups evaluate the input ahead or behind the current matching position, w
 - `(?=`: A lookahead, which matches against the input following the current matching position.
 - `(?!`: A negative lookahead, which ensures a negative match against the input following the current matching position.
 - `(?<=`: A lookbehind, which matches against the input prior to the current matching position.
-- `(?!<`: A negative lookbehind, which ensures a negative match against the input prior to the current matching position.
+- `(?<!`: A negative lookbehind, which ensures a negative match against the input prior to the current matching position.
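For illustration, the four lookaround introducers listed above, written as plain pattern strings:

```swift
// Lookaround examples (plain pattern strings); lookarounds assert without consuming input.
let price     = #"(?<=\$)\d+"#    // lookbehind: digits preceded by "$"
let wholePart = #"\d+(?!\.\d)"#   // negative lookahead: digits not followed by ".digit"
let notAfterA = #"(?<!a)b"#       // negative lookbehind: "b" not preceded by "a"
```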
Date: Thu, 3 Mar 2022 21:37:44 -0500 Subject: [PATCH 51/51] Tweak a few generic signatures to work around a bug in the Requirement Machine Here is the reduced test case: func foo(_: C, _: R) where C.SubSequence == Substring, C.Element == R.Element {} The Requirement Machine produces a valid but different minimization than the GenericSignatureBuilder for this signature, which flags the assert since right now we run both and compare the results: error: compile command failed due to signal 6 (use -v to see invocation) RequirementMachine generic signature minimization is broken: RequirementMachine says: GenericSignatureBuilder says: Radar tracking the fix: rdar://problem/89791117 --- .../_StringProcessing/Algorithms/Algorithms/Replace.swift | 6 +++--- .../Algorithms/Matching/MatchReplace.swift | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index 36a28b381..6a8c97a6a 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -154,7 +154,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { with replacement: Replacement, subrange: Range, maxReplacements: Int = .max - ) -> Self where Replacement.Element == Element { + ) -> Self where Replacement.Element == Character { replacing( RegexConsumer(regex), with: replacement, @@ -166,7 +166,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { _ regex: R, with replacement: Replacement, maxReplacements: Int = .max - ) -> Self where Replacement.Element == Element { + ) -> Self where Replacement.Element == Character { replacing( regex, with: replacement, @@ -178,7 +178,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { _ regex: R, with replacement: Replacement, maxReplacements: Int = .max - ) where Replacement.Element == Element { + ) where Replacement.Element == Character { self = replacing( regex, with: replacement, diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift index 2feb09df0..281b568f7 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift @@ -80,7 +80,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { with replacement: (_MatchResult>) throws -> Replacement, subrange: Range, maxReplacements: Int = .max - ) rethrows -> Self where Replacement.Element == Element { + ) rethrows -> Self where Replacement.Element == Character { try replacing( RegexConsumer(regex), with: replacement, @@ -92,7 +92,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { _ regex: R, with replacement: (_MatchResult>) throws -> Replacement, maxReplacements: Int = .max - ) rethrows -> Self where Replacement.Element == Element { + ) rethrows -> Self where Replacement.Element == Character { try replacing( regex, with: replacement, @@ -104,7 +104,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { _ regex: R, with replacement: (_MatchResult>) throws -> Replacement, maxReplacements: Int = .max - ) rethrows where Replacement.Element == Element { + ) rethrows where Replacement.Element == Character { self = try replacing( regex, with: replacement,