From 05ce1d3345c7bd50cac79cef68309f69bbcf4e93 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool
Date: Sun, 23 Jan 2022 11:40:06 -0800
Subject: [PATCH 01/19] build: add a CMake based build

This is a first approximation of a CMake based build for this repository.
The intent here is to localize the build rules for the repository to allow
it to be consumed during the build of the toolchain. This allows the source
list to be maintained in the repository as the source of truth rather than
be explicitly listed in the swift repository.

For general purpose development, the SPM based build is recommended. Unless
there is a specific need for the tests to be included, testing should be
done via the SPM build.

This change is sufficient to build the content though does not perform the
install or export steps which will be required to consume the results in
the Swift build.

Example invocation:

~~~
cmake -B S:\b\16 ^
  -D CMAKE_BUILD_TYPE=Release ^
  -D ArgumentParser_DIR=S:\b\10\cmake\modules ^
  -G Ninja ^
  -S S:\SourceCache\swift-experimental-string-processing
cmake --build S:\b\16
~~~
---
 CMakeLists.txt                            | 17 +++++++++
 Sources/CMakeLists.txt                    |  6 +++
 Sources/Prototypes/CMakeLists.txt         | 18 +++++++++
 Sources/VariadicsGenerator/CMakeLists.txt |  7 ++++
 Sources/_MatchingEngine/CMakeLists.txt    | 46 +++++++++++++++++++++++
 Sources/_StringProcessing/CMakeLists.txt  | 42 +++++++++++++++++++++
 Sources/_Unicode/CMakeLists.txt           | 16 ++++++++
 7 files changed, 152 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 Sources/CMakeLists.txt
 create mode 100644 Sources/Prototypes/CMakeLists.txt
 create mode 100644 Sources/VariadicsGenerator/CMakeLists.txt
 create mode 100644 Sources/_MatchingEngine/CMakeLists.txt
 create mode 100644 Sources/_StringProcessing/CMakeLists.txt
 create mode 100644 Sources/_Unicode/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..ad640f43a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+cmake_minimum_required(VERSION 3.18)
+project(SwiftExperimentalStringProcessing
+  LANGUAGES Swift)
+
+if(CMAKE_SYSTEM_NAME STREQUAL Windows OR CMAKE_SYSTEM_NAME STREQUAL Darwin)
+  option(BUILD_SHARED_LIBS "Build shared libraries by default" YES)
+endif()
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_Swift_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/swift)
+
+find_package(ArgumentParser CONFIG)
+
+add_subdirectory(Sources)
diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt
new file mode 100644
index 000000000..19feadbd9
--- /dev/null
+++ b/Sources/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+add_subdirectory(_Unicode)
+add_subdirectory(_MatchingEngine)
+add_subdirectory(_StringProcessing)
+add_subdirectory(Prototypes)
+add_subdirectory(VariadicsGenerator)
diff --git a/Sources/Prototypes/CMakeLists.txt b/Sources/Prototypes/CMakeLists.txt
new file mode 100644
index 000000000..60768f5a3
--- /dev/null
+++ b/Sources/Prototypes/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+add_library(Prototypes
+  Combinators/Combinators.swift
+  PEG/PEG.swift
+  PEG/PEGCode.swift
+  PEG/PEGCompile.swift
+  PEG/PEGCore.swift
+  PEG/PEGInterpreter.swift
+  PEG/PEGTranspile.swift
+  PEG/PEGVM.swift
+  PEG/PEGVMExecute.swift
+  PEG/Printing.swift
+  PTCaRet/Interpreter.swift
+  PTCaRet/PTCaRet.swift
+  TourOfTypes/CharacterClass.swift
+  TourOfTypes/Literal.swift)
+target_link_libraries(Prototypes PUBLIC
+  _MatchingEngine)
diff --git a/Sources/VariadicsGenerator/CMakeLists.txt b/Sources/VariadicsGenerator/CMakeLists.txt
new file mode 100644
index 000000000..8ea543970
--- /dev/null
+++ b/Sources/VariadicsGenerator/CMakeLists.txt
@@ -0,0 +1,7 @@
+
+add_executable(VariadicsGenerator
+  VariadicsGenerator.swift)
+target_compile_options(VariadicsGenerator PRIVATE
+  -parse-as-library)
+target_link_libraries(VariadicsGenerator PUBLIC
+  ArgumentParser)
diff --git a/Sources/_MatchingEngine/CMakeLists.txt b/Sources/_MatchingEngine/CMakeLists.txt
new file mode 100644
index 000000000..f7cb97ce3
--- /dev/null
+++ b/Sources/_MatchingEngine/CMakeLists.txt
@@ -0,0 +1,46 @@
+
+add_library(_MatchingEngine
+  Engine/Backtracking.swift
+  Engine/Builder.swift
+  Engine/Capture.swift
+  Engine/Consume.swift
+  Engine/Engine.swift
+  Engine/InstPayload.swift
+  Engine/Instruction.swift
+  Engine/Processor.swift
+  Engine/Program.swift
+  Engine/Registers.swift
+  Engine/Tracing.swift
+  Regex/AST/AST.swift
+  Regex/AST/ASTAction.swift
+  Regex/AST/ASTProtocols.swift
+  Regex/AST/Atom.swift
+  Regex/AST/Conditional.swift
+  Regex/AST/CustomCharClass.swift
+  Regex/AST/Group.swift
+  Regex/AST/MatchingOptions.swift
+  Regex/AST/Quantification.swift
+  Regex/Parse/CaptureStructure.swift
+  Regex/Parse/CharacterPropertyClassification.swift
+  Regex/Parse/Diagnostics.swift
+  Regex/Parse/LexicalAnalysis.swift
+  Regex/Parse/Mocking.swift
+  Regex/Parse/Parse.swift
+  Regex/Parse/Source.swift
+  Regex/Parse/SourceLocation.swift
+  Regex/Parse/SyntaxOptions.swift
+  Regex/Printing/DumpAST.swift
+  Regex/Printing/PrettyPrinter.swift
+  Regex/Printing/PrintAsCanonical.swift
+  Regex/Printing/PrintAsPattern.swift
+  Regex/Printing/RenderRanges.swift
+  Utility/AllScalars.swift
+  Utility/Formatting.swift
+  Utility/Misc.swift
+  Utility/MissingUnicode.swift
+  Utility/Protocols.swift
+  Utility/TypeConstruction.swift
+  Utility/TypedIndex.swift
+  Utility/TypedInt.swift)
+target_compile_options(_MatchingEngine PRIVATE
+  -enable-library-evolution)
diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt
new file mode 100644
index 000000000..c20dcc240
--- /dev/null
+++ b/Sources/_StringProcessing/CMakeLists.txt
@@ -0,0 +1,42 @@
+
+add_library(_StringProcessing
+  Algorithms/Algorithms/Contains.swift
+  Algorithms/Algorithms/FirstRange.swift
+  Algorithms/Algorithms/Ranges.swift
+  Algorithms/Algorithms/Replace.swift
+  Algorithms/Algorithms/Split.swift
+  Algorithms/Algorithms/StartsWith.swift
+  Algorithms/Algorithms/Trim.swift
+  Algorithms/Consumers/CollectionConsumer.swift
+  Algorithms/Consumers/FixedPatternConsumer.swift
+  Algorithms/Consumers/ManyConsumer.swift
+  Algorithms/Consumers/PredicateConsumer.swift
+  Algorithms/Consumers/RegexConsumer.swift
+  Algorithms/Searchers/CollectionSearcher.swift
+  Algorithms/Searchers/ConsumerSearcher.swift
+  Algorithms/Searchers/NaivePatternSearcher.swift
+  Algorithms/Searchers/PatternOrEmpty.swift
+  Algorithms/Searchers/PredicateSearcher.swift
+  Algorithms/Searchers/TwoWaySearcher.swift
+  Algorithms/Searchers/ZSearcher.swift
+  ASTBuilder.swift
+  Capture.swift
+  CharacterClass.swift
+  Compiler.swift
+  ConsumerInterface.swift
+  Executor.swift
+  Legacy/HareVM.swift
+  Legacy/LegacyCompile.swift
+  Legacy/RECode.swift
+  Legacy/TortoiseVM.swift
+  Legacy/VirtualMachine.swift
+  RegexDSL/Builder.swift
+  RegexDSL/Concatenation.swift
+  RegexDSL/Core.swift
+  RegexDSL/DSL.swift
+  RegexDSL/DSLCapture.swift
+  RegexDSL/DynamicCaptures.swift)
+target_compile_options(_StringProcessing PRIVATE
+  -enable-library-evolution)
+target_link_libraries(_StringProcessing PUBLIC
+  _MatchingEngine)
diff --git a/Sources/_Unicode/CMakeLists.txt b/Sources/_Unicode/CMakeLists.txt
new file mode 100644
index 000000000..7fdb44628
--- /dev/null
+++ b/Sources/_Unicode/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+add_library(_Unicode
+  CaseConversion.swift
+  CharacterProps.swift
+  Comparison.swift
+  Decoding.swift
+  Encodings.swift
+  Formatting.swift
+  Graphemes.swift
+  NecessaryEvils.swift
+  Normaliation.swift
+  NumberParsing.swift
+  ScalarProps.swift
+  Transcoding.swift
+  UCD.swift
+  Validation.swift)

From 04ad64ab7e936cb7427f5f00df017ab86472a3b3 Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 17 Feb 2022 17:15:54 +0000
Subject: [PATCH 02/19] Add a couple of quoted test cases

---
 Tests/RegexTests/ParseTests.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 4872e256e..e70939d53 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -488,6 +488,10 @@ extension RegexTests {
       #"a\Q \Q \\.\Eb"#,
       concat("a", quote(#" \Q \\."#), "b"))
 
+    // These follow the PCRE behavior.
+    parseTest(#"\Q\\E"#, quote("\\"))
+    parseTest(#"\E"#, "E")
+
     parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
               syntax: .experimental)
     parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),

From 9d5529ee05858ccb49efeec1ba1a0cf50ee9046f Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Thu, 17 Feb 2022 17:15:55 +0000
Subject: [PATCH 03/19] Update \DDD parsing

Previously we followed PCRE's parsing of this syntax such that it may
either be an octal sequence or backreference depending on a list of
heuristics. However this model is complicated and not particularly
intuitive, especially as there are other engines that disambiguate using
subtly different rules.

Instead, always parse `\DDD` as a backreference, unless it begins with
`0`, in which case it is an octal sequence. This matches ICU and Java's
behavior. Once we start validating group references, we can then start
emitting an error on invalid backreferences using this syntax, and
suggest prefixing with 0 if an octal sequence is desired.
---
 .../Regex/Parse/LexicalAnalysis.swift | 45 ++++++-----------
 Tests/RegexTests/MatchTests.swift     |  5 +-
 Tests/RegexTests/ParseTests.swift     | 48 +++++++++----------
 3 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
index 727727ce1..dd785f12d 100644
--- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -279,7 +279,7 @@ extension Source {
   ///     | 'x' HexDigit{0...2}
   ///     | 'U' HexDigit{8}
   ///     | 'o{' OctalDigit{1...} '}'
-  ///     | OctalDigit{1...3}
+  ///     | '0' OctalDigit{0...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
       let str = try src.lexUntil(eating: "}").value
       return try Source.validateUnicodeScalar(str, .octal)
 
-    case let c where c.isOctalDigit:
-      // We can read *up to* 2 more octal digits per PCRE.
-      // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
-      // we should have a parser mode to switch.
-      let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
-      let str = String(c) + (nextDigits?.string ?? "")
-      return try Source.validateUnicodeScalar(str, .octal)
+    case "0":
+      // We can read *up to* 3 more octal digits.
+ // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { + return Unicode.Scalar(0) + } + return try Source.validateUnicodeScalar(digits.string, .octal) default: fatalError("Unexpected scalar start") @@ -1341,26 +1342,10 @@ extension Source { return nil } - // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE - // it is treated as a backreference if its first digit is not 0 (as that - // is always octal) and one of the following holds: - // - // - It's 0 < n < 10 (as octal would be pointless here) - // - Its first digit is 8 or 9 (as not valid octal) - // - There have been as many prior groups as the reference. - // - // Oniguruma follows the same rules except the second one. e.g \81 and - // \91 are instead treated as literal 81 and 91 respectively. - // TODO: If we want a strict Oniguruma mode, we'll need to add a check - // here. + // Backslash followed by a non-0 digit character is a backreference. if firstChar != "0", let numAndLoc = try src.lexNumber() { - let num = numAndLoc.value - let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location) - if num < 10 || firstChar == "8" || firstChar == "9" || - context.isPriorGroupRef(ref.kind) { - return .backreference(ref) - } - return nil + return .backreference(.init( + .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) } return nil } @@ -1497,10 +1482,8 @@ extension Source { } switch char { - // Hexadecimal and octal unicode scalars. This must be done after - // backreference lexing due to the ambiguity with \nnn. - case let c where c.isOctalDigit: fallthrough - case "u", "x", "U", "o": + // Hexadecimal and octal unicode scalars. + case "u", "x", "U", "o", "0": return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..2c07f4c79 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -261,7 +261,7 @@ extension RegexTests { firstMatchTest(#"\070"#, input: "1238xyz", match: "8") firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A") firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08") - firstMatchTest(#"\0707"#, input: "12387xyz", match: "87") + firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) @@ -1021,9 +1021,6 @@ extension RegexTests { firstMatchTest( #"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#, input: "aaaaaaaaabbc", match: "aaaaaaaaabb") - firstMatchTest( - #"(.)\10"#, - input: "a\u{8}b", match: "a\u{8}") firstMatchTest( #"(.)\g001"#, diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e70939d53..ce070cc38 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -325,7 +325,7 @@ extension RegexTests { parseTest(#"\070"#, scalar("\u{38}")) parseTest(#"\07A"#, concat(scalar("\u{7}"), "A")) parseTest(#"\08"#, concat(scalar("\u{0}"), "8")) - parseTest(#"\0707"#, concat(scalar("\u{38}"), "7")) + parseTest(#"\0707"#, scalar("\u{1C7}")) parseTest(#"[\0]"#, charClass(scalar_m("\u{0}"))) parseTest(#"[\01]"#, charClass(scalar_m("\u{1}"))) @@ -333,13 +333,15 @@ extension RegexTests { parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A")) parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) - parseTest(#"[\0707]"#, 
charClass(scalar_m("\u{38}"), "7")) + parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - parseTest(#"[\1]"#, charClass(scalar_m("\u{1}"))) - parseTest(#"[\123]"#, charClass(scalar_m("\u{53}"))) - parseTest(#"[\101]"#, charClass(scalar_m("\u{41}"))) - parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7")) - parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1")) + // TODO: These are treated as octal sequences by PCRE, we should warn and + // suggest user prefix with 0. + parseTest(#"[\1]"#, charClass("1")) + parseTest(#"[\123]"#, charClass("1", "2", "3")) + parseTest(#"[\101]"#, charClass("1", "0", "1")) + parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) + parseTest(#"[\181]"#, charClass("1", "8", "1")) // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) @@ -797,11 +799,9 @@ extension RegexTests { ) } - // TODO: Some of these behaviors are unintuitive, we should likely warn on - // some of them. - parseTest(#"\10"#, scalar("\u{8}")) - parseTest(#"\18"#, concat(scalar("\u{1}"), "8")) - parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7")) + parseTest(#"\10"#, backreference(.absolute(10))) + parseTest(#"\18"#, backreference(.absolute(18))) + parseTest(#"\7777"#, backreference(.absolute(7777))) parseTest(#"\91"#, backreference(.absolute(91))) parseTest( @@ -813,12 +813,13 @@ extension RegexTests { parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [scalar("\u{8}"), capture(empty())]), + + [backreference(.absolute(10)), capture(empty())]), captures: .tuple(Array(repeating: .atom(), count: 10)) ) - parseTest(#"()()\10"#, - concat(capture(empty()), capture(empty()), scalar("\u{8}")), - captures: .tuple(.atom(), .atom())) + parseTest(#"()()\10"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(10))), + captures: .tuple(.atom(), .atom()) + ) // A capture of three empty captures. let fourCaptures = capture( @@ -826,8 +827,8 @@ extension RegexTests { ) parseTest( // There are 9 capture groups in total here. 
- #"((()()())(()()()))\10"#, - concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")), + #"((()()())(()()()))\10"#, concat(capture(concat( + fourCaptures, fourCaptures)), backreference(.absolute(10))), captures: .tuple(Array(repeating: .atom(), count: 9)) ) parseTest( @@ -852,7 +853,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .tuple(Array(repeating: .atom(), count: 40)) ) - parseTest(#"\40"#, scalar(" ")) + parseTest(#"\40"#, backreference(.absolute(40))) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -862,7 +863,7 @@ extension RegexTests { parseTest(#"\7"#, backreference(.absolute(7))) - parseTest(#"\11"#, scalar("\u{9}")) + parseTest(#"\11"#, backreference(.absolute(11))) parseTest( String(repeating: "()", count: 11) + #"\11"#, concat(Array(repeating: capture(empty()), count: 11) @@ -876,12 +877,11 @@ extension RegexTests { captures: .tuple(Array(repeating: .atom(), count: 11)) ) - parseTest(#"\0113"#, concat(scalar("\u{9}"), "3")) - parseTest(#"\113"#, scalar("\u{4B}")) - parseTest(#"\377"#, scalar("\u{FF}")) + parseTest(#"\0113"#, scalar("\u{4B}")) + parseTest(#"\113"#, backreference(.absolute(113))) + parseTest(#"\377"#, backreference(.absolute(377))) parseTest(#"\81"#, backreference(.absolute(81))) - parseTest(#"\g1"#, backreference(.absolute(1))) parseTest(#"\g001"#, backreference(.absolute(1))) parseTest(#"\g52"#, backreference(.absolute(52))) From 3f24f5bcb31717d99511c203dbf6bdb7eab033de Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 24 Feb 2022 15:52:00 -0700 Subject: [PATCH 04/19] Internalize declarations Lots of ME stuff was annotated with vestigial public --- Sources/Prototypes/PEG/PEGCode.swift | 2 +- Sources/Prototypes/PEG/PEGCompile.swift | 2 +- Sources/Prototypes/PEG/PEGCore.swift | 2 +- Sources/Prototypes/PEG/PEGTranspile.swift | 3 +- Sources/Prototypes/PEG/PEGVM.swift | 3 +- Sources/Prototypes/PEG/Printing.swift | 2 +- Sources/_StringProcessing/Compiler.swift | 2 +- .../_StringProcessing/Engine/Consume.swift | 4 +- Sources/_StringProcessing/Engine/Engine.swift | 8 +- .../Engine/Instruction.swift | 3 +- .../_StringProcessing/Engine/MEBuilder.swift | 124 +++++++------- .../_StringProcessing/Engine/MECapture.swift | 2 +- .../_StringProcessing/Engine/MEProgram.swift | 12 +- .../_StringProcessing/Engine/Processor.swift | 2 +- .../_StringProcessing/Engine/Tracing.swift | 2 +- Sources/_StringProcessing/Executor.swift | 14 +- Sources/_StringProcessing/RegexDSL/DSL.swift | 4 +- .../_StringProcessing/RegexDSL/DSLTree.swift | 2 +- .../_StringProcessing/Unicode/Decoding.swift | 20 +-- .../Unicode/NecessaryEvils.swift | 2 +- .../_StringProcessing/Utility/Protocols.swift | 12 +- .../_StringProcessing/Utility/Traced.swift | 24 +-- .../Utility/TypedIndex.swift | 59 +++---- .../_StringProcessing/Utility/TypedInt.swift | 159 ++++++++---------- 24 files changed, 215 insertions(+), 254 deletions(-) diff --git a/Sources/Prototypes/PEG/PEGCode.swift b/Sources/Prototypes/PEG/PEGCode.swift index c33f5759c..b12c5bab6 100644 --- a/Sources/Prototypes/PEG/PEGCode.swift +++ b/Sources/Prototypes/PEG/PEGCode.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { struct Code { diff --git a/Sources/Prototypes/PEG/PEGCompile.swift b/Sources/Prototypes/PEG/PEGCompile.swift index 0592cf6a9..0e1b89233 100644 --- 
a/Sources/Prototypes/PEG/PEGCompile.swift +++ b/Sources/Prototypes/PEG/PEGCompile.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { typealias InIndex = Input.Index diff --git a/Sources/Prototypes/PEG/PEGCore.swift b/Sources/Prototypes/PEG/PEGCore.swift index b831cbd0f..5c66dc25a 100644 --- a/Sources/Prototypes/PEG/PEGCore.swift +++ b/Sources/Prototypes/PEG/PEGCore.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing let emitComments = true struct PEGCore< diff --git a/Sources/Prototypes/PEG/PEGTranspile.swift b/Sources/Prototypes/PEG/PEGTranspile.swift index df75cea63..84e220d52 100644 --- a/Sources/Prototypes/PEG/PEGTranspile.swift +++ b/Sources/Prototypes/PEG/PEGTranspile.swift @@ -9,8 +9,7 @@ // //===----------------------------------------------------------------------===// -import _MatchingEngine -import _StringProcessing +@testable import _StringProcessing extension PEG.VM where Input == String { typealias MEProg = MEProgram diff --git a/Sources/Prototypes/PEG/PEGVM.swift b/Sources/Prototypes/PEG/PEGVM.swift index a987b581d..4cf91a5c1 100644 --- a/Sources/Prototypes/PEG/PEGVM.swift +++ b/Sources/Prototypes/PEG/PEGVM.swift @@ -9,7 +9,8 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing + +@testable import _StringProcessing extension PEG { diff --git a/Sources/Prototypes/PEG/Printing.swift b/Sources/Prototypes/PEG/Printing.swift index 978250761..be60e72f5 100644 --- a/Sources/Prototypes/PEG/Printing.swift +++ b/Sources/Prototypes/PEG/Printing.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEGCore.Instruction: InstructionProtocol { var operandPC: InstructionAddress? { self.pc } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5099e187f..1d72a8d27 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -35,7 +35,7 @@ class Compiler { } } -public func _compileRegex( +func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { let ast = try parse(regex, syntax) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index 52f752539..cfb803de8 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,13 +25,13 @@ extension Engine { } extension Engine where Input == String { - public func consume( + func consume( _ input: Input ) -> (Input.Index, CaptureList)? { consume(input, in: input.startIndex ..< input.endIndex) } - public func consume( + func consume( _ input: Input, in range: Range, matchMode: MatchMode = .partialFromFront diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 6c9c2efa5..86952c8b7 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -11,7 +11,7 @@ // Currently, engine binds the type and consume binds an instance. // But, we can play around with this. 
-public struct Engine where Input.Element: Hashable { +struct Engine where Input.Element: Hashable { var program: MEProgram @@ -24,7 +24,7 @@ public struct Engine where Input.Element: Hashab set { program.enableTracing = newValue } } - public init( + init( _ program: MEProgram, enableTracing: Bool? = nil ) { @@ -36,10 +36,10 @@ public struct Engine where Input.Element: Hashab } } -public struct AsyncEngine { /* ... */ } +struct AsyncEngine { /* ... */ } extension Engine: CustomStringConvertible { - public var description: String { + var description: String { // TODO: better description return program.description } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index fcc257302..ff28ee9e2 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -299,8 +299,7 @@ extension Instruction { internal var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } -// TODO: internal after compiler moves in -public var _payloadMask: UInt64 { ~_opcodeMask } +var _payloadMask: UInt64 { ~_opcodeMask } extension Instruction { var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index d81c583a8..78171a001 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -12,7 +12,7 @@ import _MatchingEngine // For errors extension MEProgram where Input.Element: Hashable { - public struct Builder { + struct Builder { var instructions: [Instruction] = [] var elements = TypedSetVector() @@ -50,7 +50,7 @@ extension MEProgram where Input.Element: Hashable { nextCaptureRegister.rawValue } - public init() {} + init() {} } } @@ -71,7 +71,7 @@ extension MEProgram.Builder { // TODO: We want a better strategy for fixups, leaving // the operand in a different form isn't great... - public init(staticElements: S) where S.Element == Input.Element { + init(staticElements: S) where S.Element == Input.Element { staticElements.forEach { elements.store($0) } } @@ -79,21 +79,21 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } - public mutating func buildNop(_ r: StringRegister? = nil) { + mutating func buildNop(_ r: StringRegister? 
= nil) { instructions.append(.init(.nop, .init(optionalString: r))) } - public mutating func buildNop(_ s: String) { + mutating func buildNop(_ s: String) { buildNop(strings.store(s)) } - public mutating func buildDecrement( + mutating func buildDecrement( _ i: IntRegister, nowZero: BoolRegister ) { instructions.append(.init( .decrement, .init(bool: nowZero, int: i))) } - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { instructions.append(.init( @@ -101,25 +101,25 @@ extension MEProgram.Builder { } // TODO: generic - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: Int, into: IntRegister ) { let uint = UInt64(asserting: value) buildMoveImmediate(uint, into: into) } - public mutating func buildMoveCurrentPosition( + mutating func buildMoveCurrentPosition( into: PositionRegister ) { instructions.append(.init( .movePosition, .init(position: into))) } - public mutating func buildBranch(to t: AddressToken) { + mutating func buildBranch(to t: AddressToken) { instructions.append(.init(.branch)) fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( _ condition: BoolRegister, to t: AddressToken ) { instructions.append( @@ -127,7 +127,7 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( to t: AddressToken, ifZeroElseDecrement i: IntRegister ) { instructions.append( @@ -135,56 +135,56 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildSave(_ t: AddressToken) { + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) } - public mutating func buildSaveAddress(_ t: AddressToken) { + mutating func buildSaveAddress(_ t: AddressToken) { instructions.append(.init(.saveAddress)) fixup(to: t) } - public mutating func buildSplit( + mutating func buildSplit( to: AddressToken, saving: AddressToken ) { instructions.append(.init(.splitSaving)) fixup(to: (to, saving)) } - public mutating func buildClear() { + mutating func buildClear() { instructions.append(.init(.clear)) } - public mutating func buildRestore() { + mutating func buildRestore() { instructions.append(.init(.restore)) } - public mutating func buildFail() { + mutating func buildFail() { instructions.append(.init(.fail)) } - public mutating func buildCall(_ t: AddressToken) { + mutating func buildCall(_ t: AddressToken) { instructions.append(.init(.call)) fixup(to: t) } - public mutating func buildRet() { + mutating func buildRet() { instructions.append(.init(.ret)) } - public mutating func buildAbort(_ s: StringRegister? = nil) { + mutating func buildAbort(_ s: StringRegister? 
= nil) { instructions.append(.init( .abort, .init(optionalString: s))) } - public mutating func buildAbort(_ s: String) { + mutating func buildAbort(_ s: String) { buildAbort(strings.store(s)) } - public mutating func buildAdvance(_ n: Distance) { + mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - public mutating func buildMatch(_ e: Input.Element) { + mutating func buildMatch(_ e: Input.Element) { instructions.append(.init( .match, .init(element: elements.store(e)))) } - public mutating func buildMatchSequence( + mutating func buildMatchSequence( _ s: S ) where S.Element == Input.Element { instructions.append(.init( @@ -192,7 +192,7 @@ extension MEProgram.Builder { .init(sequence: sequences.store(.init(s))))) } - public mutating func buildMatchSlice( + mutating func buildMatchSlice( lower: PositionRegister, upper: PositionRegister ) { instructions.append(.init( @@ -200,50 +200,50 @@ extension MEProgram.Builder { .init(pos: lower, pos2: upper))) } - public mutating func buildConsume( + mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction ) { instructions.append(.init( .consumeBy, .init(consumer: makeConsumeFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( by p: @escaping MEProgram.AssertionFunction ) { instructions.append(.init( .assertBy, .init(assertion: makeAssertionFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( _ e: Input.Element, into cond: BoolRegister ) { instructions.append(.init(.assertion, .init( element: elements.store(e), bool: cond))) } - public mutating func buildAccept() { + mutating func buildAccept() { instructions.append(.init(.accept)) } - public mutating func buildPrint(_ s: StringRegister) { + mutating func buildPrint(_ s: StringRegister) { instructions.append(.init(.print, .init(string: s))) } - public mutating func buildBeginCapture( + mutating func buildBeginCapture( _ cap: CaptureRegister ) { instructions.append( .init(.beginCapture, .init(capture: cap))) } - public mutating func buildEndCapture( + mutating func buildEndCapture( _ cap: CaptureRegister ) { instructions.append( .init(.endCapture, .init(capture: cap))) } - public mutating func buildTransformCapture( + mutating func buildTransformCapture( _ cap: CaptureRegister, _ trans: TransformRegister ) { instructions.append(.init( @@ -251,7 +251,7 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } - public mutating func buildMatcher( + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { instructions.append(.init( @@ -259,7 +259,7 @@ extension MEProgram.Builder { .init(matcher: fun, value: reg))) } - public mutating func buildMove( + mutating func buildMove( _ value: ValueRegister, into capture: CaptureRegister ) { instructions.append(.init( @@ -267,21 +267,21 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } - public mutating func buildBackreference( + mutating func buildBackreference( _ cap: CaptureRegister ) { instructions.append( .init(.backreference, .init(capture: cap))) } - public mutating func buildUnresolvedReference(id: ReferenceID) { + mutating func buildUnresolvedReference(id: ReferenceID) { buildBackreference(.init(0)) unresolvedReferences[id, default: []].append(lastInstructionAddress) } // TODO: Mutating because of fail address fixup, drop when // that's removed - public mutating func assemble() throws -> MEProgram { + mutating func assemble() throws -> MEProgram { try 
resolveReferences() // TODO: This will add a fail instruction at the end every @@ -356,22 +356,22 @@ extension MEProgram.Builder { referencedCaptureOffsets: referencedCaptureOffsets) } - public mutating func reset() { self = Self() } + mutating func reset() { self = Self() } } // Address-agnostic interfaces for label-like support extension MEProgram.Builder { - public enum _AddressToken {} - public typealias AddressToken = TypedInt<_AddressToken> + enum _AddressToken {} + typealias AddressToken = TypedInt<_AddressToken> - public mutating func makeAddress() -> AddressToken { + mutating func makeAddress() -> AddressToken { defer { addressTokens.append(nil) } return AddressToken(addressTokens.count) } // Resolves the address token to the most recently added // instruction, updating prior and future address references - public mutating func resolve(_ t: AddressToken) { + mutating func resolve(_ t: AddressToken) { assert(!instructions.isEmpty) addressTokens[t.rawValue] = @@ -380,7 +380,7 @@ extension MEProgram.Builder { // Resolves the address token to the next instruction (one past the most // recently added one), updating prior and future address references. - public mutating func label(_ t: AddressToken) { + mutating func label(_ t: AddressToken) { addressTokens[t.rawValue] = InstructionAddress(instructions.count) } @@ -388,7 +388,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided token, ensuring it is fixed up during // assembly - public mutating func fixup(to t: AddressToken) { + mutating func fixup(to t: AddressToken) { assert(!instructions.isEmpty) addressFixups.append( (InstructionAddress(instructions.endIndex-1), .init(t))) @@ -397,7 +397,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided tokens, ensuring it is fixed up during // assembly - public mutating func fixup( + mutating func fixup( to ts: (AddressToken, AddressToken) ) { assert(!instructions.isEmpty) @@ -412,7 +412,7 @@ extension MEProgram.Builder { // // This is useful for possessive quantification that needs some initial save // point to "ratchet" upon a successful match. - public mutating func pushEmptySavePoint() { + mutating func pushEmptySavePoint() { if failAddressToken == nil { failAddressToken = makeAddress() } @@ -438,7 +438,7 @@ fileprivate extension MEProgram.Builder { // Register helpers extension MEProgram.Builder { - public mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { + mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { defer { nextCaptureRegister.rawValue += 1 } // Register the capture for later lookup via symbolic references. 
if let id = id { @@ -449,25 +449,25 @@ extension MEProgram.Builder { return nextCaptureRegister } - public mutating func makeBoolRegister() -> BoolRegister { + mutating func makeBoolRegister() -> BoolRegister { defer { nextBoolRegister.rawValue += 1 } return nextBoolRegister } - public mutating func makeIntRegister() -> IntRegister { + mutating func makeIntRegister() -> IntRegister { defer { nextIntRegister.rawValue += 1 } return nextIntRegister } - public mutating func makePositionRegister() -> PositionRegister { + mutating func makePositionRegister() -> PositionRegister { defer { nextPositionRegister.rawValue += 1 } return nextPositionRegister } - public mutating func makeValueRegister() -> ValueRegister { + mutating func makeValueRegister() -> ValueRegister { defer { nextValueRegister.rawValue += 1 } return nextValueRegister } // Allocate and initialize a register - public mutating func makeIntRegister( + mutating func makeIntRegister( initialValue: Int ) -> IntRegister { let r = makeIntRegister() @@ -476,7 +476,7 @@ extension MEProgram.Builder { } // Allocate and initialize a register - public mutating func makePositionRegister( + mutating func makePositionRegister( initializingWithCurrentPosition: () ) -> PositionRegister { let r = makePositionRegister() @@ -485,17 +485,17 @@ extension MEProgram.Builder { } // 'kill' or release allocated registers - public mutating func kill(_ r: IntRegister) { + mutating func kill(_ r: IntRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: BoolRegister) { + mutating func kill(_ r: BoolRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: PositionRegister) { + mutating func kill(_ r: PositionRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") @@ -504,25 +504,25 @@ extension MEProgram.Builder { // TODO: A register-mapping helper struct, which could release // registers without monotonicity required - public mutating func makeConsumeFunction( + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - public mutating func makeAssertionFunction( + mutating func makeAssertionFunction( _ f: @escaping MEProgram.AssertionFunction ) -> AssertionFunctionRegister { defer { assertionFunctions.append(f) } return AssertionFunctionRegister(assertionFunctions.count) } - public mutating func makeTransformFunction( + mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { defer { transformFunctions.append(f) } return TransformRegister(transformFunctions.count) } - public mutating func makeMatcherFunction( + mutating func makeMatcherFunction( _ f: @escaping MEProgram.MatcherFunction ) -> MatcherRegister { defer { matcherFunctions.append(f) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 88f912ecb..bac632e9e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } } -public struct CaptureList { +struct CaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] diff --git 
a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d616657e8..1e58ddf54 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -11,13 +11,13 @@ import _MatchingEngine -public struct MEProgram where Input.Element: Equatable { - public typealias ConsumeFunction = (Input, Range) -> Input.Index? - public typealias AssertionFunction = +struct MEProgram where Input.Element: Equatable { + typealias ConsumeFunction = (Input, Range) -> Input.Index? + typealias AssertionFunction = (Input, Input.Index, Range) -> Bool - public typealias TransformFunction = + typealias TransformFunction = (Input, Range) -> Any? - public typealias MatcherFunction = + typealias MatcherFunction = (Input, Input.Index, Range) -> (Input.Index, Any)? var instructions: InstructionList @@ -39,7 +39,7 @@ public struct MEProgram where Input.Element: Equatable { } extension MEProgram: CustomStringConvertible { - public var description: String { + var description: String { var result = """ Elements: \(staticElements) Strings: \(staticStrings) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 10c3eb781..343b02c92 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -public enum MatchMode { +enum MatchMode { case wholeString case partialFromFront } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 7db740f52..24d00d3d7 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -15,7 +15,7 @@ extension Processor: TracedProcessor { var currentPC: InstructionAddress { controller.pc } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e066a4369..5c098c5c5 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,8 +11,7 @@ import _MatchingEngine - // FIXME: Public for prototype -public struct Executor { +struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine @@ -20,9 +19,8 @@ public struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - // FIXME: Public for prototype - public struct Result { - public var range: Range + struct Result { + var range: Range var captures: [StructuredCapture] var referencedCaptureOffsets: [ReferenceID: Int] @@ -44,7 +42,7 @@ public struct Executor { } } - public func execute( + func execute( input: String, in range: Range, mode: MatchMode = .wholeString @@ -65,7 +63,7 @@ public struct Executor { fatalError(String(describing: error)) } } - public func execute( + func execute( input: Substring, mode: MatchMode = .wholeString ) -> Result? 
{ @@ -75,7 +73,7 @@ public struct Executor { mode: mode) } - public func executeFlat( + func executeFlat( input: String, in range: Range, mode: MatchMode = .wholeString diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 35a4ccb5e..17f006231 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -187,9 +187,7 @@ public func choiceOf( // MARK: - Backreference - -// FIXME: Public for prototypes. -public struct ReferenceID: Hashable, Equatable { +struct ReferenceID: Hashable, Equatable { private static var counter: Int = 0 var base: Int diff --git a/Sources/_StringProcessing/RegexDSL/DSLTree.swift b/Sources/_StringProcessing/RegexDSL/DSLTree.swift index a44220925..43f8aa62f 100644 --- a/Sources/_StringProcessing/RegexDSL/DSLTree.swift +++ b/Sources/_StringProcessing/RegexDSL/DSLTree.swift @@ -249,7 +249,7 @@ extension DSLTree { } } extension DSLTree.Node { - public func _captureStructure( + func _captureStructure( _ constructor: inout CaptureStructure.Constructor ) -> CaptureStructure { switch self { diff --git a/Sources/_StringProcessing/Unicode/Decoding.swift b/Sources/_StringProcessing/Unicode/Decoding.swift index 49eb1f794..68c14f6c1 100644 --- a/Sources/_StringProcessing/Unicode/Decoding.swift +++ b/Sources/_StringProcessing/Unicode/Decoding.swift @@ -33,13 +33,13 @@ enum UnsafeAssumingValidUTF8 { @inlinable @inline(__always) - public static func decode(_ x: UInt8) -> Unicode.Scalar { + static func decode(_ x: UInt8) -> Unicode.Scalar { _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 2) @@ -50,7 +50,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 3) @@ -63,7 +63,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 4) @@ -80,7 +80,7 @@ enum UnsafeAssumingValidUTF8 { // Also, assuming we can load from those bounds... 
@inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, startingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let cu0 = utf8[_unchecked: i] @@ -103,7 +103,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let len = scalarLength(utf8, endingAt: i) @@ -113,7 +113,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength(_ x: UInt8) -> Int { + static func scalarLength(_ x: UInt8) -> Int { _internalInvariant(!UTF8.isContinuation(x)) if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen @@ -121,7 +121,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength( + static func scalarLength( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> Int { var len = 1 @@ -133,12 +133,12 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func continuationPayload(_ x: UInt8) -> UInt32 { + static func continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } @inlinable - public static func scalarAlign( + static func scalarAlign( _ utf8: UnsafeByteBuffer, _ idx: Int ) -> Int { guard _fastPath(idx != utf8.count) else { return idx } diff --git a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift index ef846c14e..a9ae24429 100644 --- a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift +++ b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift @@ -40,7 +40,7 @@ extension Optional { } // Don't use UnsafeRawBufferPointer for anything important -public struct UnsafeByteBuffer { +struct UnsafeByteBuffer { var pointer: UnsafeRawPointer var count: Int diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 9c196c18c..7542a17dd 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -13,11 +13,11 @@ // These currently only drive tracing/formatting, but could drive // more -public protocol InstructionProtocol { +protocol InstructionProtocol { var operandPC: InstructionAddress? 
{ get } } -public protocol ProcessorProtocol { +protocol ProcessorProtocol { associatedtype Input: Collection associatedtype Instruction: InstructionProtocol associatedtype SavePoint = () @@ -45,12 +45,12 @@ public protocol ProcessorProtocol { } extension ProcessorProtocol { - public func fetch() -> Instruction { + func fetch() -> Instruction { instructions[currentPC] } - public var callStack: Array { [] } -// public var savePoints: Array { [] } - public var registers: Array { [] } + var callStack: Array { [] } +// var savePoints: Array { [] } + var registers: Array { [] } } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index c270aba23..5ae7cd245 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -12,11 +12,11 @@ // TODO: Place shared formatting and trace infrastructure here -public protocol Traced { +protocol Traced { var isTracingEnabled: Bool { get set } } -public protocol TracedProcessor: ProcessorProtocol, Traced { +protocol TracedProcessor: ProcessorProtocol, Traced { // Empty defaulted func formatCallStack() -> String // empty default func formatSavePoints() -> String // empty default @@ -36,7 +36,7 @@ func lineNumber(_ pc: InstructionAddress) -> String { } extension TracedProcessor where Registers: Collection{ - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if !registers.isEmpty { return "\(registers)\n" @@ -48,19 +48,19 @@ extension TracedProcessor where Registers: Collection{ extension TracedProcessor { func printTrace() { print(formatTrace()) } - public func trace() { + func trace() { if isTracingEnabled { printTrace() } } // Helpers for the conformers - public func formatCallStack() -> String { + func formatCallStack() -> String { if !callStack.isEmpty { return "call stack: \(callStack)\n" } return "" } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { @@ -71,7 +71,7 @@ extension TracedProcessor { return "" } - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if Registers.self == E.self { return "" @@ -79,7 +79,7 @@ extension TracedProcessor { return "\(registers)\n" } - public func formatInput() -> String { + func formatInput() -> String { // String override for printing sub-character information. 
if !input.indices.contains(currentPosition) { // Format unicode scalars as: @@ -115,7 +115,7 @@ extension TracedProcessor { """ } - public func formatInstructionWindow( + func formatInstructionWindow( windowSize: Int = 12 ) -> String { if isAcceptState { return "ACCEPT" } @@ -139,7 +139,7 @@ extension TracedProcessor { return result } - public func formatTrace() -> String { + func formatTrace() -> String { var result = "\n--- cycle \(cycleCount) ---\n" result += formatCallStack() result += formatSavePoints() @@ -150,7 +150,7 @@ extension TracedProcessor { return result } - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, depth: Int = 5 ) -> String { @@ -160,7 +160,7 @@ extension TracedProcessor { } extension Collection where Element: InstructionProtocol, Index == InstructionAddress { - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, atCurrent: Bool, depth: Int diff --git a/Sources/_StringProcessing/Utility/TypedIndex.swift b/Sources/_StringProcessing/Utility/TypedIndex.swift index 3bddcadfd..adde06a3e 100644 --- a/Sources/_StringProcessing/Utility/TypedIndex.swift +++ b/Sources/_StringProcessing/Utility/TypedIndex.swift @@ -12,55 +12,43 @@ /// Forwarding wrapper around Int-index collections that provide a /// strongly (phantom) typed index. -@frozen -public struct TypedIndex: RawRepresentable where C.Index == Int { - @_alwaysEmitIntoClient - public var rawValue: C +struct TypedIndex: RawRepresentable where C.Index == Int { + var rawValue: C - @_alwaysEmitIntoClient - public init(rawValue: C) { self.rawValue = rawValue } + init(rawValue: C) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: C) { self.init(rawValue: rawValue) } + init(_ rawValue: C) { self.init(rawValue: rawValue) } } extension TypedIndex: Collection { - public typealias Index = TypedInt<šŸ‘»> - public typealias Element = C.Element + typealias Index = TypedInt<šŸ‘»> + typealias Element = C.Element - @_alwaysEmitIntoClient - public var startIndex: Index { Index(rawValue.startIndex) } + var startIndex: Index { Index(rawValue.startIndex) } - @_alwaysEmitIntoClient - public var endIndex: Index { Index(rawValue.endIndex )} + var endIndex: Index { Index(rawValue.endIndex )} - @_alwaysEmitIntoClient - public var count: Int { rawValue.count } + var count: Int { rawValue.count } - @_alwaysEmitIntoClient - public func index(after: Index) -> Index { + func index(after: Index) -> Index { Index(rawValue.index(after: after.rawValue)) } - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { rawValue[position.rawValue] } - @_alwaysEmitIntoClient - public func distance( + func distance( from start: Index, to end: Index ) -> Int { rawValue.distance(from: start.rawValue, to: end.rawValue) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int) -> Index { + func index(_ i: Index, offsetBy distance: Int) -> Index { Index(rawValue.index(i.rawValue, offsetBy: distance)) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? { + func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? 
{ guard let idx = rawValue.index(i.rawValue, offsetBy: distance, limitedBy: limit.rawValue) else { return nil } @@ -71,8 +59,7 @@ extension TypedIndex: Collection { extension TypedIndex: RandomAccessCollection where C: RandomAccessCollection { } extension TypedIndex: MutableCollection where C: MutableCollection { - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { _read { yield rawValue[position.rawValue] } @@ -82,8 +69,7 @@ extension TypedIndex: MutableCollection where C: MutableCollection { } } extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { - @_alwaysEmitIntoClient - public func index(before: Index) -> Index { + func index(before: Index) -> Index { Index(rawValue.index(before: before.rawValue)) } } @@ -92,11 +78,9 @@ extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { // failure in the Swift repo. #if false extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init() { rawValue = C() } + init() { rawValue = C() } - @_alwaysEmitIntoClient - public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { + mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { let rawRange = subrange.lowerBound.rawValue ..< subrange.upperBound.rawValue rawValue.replaceSubrange(rawRange, with: newElements) } @@ -107,14 +91,13 @@ extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollec // Workaround for #73 extension TypedIndex where C: RangeReplaceableCollection { - public mutating func append(_ newElement: Element) { + mutating func append(_ newElement: Element) { rawValue.append(newElement) } } extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiteral & RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init(arrayLiteral elements: Element...) { + init(arrayLiteral elements: Element...) { // TODO: any way around the RRC copying init? self.init(C(elements)) } @@ -122,5 +105,5 @@ extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiter // MARK: - Strongly typed wrappers -public typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> +typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index caff7f64e..249717b68 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -11,86 +11,71 @@ // Just a phantom-typed Int wrapper. 
-@frozen -public struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { - @_alwaysEmitIntoClient - public var rawValue: Int +struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { + var rawValue: Int - @_alwaysEmitIntoClient - public init(rawValue: Int) { + init(rawValue: Int) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: Int) { + init(_ rawValue: Int) { self.init(rawValue: rawValue) } - @_alwaysEmitIntoClient - public init(_ uint: UInt64) { + init(_ uint: UInt64) { assert(uint.leadingZeroBitCount > 0) self.init(Int(asserting: uint)) } } extension TypedInt: Comparable { - @_alwaysEmitIntoClient - public static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { + static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { return lhs.rawValue < rhs.rawValue } } extension TypedInt: CustomStringConvertible { - @_alwaysEmitIntoClient - public var description: String { return "#\(rawValue)" } + var description: String { return "#\(rawValue)" } } extension TypedInt: ExpressibleByIntegerLiteral { - @_alwaysEmitIntoClient - public init(integerLiteral value: Int) { + init(integerLiteral value: Int) { self.init(rawValue: value) } } -public protocol TypedIntProtocol { +protocol TypedIntProtocol { associatedtype šŸ‘» } extension TypedInt: TypedIntProtocol { } // A placeholder type for when we must supply a type. // When the phantom type appears, it says boo -public enum _Boo {} +enum _Boo {} // Easier for clients to just have their own typealias -public typealias TypedInt_ = TypedInt +typealias TypedInt_ = TypedInt // TODO: BinaryInteger, etc. extension TypedInt { - @_alwaysEmitIntoClient - public static func +(lhs: TypedInt, rhs: Int) -> TypedInt { + static func +(lhs: TypedInt, rhs: Int) -> TypedInt { return TypedInt(lhs.rawValue + rhs) } - @_alwaysEmitIntoClient - public var bits: UInt64 { + var bits: UInt64 { UInt64(asserting: self.rawValue) } } -@frozen -public struct TypedSetVector { - public typealias Idx = TypedInt<šŸ‘»> +struct TypedSetVector { + typealias Idx = TypedInt<šŸ‘»> // TODO: Replace with real set vector - @_alwaysEmitIntoClient - public var lookup: Dictionary = [:] + var lookup: Dictionary = [:] - @_alwaysEmitIntoClient - public var stored: Array = [] + var stored: Array = [] - @_alwaysEmitIntoClient - public func load(_ idx: Idx) -> Element { stored[idx.rawValue] } + func load(_ idx: Idx) -> Element { stored[idx.rawValue] } - @_alwaysEmitIntoClient @discardableResult - public mutating func store(_ e: Element) -> Idx { + mutating func store(_ e: Element) -> Idx { if let reg = lookup[e] { return reg } let reg = Idx(stored.count) stored.append(e) @@ -98,34 +83,32 @@ public struct TypedSetVector { return reg } - @_alwaysEmitIntoClient - public var count: Int { stored.count } + var count: Int { stored.count } - @_alwaysEmitIntoClient - public init() {} + init() {} } // MARK: - Strongly typed int wrappers /// A distance in the Input, e.g. `n` in consume(n) -public typealias Distance = TypedInt<_Distance> -public enum _Distance {} +typealias Distance = TypedInt<_Distance> +enum _Distance {} /// An instruction address, i.e. the index into our instruction list -public typealias InstructionAddress = TypedInt<_InstructionAddress> -public enum _InstructionAddress {} +typealias InstructionAddress = TypedInt<_InstructionAddress> +enum _InstructionAddress {} /// A position in the call stack, i.e. 
for save point restores -public typealias CallStackAddress = TypedInt<_CallStackAddress> -public enum _CallStackAddress {} +typealias CallStackAddress = TypedInt<_CallStackAddress> +enum _CallStackAddress {} /// A position in a position stack, i.e. for NFA simulation -public typealias PositionStackAddress = TypedInt<_PositionStackAddress> -public enum _PositionStackAddress {} +typealias PositionStackAddress = TypedInt<_PositionStackAddress> +enum _PositionStackAddress {} /// A position in the save point stack, i.e. for backtracking -public typealias SavePointStackAddress = TypedInt<_SavePointAddress> -public enum _SavePointAddress {} +typealias SavePointStackAddress = TypedInt<_SavePointAddress> +enum _SavePointAddress {} // MARK: - Registers @@ -135,85 +118,85 @@ public enum _SavePointAddress {} /// NOTE: Currently just used for static data, but e.g. could be /// used to save the most recently seen element satisfying some /// property -public typealias ElementRegister = TypedInt<_ElementRegister> -public enum _ElementRegister {} +typealias ElementRegister = TypedInt<_ElementRegister> +enum _ElementRegister {} -public typealias SequenceRegister = TypedInt<_SequenceRegister> -public enum _SequenceRegister {} +typealias SequenceRegister = TypedInt<_SequenceRegister> +enum _SequenceRegister {} /// The register number for a stored boolean value /// /// E.g. used for conditional branches -public typealias BoolRegister = TypedInt<_BoolRegister> -public enum _BoolRegister {} +typealias BoolRegister = TypedInt<_BoolRegister> +enum _BoolRegister {} /// The register number for a string (e.g. comment, failure reason) -public typealias StringRegister = TypedInt<_StringRegister> -public enum _StringRegister {} +typealias StringRegister = TypedInt<_StringRegister> +enum _StringRegister {} /// Used for consume functions, e.g. character classes -public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> -public enum _ConsumeFunctionRegister {} +typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> +enum _ConsumeFunctionRegister {} /// Used for assertion functions, e.g. 
anchors etc -public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -public enum _AssertionFunctionRegister {} +typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> +enum _AssertionFunctionRegister {} /// Used for capture transforms, etc -public typealias TransformRegister = TypedInt<_TransformRegister> -public enum _TransformRegister {} +typealias TransformRegister = TypedInt<_TransformRegister> +enum _TransformRegister {} /// Used for value-producing matchers -public typealias MatcherRegister = TypedInt<_MatcherRegister> -public enum _MatcherRegister {} +typealias MatcherRegister = TypedInt<_MatcherRegister> +enum _MatcherRegister {} /// UNIMPLEMENTED -public typealias IntRegister = TypedInt<_IntRegister> -public enum _IntRegister {} +typealias IntRegister = TypedInt<_IntRegister> +enum _IntRegister {} /// UNIMPLEMENTED -public typealias FloatRegister = TypedInt<_FloatRegister> -public enum _FloatRegister {} +typealias FloatRegister = TypedInt<_FloatRegister> +enum _FloatRegister {} /// UNIMPLEMENTED /// /// NOTE: This, along with a position stack, might /// serve NFA-simulation style execution models -public typealias PositionRegister = TypedInt<_PositionRegister> -public enum _PositionRegister {} +typealias PositionRegister = TypedInt<_PositionRegister> +enum _PositionRegister {} -public typealias ValueRegister = TypedInt<_ValueRegister> -public enum _ValueRegister {} +typealias ValueRegister = TypedInt<_ValueRegister> +enum _ValueRegister {} -public typealias CaptureRegister = TypedInt<_CaptureRegister> -public enum _CaptureRegister {} +typealias CaptureRegister = TypedInt<_CaptureRegister> +enum _CaptureRegister {} /// UNIMPLEMENTED -public typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> -public enum _InstructionAddressRegister {} +typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> +enum _InstructionAddressRegister {} /// UNIMPLEMENTED -public typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> -public enum _CallStackAddressRegister {} +typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> +enum _CallStackAddressRegister {} /// UNIMPLEMENTED -public typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> -public enum _PositionStackAddressRegister {} +typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> +enum _PositionStackAddressRegister {} /// UNIMPLEMENTED -public typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> -public enum _SavePointAddressRegister {} +typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> +enum _SavePointAddressRegister {} /// A numbered label -public typealias LabelId = TypedInt<_LabelId> -public enum _LabelId {} +typealias LabelId = TypedInt<_LabelId> +enum _LabelId {} /// A numbered function -public typealias FunctionId = TypedInt<_FunctionId> -public enum _FunctionId {} +typealias FunctionId = TypedInt<_FunctionId> +enum _FunctionId {} /// A numbered capture -public typealias CaptureId = TypedInt<_CaptureId> -public enum _CaptureId {} +typealias CaptureId = TypedInt<_CaptureId> +enum _CaptureId {} From 75d49319abf7120c34eab3d443c102e7d957e91a Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 22 Feb 2022 07:41:03 -0700 Subject: [PATCH 05/19] Initial custom-component test infrastructure --- Tests/RegexTests/CustomTests.swift | 82 ++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 28 deletions(-) diff 
--git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 73a50b108..692ad6c37 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -37,39 +37,65 @@ private struct Asciibbler: Nibbler { } } -extension RegexTests { - - // TODO: Refactor below into more exhaustive, declarative - // tests. - func testMatchingConsumers() { - - let regex = Regex { - Numbler() - Asciibbler() - } +enum MatchCall { + case match + case firstMatch +} - guard let result = "4t".match(regex) else { - XCTFail() - return +func customTest( + _ regex: Regex, + _ tests: (input: String, call: MatchCall, match: Match?)... +) { + for (input, call, match) in tests { + let result: Match? + switch call { + case .match: + result = input.match(regex)?.match + case .firstMatch: + result = input.firstMatch(of: regex)?.result } - XCTAssert(result.match == "4t") + XCTAssertEqual(result, match) + } +} - XCTAssertNil("4".match(regex)) - XCTAssertNil("t".match(regex)) - XCTAssertNil("t4".match(regex)) +extension RegexTests { - let regex2 = Regex { - oneOrMore { + // TODO: Refactor below into more exhaustive, declarative + // tests. + func testCustomRegexComponents() { + customTest( + Regex { Numbler() - } - } - - guard let res2 = "ab123c".firstMatch(of: regex2) else { - XCTFail() - return - } - - XCTAssertEqual(res2.match, "123") + Asciibbler() + }, + ("4t", .match, "4t"), + ("4", .match, nil), + ("t", .match, nil), + ("t x1y z", .firstMatch, "1y"), + ("t4", .match, nil)) + + customTest( + Regex { + oneOrMore { Numbler() } + }, + ("ab123c", .firstMatch, "123"), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, "55")) + + // FIXME: Requires we return a value instead of a range +// customTest( +// Regex { +// Numbler() +// }, +// ("ab123c", .firstMatch, 1), +// ("abc", .firstMatch, nil), +// ("55z", .match, nil), +// ("55z", .firstMatch, 5)) + + // TODO: Convert below tests to better infra. Right now + // it's hard because `Match` is constrained to be + // `Equatable` which tuples cannot be. let regex3 = Regex { capture { From 2eb7e4e1563a0dbedc8915dc68e3446dd051d716 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sat, 26 Feb 2022 14:20:51 -0700 Subject: [PATCH 06/19] Cleanup/simplify Executor codepaths (#190) --- Sources/_StringProcessing/Capture.swift | 10 + .../_StringProcessing/Engine/Consume.swift | 8 +- Sources/_StringProcessing/Executor.swift | 76 ++--- .../_StringProcessing/RegexDSL/Match.swift | 13 +- .../MatchingEngineTests.swift | 288 +----------------- Tests/RegexTests/CaptureTests.swift | 12 +- Tests/RegexTests/MatchTests.swift | 10 +- 7 files changed, 49 insertions(+), 368 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 915c4c5d7..ecfc558fe 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -71,6 +71,11 @@ extension StructuredCapture { value: storedCapture?.value, optionalCount: optionalCount) } + + func slice(from input: String) -> Substring? { + guard let r = storedCapture?.range else { return nil } + return input[r] + } } extension Sequence where Element == StructuredCapture { @@ -86,5 +91,10 @@ extension Sequence where Element == StructuredCapture { }) return TypeConstruction.tuple(of: caps) } + + func slices(from input: String) -> [Substring?] 
{ + self.map { $0.slice(from: input) } + } } + diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index cfb803de8..a4a3bf26c 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -25,16 +25,10 @@ extension Engine { } extension Engine where Input == String { - func consume( - _ input: Input - ) -> (Input.Index, CaptureList)? { - consume(input, in: input.startIndex ..< input.endIndex) - } - func consume( _ input: Input, in range: Range, - matchMode: MatchMode = .partialFromFront + matchMode: MatchMode ) -> (Input.Index, CaptureList)? { if enableTracing { print("Consume: \(input)") diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 5c098c5c5..9de2b0b3d 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -19,69 +19,33 @@ struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - struct Result { - var range: Range - var captures: [StructuredCapture] - var referencedCaptureOffsets: [ReferenceID: Int] - - var destructure: ( - matched: Range, - captures: [StructuredCapture], - referencedCaptureOffsets: [ReferenceID: Int] - ) { - (range, captures, referencedCaptureOffsets) - } - - init( - _ matched: Range, _ captures: [StructuredCapture], - _ referencedCaptureOffsets: [ReferenceID: Int] - ) { - self.range = matched - self.captures = captures - self.referencedCaptureOffsets = referencedCaptureOffsets - } - } - - func execute( - input: String, - in range: Range, - mode: MatchMode = .wholeString - ) -> Result? { + func match( + _ input: String, + in inputRange: Range, + _ mode: MatchMode + ) throws -> RegexMatch? { guard let (endIdx, capList) = engine.consume( - input, in: range, matchMode: mode + input, in: inputRange, matchMode: mode ) else { return nil } let capStruct = engine.program.captureStructure - do { - let range = range.lowerBound.. Result? { - self.execute( - input: input.base, - in: input.startIndex.., - mode: MatchMode = .wholeString - ) -> (Range, CaptureList)? { - engine.consume( - input, in: range, matchMode: mode - ).map { endIndex, capture in - (range.lowerBound.., + _ mode: MatchMode + ) throws -> RegexMatch<(Substring, DynamicCaptures)>? { + try match(input, in: inputRange, mode) } } diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index 2dd31c379..c2593b22a 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -69,16 +69,11 @@ extension RegexProtocol { mode: MatchMode = .wholeString ) -> RegexMatch? 
{ let executor = Executor(program: regex.program.loweredProgram) - guard let (range, captures, captureOffsets) = executor.execute( - input: input, in: inputRange, mode: mode - )?.destructure else { - return nil + do { + return try executor.match(input, in: inputRange, mode) + } catch { + fatalError(String(describing: error)) } - return RegexMatch( - input: input, - range: range, - rawCaptures: captures, - referencedCaptureOffsets: captureOffsets) } } diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index b7c89661d..ccfe85ec7 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -13,289 +13,5 @@ import XCTest @testable import _StringProcessing -/// Hold context and run variety of ad-hoc tests -/// -/// TODO: Use these to demonstrate first-order approximation of what -/// overhead such an engine imposes -fileprivate struct Test: ExpressibleByStringLiteral { - var input: String - var aEater: String - var manyAEater: String - var eatUntilA: String - var eatThroughA: String - - // TODO: Have tests explicitly show each step of type binding, - // input binding, etc. - var enableTracing: Bool? = nil - - /* - - until first A - through first A - until / through last A - etc - - */ - - var file: String - var line: UInt - - init( - _ s: String, - enableTracing: Bool? = nil, - file: String = #file, - line: UInt = #line - ) { - self.input = s - self.aEater = s.first == "A" ? String(s.dropFirst()) : s - self.manyAEater = String(s.drop(while: { $0 == "A" })) - - if let firstIdx = s.firstIndex(of: "A") { - self.eatUntilA = String(s[firstIdx...]) - self.eatThroughA = String(eatUntilA.dropFirst()) - } else { - self.eatUntilA = s - self.eatThroughA = s - } - - self.enableTracing = enableTracing - -// self.untilFirstAEater = String( -// s[(s.firstIndex(where: { $0 == "A" }) ?? s.startIndex)...]) - - - self.file = file - self.line = line - } - init( - stringLiteral: String, - file: String = #file, - line: UInt = #line - ) { - self.init(stringLiteral, file: file, line: line) - } - init(stringLiteral: String) { - // NOTE: Can't get source location of a literal... - self.init(stringLiteral) - } - - var slicedInput: (String, Range) { - let prefix = "aAa prefix āš ļø" - let suffix = "āš ļø aAa suffix" - let outer = prefix + input + suffix - let range = outer.mapOffsets( - (lower: prefix.count, upper: -suffix.count)) - return (outer, range) - } - - func check(_ engine: Engine, expected: String) { - var engine = engine - if let t = enableTracing { - engine.enableTracing = t - } - let output: String - let outputFromSlice: String - - if let (idx, _) = engine.consume(input) { - output = String(input[idx...]) - } else { - output = input - } - - let (outerInput, range) = slicedInput - if let (idx, _) = engine.consume(outerInput, in: range) { - outputFromSlice = String(outerInput[idx..? = nil, - manyAEater: Engine? = nil, - eatUntilA: Engine? = nil, - eatThroughA: Engine? 
= nil - ) { - if let engine = aEater { - check(engine, expected: self.aEater) - } - if let engine = manyAEater { - check(engine, expected: self.manyAEater) - } - if let engine = eatUntilA { - check(engine, expected: self.eatUntilA) - } - if let engine = eatThroughA { - check(engine, expected: self.eatThroughA) - } - } -} - -var doPrint = false -func show(_ s: CustomStringConvertible) { - if doPrint { print(s) } -} - -func makeEngine( - _ constructor: (inout Program.Builder) -> () -) -> Engine { - var builder = Program.Builder() - constructor(&builder) - let program = try! builder.assemble() - let engine = Engine(program) - show(engine) - return engine -} - -// Eat an A off the front -// -// [0] match "A" -// [1] accept -// -let aEater: Engine = { - makeEngine { builder in - builder.buildMatch("A") - builder.buildAccept() - } -}() - -// Eat many "A"s off the input -// -// [0] saveAddress [3] // .accept -// [1] match "A" -// [2] goto [1] // match "A" -// [3] accept -// -// NOTE: a save would restore input position, which we -// actually don't want to do. -// -// NOTE: We should compare with a more sophisticated match -// instruction that can take at least or at most, etc. -// -let manyAEater: Engine = { - makeEngine { builder in - let accTok = builder.makeAddress() - let matchTok = builder.makeAddress() - - builder.buildSaveAddress(accTok) - builder.buildMatch("A") - builder.resolve(matchTok) - builder.buildBranch(to: matchTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat until you find an A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] condBranch #0 [x] // accept -// [2] advance(1) -// [3] goto 0 -// [4] accept -// -// NOTE: This check-consume-else-branch pattern -// could be pretty common and might be worth a dedicated -// instruction. 
-let eatUntilA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildCondBranch(reg, to: accTok) - builder.buildAdvance(1) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat through the first A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] advance(1) -// [2] condBranch #0 [x] // accept -// [3] goto 0 -// [4] accept -let eatThroughA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildAdvance(1) - builder.buildCondBranch(reg, to: accTok) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - - - -class MatchingEngineTests: XCTestCase { - - func testAEaters() { - let tests: Array = [ - Test("abc"), - Test("Abc"), - Test("AAbc"), - Test(""), - Test("A"), - Test("b"), - Test("bbbA"), - Test("bbAbA"), - ] - - for test in tests { - test.check(aEater: aEater) - test.check(manyAEater: manyAEater) - test.check(eatUntilA: eatUntilA) - test.check(eatThroughA: eatThroughA) - } - } - - func testThreeLetterRepeat() { - // Check for a repeated 3-letter sequence, such as in - // `(...)\1` - // - // [0] movePosition(into: %low) - // [1] advance(3) - // [2] movePosition(into: %high) - // [3] matchSlice(%low, %high) - // [4] accept - let threeLetterRepeat: Engine = { - makeEngine { builder in - let low = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildAdvance(3) - let high = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildMatchSlice(lower: low, upper: high) - builder.buildAccept() - } - }() - - let tests: Array<(String, Bool)> = [ - ("abcabc", true), - ("abcabc_____", true), - ("dddddd_____", true), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™€ļøc", true), - ("abccba", false), - ("abcabb", false), - ("abcbac", false), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™‚ļøc", false), - ] - - for (test, expect) in tests { - let match = threeLetterRepeat.consume(test) != nil - XCTAssertEqual(expect, match) - } - } -} +// TODO: Unit tests for the engine itself. Functional testing +// is handled by regex tests. diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 9f3cc313b..cc3568c1d 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -142,13 +142,15 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex.. searcher algorithm var start = input.startIndex while true { - if let (range, caps) = self.executeFlat( - input: input, + if let result = try! self.dynamicMatch( + input, in: start.. 
Date: Sun, 27 Feb 2022 07:39:15 -0700 Subject: [PATCH 07/19] Quick bug fix / workaround for whole-match values (#191) --- Sources/_StringProcessing/Capture.swift | 2 - .../_StringProcessing/Engine/Consume.swift | 42 ++++--------------- Sources/_StringProcessing/Executor.swift | 28 +++++++++++-- .../_StringProcessing/RegexDSL/Match.swift | 10 +++++ Tests/RegexTests/CustomTests.swift | 17 ++++---- 5 files changed, 51 insertions(+), 48 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index ecfc558fe..5b43da870 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -96,5 +96,3 @@ extension Sequence where Element == StructuredCapture { self.map { $0.slice(from: input) } } } - - diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index a4a3bf26c..4e00a34b4 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -24,41 +24,17 @@ extension Engine { } } -extension Engine where Input == String { - func consume( - _ input: Input, - in range: Range, - matchMode: MatchMode - ) -> (Input.Index, CaptureList)? { - if enableTracing { - print("Consume: \(input)") - } - - var cpu = makeProcessor(input: input, bounds: range, matchMode: matchMode) - let result: Input.Index? = { - while true { - switch cpu.state { - case .accept: - return cpu.currentPosition - case .fail: - return nil - case .inProgress: cpu.cycle() - } - } - }() - - if enableTracing { - if let idx = result { - print("Result: \(input[.. Input.Index? { + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + return nil + case .inProgress: self.cycle() } } - guard let result = result else { return nil } - - let capList = cpu.storedCaptures - return (result, CaptureList( - values: capList, referencedCaptureOffsets: program.referencedCaptureOffsets)) } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 9de2b0b3d..c044cbf24 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -24,21 +24,41 @@ struct Executor { in inputRange: Range, _ mode: MatchMode ) throws -> RegexMatch? { - guard let (endIdx, capList) = engine.consume( - input, in: inputRange, matchMode: mode - ) else { + var cpu = engine.makeProcessor( + input: input, bounds: inputRange, matchMode: mode) + + guard let endIdx = cpu.consume() else { return nil } + + let capList = CaptureList( + values: cpu.storedCaptures, + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + let capStruct = engine.program.captureStructure let range = inputRange.lowerBound.. { let rawCaptures: [StructuredCapture] let referencedCaptureOffsets: [ReferenceID: Int] + let value: Any? + public var match: Match { if Match.self == (Substring, DynamicCaptures).self { // FIXME(rdar://89449323): Compiler assertion @@ -25,7 +27,15 @@ public struct RegexMatch { } else if Match.self == Substring.self { // FIXME: Plumb whole match (`.0`) through the matching engine. return input[range] as! Match + } else if rawCaptures.isEmpty, value != nil { + // FIXME: This is a workaround for whole-match values not + // being modeled as part of captures. We might want to + // switch to a model where results are alongside captures + return value! as! 
Match } else { + guard value == nil else { + fatalError("FIXME: what would this mean?") + } let typeErasedMatch = rawCaptures.existentialMatch(from: input[range]) return typeErasedMatch as! Match } diff --git a/Tests/RegexTests/CustomTests.swift b/Tests/RegexTests/CustomTests.swift index 692ad6c37..0ebfe4652 100644 --- a/Tests/RegexTests/CustomTests.swift +++ b/Tests/RegexTests/CustomTests.swift @@ -83,15 +83,14 @@ extension RegexTests { ("55z", .match, nil), ("55z", .firstMatch, "55")) - // FIXME: Requires we return a value instead of a range -// customTest( -// Regex { -// Numbler() -// }, -// ("ab123c", .firstMatch, 1), -// ("abc", .firstMatch, nil), -// ("55z", .match, nil), -// ("55z", .firstMatch, 5)) + customTest( + Regex { + Numbler() + }, + ("ab123c", .firstMatch, 1), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, 5)) // TODO: Convert below tests to better infra. Right now // it's hard because `Match` is constrained to be From 3ab3b179188bc9d59e4c125fd132f2ddc530fc4a Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Thu, 17 Feb 2022 15:20:47 -0800 Subject: [PATCH 08/19] Document specially integrated modules and integration process. --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/README.md b/README.md index e6f94377c..941231b24 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,65 @@ See [Declarative String Processing Overview][decl-string] ## Requirements - [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-02-03 or later. + +## Integration with Swift + +`_MatchingEngine`, `_CUnicode` and `_StringProcessing` are specially integrated modules that are built as part of apple/swift. + +Specifically, `_MatchingEngine` contains the parser for regular expression literals and is built both as part of the compiler and as a core library. `_CUnicode` and `_StringProcessing` are built together as a core library named `_StringProcessing`. + +| Module | Swift toolchain component | +| ------------------- | ------------------------------------------------------------------------------------ | +| `_MatchingEngine` | `SwiftCompilerSources/Sources/ExperimentalRegex` and `stdlib/public/_MatchingEngine` | +| `_CUnicode` | `stdlib/public/_StringProcessing` | +| `_StringProcessing` | `stdlib/public/_StringProcessing` | + +### Branching scheme + +#### Development branch + +The `main` branch is the branch for day-to-day development. Generally, you should create PRs against this branch. + +#### Swift integration branches + +Branches whose name starts with `swift/` are Swift integration branches similar to those in [apple/llvm-project](https://github.com/apple/llvm-project). For each branch, dropping the `swift/` prefix is the corresponding branch in [apple/swift](https://github.com/apple/swift). + +| apple/swift branch | apple/swift-experimental-string-processing branch | +| ------------------- | ----------------------------------------------------- | +| main | swift/main | +| release/5.7 | swift/release/5.7 | +| ... | swift/... | + +A pair of corresponding branches are expected to build successfully together and pass all tests. + +### Integration workflow + +To integrate the latest changes in apple/swift-experimental-string-processing to apple/swift, carefully follow the workflow: + +- Create pull requests. + - Create a pull request in apple/swift-experimental-string-processing from `main` to `swift/main`, e.g. "[Integration] main -> swift/main". 
+ - If apple/swift needs to be modified to work with the latest `main` in apple/swift-experimental-string-processing, create a pull request in apple/swift. +- Trigger CI. + - In the apple/swift-experimental-string-processing pull request, trigger CI using the following command (replacing `` with the apple/swift pull request number, if any): + ``` + apple/swift# # use this line only if there is an corresponding apple/swift PR + @swift-ci please test + ``` + - In the apple/swift pull request (if any), trigger CI using the following command (replacing `` with the apple/swift-experimental-string-processing pull request number): + ``` + apple/swift-experimental-string-processing# + @swift-ci please test + ``` +- Merge when approved. + - Merge the pull request in apple/swift-experimental-string-processing as a **merge commit**. + - Merge the pull request in apple/swift (if any). + +### Development notes + +Compiler integration can be tricky. Use special caution when developing `_MatchingEngine`, `_CUnicode` and `_StringProcessing` modules. + +- Do not change the names of these modules without due approval from compiler and infrastructure teams. +- Do not modify the existing ABI (e.g. C API, serialization format) between the regular expression parser and the Swift compiler unless absolutely necessary. +- Always minimize the number of lockstep integrations, i.e. when apple/swift-experimental-string-processing and apple/swift have to change together. Whenever possible, introduce new API first, migrate Swift compiler onto it, and then deprecate old API. Use versioning if helpful. +- In `_StringProcessing`, do not write fully qualified references to symbols in `_CUnicode`, and always wrap `import _CUnicode` in a `#if canImport(_CUnicode)`. This is because `_CUnicode` is built as part of `_StringProcessing` with CMake. +- In `_MatchingEngine`, do not write fully qualified references to `_MatchingEngine` itself. This is because `_MatchingEngine` is built as `ExperimentalRegex` in `SwiftCompilerSources/` with CMake. From cebf4a6a20fde052287358c6706c27084aa4d27e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 09/19] Fix crash on lone backslash --- Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift | 4 +++- Sources/_MatchingEngine/Regex/Parse/Source.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index dd785f12d..cfab75312 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -1472,7 +1472,9 @@ extension Source { return ref } - let char = src.eat() + guard let char = src.tryEat() else { + throw ParseError.expectedEscape + } // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 11bd8152f..ddf0475f3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -86,6 +86,12 @@ extension Source { tryEat(anyOf: set) } + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? 
{ + guard !isEmpty else { return nil } + return eat() + } + mutating func eat(asserting c: Char) { assert(peek() == c) advance() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..23a3b910f 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1753,6 +1753,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // MARK: Bad escapes + + diagnosticTest("\\", .expectedEscape) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From c48fb1cbdeeb37ba55ff449a1046bbefe48e7d24 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 10/19] Separate out DelimiterLexing.swift --- .../Regex/Parse/DelimiterLexing.swift | 153 ++++++++++++++++++ .../_MatchingEngine/Regex/Parse/Mocking.swift | 144 ----------------- 2 files changed, 153 insertions(+), 144 deletions(-) create mode 100644 Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift new file mode 100644 index 000000000..70532f9e7 --- /dev/null +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// TODO: mock up multi-line soon + +enum Delimiter: Hashable, CaseIterable { + case traditional + case experimental + case reSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .traditional: return ("#/", "/#") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// The default set of syntax options that the delimiter indicates. + var defaultSyntaxOptions: SyntaxOptions { + switch self { + case .traditional, .reSingleQuote: + return .traditional + case .experimental: + return .experimental + } + } +} + +struct LexError: Error, CustomStringConvertible { + enum Kind: Hashable { + case endOfString + case invalidUTF8 // TODO: better range reporting + case unknownDelimiter + } + + var kind: Kind + + /// The pointer at which to resume lexing. + var resumePtr: UnsafeRawPointer + + init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { + self.kind = kind + self.resumePtr = resumePtr + } + + var description: String { + switch kind { + case .endOfString: return "unterminated regex literal" + case .invalidUTF8: return "invalid UTF-8 found in source file" + case .unknownDelimiter: return "unknown regex literal delimiter" + } + } +} + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + precondition(start <= end) + var current = start + + func ascii(_ s: Unicode.Scalar) -> UInt8 { + assert(s.value <= 0x7F) + return UInt8(asserting: s.value) + } + func load(offset: Int) -> UInt8? { + guard current + offset < end else { return nil } + return current.load(fromByteOffset: offset, as: UInt8.self) + } + func load() -> UInt8? { load(offset: 0) } + func advance(_ n: Int = 1) { + precondition(current + n <= end, "Cannot advance past end") + current = current.advanced(by: n) + } + + func tryEat(_ utf8: String.UTF8View) -> Bool { + for (i, idx) in utf8.indices.enumerated() { + guard load(offset: i) == utf8[idx] else { return false } + } + advance(utf8.count) + return true + } + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw LexError(.unknownDelimiter, resumeAt: current.successor()) + } + + let contentsStart = current + while true { + switch load() { + case nil, ascii("\n"), ascii("\r"): + throw LexError(.endOfString, resumeAt: current) + + case ascii("\\"): + // Skip next byte. + advance(2) + + default: + // Try to lex the closing delimiter. + let contentsEnd = current + guard tryEat(delimiter.closing.utf8) else { + advance() + continue + } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw LexError(.invalidUTF8, resumeAt: current) + } + return (contents: s, delimiter, end: current) + } + } +} + +/// Drop a set of regex delimiters from the input string, returning the contents +/// and the delimiter used. The input string must have valid delimiters. +func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { + let utf8 = str.utf8 + func stripDelimiter(_ delim: Delimiter) -> String? { + let prefix = delim.opening.utf8 + let suffix = delim.closing.utf8 + guard utf8.prefix(prefix.count).elementsEqual(prefix), + utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } + + return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + } + for d in Delimiter.allCases { + if let contents = stripDelimiter(d) { + return (contents, d) + } + } + fatalError("No valid delimiters") +} diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index e3a178a15..dfba4757e 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -9,150 +9,6 @@ // //===----------------------------------------------------------------------===// - -// TODO: mock up multi-line soon - -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - } - } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } - - /// The default set of syntax options that the delimiter indicates. 
- var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: - return .traditional - case .experimental: - return .experimental - } - } -} - -struct LexError: Error, CustomStringConvertible { - enum Kind: Hashable { - case endOfString - case invalidUTF8 // TODO: better range reporting - case unknownDelimiter - } - - var kind: Kind - - /// The pointer at which to resume lexing. - var resumePtr: UnsafeRawPointer - - init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { - self.kind = kind - self.resumePtr = resumePtr - } - - var description: String { - switch kind { - case .endOfString: return "unterminated regex literal" - case .invalidUTF8: return "invalid UTF-8 found in source file" - case .unknownDelimiter: return "unknown regex literal delimiter" - } - } -} - -/// Drop a set of regex delimiters from the input string, returning the contents -/// and the delimiter used. The input string must have valid delimiters. -func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 - func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) - } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { - return (contents, d) - } - } - fatalError("No valid delimiters") -} - -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start - - func ascii(_ s: Unicode.Scalar) -> UInt8 { - assert(s.value <= 0x7F) - return UInt8(asserting: s.value) - } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) - } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) - } - - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) - return true - } - - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) - } - - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) - - case ascii("\\"): - // Skip next byte. - advance(2) - - default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } - - // Form a string from the contents and make sure it's valid UTF-8. 
- let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) - - guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) - } - return (contents: s, delimiter, end: current) - } - } -} - private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { From 0cbb9af76935b22a32c5ce5a8d02b1e6ad285700 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:55 +0000 Subject: [PATCH 11/19] Rename LexError -> DelimiterLexError To avoid confusion with more general regex lexical analysis. --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_MatchingEngine/Regex/Parse/Mocking.swift | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 70532f9e7..c023a069c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -37,7 +37,7 @@ enum Delimiter: Hashable, CaseIterable { } } -struct LexError: Error, CustomStringConvertible { +struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case endOfString case invalidUTF8 // TODO: better range reporting @@ -97,14 +97,14 @@ func lexRegex( guard let delimiter = Delimiter.allCases.first( where: { tryEat($0.opening.utf8) } ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) + throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) } let contentsStart = current while true { switch load() { case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) + throw DelimiterLexError(.endOfString, resumeAt: current) case ascii("\\"): // Skip next byte. @@ -125,7 +125,7 @@ func lexRegex( let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) + throw DelimiterLexError(.invalidUTF8, resumeAt: current) } return (contents: s, delimiter, end: current) } diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index dfba4757e..b535edf1b 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -52,7 +52,7 @@ func libswiftLexRegexLiteral( let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) return false - } catch let error as LexError { + } catch let error as DelimiterLexError { if error.kind == .unknownDelimiter { // An unknown delimiter should be recovered from, as we may want to try // lex something else. @@ -66,7 +66,7 @@ func libswiftLexRegexLiteral( // closing delimiters, which would help with code completion. return true } catch { - fatalError("Should be a LexError") + fatalError("Should be a DelimiterLexError") } } From 7e820821f296436b504bb0d9cd782a46eda11f04 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 12/19] Refactor delimiter lexing logic Introduce a DelimiterLexer type to perform the lexing. 
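For orientation, the refactor moves the lexing state (start, cursor, end) out of nested helper closures and into a struct that mutates its cursor in place. The following is a minimal sketch of that shape only, assuming an array-of-bytes input instead of the patch's raw-pointer bounds; `MiniDelimiterLexer` is an illustrative name, not the patch's API.

~~~swift
// Sketch: stateful cursor-based lexer, simplified from the DelimiterLexer
// introduced in this patch (which works over UnsafeRawPointer bounds and
// throws DelimiterLexError; neither is modeled here).
struct MiniDelimiterLexer {
  let bytes: [UInt8]
  var cursor: Int = 0

  // The byte at the cursor, or nil once the input is exhausted.
  func peek() -> UInt8? {
    cursor < bytes.count ? bytes[cursor] : nil
  }

  // Advance the cursor by n bytes; must stay within bounds.
  mutating func advance(_ n: Int = 1) {
    precondition(cursor + n <= bytes.count, "Cannot advance past end")
    cursor += n
  }

  // Attempt to eat a UTF-8 byte sequence, returning true on success.
  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
    let count = utf8.count
    guard cursor + count <= bytes.count,
          bytes[cursor ..< cursor + count].elementsEqual(utf8)
    else { return false }
    advance(count)
    return true
  }
}

// Usage: eat the opening delimiter, scan to the closing quote, eat it.
var lexer = MiniDelimiterLexer(bytes: Array("re'abc'".utf8))
assert(lexer.tryEat("re'".utf8))
while let b = lexer.peek(), b != UInt8(ascii: "'") { lexer.advance() }
assert(lexer.tryEat("'".utf8))
~~~

The actual `DelimiterLexer` in the diff below additionally validates UTF-8 in `tryEatEnding` and reports unterminated or malformed input via `DelimiterLexError`.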
--- .../Regex/Parse/DelimiterLexing.swift | 167 +++++++++++++----- 1 file changed, 121 insertions(+), 46 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c023a069c..e49a442e7 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible { } } -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start +fileprivate struct DelimiterLexer { + let start: UnsafeRawPointer + var cursor: UnsafeRawPointer + let end: UnsafeRawPointer + + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + precondition(start <= end) + self.start = start + self.cursor = start + self.end = end + } func ascii(_ s: Unicode.Scalar) -> UInt8 { assert(s.value <= 0x7F) return UInt8(asserting: s.value) } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) + + /// Return the byte at the current cursor, or `nil` if the end of the buffer + /// has been reached. + func load() -> UInt8? { + guard cursor < end else { return nil } + return cursor.load(as: UInt8.self) } - func load() -> UInt8? { load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) + + /// Return the slice of `count` bytes from a specified cursor position, or + /// `nil` if there are fewer than `count` bytes until the end of the buffer. + func slice( + at cursor: UnsafeRawPointer, _ count: Int + ) -> UnsafeRawBufferPointer? { + guard cursor + count <= end else { return nil } + return UnsafeRawBufferPointer(start: cursor, count: count) } - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) + /// Return the slice of `count` bytes from the current cursor, or `nil` if + /// there are fewer than `count` bytes until the end of the buffer. + func slice(_ count: Int) -> UnsafeRawBufferPointer? { + slice(at: cursor, count) + } + + /// Advance the cursor `n` bytes. + mutating func advanceCursor(_ n: Int = 1) { + cursor += n + precondition(cursor <= end, "Cannot advance past end") + } + + /// Check to see if a UTF-8 sequence can be eaten from the current cursor. + func canEat(_ utf8: String.UTF8View) -> Bool { + guard let slice = slice(utf8.count) else { return false } + return slice.elementsEqual(utf8) + } + + /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. + mutating func tryEat(_ utf8: String.UTF8View) -> Bool { + guard canEat(utf8) else { return false } + advanceCursor(utf8.count) return true } - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor()) + /// Attempt to eat a particular closing delimiter, returning the contents of + /// the literal, and ending pointer, or `nil` if this is not a delimiter + /// ending. 
+ mutating func tryEatEnding( + _ delimiter: Delimiter, contentsStart: UnsafeRawPointer + ) throws -> (contents: String, end: UnsafeRawPointer)? { + let contentsEnd = cursor + guard tryEat(delimiter.closing.utf8) else { return nil } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) + } + return (contents: s, end: cursor) } - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw DelimiterLexError(.endOfString, resumeAt: current) + /// Attempt to advance the lexer, throwing an error if the end of a line or + /// the end of the buffer is reached. + mutating func advance(escaped: Bool = false) throws { + guard let next = load() else { + throw DelimiterLexError(.endOfString, resumeAt: cursor) + } + switch UnicodeScalar(next) { + case let next where !next.isASCII: + // Just advance into a UTF-8 sequence. It shouldn't matter that we'll + // iterate through each byte as we only match against ASCII, and we + // validate it at the end. This case is separated out so we can just deal + // with the ASCII cases below. + advanceCursor() + + case "\n", "\r": + throw DelimiterLexError(.endOfString, resumeAt: cursor) + + case "\0": + // TODO: Warn to match the behavior of String literal lexer? Or should + // we error as unprintable? + advanceCursor() + + case "\\" where !escaped: + // Advance again for an escape sequence. + advanceCursor() + try advance(escaped: true) - case ascii("\\"): - // Skip next byte. - advance(2) default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } + advanceCursor() + } + } - // Form a string from the contents and make sure it's valid UTF-8. - let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + /*consuming*/ mutating func lex( + ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) + } - guard s.utf8.elementsEqual(contents) else { - throw DelimiterLexError(.invalidUTF8, resumeAt: current) + let contentsStart = cursor + while true { + // Try to lex the closing delimiter. + if let (contents, end) = try tryEatEnding(delimiter, + contentsStart: contentsStart) { + return (contents, delimiter, end) } - return (contents: s, delimiter, end: current) + // Try to advance the lexer. + try advance() } } } @@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { } fatalError("No valid delimiters") } + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. 
+func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + var lexer = DelimiterLexer(start: start, end: end) + return try lexer.lex() +} From 8b3e2ef4bfd748159171332d3f8dc94d7bbb8ce0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 13/19] Diagnose unprintable ASCII characters This matches the behavior of the C++ lexer for string literals. --- .../Regex/Parse/DelimiterLexing.swift | 7 +++ .../Utility/MissingUnicode.swift | 8 +++ Tests/RegexTests/ParseTests.swift | 63 ++++++++++++++++--- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index e49a442e7..4b4618318 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -42,6 +42,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case endOfString case invalidUTF8 // TODO: better range reporting case unknownDelimiter + case unprintableASCII } var kind: Kind @@ -59,6 +60,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .endOfString: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" + case .unprintableASCII: return "unprintable ASCII character found in source file" } } } @@ -169,6 +171,11 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) + case let next where !next.isPrintableASCII: + // Diagnose unprintable ASCII. + // TODO: Ideally we would recover and continue to lex until the ending + // delimiter. + throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) default: advanceCursor() diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift index a6aae0b82..dccba3286 100644 --- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift +++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift @@ -661,3 +661,11 @@ extension Character { public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } } + +extension UnicodeScalar { + public var isPrintableASCII: Bool { + // Exclude non-printables before the space character U+20, and anything + // including and above the DEL character U+7F. + value >= 0x20 && value < 0x7F + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 23a3b910f..b0b2e5309 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,20 +107,26 @@ func parseTest( serializedCaptures.deallocate() } -func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, - file: StaticString = #file, line: UInt = #line +func delimiterLexingTest( + _ input: String, file: StaticString = #file, line: UInt = #line ) { - // First try lexing. - input.withCString { ptr in - let (contents, delim, end) = try! lexRegex(start: ptr, - end: ptr + input.count) - XCTAssertEqual(end, ptr + input.count, file: file, line: line) + input.withCString(encodedAs: UTF8.self) { ptr in + let endPtr = ptr + input.utf8.count + let (contents, delim, end) = try! 
lexRegex(start: ptr, end: endPtr) + XCTAssertEqual(end, endPtr, file: file, line: line) let (parseContents, parseDelim) = droppingRegexDelimiters(input) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) } +} + +func parseWithDelimitersTest( + _ input: String, _ expecting: AST.Node, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + delimiterLexingTest(input, file: file, line: line) let orig = try! parseWithDelimiters(input) let ast = orig.root @@ -199,6 +205,32 @@ func diagnosticTest( } } +func delimiterLexingDiagnosticTest( + _ input: String, _ expected: DelimiterLexError.Kind, + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, line: UInt = #line +) { + do { + _ = try input.withCString { ptr in + try lexRegex(start: ptr, end: ptr + input.count) + } + XCTFail(""" + Passed, but expected error: \(expected) + """, file: file, line: line) + } catch let e as DelimiterLexError { + guard e.kind == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.kind) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Unexpected error type: \(e)", file: file, line: line) + } +} + func libswiftDiagnosticMessageTest( _ input: String, _ expectedErr: String, file: StaticString = #file, line: UInt = #line @@ -1472,6 +1504,11 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) + parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + + // Printable ASCII characters. + delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1890,6 +1927,16 @@ extension RegexTests { diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) } + func testDelimiterLexingErrors() { + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. + delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + } + delimiterLexingDiagnosticTest("re'\n'", .endOfString) + delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + } + func testlibswiftDiagnostics() { libswiftDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") From 56414b8afd3cacaeb31ed3bf7b7a1186b889b624 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 14/19] Allow lexer recovery for missing closing delimiter Allow the C++ lexer to form a tok::regex_literal. This avoids generic fallback behavior, and better allows for things like code completion. The test case for this will be in the C++ repo. 
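The recovery hinges on treating the closing delimiter as optional when stripping delimiters, so an unterminated literal still yields usable contents. Below is a condensed sketch of that behavior, using a hypothetical `dropDelimiters` helper over `String` rather than the patch's `tryDropPrefix`/`tryDropSuffix` collection extensions.

~~~swift
// Sketch: the opening delimiter is mandatory, the closing delimiter is
// tolerated as missing (as in invalid or still-being-typed source).
func dropDelimiters(_ str: String, opening: String, closing: String) -> String? {
  // The opening delimiter must match.
  guard str.hasPrefix(opening) else { return nil }
  var contents = str.dropFirst(opening.count)
  // The closing delimiter may be absent; keep whatever tail is there.
  if contents.hasSuffix(closing) {
    contents = contents.dropLast(closing.count)
  }
  return String(contents)
}

// Usage: terminated and unterminated literals both yield their contents.
assert(dropDelimiters("re'[0-9]+'", opening: "re'", closing: "'") == "[0-9]+")
assert(dropDelimiters("re'[0-9]+", opening: "re'", closing: "'") == "[0-9]+")
~~~

In the patch itself, `droppingRegexDelimiters` applies this per `Delimiter` case, and `libswiftLexRegexLiteral` reports `.endOfString` as recoverable so the C++ lexer can still form the token.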
--- .../Regex/Parse/DelimiterLexing.swift | 17 ++++++++------- .../_MatchingEngine/Regex/Parse/Mocking.swift | 14 +++++++++---- Sources/_MatchingEngine/Utility/Misc.swift | 21 +++++++++++++++++++ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index 4b4618318..c4be948ac 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -208,14 +208,17 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) + // The opening delimiter must match. + guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + else { return nil } + + // The closing delimiter may optionally match, as it may not be present in + // invalid code. + if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { + slice = newSlice + } + return String(slice) } for d in Delimiter.allCases { if let contents = stripDelimiter(d) { diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index b535edf1b..5994a4f52 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -61,10 +61,16 @@ func libswiftLexRegexLiteral( errOut.pointee = copyCString("\(error)") curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - // For now, treat every error as unrecoverable. - // TODO: We should ideally be able to recover from a regex with missing - // closing delimiters, which would help with code completion. - return true + switch error.kind { + case .endOfString: + // Missing closing delimiter can be recovered from. + return false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. + return true + case .unknownDelimiter: + fatalError("Already handled") + } } catch { fatalError("Should be a DelimiterLexError") } diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift index bd1e395b5..55d3d3adc 100644 --- a/Sources/_MatchingEngine/Utility/Misc.swift +++ b/Sources/_MatchingEngine/Utility/Misc.swift @@ -108,7 +108,28 @@ extension Collection { >(_ idx: Index, in c: C) -> C.Index { c.index(atOffset: offset(of: idx)) } +} +extension Collection where Element: Equatable { + /// Attempt to drop a given prefix from the collection, returning the + /// resulting subsequence, or `nil` if the prefix does not match. + public func tryDropPrefix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let prefixCount = other.count + guard prefix(prefixCount).elementsEqual(other) else { return nil } + return dropFirst(prefixCount) + } + + /// Attempt to drop a given suffix from the collection, returning the + /// resulting subsequence, or `nil` if the suffix does not match. + public func tryDropSuffix( + _ other: C + ) -> SubSequence? 
where C.Element == Element { + let suffixCount = other.count + guard suffix(suffixCount).elementsEqual(other) else { return nil } + return dropLast(suffixCount) + } } extension UnsafeMutableRawPointer { From 61450e875d80c622d5b7adb60cdf6cfc6be56439 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:56 +0000 Subject: [PATCH 15/19] Add lexing heuristic to handle single quotes in re'...' If a single quote is encountered with a prefix of either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue to scan ahead to a closing `'`. Such prefixes would not be valid endings for a regex literal anyway, and this lets us handle the single quote variant of their syntax. For the group name cases, further refine this skipping behavior by only skipping over characters that could possibly appear in that case. This improves diagnostic behavior by ensuring we don't go wandering off into Swift code. --- .../Regex/Parse/DelimiterLexing.swift | 92 +++++++++++++++ Tests/RegexTests/ParseTests.swift | 106 ++++++++++++++++-- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index c4be948ac..f1d3d5607 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer { slice(at: cursor, count) } + /// Return the slice of `count` bytes preceding the current cursor, or `nil` + /// if there are fewer than `count` bytes before the cursor. + func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? { + let priorCursor = cursor - count + guard priorCursor >= start else { return nil } + return slice(at: priorCursor, count) + } + /// Advance the cursor `n` bytes. mutating func advanceCursor(_ n: Int = 1) { cursor += n @@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer { return true } + /// Attempt to skip over a closing delimiter character that is unlikely to be + /// the actual closing delimiter. + mutating func trySkipDelimiter(_ delimiter: Delimiter) { + // Only the closing `'` for re'...' can potentially be skipped over. + switch delimiter { + case .traditional, .experimental: + return + case .reSingleQuote: + break + } + guard load() == ascii("'") else { return } + + /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those + /// are the cases that could use single quotes. Note that none of these + /// would be valid regex endings anyway. + let calloutPrefix = "(?C" + let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in + guard let priorSlice = sliceBehind(prior.utf8.count), + priorSlice.elementsEqual(prior.utf8) + else { return false } + + // Make sure the slice isn't preceded by a '\', as that invalidates this + // analysis. + if let prior = sliceBehind(priorSlice.count + 1) { + return prior[0] != ascii("\\") + } + return true + } + guard let prefix = prefix else { return } + let isCallout = prefix == calloutPrefix + + func isPossiblyGroupReference(_ c: UInt8) -> Bool { + // If this is an ASCII character, make sure it's for a group name. Leave + // other UTF-8 encoded scalars alone, this should at least catch cases + // where we run into a symbol such as `{`, `.`, `;` that would indicate + // we've likely advanced out of the bounds of the regex. 
+ let scalar = UnicodeScalar(c) + guard scalar.isASCII else { return true } + switch scalar { + // Include '-' and '+' which may be used in recursion levels and relative + // references. + case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+": + return true + default: + return false + } + } + + // Make a note of the current lexing position, as we may need to revert + // back to it. + let originalCursor = cursor + advanceCursor() + + // Try skip over what would be the contents of a group identifier/reference. + while let next = load() { + // Found the ending, we're done. Return so we can continue to lex to the + // real delimiter. + if next == ascii("'") { + advanceCursor() + return + } + + // If this isn't a callout, make sure we have something that could be a + // group reference. We limit the character set here to improve diagnostic + // behavior in the case where the literal is actually unterminated. We + // ideally don't want to go wandering off into Swift source code. We can't + // do the same for callouts, as they take arbitrary strings. + guard isCallout || isPossiblyGroupReference(next) else { break } + do { + try advance() + } catch { + break + } + } + // We bailed out, either because we ran into something that didn't look like + // an identifier, or we reached the end of the line. Revert back to the + // original guess of delimiter. + cursor = originalCursor + } + /// Attempt to eat a particular closing delimiter, returning the contents of /// the literal, and ending pointer, or `nil` if this is not a delimiter /// ending. @@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer { let contentsStart = cursor while true { + // Check to see if we're at a character that looks like a delimiter, but + // likely isn't. In such a case, we can attempt to skip over it. + trySkipDelimiter(delimiter) + // Try to lex the closing delimiter. if let (contents, end) = try tryEatEnding(delimiter, contentsStart: contentsStart) { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b0b2e5309..b499c0b98 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -107,28 +107,46 @@ func parseTest( serializedCaptures.deallocate() } +/// Test delimiter lexing. Takes an input string that starts with a regex +/// literal. If `ignoreTrailing` is true, there may be additional characters +/// that follow the literal that are not considered part of it. +@discardableResult func delimiterLexingTest( - _ input: String, file: StaticString = #file, line: UInt = #line -) { + _ input: String, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) - XCTAssertEqual(end, endPtr, file: file, line: line) + if ignoreTrailing { + XCTAssertNotEqual(end, endPtr, file: file, line: line) + } else { + XCTAssertEqual(end, endPtr, file: file, line: line) + } - let (parseContents, parseDelim) = droppingRegexDelimiters(input) + let rawPtr = UnsafeRawPointer(ptr) + let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr) + let literal = String(decoding: buffer, as: UTF8.self) + + let (parseContents, parseDelim) = droppingRegexDelimiters(literal) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) + return literal } } +/// Test parsing an input string with regex delimiters. 
If `ignoreTrailing` is +/// true, there may be additional characters that follow the literal that are +/// not considered part of it. func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, + _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. - delimiterLexingTest(input, file: file, line: line) + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(input) + let orig = try! parseWithDelimiters(literal) let ast = orig.root guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround @@ -1509,6 +1527,63 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter + // if it's clear that it's part of the regex syntax. + + parseWithDelimitersTest( + #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + parseWithDelimitersTest( + #"re'(?'a_bcA0-c1A'x*)'"#, + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + + parseWithDelimitersTest( + #"re'(?('a_bcA0')x|y)'"#, conditional( + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + parseWithDelimitersTest( + #"re'(?('+20')\')'"#, conditional( + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) + + parseWithDelimitersTest( + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + parseWithDelimitersTest( + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + + parseWithDelimitersTest( + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + parseWithDelimitersTest( + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + + parseWithDelimitersTest( + #"re'(?C'a*b\c šŸ”„_ ;')'"#, pcreCallout(.string(#"a*b\c šŸ”„_ ;"#))) + + // Fine, because we don't end up skipping. + delimiterLexingTest(#"re'(?'"#) + delimiterLexingTest(#"re'(?('"#) + delimiterLexingTest(#"re'\k'"#) + delimiterLexingTest(#"re'\g'"#) + delimiterLexingTest(#"re'(?C'"#) + + // Not a valid group name, but we can still skip over it. + delimiterLexingTest(#"re'(?'šŸ”„')'"#) + + // Escaped, so don't skip. These will ignore the ending `'` as we've already + // closed the literal. + parseWithDelimitersTest( + #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true + ) + delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) + delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1815,6 +1890,12 @@ extension RegexTests { diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) + // TODO: It might be better if tried to consume up to the closing `'` and + // diagnosed an invalid group name based on that. 
+ diagnosticTest(#"(?'abc ')"#, .expected("'")) + + diagnosticTest("(?'šŸ”„')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) @@ -1928,6 +2009,9 @@ extension RegexTests { } func testDelimiterLexingErrors() { + + // MARK: Printable ASCII + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) @@ -1935,6 +2019,14 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'\n'", .endOfString) delimiterLexingDiagnosticTest("re'\r'", .endOfString) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + + // MARK: Delimiter skipping + + delimiterLexingDiagnosticTest("re'(?''", .endOfString) + delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) + delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) } func testlibswiftDiagnostics() { From 2325cef781477a4b51deeafcdae9c528a486e682 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 16:53:57 +0000 Subject: [PATCH 16/19] Add support for rx'...' for experimental syntax --- Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift | 8 +++++--- Tests/RegexTests/ParseTests.swift | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift index f1d3d5607..1227ade1f 100644 --- a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -15,12 +15,14 @@ enum Delimiter: Hashable, CaseIterable { case traditional case experimental case reSingleQuote + case rxSingleQuote var openingAndClosing: (opening: String, closing: String) { switch self { case .traditional: return ("#/", "/#") case .experimental: return ("#|", "|#") case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") } } var opening: String { openingAndClosing.opening } @@ -31,7 +33,7 @@ enum Delimiter: Hashable, CaseIterable { switch self { case .traditional, .reSingleQuote: return .traditional - case .experimental: + case .experimental, .rxSingleQuote: return .experimental } } @@ -134,11 +136,11 @@ fileprivate struct DelimiterLexer { /// Attempt to skip over a closing delimiter character that is unlikely to be /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { - // Only the closing `'` for re'...' can potentially be skipped over. + // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. 
switch delimiter { case .traditional, .experimental: return - case .reSingleQuote: + case .reSingleQuote, .rxSingleQuote: break } guard load() == ascii("'") else { return } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b499c0b98..2ee76b682 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1497,6 +1497,9 @@ extension RegexTests { parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) + parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) + parseWithDelimitersTest("rx'a b'", concat("a", "b")) + parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", changeMatchingOptions( @@ -1537,6 +1540,9 @@ extension RegexTests { #"re'(?'a_bcA0-c1A'x*)'"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + parseWithDelimitersTest( + #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) From 688f1d82a50615a46018279270a9323065ba12b0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 2 Mar 2022 17:09:47 +0000 Subject: [PATCH 17/19] Change script property default to use Script Extension Change the default script property behavior for an unqualified value e.g `\p{Greek}` from `\p{Script=Greek}` to `\p{Script_Extension=Greek}`. This is arguably the more intuitive behavior, and matches what Perl does. --- .../Regex/Parse/CharacterPropertyClassification.swift | 2 +- Tests/RegexTests/MatchTests.swift | 1 + Tests/RegexTests/ParseTests.swift | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift index 6a5740aa1..e5b65a46c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift @@ -381,7 +381,7 @@ extension Source { return .generalCategory(cat) } if let script = classifyScriptProperty(value) { - return .script(script) + return .scriptExtension(script) } if let posix = classifyPOSIX(value) { return .posix(posix) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3cd2df585..6ee55e414 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -695,6 +695,7 @@ extension RegexTests { firstMatchTest(#"\p{ISBAMUM}"#, input: "123ꚠꚔꚢxyz", match: "ꚠ") firstMatchTest(#"\p{Script=Unknown}"#, input: "\u{10FFFF}", match: "\u{10FFFF}") firstMatchTest(#"\p{scx=Gujr}"#, input: "\u{a839}", match: "\u{a839}") + firstMatchTest(#"\p{Gujr}"#, input: "\u{a839}", match: "\u{a839}") firstMatchTest(#"\p{alpha}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\P{alpha}"#, input: "123abcXYZ", match: "1") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e55abcbb9..e911ee449 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1003,13 +1003,13 @@ extension RegexTests { parseTest(#"\p{sc=grek}"#, prop(.script(.greek))) parseTest(#"\p{sc=isGreek}"#, prop(.script(.greek))) - parseTest(#"\p{Greek}"#, prop(.script(.greek))) - parseTest(#"\p{isGreek}"#, prop(.script(.greek))) + parseTest(#"\p{Greek}"#, prop(.scriptExtension(.greek))) + parseTest(#"\p{isGreek}"#, 
prop(.scriptExtension(.greek))) parseTest(#"\P{Script=Latn}"#, prop(.script(.latin), inverted: true)) parseTest(#"\p{script=zzzz}"#, prop(.script(.unknown))) parseTest(#"\p{ISscript=iszzzz}"#, prop(.script(.unknown))) parseTest(#"\p{scx=bamum}"#, prop(.scriptExtension(.bamum))) - parseTest(#"\p{ISBAMUM}"#, prop(.script(.bamum))) + parseTest(#"\p{ISBAMUM}"#, prop(.scriptExtension(.bamum))) parseTest(#"\p{alpha}"#, prop(.binary(.alphabetic))) parseTest(#"\p{DEP}"#, prop(.binary(.deprecated))) From e8c84a19714041eb8544c492a2aa148b2868240d Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 2 Mar 2022 13:52:19 -0600 Subject: [PATCH 18/19] Add CustomRegexComponent example (#196) Adds a test with a `SemanticVersion` type that conforms to `CustomRegexComponent`. --- Tests/RegexTests/RegexDSLTests.swift | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 554ef905f..d599400c6 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -642,6 +642,59 @@ class RegexDSLTests: XCTestCase { } } } + + func testSemanticVersionExample() { + struct SemanticVersion: Equatable { + var major: Int + var minor: Int + var patch: Int + var dev: String? + } + struct SemanticVersionParser: CustomRegexComponent { + typealias Match = SemanticVersion + func match( + _ input: String, + startingAt index: String.Index, + in bounds: Range + ) -> (upperBound: String.Index, match: SemanticVersion)? { + let regex = Regex { + tryCapture(oneOrMore(.digit)) { Int($0) } + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + optionally { + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + } + optionally { + "-" + capture(oneOrMore(.word)) + } + } + + guard let match = input[index.. Date: Thu, 3 Mar 2022 08:52:06 -0600 Subject: [PATCH 19/19] Support text segment boundary anchors (#178) This enables the `\y` and `\Y` anchors in regex literals and `Anchor.textSegmentBoundary` in the DSL. Note: This also includes `UnicodeScalar` conformance to `RegexProtocol`, which acts like Unicode scalar literals in regex literals. --- Sources/_StringProcessing/ByteCodeGen.swift | 12 ++++++++---- Sources/_StringProcessing/RegexDSL/DSL.swift | 16 +++++++++++----- Tests/RegexTests/MatchTests.swift | 16 ++++++++++++++-- Tests/RegexTests/RegexDSLTests.swift | 10 ++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 93dca17a8..d6389c1f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen { } case .textSegment: - // This we should be able to do! - throw Unsupported(#"\y (text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + input.isOnGraphemeClusterBoundary(pos) + } case .notTextSegment: - // This we should be able to do! 
- throw Unsupported(#"\Y (not text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + !input.isOnGraphemeClusterBoundary(pos) + } case .startOfLine: builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 17f006231..a21dce82d 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -17,8 +17,7 @@ extension String: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(self)) } } @@ -26,8 +25,7 @@ extension Substring: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(String(self))) } } @@ -35,7 +33,15 @@ extension Character: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.char(self))) + .init(node: .atom(.char(self))) + } +} + +extension UnicodeScalar: RegexProtocol { + public typealias Match = Substring + + public var regex: Regex { + .init(node: .atom(.scalar(self))) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 6ee55e414..dba72820f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -877,7 +877,8 @@ extension RegexTests { #"\d+\b"#, ("123", "123"), (" 123", "123"), - ("123 456", "123")) + ("123 456", "123"), + ("123A 456", "456")) firstMatchTests( #"\d+\b\s\b\d+"#, ("123", nil), @@ -893,7 +894,18 @@ extension RegexTests { // TODO: \G and \K // TODO: Oniguruma \y and \Y - + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both: + ("Cafe\u{301}", "e"), // composed and + ("Sol Cafe", "e")) // standalone + firstMatchTests( + #"\u{65}\y"#, // Grapheme boundary assertion + ("Cafe\u{301}", nil), + ("Sol Cafe", "e")) + firstMatchTests( + #"\u{65}\Y"#, // Grapheme non-boundary assertion + ("Cafe\u{301}", "e"), + ("Sol Cafe", nil)) } func testMatchGroups() { diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index d599400c6..d78ff04e5 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase { Anchor.endOfLine } + try _testDSLCaptures( + ("Cafe\u{301}", nil), + ("Cafe", "Cafe"), + matchType: Substring.self, ==) + { + oneOrMore(.word) + UnicodeScalar("e") + Anchor.textSegmentBoundary + } + try _testDSLCaptures( ("aaaaa1", "aaaaa1"), ("aaaaa2", nil),