diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..ad640f43a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,17 @@ + +cmake_minimum_required(VERSION 3.18) +project(SwiftExperimentalStringProcessing + LANGUAGES Swift) + +if(CMAKE_SYSTEM_NAME STREQUAL Windows OR CMAKE_SYSTEM_NAME STREQUAL Darwin) + option(BUILD_SHARED_LIBS "Build shared libraries by default" YES) +endif() + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_Swift_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/swift) + +find_package(ArgumentParser CONFIG) + +add_subdirectory(Sources) diff --git a/README.md b/README.md index e6f94377c..941231b24 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,65 @@ See [Declarative String Processing Overview][decl-string] ## Requirements - [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-02-03 or later. + +## Integration with Swift + +`_MatchingEngine`, `_CUnicode` and `_StringProcessing` are specially integrated modules that are built as part of apple/swift. + +Specifically, `_MatchingEngine` contains the parser for regular expression literals and is built both as part of the compiler and as a core library. `_CUnicode` and `_StringProcessing` are built together as a core library named `_StringProcessing`. + +| Module | Swift toolchain component | | ------------------- | ------------------------------------------------------------------------------------ | | `_MatchingEngine` | `SwiftCompilerSources/Sources/ExperimentalRegex` and `stdlib/public/_MatchingEngine` | | `_CUnicode` | `stdlib/public/_StringProcessing` | | `_StringProcessing` | `stdlib/public/_StringProcessing` | + +### Branching scheme + +#### Development branch + +The `main` branch is the branch for day-to-day development. Generally, you should create PRs against this branch. + +#### Swift integration branches + +Branches whose name starts with `swift/` are Swift integration branches similar to those in [apple/llvm-project](https://github.com/apple/llvm-project). For each such branch, dropping the `swift/` prefix gives the corresponding branch in [apple/swift](https://github.com/apple/swift). + +| apple/swift branch | apple/swift-experimental-string-processing branch | | ------------------- | ----------------------------------------------------- | | main | swift/main | | release/5.7 | swift/release/5.7 | | ... | swift/... | + +A pair of corresponding branches is expected to build successfully together and pass all tests. + +### Integration workflow + +To integrate the latest changes from apple/swift-experimental-string-processing into apple/swift, carefully follow this workflow: + +- Create pull requests. + - Create a pull request in apple/swift-experimental-string-processing from `main` to `swift/main`, e.g. "[Integration] main -> swift/main". + - If apple/swift needs to be modified to work with the latest `main` in apple/swift-experimental-string-processing, create a pull request in apple/swift. +- Trigger CI. 
+ - In the apple/swift-experimental-string-processing pull request, trigger CI using the following command (replacing `<PR NUMBER>` with the apple/swift pull request number, if any): + ``` + apple/swift#<PR NUMBER> # use this line only if there is a corresponding apple/swift PR + @swift-ci please test + ``` + - In the apple/swift pull request (if any), trigger CI using the following command (replacing `<PR NUMBER>` with the apple/swift-experimental-string-processing pull request number): + ``` + apple/swift-experimental-string-processing#<PR NUMBER> + @swift-ci please test + ``` +- Merge when approved. + - Merge the pull request in apple/swift-experimental-string-processing as a **merge commit**. + - Merge the pull request in apple/swift (if any). + +### Development notes + +Compiler integration can be tricky. Use special caution when developing the `_MatchingEngine`, `_CUnicode` and `_StringProcessing` modules. + +- Do not change the names of these modules without due approval from the compiler and infrastructure teams. +- Do not modify the existing ABI (e.g. C API, serialization format) between the regular expression parser and the Swift compiler unless absolutely necessary. +- Always minimize the number of lockstep integrations, i.e. cases where apple/swift-experimental-string-processing and apple/swift have to change together. Whenever possible, introduce new API first, migrate the Swift compiler onto it, and then deprecate the old API. Use versioning if helpful. +- In `_StringProcessing`, do not write fully qualified references to symbols in `_CUnicode`, and always wrap `import _CUnicode` in a `#if canImport(_CUnicode)` (see the sketch below). This is because `_CUnicode` is built as part of `_StringProcessing` with CMake. +- In `_MatchingEngine`, do not write fully qualified references to `_MatchingEngine` itself. This is because `_MatchingEngine` is built as `ExperimentalRegex` in `SwiftCompilerSources/` with CMake. 
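For illustration, here is a minimal sketch of the guarded-import pattern described in the development notes above. It assumes a hypothetical source file inside `_StringProcessing`; the symbol named in the comments is a placeholder, not an actual `_CUnicode` API.

```swift
// Sketch: a file in _StringProcessing that uses _CUnicode.
//
// Under SwiftPM, _CUnicode is a separate module, so the import is required.
// Under CMake, _CUnicode's sources are compiled directly into
// _StringProcessing, no such module exists, and the import must be skipped.
#if canImport(_CUnicode)
import _CUnicode
#endif

// Refer to _CUnicode symbols without a module qualifier (for example a
// hypothetical `unicodeScalarWidth(x)` rather than
// `_CUnicode.unicodeScalarWidth(x)`), so the same source compiles in both
// build configurations.
```

The same reasoning applies to `_MatchingEngine`: it must not qualify references to its own symbols, because the CMake build compiles those sources under the module name `ExperimentalRegex` inside `SwiftCompilerSources/`.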
diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt new file mode 100644 index 000000000..19feadbd9 --- /dev/null +++ b/Sources/CMakeLists.txt @@ -0,0 +1,6 @@ + +add_subdirectory(_Unicode) +add_subdirectory(_MatchingEngine) +add_subdirectory(_StringProcessing) +add_subdirectory(Prototypes) +add_subdirectory(VariadicsGenerator) diff --git a/Sources/Prototypes/CMakeLists.txt b/Sources/Prototypes/CMakeLists.txt new file mode 100644 index 000000000..60768f5a3 --- /dev/null +++ b/Sources/Prototypes/CMakeLists.txt @@ -0,0 +1,18 @@ + +add_library(Prototypes + Combinators/Combinators.swift + PEG/PEG.swift + PEG/PEGCode.swift + PEG/PEGCompile.swift + PEG/PEGCore.swift + PEG/PEGInterpreter.swift + PEG/PEGTranspile.swift + PEG/PEGVM.swift + PEG/PEGVMExecute.swift + PEG/Printing.swift + PTCaRet/Interpreter.swift + PTCaRet/PTCaRet.swift + TourOfTypes/CharacterClass.swift + TourOfTypes/Literal.swift) +target_link_libraries(Prototypes PUBLIC + _MatchingEngine) diff --git a/Sources/Prototypes/PEG/PEGCode.swift b/Sources/Prototypes/PEG/PEGCode.swift index c33f5759c..b12c5bab6 100644 --- a/Sources/Prototypes/PEG/PEGCode.swift +++ b/Sources/Prototypes/PEG/PEGCode.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { struct Code { diff --git a/Sources/Prototypes/PEG/PEGCompile.swift b/Sources/Prototypes/PEG/PEGCompile.swift index 0592cf6a9..0e1b89233 100644 --- a/Sources/Prototypes/PEG/PEGCompile.swift +++ b/Sources/Prototypes/PEG/PEGCompile.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEG.VM { typealias InIndex = Input.Index diff --git a/Sources/Prototypes/PEG/PEGCore.swift b/Sources/Prototypes/PEG/PEGCore.swift index b831cbd0f..5c66dc25a 100644 --- a/Sources/Prototypes/PEG/PEGCore.swift +++ b/Sources/Prototypes/PEG/PEGCore.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing let emitComments = true struct PEGCore< diff --git a/Sources/Prototypes/PEG/PEGTranspile.swift b/Sources/Prototypes/PEG/PEGTranspile.swift index df75cea63..84e220d52 100644 --- a/Sources/Prototypes/PEG/PEGTranspile.swift +++ b/Sources/Prototypes/PEG/PEGTranspile.swift @@ -9,8 +9,7 @@ // //===----------------------------------------------------------------------===// -import _MatchingEngine -import _StringProcessing +@testable import _StringProcessing extension PEG.VM where Input == String { typealias MEProg = MEProgram diff --git a/Sources/Prototypes/PEG/PEGVM.swift b/Sources/Prototypes/PEG/PEGVM.swift index a987b581d..4cf91a5c1 100644 --- a/Sources/Prototypes/PEG/PEGVM.swift +++ b/Sources/Prototypes/PEG/PEGVM.swift @@ -9,7 +9,8 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing + +@testable import _StringProcessing extension PEG { diff --git a/Sources/Prototypes/PEG/Printing.swift b/Sources/Prototypes/PEG/Printing.swift index 978250761..be60e72f5 100644 --- a/Sources/Prototypes/PEG/Printing.swift +++ b/Sources/Prototypes/PEG/Printing.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@testable import _StringProcessing extension PEGCore.Instruction: InstructionProtocol { var operandPC: InstructionAddress? 
{ self.pc } diff --git a/Sources/VariadicsGenerator/CMakeLists.txt b/Sources/VariadicsGenerator/CMakeLists.txt new file mode 100644 index 000000000..8ea543970 --- /dev/null +++ b/Sources/VariadicsGenerator/CMakeLists.txt @@ -0,0 +1,7 @@ + +add_executable(VariadicsGenerator + VariadicsGenerator.swift) +target_compile_options(VariadicsGenerator PRIVATE + -parse-as-library) +target_link_libraries(VariadicsGenerator PUBLIC + ArgumentParser) diff --git a/Sources/_MatchingEngine/CMakeLists.txt b/Sources/_MatchingEngine/CMakeLists.txt new file mode 100644 index 000000000..f7cb97ce3 --- /dev/null +++ b/Sources/_MatchingEngine/CMakeLists.txt @@ -0,0 +1,46 @@ + +add_library(_MatchingEngine + Engine/Backtracking.swift + Engine/Builder.swift + Engine/Capture.swift + Engine/Consume.swift + Engine/Engine.swift + Engine/InstPayload.swift + Engine/Instruction.swift + Engine/Processor.swift + Engine/Program.swift + Engine/Registers.swift + Engine/Tracing.swift + Regex/AST/AST.swift + Regex/AST/ASTAction.swift + Regex/AST/ASTProtocols.swift + Regex/AST/Atom.swift + Regex/AST/Conditional.swift + Regex/AST/CustomCharClass.swift + Regex/AST/Group.swift + Regex/AST/MatchingOptions.swift + Regex/AST/Quantification.swift + Regex/Parse/CaptureStructure.swift + Regex/Parse/CharacterPropertyClassification.swift + Regex/Parse/Diagnostics.swift + Regex/Parse/LexicalAnalysis.swift + Regex/Parse/Mocking.swift + Regex/Parse/Parse.swift + Regex/Parse/Source.swift + Regex/Parse/SourceLocation.swift + Regex/Parse/SyntaxOptions.swift + Regex/Printing/DumpAST.swift + Regex/Printing/PrettyPrinter.swift + Regex/Printing/PrintAsCanonical.swift + Regex/Printing/PrintAsPattern.swift + Regex/Printing/RenderRanges.swift + Utility/AllScalars.swift + Utility/Formatting.swift + Utility/Misc.swift + Utility/MissingUnicode.swift + Utility/Protocols.swift + Utility/TypeConstruction.swift + Utility/TypedIndex.swift + Utility/TypedInt.swift) +target_compile_options(_MatchingEngine PRIVATE + -enable-library-evolution) diff --git a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift index 6a5740aa1..e5b65a46c 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift @@ -381,7 +381,7 @@ extension Source { return .generalCategory(cat) } if let script = classifyScriptProperty(value) { - return .script(script) + return .scriptExtension(script) } if let posix = classifyPOSIX(value) { return .posix(posix) diff --git a/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift new file mode 100644 index 000000000..1227ade1f --- /dev/null +++ b/Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift @@ -0,0 +1,332 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// TODO: mock up multi-line soon + +enum Delimiter: Hashable, CaseIterable { + case traditional + case experimental + case reSingleQuote + case rxSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .traditional: return ("#/", "/#") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// The default set of syntax options that the delimiter indicates. + var defaultSyntaxOptions: SyntaxOptions { + switch self { + case .traditional, .reSingleQuote: + return .traditional + case .experimental, .rxSingleQuote: + return .experimental + } + } +} + +struct DelimiterLexError: Error, CustomStringConvertible { + enum Kind: Hashable { + case endOfString + case invalidUTF8 // TODO: better range reporting + case unknownDelimiter + case unprintableASCII + } + + var kind: Kind + + /// The pointer at which to resume lexing. + var resumePtr: UnsafeRawPointer + + init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { + self.kind = kind + self.resumePtr = resumePtr + } + + var description: String { + switch kind { + case .endOfString: return "unterminated regex literal" + case .invalidUTF8: return "invalid UTF-8 found in source file" + case .unknownDelimiter: return "unknown regex literal delimiter" + case .unprintableASCII: return "unprintable ASCII character found in source file" + } + } +} + +fileprivate struct DelimiterLexer { + let start: UnsafeRawPointer + var cursor: UnsafeRawPointer + let end: UnsafeRawPointer + + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + precondition(start <= end) + self.start = start + self.cursor = start + self.end = end + } + + func ascii(_ s: Unicode.Scalar) -> UInt8 { + assert(s.value <= 0x7F) + return UInt8(asserting: s.value) + } + + /// Return the byte at the current cursor, or `nil` if the end of the buffer + /// has been reached. + func load() -> UInt8? { + guard cursor < end else { return nil } + return cursor.load(as: UInt8.self) + } + + /// Return the slice of `count` bytes from a specified cursor position, or + /// `nil` if there are fewer than `count` bytes until the end of the buffer. + func slice( + at cursor: UnsafeRawPointer, _ count: Int + ) -> UnsafeRawBufferPointer? { + guard cursor + count <= end else { return nil } + return UnsafeRawBufferPointer(start: cursor, count: count) + } + + /// Return the slice of `count` bytes from the current cursor, or `nil` if + /// there are fewer than `count` bytes until the end of the buffer. + func slice(_ count: Int) -> UnsafeRawBufferPointer? { + slice(at: cursor, count) + } + + /// Return the slice of `count` bytes preceding the current cursor, or `nil` + /// if there are fewer than `count` bytes before the cursor. + func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? { + let priorCursor = cursor - count + guard priorCursor >= start else { return nil } + return slice(at: priorCursor, count) + } + + /// Advance the cursor `n` bytes. 
+ mutating func advanceCursor(_ n: Int = 1) { + cursor += n + precondition(cursor <= end, "Cannot advance past end") + } + + /// Check to see if a UTF-8 sequence can be eaten from the current cursor. + func canEat(_ utf8: String.UTF8View) -> Bool { + guard let slice = slice(utf8.count) else { return false } + return slice.elementsEqual(utf8) + } + + /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. + mutating func tryEat(_ utf8: String.UTF8View) -> Bool { + guard canEat(utf8) else { return false } + advanceCursor(utf8.count) + return true + } + + /// Attempt to skip over a closing delimiter character that is unlikely to be + /// the actual closing delimiter. + mutating func trySkipDelimiter(_ delimiter: Delimiter) { + // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. + switch delimiter { + case .traditional, .experimental: + return + case .reSingleQuote, .rxSingleQuote: + break + } + guard load() == ascii("'") else { return } + + /// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those + /// are the cases that could use single quotes. Note that none of these + /// would be valid regex endings anyway. + let calloutPrefix = "(?C" + let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in + guard let priorSlice = sliceBehind(prior.utf8.count), + priorSlice.elementsEqual(prior.utf8) + else { return false } + + // Make sure the slice isn't preceded by a '\', as that invalidates this + // analysis. + if let prior = sliceBehind(priorSlice.count + 1) { + return prior[0] != ascii("\\") + } + return true + } + guard let prefix = prefix else { return } + let isCallout = prefix == calloutPrefix + + func isPossiblyGroupReference(_ c: UInt8) -> Bool { + // If this is an ASCII character, make sure it's for a group name. Leave + // other UTF-8 encoded scalars alone, this should at least catch cases + // where we run into a symbol such as `{`, `.`, `;` that would indicate + // we've likely advanced out of the bounds of the regex. + let scalar = UnicodeScalar(c) + guard scalar.isASCII else { return true } + switch scalar { + // Include '-' and '+' which may be used in recursion levels and relative + // references. + case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+": + return true + default: + return false + } + } + + // Make a note of the current lexing position, as we may need to revert + // back to it. + let originalCursor = cursor + advanceCursor() + + // Try skip over what would be the contents of a group identifier/reference. + while let next = load() { + // Found the ending, we're done. Return so we can continue to lex to the + // real delimiter. + if next == ascii("'") { + advanceCursor() + return + } + + // If this isn't a callout, make sure we have something that could be a + // group reference. We limit the character set here to improve diagnostic + // behavior in the case where the literal is actually unterminated. We + // ideally don't want to go wandering off into Swift source code. We can't + // do the same for callouts, as they take arbitrary strings. + guard isCallout || isPossiblyGroupReference(next) else { break } + do { + try advance() + } catch { + break + } + } + // We bailed out, either because we ran into something that didn't look like + // an identifier, or we reached the end of the line. Revert back to the + // original guess of delimiter. 
+ cursor = originalCursor + } + + /// Attempt to eat a particular closing delimiter, returning the contents of + /// the literal, and ending pointer, or `nil` if this is not a delimiter + /// ending. + mutating func tryEatEnding( + _ delimiter: Delimiter, contentsStart: UnsafeRawPointer + ) throws -> (contents: String, end: UnsafeRawPointer)? { + let contentsEnd = cursor + guard tryEat(delimiter.closing.utf8) else { return nil } + + // Form a string from the contents and make sure it's valid UTF-8. + let count = contentsEnd - contentsStart + let contents = UnsafeRawBufferPointer( + start: contentsStart, count: count) + let s = String(decoding: contents, as: UTF8.self) + + guard s.utf8.elementsEqual(contents) else { + throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) + } + return (contents: s, end: cursor) + } + + /// Attempt to advance the lexer, throwing an error if the end of a line or + /// the end of the buffer is reached. + mutating func advance(escaped: Bool = false) throws { + guard let next = load() else { + throw DelimiterLexError(.endOfString, resumeAt: cursor) + } + switch UnicodeScalar(next) { + case let next where !next.isASCII: + // Just advance into a UTF-8 sequence. It shouldn't matter that we'll + // iterate through each byte as we only match against ASCII, and we + // validate it at the end. This case is separated out so we can just deal + // with the ASCII cases below. + advanceCursor() + + case "\n", "\r": + throw DelimiterLexError(.endOfString, resumeAt: cursor) + + case "\0": + // TODO: Warn to match the behavior of String literal lexer? Or should + // we error as unprintable? + advanceCursor() + + case "\\" where !escaped: + // Advance again for an escape sequence. + advanceCursor() + try advance(escaped: true) + + case let next where !next.isPrintableASCII: + // Diagnose unprintable ASCII. + // TODO: Ideally we would recover and continue to lex until the ending + // delimiter. + throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) + + default: + advanceCursor() + } + } + + /*consuming*/ mutating func lex( + ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + + // Try to lex the opening delimiter. + guard let delimiter = Delimiter.allCases.first( + where: { tryEat($0.opening.utf8) } + ) else { + throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) + } + + let contentsStart = cursor + while true { + // Check to see if we're at a character that looks like a delimiter, but + // likely isn't. In such a case, we can attempt to skip over it. + trySkipDelimiter(delimiter) + + // Try to lex the closing delimiter. + if let (contents, end) = try tryEatEnding(delimiter, + contentsStart: contentsStart) { + return (contents, delimiter, end) + } + // Try to advance the lexer. + try advance() + } + } +} + +/// Drop a set of regex delimiters from the input string, returning the contents +/// and the delimiter used. The input string must have valid delimiters. +func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { + func stripDelimiter(_ delim: Delimiter) -> String? { + // The opening delimiter must match. + guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + else { return nil } + + // The closing delimiter may optionally match, as it may not be present in + // invalid code. 
+ if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { + slice = newSlice + } + return String(slice) + } + for d in Delimiter.allCases { + if let contents = stripDelimiter(d) { + return (contents, d) + } + } + fatalError("No valid delimiters") +} + +/// Attempt to lex a regex literal between `start` and `end`, returning either +/// the contents and pointer from which to resume lexing, or an error. +func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + var lexer = DelimiterLexer(start: start, end: end) + return try lexer.lex() +} diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 727727ce1..cfab75312 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -279,7 +279,7 @@ extension Source { /// | 'x' HexDigit{0...2} /// | 'U' HexDigit{8} /// | 'o{' OctalDigit{1...} '}' - /// | OctalDigit{1...3} + /// | '0' OctalDigit{0...3} /// mutating func expectUnicodeScalar( escapedCharacter base: Character @@ -313,13 +313,14 @@ extension Source { let str = try src.lexUntil(eating: "}").value return try Source.validateUnicodeScalar(str, .octal) - case let c where c.isOctalDigit: - // We can read *up to* 2 more octal digits per PCRE. - // FIXME: ICU can read up to 3 octal digits if the leading digit is 0, - // we should have a parser mode to switch. - let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit) - let str = String(c) + (nextDigits?.string ?? "") - return try Source.validateUnicodeScalar(str, .octal) + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { + return Unicode.Scalar(0) + } + return try Source.validateUnicodeScalar(digits.string, .octal) default: fatalError("Unexpected scalar start") @@ -1341,26 +1342,10 @@ extension Source { return nil } - // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE - // it is treated as a backreference if its first digit is not 0 (as that - // is always octal) and one of the following holds: - // - // - It's 0 < n < 10 (as octal would be pointless here) - // - Its first digit is 8 or 9 (as not valid octal) - // - There have been as many prior groups as the reference. - // - // Oniguruma follows the same rules except the second one. e.g \81 and - // \91 are instead treated as literal 81 and 91 respectively. - // TODO: If we want a strict Oniguruma mode, we'll need to add a check - // here. + // Backslash followed by a non-0 digit character is a backreference. if firstChar != "0", let numAndLoc = try src.lexNumber() { - let num = numAndLoc.value - let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location) - if num < 10 || firstChar == "8" || firstChar == "9" || - context.isPriorGroupRef(ref.kind) { - return .backreference(ref) - } - return nil + return .backreference(.init( + .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) } return nil } @@ -1487,7 +1472,9 @@ extension Source { return ref } - let char = src.eat() + guard let char = src.tryEat() else { + throw ParseError.expectedEscape + } // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( @@ -1497,10 +1484,8 @@ extension Source { } switch char { - // Hexadecimal and octal unicode scalars. 
This must be done after - // backreference lexing due to the ambiguity with \nnn. - case let c where c.isOctalDigit: fallthrough - case "u", "x", "U", "o": + // Hexadecimal and octal unicode scalars. + case "u", "x", "U", "o", "0": return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: diff --git a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift index e3a178a15..5994a4f52 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Mocking.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Mocking.swift @@ -9,150 +9,6 @@ // //===----------------------------------------------------------------------===// - -// TODO: mock up multi-line soon - -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - } - } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } - - /// The default set of syntax options that the delimiter indicates. - var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: - return .traditional - case .experimental: - return .experimental - } - } -} - -struct LexError: Error, CustomStringConvertible { - enum Kind: Hashable { - case endOfString - case invalidUTF8 // TODO: better range reporting - case unknownDelimiter - } - - var kind: Kind - - /// The pointer at which to resume lexing. - var resumePtr: UnsafeRawPointer - - init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { - self.kind = kind - self.resumePtr = resumePtr - } - - var description: String { - switch kind { - case .endOfString: return "unterminated regex literal" - case .invalidUTF8: return "invalid UTF-8 found in source file" - case .unknownDelimiter: return "unknown regex literal delimiter" - } - } -} - -/// Drop a set of regex delimiters from the input string, returning the contents -/// and the delimiter used. The input string must have valid delimiters. -func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - let utf8 = str.utf8 - func stripDelimiter(_ delim: Delimiter) -> String? { - let prefix = delim.opening.utf8 - let suffix = delim.closing.utf8 - guard utf8.prefix(prefix.count).elementsEqual(prefix), - utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil } - - return String(utf8.dropFirst(prefix.count).dropLast(suffix.count)) - } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { - return (contents, d) - } - } - fatalError("No valid delimiters") -} - -/// Attempt to lex a regex literal between `start` and `end`, returning either -/// the contents and pointer from which to resume lexing, or an error. -func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer -) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - precondition(start <= end) - var current = start - - func ascii(_ s: Unicode.Scalar) -> UInt8 { - assert(s.value <= 0x7F) - return UInt8(asserting: s.value) - } - func load(offset: Int) -> UInt8? { - guard current + offset < end else { return nil } - return current.load(fromByteOffset: offset, as: UInt8.self) - } - func load() -> UInt8? 
{ load(offset: 0) } - func advance(_ n: Int = 1) { - precondition(current + n <= end, "Cannot advance past end") - current = current.advanced(by: n) - } - - func tryEat(_ utf8: String.UTF8View) -> Bool { - for (i, idx) in utf8.indices.enumerated() { - guard load(offset: i) == utf8[idx] else { return false } - } - advance(utf8.count) - return true - } - - // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { - throw LexError(.unknownDelimiter, resumeAt: current.successor()) - } - - let contentsStart = current - while true { - switch load() { - case nil, ascii("\n"), ascii("\r"): - throw LexError(.endOfString, resumeAt: current) - - case ascii("\\"): - // Skip next byte. - advance(2) - - default: - // Try to lex the closing delimiter. - let contentsEnd = current - guard tryEat(delimiter.closing.utf8) else { - advance() - continue - } - - // Form a string from the contents and make sure it's valid UTF-8. - let count = contentsEnd - contentsStart - let contents = UnsafeRawBufferPointer( - start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) - - guard s.utf8.elementsEqual(contents) else { - throw LexError(.invalidUTF8, resumeAt: current) - } - return (contents: s, delimiter, end: current) - } - } -} - private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { @@ -196,7 +52,7 @@ func libswiftLexRegexLiteral( let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) return false - } catch let error as LexError { + } catch let error as DelimiterLexError { if error.kind == .unknownDelimiter { // An unknown delimiter should be recovered from, as we may want to try // lex something else. @@ -205,12 +61,18 @@ func libswiftLexRegexLiteral( errOut.pointee = copyCString("\(error)") curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - // For now, treat every error as unrecoverable. - // TODO: We should ideally be able to recover from a regex with missing - // closing delimiters, which would help with code completion. - return true + switch error.kind { + case .endOfString: + // Missing closing delimiter can be recovered from. + return false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. + return true + case .unknownDelimiter: + fatalError("Already handled") + } } catch { - fatalError("Should be a LexError") + fatalError("Should be a DelimiterLexError") } } diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 11bd8152f..ddf0475f3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -86,6 +86,12 @@ extension Source { tryEat(anyOf: set) } + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? 
{ + guard !isEmpty else { return nil } + return eat() + } + mutating func eat(asserting c: Char) { assert(peek() == c) advance() diff --git a/Sources/_MatchingEngine/Utility/Misc.swift b/Sources/_MatchingEngine/Utility/Misc.swift index bd1e395b5..55d3d3adc 100644 --- a/Sources/_MatchingEngine/Utility/Misc.swift +++ b/Sources/_MatchingEngine/Utility/Misc.swift @@ -108,7 +108,28 @@ extension Collection { >(_ idx: Index, in c: C) -> C.Index { c.index(atOffset: offset(of: idx)) } +} +extension Collection where Element: Equatable { + /// Attempt to drop a given prefix from the collection, returning the + /// resulting subsequence, or `nil` if the prefix does not match. + public func tryDropPrefix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let prefixCount = other.count + guard prefix(prefixCount).elementsEqual(other) else { return nil } + return dropFirst(prefixCount) + } + + /// Attempt to drop a given suffix from the collection, returning the + /// resulting subsequence, or `nil` if the suffix does not match. + public func tryDropSuffix( + _ other: C + ) -> SubSequence? where C.Element == Element { + let suffixCount = other.count + guard suffix(suffixCount).elementsEqual(other) else { return nil } + return dropLast(suffixCount) + } } extension UnsafeMutableRawPointer { diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift index a6aae0b82..dccba3286 100644 --- a/Sources/_MatchingEngine/Utility/MissingUnicode.swift +++ b/Sources/_MatchingEngine/Utility/MissingUnicode.swift @@ -661,3 +661,11 @@ extension Character { public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } } + +extension UnicodeScalar { + public var isPrintableASCII: Bool { + // Exclude non-printables before the space character U+20, and anything + // including and above the DEL character U+7F. + value >= 0x20 && value < 0x7F + } +} diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 93dca17a8..d6389c1f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen { } case .textSegment: - // This we should be able to do! - throw Unsupported(#"\y (text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + input.isOnGraphemeClusterBoundary(pos) + } case .notTextSegment: - // This we should be able to do! 
- throw Unsupported(#"\Y (not text segment)"#) + builder.buildAssert { (input, pos, _) in + // FIXME: Grapheme or word based on options + !input.isOnGraphemeClusterBoundary(pos) + } case .startOfLine: builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt new file mode 100644 index 000000000..c20dcc240 --- /dev/null +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -0,0 +1,42 @@ + +add_library(_StringProcessing + Algorithms/Algorithms/Contains.swift + Algorithms/Algorithms/FirstRange.swift + Algorithms/Algorithms/Ranges.swift + Algorithms/Algorithms/Replace.swift + Algorithms/Algorithms/Split.swift + Algorithms/Algorithms/StartsWith.swift + Algorithms/Algorithms/Trim.swift + Algorithms/Consumers/CollectionConsumer.swift + Algorithms/Consumers/FixedPatternConsumer.swift + Algorithms/Consumers/ManyConsumer.swift + Algorithms/Consumers/PredicateConsumer.swift + Algorithms/Consumers/RegexConsumer.swift + Algorithms/Searchers/CollectionSearcher.swift + Algorithms/Searchers/ConsumerSearcher.swift + Algorithms/Searchers/NaivePatternSearcher.swift + Algorithms/Searchers/PatternOrEmpty.swift + Algorithms/Searchers/PredicateSearcher.swift + Algorithms/Searchers/TwoWaySearcher.swift + Algorithms/Searchers/ZSearcher.swift + ASTBuilder.swift + Capture.swift + CharacterClass.swift + Compiler.swift + ConsumerInterface.swift + Executor.swift + Legacy/HareVM.swift + Legacy/LegacyCompile.swift + Legacy/RECode.swift + Legacy/TortoiseVM.swift + Legacy/VirtualMachine.swift + RegexDSL/Builder.swift + RegexDSL/Concatenation.swift + RegexDSL/Core.swift + RegexDSL/DSL.swift + RegexDSL/DSLCapture.swift + RegexDSL/DynamicCaptures.swift) +target_compile_options(_StringProcessing PRIVATE + -enable-library-evolution) +target_link_libraries(_StringProcessing PUBLIC + _MatchingEngine) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 915c4c5d7..5b43da870 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -71,6 +71,11 @@ extension StructuredCapture { value: storedCapture?.value, optionalCount: optionalCount) } + + func slice(from input: String) -> Substring? { + guard let r = storedCapture?.range else { return nil } + return input[r] + } } extension Sequence where Element == StructuredCapture { @@ -86,5 +91,8 @@ extension Sequence where Element == StructuredCapture { }) return TypeConstruction.tuple(of: caps) } -} + func slices(from input: String) -> [Substring?] { + self.map { $0.slice(from: input) } + } +} diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5099e187f..1d72a8d27 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -35,7 +35,7 @@ class Compiler { } } -public func _compileRegex( +func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { let ast = try parse(regex, syntax) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index 52f752539..4e00a34b4 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -24,47 +24,17 @@ extension Engine { } } -extension Engine where Input == String { - public func consume( - _ input: Input - ) -> (Input.Index, CaptureList)? 
{ - consume(input, in: input.startIndex ..< input.endIndex) - } - - public func consume( - _ input: Input, - in range: Range, - matchMode: MatchMode = .partialFromFront - ) -> (Input.Index, CaptureList)? { - if enableTracing { - print("Consume: \(input)") - } - - var cpu = makeProcessor(input: input, bounds: range, matchMode: matchMode) - let result: Input.Index? = { - while true { - switch cpu.state { - case .accept: - return cpu.currentPosition - case .fail: - return nil - case .inProgress: cpu.cycle() - } - } - }() - - if enableTracing { - if let idx = result { - print("Result: \(input[.. Input.Index? { + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + return nil + case .inProgress: self.cycle() } } - guard let result = result else { return nil } - - let capList = cpu.storedCaptures - return (result, CaptureList( - values: capList, referencedCaptureOffsets: program.referencedCaptureOffsets)) } } diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 6c9c2efa5..86952c8b7 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -11,7 +11,7 @@ // Currently, engine binds the type and consume binds an instance. // But, we can play around with this. -public struct Engine where Input.Element: Hashable { +struct Engine where Input.Element: Hashable { var program: MEProgram @@ -24,7 +24,7 @@ public struct Engine where Input.Element: Hashab set { program.enableTracing = newValue } } - public init( + init( _ program: MEProgram, enableTracing: Bool? = nil ) { @@ -36,10 +36,10 @@ public struct Engine where Input.Element: Hashab } } -public struct AsyncEngine { /* ... */ } +struct AsyncEngine { /* ... */ } extension Engine: CustomStringConvertible { - public var description: String { + var description: String { // TODO: better description return program.description } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index fcc257302..ff28ee9e2 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -299,8 +299,7 @@ extension Instruction { internal var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } -// TODO: internal after compiler moves in -public var _payloadMask: UInt64 { ~_opcodeMask } +var _payloadMask: UInt64 { ~_opcodeMask } extension Instruction { var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index d81c583a8..78171a001 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -12,7 +12,7 @@ import _MatchingEngine // For errors extension MEProgram where Input.Element: Hashable { - public struct Builder { + struct Builder { var instructions: [Instruction] = [] var elements = TypedSetVector() @@ -50,7 +50,7 @@ extension MEProgram where Input.Element: Hashable { nextCaptureRegister.rawValue } - public init() {} + init() {} } } @@ -71,7 +71,7 @@ extension MEProgram.Builder { // TODO: We want a better strategy for fixups, leaving // the operand in a different form isn't great... 
- public init(staticElements: S) where S.Element == Input.Element { + init(staticElements: S) where S.Element == Input.Element { staticElements.forEach { elements.store($0) } } @@ -79,21 +79,21 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } - public mutating func buildNop(_ r: StringRegister? = nil) { + mutating func buildNop(_ r: StringRegister? = nil) { instructions.append(.init(.nop, .init(optionalString: r))) } - public mutating func buildNop(_ s: String) { + mutating func buildNop(_ s: String) { buildNop(strings.store(s)) } - public mutating func buildDecrement( + mutating func buildDecrement( _ i: IntRegister, nowZero: BoolRegister ) { instructions.append(.init( .decrement, .init(bool: nowZero, int: i))) } - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { instructions.append(.init( @@ -101,25 +101,25 @@ extension MEProgram.Builder { } // TODO: generic - public mutating func buildMoveImmediate( + mutating func buildMoveImmediate( _ value: Int, into: IntRegister ) { let uint = UInt64(asserting: value) buildMoveImmediate(uint, into: into) } - public mutating func buildMoveCurrentPosition( + mutating func buildMoveCurrentPosition( into: PositionRegister ) { instructions.append(.init( .movePosition, .init(position: into))) } - public mutating func buildBranch(to t: AddressToken) { + mutating func buildBranch(to t: AddressToken) { instructions.append(.init(.branch)) fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( _ condition: BoolRegister, to t: AddressToken ) { instructions.append( @@ -127,7 +127,7 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildCondBranch( + mutating func buildCondBranch( to t: AddressToken, ifZeroElseDecrement i: IntRegister ) { instructions.append( @@ -135,56 +135,56 @@ extension MEProgram.Builder { fixup(to: t) } - public mutating func buildSave(_ t: AddressToken) { + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) } - public mutating func buildSaveAddress(_ t: AddressToken) { + mutating func buildSaveAddress(_ t: AddressToken) { instructions.append(.init(.saveAddress)) fixup(to: t) } - public mutating func buildSplit( + mutating func buildSplit( to: AddressToken, saving: AddressToken ) { instructions.append(.init(.splitSaving)) fixup(to: (to, saving)) } - public mutating func buildClear() { + mutating func buildClear() { instructions.append(.init(.clear)) } - public mutating func buildRestore() { + mutating func buildRestore() { instructions.append(.init(.restore)) } - public mutating func buildFail() { + mutating func buildFail() { instructions.append(.init(.fail)) } - public mutating func buildCall(_ t: AddressToken) { + mutating func buildCall(_ t: AddressToken) { instructions.append(.init(.call)) fixup(to: t) } - public mutating func buildRet() { + mutating func buildRet() { instructions.append(.init(.ret)) } - public mutating func buildAbort(_ s: StringRegister? = nil) { + mutating func buildAbort(_ s: StringRegister? 
= nil) { instructions.append(.init( .abort, .init(optionalString: s))) } - public mutating func buildAbort(_ s: String) { + mutating func buildAbort(_ s: String) { buildAbort(strings.store(s)) } - public mutating func buildAdvance(_ n: Distance) { + mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - public mutating func buildMatch(_ e: Input.Element) { + mutating func buildMatch(_ e: Input.Element) { instructions.append(.init( .match, .init(element: elements.store(e)))) } - public mutating func buildMatchSequence( + mutating func buildMatchSequence( _ s: S ) where S.Element == Input.Element { instructions.append(.init( @@ -192,7 +192,7 @@ extension MEProgram.Builder { .init(sequence: sequences.store(.init(s))))) } - public mutating func buildMatchSlice( + mutating func buildMatchSlice( lower: PositionRegister, upper: PositionRegister ) { instructions.append(.init( @@ -200,50 +200,50 @@ extension MEProgram.Builder { .init(pos: lower, pos2: upper))) } - public mutating func buildConsume( + mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction ) { instructions.append(.init( .consumeBy, .init(consumer: makeConsumeFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( by p: @escaping MEProgram.AssertionFunction ) { instructions.append(.init( .assertBy, .init(assertion: makeAssertionFunction(p)))) } - public mutating func buildAssert( + mutating func buildAssert( _ e: Input.Element, into cond: BoolRegister ) { instructions.append(.init(.assertion, .init( element: elements.store(e), bool: cond))) } - public mutating func buildAccept() { + mutating func buildAccept() { instructions.append(.init(.accept)) } - public mutating func buildPrint(_ s: StringRegister) { + mutating func buildPrint(_ s: StringRegister) { instructions.append(.init(.print, .init(string: s))) } - public mutating func buildBeginCapture( + mutating func buildBeginCapture( _ cap: CaptureRegister ) { instructions.append( .init(.beginCapture, .init(capture: cap))) } - public mutating func buildEndCapture( + mutating func buildEndCapture( _ cap: CaptureRegister ) { instructions.append( .init(.endCapture, .init(capture: cap))) } - public mutating func buildTransformCapture( + mutating func buildTransformCapture( _ cap: CaptureRegister, _ trans: TransformRegister ) { instructions.append(.init( @@ -251,7 +251,7 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } - public mutating func buildMatcher( + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { instructions.append(.init( @@ -259,7 +259,7 @@ extension MEProgram.Builder { .init(matcher: fun, value: reg))) } - public mutating func buildMove( + mutating func buildMove( _ value: ValueRegister, into capture: CaptureRegister ) { instructions.append(.init( @@ -267,21 +267,21 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } - public mutating func buildBackreference( + mutating func buildBackreference( _ cap: CaptureRegister ) { instructions.append( .init(.backreference, .init(capture: cap))) } - public mutating func buildUnresolvedReference(id: ReferenceID) { + mutating func buildUnresolvedReference(id: ReferenceID) { buildBackreference(.init(0)) unresolvedReferences[id, default: []].append(lastInstructionAddress) } // TODO: Mutating because of fail address fixup, drop when // that's removed - public mutating func assemble() throws -> MEProgram { + mutating func assemble() throws -> MEProgram { try 
resolveReferences() // TODO: This will add a fail instruction at the end every @@ -356,22 +356,22 @@ extension MEProgram.Builder { referencedCaptureOffsets: referencedCaptureOffsets) } - public mutating func reset() { self = Self() } + mutating func reset() { self = Self() } } // Address-agnostic interfaces for label-like support extension MEProgram.Builder { - public enum _AddressToken {} - public typealias AddressToken = TypedInt<_AddressToken> + enum _AddressToken {} + typealias AddressToken = TypedInt<_AddressToken> - public mutating func makeAddress() -> AddressToken { + mutating func makeAddress() -> AddressToken { defer { addressTokens.append(nil) } return AddressToken(addressTokens.count) } // Resolves the address token to the most recently added // instruction, updating prior and future address references - public mutating func resolve(_ t: AddressToken) { + mutating func resolve(_ t: AddressToken) { assert(!instructions.isEmpty) addressTokens[t.rawValue] = @@ -380,7 +380,7 @@ extension MEProgram.Builder { // Resolves the address token to the next instruction (one past the most // recently added one), updating prior and future address references. - public mutating func label(_ t: AddressToken) { + mutating func label(_ t: AddressToken) { addressTokens[t.rawValue] = InstructionAddress(instructions.count) } @@ -388,7 +388,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided token, ensuring it is fixed up during // assembly - public mutating func fixup(to t: AddressToken) { + mutating func fixup(to t: AddressToken) { assert(!instructions.isEmpty) addressFixups.append( (InstructionAddress(instructions.endIndex-1), .init(t))) @@ -397,7 +397,7 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided tokens, ensuring it is fixed up during // assembly - public mutating func fixup( + mutating func fixup( to ts: (AddressToken, AddressToken) ) { assert(!instructions.isEmpty) @@ -412,7 +412,7 @@ extension MEProgram.Builder { // // This is useful for possessive quantification that needs some initial save // point to "ratchet" upon a successful match. - public mutating func pushEmptySavePoint() { + mutating func pushEmptySavePoint() { if failAddressToken == nil { failAddressToken = makeAddress() } @@ -438,7 +438,7 @@ fileprivate extension MEProgram.Builder { // Register helpers extension MEProgram.Builder { - public mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { + mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { defer { nextCaptureRegister.rawValue += 1 } // Register the capture for later lookup via symbolic references. 
if let id = id { @@ -449,25 +449,25 @@ extension MEProgram.Builder { return nextCaptureRegister } - public mutating func makeBoolRegister() -> BoolRegister { + mutating func makeBoolRegister() -> BoolRegister { defer { nextBoolRegister.rawValue += 1 } return nextBoolRegister } - public mutating func makeIntRegister() -> IntRegister { + mutating func makeIntRegister() -> IntRegister { defer { nextIntRegister.rawValue += 1 } return nextIntRegister } - public mutating func makePositionRegister() -> PositionRegister { + mutating func makePositionRegister() -> PositionRegister { defer { nextPositionRegister.rawValue += 1 } return nextPositionRegister } - public mutating func makeValueRegister() -> ValueRegister { + mutating func makeValueRegister() -> ValueRegister { defer { nextValueRegister.rawValue += 1 } return nextValueRegister } // Allocate and initialize a register - public mutating func makeIntRegister( + mutating func makeIntRegister( initialValue: Int ) -> IntRegister { let r = makeIntRegister() @@ -476,7 +476,7 @@ extension MEProgram.Builder { } // Allocate and initialize a register - public mutating func makePositionRegister( + mutating func makePositionRegister( initializingWithCurrentPosition: () ) -> PositionRegister { let r = makePositionRegister() @@ -485,17 +485,17 @@ extension MEProgram.Builder { } // 'kill' or release allocated registers - public mutating func kill(_ r: IntRegister) { + mutating func kill(_ r: IntRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: BoolRegister) { + mutating func kill(_ r: BoolRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") } - public mutating func kill(_ r: PositionRegister) { + mutating func kill(_ r: PositionRegister) { // TODO: Release/reuse registers, for now nop makes // reading the code easier buildNop("kill \(r)") @@ -504,25 +504,25 @@ extension MEProgram.Builder { // TODO: A register-mapping helper struct, which could release // registers without monotonicity required - public mutating func makeConsumeFunction( + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - public mutating func makeAssertionFunction( + mutating func makeAssertionFunction( _ f: @escaping MEProgram.AssertionFunction ) -> AssertionFunctionRegister { defer { assertionFunctions.append(f) } return AssertionFunctionRegister(assertionFunctions.count) } - public mutating func makeTransformFunction( + mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { defer { transformFunctions.append(f) } return TransformRegister(transformFunctions.count) } - public mutating func makeMatcherFunction( + mutating func makeMatcherFunction( _ f: @escaping MEProgram.MatcherFunction ) -> MatcherRegister { defer { matcherFunctions.append(f) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 88f912ecb..bac632e9e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } } -public struct CaptureList { +struct CaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] diff --git 
a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d616657e8..1e58ddf54 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -11,13 +11,13 @@ import _MatchingEngine -public struct MEProgram where Input.Element: Equatable { - public typealias ConsumeFunction = (Input, Range) -> Input.Index? - public typealias AssertionFunction = +struct MEProgram where Input.Element: Equatable { + typealias ConsumeFunction = (Input, Range) -> Input.Index? + typealias AssertionFunction = (Input, Input.Index, Range) -> Bool - public typealias TransformFunction = + typealias TransformFunction = (Input, Range) -> Any? - public typealias MatcherFunction = + typealias MatcherFunction = (Input, Input.Index, Range) -> (Input.Index, Any)? var instructions: InstructionList @@ -39,7 +39,7 @@ public struct MEProgram where Input.Element: Equatable { } extension MEProgram: CustomStringConvertible { - public var description: String { + var description: String { var result = """ Elements: \(staticElements) Strings: \(staticStrings) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 10c3eb781..343b02c92 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -public enum MatchMode { +enum MatchMode { case wholeString case partialFromFront } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 7db740f52..24d00d3d7 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -15,7 +15,7 @@ extension Processor: TracedProcessor { var currentPC: InstructionAddress { controller.pc } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e066a4369..c044cbf24 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,8 +11,7 @@ import _MatchingEngine - // FIXME: Public for prototype -public struct Executor { +struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine @@ -20,70 +19,53 @@ public struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } - // FIXME: Public for prototype - public struct Result { - public var range: Range - var captures: [StructuredCapture] - var referencedCaptureOffsets: [ReferenceID: Int] + func match( + _ input: String, + in inputRange: Range, + _ mode: MatchMode + ) throws -> RegexMatch? 
{ + var cpu = engine.makeProcessor( + input: input, bounds: inputRange, matchMode: mode) - var destructure: ( - matched: Range, - captures: [StructuredCapture], - referencedCaptureOffsets: [ReferenceID: Int] - ) { - (range, captures, referencedCaptureOffsets) + guard let endIdx = cpu.consume() else { + return nil } - init( - _ matched: Range, _ captures: [StructuredCapture], - _ referencedCaptureOffsets: [ReferenceID: Int] - ) { - self.range = matched - self.captures = captures - self.referencedCaptureOffsets = referencedCaptureOffsets - } - } + let capList = CaptureList( + values: cpu.storedCaptures, + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) - public func execute( - input: String, - in range: Range, - mode: MatchMode = .wholeString - ) -> Result? { - guard let (endIdx, capList) = engine.consume( - input, in: range, matchMode: mode - ) else { - return nil - } let capStruct = engine.program.captureStructure - do { - let range = range.lowerBound.. Result? { - self.execute( - input: input.base, - in: input.startIndex.., - mode: MatchMode = .wholeString - ) -> (Range, CaptureList)? { - engine.consume( - input, in: range, matchMode: mode - ).map { endIndex, capture in - (range.lowerBound.., + _ mode: MatchMode + ) throws -> RegexMatch<(Substring, DynamicCaptures)>? { + try match(input, in: inputRange, mode) } } diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index 35a4ccb5e..a21dce82d 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -17,8 +17,7 @@ extension String: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(self)) } } @@ -26,8 +25,7 @@ extension Substring: RegexProtocol { public typealias Match = Substring public var regex: Regex { - let atoms = self.map { atom(.char($0)) } - return .init(ast: concat(atoms)) + .init(node: .quotedLiteral(String(self))) } } @@ -35,7 +33,15 @@ extension Character: RegexProtocol { public typealias Match = Substring public var regex: Regex { - .init(ast: atom(.char(self))) + .init(node: .atom(.char(self))) + } +} + +extension UnicodeScalar: RegexProtocol { + public typealias Match = Substring + + public var regex: Regex { + .init(node: .atom(.scalar(self))) } } @@ -187,9 +193,7 @@ public func choiceOf( // MARK: - Backreference - -// FIXME: Public for prototypes. -public struct ReferenceID: Hashable, Equatable { +struct ReferenceID: Hashable, Equatable { private static var counter: Int = 0 var base: Int diff --git a/Sources/_StringProcessing/RegexDSL/DSLTree.swift b/Sources/_StringProcessing/RegexDSL/DSLTree.swift index a44220925..43f8aa62f 100644 --- a/Sources/_StringProcessing/RegexDSL/DSLTree.swift +++ b/Sources/_StringProcessing/RegexDSL/DSLTree.swift @@ -249,7 +249,7 @@ extension DSLTree { } } extension DSLTree.Node { - public func _captureStructure( + func _captureStructure( _ constructor: inout CaptureStructure.Constructor ) -> CaptureStructure { switch self { diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index 2dd31c379..c5ada0c9d 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -16,6 +16,8 @@ public struct RegexMatch { let rawCaptures: [StructuredCapture] let referencedCaptureOffsets: [ReferenceID: Int] + let value: Any? 
+ public var match: Match { if Match.self == (Substring, DynamicCaptures).self { // FIXME(rdar://89449323): Compiler assertion @@ -25,7 +27,15 @@ public struct RegexMatch { } else if Match.self == Substring.self { // FIXME: Plumb whole match (`.0`) through the matching engine. return input[range] as! Match + } else if rawCaptures.isEmpty, value != nil { + // FIXME: This is a workaround for whole-match values not + // being modeled as part of captures. We might want to + // switch to a model where results are alongside captures + return value! as! Match } else { + guard value == nil else { + fatalError("FIXME: what would this mean?") + } let typeErasedMatch = rawCaptures.existentialMatch(from: input[range]) return typeErasedMatch as! Match } @@ -69,16 +79,11 @@ extension RegexProtocol { mode: MatchMode = .wholeString ) -> RegexMatch? { let executor = Executor(program: regex.program.loweredProgram) - guard let (range, captures, captureOffsets) = executor.execute( - input: input, in: inputRange, mode: mode - )?.destructure else { - return nil + do { + return try executor.match(input, in: inputRange, mode) + } catch { + fatalError(String(describing: error)) } - return RegexMatch( - input: input, - range: range, - rawCaptures: captures, - referencedCaptureOffsets: captureOffsets) } } diff --git a/Sources/_StringProcessing/Unicode/Decoding.swift b/Sources/_StringProcessing/Unicode/Decoding.swift index 49eb1f794..68c14f6c1 100644 --- a/Sources/_StringProcessing/Unicode/Decoding.swift +++ b/Sources/_StringProcessing/Unicode/Decoding.swift @@ -33,13 +33,13 @@ enum UnsafeAssumingValidUTF8 { @inlinable @inline(__always) - public static func decode(_ x: UInt8) -> Unicode.Scalar { + static func decode(_ x: UInt8) -> Unicode.Scalar { _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 2) @@ -50,7 +50,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 3) @@ -63,7 +63,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func decode( + static func decode( _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 ) -> Unicode.Scalar { _internalInvariant(scalarLength(x) == 4) @@ -80,7 +80,7 @@ enum UnsafeAssumingValidUTF8 { // Also, assuming we can load from those bounds... 
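// NOTE (worked example, not part of the patch): a two-byte UTF-8 sequence
// decodes as ((byte0 & 0b0001_1111) << 6) | (byte1 & 0b0011_1111). For "Ć©"
// (U+00E9), encoded as 0xC3 0xA9, the decode(_:_:) overload above computes:
//
//     (0xC3 & 0x1F) << 6  == 0xC0
//      0xA9 & 0x3F        == 0x29
//     0xC0 | 0x29         == 0xE9   // i.e. decode(0xC3, 0xA9) == "Ć©"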
@inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, startingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let cu0 = utf8[_unchecked: i] @@ -103,7 +103,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable - public static func decode( + static func decode( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let len = scalarLength(utf8, endingAt: i) @@ -113,7 +113,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength(_ x: UInt8) -> Int { + static func scalarLength(_ x: UInt8) -> Int { _internalInvariant(!UTF8.isContinuation(x)) if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen @@ -121,7 +121,7 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func scalarLength( + static func scalarLength( _ utf8: UnsafeByteBuffer, endingAt i: Int ) -> Int { var len = 1 @@ -133,12 +133,12 @@ enum UnsafeAssumingValidUTF8 { } @inlinable @inline(__always) - public static func continuationPayload(_ x: UInt8) -> UInt32 { + static func continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } @inlinable - public static func scalarAlign( + static func scalarAlign( _ utf8: UnsafeByteBuffer, _ idx: Int ) -> Int { guard _fastPath(idx != utf8.count) else { return idx } diff --git a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift index ef846c14e..a9ae24429 100644 --- a/Sources/_StringProcessing/Unicode/NecessaryEvils.swift +++ b/Sources/_StringProcessing/Unicode/NecessaryEvils.swift @@ -40,7 +40,7 @@ extension Optional { } // Don't use UnsafeRawBufferPointer for anything important -public struct UnsafeByteBuffer { +struct UnsafeByteBuffer { var pointer: UnsafeRawPointer var count: Int diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 9c196c18c..7542a17dd 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -13,11 +13,11 @@ // These currently only drive tracing/formatting, but could drive // more -public protocol InstructionProtocol { +protocol InstructionProtocol { var operandPC: InstructionAddress? 
{ get } } -public protocol ProcessorProtocol { +protocol ProcessorProtocol { associatedtype Input: Collection associatedtype Instruction: InstructionProtocol associatedtype SavePoint = () @@ -45,12 +45,12 @@ public protocol ProcessorProtocol { } extension ProcessorProtocol { - public func fetch() -> Instruction { + func fetch() -> Instruction { instructions[currentPC] } - public var callStack: Array { [] } -// public var savePoints: Array { [] } - public var registers: Array { [] } + var callStack: Array { [] } +// var savePoints: Array { [] } + var registers: Array { [] } } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index c270aba23..5ae7cd245 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -12,11 +12,11 @@ // TODO: Place shared formatting and trace infrastructure here -public protocol Traced { +protocol Traced { var isTracingEnabled: Bool { get set } } -public protocol TracedProcessor: ProcessorProtocol, Traced { +protocol TracedProcessor: ProcessorProtocol, Traced { // Empty defaulted func formatCallStack() -> String // empty default func formatSavePoints() -> String // empty default @@ -36,7 +36,7 @@ func lineNumber(_ pc: InstructionAddress) -> String { } extension TracedProcessor where Registers: Collection{ - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if !registers.isEmpty { return "\(registers)\n" @@ -48,19 +48,19 @@ extension TracedProcessor where Registers: Collection{ extension TracedProcessor { func printTrace() { print(formatTrace()) } - public func trace() { + func trace() { if isTracingEnabled { printTrace() } } // Helpers for the conformers - public func formatCallStack() -> String { + func formatCallStack() -> String { if !callStack.isEmpty { return "call stack: \(callStack)\n" } return "" } - public func formatSavePoints() -> String { + func formatSavePoints() -> String { if !savePoints.isEmpty { var result = "save points:\n" for point in savePoints { @@ -71,7 +71,7 @@ extension TracedProcessor { return "" } - public func formatRegisters() -> String { + func formatRegisters() -> String { typealias E = () if Registers.self == E.self { return "" @@ -79,7 +79,7 @@ extension TracedProcessor { return "\(registers)\n" } - public func formatInput() -> String { + func formatInput() -> String { // String override for printing sub-character information. 
if !input.indices.contains(currentPosition) { // Format unicode scalars as: @@ -115,7 +115,7 @@ extension TracedProcessor { """ } - public func formatInstructionWindow( + func formatInstructionWindow( windowSize: Int = 12 ) -> String { if isAcceptState { return "ACCEPT" } @@ -139,7 +139,7 @@ extension TracedProcessor { return result } - public func formatTrace() -> String { + func formatTrace() -> String { var result = "\n--- cycle \(cycleCount) ---\n" result += formatCallStack() result += formatSavePoints() @@ -150,7 +150,7 @@ extension TracedProcessor { return result } - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, depth: Int = 5 ) -> String { @@ -160,7 +160,7 @@ extension TracedProcessor { } extension Collection where Element: InstructionProtocol, Index == InstructionAddress { - public func formatInstruction( + func formatInstruction( _ pc: InstructionAddress, atCurrent: Bool, depth: Int diff --git a/Sources/_StringProcessing/Utility/TypedIndex.swift b/Sources/_StringProcessing/Utility/TypedIndex.swift index 3bddcadfd..adde06a3e 100644 --- a/Sources/_StringProcessing/Utility/TypedIndex.swift +++ b/Sources/_StringProcessing/Utility/TypedIndex.swift @@ -12,55 +12,43 @@ /// Forwarding wrapper around Int-index collections that provide a /// strongly (phantom) typed index. -@frozen -public struct TypedIndex: RawRepresentable where C.Index == Int { - @_alwaysEmitIntoClient - public var rawValue: C +struct TypedIndex: RawRepresentable where C.Index == Int { + var rawValue: C - @_alwaysEmitIntoClient - public init(rawValue: C) { self.rawValue = rawValue } + init(rawValue: C) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: C) { self.init(rawValue: rawValue) } + init(_ rawValue: C) { self.init(rawValue: rawValue) } } extension TypedIndex: Collection { - public typealias Index = TypedInt<šŸ‘»> - public typealias Element = C.Element + typealias Index = TypedInt<šŸ‘»> + typealias Element = C.Element - @_alwaysEmitIntoClient - public var startIndex: Index { Index(rawValue.startIndex) } + var startIndex: Index { Index(rawValue.startIndex) } - @_alwaysEmitIntoClient - public var endIndex: Index { Index(rawValue.endIndex )} + var endIndex: Index { Index(rawValue.endIndex )} - @_alwaysEmitIntoClient - public var count: Int { rawValue.count } + var count: Int { rawValue.count } - @_alwaysEmitIntoClient - public func index(after: Index) -> Index { + func index(after: Index) -> Index { Index(rawValue.index(after: after.rawValue)) } - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { rawValue[position.rawValue] } - @_alwaysEmitIntoClient - public func distance( + func distance( from start: Index, to end: Index ) -> Int { rawValue.distance(from: start.rawValue, to: end.rawValue) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int) -> Index { + func index(_ i: Index, offsetBy distance: Int) -> Index { Index(rawValue.index(i.rawValue, offsetBy: distance)) } - @_alwaysEmitIntoClient - public func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? { + func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? 
{ guard let idx = rawValue.index(i.rawValue, offsetBy: distance, limitedBy: limit.rawValue) else { return nil } @@ -71,8 +59,7 @@ extension TypedIndex: Collection { extension TypedIndex: RandomAccessCollection where C: RandomAccessCollection { } extension TypedIndex: MutableCollection where C: MutableCollection { - @_alwaysEmitIntoClient - public subscript(position: Index) -> Element { + subscript(position: Index) -> Element { _read { yield rawValue[position.rawValue] } @@ -82,8 +69,7 @@ extension TypedIndex: MutableCollection where C: MutableCollection { } } extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { - @_alwaysEmitIntoClient - public func index(before: Index) -> Index { + func index(before: Index) -> Index { Index(rawValue.index(before: before.rawValue)) } } @@ -92,11 +78,9 @@ extension TypedIndex: BidirectionalCollection where C: BidirectionalCollection { // failure in the Swift repo. #if false extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init() { rawValue = C() } + init() { rawValue = C() } - @_alwaysEmitIntoClient - public mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { + mutating func replaceSubrange(_ subrange: Range, with newElements: C) where C : Collection, C.Element == Element { let rawRange = subrange.lowerBound.rawValue ..< subrange.upperBound.rawValue rawValue.replaceSubrange(rawRange, with: newElements) } @@ -107,14 +91,13 @@ extension TypedIndex: RangeReplaceableCollection where C: RangeReplaceableCollec // Workaround for #73 extension TypedIndex where C: RangeReplaceableCollection { - public mutating func append(_ newElement: Element) { + mutating func append(_ newElement: Element) { rawValue.append(newElement) } } extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiteral & RangeReplaceableCollection { - @_alwaysEmitIntoClient - public init(arrayLiteral elements: Element...) { + init(arrayLiteral elements: Element...) { // TODO: any way around the RRC copying init? self.init(C(elements)) } @@ -122,5 +105,5 @@ extension TypedIndex: ExpressibleByArrayLiteral where C: ExpressibleByArrayLiter // MARK: - Strongly typed wrappers -public typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> +typealias InstructionList = TypedIndex<[Instruction], _InstructionAddress> diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index caff7f64e..249717b68 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -11,86 +11,71 @@ // Just a phantom-typed Int wrapper. 
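// NOTE (illustrative sketch, not part of the patch): the phantom parameter šŸ‘»
// is never stored; it only distinguishes otherwise-identical Int wrappers at
// compile time, so two register or address kinds cannot be mixed up. For
// example, given the typealiases defined below in this file:
//
//     let pc: InstructionAddress = 0   // TypedInt<_InstructionAddress>
//     let b: BoolRegister = 0          // TypedInt<_BoolRegister>
//     let next = pc + 1                // still an InstructionAddress
//     // pc == b                       // error: mismatched phantom types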
-@frozen -public struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { - @_alwaysEmitIntoClient - public var rawValue: Int +struct TypedInt<šŸ‘»>: RawRepresentable, Hashable { + var rawValue: Int - @_alwaysEmitIntoClient - public init(rawValue: Int) { + init(rawValue: Int) { self.rawValue = rawValue } - @_alwaysEmitIntoClient - public init(_ rawValue: Int) { + init(_ rawValue: Int) { self.init(rawValue: rawValue) } - @_alwaysEmitIntoClient - public init(_ uint: UInt64) { + init(_ uint: UInt64) { assert(uint.leadingZeroBitCount > 0) self.init(Int(asserting: uint)) } } extension TypedInt: Comparable { - @_alwaysEmitIntoClient - public static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { + static func <(lhs: TypedInt, rhs: TypedInt) -> Bool { return lhs.rawValue < rhs.rawValue } } extension TypedInt: CustomStringConvertible { - @_alwaysEmitIntoClient - public var description: String { return "#\(rawValue)" } + var description: String { return "#\(rawValue)" } } extension TypedInt: ExpressibleByIntegerLiteral { - @_alwaysEmitIntoClient - public init(integerLiteral value: Int) { + init(integerLiteral value: Int) { self.init(rawValue: value) } } -public protocol TypedIntProtocol { +protocol TypedIntProtocol { associatedtype šŸ‘» } extension TypedInt: TypedIntProtocol { } // A placeholder type for when we must supply a type. // When the phantom type appears, it says boo -public enum _Boo {} +enum _Boo {} // Easier for clients to just have their own typealias -public typealias TypedInt_ = TypedInt +typealias TypedInt_ = TypedInt // TODO: BinaryInteger, etc. extension TypedInt { - @_alwaysEmitIntoClient - public static func +(lhs: TypedInt, rhs: Int) -> TypedInt { + static func +(lhs: TypedInt, rhs: Int) -> TypedInt { return TypedInt(lhs.rawValue + rhs) } - @_alwaysEmitIntoClient - public var bits: UInt64 { + var bits: UInt64 { UInt64(asserting: self.rawValue) } } -@frozen -public struct TypedSetVector { - public typealias Idx = TypedInt<šŸ‘»> +struct TypedSetVector { + typealias Idx = TypedInt<šŸ‘»> // TODO: Replace with real set vector - @_alwaysEmitIntoClient - public var lookup: Dictionary = [:] + var lookup: Dictionary = [:] - @_alwaysEmitIntoClient - public var stored: Array = [] + var stored: Array = [] - @_alwaysEmitIntoClient - public func load(_ idx: Idx) -> Element { stored[idx.rawValue] } + func load(_ idx: Idx) -> Element { stored[idx.rawValue] } - @_alwaysEmitIntoClient @discardableResult - public mutating func store(_ e: Element) -> Idx { + mutating func store(_ e: Element) -> Idx { if let reg = lookup[e] { return reg } let reg = Idx(stored.count) stored.append(e) @@ -98,34 +83,32 @@ public struct TypedSetVector { return reg } - @_alwaysEmitIntoClient - public var count: Int { stored.count } + var count: Int { stored.count } - @_alwaysEmitIntoClient - public init() {} + init() {} } // MARK: - Strongly typed int wrappers /// A distance in the Input, e.g. `n` in consume(n) -public typealias Distance = TypedInt<_Distance> -public enum _Distance {} +typealias Distance = TypedInt<_Distance> +enum _Distance {} /// An instruction address, i.e. the index into our instruction list -public typealias InstructionAddress = TypedInt<_InstructionAddress> -public enum _InstructionAddress {} +typealias InstructionAddress = TypedInt<_InstructionAddress> +enum _InstructionAddress {} /// A position in the call stack, i.e. 
for save point restores -public typealias CallStackAddress = TypedInt<_CallStackAddress> -public enum _CallStackAddress {} +typealias CallStackAddress = TypedInt<_CallStackAddress> +enum _CallStackAddress {} /// A position in a position stack, i.e. for NFA simulation -public typealias PositionStackAddress = TypedInt<_PositionStackAddress> -public enum _PositionStackAddress {} +typealias PositionStackAddress = TypedInt<_PositionStackAddress> +enum _PositionStackAddress {} /// A position in the save point stack, i.e. for backtracking -public typealias SavePointStackAddress = TypedInt<_SavePointAddress> -public enum _SavePointAddress {} +typealias SavePointStackAddress = TypedInt<_SavePointAddress> +enum _SavePointAddress {} // MARK: - Registers @@ -135,85 +118,85 @@ public enum _SavePointAddress {} /// NOTE: Currently just used for static data, but e.g. could be /// used to save the most recently seen element satisfying some /// property -public typealias ElementRegister = TypedInt<_ElementRegister> -public enum _ElementRegister {} +typealias ElementRegister = TypedInt<_ElementRegister> +enum _ElementRegister {} -public typealias SequenceRegister = TypedInt<_SequenceRegister> -public enum _SequenceRegister {} +typealias SequenceRegister = TypedInt<_SequenceRegister> +enum _SequenceRegister {} /// The register number for a stored boolean value /// /// E.g. used for conditional branches -public typealias BoolRegister = TypedInt<_BoolRegister> -public enum _BoolRegister {} +typealias BoolRegister = TypedInt<_BoolRegister> +enum _BoolRegister {} /// The register number for a string (e.g. comment, failure reason) -public typealias StringRegister = TypedInt<_StringRegister> -public enum _StringRegister {} +typealias StringRegister = TypedInt<_StringRegister> +enum _StringRegister {} /// Used for consume functions, e.g. character classes -public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> -public enum _ConsumeFunctionRegister {} +typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> +enum _ConsumeFunctionRegister {} /// Used for assertion functions, e.g. 
anchors etc -public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -public enum _AssertionFunctionRegister {} +typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> +enum _AssertionFunctionRegister {} /// Used for capture transforms, etc -public typealias TransformRegister = TypedInt<_TransformRegister> -public enum _TransformRegister {} +typealias TransformRegister = TypedInt<_TransformRegister> +enum _TransformRegister {} /// Used for value-producing matchers -public typealias MatcherRegister = TypedInt<_MatcherRegister> -public enum _MatcherRegister {} +typealias MatcherRegister = TypedInt<_MatcherRegister> +enum _MatcherRegister {} /// UNIMPLEMENTED -public typealias IntRegister = TypedInt<_IntRegister> -public enum _IntRegister {} +typealias IntRegister = TypedInt<_IntRegister> +enum _IntRegister {} /// UNIMPLEMENTED -public typealias FloatRegister = TypedInt<_FloatRegister> -public enum _FloatRegister {} +typealias FloatRegister = TypedInt<_FloatRegister> +enum _FloatRegister {} /// UNIMPLEMENTED /// /// NOTE: This, along with a position stack, might /// serve NFA-simulation style execution models -public typealias PositionRegister = TypedInt<_PositionRegister> -public enum _PositionRegister {} +typealias PositionRegister = TypedInt<_PositionRegister> +enum _PositionRegister {} -public typealias ValueRegister = TypedInt<_ValueRegister> -public enum _ValueRegister {} +typealias ValueRegister = TypedInt<_ValueRegister> +enum _ValueRegister {} -public typealias CaptureRegister = TypedInt<_CaptureRegister> -public enum _CaptureRegister {} +typealias CaptureRegister = TypedInt<_CaptureRegister> +enum _CaptureRegister {} /// UNIMPLEMENTED -public typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> -public enum _InstructionAddressRegister {} +typealias InstructionAddressRegister = TypedInt<_InstructionAddressRegister> +enum _InstructionAddressRegister {} /// UNIMPLEMENTED -public typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> -public enum _CallStackAddressRegister {} +typealias CallStackAddressRegister = TypedInt<_CallStackAddressRegister> +enum _CallStackAddressRegister {} /// UNIMPLEMENTED -public typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> -public enum _PositionStackAddressRegister {} +typealias PositionStackAddressRegister = TypedInt<_PositionStackAddressRegister> +enum _PositionStackAddressRegister {} /// UNIMPLEMENTED -public typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> -public enum _SavePointAddressRegister {} +typealias SavePointAddressRegister = TypedInt<_SavePointAddressRegister> +enum _SavePointAddressRegister {} /// A numbered label -public typealias LabelId = TypedInt<_LabelId> -public enum _LabelId {} +typealias LabelId = TypedInt<_LabelId> +enum _LabelId {} /// A numbered function -public typealias FunctionId = TypedInt<_FunctionId> -public enum _FunctionId {} +typealias FunctionId = TypedInt<_FunctionId> +enum _FunctionId {} /// A numbered capture -public typealias CaptureId = TypedInt<_CaptureId> -public enum _CaptureId {} +typealias CaptureId = TypedInt<_CaptureId> +enum _CaptureId {} diff --git a/Sources/_Unicode/CMakeLists.txt b/Sources/_Unicode/CMakeLists.txt new file mode 100644 index 000000000..7fdb44628 --- /dev/null +++ b/Sources/_Unicode/CMakeLists.txt @@ -0,0 +1,16 @@ + +add_library(_Unicode + CaseConversion.swift + CharacterProps.swift + Comparison.swift + Decoding.swift + Encodings.swift + 
Formatting.swift + Graphemes.swift + NecessaryEvils.swift + Normaliation.swift + NumberParsing.swift + ScalarProps.swift + Transcoding.swift + UCD.swift + Validation.swift) diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index b7c89661d..ccfe85ec7 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -13,289 +13,5 @@ import XCTest @testable import _StringProcessing -/// Hold context and run variety of ad-hoc tests -/// -/// TODO: Use these to demonstrate first-order approximation of what -/// overhead such an engine imposes -fileprivate struct Test: ExpressibleByStringLiteral { - var input: String - var aEater: String - var manyAEater: String - var eatUntilA: String - var eatThroughA: String - - // TODO: Have tests explicitly show each step of type binding, - // input binding, etc. - var enableTracing: Bool? = nil - - /* - - until first A - through first A - until / through last A - etc - - */ - - var file: String - var line: UInt - - init( - _ s: String, - enableTracing: Bool? = nil, - file: String = #file, - line: UInt = #line - ) { - self.input = s - self.aEater = s.first == "A" ? String(s.dropFirst()) : s - self.manyAEater = String(s.drop(while: { $0 == "A" })) - - if let firstIdx = s.firstIndex(of: "A") { - self.eatUntilA = String(s[firstIdx...]) - self.eatThroughA = String(eatUntilA.dropFirst()) - } else { - self.eatUntilA = s - self.eatThroughA = s - } - - self.enableTracing = enableTracing - -// self.untilFirstAEater = String( -// s[(s.firstIndex(where: { $0 == "A" }) ?? s.startIndex)...]) - - - self.file = file - self.line = line - } - init( - stringLiteral: String, - file: String = #file, - line: UInt = #line - ) { - self.init(stringLiteral, file: file, line: line) - } - init(stringLiteral: String) { - // NOTE: Can't get source location of a literal... - self.init(stringLiteral) - } - - var slicedInput: (String, Range) { - let prefix = "aAa prefix āš ļø" - let suffix = "āš ļø aAa suffix" - let outer = prefix + input + suffix - let range = outer.mapOffsets( - (lower: prefix.count, upper: -suffix.count)) - return (outer, range) - } - - func check(_ engine: Engine, expected: String) { - var engine = engine - if let t = enableTracing { - engine.enableTracing = t - } - let output: String - let outputFromSlice: String - - if let (idx, _) = engine.consume(input) { - output = String(input[idx...]) - } else { - output = input - } - - let (outerInput, range) = slicedInput - if let (idx, _) = engine.consume(outerInput, in: range) { - outputFromSlice = String(outerInput[idx..? = nil, - manyAEater: Engine? = nil, - eatUntilA: Engine? = nil, - eatThroughA: Engine? = nil - ) { - if let engine = aEater { - check(engine, expected: self.aEater) - } - if let engine = manyAEater { - check(engine, expected: self.manyAEater) - } - if let engine = eatUntilA { - check(engine, expected: self.eatUntilA) - } - if let engine = eatThroughA { - check(engine, expected: self.eatThroughA) - } - } -} - -var doPrint = false -func show(_ s: CustomStringConvertible) { - if doPrint { print(s) } -} - -func makeEngine( - _ constructor: (inout Program.Builder) -> () -) -> Engine { - var builder = Program.Builder() - constructor(&builder) - let program = try! 
builder.assemble() - let engine = Engine(program) - show(engine) - return engine -} - -// Eat an A off the front -// -// [0] match "A" -// [1] accept -// -let aEater: Engine = { - makeEngine { builder in - builder.buildMatch("A") - builder.buildAccept() - } -}() - -// Eat many "A"s off the input -// -// [0] saveAddress [3] // .accept -// [1] match "A" -// [2] goto [1] // match "A" -// [3] accept -// -// NOTE: a save would restore input position, which we -// actually don't want to do. -// -// NOTE: We should compare with a more sophisticated match -// instruction that can take at least or at most, etc. -// -let manyAEater: Engine = { - makeEngine { builder in - let accTok = builder.makeAddress() - let matchTok = builder.makeAddress() - - builder.buildSaveAddress(accTok) - builder.buildMatch("A") - builder.resolve(matchTok) - builder.buildBranch(to: matchTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat until you find an A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] condBranch #0 [x] // accept -// [2] advance(1) -// [3] goto 0 -// [4] accept -// -// NOTE: This check-consume-else-branch pattern -// could be pretty common and might be worth a dedicated -// instruction. -let eatUntilA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildCondBranch(reg, to: accTok) - builder.buildAdvance(1) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - -// Eat through the first A (FAIL if no A) -// -// [0] assert #0 #0 -// [1] advance(1) -// [2] condBranch #0 [x] // accept -// [3] goto 0 -// [4] accept -let eatThroughA: Engine = { - makeEngine { builder in - let reg = builder.makeBoolRegister() - let accTok = builder.makeAddress() - let assertTok = builder.makeAddress() - builder.buildAssert("A", into: reg) - builder.resolve(assertTok) - builder.buildAdvance(1) - builder.buildCondBranch(reg, to: accTok) - builder.buildBranch(to: assertTok) - builder.buildAccept() - builder.resolve(accTok) - } -}() - - - -class MatchingEngineTests: XCTestCase { - - func testAEaters() { - let tests: Array = [ - Test("abc"), - Test("Abc"), - Test("AAbc"), - Test(""), - Test("A"), - Test("b"), - Test("bbbA"), - Test("bbAbA"), - ] - - for test in tests { - test.check(aEater: aEater) - test.check(manyAEater: manyAEater) - test.check(eatUntilA: eatUntilA) - test.check(eatThroughA: eatThroughA) - } - } - - func testThreeLetterRepeat() { - // Check for a repeated 3-letter sequence, such as in - // `(...)\1` - // - // [0] movePosition(into: %low) - // [1] advance(3) - // [2] movePosition(into: %high) - // [3] matchSlice(%low, %high) - // [4] accept - let threeLetterRepeat: Engine = { - makeEngine { builder in - let low = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildAdvance(3) - let high = builder.makePositionRegister( - initializingWithCurrentPosition: ()) - builder.buildMatchSlice(lower: low, upper: high) - builder.buildAccept() - } - }() - - let tests: Array<(String, Bool)> = [ - ("abcabc", true), - ("abcabc_____", true), - ("dddddd_____", true), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™€ļøc", true), - ("abccba", false), - ("abcabb", false), - ("abcbac", false), - ("šŸ„³šŸ§Ÿā€ā™€ļøcšŸ„³šŸ§Ÿā€ā™‚ļøc", false), - ] - - for (test, expect) in tests { - let match = threeLetterRepeat.consume(test) != nil - XCTAssertEqual(expect, match) - } - } -} 
+// TODO: Unit tests for the engine itself. Functional testing +// is handled by regex tests. diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 9f3cc313b..cc3568c1d 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -142,13 +142,15 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex..( + _ regex: Regex, + _ tests: (input: String, call: MatchCall, match: Match?)... +) { + for (input, call, match) in tests { + let result: Match? + switch call { + case .match: + result = input.match(regex)?.match + case .firstMatch: + result = input.firstMatch(of: regex)?.result } - XCTAssert(result.match == "4t") + XCTAssertEqual(result, match) + } +} - XCTAssertNil("4".match(regex)) - XCTAssertNil("t".match(regex)) - XCTAssertNil("t4".match(regex)) +extension RegexTests { - let regex2 = Regex { - oneOrMore { + // TODO: Refactor below into more exhaustive, declarative + // tests. + func testCustomRegexComponents() { + customTest( + Regex { Numbler() - } - } - - guard let res2 = "ab123c".firstMatch(of: regex2) else { - XCTFail() - return - } - - XCTAssertEqual(res2.match, "123") + Asciibbler() + }, + ("4t", .match, "4t"), + ("4", .match, nil), + ("t", .match, nil), + ("t x1y z", .firstMatch, "1y"), + ("t4", .match, nil)) + + customTest( + Regex { + oneOrMore { Numbler() } + }, + ("ab123c", .firstMatch, "123"), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, "55")) + + customTest( + Regex { + Numbler() + }, + ("ab123c", .firstMatch, 1), + ("abc", .firstMatch, nil), + ("55z", .match, nil), + ("55z", .firstMatch, 5)) + + // TODO: Convert below tests to better infra. Right now + // it's hard because `Match` is constrained to be + // `Equatable` which tuples cannot be. let regex3 = Regex { capture { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4dd6392f7..dba72820f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -23,13 +23,13 @@ extension Executor { // Consumer -> searcher algorithm var start = input.startIndex while true { - if let (range, caps) = self.executeFlat( - input: input, + if let result = try! self.dynamicMatch( + input, in: start.. String { + input.withCString(encodedAs: UTF8.self) { ptr in + let endPtr = ptr + input.utf8.count + let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) + if ignoreTrailing { + XCTAssertNotEqual(end, endPtr, file: file, line: line) + } else { + XCTAssertEqual(end, endPtr, file: file, line: line) + } + + let rawPtr = UnsafeRawPointer(ptr) + let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr) + let literal = String(decoding: buffer, as: UTF8.self) - let (parseContents, parseDelim) = droppingRegexDelimiters(input) + let (parseContents, parseDelim) = droppingRegexDelimiters(literal) XCTAssertEqual(contents, parseContents, file: file, line: line) XCTAssertEqual(delim, parseDelim, file: file, line: line) + return literal } +} - let orig = try! parseWithDelimiters(input) +/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is +/// true, there may be additional characters that follow the literal that are +/// not considered part of it. +func parseWithDelimitersTest( + _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. 
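// NOTE (descriptive, not part of the patch): the delimiter lexer runs first so
// the exact extent of the literal is known — delimiterLexingTest returns only
// the lexed literal text (dropping any trailing characters when
// `ignoreTrailing` is set) before it is handed to parseWithDelimiters.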
+ let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) + + let orig = try! parseWithDelimiters(literal) let ast = orig.root guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround @@ -199,6 +223,32 @@ func diagnosticTest( } } +func delimiterLexingDiagnosticTest( + _ input: String, _ expected: DelimiterLexError.Kind, + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, line: UInt = #line +) { + do { + _ = try input.withCString { ptr in + try lexRegex(start: ptr, end: ptr + input.count) + } + XCTFail(""" + Passed, but expected error: \(expected) + """, file: file, line: line) + } catch let e as DelimiterLexError { + guard e.kind == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.kind) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Unexpected error type: \(e)", file: file, line: line) + } +} + func libswiftDiagnosticMessageTest( _ input: String, _ expectedErr: String, file: StaticString = #file, line: UInt = #line @@ -329,7 +379,7 @@ extension RegexTests { parseTest(#"\070"#, scalar("\u{38}")) parseTest(#"\07A"#, concat(scalar("\u{7}"), "A")) parseTest(#"\08"#, concat(scalar("\u{0}"), "8")) - parseTest(#"\0707"#, concat(scalar("\u{38}"), "7")) + parseTest(#"\0707"#, scalar("\u{1C7}")) parseTest(#"[\0]"#, charClass(scalar_m("\u{0}"))) parseTest(#"[\01]"#, charClass(scalar_m("\u{1}"))) @@ -337,13 +387,15 @@ extension RegexTests { parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A")) parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) - parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7")) + parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - parseTest(#"[\1]"#, charClass(scalar_m("\u{1}"))) - parseTest(#"[\123]"#, charClass(scalar_m("\u{53}"))) - parseTest(#"[\101]"#, charClass(scalar_m("\u{41}"))) - parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7")) - parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1")) + // TODO: These are treated as octal sequences by PCRE, we should warn and + // suggest user prefix with 0. + parseTest(#"[\1]"#, charClass("1")) + parseTest(#"[\123]"#, charClass("1", "2", "3")) + parseTest(#"[\101]"#, charClass("1", "0", "1")) + parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) + parseTest(#"[\181]"#, charClass("1", "8", "1")) // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) @@ -492,6 +544,10 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) + // These follow the PCRE behavior. + parseTest(#"\Q\\E"#, quote("\\")) + parseTest(#"\E"#, "E") + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), @@ -797,11 +853,9 @@ extension RegexTests { ) } - // TODO: Some of these behaviors are unintuitive, we should likely warn on - // some of them. 
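// NOTE (not part of the patch): the updated expectations here reflect the new
// escape-sequence rules exercised by this file — bare escaped digit sequences
// such as \10, \18, and \7777 (below) now lex as absolute backreferences;
// octal scalar escapes require the \0 prefix and consume the following octal
// digits (e.g. \0707 above now parses as U+01C7, i.e. octal 707); and inside
// custom character classes, \1 etc. are currently treated as literal digits
// (see the TODO above).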
- parseTest(#"\10"#, scalar("\u{8}")) - parseTest(#"\18"#, concat(scalar("\u{1}"), "8")) - parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7")) + parseTest(#"\10"#, backreference(.absolute(10))) + parseTest(#"\18"#, backreference(.absolute(18))) + parseTest(#"\7777"#, backreference(.absolute(7777))) parseTest(#"\91"#, backreference(.absolute(91))) parseTest( @@ -813,12 +867,13 @@ extension RegexTests { parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [scalar("\u{8}"), capture(empty())]), + + [backreference(.absolute(10)), capture(empty())]), captures: .tuple(Array(repeating: .atom(), count: 10)) ) - parseTest(#"()()\10"#, - concat(capture(empty()), capture(empty()), scalar("\u{8}")), - captures: .tuple(.atom(), .atom())) + parseTest(#"()()\10"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(10))), + captures: .tuple(.atom(), .atom()) + ) // A capture of three empty captures. let fourCaptures = capture( @@ -826,8 +881,8 @@ extension RegexTests { ) parseTest( // There are 9 capture groups in total here. - #"((()()())(()()()))\10"#, - concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")), + #"((()()())(()()()))\10"#, concat(capture(concat( + fourCaptures, fourCaptures)), backreference(.absolute(10))), captures: .tuple(Array(repeating: .atom(), count: 9)) ) parseTest( @@ -852,7 +907,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .tuple(Array(repeating: .atom(), count: 40)) ) - parseTest(#"\40"#, scalar(" ")) + parseTest(#"\40"#, backreference(.absolute(40))) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -862,7 +917,7 @@ extension RegexTests { parseTest(#"\7"#, backreference(.absolute(7))) - parseTest(#"\11"#, scalar("\u{9}")) + parseTest(#"\11"#, backreference(.absolute(11))) parseTest( String(repeating: "()", count: 11) + #"\11"#, concat(Array(repeating: capture(empty()), count: 11) @@ -876,12 +931,11 @@ extension RegexTests { captures: .tuple(Array(repeating: .atom(), count: 11)) ) - parseTest(#"\0113"#, concat(scalar("\u{9}"), "3")) - parseTest(#"\113"#, scalar("\u{4B}")) - parseTest(#"\377"#, scalar("\u{FF}")) + parseTest(#"\0113"#, scalar("\u{4B}")) + parseTest(#"\113"#, backreference(.absolute(113))) + parseTest(#"\377"#, backreference(.absolute(377))) parseTest(#"\81"#, backreference(.absolute(81))) - parseTest(#"\g1"#, backreference(.absolute(1))) parseTest(#"\g001"#, backreference(.absolute(1))) parseTest(#"\g52"#, backreference(.absolute(52))) @@ -999,13 +1053,13 @@ extension RegexTests { parseTest(#"\p{sc=grek}"#, prop(.script(.greek))) parseTest(#"\p{sc=isGreek}"#, prop(.script(.greek))) - parseTest(#"\p{Greek}"#, prop(.script(.greek))) - parseTest(#"\p{isGreek}"#, prop(.script(.greek))) + parseTest(#"\p{Greek}"#, prop(.scriptExtension(.greek))) + parseTest(#"\p{isGreek}"#, prop(.scriptExtension(.greek))) parseTest(#"\P{Script=Latn}"#, prop(.script(.latin), inverted: true)) parseTest(#"\p{script=zzzz}"#, prop(.script(.unknown))) parseTest(#"\p{ISscript=iszzzz}"#, prop(.script(.unknown))) parseTest(#"\p{scx=bamum}"#, prop(.scriptExtension(.bamum))) - parseTest(#"\p{ISBAMUM}"#, prop(.script(.bamum))) + parseTest(#"\p{ISBAMUM}"#, prop(.scriptExtension(.bamum))) parseTest(#"\p{alpha}"#, prop(.binary(.alphabetic))) parseTest(#"\p{DEP}"#, prop(.binary(.deprecated))) @@ -1443,6 +1497,9 @@ extension RegexTests { parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) 
parseWithDelimitersTest("#|a b|#", concat("a", "b")) + parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) + parseWithDelimitersTest("rx'a b'", concat("a", "b")) + parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( "#|(?-x)[a b]|#", changeMatchingOptions( @@ -1468,6 +1525,71 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) + parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) + parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + + // Printable ASCII characters. + delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter + // if it's clear that it's part of the regex syntax. + + parseWithDelimitersTest( + #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) + parseWithDelimitersTest( + #"re'(?'a_bcA0-c1A'x*)'"#, + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + + parseWithDelimitersTest( + #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) + + parseWithDelimitersTest( + #"re'(?('a_bcA0')x|y)'"#, conditional( + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + parseWithDelimitersTest( + #"re'(?('+20')\')'"#, conditional( + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) + + parseWithDelimitersTest( + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + parseWithDelimitersTest( + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + + parseWithDelimitersTest( + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + parseWithDelimitersTest( + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + + parseWithDelimitersTest( + #"re'(?C'a*b\c šŸ”„_ ;')'"#, pcreCallout(.string(#"a*b\c šŸ”„_ ;"#))) + + // Fine, because we don't end up skipping. + delimiterLexingTest(#"re'(?'"#) + delimiterLexingTest(#"re'(?('"#) + delimiterLexingTest(#"re'\k'"#) + delimiterLexingTest(#"re'\g'"#) + delimiterLexingTest(#"re'(?C'"#) + + // Not a valid group name, but we can still skip over it. + delimiterLexingTest(#"re'(?'šŸ”„')'"#) + + // Escaped, so don't skip. These will ignore the ending `'` as we've already + // closed the literal. + parseWithDelimitersTest( + #"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true + ) + parseWithDelimitersTest( + #"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true + ) + delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true) + delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true) + // MARK: Parse not-equal // Make sure dumping output correctly reflects differences in AST. @@ -1749,6 +1871,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // MARK: Bad escapes + + diagnosticTest("\\", .expectedEscape) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) @@ -1770,6 +1896,12 @@ extension RegexTests { diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) + // TODO: It might be better if tried to consume up to the closing `'` and + // diagnosed an invalid group name based on that. 
+ diagnosticTest(#"(?'abc ')"#, .expected("'")) + + diagnosticTest("(?'šŸ”„')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) @@ -1882,6 +2014,27 @@ extension RegexTests { diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) } + func testDelimiterLexingErrors() { + + // MARK: Printable ASCII + + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. + delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) + } + delimiterLexingDiagnosticTest("re'\n'", .endOfString) + delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) + + // MARK: Delimiter skipping + + delimiterLexingDiagnosticTest("re'(?''", .endOfString) + delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) + delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) + } + func testlibswiftDiagnostics() { libswiftDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 554ef905f..d78ff04e5 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase { Anchor.endOfLine } + try _testDSLCaptures( + ("Cafe\u{301}", nil), + ("Cafe", "Cafe"), + matchType: Substring.self, ==) + { + oneOrMore(.word) + UnicodeScalar("e") + Anchor.textSegmentBoundary + } + try _testDSLCaptures( ("aaaaa1", "aaaaa1"), ("aaaaa2", nil), @@ -642,6 +652,59 @@ class RegexDSLTests: XCTestCase { } } } + + func testSemanticVersionExample() { + struct SemanticVersion: Equatable { + var major: Int + var minor: Int + var patch: Int + var dev: String? + } + struct SemanticVersionParser: CustomRegexComponent { + typealias Match = SemanticVersion + func match( + _ input: String, + startingAt index: String.Index, + in bounds: Range + ) -> (upperBound: String.Index, match: SemanticVersion)? { + let regex = Regex { + tryCapture(oneOrMore(.digit)) { Int($0) } + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + optionally { + "." + tryCapture(oneOrMore(.digit)) { Int($0) } + } + optionally { + "-" + capture(oneOrMore(.word)) + } + } + + guard let match = input[index..