From 7107e29ca55bcebb6833bbd85d2d272feaa96ba6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 9 Feb 2023 11:49:25 -0800 Subject: [PATCH] Fix output type mismatch with RegexBuilder (#626) Some regex literals (and presumably other `Regex` instances) lose their output type information when used in a RegexBuilder closure due to the way the concatenating builder calls are overloaded. In particular, any output type with labeled tuples or where the sum of tuple components in the accumulated and new output types is greater than 10 will be ignored. Regex internals don't make this distinction, however, so there ends up being a mismatch between what a `Regex.Match` instance tries to produce and the output type of the outermost regex. For example, this code results in a crash, because `regex` is a `Regex<Substring>` but the match tries to produce a `(Substring, number: Substring)`: let regex = Regex { ZeroOrMore(.whitespace) /:(?<number>\d+):/ ZeroOrMore(.whitespace) } let match = try regex.wholeMatch(in: " :21: ") print(match!.output) To fix this, we add a new `ignoreCapturesInTypedOutput` DSLTree node to mark situations where the output type is discarded. This status is propagated through the capture list into the match's storage, which lets us produce the correct output type. Note that we can't just drop the capture groups when building the compiled program because (1) different parts of the regex might reference the capture group and (2) all capture groups are available if a developer converts the output to `AnyRegexOutput`. let anyOutput = AnyRegexOutput(match) // anyOutput[1] == "21" // anyOutput["number"] == Optional("21") Fixes #625. rdar://104823356 Note: Linux seems to crash on different tests when the two customTest overloads have `internal` visibility or are called. Switching one of the functions to be generic over a RegexComponent works around the issue. 
--- Package.swift | 6 +- Sources/RegexBuilder/DSL.swift | 62 +++ Sources/RegexBuilder/Variadics.swift | 59 +-- .../VariadicsGenerator.swift | 31 +- .../Regex/Parse/CaptureList.swift | 33 +- Sources/_StringProcessing/ByteCodeGen.swift | 5 +- Sources/_StringProcessing/Capture.swift | 2 +- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../Engine/Structuralize.swift | 3 +- .../_StringProcessing/PrintAsPattern.swift | 3 + .../Regex/AnyRegexOutput.swift | 4 + Sources/_StringProcessing/Regex/DSLTree.swift | 61 ++- .../Utility/RegexFactory.swift | 10 + Tests/RegexBuilderTests/AlgorithmsTests.swift | 2 + .../AnyRegexOutputTests.swift | 1 + Tests/RegexBuilderTests/CustomTests.swift | 161 ++++++-- Tests/RegexBuilderTests/MotivationTests.swift | 7 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 386 ++++++++++++------ Tests/RegexTests/CaptureTests.swift | 6 +- 19 files changed, 605 insertions(+), 239 deletions(-) diff --git a/Package.swift b/Package.swift index b30c402c4..5d45950db 100644 --- a/Package.swift +++ b/Package.swift @@ -8,6 +8,10 @@ let availabilityDefinition = PackageDescription.SwiftSetting.unsafeFlags([ "-define-availability", "-Xfrontend", "SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999", + "-Xfrontend", + "-define-availability", + "-Xfrontend", + "SwiftStdlib 5.8:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999", ]) /// Swift settings for building a private stdlib-like module that is to be used @@ -87,7 +91,7 @@ let package = Package( name: "RegexBuilderTests", dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) + availabilityDefinition ]), .testTarget( name: "DocumentationTests", diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 152aadd0c..680f3bd2f 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -508,3 +508,65 @@ extension Regex.Match { internal func makeFactory() -> 
_RegexFactory { _RegexFactory() } + +/// These are special `accumulate` methods that wrap one or both components in +/// a node that indicates that their output types shouldn't be included in +/// the resulting strongly-typed output type. This is required from a +/// `buildPartialBlock` call where a component's output type is either ignored +/// or not included in the resulting type. For example: +/// +/// static func buildPartialBlock( +/// accumulated: R0, next: R1 +/// ) -> Regex<(Substring, C1)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1) +/// +/// In this `buildPartialBlock` overload, `W0` isn't included in the +/// resulting output type, even though it can match any output type, including +/// a tuple. When `W0` matches a tuple type that doesn't match another overload +/// (because of arity or labels) we need this "ignoring" variant so that we +/// don't have a type mismatch when we ultimately cast the type-erased output +/// to the expected type. +@available(SwiftStdlib 5.7, *) +extension _RegexFactory { + /// Concatenates the `left` and `right` component, wrapping `right` to + /// indicate that its output type shouldn't be included in the resulting + /// strongly-typed output type. + @_alwaysEmitIntoClient + internal func accumulate( + _ left: some RegexComponent, + ignoringOutputTypeOf right: some RegexComponent + ) -> Regex { + if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { + return accumulate(left, ignoreCapturesInTypedOutput(right)) + } + return accumulate(left, right) + } + + /// Concatenates the `left` and `right` component, wrapping `left` to + /// indicate that its output type shouldn't be included in the resulting + /// strongly-typed output type. 
+ @_alwaysEmitIntoClient + internal func accumulate( + ignoringOutputTypeOf left: some RegexComponent, + _ right: some RegexComponent + ) -> Regex { + if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { + return accumulate(ignoreCapturesInTypedOutput(left), right) + } + return accumulate(left, right) + } + + /// Concatenates the `left` and `right` component, wrapping both sides to + /// indicate that their output types shouldn't be included in the resulting + /// strongly-typed output type. + @_alwaysEmitIntoClient + internal func accumulate( + ignoringOutputTypeOf left: some RegexComponent, + andAlso right: some RegexComponent + ) -> Regex { + if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { + return accumulate( + ignoreCapturesInTypedOutput(left), ignoreCapturesInTypedOutput(right)) + } + return accumulate(left, right) + } +} diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index 0f19cd6b0..f11727521 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -2,7 +2,7 @@ // // This source file is part of the Swift.org open source project // -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Copyright (c) 2021-2023 Apple Inc. 
and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information @@ -20,7 +20,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -30,7 +30,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -40,7 +40,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -50,7 +50,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -60,7 +60,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -70,7 +70,7 @@ extension 
RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5, C6)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5, C6) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -80,7 +80,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5, C6, C7)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5, C6, C7) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -90,7 +90,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5, C6, C7, C8)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5, C6, C7, C8) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -100,7 +100,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5, C6, C7, C8, C9)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5, C6, C7, C8, C9) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -110,7 +110,7 @@ extension RegexComponentBuilder { accumulated: R0, next: R1 ) -> Regex<(Substring, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10)> where R0.RegexOutput == W0, R1.RegexOutput == (W1, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) } } @available(SwiftStdlib 5.7, *) @@ -565,123 +565,112 @@ 
extension RegexComponentBuilder { } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex where R0.RegexOutput == W0 { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(ignoringOutputTypeOf: accumulated, andAlso: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0)> where R0.RegexOutput == (W0, C0) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1)> where R0.RegexOutput == (W0, C0, C1) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2)> where R0.RegexOutput == (W0, C0, C1, C2) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3)> where R0.RegexOutput == (W0, C0, C1, C2, C3) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return 
factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4, C5)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4, C5) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4, C5, C6)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4, C5, C6) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4, C5, C6, C7)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4, C5, C6, C7) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, 
next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4, C5, C6, C7, C8) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @available(SwiftStdlib 5.7, *) extension RegexComponentBuilder { - @available(SwiftStdlib 5.7, *) @_alwaysEmitIntoClient public static func buildPartialBlock( accumulated: R0, next: R1 ) -> Regex<(Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9)> where R0.RegexOutput == (W0, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { let factory = makeFactory() - return factory.accumulate(accumulated, next) + return factory.accumulate(accumulated, ignoringOutputTypeOf: next) } } @@ -6884,7 +6873,3 @@ extension TryCapture { self.init(factory.captureOptional(componentBuilder(), reference._raw, transform)) } } - - - -// END AUTO-GENERATED CONTENT diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index 180bfb168..3853d27b5 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -132,7 +132,7 @@ struct VariadicsGenerator: ParsableCommand { // // This source file is part of the Swift.org open source project // - // Copyright (c) 2021-2022 Apple Inc. and the Swift project authors + // Copyright (c) 2021-2023 Apple Inc. 
and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information @@ -262,7 +262,20 @@ struct VariadicsGenerator: ParsableCommand { accumulated: R0, next: R1 ) -> \(regexTypeName)<\(matchType)> \(whereClause) { let factory = makeFactory() + + """) + if leftArity == 0 { + output(""" + return factory.accumulate(ignoringOutputTypeOf: accumulated, next) + + """) + } else { + output(""" return factory.accumulate(accumulated, next) + + """) + } + output(""" } } @@ -274,7 +287,6 @@ struct VariadicsGenerator: ParsableCommand { output(""" \(defaultAvailableAttr) extension \(concatBuilderName) { - \(defaultAvailableAttr) @_alwaysEmitIntoClient public static func buildPartialBlock CaptureList { var builder = Self() - builder.captures.append(.init(optionalDepth: 0, .fake)) - builder.addCaptures(of: ast.root, optionalNesting: .init(canNest: false)) + builder.captures.append(.init(optionalDepth: 0, visibleInTypedOutput: true, .fake)) + builder.addCaptures(of: ast.root, optionalNesting: .init(canNest: false), visibleInTypedOutput: true) return builder.captures } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index e0a6c7465..15e052901 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -874,7 +874,7 @@ fileprivate extension Compiler.ByteCodeGen { switch node { case .concatenation(let ch): return ch.flatMap(flatten) - case .convertedRegexLiteral(let n, _): + case .convertedRegexLiteral(let n, _), .ignoreCapturesInTypedOutput(let n): return flatten(n) default: return [node] @@ -951,6 +951,9 @@ fileprivate extension Compiler.ByteCodeGen { case let .nonCapturingGroup(kind, child): try emitNoncapturingGroup(kind.ast, child) + case let .ignoreCapturesInTypedOutput(child): + try emitNode(child) + case .conditional: throw Unsupported("Conditionals") diff --git 
a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index b75d01392..696a85361 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -61,7 +61,7 @@ extension Sequence where Element == AnyRegexOutput.Element { // and traffic through existentials @available(SwiftStdlib 5.7, *) func existentialOutput(from input: String) -> Any { - let elements = map { + let elements = filter(\.representation.visibleInTypedOutput).map { $0.existentialOutputComponent(from: input) } return elements.count == 1 diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 3a2731b0a..705b354fb 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -42,7 +42,7 @@ extension DSLTree.Node { case .orderedChoice, .conditional, .concatenation, .capture, .nonCapturingGroup, .quantification, .trivia, .empty, - .absentFunction: return nil + .ignoreCapturesInTypedOutput, .absentFunction: return nil case .consumer: fatalError("FIXME: Is this where we handle them?") diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index bc3adf701..32d7a6204 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -14,7 +14,8 @@ extension CaptureList { optionalDepth: cap.optionalDepth, content: meStored.deconstructed, name: cap.name, - referenceID: list.referencedCaptureOffsets.first { $1 == i }?.key + referenceID: list.referencedCaptureOffsets.first { $1 == i }?.key, + visibleInTypedOutput: cap.visibleInTypedOutput ) result.append(element) diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 953df6882..8b456f37d 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ 
b/Sources/_StringProcessing/PrintAsPattern.swift @@ -131,6 +131,9 @@ extension PrettyPrinter { printer.printAsPattern(convertedFromAST: child) } + case let .ignoreCapturesInTypedOutput(child): + printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case .conditional: print("/* TODO: conditional */") diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index fd292ed1b..243c1ba01 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -359,6 +359,10 @@ extension AnyRegexOutput { /// The capture reference this element refers to. var referenceID: ReferenceID? = nil + + /// A Boolean value indicating whether this capture should be included in + /// the typed output. + var visibleInTypedOutput: Bool } internal init(input: String, elements: [ElementRepresentation]) { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 0a0831706..93e86c607 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -42,6 +42,9 @@ extension DSLTree { /// Matches a noncapturing subpattern. case nonCapturingGroup(_AST.GroupKind, Node) + /// Marks all captures in a subpattern as ignored in strongly-typed output. + case ignoreCapturesInTypedOutput(Node) + // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -340,6 +343,27 @@ typealias _CharacterPredicateInterface = ( */ extension DSLTree.Node { + /// Indicates whether this node has at least one child node (among other + /// associated values). 
+ var hasChildNodes: Bool { + switch self { + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return false + + case .orderedChoice(let c), .concatenation(let c): + return !c.isEmpty + + case .convertedRegexLiteral, .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .conditional: + return true + + case .absentFunction(let abs): + return !abs.ast.children.isEmpty + } + } + @_spi(RegexBuilder) public var children: [DSLTree.Node] { switch self { @@ -354,6 +378,7 @@ extension DSLTree.Node { case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -403,11 +428,13 @@ extension DSLTree { } extension DSLTree { + /// Indicates whether this DSLTree contains any capture groups. var hasCapture: Bool { root.hasCapture } } extension DSLTree.Node { + /// Indicates whether this DSLTree node contains any capture groups. 
var hasCapture: Bool { switch self { case .capture: @@ -572,52 +599,55 @@ struct CaptureTransform: Hashable, CustomStringConvertible { extension CaptureList.Builder { mutating func addCaptures( - of node: DSLTree.Node, optionalNesting nesting: OptionalNesting + of node: DSLTree.Node, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool ) { switch node { case let .orderedChoice(children): for child in children { - addCaptures(of: child, optionalNesting: nesting.addingOptional) + addCaptures(of: child, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) } case let .concatenation(children): for child in children { - addCaptures(of: child, optionalNesting: nesting) + addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) } case let .capture(name, _, child, transform): captures.append(.init( name: name, type: transform?.resultType ?? child.wholeMatchType, - optionalDepth: nesting.depth, .fake)) - addCaptures(of: child, optionalNesting: nesting) + optionalDepth: nesting.depth, visibleInTypedOutput: visibleInTypedOutput, .fake)) + addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) case let .nonCapturingGroup(kind, child): assert(!kind.ast.isCapturing) - addCaptures(of: child, optionalNesting: nesting) + addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) + + case let .ignoreCapturesInTypedOutput(child): + addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) case let .conditional(cond, trueBranch, falseBranch): switch cond.ast { case .group(let g): - addCaptures(of: .group(g), optionalNesting: nesting) + addCaptures(of: .group(g), optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) default: break } - addCaptures(of: trueBranch, optionalNesting: nesting.addingOptional) - addCaptures(of: falseBranch, optionalNesting: nesting.addingOptional) + addCaptures(of: 
trueBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) + addCaptures(of: falseBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) case let .quantification(amount, _, child): var optNesting = nesting if amount.ast.bounds.atLeast == 0 { optNesting = optNesting.addingOptional } - addCaptures(of: child, optionalNesting: optNesting) + addCaptures(of: child, optionalNesting: optNesting, visibleInTypedOutput: visibleInTypedOutput) case let .absentFunction(abs): switch abs.ast.kind { case .expression(_, _, let child): - addCaptures(of: child, optionalNesting: nesting) + addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) case .clearer, .repeater, .stopper: break } @@ -625,7 +655,7 @@ extension CaptureList.Builder { case let .convertedRegexLiteral(n, _): // We disable nesting for converted AST trees, as literals do not nest // captures. This includes literals nested in a DSL. - return addCaptures(of: n, optionalNesting: nesting.disablingNesting) + return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) case .matcher: break @@ -639,8 +669,8 @@ extension CaptureList.Builder { static func build(_ dsl: DSLTree) -> CaptureList { var builder = Self() builder.captures.append( - .init(type: dsl.root.wholeMatchType, optionalDepth: 0, .fake)) - builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true)) + .init(type: dsl.root.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) + builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) return builder.captures } } @@ -650,7 +680,7 @@ extension DSLTree.Node { /// output but forwarding its only child's output. 
var isOutputForwarding: Bool { switch self { - case .nonCapturingGroup: + case .nonCapturingGroup, .ignoreCapturesInTypedOutput: return true case .orderedChoice, .concatenation, .capture, .conditional, .quantification, .customCharacterClass, .atom, @@ -710,6 +740,7 @@ extension DSLTree { case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] + case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index e0df906fa..584772921 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -20,6 +20,16 @@ public struct _RegexFactory { // Hide is behind an SPI that only RegexBuilder can use. @_spi(RegexBuilder) public init() {} + + @available(SwiftStdlib 5.8, *) + public func ignoreCapturesInTypedOutput( + _ child: some RegexComponent + ) -> Regex { + // Don't wrap `child` again if it's a leaf node. + child.regex.root.hasChildNodes + ? .init(node: .ignoreCapturesInTypedOutput(child.regex.root)) + : .init(node: child.regex.root) + } @available(SwiftStdlib 5.7, *) public func accumulate( diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index dcaddd9d7..7d24e30af 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -13,6 +13,7 @@ import XCTest import _StringProcessing import RegexBuilder +@available(SwiftStdlib 5.7, *) class RegexConsumerTests: XCTestCase { func testMatches() { let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! 
} @@ -105,6 +106,7 @@ class RegexConsumerTests: XCTestCase { } } +@available(SwiftStdlib 5.7, *) class AlgorithmsResultBuilderTests: XCTestCase { enum MatchAlgo { case whole diff --git a/Tests/RegexBuilderTests/AnyRegexOutputTests.swift b/Tests/RegexBuilderTests/AnyRegexOutputTests.swift index e6c3214b9..165d1d411 100644 --- a/Tests/RegexBuilderTests/AnyRegexOutputTests.swift +++ b/Tests/RegexBuilderTests/AnyRegexOutputTests.swift @@ -5,6 +5,7 @@ import RegexBuilder private let enablePrinting = false +@available(SwiftStdlib 5.7, *) extension RegexDSLTests { func testContrivedAROExample() { diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index d34b5689f..85186b684 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -14,10 +14,12 @@ import _StringProcessing @testable import RegexBuilder // A nibbler processes a single character from a string +@available(SwiftStdlib 5.7, *) private protocol Nibbler: CustomConsumingRegexComponent { func nibble(_: Character) -> RegexOutput? } +@available(SwiftStdlib 5.7, *) extension Nibbler { // Default implementation, just feed the character in func consuming( @@ -34,6 +36,7 @@ extension Nibbler { // A number nibbler +@available(SwiftStdlib 5.7, *) private struct Numbler: Nibbler { typealias RegexOutput = Int func nibble(_ c: Character) -> Int? { @@ -42,6 +45,7 @@ private struct Numbler: Nibbler { } // An ASCII value nibbler +@available(SwiftStdlib 5.7, *) private struct Asciibbler: Nibbler { typealias RegexOutput = UInt8 func nibble(_ c: Character) -> UInt8? 
{ @@ -49,6 +53,7 @@ private struct Asciibbler: Nibbler { } } +@available(SwiftStdlib 5.7, *) private struct IntParser: CustomConsumingRegexComponent { struct ParseError: Error, Hashable {} typealias RegexOutput = Int @@ -71,6 +76,7 @@ private struct IntParser: CustomConsumingRegexComponent { } } +@available(SwiftStdlib 5.7, *) private struct CurrencyParser: CustomConsumingRegexComponent { enum Currency: String, Hashable { case usd = "USD" @@ -117,9 +123,12 @@ enum MatchCall { case firstMatch } -func customTest( +@available(SwiftStdlib 5.7, *) +fileprivate func customTest( _ regex: Regex, - _ tests: (input: String, call: MatchCall, match: Match?)... + _ tests: (input: String, call: MatchCall, match: Match?)..., + file: StaticString = #file, + line: UInt = #line ) { for (input, call, match) in tests { let result: Match? @@ -129,7 +138,40 @@ func customTest( case .firstMatch: result = input.firstMatch(of: regex)?.output } - XCTAssertEqual(result, match) + XCTAssertEqual(result, match, file: file, line: line) + } +} + +@available(SwiftStdlib 5.7, *) +fileprivate func customTest( + _ regex: some RegexComponent, + _ isEquivalent: (Match, Match) -> Bool, + _ tests: (input: String, call: MatchCall, match: Match?)..., + file: StaticString = #file, + line: UInt = #line +) { + for (input, call, match) in tests { + let result: Match? 
+ switch call { + case .match: + result = input.wholeMatch(of: regex)?.output + case .firstMatch: + result = input.firstMatch(of: regex)?.output + } + switch (result, match) { + case let (result?, match?): + XCTAssert( + isEquivalent(result, match), + "'\(result)' isn't equal to '\(match)'.", + file: file, line: line) + case (nil, nil): + // Success + break + case (nil, _): + XCTFail("No match when expected", file: file, line: line) + case (_, nil): + XCTFail("Unexpected match", file: file, line: line) + } } } @@ -178,6 +220,7 @@ extension Concat: BidirectionalCollection { } } +@available(SwiftStdlib 5.7, *) class CustomRegexComponentTests: XCTestCase { // TODO: Refactor below into more exhaustive, declarative // tests. @@ -211,39 +254,91 @@ class CustomRegexComponentTests: XCTestCase { ("55z", .match, nil), ("55z", .firstMatch, 5)) - // TODO: Convert below tests to better infra. Right now - // it's hard because `Match` is constrained to be - // `Equatable` which tuples cannot be. +// customTest( +// Regex { +// #/(?\D+)/# +// Optionally("~") +// }, +// ("ab123c", .firstMatch, "ab"), +// ("abc", .firstMatch, "abc"), +// ("123", .firstMatch, nil), +// ("a55z", .match, nil), +// ("a55z", .firstMatch, "a")) + + customTest( + Regex<(Substring, Substring, Int)> { + #/(\D+)/# + Capture(Numbler()) + }, + ==, + ("ab123c", .firstMatch, ("ab1", "ab", 1)), + ("abc", .firstMatch, nil), + ("123", .firstMatch, nil), + ("a55z", .match, nil), + ("a55z", .firstMatch, ("a5", "a", 5))) - let regex3 = Regex { - Capture { + customTest( + Regex<(Substring, prefix: Substring)> { + #/(?\D+)/# + }, + ==, + ("ab123c", .firstMatch, ("ab", "ab")), + ("abc", .firstMatch, ("abc", "abc")), + ("123", .firstMatch, nil), + ("a55z", .match, nil), + ("a55z", .firstMatch, ("a", "a"))) + +// customTest( +// Regex<(Substring, Int)> { +// #/(?\D+)/# +// Capture(Numbler()) +// }, +// ==, +// ("ab123c", .firstMatch, ("ab1", 1)), +// ("abc", .firstMatch, nil), +// ("123", .firstMatch, nil), +// ("a55z", 
.match, nil), +// ("a55z", .firstMatch, ("a5", 5))) + +// customTest( +// Regex<(Substring, Int, Substring)> { +// #/(?\D+)/# +// Regex { +// Capture(Numbler()) +// Capture(OneOrMore(.word)) +// } +// }, +// ==, +// ("ab123c", .firstMatch, ("ab123c", 1, "23c")), +// ("abc", .firstMatch, nil), +// ("123", .firstMatch, nil), +// ("a55z", .match, ("a55z", 5, "5z")), +// ("a55z", .firstMatch, ("a55z", 5, "5z"))) + + customTest( + Regex<(Substring, Substring)> { + Capture { + OneOrMore { + Numbler() + } + } + }, + ==, + ("abc123", .firstMatch, ("123", "123")), + ("abc123", .match, nil), + ("abc", .firstMatch, nil)) + + customTest( + Regex<(Substring, Int)> { OneOrMore { - Numbler() + Capture { Numbler() } } - } - } - - let str = "ab123c" - let res3 = try XCTUnwrap(str.firstMatch(of: regex3)) - - let expectedSubstring = str.dropFirst(2).prefix(3) - XCTAssertEqual(res3.range, expectedSubstring.startIndex.. TransactionKind? in TransactionKind(rawValue: String(s)) } @@ -322,7 +323,7 @@ extension RegexDSLTests { Repeat(.digit, count: 2) Repeat(.digit, count: 2) Repeat(.digit, count: 4) - } transform: { (s: Substring) in + } transform: { (s: Substring) -> Date? in Date(mmddyyyy: String(s)) } @@ -345,7 +346,7 @@ extension RegexDSLTests { OneOrMore(.digit) "." Repeat(.digit, count: 2) - } transform: { (s: Substring) in + } transform: { (s: Substring) -> Double? 
in Double(s) } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 8b7611536..0dd050357 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -14,6 +14,7 @@ import _StringProcessing import RegexBuilder import TestSupport +@available(SwiftStdlib 5.7, *) class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: MatchType?)..., @@ -52,31 +53,31 @@ class RegexDSLTests: XCTestCase { file: file, line: line) } } - + func testSimpleStrings() throws { let regex = Regex { "a" Capture(Character("b")) // Character - TryCapture("1") { Int($0) } // Int + TryCapture { "1" } transform: { Int($0) } // Int } // Assert the inferred capture type. let _: (Substring, Substring, Int).Type = type(of: regex).RegexOutput.self let maybeMatch = "ab1".wholeMatch(of: regex) let match = try XCTUnwrap(maybeMatch) XCTAssertTrue(match.output == ("ab1", "b", 1)) - + let substring = "ab1"[...] let substringMatch = try XCTUnwrap(substring.wholeMatch(of: regex)) XCTAssertTrue(match.output == substringMatch.output) } - + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" - + func testCharacterClasses() throws { // Must have new stdlib for character class ranges. 
guard ensureNewStdlib() else { return } - + try _testDSLCaptures( ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) @@ -94,7 +95,7 @@ class RegexDSLTests: XCTestCase { OneOrMore { CharacterClass("a"..."z", .digit) } - + // Second group OneOrMore { ChoiceOf { @@ -103,7 +104,7 @@ class RegexDSLTests: XCTestCase { } } } - + try _testDSLCaptures( ("abc1def2", ("abc1def2", "abc1")), matchType: (Substring, Substring).self, ==) @@ -112,12 +113,12 @@ class RegexDSLTests: XCTestCase { OneOrMore(.digit.inverted) ("a"..."z").inverted } - + OneOrMore { CharacterClass.whitespace.inverted } } - + // `.newlineSequence` and `.verticalWhitespace` match the same set of // newlines in grapheme semantic mode, and scalar mode when applied with // OneOrMore. @@ -146,7 +147,7 @@ class RegexDSLTests: XCTestCase { } }.matchingSemantics(mode) } - + // Try with ASCII-only whitespace. try _testDSLCaptures( ("\n", ("\n", "\n")), @@ -173,7 +174,7 @@ class RegexDSLTests: XCTestCase { } } } - + // `.newlineSequence` in scalar mode may match a single `\r\n`. // `.verticalWhitespace` may not. for asciiOnly in [true, false] { @@ -224,7 +225,7 @@ class RegexDSLTests: XCTestCase { }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) } } - + // Make sure horizontal whitespace does not match newlines or other // vertical whitespace. try _testDSLCaptures( @@ -237,7 +238,7 @@ class RegexDSLTests: XCTestCase { { OneOrMore(.horizontalWhitespace) } - + // Horizontal whitespace in ASCII mode. try _testDSLCaptures( (" \u{9} \t ", " \u{9} \t "), @@ -249,11 +250,11 @@ class RegexDSLTests: XCTestCase { }.asciiOnlyWhitespace() } } - + func testCharacterClassOperations() throws { // Must have new stdlib for character class ranges. 
guard ensureNewStdlib() else { return } - + try _testDSLCaptures( ("bcdefn1a", "bcdefn1a"), ("nbcdef1a", nil), // fails symmetric difference lookahead @@ -265,15 +266,15 @@ class RegexDSLTests: XCTestCase { let disallowedChars = CharacterClass.hexDigit .symmetricDifference("a"..."z") NegativeLookahead(disallowedChars) // No: 0-9 + g-z - + OneOrMore(("b"..."g").union("d"..."n")) // b-n CharacterClass.digit.subtracting("3"..."9") // 1, 2, non-ascii digits - + CharacterClass.hexDigit.intersection("a"..."z") // a-f } } - + func testAny() throws { // .any matches newlines regardless of matching options. for dotMatchesNewline in [true, false] { @@ -286,7 +287,7 @@ class RegexDSLTests: XCTestCase { }.dotMatchesNewlines(dotMatchesNewline) } } - + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { try _testDSLCaptures( @@ -301,7 +302,7 @@ class RegexDSLTests: XCTestCase { One(.anyGraphemeCluster) }.matchingSemantics(mode) } - + // Like `.any` it also always matches newlines. for dotMatchesNewline in [true, false] { try _testDSLCaptures( @@ -315,7 +316,7 @@ class RegexDSLTests: XCTestCase { } } } - + func testAnyNonNewline() throws { // `.anyNonNewline` is `.` without single-line mode. 
for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { @@ -332,7 +333,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.anyNonNewline) }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) } - + try _testDSLCaptures( ("abcdef", nil), ("abcdef\n", nil), @@ -345,7 +346,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.anyNonNewline.inverted) }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) } - + try _testDSLCaptures( ("abc", "abc"), ("abcd", nil), @@ -360,7 +361,7 @@ class RegexDSLTests: XCTestCase { } } } - + try _testDSLCaptures( ("\r\n", "\r\n"), matchType: Substring.self, ==) { CharacterClass.anyNonNewline.inverted @@ -372,12 +373,12 @@ class RegexDSLTests: XCTestCase { }.matchingSemantics(.unicodeScalar) } } - + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") } - + func testAlternation() throws { do { let regex = ChoiceOf { @@ -446,7 +447,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNil("aab".wholeMatch(of: regex)?.output) } } - + func testCombinators() throws { try _testDSLCaptures( ("aaaabccccdddkj", ("aaaabccccdddkj", "b", "cccc", "d", "k", nil, "j")), @@ -497,7 +498,7 @@ class RegexDSLTests: XCTestCase { .ignoresCase(true) .ignoresCase(false) } - + // An option on an outer component doesn't override an option set on an // inner component. try _testDSLCaptures( @@ -518,7 +519,7 @@ class RegexDSLTests: XCTestCase { } .ignoresCase(false) } - + // FIXME: Re-enable this test try _testDSLCaptures( ("can't stop won't stop", ("can't stop won't stop", "can't", "won't")), @@ -538,7 +539,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.any, .reluctant) "stop" } - + // FIXME: Re-enable this test try _testDSLCaptures( ("can't stop won't stop", ("can't stop won't stop", "can", "won")), @@ -599,7 +600,7 @@ class RegexDSLTests: XCTestCase { func testQuantificationBehavior() throws { // Must have new stdlib for character class ranges. 
guard ensureNewStdlib() else { return } - + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), @@ -609,7 +610,7 @@ class RegexDSLTests: XCTestCase { Capture(.digit) ZeroOrMore(.any) } - + // Explicitly reluctant try _testDSLCaptures( ("abc1def2", ("abc1def2", "1")), @@ -700,7 +701,7 @@ class RegexDSLTests: XCTestCase { OneOrMore("a") }.repetitionBehavior(.possessive) } - + try _testDSLCaptures( ("abc1def2", "abc1def2"), matchType: Substring.self, ==) @@ -712,7 +713,7 @@ class RegexDSLTests: XCTestCase { CharacterClass.digit } } - + try _testDSLCaptures( ("abcdef2", ("abcdef2", "f")), ("2", ("2", nil)), @@ -726,7 +727,7 @@ class RegexDSLTests: XCTestCase { CharacterClass.digit } } - + try _testDSLCaptures( ("aaabbbcccdddeeefff", "aaabbbcccdddeeefff"), ("aaabbbcccccdddeeefff", "aaabbbcccccdddeeefff"), @@ -748,7 +749,7 @@ class RegexDSLTests: XCTestCase { Repeat(2...) { "e" } Repeat(0...) { "f" } } - + try _testDSLCaptures( ("", nil), ("a", nil), @@ -758,7 +759,7 @@ class RegexDSLTests: XCTestCase { { Repeat(2...) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", "a"), @@ -768,7 +769,7 @@ class RegexDSLTests: XCTestCase { { Repeat(...2) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", "a"), @@ -778,7 +779,7 @@ class RegexDSLTests: XCTestCase { { Repeat(..<2) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", nil), @@ -787,7 +788,7 @@ class RegexDSLTests: XCTestCase { { Repeat(...0) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", nil), @@ -796,7 +797,7 @@ class RegexDSLTests: XCTestCase { { Repeat(0 ... 0) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", nil), @@ -805,7 +806,7 @@ class RegexDSLTests: XCTestCase { { Repeat(count: 0) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", "a"), @@ -814,7 +815,7 @@ class RegexDSLTests: XCTestCase { { Repeat(0 ... 1) { "a" } } - + try _testDSLCaptures( ("", nil), ("a", "a"), @@ -824,7 +825,7 @@ class RegexDSLTests: XCTestCase { { Repeat(1 ... 
2) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", nil), @@ -833,7 +834,7 @@ class RegexDSLTests: XCTestCase { { Repeat(0 ..< 1) { "a" } } - + try _testDSLCaptures( ("", ""), ("a", "a"), @@ -842,7 +843,7 @@ class RegexDSLTests: XCTestCase { { Repeat(0 ..< 2) { "a" } } - + try _testDSLCaptures( ("", nil), ("a", "a"), @@ -852,7 +853,7 @@ class RegexDSLTests: XCTestCase { { Repeat(1 ..< 3) { "a" } } - + let octoDecimalRegex: Regex<(Substring, Int?)> = Regex { let charClass = CharacterClass(.digit, "a"..."h")//.ignoringCase() Capture { @@ -907,7 +908,7 @@ class RegexDSLTests: XCTestCase { UnicodeScalar("e") Anchor.textSegmentBoundary } - + try _testDSLCaptures( ("aaaaa1", "aaaaa1"), ("aaaaa2", nil), @@ -934,7 +935,7 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - + try _testDSLCaptures( ("\naaa", "\naaa"), ("aaa\n", "aaa\n"), @@ -949,7 +950,7 @@ class RegexDSLTests: XCTestCase { Optionally { "\n" } } } - + // startOfLine/endOfLine apply regardless of mode. 
for matchLineEndings in [true, false] { for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { @@ -958,41 +959,41 @@ class RegexDSLTests: XCTestCase { Repeat("a", count: 3) Anchor.endOfLine }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) - + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) - + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } - + func testNestedGroups() throws { return; - + // TODO: clarify what the nesting story is - + /* - try _testDSLCaptures( - ("aaaabccccddd", ("aaaabccccddd", [("b", "cccc", ["d", "d", "d"])])), - matchType: (Substring, [(Substring, Substring, [Substring])]).self, ==) - { - "a".+ - OneOrMore { - Capture(OneOrMore("b")) - Capture(ZeroOrMore("c")) - Capture("d").* - "e".? - } - } + try _testDSLCaptures( + ("aaaabccccddd", ("aaaabccccddd", [("b", "cccc", ["d", "d", "d"])])), + matchType: (Substring, [(Substring, Substring, [Substring])]).self, ==) + { + "a".+ + OneOrMore { + Capture(OneOrMore("b")) + Capture(ZeroOrMore("c")) + Capture("d").* + "e".? 
+ } + } */ } - + func testCaptureTransform() throws { try _testDSLCaptures( ("aaaa1", ("aaaa1", "aaa")), @@ -1015,7 +1016,7 @@ class RegexDSLTests: XCTestCase { One(.digit) } } - + func testCapturelessQuantification() throws { // This test is to make sure that a captureless quantification, when used // straight out of the quantifier (without being wrapped in a builder), is @@ -1028,7 +1029,7 @@ class RegexDSLTests: XCTestCase { let match = try XCTUnwrap(input.wholeMatch(of: regex)?.output) XCTAssertTrue(match == input) } - + func testQuantificationWithTransformedCapture() throws { // This test is to make sure transformed capture type information is // correctly propagated from the DSL into the bytecode and that the engine @@ -1037,7 +1038,7 @@ class RegexDSLTests: XCTestCase { enum Word: Int32 { case apple case orange - + init?(_ string: Substring) { switch string { case "apple": self = .apple @@ -1062,7 +1063,7 @@ class RegexDSLTests: XCTestCase { } } } - + func testNestedCaptureTypes() throws { let regex1 = Regex { OneOrMore("a") @@ -1072,8 +1073,8 @@ class RegexDSLTests: XCTestCase { } } let _: (Substring, Substring, Substring).Type - = type(of: regex1).RegexOutput.self - + = type(of: regex1).RegexOutput.self + let regex2 = Regex { OneOrMore("a") Capture { @@ -1084,8 +1085,8 @@ class RegexDSLTests: XCTestCase { } } let _: (Substring, Substring, Int?).Type - = type(of: regex2).RegexOutput.self - + = type(of: regex2).RegexOutput.self + let regex3 = Regex { OneOrMore("a") Capture { @@ -1097,8 +1098,8 @@ class RegexDSLTests: XCTestCase { } } let _: (Substring, Substring, Int, Double?).Type - = type(of: regex3).RegexOutput.self - + = type(of: regex3).RegexOutput.self + let regex4 = Regex { OneOrMore("a") Capture { @@ -1112,50 +1113,50 @@ class RegexDSLTests: XCTestCase { } let _: ( Substring, Substring, Substring, Substring, Substring?).Type - = type(of: regex4).RegexOutput.self + = type(of: regex4).RegexOutput.self } - + func testUnicodeScalarPostProcessing() throws 
{ let spaces = Regex { ZeroOrMore { One(.whitespace) } } - + let unicodeScalar = Regex { OneOrMore { One(.hexDigit) } spaces } - + let unicodeData = Regex { unicodeScalar Optionally { ".." unicodeScalar } - + ";" spaces - + Capture { OneOrMore(.word) } - + ZeroOrMore(.any) } - + // Assert the inferred capture type. let _: (Substring, Substring).Type = type(of: unicodeData).RegexOutput.self - + let unicodeLine = - "1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP" + "1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP" let match = try XCTUnwrap(unicodeLine.wholeMatch(of: unicodeData)) XCTAssertEqual(match.0, Substring(unicodeLine)) XCTAssertEqual(match.1, "Control") } - + func testGraphemeBreakData() throws { let line = """ A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS @@ -1191,7 +1192,7 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual(upper, Unicode.Scalar(0xA6F1)) XCTAssertEqual(propertyString, "Extend") } - + let regexWithTryCapture = Regex { TryCapture { OneOrMore(.hexDigit) @@ -1226,10 +1227,10 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual(upper, Unicode.Scalar(0xA6F1)) XCTAssertEqual(propertyString, "Extend") } - + do { let regexLiteral = try Regex( - #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, + #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, as: (Substring, Substring, Substring?, Substring).self) let maybeMatchResult = line.wholeMatch(of: regexLiteral) let matchResult = try XCTUnwrap(maybeMatchResult) @@ -1240,7 +1241,7 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual(propertyString, "Extend") } } - + func testBackreference() throws { try _testDSLCaptures( ("abc#41#42abcabcabc", ("abc#41#42abcabcabc", "abc", 42, "abc", nil)), @@ -1266,7 +1267,7 @@ class RegexDSLTests: XCTestCase { Capture(a) } } - + // Match result referencing a `Reference`. 
do { let a = Reference(Substring.self) @@ -1294,7 +1295,7 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual(result[a], "abc") XCTAssertEqual(result[b], 42) } - + do { let key = Reference(Substring.self) let value = Reference(Int.self) @@ -1312,15 +1313,15 @@ class RegexDSLTests: XCTestCase { } transform: { Int($0)! } } } - + let result1 = try XCTUnwrap("age:123".wholeMatch(of: regex)) XCTAssertEqual(result1[key], "age") XCTAssertEqual(result1[value], 123) - + let result2 = try XCTUnwrap(":567".wholeMatch(of: regex)) XCTAssertEqual(result2[key], "") XCTAssertEqual(result2[value], 567) - + let result3 = try XCTUnwrap("status:".wholeMatch(of: regex)) XCTAssertEqual(result3[key], "status") // Traps: @@ -1351,7 +1352,7 @@ class RegexDSLTests: XCTestCase { } } } - + // Post-hoc captured reference w/ attempted match before capture // #"(?:\w\1|(\w):)+"# // @@ -1400,7 +1401,7 @@ class RegexDSLTests: XCTestCase { } } } - + func testScalarMatching() throws { // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In // grapheme cluster mode, it should only match entire graphemes. 
It may @@ -1409,7 +1410,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) XCTAssertNotNil("a\u{301}".firstMatch( of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) - + let r1 = Regex { "a" as UnicodeScalar } @@ -1417,7 +1418,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil( try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") ) - + let r2 = Regex { CharacterClass.anyOf(["a" as UnicodeScalar, "๐Ÿ‘"]) } @@ -1425,7 +1426,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil( try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") ) - + let r3 = Regex { "๐Ÿ‘จ" as UnicodeScalar "\u{200D}" as UnicodeScalar @@ -1439,7 +1440,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil(try r3.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) - + let r4 = Regex { "รฉ" as UnicodeScalar } XCTAssertNotNil( try r4.firstMatch(in: "e\u{301}") @@ -1447,28 +1448,28 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil( try r4.firstMatch(in: "รฉ") ) - + let r5 = Regex { "e" "\u{301}" as UnicodeScalar } XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) XCTAssertNotNil(try r5.firstMatch(in: "รฉ")) - + let r6 = Regex { "abcde" "\u{301}" } XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) XCTAssertNotNil(try r6.firstMatch(in: "abcdรฉ")) - + let r7 = Regex { "e" as Character "\u{301}" as Character } XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) XCTAssertNotNil(try r7.firstMatch(in: "รฉ")) - + // You can't match a partial grapheme in grapheme semantic mode. 
let r8 = Regex { "๐Ÿ‘จ" as UnicodeScalar @@ -1481,7 +1482,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNil(try r8.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) - + // Scalar coalescing occurs across nested concatenations and literals. let r9 = Regex { Regex { @@ -1503,7 +1504,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil(try r9.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) - + let r10 = Regex { "๐Ÿ‘จ" as UnicodeScalar try! Regex(#"\u{200D 1F468 200D 1F467}"#) @@ -1515,7 +1516,7 @@ class RegexDSLTests: XCTestCase { XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) } - + struct SemanticVersion: Equatable { var major: Int var minor: Int @@ -1542,11 +1543,11 @@ class RegexDSLTests: XCTestCase { Capture(OneOrMore(.word)) } } - + guard let match = input[index..) throws -> (upperBound: String.Index, output: Void)? { print("Matching '\(label)'", to: &Self.traceOutput) print(input, to: &Self.traceOutput) @@ -1611,7 +1612,7 @@ class RegexDSLTests: XCTestCase { """) } - + func testRegexComponentBuilderResultType() { // Test that the user can declare a closure or computed property marked with // `@RegexComponentBuilder` with `Regex` as the result type. @@ -1654,7 +1655,7 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual(try replace("{bar}"), "foo") } - + func testOptionalNesting() throws { try _testDSLCaptures( ("a", ("a", nil)), @@ -1665,7 +1666,7 @@ class RegexDSLTests: XCTestCase { { try! 
Regex("(?:a|(b)*)?", as: (Substring, Substring?).self) } - + try _testDSLCaptures( ("a", ("a", nil)), ("", ("", nil)), @@ -1677,7 +1678,7 @@ class RegexDSLTests: XCTestCase { try! Regex("a|(b)*", as: (Substring, Substring?).self) } } - + try _testDSLCaptures( ("a", ("a", nil)), ("", ("", nil)), @@ -1692,7 +1693,7 @@ class RegexDSLTests: XCTestCase { } } } - + try _testDSLCaptures( ("a", ("a", nil)), ("", ("", nil)), @@ -1705,7 +1706,7 @@ class RegexDSLTests: XCTestCase { try! Regex("(b)*", as: (Substring, Substring?).self) } } - + try _testDSLCaptures( ("a", ("a", nil)), ("", ("", nil)), @@ -1720,7 +1721,7 @@ class RegexDSLTests: XCTestCase { } } } - + try _testDSLCaptures( ("a", ("a", nil)), ("", ("", nil)), @@ -1737,7 +1738,7 @@ class RegexDSLTests: XCTestCase { } } } - + let r = Regex { Optionally { Optionally { @@ -1757,6 +1758,139 @@ class RegexDSLTests: XCTestCase { } } +fileprivate let oneNumericField = "abc:123:def" +fileprivate let twoNumericFields = "abc:123:def:456:ghi" + +@available(SwiftStdlib 5.7, *) +fileprivate let regexWithCapture = #/:(\d+):/# +@available(SwiftStdlib 5.7, *) +fileprivate let regexWithLabeledCapture = #/:(?\d+):/# +@available(SwiftStdlib 5.7, *) +fileprivate let regexWithNonCapture = #/:(?:\d+):/# + +@available(SwiftStdlib 5.7, *) +extension RegexDSLTests { + func testLabeledCaptures_regularCapture() throws { + // The output type of a regex with unlabeled captures is concatenated. + let dslWithCapture = Regex { + OneOrMore(.word) + regexWithCapture + OneOrMore(.word) + } + XCTAssert(type(of: dslWithCapture).self == Regex<(Substring, Substring)>.self) + + let output = try XCTUnwrap(oneNumericField.wholeMatch(of: dslWithCapture)?.output) + XCTAssertEqual(output.0, oneNumericField[...]) + XCTAssertEqual(output.1, "123") + } + + func testLabeledCaptures_labeledCapture() throws { + guard #available(macOS 13, *) else { + XCTSkip("Fix only exists on macOS 13") + return + } + // The output type of a regex with a labeled capture is dropped. 
+ let dslWithLabeledCapture = Regex { + OneOrMore(.word) + regexWithLabeledCapture + OneOrMore(.word) + } + XCTAssert(type(of: dslWithLabeledCapture).self == Regex.self) + + let match = try XCTUnwrap(oneNumericField.wholeMatch(of: dslWithLabeledCapture)) + XCTAssertEqual(match.output, oneNumericField[...]) + + // We can recover the ignored captures by converting to `AnyRegexOutput`. + let anyOutput = AnyRegexOutput(match) + XCTAssertEqual(anyOutput.count, 2) + XCTAssertEqual(anyOutput[0].substring, oneNumericField[...]) + XCTAssertEqual(anyOutput[1].substring, "123") + XCTAssertEqual(anyOutput["number"]?.substring, "123") + } + + func testLabeledCaptures_coalescingWithCapture() throws { + let coalescingWithCapture = Regex { + "e" as Character + #/\u{301}(\d*)/# + } + XCTAssertNotNil(try coalescingWithCapture.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try coalescingWithCapture.firstMatch(in: "รฉ")) + + let coalescingWithLabeledCapture = Regex { + "e" as Character + #/\u{301}(?\d*)/# + } + XCTAssertNotNil(try coalescingWithLabeledCapture.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try coalescingWithLabeledCapture.firstMatch(in: "รฉ")) + } + + func testLabeledCaptures_bothCapture() throws { + guard #available(macOS 13, *) else { + XCTSkip("Fix only exists on macOS 13") + return + } + // Only the output type of a regex with a labeled capture is dropped, + // outputs of other regexes in the same DSL are concatenated. 
+ let dslWithBothCaptures = Regex { + OneOrMore(.word) + regexWithCapture + OneOrMore(.word) + regexWithLabeledCapture + OneOrMore(.word) + } + XCTAssert(type(of: dslWithBothCaptures).self == Regex<(Substring, Substring)>.self) + + let match = try XCTUnwrap(twoNumericFields.wholeMatch(of: dslWithBothCaptures)) + XCTAssertEqual(match.output.0, twoNumericFields[...]) + XCTAssertEqual(match.output.1, "123") + + let anyOutput = AnyRegexOutput(match) + XCTAssertEqual(anyOutput.count, 3) + XCTAssertEqual(anyOutput[0].substring, twoNumericFields[...]) + XCTAssertEqual(anyOutput[1].substring, "123") + XCTAssertEqual(anyOutput[2].substring, "456") + } + + func testLabeledCaptures_tooManyCapture() throws { + guard #available(macOS 13, *) else { + XCTSkip("Fix only exists on macOS 13") + return + } + // The output type of a regex with too many captures is dropped. + // "Too many" means the left and right output types would add up to >= 10. + let alpha = "AAA:abcdefghijklm:123:456:" + let regexWithTooManyCaptures = #/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)/# + let dslWithTooManyCaptures = Regex { + Capture(OneOrMore(.word)) + ":" + regexWithTooManyCaptures + ":" + TryCapture(OneOrMore(.word)) { Int($0) } + #/:(\d+):/# + } + XCTAssert(type(of: dslWithTooManyCaptures).self + == Regex<(Substring, Substring, Int, Substring)>.self) + + let match = try XCTUnwrap(alpha.wholeMatch(of: dslWithTooManyCaptures)) + XCTAssertEqual(match.output.0, alpha[...]) + XCTAssertEqual(match.output.1, "AAA") + XCTAssertEqual(match.output.2, 123) + XCTAssertEqual(match.output.3, "456") + + // All captures groups are available through `AnyRegexOutput`. 
+ let anyOutput = AnyRegexOutput(match) + XCTAssertEqual(anyOutput.count, 17) + XCTAssertEqual(anyOutput[0].substring, alpha[...]) + XCTAssertEqual(anyOutput[1].substring, "AAA") + for (offset, letter) in "abcdefghijklm".enumerated() { + XCTAssertEqual(anyOutput[offset + 2].substring, String(letter)[...]) + } + XCTAssertEqual(anyOutput[15].substring, "123") + XCTAssertEqual(anyOutput[15].value as? Int, 123) + XCTAssertEqual(anyOutput[16].substring, "456") + } +} + extension Unicode.Scalar { // Convert a hexadecimal string to a scalar init?(hex: S) { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 26093bc64..85aecd210 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -16,15 +16,15 @@ import XCTest extension CaptureList.Capture { static var cap: Self { - return Self(optionalDepth: 0, .fake) + return Self(optionalDepth: 0, visibleInTypedOutput: true, .fake) } static var opt: Self { - return Self(optionalDepth: 1, .fake) + return Self(optionalDepth: 1, visibleInTypedOutput: true, .fake) } static func named(_ name: String, opt: Int = 0) -> Self { - return Self(name: name, optionalDepth: opt, .fake) + return Self(name: name, optionalDepth: opt, visibleInTypedOutput: true, .fake) } } extension CaptureList {