diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d4c91bd63..00ce0d5f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -23,7 +23,9 @@ extension Compiler { var hasEmittedFirstMatchableAtom = false private let compileOptions: _CompileOptions - fileprivate var optimizationsEnabled: Bool { !compileOptions.contains(.disableOptimizations) } + fileprivate var optimizationsEnabled: Bool { + !compileOptions.contains(.disableOptimizations) + } init( options: MatchingOptions, @@ -665,10 +667,10 @@ fileprivate extension Compiler.ByteCodeGen { _ minTrips: Int, _ extraTrips: Int? ) -> Bool { + let isScalarSemantics = options.semanticLevel == .unicodeScalar guard optimizationsEnabled && minTrips <= QuantifyPayload.maxStorableTrips && extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips - && options.semanticLevel == .graphemeCluster && kind != .reluctant else { return false } @@ -678,7 +680,7 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = ccc.asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) + builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .atom(let atom): switch atom { @@ -687,17 +689,17 @@ fileprivate extension Compiler.ByteCodeGen { guard let val = c._singleScalarAsciiValue else { return false } - builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) + builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .any: builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, extraTrips) + matchesNewlines: true, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .anyNonNewline: builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, extraTrips) + matchesNewlines: false, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .dot: builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) + matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .characterClass(let cc): // Custom character class that consumes a single grapheme @@ -706,7 +708,8 @@ fileprivate extension Compiler.ByteCodeGen { model: model, kind, minTrips, - extraTrips) + extraTrips, + isScalarSemantics: isScalarSemantics) default: return false } diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 3ebb060c9..48470ce91 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -16,6 +16,11 @@ extension Processor { // Quantifiers may store a range of positions to restore to var rangeStart: Position? var rangeEnd: Position? + + // FIXME: refactor, for now this field is only used for quantifier save + // points. We should try to separate out the concerns better. + var isScalarSemantics: Bool + // The end of the call stack, so we can slice it off // when failing inside a call. // @@ -68,7 +73,11 @@ extension Processor { rangeStart = nil rangeEnd = nil } else { - input.formIndex(before: &rangeEnd!) + if isScalarSemantics { + input.unicodeScalars.formIndex(before: &rangeEnd!) + } else { + input.formIndex(before: &rangeEnd!) + } } } } @@ -82,19 +91,23 @@ extension Processor { pos: addressOnly ? nil : currentPosition, rangeStart: nil, rangeEnd: nil, + isScalarSemantics: false, // FIXME: refactor away stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, posRegisters: registers.positions) } - func startQuantifierSavePoint() -> SavePoint { + func startQuantifierSavePoint( + isScalarSemantics: Bool + ) -> SavePoint { // Restores to the instruction AFTER the current quantifier instruction SavePoint( pc: controller.pc + 1, pos: nil, rangeStart: nil, rangeEnd: nil, + isScalarSemantics: isScalarSemantics, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index f6d5bfcc7..a0e849851 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -370,6 +370,10 @@ extension Instruction.Payload { } } +// TODO: Consider switching all quantification to a quantification +// instruction, where the general path has an instruction list (i.e. a +// slice of a list) + // MARK: Struct definitions struct QuantifyPayload: RawRepresentable { let rawValue: UInt64 @@ -380,9 +384,12 @@ struct QuantifyPayload: RawRepresentable { case builtin = 4 } + // TODO: figure out how to better organize this... + // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with - // b55-b38 - Unused + // b55-b39 - Unused + // b39-b38 - isScalarSemantics // b38-b35 - Payload type (one of 4 types, stored on 3 bits) // b35-b27 - minTrips (8 bit int) // b27-b18 - extraTrips (8 bit value, one bit for nil) @@ -393,6 +400,7 @@ struct QuantifyPayload: RawRepresentable { static var minTripsShift: UInt64 { 27 } static var typeShift: UInt64 { 35 } static var maxStorableTrips: UInt64 { (1 << 8) - 1 } + static var isScalarSemanticsBit: UInt64 { 1 &<< 38 } var quantKindMask: UInt64 { 3 } var extraTripsMask: UInt64 { 0x1FF } @@ -404,7 +412,8 @@ struct QuantifyPayload: RawRepresentable { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int?, - _ type: PayloadType + _ type: PayloadType, + isScalarSemantics: Bool ) -> UInt64 { let kindVal: UInt64 switch kind { @@ -415,11 +424,14 @@ struct QuantifyPayload: RawRepresentable { case .possessive: kindVal = 2 } + // TODO: refactor / reimplement let extraTripsVal: UInt64 = extraTrips == nil ? 1 : UInt64(extraTrips!) << 1 - return (kindVal << QuantifyPayload.quantKindShift) + - (extraTripsVal << QuantifyPayload.extraTripsShift) + - (UInt64(minTrips) << QuantifyPayload.minTripsShift) + - (type.rawValue << QuantifyPayload.typeShift) + let scalarSemanticsBit = isScalarSemantics ? Self.isScalarSemanticsBit : 0 + return (kindVal << QuantifyPayload.quantKindShift) | + (extraTripsVal << QuantifyPayload.extraTripsShift) | + (UInt64(minTrips) << QuantifyPayload.minTripsShift) | + (type.rawValue << QuantifyPayload.typeShift) | + scalarSemanticsBit } init(rawValue: UInt64) { @@ -431,46 +443,49 @@ struct QuantifyPayload: RawRepresentable { bitset: AsciiBitsetRegister, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { assert(bitset.bits <= _payloadMask) self.rawValue = bitset.bits - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset, isScalarSemantics: isScalarSemantics) } init( asciiChar: UInt8, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { self.rawValue = UInt64(asciiChar) - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar, isScalarSemantics: isScalarSemantics) } init( matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { self.rawValue = (matchesNewlines ? 1 : 0) - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any, isScalarSemantics: isScalarSemantics) } init( model: _CharacterClassModel, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { assert(model.cc.rawValue < 0xFF) - assert(model.matchLevel != .unicodeScalar) let packedModel = model.cc.rawValue + (model.isInverted ? 1 << 9 : 0) + (model.isStrictASCII ? 1 << 10 : 0) self.rawValue = packedModel - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin, isScalarSemantics: isScalarSemantics) } var type: PayloadType { @@ -500,6 +515,10 @@ struct QuantifyPayload: RawRepresentable { } } + var isScalarSemantics: Bool { + rawValue & Self.isScalarSemanticsBit != 0 + } + var bitset: AsciiBitsetRegister { TypedInt(self.rawValue & payloadMask) } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 4b623fbda..93801aeec 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -222,44 +222,48 @@ extension MEProgram.Builder { bitset: DSLTree.CustomCharacterClass.AsciiBitset, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips)))) + .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantify( asciiChar: UInt8, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips)))) + .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantifyAny( matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips)))) + .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantify( model: _CharacterClassModel, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(model: model,kind, minTrips, extraTrips)))) + .init(quantify: .init(model: model,kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildAccept() { diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 7ca3ae84a..1ff734ccd 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,7 +1,6 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - // TODO: This optimization is only enabled for grapheme cluster semantics, - // we want these for scalar semantics as well. + let isScalarSemantics = payload.isScalarSemantics switch payload.type { case .bitset: @@ -9,13 +8,13 @@ extension Processor { registers[payload.bitset], at: currentPosition, limitedBy: end, - isScalarSemantics: false) + isScalarSemantics: isScalarSemantics) case .asciiChar: return input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, - boundaryCheck: true, + boundaryCheck: !isScalarSemantics, isCaseInsensitive: false) case .builtin: // FIXME: bounds check? endIndex or end? @@ -26,17 +25,20 @@ extension Processor { at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, - isScalarSemantics: false) + isScalarSemantics: isScalarSemantics) case .any: // FIXME: endIndex or end? guard currentPosition < input.endIndex else { return nil } if payload.anyMatchesNewline { + if isScalarSemantics { + return input.unicodeScalars.index(after: currentPosition) + } return input.index(after: currentPosition) } return input.matchAnyNonNewline( - at: currentPosition, isScalarSemantics: false) + at: currentPosition, isScalarSemantics: isScalarSemantics) } } @@ -46,7 +48,9 @@ extension Processor { mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { var trips = 0 var extraTrips = payload.extraTrips - var savePoint = startQuantifierSavePoint() + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) while true { if trips >= payload.minTrips { @@ -85,7 +89,9 @@ extension Processor { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.extraTrips == nil) - var savePoint = startQuantifierSavePoint() + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) while true { savePoint.updateRange(newEnd: currentPosition) @@ -107,7 +113,9 @@ extension Processor { assert(payload.quantKind == .eager && payload.minTrips == 1 && payload.extraTrips == nil) - var savePoint = startQuantifierSavePoint() + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) while true { let next = _doQuantifyMatch(payload) guard let idx = next else { break } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a6c9babbe..3fc547e34 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -620,6 +620,50 @@ extension RegexTests { // TODO: After captures, easier to test these } + func testQuantificationScalarSemantics() { + // TODO: We want more thorough testing here, including "a{n,m}", "a?", etc. + + firstMatchTest("a*", input: "aaa\u{301}", match: "aa") + firstMatchTest("a*", input: "aaa\u{301}", match: "aaa", semanticLevel: .unicodeScalar) + firstMatchTest("a+", input: "aaa\u{301}", match: "aa") + firstMatchTest("a+", input: "aaa\u{301}", match: "aaa", semanticLevel: .unicodeScalar) + firstMatchTest("a?", input: "a\u{301}", match: "") + firstMatchTest("a?", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar) + + firstMatchTest("[ab]*", input: "abab\u{301}", match: "aba") + firstMatchTest("[ab]*", input: "abab\u{301}", match: "abab", semanticLevel: .unicodeScalar) + firstMatchTest("[ab]+", input: "abab\u{301}", match: "aba") + firstMatchTest("[ab]+", input: "abab\u{301}", match: "abab", semanticLevel: .unicodeScalar) + firstMatchTest("[ab]?", input: "b\u{301}", match: "") + firstMatchTest("[ab]?", input: "b\u{301}", match: "b", semanticLevel: .unicodeScalar) + + firstMatchTest(#"\s*"#, input: " \u{301}", match: " \u{301}") + firstMatchTest(#"\s*"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar) + firstMatchTest(#"\s+"#, input: " \u{301}", match: " \u{301}") + firstMatchTest(#"\s+"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar) + firstMatchTest(#"\s?"#, input: " \u{301}", match: " \u{301}") + firstMatchTest(#"\s?"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar) + + firstMatchTest(#".*?a"#, input: "xxa\u{301}xaZ", match: "xxa\u{301}xa") + firstMatchTest(#".*?a"#, input: "xxa\u{301}xaZ", match: "xxa", semanticLevel: .unicodeScalar) + firstMatchTest(#".+?a"#, input: "xxa\u{301}xaZ", match: "xxa\u{301}xa") + firstMatchTest(#".+?a"#, input: "xxa\u{301}xaZ", match: "xxa", semanticLevel: .unicodeScalar) + firstMatchTest(#".?a"#, input: "e\u{301}aZ", match: "e\u{301}a") + firstMatchTest(#".?a"#, input: "e\u{301}aZ", match: "\u{301}a", semanticLevel: .unicodeScalar) + + firstMatchTest(#".+\u{301}"#, input: "aa\u{301}Z", match: nil) + firstMatchTest(#".+\u{301}"#, input: "aa\u{301}Z", match: "aa\u{301}", semanticLevel: .unicodeScalar) + firstMatchTest(#".*\u{301}"#, input: "\u{301}Z", match: "\u{301}") + firstMatchTest(#".*\u{301}"#, input: "\u{301}Z", match: "\u{301}", semanticLevel: .unicodeScalar) + + firstMatchTest(#".?\u{301}"#, input: "aa\u{302}\u{301}Z", match: nil) + firstMatchTest(#".?\u{301}.?Z"#, input: "aa\u{302}\u{301}Z", match: "\u{302}\u{301}Z", semanticLevel: .unicodeScalar) + firstMatchTest(#".?.?\u{301}.?Z"#, input: "aa\u{302}\u{301}Z", match: "a\u{302}\u{301}Z", semanticLevel: .unicodeScalar) + + + // TODO: other test cases? + } + func testMatchCharacterClasses() { // Must have new stdlib for character class ranges and word boundaries. guard ensureNewStdlib() else { return }