Skip to content

Commit 60b4d78

Browse files
committed
Remove most consumer functions
This is based heavily on the work in swiftlang#590, rebased onto main, with some changes to remove even more consumer uses. Consumer functions now have only two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations over their contents, with set operations emitted as instructions instead of being implemented via consumer functions.
1 parent 9ea9936 commit 60b4d78

File tree

7 files changed

+378
-274
lines changed

7 files changed

+378
-274
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 282 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,11 @@ fileprivate extension Compiler.ByteCodeGen {
243243
}
244244
}
245245

246-
mutating func emitAlternation(
247-
_ children: [DSLTree.Node]
248-
) throws {
246+
mutating func emitAlternationGen<C: BidirectionalCollection>(
247+
_ elements: C,
248+
withBacktracking: Bool,
249+
_ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void
250+
) rethrows {
249251
// Alternation: p0 | p1 | ... | pn
250252
// save next_p1
251253
// <code for p0>
@@ -263,16 +265,27 @@ fileprivate extension Compiler.ByteCodeGen {
263265
// <code for pn>
264266
// done:
265267
let done = builder.makeAddress()
266-
for component in children.dropLast() {
268+
for element in elements.dropLast() {
267269
let next = builder.makeAddress()
268270
builder.buildSave(next)
269-
try emitNode(component)
271+
try body(&self, element)
272+
if !withBacktracking {
273+
builder.buildClear()
274+
}
270275
builder.buildBranch(to: done)
271276
builder.label(next)
272277
}
273-
try emitNode(children.last!)
278+
try body(&self, elements.last!)
274279
builder.label(done)
275280
}
281+
282+
mutating func emitAlternation(
283+
_ children: [DSLTree.Node]
284+
) throws {
285+
try emitAlternationGen(children, withBacktracking: true) {
286+
try $0.emitNode($1)
287+
}
288+
}
276289

277290
mutating func emitConcatenationComponent(
278291
_ node: DSLTree.Node
@@ -828,19 +841,187 @@ fileprivate extension Compiler.ByteCodeGen {
828841
}
829842
}
830843

844+
/// Flatten quoted strings into sequences of atoms, so that the standard
845+
/// CCC codegen will handle them.
846+
func flatteningCustomCharacterClassMembers(
847+
_ members: [DSLTree.CustomCharacterClass.Member]
848+
) -> [DSLTree.CustomCharacterClass.Member] {
849+
var characters: Set<Character> = []
850+
var scalars: Set<UnicodeScalar> = []
851+
var result: [DSLTree.CustomCharacterClass.Member] = []
852+
for member in members {
853+
switch member {
854+
case .atom(let atom):
855+
switch atom {
856+
case let .char(char):
857+
characters.insert(char)
858+
case let .scalar(scalar):
859+
scalars.insert(scalar)
860+
default:
861+
result.append(member)
862+
}
863+
case let .quotedLiteral(str):
864+
characters.formUnion(str)
865+
default:
866+
result.append(member)
867+
}
868+
}
869+
result.append(contentsOf: characters.map { .atom(.char($0)) })
870+
result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
871+
return result
872+
}
873+
831874
func coalescingCustomCharacterClass(
832875
_ ccc: DSLTree.CustomCharacterClass
833876
) -> DSLTree.CustomCharacterClass {
834877
// This only needs to be done in grapheme semantic mode. In scalar semantic
835878
// mode, we don't want to coalesce any scalars into a grapheme. This
836879
// means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and
837880
// U+302.
838-
guard options.semanticLevel == .graphemeCluster else { return ccc }
839-
840-
let members = coalescingCustomCharacterClassMembers(ccc.members)
841-
return .init(members: members, isInverted: ccc.isInverted)
881+
let members = options.semanticLevel == .graphemeCluster
882+
? coalescingCustomCharacterClassMembers(ccc.members)
883+
: ccc.members
884+
return .init(
885+
members: flatteningCustomCharacterClassMembers(members),
886+
isInverted: ccc.isInverted)
842887
}
843888

889+
mutating func emitCharacterInCCC(_ c: Character) {
890+
switch options.semanticLevel {
891+
case .graphemeCluster:
892+
emitCharacter(c)
893+
case .unicodeScalar:
894+
// When in scalar mode, act like an alternation of the individual scalars
895+
// that comprise a character.
896+
emitAlternationGen(c.unicodeScalars, withBacktracking: false) {
897+
$0.emitMatchScalar($1)
898+
}
899+
}
900+
}
901+
902+
mutating func emitCCCMember(
903+
_ member: DSLTree.CustomCharacterClass.Member
904+
) throws {
905+
switch member {
906+
case .atom(let atom):
907+
switch atom {
908+
case .char(let c):
909+
emitCharacterInCCC(c)
910+
case .scalar(let s):
911+
emitCharacterInCCC(Character(s))
912+
default:
913+
try emitAtom(atom)
914+
}
915+
case .custom(let ccc):
916+
try emitCustomCharacterClass(ccc)
917+
case .quotedLiteral:
918+
fatalError("Removed in 'flatteningCustomCharacterClassMembers'")
919+
case .range:
920+
let consumer = try member.generateConsumer(options)
921+
builder.buildConsume(by: consumer)
922+
case .trivia:
923+
return
924+
925+
// TODO: Can we decide when it's better to try `rhs` first?
926+
// Intersection is trivial, since failure on either side propagates:
927+
// - store current position
928+
// - lhs
929+
// - restore current position
930+
// - rhs
931+
case let .intersection(lhs, rhs):
932+
let r = builder.makePositionRegister()
933+
builder.buildMoveCurrentPosition(into: r)
934+
try emitCustomCharacterClass(lhs)
935+
builder.buildRestorePosition(from: r)
936+
try emitCustomCharacterClass(rhs)
937+
938+
// TODO: Can we decide when it's better to try `rhs` first?
939+
// For subtraction, failure in `lhs` propagates, while failure in `rhs` is
940+
// swallowed/reversed:
941+
// - store current position
942+
// - lhs
943+
// - save to end
944+
// - restore current position
945+
// - rhs
946+
// - clear, fail (since both succeeded)
947+
// - end: ...
948+
case let .subtraction(lhs, rhs):
949+
let r = builder.makePositionRegister()
950+
let end = builder.makeAddress()
951+
builder.buildMoveCurrentPosition(into: r)
952+
try emitCustomCharacterClass(lhs) // no match here = failure, propagates
953+
builder.buildSave(end)
954+
builder.buildRestorePosition(from: r)
955+
try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end'
956+
builder.buildClear() // clears 'end'
957+
builder.buildFail() // this failure propagates outward
958+
builder.label(end)
959+
960+
// Symmetric difference always requires executing both `rhs` and `lhs`.
961+
// Execute each, ignoring failure and storing the resulting position in a
962+
// register. If those results are equal, fail. If they're different, use
963+
// the position that is different from the starting position:
964+
// - store current position as r0
965+
// - save to lhsFail
966+
// - lhs
967+
// - clear lhsFail (and continue)
968+
// - lhsFail: save position as r1
969+
//
970+
// - restore current position
971+
// - save to rhsFail
972+
// - rhs
973+
// - clear rhsFail (and continue)
974+
// - rhsFail: save position as r2
975+
//
976+
// - restore to resulting position from lhs (r1)
977+
// - if equal to r2, goto fail (both sides had same result)
978+
// - if equal to r0, goto advance (lhs failed)
979+
// - goto end
980+
// - advance: restore to resulting position from rhs (r2)
981+
// - goto end
982+
// - fail: fail
983+
// - end: ...
984+
case let .symmetricDifference(lhs, rhs):
985+
let r0 = builder.makePositionRegister()
986+
let r1 = builder.makePositionRegister()
987+
let r2 = builder.makePositionRegister()
988+
let lhsFail = builder.makeAddress()
989+
let rhsFail = builder.makeAddress()
990+
let advance = builder.makeAddress()
991+
let fail = builder.makeAddress()
992+
let end = builder.makeAddress()
993+
994+
builder.buildMoveCurrentPosition(into: r0)
995+
builder.buildSave(lhsFail)
996+
try emitCustomCharacterClass(lhs)
997+
builder.buildClear()
998+
builder.label(lhsFail)
999+
builder.buildMoveCurrentPosition(into: r1)
1000+
1001+
builder.buildRestorePosition(from: r0)
1002+
builder.buildSave(rhsFail)
1003+
try emitCustomCharacterClass(rhs)
1004+
builder.buildClear()
1005+
builder.label(rhsFail)
1006+
builder.buildMoveCurrentPosition(into: r2)
1007+
1008+
// If r1 == r2, then fail
1009+
builder.buildRestorePosition(from: r1)
1010+
builder.buildCondBranch(to: fail, ifSamePositionAs: r2)
1011+
1012+
// If r1 == r0, then move to r2 before ending
1013+
builder.buildCondBranch(to: advance, ifSamePositionAs: r0)
1014+
builder.buildBranch(to: end)
1015+
builder.label(advance)
1016+
builder.buildRestorePosition(from: r2)
1017+
builder.buildBranch(to: end)
1018+
1019+
builder.label(fail)
1020+
builder.buildFail()
1021+
builder.label(end)
1022+
}
1023+
}
1024+
8441025
mutating func emitCustomCharacterClass(
8451026
_ ccc: DSLTree.CustomCharacterClass
8461027
) throws {
@@ -858,8 +1039,67 @@ fileprivate extension Compiler.ByteCodeGen {
8581039
}
8591040
return
8601041
}
861-
let consumer = try ccc.generateConsumer(options)
862-
builder.buildConsume(by: consumer)
1042+
1043+
let updatedCCC: DSLTree.CustomCharacterClass
1044+
if optimizationsEnabled {
1045+
updatedCCC = ccc.coalescingASCIIMembers(options)
1046+
} else {
1047+
updatedCCC = ccc
1048+
}
1049+
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia})
1050+
1051+
if updatedCCC.isInverted {
1052+
// inverted
1053+
// custom character class: p0 | p1 | ... | pn
1054+
// Try each member to make sure they all fail
1055+
// save next_p1
1056+
// <code for p0>
1057+
// clear, fail
1058+
// next_p1:
1059+
// save next_p2
1060+
// <code for p1>
1061+
// clear, fail
1062+
// next_p2:
1063+
// save next_p...
1064+
// <code for p2>
1065+
// clear, fail
1066+
// ...
1067+
// next_pn:
1068+
// save done
1069+
// <code for pn>
1070+
// clear, fail
1071+
// done:
1072+
// step forward by 1
1073+
let done = builder.makeAddress()
1074+
for member in filteredMembers.dropLast() {
1075+
let next = builder.makeAddress()
1076+
builder.buildSave(next)
1077+
try emitCCCMember(member)
1078+
builder.buildClear()
1079+
builder.buildFail()
1080+
builder.label(next)
1081+
}
1082+
builder.buildSave(done)
1083+
try emitCCCMember(filteredMembers.last!)
1084+
builder.buildClear()
1085+
builder.buildFail()
1086+
builder.label(done)
1087+
1088+
// Consume a single unit for the inverted ccc
1089+
switch options.semanticLevel {
1090+
case .graphemeCluster:
1091+
builder.buildAdvance(1)
1092+
case .unicodeScalar:
1093+
builder.buildAdvanceUnicodeScalar(1)
1094+
}
1095+
return
1096+
}
1097+
// non inverted CCC
1098+
// Custom character class: p0 | p1 | ... | pn
1099+
// Very similar to alternation, but we don't keep backtracking save points
1100+
try emitAlternationGen(filteredMembers, withBacktracking: false) {
1101+
try $0.emitCCCMember($1)
1102+
}
8631103
}
8641104

8651105
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
@@ -996,6 +1236,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961236
}
9971237

9981238
extension DSLTree.Node {
1239+
/// A Boolean value indicating whether this node advances the match position
1240+
/// on a successful match.
1241+
///
1242+
/// For example, an alternation like `(a|b|c)` always advances the position
1243+
/// by a character, but `(a|b|)` has an empty branch, which matches without
1244+
/// advancing.
9991245
var guaranteesForwardProgress: Bool {
10001246
switch self {
10011247
case .orderedChoice(let children):
@@ -1026,12 +1272,34 @@ extension DSLTree.Node {
10261272
case .consumer, .matcher:
10271273
// Allow zero width consumers and matchers
10281274
return false
1029-
case .customCharacterClass:
1030-
return true
1275+
case .customCharacterClass(let ccc):
1276+
return ccc.guaranteesForwardProgress
10311277
case .quantification(let amount, _, let child):
10321278
let (atLeast, _) = amount.ast.bounds
10331279
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
10341280
default: return false
10351281
}
10361282
}
10371283
}
1284+
1285+
extension DSLTree.CustomCharacterClass {
1286+
/// We allow trivia into CustomCharacterClass, which could result in a CCC
1287+
/// that matches nothing, i.e. `(?x)[ ]`.
1288+
var guaranteesForwardProgress: Bool {
1289+
for m in members {
1290+
switch m {
1291+
case .trivia:
1292+
continue
1293+
case let .intersection(lhs, rhs):
1294+
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
1295+
case let .subtraction(lhs, _):
1296+
return lhs.guaranteesForwardProgress
1297+
case let .symmetricDifference(lhs, rhs):
1298+
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
1299+
default:
1300+
return true
1301+
}
1302+
}
1303+
return false
1304+
}
1305+
}

0 commit comments

Comments
 (0)