Skip to content

Commit c85d1f3

Browse files
committed
Remove most consumer functions
This is based heavily on the work in swiftlang#590, rebased onto main, with some changes to remove even more consumer uses. Consumer functions have only two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations around their contents, with set operations emitted as instructions instead of being implemented via consumer functions.
1 parent 9ea9936 commit c85d1f3

File tree

7 files changed

+393
-268
lines changed

7 files changed

+393
-268
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 297 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -828,19 +828,195 @@ fileprivate extension Compiler.ByteCodeGen {
828828
}
829829
}
830830

831+
/// Flattens quoted literals into individual character atoms and removes
/// duplicate character and scalar atoms, so that the standard CCC codegen
/// will handle them.
///
/// Members that are neither `.char`/`.scalar` atoms nor quoted literals come
/// first in the result, in their original order. They are followed by the
/// unique characters and then the unique scalars, each in order of first
/// appearance. The order is tracked explicitly instead of iterating a `Set`,
/// whose iteration order is unspecified and may differ between runs.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened members, with no top-level `.quotedLiteral`
///   members remaining.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used only for de-duplication; the arrays carry the order.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      // A quoted literal contributes each of its characters individually.
      for char in str {
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      }
    default:
      // Other atoms and all remaining member kinds pass through unchanged.
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
860+
831861
func coalescingCustomCharacterClass(
832862
_ ccc: DSLTree.CustomCharacterClass
833863
) -> DSLTree.CustomCharacterClass {
834864
// This only needs to be done in grapheme semantic mode. In scalar semantic
835865
// mode, we don't want to coalesce any scalars into a grapheme. This
836866
// means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and
837867
// U+302.
838-
guard options.semanticLevel == .graphemeCluster else { return ccc }
839-
840-
let members = coalescingCustomCharacterClassMembers(ccc.members)
841-
return .init(members: members, isInverted: ccc.isInverted)
868+
let members = options.semanticLevel == .graphemeCluster
869+
? coalescingCustomCharacterClassMembers(ccc.members)
870+
: ccc.members
871+
return .init(
872+
members: flatteningCustomCharacterClassMembers(members),
873+
isInverted: ccc.isInverted)
842874
}
843875

876+
/// Emits code that matches the character `c` as a member of a custom
/// character class.
///
/// With grapheme-cluster semantics the character is emitted through
/// `emitCharacter`. With Unicode-scalar semantics the emitted code acts like
/// an alternation of the individual scalars that comprise `c`.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  guard options.semanticLevel == .unicodeScalar else {
    // Grapheme-cluster semantics: match the character as a whole.
    emitCharacter(c)
    return
  }

  // Scalar semantics: one alternative per scalar. Every alternative except
  // the last gets a save point that leads to the next alternative.
  let matched = builder.makeAddress()
  let scalars = c.unicodeScalars
  for candidate in scalars.dropLast() {
    let tryNext = builder.makeAddress()
    builder.buildSave(tryNext)
    emitMatchScalar(candidate)
    builder.buildClear()
    builder.buildBranch(to: matched)
    builder.label(tryNext)
  }
  // The last scalar is emitted without a save point of its own.
  emitMatchScalar(scalars.last!)
  builder.label(matched)
}
896+
897+
/// Emits the instructions for a single member of a custom character class.
///
/// - Parameter member: The member to emit. `.quotedLiteral` members are
///   expected to have been removed by `flatteningCustomCharacterClassMembers`
///   beforehand; encountering one is a fatal error. `.trivia` emits nothing.
/// - Throws: Any error thrown by `emitAtom`, `emitCustomCharacterClass`, or
///   `generateConsumer`.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(.char(let character)):
    emitCharacterInCCC(character)
  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))
  case .atom(let otherAtom):
    try emitAtom(otherAtom)
  case .custom(let nested):
    try emitCustomCharacterClass(nested)
  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")
  case .range:
    // Ranges are emitted as a consumer built from the member itself.
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)
  case .trivia:
    return
  case .intersection(let lhs, let rhs):
    try emitCCCIntersection(lhs, rhs)
  case .subtraction(let lhs, let rhs):
    try emitCCCSubtraction(lhs, rhs)
  case .symmetricDifference(let lhs, let rhs):
    try emitCCCSymmetricDifference(lhs, rhs)
  }
}

/// Emits an intersection of two custom character classes.
///
/// Intersection is trivial, since failure on either side propagates:
///   - store current position
///   - lhs
///   - restore current position
///   - rhs
private mutating func emitCCCIntersection(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  let start = builder.makePositionRegister()
  builder.buildMoveCurrentPosition(into: start)
  try emitCustomCharacterClass(lhs)
  builder.buildRestorePosition(from: start)
  try emitCustomCharacterClass(rhs)
}

/// Emits a subtraction of `rhs` from `lhs`.
///
/// Failure in `lhs` propagates, while failure in `rhs` is swallowed/reversed:
///   - store current position
///   - lhs
///   - save to end
///   - restore current position
///   - rhs
///   - clear, fail (since both succeeded)
///   - end: ...
private mutating func emitCCCSubtraction(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // TODO: Can we decide when it's better to try `rhs` first?
  let start = builder.makePositionRegister()
  let end = builder.makeAddress()
  builder.buildMoveCurrentPosition(into: start)
  // No match in `lhs` is a failure, and it propagates.
  try emitCustomCharacterClass(lhs)
  builder.buildSave(end)
  builder.buildRestorePosition(from: start)
  // No match in `rhs` is a success, and execution resumes at `end`.
  try emitCustomCharacterClass(rhs)
  // Both sides matched: clear the `end` save point and fail outward.
  builder.buildClear()
  builder.buildFail()
  builder.label(end)
}

/// Emits a symmetric difference of two custom character classes.
///
/// Symmetric difference always requires executing both `lhs` and `rhs`.
/// Execute each, ignoring failure and storing the resulting position in a
/// register. If those results are equal, fail. If they're different, use
/// the position that is different from the starting position:
///   - store current position as r0
///   - save to lhsFail
///   - lhs
///   - clear lhsFail (and continue)
///   - lhsFail: save position as r1
///
///   - restore current position
///   - save to rhsFail
///   - rhs
///   - clear rhsFail (and continue)
///   - rhsFail: save position as r2
///
///   - restore to resulting position from lhs (r1)
///   - if equal to r2, goto fail (both sides had same result)
///   - if equal to r0, goto advance (lhs failed)
///   - goto end
///   - advance: restore to resulting position from rhs (r2)
///   - goto end
///   - fail: fail
///   - end: ...
private mutating func emitCCCSymmetricDifference(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  // r0, r1 and r2 from the outline above.
  let start = builder.makePositionRegister()
  let afterLHS = builder.makePositionRegister()
  let afterRHS = builder.makePositionRegister()
  let lhsFail = builder.makeAddress()
  let rhsFail = builder.makeAddress()
  let advance = builder.makeAddress()
  let fail = builder.makeAddress()
  let end = builder.makeAddress()

  // Run `lhs`, recording where it ends up whether or not it matched.
  builder.buildMoveCurrentPosition(into: start)
  builder.buildSave(lhsFail)
  try emitCustomCharacterClass(lhs)
  builder.buildClear()
  builder.label(lhsFail)
  builder.buildMoveCurrentPosition(into: afterLHS)

  // Run `rhs` from the starting position, recording its result the same way.
  builder.buildRestorePosition(from: start)
  builder.buildSave(rhsFail)
  try emitCustomCharacterClass(rhs)
  builder.buildClear()
  builder.label(rhsFail)
  builder.buildMoveCurrentPosition(into: afterRHS)

  // If both sides ended at the same position, then fail.
  builder.buildRestorePosition(from: afterLHS)
  builder.buildCondBranch(to: fail, ifSamePositionAs: afterRHS)

  // If `lhs` ended at the start, then move to the `rhs` result before ending.
  builder.buildCondBranch(to: advance, ifSamePositionAs: start)
  builder.buildBranch(to: end)
  builder.label(advance)
  builder.buildRestorePosition(from: afterRHS)
  builder.buildBranch(to: end)

  builder.label(fail)
  builder.buildFail()
  builder.label(end)
}
1019+
8441020
mutating func emitCustomCharacterClass(
8451021
_ ccc: DSLTree.CustomCharacterClass
8461022
) throws {
@@ -858,8 +1034,93 @@ fileprivate extension Compiler.ByteCodeGen {
8581034
}
8591035
return
8601036
}
861-
let consumer = try ccc.generateConsumer(options)
862-
builder.buildConsume(by: consumer)
1037+
1038+
let updatedCCC: DSLTree.CustomCharacterClass
1039+
if optimizationsEnabled {
1040+
updatedCCC = ccc.coalescingASCIIMembers(options)
1041+
} else {
1042+
updatedCCC = ccc
1043+
}
1044+
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia})
1045+
1046+
if updatedCCC.isInverted {
1047+
// inverted
1048+
// custom character class: p0 | p1 | ... | pn
1049+
// Try each member to make sure they all fail
1050+
// save next_p1
1051+
// <code for p0>
1052+
// clear, fail
1053+
// next_p1:
1054+
// save next_p2
1055+
// <code for p1>
1056+
// clear fail
1057+
// next_p2:
1058+
// save next_p...
1059+
// <code for p2>
1060+
// clear fail
1061+
// ...
1062+
// next_pn:
1063+
// save done
1064+
// <code for pn>
1065+
// clear fail
1066+
// done:
1067+
// step forward by 1
1068+
let done = builder.makeAddress()
1069+
for member in filteredMembers.dropLast() {
1070+
let next = builder.makeAddress()
1071+
builder.buildSave(next)
1072+
try emitCCCMember(member)
1073+
builder.buildClear()
1074+
builder.buildFail()
1075+
builder.label(next)
1076+
}
1077+
builder.buildSave(done)
1078+
try emitCCCMember(filteredMembers.last!)
1079+
builder.buildClear()
1080+
builder.buildFail()
1081+
builder.label(done)
1082+
1083+
// Consume a single unit for the inverted ccc
1084+
switch options.semanticLevel {
1085+
case .graphemeCluster:
1086+
builder.buildAdvance(1)
1087+
case .unicodeScalar:
1088+
builder.buildAdvanceUnicodeScalar(1)
1089+
}
1090+
return
1091+
}
1092+
// non inverted CCC
1093+
// Custom character class: p0 | p1 | ... | pn
1094+
// Very similar to alternation, but we don't keep backtracking save points
1095+
// save next_p1
1096+
// <code for p0>
1097+
// clear
1098+
// branch done
1099+
// next_p1:
1100+
// save next_p2
1101+
// <code for p1>
1102+
// clear
1103+
// branch done
1104+
// next_p2:
1105+
// save next_p...
1106+
// <code for p2>
1107+
// clear
1108+
// branch done
1109+
// ...
1110+
// next_pn:
1111+
// <code for pn>
1112+
// done:
1113+
let done = builder.makeAddress()
1114+
for member in filteredMembers.dropLast() {
1115+
let next = builder.makeAddress()
1116+
builder.buildSave(next)
1117+
try emitCCCMember(member)
1118+
builder.buildClear()
1119+
builder.buildBranch(to: done)
1120+
builder.label(next)
1121+
}
1122+
try emitCCCMember(filteredMembers.last!)
1123+
builder.label(done)
8631124
}
8641125

8651126
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
@@ -996,6 +1257,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961257
}
9971258

9981259
extension DSLTree.Node {
1260+
/// A Boolean value indicating whether this node advances the match position
1261+
/// on a successful match.
1262+
///
1263+
/// For example, an alternation like `(a|b|c)` always advances the position
1264+
/// by a character, but `(a|b|)` has an empty branch, which matches without
1265+
/// advancing.
9991266
var guaranteesForwardProgress: Bool {
10001267
switch self {
10011268
case .orderedChoice(let children):
@@ -1026,12 +1293,34 @@ extension DSLTree.Node {
10261293
case .consumer, .matcher:
10271294
// Allow zero width consumers and matchers
10281295
return false
1029-
case .customCharacterClass:
1030-
return true
1296+
case .customCharacterClass(let ccc):
1297+
return ccc.guaranteesForwardProgress
10311298
case .quantification(let amount, _, let child):
10321299
let (atLeast, _) = amount.ast.bounds
10331300
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
10341301
default: return false
10351302
}
10361303
}
10371304
}
1305+
1306+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this custom character class is
  /// considered to advance the match position on a successful match.
  ///
  /// Trivia is allowed into a custom character class, which could result in
  /// a class that matches nothing, i.e. `(?x)[ ]`. The answer is therefore
  /// decided by the first member that is not trivia:
  ///   - intersection and symmetric difference require both operands to
  ///     guarantee forward progress,
  ///   - subtraction requires only the left-hand operand to do so,
  ///   - any other member yields `true`.
  ///
  /// A class with no non-trivia members yields `false`.
  var guaranteesForwardProgress: Bool {
    let firstNonTrivia = members.first { member in
      if case .trivia = member { return false }
      return true
    }
    guard let deciding = firstNonTrivia else {
      // Empty, or trivia only.
      return false
    }

    switch deciding {
    case let .intersection(lhs, rhs),
         let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .subtraction(lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}

0 commit comments

Comments
 (0)