Skip to content

Commit df7fc51

Browse files
committed
Remove most consumer functions
This is based heavily on the work in swiftlang#590, rebased onto main, with some changes to remove even more uses of consumer functions. Consumer functions now have only two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations over their contents, with set operations emitted as instructions instead of being implemented via consumer functions.
1 parent 9ea9936 commit df7fc51

File tree

7 files changed

+382
-268
lines changed

7 files changed

+382
-268
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 296 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -828,19 +828,194 @@ fileprivate extension Compiler.ByteCodeGen {
828828
}
829829
}
830830

831+
/// Flattens quoted strings and individual character/scalar atoms into a
/// de-duplicated list of single-character and single-scalar atom members.
///
/// - `.atom(.char)` members and every character of a `.quotedLiteral` are
///   collected into one group of unique characters.
/// - `.atom(.scalar)` members are collected into one group of unique scalars.
/// - Every other member (other atoms, ranges, nested classes, set operations,
///   trivia, ...) is passed through unchanged, in its original relative order.
///
/// The collected characters, followed by the collected scalars, are appended
/// after the passed-through members. The result contains no `.quotedLiteral`
/// members at this level.
///
/// Characters and scalars are emitted in the order they were first seen.
/// Iterating a `Set` directly would make the member order (and therefore the
/// order of the instructions generated from it) vary between runs.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened member list.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used for de-duplication only; the arrays preserve the
  // first-seen order so the output is deterministic.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      // A quoted literal inside a character class contributes each of its
      // characters individually.
      for char in str {
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      }
    default:
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
859+
831860
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing adjacent scalars into graphemes is only wanted in grapheme
  // semantic mode. In scalar semantic mode the members are left alone, so
  // that e.g. `[e\u{301}-\u{302}]` stays a range between U+301 and U+302.
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  switch options.semanticLevel {
  case .graphemeCluster:
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  case .unicodeScalar:
    coalesced = ccc.members
  }

  // Flattening is applied in both semantic modes.
  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
843874

875+
/// Emits the instructions that match `c` as a member of a custom character
/// class.
///
/// With grapheme-cluster semantics the character is matched as a whole. With
/// Unicode-scalar semantics the character behaves like an alternation of the
/// individual scalars it is made of: matching any one of them succeeds.
mutating func emitCharacterInCCC(_ c: Character) {
  guard options.semanticLevel == .unicodeScalar else {
    emitCharacter(c)
    return
  }

  let scalars = c.unicodeScalars
  let matched = builder.makeAddress()

  // Every scalar but the last gets a save point leading to the next
  // alternative; on success the save point is cleared and control jumps to
  // `matched`.
  for alternative in scalars.dropLast() {
    let nextAlternative = builder.makeAddress()
    builder.buildSave(nextAlternative)
    emitMatchScalar(alternative)
    builder.buildClear()
    builder.buildBranch(to: matched)
    builder.label(nextAlternative)
  }

  // The final scalar has no fallback and falls through to `matched`. A
  // `Character` always contains at least one scalar, so `last` is non-nil.
  emitMatchScalar(scalars.last!)
  builder.label(matched)
}
895+
896+
/// Emits the instructions for a single member of a custom character class.
///
/// Characters and scalars are emitted via `emitCharacterInCCC`, other atoms
/// via `emitAtom`, nested classes via `emitCustomCharacterClass`, and ranges
/// via a generated consumer. The set operations (intersection, subtraction,
/// symmetric difference) are emitted as explicit instruction sequences.
///
/// - Parameter member: The member to emit. Quoted literals must already have
///   been removed by `flatteningCustomCharacterClassMembers`.
/// - Throws: Any error thrown while emitting an atom, a nested class, or while
///   generating the consumer for a range.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .trivia:
    // Nothing is emitted for trivia.
    return

  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")

  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection: both operands must match from the same starting position,
  // and a failure in either one simply propagates.
  //   1. remember the current position
  //   2. match lhs
  //   3. go back to the remembered position
  //   4. match rhs
  case let .intersection(lhs, rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Subtraction: a failure in lhs propagates, whereas a failure in rhs is
  // turned into success.
  //   1. remember the current position
  //   2. match lhs
  //   3. save point -> accepted
  //   4. go back to the remembered position
  //   5. match rhs
  //   6. clear the save point and fail (both operands matched)
  //   7. accepted: ...
  case let .subtraction(lhs, rhs):
    let start = builder.makePositionRegister()
    let accepted = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // If lhs does not match, the whole member fails.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(accepted)
    builder.buildRestorePosition(from: start)
    // If rhs does not match, matching resumes at `accepted`.
    try emitCustomCharacterClass(rhs)
    // rhs matched too: drop the `accepted` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(accepted)

  // Symmetric difference: both operands are always executed. Each one runs
  // with its failure swallowed, and the position it ends at is recorded. If
  // the two recorded positions are equal the member fails; otherwise
  // matching continues from whichever position differs from the start.
  //   1. origin   <- current position
  //   2. save point -> lhsDone; match lhs; clear the save point
  //   3. lhsDone: afterLHS <- current position
  //
  //   4. go back to origin
  //   5. save point -> rhsDone; match rhs; clear the save point
  //   6. rhsDone: afterRHS <- current position
  //
  //   7. go to afterLHS
  //   8. same as afterRHS -> reject (both sides had the same outcome)
  //   9. same as origin   -> takeRHS (lhs did not match)
  //  10. otherwise        -> finish
  //  11. takeRHS: go to afterRHS, then finish
  //  12. reject: fail
  //  13. finish: ...
  case let .symmetricDifference(lhs, rhs):
    let origin = builder.makePositionRegister()
    let afterLHS = builder.makePositionRegister()
    let afterRHS = builder.makePositionRegister()
    let lhsDone = builder.makeAddress()
    let rhsDone = builder.makeAddress()
    let takeRHS = builder.makeAddress()
    let reject = builder.makeAddress()
    let finish = builder.makeAddress()

    // Run lhs and record where it ends up.
    builder.buildMoveCurrentPosition(into: origin)
    builder.buildSave(lhsDone)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(lhsDone)
    builder.buildMoveCurrentPosition(into: afterLHS)

    // Run rhs from the same starting point and record where it ends up.
    builder.buildRestorePosition(from: origin)
    builder.buildSave(rhsDone)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(rhsDone)
    builder.buildMoveCurrentPosition(into: afterRHS)

    // Identical end positions mean the member fails.
    builder.buildRestorePosition(from: afterLHS)
    builder.buildCondBranch(to: reject, ifSamePositionAs: afterRHS)

    // If lhs left the position at the origin, continue from where rhs ended.
    builder.buildCondBranch(to: takeRHS, ifSamePositionAs: origin)
    builder.buildBranch(to: finish)
    builder.label(takeRHS)
    builder.buildRestorePosition(from: afterRHS)
    builder.buildBranch(to: finish)

    builder.label(reject)
    builder.buildFail()
    builder.label(finish)
  }
}
1018+
8441019
mutating func emitCustomCharacterClass(
8451020
_ ccc: DSLTree.CustomCharacterClass
8461021
) throws {
@@ -858,8 +1033,93 @@ fileprivate extension Compiler.ByteCodeGen {
8581033
}
8591034
return
8601035
}
861-
let consumer = try ccc.generateConsumer(options)
862-
builder.buildConsume(by: consumer)
1036+
1037+
let updatedCCC: DSLTree.CustomCharacterClass
1038+
if optimizationsEnabled {
1039+
updatedCCC = ccc.coalescingASCIIMembers(options)
1040+
} else {
1041+
updatedCCC = ccc
1042+
}
1043+
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia})
1044+
1045+
if updatedCCC.isInverted {
1046+
// inverted
1047+
// custom character class: p0 | p1 | ... | pn
1048+
// Try each member to make sure they all fail
1049+
// save next_p1
1050+
// <code for p0>
1051+
// clear, fail
1052+
// next_p1:
1053+
// save next_p2
1054+
// <code for p1>
1055+
// clear fail
1056+
// next_p2:
1057+
// save next_p...
1058+
// <code for p2>
1059+
// clear fail
1060+
// ...
1061+
// next_pn:
1062+
// save done
1063+
// <code for pn>
1064+
// clear fail
1065+
// done:
1066+
// step forward by 1
1067+
let done = builder.makeAddress()
1068+
for member in filteredMembers.dropLast() {
1069+
let next = builder.makeAddress()
1070+
builder.buildSave(next)
1071+
try emitCCCMember(member)
1072+
builder.buildClear()
1073+
builder.buildFail()
1074+
builder.label(next)
1075+
}
1076+
builder.buildSave(done)
1077+
try emitCCCMember(filteredMembers.last!)
1078+
builder.buildClear()
1079+
builder.buildFail()
1080+
builder.label(done)
1081+
1082+
// Consume a single unit for the inverted ccc
1083+
switch options.semanticLevel {
1084+
case .graphemeCluster:
1085+
builder.buildAdvance(1)
1086+
case .unicodeScalar:
1087+
builder.buildAdvanceUnicodeScalar(1)
1088+
}
1089+
return
1090+
}
1091+
// non inverted CCC
1092+
// Custom character class: p0 | p1 | ... | pn
1093+
// Very similar to alternation, but we don't keep backtracking save points
1094+
// save next_p1
1095+
// <code for p0>
1096+
// clear
1097+
// branch done
1098+
// next_p1:
1099+
// save next_p2
1100+
// <code for p1>
1101+
// clear
1102+
// branch done
1103+
// next_p2:
1104+
// save next_p...
1105+
// <code for p2>
1106+
// clear
1107+
// branch done
1108+
// ...
1109+
// next_pn:
1110+
// <code for pn>
1111+
// done:
1112+
let done = builder.makeAddress()
1113+
for member in filteredMembers.dropLast() {
1114+
let next = builder.makeAddress()
1115+
builder.buildSave(next)
1116+
try emitCCCMember(member)
1117+
builder.buildClear()
1118+
builder.buildBranch(to: done)
1119+
builder.label(next)
1120+
}
1121+
try emitCCCMember(filteredMembers.last!)
1122+
builder.label(done)
8631123
}
8641124

8651125
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
@@ -996,6 +1256,12 @@ fileprivate extension Compiler.ByteCodeGen {
9961256
}
9971257

9981258
extension DSLTree.Node {
1259+
/// A Boolean value indicating whether this node advances the match position
1260+
/// on a successful match.
1261+
///
1262+
/// For example, an alternation like `(a|b|c)` always advances the position
1263+
/// by a character, but `(a|b|)` has an empty branch, which matches without
1264+
/// advancing.
9991265
var guaranteesForwardProgress: Bool {
10001266
switch self {
10011267
case .orderedChoice(let children):
@@ -1026,12 +1292,34 @@ extension DSLTree.Node {
10261292
case .consumer, .matcher:
10271293
// Allow zero width consumers and matchers
10281294
return false
1029-
case .customCharacterClass:
1030-
return true
1295+
case .customCharacterClass(let ccc):
1296+
return ccc.guaranteesForwardProgress
10311297
case .quantification(let amount, _, let child):
10321298
let (atLeast, _) = amount.ast.bounds
10331299
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
10341300
default: return false
10351301
}
10361302
}
10371303
}
1304+
1305+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this custom character class is
  /// considered to advance the match position on a successful match.
  ///
  /// Trivia is permitted inside a custom character class, so a class can end
  /// up with nothing to match, e.g. `(?x)[ ]`; such a class reports `false`.
  ///
  /// Only the first non-trivia member is inspected:
  /// - intersection and symmetric difference require both operands to
  ///   guarantee forward progress,
  /// - subtraction requires only its left-hand operand to do so,
  /// - any other member counts as guaranteeing forward progress.
  var guaranteesForwardProgress: Bool {
    let firstMeaningfulMember = members.first { candidate in
      if case .trivia = candidate { return false }
      return true
    }
    guard let member = firstMeaningfulMember else {
      // Either there are no members at all, or every member is trivia.
      return false
    }

    switch member {
    case let .intersection(lhs, rhs),
         let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .subtraction(lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}

0 commit comments

Comments
 (0)