@@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen {
256
256
}
257
257
}
258
258
259
- mutating func emitAlternation(
260
- _ children: [ DSLTree . Node ]
261
- ) throws {
259
+ mutating func emitAlternationGen< C: BidirectionalCollection > (
260
+ _ elements: C ,
261
+ withBacktracking: Bool ,
262
+ _ body: ( inout Compiler . ByteCodeGen , C . Element ) throws -> Void
263
+ ) rethrows {
262
264
// Alternation: p0 | p1 | ... | pn
263
265
// save next_p1
264
266
// <code for p0>
@@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen {
276
278
// <code for pn>
277
279
// done:
278
280
let done = builder. makeAddress ( )
279
- for component in children . dropLast ( ) {
281
+ for element in elements . dropLast ( ) {
280
282
let next = builder. makeAddress ( )
281
283
builder. buildSave ( next)
282
- try emitNode ( component)
284
+ try body ( & self , element)
285
+ if !withBacktracking {
286
+ builder. buildClear ( )
287
+ }
283
288
builder. buildBranch ( to: done)
284
289
builder. label ( next)
285
290
}
286
- try emitNode ( children . last!)
291
+ try body ( & self , elements . last!)
287
292
builder. label ( done)
288
293
}
294
+
295
/// Emits code for an ordered alternation over `children`.
///
/// This is a thin wrapper around `emitAlternationGen` that keeps backtracking
/// save points (`withBacktracking: true`) and emits every child with
/// `emitNode`.
///
/// - Parameter children: The alternation branches, in the order they should
///   be attempted.
/// - Throws: Any error thrown while emitting a child node.
mutating func emitAlternation(
  _ children: [DSLTree.Node]
) throws {
  try emitAlternationGen(children, withBacktracking: true) { codegen, child in
    try codegen.emitNode(child)
  }
}
289
302
290
303
mutating func emitConcatenationComponent(
291
304
_ node: DSLTree . Node
@@ -872,19 +885,187 @@ fileprivate extension Compiler.ByteCodeGen {
872
885
}
873
886
}
874
887
888
/// Flatten quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
///
/// Character atoms, scalar atoms, and the characters of quoted literals are
/// de-duplicated and moved to the end of the returned list: first every other
/// member (in its original order), then the unique characters, then the
/// unique scalars.
///
/// Unique characters and scalars are emitted in order of first occurrence.
/// They are deliberately not produced by iterating a `Set`, whose iteration
/// order is unspecified and can differ between process launches; doing so
/// would make the generated member order (and the code emitted from it)
/// nondeterministic.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened members, containing no `.quotedLiteral` entries
///   at this level.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets answer "have we seen this already?"; the arrays remember the
  // order in which each unique element was first encountered.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(let atom):
      switch atom {
      case let .char(char):
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      case let .scalar(scalar):
        if seenScalars.insert(scalar).inserted {
          scalars.append(scalar)
        }
      default:
        result.append(member)
      }
    case let .quotedLiteral(str):
      // A quoted literal inside a character class contributes each of its
      // characters individually.
      for char in str {
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      }
    default:
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
917
+
875
918
/// Returns a copy of `ccc` whose members have been coalesced (in grapheme
/// semantic mode only) and then flattened with
/// `flatteningCustomCharacterClassMembers`. The inversion flag is carried
/// over unchanged.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only needs to be done in grapheme semantic mode. In scalar
  // semantic mode, we don't want to coalesce any scalars into a grapheme.
  // This means that e.g `[e\u{301}-\u{302}]` remains a range between U+301
  // and U+302.
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  if options.semanticLevel == .graphemeCluster {
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  } else {
    coalesced = ccc.members
  }

  // Flattening is applied regardless of the semantic level.
  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
887
932
933
/// Emits code matching the character `c` as a member of a custom character
/// class, honoring the current semantic level.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    // In scalar mode a character behaves like an alternation of the
    // individual scalars it is made of. No backtracking save points are
    // kept once one of them has matched.
    let scalars = c.unicodeScalars
    emitAlternationGen(scalars, withBacktracking: false) { codegen, scalar in
      codegen.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    // In grapheme mode the character is matched as a whole.
    emitCharacter(c)
  }
}
945
+
946
/// Emits matching code for a single member of a custom character class.
///
/// Characters and scalars go through `emitCharacterInCCC`, other atoms
/// through `emitAtom`, nested classes through `emitCustomCharacterClass`, and
/// ranges through a generated consumer. Trivia emits nothing. Set operations
/// are lowered to explicit position-register bookkeeping by the helpers
/// below.
///
/// - Parameter member: The member to emit. Must not be a `.quotedLiteral`;
///   those are expected to have been removed beforehand.
/// - Throws: Any error thrown while emitting a nested construct or while
///   generating a range consumer.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case let .atom(atom):
    switch atom {
    case let .char(character):
      emitCharacterInCCC(character)
    case let .scalar(scalar):
      emitCharacterInCCC(Character(scalar))
    default:
      try emitAtom(atom)
    }

  case let .custom(nested):
    try emitCustomCharacterClass(nested)

  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  case .trivia:
    return

  case let .intersection(lhs, rhs):
    try emitCCCIntersection(lhs, rhs)

  case let .subtraction(lhs, rhs):
    try emitCCCSubtraction(lhs, rhs)

  case let .symmetricDifference(lhs, rhs):
    try emitCCCSymmetricDifference(lhs, rhs)
  }
}

/// Emits `lhs && rhs`: both operands must match from the same start position.
///
/// TODO: Can we decide when it's better to try `rhs` first?
/// A failure on either side simply propagates, so the sequence is:
///  - remember the current position
///  - lhs
///  - rewind to the remembered position
///  - rhs
private mutating func emitCCCIntersection(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  let start = builder.makePositionRegister()
  builder.buildMoveCurrentPosition(into: start)
  try emitCustomCharacterClass(lhs)
  builder.buildRestorePosition(from: start)
  try emitCustomCharacterClass(rhs)
}

/// Emits `lhs -- rhs`: `lhs` must match and `rhs` must not.
///
/// TODO: Can we decide when it's better to try `rhs` first?
/// A failure in `lhs` propagates, while a failure in `rhs` is
/// swallowed/reversed:
///  - remember the current position
///  - lhs
///  - save point to `resume`
///  - rewind to the remembered position
///  - rhs
///  - clear, fail (both sides matched)
///  - resume: ...
private mutating func emitCCCSubtraction(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  let start = builder.makePositionRegister()
  let resume = builder.makeAddress()
  builder.buildMoveCurrentPosition(into: start)
  // No match in `lhs` is a failure that propagates outward.
  try emitCustomCharacterClass(lhs)
  builder.buildSave(resume)
  builder.buildRestorePosition(from: start)
  // No match in `rhs` is a success: execution resumes at `resume`.
  try emitCustomCharacterClass(rhs)
  // `rhs` matched as well: drop the `resume` save point ...
  builder.buildClear()
  // ... and fail, letting the failure propagate outward.
  builder.buildFail()
  builder.label(resume)
}

/// Emits `lhs ~~ rhs`: exactly one of the operands must match.
///
/// Both operands always run. Each one's failure is ignored and its resulting
/// position is recorded in a register. If the two results are equal the
/// member fails; otherwise the position that differs from the start wins:
///  - store current position as `origin`
///  - save point to `afterLHS`
///  - lhs
///  - clear `afterLHS` save point (and continue)
///  - afterLHS: store position as `lhsResult`
///
///  - rewind to `origin`
///  - save point to `afterRHS`
///  - rhs
///  - clear `afterRHS` save point (and continue)
///  - afterRHS: store position as `rhsResult`
///
///  - move to `lhsResult`
///  - if equal to `rhsResult`, goto bothSame (both sides had same result)
///  - if equal to `origin`, goto takeRHS (lhs failed)
///  - goto finish
///  - takeRHS: move to `rhsResult`
///  - goto finish
///  - bothSame: fail
///  - finish: ...
private mutating func emitCCCSymmetricDifference(
  _ lhs: DSLTree.CustomCharacterClass,
  _ rhs: DSLTree.CustomCharacterClass
) throws {
  let origin = builder.makePositionRegister()
  let lhsResult = builder.makePositionRegister()
  let rhsResult = builder.makePositionRegister()
  let afterLHS = builder.makeAddress()
  let afterRHS = builder.makeAddress()
  let takeRHS = builder.makeAddress()
  let bothSame = builder.makeAddress()
  let finish = builder.makeAddress()

  // Run `lhs`, recording where it ends up whether or not it matched.
  builder.buildMoveCurrentPosition(into: origin)
  builder.buildSave(afterLHS)
  try emitCustomCharacterClass(lhs)
  builder.buildClear()
  builder.label(afterLHS)
  builder.buildMoveCurrentPosition(into: lhsResult)

  // Run `rhs` from the same origin, recording its resulting position too.
  builder.buildRestorePosition(from: origin)
  builder.buildSave(afterRHS)
  try emitCustomCharacterClass(rhs)
  builder.buildClear()
  builder.label(afterRHS)
  builder.buildMoveCurrentPosition(into: rhsResult)

  // Identical results on both sides: fail.
  builder.buildRestorePosition(from: lhsResult)
  builder.buildCondBranch(to: bothSame, ifSamePositionAs: rhsResult)

  // `lhs` did not move from the origin: continue from `rhs`'s position.
  builder.buildCondBranch(to: takeRHS, ifSamePositionAs: origin)
  builder.buildBranch(to: finish)
  builder.label(takeRHS)
  builder.buildRestorePosition(from: rhsResult)
  builder.buildBranch(to: finish)

  builder.label(bothSame)
  builder.buildFail()
  builder.label(finish)
}
1068
+
888
1069
mutating func emitCustomCharacterClass(
889
1070
_ ccc: DSLTree . CustomCharacterClass
890
1071
) throws {
@@ -902,8 +1083,67 @@ fileprivate extension Compiler.ByteCodeGen {
902
1083
}
903
1084
return
904
1085
}
905
- let consumer = try ccc. generateConsumer ( options)
906
- builder. buildConsume ( by: consumer)
1086
+
1087
+ let updatedCCC : DSLTree . CustomCharacterClass
1088
+ if optimizationsEnabled {
1089
+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1090
+ } else {
1091
+ updatedCCC = ccc
1092
+ }
1093
+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1094
+
1095
+ if updatedCCC. isInverted {
1096
+ // inverted
1097
+ // custom character class: p0 | p1 | ... | pn
1098
+ // Try each member to make sure they all fail
1099
+ // save next_p1
1100
+ // <code for p0>
1101
+ // clear, fail
1102
+ // next_p1:
1103
+ // save next_p2
1104
+ // <code for p1>
1105
+ // clear fail
1106
+ // next_p2:
1107
+ // save next_p...
1108
+ // <code for p2>
1109
+ // clear fail
1110
+ // ...
1111
+ // next_pn:
1112
+ // save done
1113
+ // <code for pn>
1114
+ // clear fail
1115
+ // done:
1116
+ // step forward by 1
1117
+ let done = builder. makeAddress ( )
1118
+ for member in filteredMembers. dropLast ( ) {
1119
+ let next = builder. makeAddress ( )
1120
+ builder. buildSave ( next)
1121
+ try emitCCCMember ( member)
1122
+ builder. buildClear ( )
1123
+ builder. buildFail ( )
1124
+ builder. label ( next)
1125
+ }
1126
+ builder. buildSave ( done)
1127
+ try emitCCCMember ( filteredMembers. last!)
1128
+ builder. buildClear ( )
1129
+ builder. buildFail ( )
1130
+ builder. label ( done)
1131
+
1132
+ // Consume a single unit for the inverted ccc
1133
+ switch options. semanticLevel {
1134
+ case . graphemeCluster:
1135
+ builder. buildAdvance ( 1 )
1136
+ case . unicodeScalar:
1137
+ builder. buildAdvanceUnicodeScalar ( 1 )
1138
+ }
1139
+ return
1140
+ }
1141
+ // non inverted CCC
1142
+ // Custom character class: p0 | p1 | ... | pn
1143
+ // Very similar to alternation, but we don't keep backtracking save points
1144
+ try emitAlternationGen ( filteredMembers, withBacktracking: false ) {
1145
+ try $0. emitCCCMember ( $1)
1146
+ }
907
1147
}
908
1148
909
1149
mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -1040,6 +1280,12 @@ fileprivate extension Compiler.ByteCodeGen {
1040
1280
}
1041
1281
1042
1282
extension DSLTree . Node {
1283
+ /// A Boolean value indicating whether this node advances the match position
1284
+ /// on a successful match.
1285
+ ///
1286
+ /// For example, an alternation like `(a|b|c)` always advances the position
1287
+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1288
+ /// advancing.
1043
1289
var guaranteesForwardProgress : Bool {
1044
1290
switch self {
1045
1291
case . orderedChoice( let children) :
@@ -1070,12 +1316,34 @@ extension DSLTree.Node {
1070
1316
case . consumer, . matcher:
1071
1317
// Allow zero width consumers and matchers
1072
1318
return false
1073
- case . customCharacterClass:
1074
- return true
1319
+ case . customCharacterClass( let ccc ) :
1320
+ return ccc . guaranteesForwardProgress
1075
1321
case . quantification( let amount, _, let child) :
1076
1322
let ( atLeast, _) = amount. ast. bounds
1077
1323
return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
1078
1324
default : return false
1079
1325
}
1080
1326
}
1081
1327
}
1328
+
1329
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this custom character class is
  /// considered to advance the match position on a successful match.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`. Such a class reports `false`.
  ///
  /// The answer is decided by the first member that is not trivia:
  /// intersections and symmetric differences require both operands to
  /// guarantee progress, a subtraction requires only its left-hand operand
  /// to, and any other kind of member counts as guaranteeing progress.
  var guaranteesForwardProgress: Bool {
    let firstMeaningfulMember = members.first { candidate in
      if case .trivia = candidate {
        return false
      }
      return true
    }

    // Nothing but trivia (or no members at all).
    guard let decidingMember = firstMeaningfulMember else {
      return false
    }

    switch decidingMember {
    case let .intersection(lhs, rhs),
         let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .subtraction(lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}
0 commit comments