@@ -243,9 +243,11 @@ fileprivate extension Compiler.ByteCodeGen {
243
243
}
244
244
}
245
245
246
- mutating func emitAlternation(
247
- _ children: [ DSLTree . Node ]
248
- ) throws {
246
+ mutating func emitAlternationGen< C: BidirectionalCollection > (
247
+ _ elements: C ,
248
+ withBacktracking: Bool ,
249
+ _ body: ( inout Compiler . ByteCodeGen , C . Element ) throws -> Void
250
+ ) rethrows {
249
251
// Alternation: p0 | p1 | ... | pn
250
252
// save next_p1
251
253
// <code for p0>
@@ -263,16 +265,27 @@ fileprivate extension Compiler.ByteCodeGen {
263
265
// <code for pn>
264
266
// done:
265
267
let done = builder. makeAddress ( )
266
- for component in children . dropLast ( ) {
268
+ for element in elements . dropLast ( ) {
267
269
let next = builder. makeAddress ( )
268
270
builder. buildSave ( next)
269
- try emitNode ( component)
271
+ try body ( & self , element)
272
+ if !withBacktracking {
273
+ builder. buildClear ( )
274
+ }
270
275
builder. buildBranch ( to: done)
271
276
builder. label ( next)
272
277
}
273
- try emitNode ( children . last!)
278
+ try body ( & self , elements . last!)
274
279
builder. label ( done)
275
280
}
281
+
282
/// Emits an ordered alternation over `children`.
///
/// Each child is emitted with `emitNode`. The alternation is generated via
/// `emitAlternationGen` with `withBacktracking: true`, so no `clear` is
/// emitted after a branch succeeds.
///
/// - Parameter children: The alternation branches, in priority order.
/// - Throws: Whatever `emitNode` throws for any branch.
mutating func emitAlternation(
  _ children: [DSLTree.Node]
) throws {
  try emitAlternationGen(children, withBacktracking: true) { codegen, child in
    try codegen.emitNode(child)
  }
}
+ }
276
289
277
290
mutating func emitConcatenationComponent(
278
291
_ node: DSLTree . Node
@@ -828,19 +841,187 @@ fileprivate extension Compiler.ByteCodeGen {
828
841
}
829
842
}
830
843
844
/// Flatten quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
///
/// Every `.char` atom, `.scalar` atom, and character of a `.quotedLiteral`
/// is deduplicated. The returned array contains, in this order:
///   1. all other members, in their original relative order,
///   2. one `.atom(.char)` per distinct character, in first-occurrence order,
///   3. one `.atom(.scalar)` per distinct scalar, in first-occurrence order.
///
/// The character and scalar groups are emitted in first-occurrence order
/// rather than by iterating a `Set`, because `Set` iteration order is
/// unspecified and can differ between runs; this keeps the generated
/// member order (and therefore the emitted program) deterministic.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members with quoted literals removed and replaced by
///   individual character atoms.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used only for de-duplication; the arrays carry the order.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var orderedCharacters: [Character] = []
  var orderedScalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(let atom):
      switch atom {
      case let .char(char):
        if seenCharacters.insert(char).inserted {
          orderedCharacters.append(char)
        }
      case let .scalar(scalar):
        if seenScalars.insert(scalar).inserted {
          orderedScalars.append(scalar)
        }
      default:
        result.append(member)
      }
    case let .quotedLiteral(str):
      for char in str {
        if seenCharacters.insert(char).inserted {
          orderedCharacters.append(char)
        }
      }
    default:
      result.append(member)
    }
  }

  result.append(contentsOf: orderedCharacters.map { .atom(.char($0)) })
  result.append(contentsOf: orderedScalars.map { .atom(.scalar($0)) })
  return result
}
873
+
831
874
/// Returns a copy of `ccc` whose members have been prepared for codegen.
///
/// Coalescing only needs to be done in grapheme semantic mode. In scalar
/// semantic mode, we don't want to coalesce any scalars into a grapheme,
/// which means that e.g. `[e\u{301}-\u{302}]` remains a range between
/// U+301 and U+302. In both modes the members are then passed through
/// `flatteningCustomCharacterClassMembers`.
///
/// - Parameter ccc: The custom character class to transform.
/// - Returns: A class with the transformed members and the same
///   `isInverted` flag as `ccc`.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  var members = ccc.members
  if options.semanticLevel == .graphemeCluster {
    members = coalescingCustomCharacterClassMembers(members)
  }
  let flattened = flatteningCustomCharacterClassMembers(members)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
843
888
889
/// Emits a match for the character `c` as a member of a custom character
/// class.
///
/// - In `.graphemeCluster` mode the character is emitted with
///   `emitCharacter`.
/// - In `.unicodeScalar` mode the character acts like an alternation of
///   the individual scalars that comprise it: each scalar is emitted with
///   `emitMatchScalar`, and the alternation is generated with
///   `withBacktracking: false`.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    emitAlternationGen(
      c.unicodeScalars, withBacktracking: false
    ) { codegen, scalar in
      codegen.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    emitCharacter(c)
  }
}
901
+
902
/// Emits matching code for a single member of a custom character class.
///
/// Characters and scalars go through `emitCharacterInCCC`, other atoms
/// through `emitAtom`, nested classes through `emitCustomCharacterClass`,
/// and ranges through a generated consumer. Trivia emits nothing. The set
/// operations (intersection, subtraction, symmetric difference) are built
/// from position registers and save points as described on each case.
///
/// - Parameter member: The member to emit.
/// - Throws: Errors from `emitAtom`, `emitCustomCharacterClass`, or
///   `generateConsumer`.
/// - Precondition: `member` is not a `.quotedLiteral`; the fatal-error
///   message states those are removed by
///   `flatteningCustomCharacterClassMembers`.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .trivia:
    // Nothing to emit.
    return

  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")

  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection: a failure on either side simply propagates, so run both
  // operands from the same starting position.
  //   - remember the current position
  //   - lhs
  //   - rewind to the remembered position
  //   - rhs
  case .intersection(let lhs, let rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Subtraction: a failure in `lhs` propagates, while a failure in `rhs`
  // is swallowed and turned into success.
  //   - remember the current position
  //   - lhs
  //   - save point targeting `resume`
  //   - rewind to the remembered position
  //   - rhs
  //   - clear + fail (both sides matched)
  //   - resume: ...
  case .subtraction(let lhs, let rhs):
    let start = builder.makePositionRegister()
    let resume = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // If `lhs` does not match, the failure propagates outward.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(resume)
    builder.buildRestorePosition(from: start)
    // If `rhs` does not match, execution resumes at `resume` (success).
    try emitCustomCharacterClass(rhs)
    // Both matched: drop the `resume` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(resume)

  // Symmetric difference always has to run both operands. Each side is
  // run with its failure swallowed, and the position it ends at is
  // recorded. Equal end positions mean both sides had the same outcome,
  // so fail; otherwise continue from whichever end position differs from
  // the start.
  //   - origin   := current position
  //   - save point targeting lhsDone; lhs; clear
  //   - lhsDone: afterLHS := current position
  //
  //   - rewind to origin
  //   - save point targeting rhsDone; rhs; clear
  //   - rhsDone: afterRHS := current position
  //
  //   - move to afterLHS
  //   - if same as afterRHS, goto reject
  //   - if same as origin (lhs failed), goto takeRHS
  //   - goto finish
  //   - takeRHS: move to afterRHS; goto finish
  //   - reject: fail
  //   - finish: ...
  case .symmetricDifference(let lhs, let rhs):
    let origin = builder.makePositionRegister()
    let afterLHS = builder.makePositionRegister()
    let afterRHS = builder.makePositionRegister()
    let lhsDone = builder.makeAddress()
    let rhsDone = builder.makeAddress()
    let takeRHS = builder.makeAddress()
    let reject = builder.makeAddress()
    let finish = builder.makeAddress()

    // Run lhs and record where it ended up.
    builder.buildMoveCurrentPosition(into: origin)
    builder.buildSave(lhsDone)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(lhsDone)
    builder.buildMoveCurrentPosition(into: afterLHS)

    // Run rhs from the same starting point and record where it ended up.
    builder.buildRestorePosition(from: origin)
    builder.buildSave(rhsDone)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(rhsDone)
    builder.buildMoveCurrentPosition(into: afterRHS)

    // Same end position on both sides: reject.
    builder.buildRestorePosition(from: afterLHS)
    builder.buildCondBranch(to: reject, ifSamePositionAs: afterRHS)

    // lhs ended at the origin: continue from the rhs result instead.
    builder.buildCondBranch(to: takeRHS, ifSamePositionAs: origin)
    builder.buildBranch(to: finish)
    builder.label(takeRHS)
    builder.buildRestorePosition(from: afterRHS)
    builder.buildBranch(to: finish)

    builder.label(reject)
    builder.buildFail()
    builder.label(finish)
  }
}
1024
+
844
1025
mutating func emitCustomCharacterClass(
845
1026
_ ccc: DSLTree . CustomCharacterClass
846
1027
) throws {
@@ -858,8 +1039,67 @@ fileprivate extension Compiler.ByteCodeGen {
858
1039
}
859
1040
return
860
1041
}
861
- let consumer = try ccc. generateConsumer ( options)
862
- builder. buildConsume ( by: consumer)
1042
+
1043
+ let updatedCCC : DSLTree . CustomCharacterClass
1044
+ if optimizationsEnabled {
1045
+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1046
+ } else {
1047
+ updatedCCC = ccc
1048
+ }
1049
+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1050
+
1051
+ if updatedCCC. isInverted {
1052
+ // inverted
1053
+ // custom character class: p0 | p1 | ... | pn
1054
+ // Try each member to make sure they all fail
1055
+ // save next_p1
1056
+ // <code for p0>
1057
+ // clear, fail
1058
+ // next_p1:
1059
+ // save next_p2
1060
+ // <code for p1>
1061
+ // clear fail
1062
+ // next_p2:
1063
+ // save next_p...
1064
+ // <code for p2>
1065
+ // clear fail
1066
+ // ...
1067
+ // next_pn:
1068
+ // save done
1069
+ // <code for pn>
1070
+ // clear fail
1071
+ // done:
1072
+ // step forward by 1
1073
+ let done = builder. makeAddress ( )
1074
+ for member in filteredMembers. dropLast ( ) {
1075
+ let next = builder. makeAddress ( )
1076
+ builder. buildSave ( next)
1077
+ try emitCCCMember ( member)
1078
+ builder. buildClear ( )
1079
+ builder. buildFail ( )
1080
+ builder. label ( next)
1081
+ }
1082
+ builder. buildSave ( done)
1083
+ try emitCCCMember ( filteredMembers. last!)
1084
+ builder. buildClear ( )
1085
+ builder. buildFail ( )
1086
+ builder. label ( done)
1087
+
1088
+ // Consume a single unit for the inverted ccc
1089
+ switch options. semanticLevel {
1090
+ case . graphemeCluster:
1091
+ builder. buildAdvance ( 1 )
1092
+ case . unicodeScalar:
1093
+ builder. buildAdvanceUnicodeScalar ( 1 )
1094
+ }
1095
+ return
1096
+ }
1097
+ // non inverted CCC
1098
+ // Custom character class: p0 | p1 | ... | pn
1099
+ // Very similar to alternation, but we don't keep backtracking save points
1100
+ try emitAlternationGen ( filteredMembers, withBacktracking: false ) {
1101
+ try $0. emitCCCMember ( $1)
1102
+ }
863
1103
}
864
1104
865
1105
mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1236,12 @@ fileprivate extension Compiler.ByteCodeGen {
996
1236
}
997
1237
998
1238
extension DSLTree . Node {
1239
+ /// A Boolean value indicating whether this node advances the match position
1240
+ /// on a successful match.
1241
+ ///
1242
+ /// For example, an alternation like `(a|b|c)` always advances the position
1243
+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1244
+ /// advancing.
999
1245
var guaranteesForwardProgress : Bool {
1000
1246
switch self {
1001
1247
case . orderedChoice( let children) :
@@ -1026,12 +1272,34 @@ extension DSLTree.Node {
1026
1272
case . consumer, . matcher:
1027
1273
// Allow zero width consumers and matchers
1028
1274
return false
1029
- case . customCharacterClass:
1030
- return true
1275
+ case . customCharacterClass( let ccc ) :
1276
+ return ccc . guaranteesForwardProgress
1031
1277
case . quantification( let amount, _, let child) :
1032
1278
let ( atLeast, _) = amount. ast. bounds
1033
1279
return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
1034
1280
default : return false
1035
1281
}
1036
1282
}
1037
1283
}
1284
+
1285
extension DSLTree.CustomCharacterClass {
  /// A Boolean value derived from the first non-trivia member of this
  /// custom character class.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`; a class with no non-trivia member
  /// reports `false`.
  ///
  /// The first non-trivia member decides the result:
  /// - intersection and symmetric difference require both operands to
  ///   report `true`;
  /// - subtraction depends on the left-hand operand only;
  /// - any other member reports `true`.
  var guaranteesForwardProgress: Bool {
    let decidingMember = members.first(where: { member in
      if case .trivia = member { return false }
      return true
    })
    guard let deciding = decidingMember else {
      // Empty, or trivia only.
      return false
    }

    switch deciding {
    case .intersection(let lhs, let rhs),
         .symmetricDifference(let lhs, let rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case .subtraction(let lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}
0 commit comments