@@ -828,19 +828,195 @@ fileprivate extension Compiler.ByteCodeGen {
828
828
}
829
829
}
830
830
831
+ /// Flatten quoted strings into sequences of atoms, so that the standard
832
+ /// CCC codegen will handle them.
833
+ func flatteningCustomCharacterClassMembers(
834
+ _ members: [ DSLTree . CustomCharacterClass . Member ]
835
+ ) -> [ DSLTree . CustomCharacterClass . Member ] {
836
+ var characters : Set < Character > = [ ]
837
+ var scalars : Set < UnicodeScalar > = [ ]
838
+ var result : [ DSLTree . CustomCharacterClass . Member ] = [ ]
839
+ for member in members {
840
+ switch member {
841
+ case . atom( let atom) :
842
+ switch atom {
843
+ case let . char( char) :
844
+ characters. insert ( char)
845
+ case let . scalar( scalar) :
846
+ scalars. insert ( scalar)
847
+ default :
848
+ result. append ( member)
849
+ }
850
+ case let . quotedLiteral( str) :
851
+ characters. formUnion ( str)
852
+ default :
853
+ result. append ( member)
854
+ }
855
+ }
856
+ result. append ( contentsOf: characters. map { . atom( . char( $0) ) } )
857
+ result. append ( contentsOf: scalars. map { . atom( . scalar( $0) ) } )
858
+ return result
859
+ }
860
+
831
861
func coalescingCustomCharacterClass(
832
862
_ ccc: DSLTree . CustomCharacterClass
833
863
) -> DSLTree . CustomCharacterClass {
834
864
// This only needs to be done in grapheme semantic mode. In scalar semantic
835
865
// mode, we don't want to coalesce any scalars into a grapheme. This
836
866
// means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and
837
867
// U+302.
838
- guard options. semanticLevel == . graphemeCluster else { return ccc }
839
-
840
- let members = coalescingCustomCharacterClassMembers ( ccc. members)
841
- return . init( members: members, isInverted: ccc. isInverted)
868
+ let members = options. semanticLevel == . graphemeCluster
869
+ ? coalescingCustomCharacterClassMembers ( ccc. members)
870
+ : ccc. members
871
+ return . init(
872
+ members: flatteningCustomCharacterClassMembers ( members) ,
873
+ isInverted: ccc. isInverted)
842
874
}
843
875
876
+ mutating func emitCharacterInCCC( _ c: Character ) {
877
+ switch options. semanticLevel {
878
+ case . graphemeCluster:
879
+ emitCharacter ( c)
880
+ case . unicodeScalar:
881
+ // When in scalar mode, act like an alternation of the individual scalars
882
+ // that comprise a character.
883
+ let done = builder. makeAddress ( )
884
+ for scalar in c. unicodeScalars. dropLast ( ) {
885
+ let next = builder. makeAddress ( )
886
+ builder. buildSave ( next)
887
+ emitMatchScalar ( scalar)
888
+ builder. buildClear ( )
889
+ builder. buildBranch ( to: done)
890
+ builder. label ( next)
891
+ }
892
+ emitMatchScalar ( c. unicodeScalars. last!)
893
+ builder. label ( done)
894
+ }
895
+ }
896
+
897
+ mutating func emitCCCMember(
898
+ _ member: DSLTree . CustomCharacterClass . Member
899
+ ) throws {
900
+ switch member {
901
+ case . atom( let atom) :
902
+ switch atom {
903
+ case . char( let c) :
904
+ emitCharacterInCCC ( c)
905
+ case . scalar( let s) :
906
+ emitCharacterInCCC ( Character ( s) )
907
+ default :
908
+ try emitAtom ( atom)
909
+ }
910
+ case . custom( let ccc) :
911
+ try emitCustomCharacterClass ( ccc)
912
+ case . quotedLiteral:
913
+ fatalError ( " Removed in 'flatteningCustomCharacterClassMembers' " )
914
+ case . range:
915
+ let consumer = try member. generateConsumer ( options)
916
+ builder. buildConsume ( by: consumer)
917
+ case . trivia:
918
+ return
919
+
920
+ // TODO: Can we decide when it's better to try `rhs` first?
921
+ // Intersection is trivial, since failure on either side propagates:
922
+ // - store current position
923
+ // - lhs
924
+ // - restore current position
925
+ // - rhs
926
+ case let . intersection( lhs, rhs) :
927
+ let r = builder. makePositionRegister ( )
928
+ builder. buildMoveCurrentPosition ( into: r)
929
+ try emitCustomCharacterClass ( lhs)
930
+ builder. buildRestorePosition ( from: r)
931
+ try emitCustomCharacterClass ( rhs)
932
+
933
+ // TODO: Can we decide when it's better to try `rhs` first?
934
+ // For subtraction, failure in `lhs` propagates, while failure in `rhs` is
935
+ // swallowed/reversed:
936
+ // - store current position
937
+ // - lhs
938
+ // - save to end
939
+ // - restore current position
940
+ // - rhs
941
+ // - clear, fail (since both succeeded)
942
+ // - end: ...
943
+ case let . subtraction( lhs, rhs) :
944
+ let r = builder. makePositionRegister ( )
945
+ let end = builder. makeAddress ( )
946
+ builder. buildMoveCurrentPosition ( into: r)
947
+ try emitCustomCharacterClass ( lhs) // no match here = failure, propagates
948
+ builder. buildSave ( end)
949
+ builder. buildRestorePosition ( from: r)
950
+ try emitCustomCharacterClass ( rhs) // no match here = success, resumes at 'end'
951
+ builder. buildClear ( ) // clears 'end'
952
+ builder. buildFail ( ) // this failure propagates outward
953
+ builder. label ( end)
954
+
955
+ // Symmetric difference always requires executing both `rhs` and `lhs`.
956
+ // Execute each, ignoring failure and storing the resulting position in a
957
+ // register. If those results are equal, fail. If they're different, use
958
+ // the position that is different from the starting position:
959
+ // - store current position as r0
960
+ // - save to lhsFail
961
+ // - lhs
962
+ // - clear lhsFail (and continue)
963
+ // - lhsFail: save position as r1
964
+ //
965
+ // - restore current position
966
+ // - save to rhsFail
967
+ // - rhs
968
+ // - clear rhsFail (and continue)
969
+ // - rhsFail: save position as r2
970
+ //
971
+ // - restore to resulting position from lhs (r1)
972
+ // - if equal to r2, goto fail (both sides had same result)
973
+ // - if equal to r0, goto advance (lhs failed)
974
+ // - goto end
975
+ // - advance: restore to resulting position from rhs (r2)
976
+ // - goto end
977
+ // - fail: fail
978
+ // - end: ...
979
+ case let . symmetricDifference( lhs, rhs) :
980
+ let r0 = builder. makePositionRegister ( )
981
+ let r1 = builder. makePositionRegister ( )
982
+ let r2 = builder. makePositionRegister ( )
983
+ let lhsFail = builder. makeAddress ( )
984
+ let rhsFail = builder. makeAddress ( )
985
+ let advance = builder. makeAddress ( )
986
+ let fail = builder. makeAddress ( )
987
+ let end = builder. makeAddress ( )
988
+
989
+ builder. buildMoveCurrentPosition ( into: r0)
990
+ builder. buildSave ( lhsFail)
991
+ try emitCustomCharacterClass ( lhs)
992
+ builder. buildClear ( )
993
+ builder. label ( lhsFail)
994
+ builder. buildMoveCurrentPosition ( into: r1)
995
+
996
+ builder. buildRestorePosition ( from: r0)
997
+ builder. buildSave ( rhsFail)
998
+ try emitCustomCharacterClass ( rhs)
999
+ builder. buildClear ( )
1000
+ builder. label ( rhsFail)
1001
+ builder. buildMoveCurrentPosition ( into: r2)
1002
+
1003
+ // If r1 == r2, then fail
1004
+ builder. buildRestorePosition ( from: r1)
1005
+ builder. buildCondBranch ( to: fail, ifSamePositionAs: r2)
1006
+
1007
+ // If r1 == r0, then move to r2 before ending
1008
+ builder. buildCondBranch ( to: advance, ifSamePositionAs: r0)
1009
+ builder. buildBranch ( to: end)
1010
+ builder. label ( advance)
1011
+ builder. buildRestorePosition ( from: r2)
1012
+ builder. buildBranch ( to: end)
1013
+
1014
+ builder. label ( fail)
1015
+ builder. buildFail ( )
1016
+ builder. label ( end)
1017
+ }
1018
+ }
1019
+
844
1020
mutating func emitCustomCharacterClass(
845
1021
_ ccc: DSLTree . CustomCharacterClass
846
1022
) throws {
@@ -858,8 +1034,93 @@ fileprivate extension Compiler.ByteCodeGen {
858
1034
}
859
1035
return
860
1036
}
861
- let consumer = try ccc. generateConsumer ( options)
862
- builder. buildConsume ( by: consumer)
1037
+
1038
+ let updatedCCC : DSLTree . CustomCharacterClass
1039
+ if optimizationsEnabled {
1040
+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1041
+ } else {
1042
+ updatedCCC = ccc
1043
+ }
1044
+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1045
+
1046
+ if updatedCCC. isInverted {
1047
+ // inverted
1048
+ // custom character class: p0 | p1 | ... | pn
1049
+ // Try each member to make sure they all fail
1050
+ // save next_p1
1051
+ // <code for p0>
1052
+ // clear, fail
1053
+ // next_p1:
1054
+ // save next_p2
1055
+ // <code for p1>
1056
+ // clear fail
1057
+ // next_p2:
1058
+ // save next_p...
1059
+ // <code for p2>
1060
+ // clear fail
1061
+ // ...
1062
+ // next_pn:
1063
+ // save done
1064
+ // <code for pn>
1065
+ // clear fail
1066
+ // done:
1067
+ // step forward by 1
1068
+ let done = builder. makeAddress ( )
1069
+ for member in filteredMembers. dropLast ( ) {
1070
+ let next = builder. makeAddress ( )
1071
+ builder. buildSave ( next)
1072
+ try emitCCCMember ( member)
1073
+ builder. buildClear ( )
1074
+ builder. buildFail ( )
1075
+ builder. label ( next)
1076
+ }
1077
+ builder. buildSave ( done)
1078
+ try emitCCCMember ( filteredMembers. last!)
1079
+ builder. buildClear ( )
1080
+ builder. buildFail ( )
1081
+ builder. label ( done)
1082
+
1083
+ // Consume a single unit for the inverted ccc
1084
+ switch options. semanticLevel {
1085
+ case . graphemeCluster:
1086
+ builder. buildAdvance ( 1 )
1087
+ case . unicodeScalar:
1088
+ builder. buildAdvanceUnicodeScalar ( 1 )
1089
+ }
1090
+ return
1091
+ }
1092
+ // non inverted CCC
1093
+ // Custom character class: p0 | p1 | ... | pn
1094
+ // Very similar to alternation, but we don't keep backtracking save points
1095
+ // save next_p1
1096
+ // <code for p0>
1097
+ // clear
1098
+ // branch done
1099
+ // next_p1:
1100
+ // save next_p2
1101
+ // <code for p1>
1102
+ // clear
1103
+ // branch done
1104
+ // next_p2:
1105
+ // save next_p...
1106
+ // <code for p2>
1107
+ // clear
1108
+ // branch done
1109
+ // ...
1110
+ // next_pn:
1111
+ // <code for pn>
1112
+ // done:
1113
+ let done = builder. makeAddress ( )
1114
+ for member in filteredMembers. dropLast ( ) {
1115
+ let next = builder. makeAddress ( )
1116
+ builder. buildSave ( next)
1117
+ try emitCCCMember ( member)
1118
+ builder. buildClear ( )
1119
+ builder. buildBranch ( to: done)
1120
+ builder. label ( next)
1121
+ }
1122
+ try emitCCCMember ( filteredMembers. last!)
1123
+ builder. label ( done)
863
1124
}
864
1125
865
1126
mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1257,12 @@ fileprivate extension Compiler.ByteCodeGen {
996
1257
}
997
1258
998
1259
extension DSLTree . Node {
1260
+ /// A Boolean value indicating whether this node advances the match position
1261
+ /// on a successful match.
1262
+ ///
1263
+ /// For example, an alternation like `(a|b|c)` always advances the position
1264
+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1265
+ /// advancing.
999
1266
var guaranteesForwardProgress : Bool {
1000
1267
switch self {
1001
1268
case . orderedChoice( let children) :
@@ -1026,12 +1293,34 @@ extension DSLTree.Node {
1026
1293
case . consumer, . matcher:
1027
1294
// Allow zero width consumers and matchers
1028
1295
return false
1029
- case . customCharacterClass:
1030
- return true
1296
+ case . customCharacterClass( let ccc ) :
1297
+ return ccc . guaranteesForwardProgress
1031
1298
case . quantification( let amount, _, let child) :
1032
1299
let ( atLeast, _) = amount. ast. bounds
1033
1300
return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
1034
1301
default : return false
1035
1302
}
1036
1303
}
1037
1304
}
1305
+
1306
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this character class is considered
  /// to advance the match position on a successful match.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`. The answer is therefore decided by
  /// the first member that is not trivia; a class with no such member
  /// reports `false`.
  var guaranteesForwardProgress: Bool {
    let firstSignificant = members.first(where: { member in
      if case .trivia = member { return false }
      return true
    })
    guard let decisive = firstSignificant else {
      // Nothing but trivia (or no members at all).
      return false
    }

    switch decisive {
    case let .intersection(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .subtraction(lhs, _):
      // Only the left operand is consulted for a subtraction.
      return lhs.guaranteesForwardProgress
    case let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case .atom, .range, .custom, .quotedLiteral:
      return true
    case .trivia:
      // Unreachable: trivia members were skipped when picking `decisive`.
      return false
    }
  }
}
0 commit comments