@@ -828,19 +828,194 @@ fileprivate extension Compiler.ByteCodeGen {
828
828
}
829
829
}
830
830
831
/// Flattens the literal characters and scalars of a custom character class
/// into individual, de-duplicated atom members.
///
/// Every `.quotedLiteral` member is split into its characters, and every
/// `.char` / `.scalar` atom is collected, so the returned list contains no
/// `.quotedLiteral` members and no duplicate character or scalar atoms. All
/// other members are passed through unchanged and in their original relative
/// order, followed by the collected character atoms and then the collected
/// scalar atoms.
///
/// Collected atoms are appended in order of first occurrence rather than in
/// `Set` iteration order: `Set` ordering is unspecified and can differ from
/// run to run, which would make the flattened member list (and anything
/// generated from it) non-reproducible.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened member list.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  // The sets are used only for de-duplication; the arrays record the order in
  // which each distinct character/scalar was first seen.
  var seenCharacters: Set<Character> = []
  var seenScalars: Set<UnicodeScalar> = []
  var characters: [Character] = []
  var scalars: [UnicodeScalar] = []
  var result: [DSLTree.CustomCharacterClass.Member] = []

  for member in members {
    switch member {
    case .atom(.char(let char)):
      if seenCharacters.insert(char).inserted {
        characters.append(char)
      }
    case .atom(.scalar(let scalar)):
      if seenScalars.insert(scalar).inserted {
        scalars.append(scalar)
      }
    case .quotedLiteral(let str):
      // A quoted literal inside a class contributes each of its characters.
      for char in str {
        if seenCharacters.insert(char).inserted {
          characters.append(char)
        }
      }
    default:
      // Any other atom, and every non-atom member, is kept as-is.
      result.append(member)
    }
  }

  result.append(contentsOf: characters.map { .atom(.char($0)) })
  result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
  return result
}
859
+
831
860
/// Returns a copy of `ccc` whose members have been coalesced (under
/// grapheme-cluster semantics only) and then flattened.
///
/// - Parameter ccc: The custom character class to transform.
/// - Returns: A class with the transformed members and the same inversion
///   flag as `ccc`.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only needs to be done in grapheme semantic mode. In scalar
  // semantic mode, we don't want to coalesce any scalars into a grapheme.
  // This means that e.g `[e\u{301}-\u{302}]` remains a range between U+301
  // and U+302.
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  if options.semanticLevel == .graphemeCluster {
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  } else {
    coalesced = ccc.members
  }

  // Flattening is applied regardless of the semantic level.
  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return .init(members: flattened, isInverted: ccc.isInverted)
}
843
874
875
/// Emits the code that matches the character `c` as a member of a custom
/// character class, honoring the current semantic level.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .graphemeCluster:
    // Grapheme semantics: match the character as a whole.
    emitCharacter(c)

  case .unicodeScalar:
    // Scalar semantics: behave like an alternation over the individual
    // scalars that make up the character.
    let scalars = c.unicodeScalars
    let matched = builder.makeAddress()

    // Every scalar except the last gets its own save point, so that a failed
    // match falls through to the next alternative.
    for alternative in scalars.dropLast() {
      let nextAlternative = builder.makeAddress()
      builder.buildSave(nextAlternative)
      emitMatchScalar(alternative)
      builder.buildClear()
      builder.buildBranch(to: matched)
      builder.label(nextAlternative)
    }

    // The final scalar has no save point of its own, so its failure
    // propagates to whatever surrounds this character.
    emitMatchScalar(scalars.last!)
    builder.label(matched)
  }
}
895
+
896
/// Emits the matching code for a single member of a custom character class.
///
/// - Parameter member: The member to emit. `.quotedLiteral` members are not
///   supported here and trap; they are expected to have been removed by
///   `flatteningCustomCharacterClassMembers` beforehand.
/// - Throws: Any error thrown while emitting a nested atom or character
///   class, or while generating the consumer for a range.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(.char(let character)):
    emitCharacterInCCC(character)

  case .atom(.scalar(let scalar)):
    emitCharacterInCCC(Character(scalar))

  case .atom(let otherAtom):
    try emitAtom(otherAtom)

  case .custom(let nested):
    try emitCustomCharacterClass(nested)

  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")

  case .range:
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  case .trivia:
    // Trivia emits nothing.
    break

  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection is trivial, since failure on either side propagates:
  //   - store current position
  //   - lhs
  //   - restore current position
  //   - rhs
  case let .intersection(lhs, rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // TODO: Can we decide when it's better to try `rhs` first?
  // For subtraction, failure in `lhs` propagates, while failure in `rhs` is
  // swallowed/reversed:
  //   - store current position
  //   - lhs
  //   - save to end
  //   - restore current position
  //   - rhs
  //   - clear, fail (since both succeeded)
  //   - end: ...
  case let .subtraction(lhs, rhs):
    let start = builder.makePositionRegister()
    let finished = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // No match in `lhs` is a failure, and it propagates.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(finished)
    builder.buildRestorePosition(from: start)
    // No match in `rhs` is a success, which resumes at `finished`.
    try emitCustomCharacterClass(rhs)
    // Both sides matched: drop the `finished` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(finished)

  // Symmetric difference always requires executing both `rhs` and `lhs`.
  // Execute each, ignoring failure and storing the resulting position in a
  // register. If those results are equal, fail. If they're different, use
  // the position that is different from the starting position:
  //   - store current position as r0
  //   - save to lhsFail
  //   - lhs
  //   - clear lhsFail (and continue)
  //   - lhsFail: save position as r1
  //
  //   - restore current position
  //   - save to rhsFail
  //   - rhs
  //   - clear rhsFail (and continue)
  //   - rhsFail: save position as r2
  //
  //   - restore to resulting position from lhs (r1)
  //   - if equal to r2, goto fail (both sides had same result)
  //   - if equal to r0, goto advance (lhs failed)
  //   - goto end
  //   - advance: restore to resulting position from rhs (r2)
  //   - goto end
  //   - fail: fail
  //   - end: ...
  case let .symmetricDifference(lhs, rhs):
    // Registers and addresses are created in the same order as the outline
    // above names them.
    let startPosition = builder.makePositionRegister()
    let lhsPosition = builder.makePositionRegister()
    let rhsPosition = builder.makePositionRegister()
    let afterLHS = builder.makeAddress()
    let afterRHS = builder.makeAddress()
    let useRHSPosition = builder.makeAddress()
    let bothSame = builder.makeAddress()
    let finished = builder.makeAddress()

    // Run `lhs`, recording where it ends up whether or not it matched.
    builder.buildMoveCurrentPosition(into: startPosition)
    builder.buildSave(afterLHS)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(afterLHS)
    builder.buildMoveCurrentPosition(into: lhsPosition)

    // Run `rhs` from the same starting point, recording its result too.
    builder.buildRestorePosition(from: startPosition)
    builder.buildSave(afterRHS)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(afterRHS)
    builder.buildMoveCurrentPosition(into: rhsPosition)

    // If both sides ended at the same position, then fail.
    builder.buildRestorePosition(from: lhsPosition)
    builder.buildCondBranch(to: bothSame, ifSamePositionAs: rhsPosition)

    // If `lhs` ended where it started, move to the `rhs` position before
    // ending; otherwise keep the `lhs` position.
    builder.buildCondBranch(to: useRHSPosition, ifSamePositionAs: startPosition)
    builder.buildBranch(to: finished)
    builder.label(useRHSPosition)
    builder.buildRestorePosition(from: rhsPosition)
    builder.buildBranch(to: finished)

    builder.label(bothSame)
    builder.buildFail()
    builder.label(finished)
  }
}
1018
+
844
1019
mutating func emitCustomCharacterClass(
845
1020
_ ccc: DSLTree . CustomCharacterClass
846
1021
) throws {
@@ -858,8 +1033,93 @@ fileprivate extension Compiler.ByteCodeGen {
858
1033
}
859
1034
return
860
1035
}
861
- let consumer = try ccc. generateConsumer ( options)
862
- builder. buildConsume ( by: consumer)
1036
+
1037
+ let updatedCCC : DSLTree . CustomCharacterClass
1038
+ if optimizationsEnabled {
1039
+ updatedCCC = ccc. coalescingASCIIMembers ( options)
1040
+ } else {
1041
+ updatedCCC = ccc
1042
+ }
1043
+ let filteredMembers = updatedCCC. members. filter ( { !$0. isOnlyTrivia} )
1044
+
1045
+ if updatedCCC. isInverted {
1046
+ // inverted
1047
+ // custom character class: p0 | p1 | ... | pn
1048
+ // Try each member to make sure they all fail
1049
+ // save next_p1
1050
+ // <code for p0>
1051
+ // clear, fail
1052
+ // next_p1:
1053
+ // save next_p2
1054
+ // <code for p1>
1055
+ // clear fail
1056
+ // next_p2:
1057
+ // save next_p...
1058
+ // <code for p2>
1059
+ // clear fail
1060
+ // ...
1061
+ // next_pn:
1062
+ // save done
1063
+ // <code for pn>
1064
+ // clear fail
1065
+ // done:
1066
+ // step forward by 1
1067
+ let done = builder. makeAddress ( )
1068
+ for member in filteredMembers. dropLast ( ) {
1069
+ let next = builder. makeAddress ( )
1070
+ builder. buildSave ( next)
1071
+ try emitCCCMember ( member)
1072
+ builder. buildClear ( )
1073
+ builder. buildFail ( )
1074
+ builder. label ( next)
1075
+ }
1076
+ builder. buildSave ( done)
1077
+ try emitCCCMember ( filteredMembers. last!)
1078
+ builder. buildClear ( )
1079
+ builder. buildFail ( )
1080
+ builder. label ( done)
1081
+
1082
+ // Consume a single unit for the inverted ccc
1083
+ switch options. semanticLevel {
1084
+ case . graphemeCluster:
1085
+ builder. buildAdvance ( 1 )
1086
+ case . unicodeScalar:
1087
+ builder. buildAdvanceUnicodeScalar ( 1 )
1088
+ }
1089
+ return
1090
+ }
1091
+ // non inverted CCC
1092
+ // Custom character class: p0 | p1 | ... | pn
1093
+ // Very similar to alternation, but we don't keep backtracking save points
1094
+ // save next_p1
1095
+ // <code for p0>
1096
+ // clear
1097
+ // branch done
1098
+ // next_p1:
1099
+ // save next_p2
1100
+ // <code for p1>
1101
+ // clear
1102
+ // branch done
1103
+ // next_p2:
1104
+ // save next_p...
1105
+ // <code for p2>
1106
+ // clear
1107
+ // branch done
1108
+ // ...
1109
+ // next_pn:
1110
+ // <code for pn>
1111
+ // done:
1112
+ let done = builder. makeAddress ( )
1113
+ for member in filteredMembers. dropLast ( ) {
1114
+ let next = builder. makeAddress ( )
1115
+ builder. buildSave ( next)
1116
+ try emitCCCMember ( member)
1117
+ builder. buildClear ( )
1118
+ builder. buildBranch ( to: done)
1119
+ builder. label ( next)
1120
+ }
1121
+ try emitCCCMember ( filteredMembers. last!)
1122
+ builder. label ( done)
863
1123
}
864
1124
865
1125
mutating func emitConcatenation( _ children: [ DSLTree . Node ] ) throws {
@@ -996,6 +1256,12 @@ fileprivate extension Compiler.ByteCodeGen {
996
1256
}
997
1257
998
1258
extension DSLTree . Node {
1259
+ /// A Boolean value indicating whether this node advances the match position
1260
+ /// on a successful match.
1261
+ ///
1262
+ /// For example, an alternation like `(a|b|c)` always advances the position
1263
+ /// by a character, but `(a|b|)` has an empty branch, which matches without
1264
+ /// advancing.
999
1265
var guaranteesForwardProgress : Bool {
1000
1266
switch self {
1001
1267
case . orderedChoice( let children) :
@@ -1026,12 +1292,34 @@ extension DSLTree.Node {
1026
1292
case . consumer, . matcher:
1027
1293
// Allow zero width consumers and matchers
1028
1294
return false
1029
- case . customCharacterClass:
1030
- return true
1295
+ case . customCharacterClass( let ccc ) :
1296
+ return ccc . guaranteesForwardProgress
1031
1297
case . quantification( let amount, _, let child) :
1032
1298
let ( atLeast, _) = amount. ast. bounds
1033
1299
return atLeast ?? 0 > 0 && child. guaranteesForwardProgress
1034
1300
default : return false
1035
1301
}
1036
1302
}
1037
1303
}
1304
+
1305
extension DSLTree.CustomCharacterClass {
  /// A Boolean value indicating whether this custom character class is known
  /// to advance the match position whenever it matches.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`. Such a class, with no members other
  /// than trivia, reports `false`.
  ///
  /// The members of a class are alternatives, so the class as a whole only
  /// guarantees progress when every non-trivia member does. Each member is
  /// judged as follows:
  /// - atoms, ranges, and quoted literals are taken to guarantee progress;
  /// - a nested class is asked recursively, so a nested trivia-only class is
  ///   treated the same way as a top-level one;
  /// - an intersection or symmetric difference requires both operands to
  ///   guarantee progress;
  /// - a subtraction only depends on its left-hand operand.
  ///
  /// The result is deliberately conservative: `false` means "not known to
  /// advance", never "known not to advance".
  var guaranteesForwardProgress: Bool {
    var hasNonTriviaMember = false
    for member in members {
      let memberAdvances: Bool
      switch member {
      case .trivia:
        // Trivia has no bearing on matching.
        continue
      case .atom, .range, .quotedLiteral:
        memberAdvances = true
      case .custom(let nested):
        memberAdvances = nested.guaranteesForwardProgress
      case let .intersection(lhs, rhs):
        memberAdvances =
          lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
      case let .subtraction(lhs, _):
        memberAdvances = lhs.guaranteesForwardProgress
      case let .symmetricDifference(lhs, rhs):
        memberAdvances =
          lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
      }
      // One alternative without the guarantee is enough to lose it.
      guard memberAdvances else { return false }
      hasNonTriviaMember = true
    }
    return hasNonTriviaMember
  }
}
0 commit comments