@@ -970,122 +970,138 @@ public static StringBuilder decodeEscapes(ErrorCallback errors, String string, S
970
970
}
971
971
972
972
private static <T > T unescapeString (SourceRange sourceRange , ErrorCallback errorCallback , String st , PythonStringFactory <T > stringFactory ) {
973
- if (!st .contains ("\\ " )) {
973
+ int backslashIndex = st .indexOf ('\\' );
974
+ if (backslashIndex < 0 ) {
974
975
return stringFactory .fromJavaString (st );
975
976
}
976
977
PythonStringFactory .PythonStringBuilder <T > sb = stringFactory .createBuilder (st .length ());
977
- boolean wasDeprecationWarning = false ;
978
- for (int i = 0 ; i < st .length (); i ++) {
979
- char ch = st .charAt (i );
980
- if (ch == '\\' ) {
981
- char nextChar = (i == st .length () - 1 ) ? '\\' : st .charAt (i + 1 );
982
- // Octal escape?
983
- if (nextChar >= '0' && nextChar <= '7' ) {
984
- String code = "" + nextChar ;
978
+ boolean emittedDeprecationWarning = false ;
979
+ int substringStart = 0 ;
980
+ do {
981
+ if (backslashIndex != 0 ) {
982
+ sb .appendString (st .substring (substringStart , backslashIndex ));
983
+ }
984
+ if (backslashIndex + 1 < st .length ()) {
985
+ substringStart = processEscapeSequence (sourceRange , errorCallback , st , sb , backslashIndex + 1 );
986
+ if (substringStart == backslashIndex + 1 ) {
987
+ sb .appendCodePoint ('\\' );
988
+ if (!emittedDeprecationWarning ) {
989
+ emittedDeprecationWarning = true ;
990
+ warnInvalidEscapeSequence (errorCallback , sourceRange , st .codePointAt (substringStart ));
991
+ }
992
+ }
993
+ } else {
994
+ // Lone backslash at the end, can occur in f-strings
995
+ substringStart = backslashIndex ;
996
+ break ;
997
+ }
998
+ } while ((backslashIndex = st .indexOf ('\\' , substringStart )) >= 0 );
999
+ if (substringStart < st .length ()) {
1000
+ sb .appendString (st .substring (substringStart ));
1001
+ }
1002
+ return sb .build ();
1003
+ }
1004
+
1005
+ private static <T > int processEscapeSequence (SourceRange sourceRange , ErrorCallback errorCallback , String st , PythonStringFactory .PythonStringBuilder <T > sb , int startIndex ) {
1006
+ int cp = st .codePointAt (startIndex );
1007
+ int i = startIndex + Character .charCount (cp );
1008
+ return switch (cp ) {
1009
+ case '\\' -> {
1010
+ sb .appendCodePoint ('\\' );
1011
+ yield i ;
1012
+ }
1013
+ case 'a' -> {
1014
+ sb .appendCodePoint ('\u0007' );
1015
+ yield i ;
1016
+ }
1017
+ case 'b' -> {
1018
+ sb .appendCodePoint ('\b' );
1019
+ yield i ;
1020
+ }
1021
+ case 'f' -> {
1022
+ sb .appendCodePoint ('\f' );
1023
+ yield i ;
1024
+ }
1025
+ case 'n' -> {
1026
+ sb .appendCodePoint ('\n' );
1027
+ yield i ;
1028
+ }
1029
+ case 'r' -> {
1030
+ sb .appendCodePoint ('\r' );
1031
+ yield i ;
1032
+ }
1033
+ case 't' -> {
1034
+ sb .appendCodePoint ('\t' );
1035
+ yield i ;
1036
+ }
1037
+ case 'v' -> {
1038
+ sb .appendCodePoint ('\u000b' );
1039
+ yield i ;
1040
+ }
1041
+ case '\"' -> {
1042
+ sb .appendCodePoint ('\"' );
1043
+ yield i ;
1044
+ }
1045
+ case '\'' -> {
1046
+ sb .appendCodePoint ('\'' );
1047
+ yield i ;
1048
+ }
1049
+ case '\r' , '\n' -> i ;
1050
+ // Octal code point
1051
+ case '0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' -> {
1052
+ int octalValue = cp - '0' ;
1053
+ cp = i < st .length () ? st .codePointAt (i ) : 0 ;
1054
+ if (cp >= '0' && cp <= '7' ) {
985
1055
i ++;
986
- if ((i < st .length () - 1 ) && st .charAt (i + 1 ) >= '0' && st .charAt (i + 1 ) <= '7' ) {
987
- code += st .charAt (i + 1 );
1056
+ octalValue = octalValue * 8 + cp - '0' ;
1057
+ cp = i < st .length () ? st .codePointAt (i ) : 0 ;
1058
+ if (cp >= '0' && cp <= '7' ) {
988
1059
i ++;
989
- if ((i < st .length () - 1 ) && st .charAt (i + 1 ) >= '0' && st .charAt (i + 1 ) <= '7' ) {
990
- code += st .charAt (i + 1 );
991
- i ++;
992
- }
1060
+ octalValue = octalValue * 8 + cp - '0' ;
993
1061
}
994
- sb .appendCodePoint (Integer .parseInt (code , 8 ));
995
- continue ;
996
1062
}
997
- switch (nextChar ) {
998
- case '\\' :
999
- ch = '\\' ;
1000
- break ;
1001
- case 'a' :
1002
- ch = '\u0007' ;
1003
- break ;
1004
- case 'b' :
1005
- ch = '\b' ;
1006
- break ;
1007
- case 'f' :
1008
- ch = '\f' ;
1009
- break ;
1010
- case 'n' :
1011
- ch = '\n' ;
1012
- break ;
1013
- case 'r' :
1014
- ch = '\r' ;
1015
- break ;
1016
- case 't' :
1017
- ch = '\t' ;
1018
- break ;
1019
- case 'v' :
1020
- ch = '\u000b' ;
1021
- break ;
1022
- case '\"' :
1023
- ch = '\"' ;
1024
- break ;
1025
- case '\'' :
1026
- ch = '\'' ;
1027
- break ;
1028
- case '\r' :
1029
- nextChar = (i == st .length () - 2 ) ? '\\' : st .charAt (i + 2 );
1030
- if (nextChar == '\n' ) {
1031
- i ++;
1032
- }
1033
- i ++;
1034
- continue ;
1035
- case '\n' :
1036
- i ++;
1037
- continue ;
1038
- // Hex Unicode: u????
1039
- case 'u' :
1040
- int code = getHexValue (st , sourceRange , i + 2 , 4 , errorCallback );
1041
- if (code < 0 ) {
1042
- return stringFactory .fromJavaString (st );
1043
- }
1044
- sb .appendCodePoint (code );
1045
- i += 5 ;
1046
- continue ;
1047
- // Hex Unicode: U????????
1048
- case 'U' :
1049
- code = getHexValue (st , sourceRange , i + 2 , 8 , errorCallback );
1050
- if (Character .isValidCodePoint (code )) {
1051
- sb .appendCodePoint (code );
1052
- } else {
1053
- errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , String .format (UNICODE_ERROR + ILLEGAL_CHARACTER , i , i + 9 ));
1054
- return stringFactory .fromJavaString (st );
1055
- }
1056
- i += 9 ;
1057
- continue ;
1058
- // Hex Unicode: x??
1059
- case 'x' :
1060
- code = getHexValue (st , sourceRange , i + 2 , 2 , errorCallback );
1061
- if (code < 0 ) {
1062
- return stringFactory .fromJavaString (st );
1063
- }
1064
- sb .appendCodePoint (code );
1065
- i += 3 ;
1066
- continue ;
1067
- case 'N' :
1068
- // a character from Unicode Data Database
1069
- i = doCharacterName (st , sourceRange , sb , i + 2 , errorCallback );
1070
- if (i < 0 ) {
1071
- return stringFactory .fromJavaString (st );
1072
- }
1073
- continue ;
1074
- default :
1075
- if (!wasDeprecationWarning ) {
1076
- wasDeprecationWarning = true ;
1077
- warnInvalidEscapeSequence (errorCallback , sourceRange , nextChar );
1078
- }
1079
- sb .appendCodePoint (ch );
1080
- sb .appendCodePoint (nextChar );
1081
- i ++;
1082
- continue ;
1063
+ sb .appendCodePoint (octalValue );
1064
+ yield i ;
1065
+ }
1066
+ // Hex Unicode: u????
1067
+ case 'u' -> {
1068
+ int code = getHexValue (st , sourceRange , i , 4 , errorCallback );
1069
+ if (code < 0 ) {
1070
+ yield startIndex ;
1083
1071
}
1084
- i ++;
1072
+ sb .appendCodePoint (code );
1073
+ yield i + 4 ;
1085
1074
}
1086
- sb .appendCodePoint (ch );
1087
- }
1088
- return sb .build ();
1075
+ // Hex Unicode: U????????
1076
+ case 'U' -> {
1077
+ int code = getHexValue (st , sourceRange , i , 8 , errorCallback );
1078
+ if (Character .isValidCodePoint (code )) {
1079
+ sb .appendCodePoint (code );
1080
+ } else {
1081
+ errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , String .format (UNICODE_ERROR + ILLEGAL_CHARACTER , i , i + 9 ));
1082
+ yield startIndex ;
1083
+ }
1084
+ yield i + 8 ;
1085
+ }
1086
+ // Hex Unicode: x??
1087
+ case 'x' -> {
1088
+ int code = getHexValue (st , sourceRange , i , 2 , errorCallback );
1089
+ if (code < 0 ) {
1090
+ yield startIndex ;
1091
+ }
1092
+ sb .appendCodePoint (code );
1093
+ yield i + 2 ;
1094
+ }
1095
+ case 'N' -> {
1096
+ i = doCharacterName (st , sourceRange , sb , i , errorCallback );
1097
+ if (i < 0 ) {
1098
+ yield startIndex ;
1099
+ }
1100
+ yield i ;
1101
+ // a character from Unicode Data Database
1102
+ }
1103
+ default -> startIndex ;
1104
+ };
1089
1105
}
1090
1106
1091
1107
private static int getHexValue (String text , SourceRange sourceRange , int start , int len , ErrorCallback errorCb ) {
@@ -1130,7 +1146,7 @@ private static int createTruncatedError(SourceRange sourceRange, int startIndex,
1130
1146
* @param text a text that contains /N{...} escape sequence
1131
1147
* @param sb string builder where the result code point will be written
1132
1148
* @param offset this is offset of the open brace
1133
- * @return offset of the close brace or {@code -1} if an error was signaled
1149
+ * @return offset after the close brace or {@code -1} if an error was signaled
1134
1150
*/
1135
1151
private static int doCharacterName (String text , SourceRange sourceRange , PythonStringFactory .PythonStringBuilder <?> sb , int offset , ErrorCallback errorCallback ) {
1136
1152
if (offset >= text .length ()) {
@@ -1155,7 +1171,7 @@ private static int doCharacterName(String text, SourceRange sourceRange, PythonS
1155
1171
errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , UNICODE_ERROR + UNKNOWN_UNICODE_ERROR , offset - 2 , closeIndex );
1156
1172
return -1 ;
1157
1173
}
1158
- return closeIndex ;
1174
+ return closeIndex + 1 ;
1159
1175
}
1160
1176
1161
1177
// Names for most control characters that mean 0-31, not some symbol
@@ -1216,8 +1232,8 @@ public static int getCodePoint(String charName) {
1216
1232
return -1 ;
1217
1233
}
1218
1234
1219
- public static void warnInvalidEscapeSequence (ErrorCallback errorCallback , SourceRange sourceRange , char nextChar ) {
1220
- errorCallback .onWarning (WarningType .Deprecation , sourceRange , "invalid escape sequence '\\ %c'" , nextChar );
1235
+ public static void warnInvalidEscapeSequence (ErrorCallback errorCallback , SourceRange sourceRange , int nextCodePoint ) {
1236
+ errorCallback .onWarning (WarningType .Deprecation , sourceRange , "invalid escape sequence '\\ %c'" , nextCodePoint );
1221
1237
}
1222
1238
1223
1239
private static final String UNICODE_ERROR = "(unicode error) 'unicodeescape' codec can't decode bytes in position %d-%d:" ;
0 commit comments