Skip to content

Commit ac15d87

Browse files
committed
[GR-57506] Fix handling non-BMP characters in literals with escapes
PullRequest: graalpython/3487
2 parents 6f9a721 + 0b26cd7 commit ac15d87

File tree

2 files changed

+132
-112
lines changed

2 files changed

+132
-112
lines changed

graalpython/com.oracle.graal.python.pegparser/src/com/oracle/graal/python/pegparser/sst/StringLiteralUtils.java

Lines changed: 127 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -970,122 +970,138 @@ public static StringBuilder decodeEscapes(ErrorCallback errors, String string, S
970970
}
971971

972972
private static <T> T unescapeString(SourceRange sourceRange, ErrorCallback errorCallback, String st, PythonStringFactory<T> stringFactory) {
973-
if (!st.contains("\\")) {
973+
int backslashIndex = st.indexOf('\\');
974+
if (backslashIndex < 0) {
974975
return stringFactory.fromJavaString(st);
975976
}
976977
PythonStringFactory.PythonStringBuilder<T> sb = stringFactory.createBuilder(st.length());
977-
boolean wasDeprecationWarning = false;
978-
for (int i = 0; i < st.length(); i++) {
979-
char ch = st.charAt(i);
980-
if (ch == '\\') {
981-
char nextChar = (i == st.length() - 1) ? '\\' : st.charAt(i + 1);
982-
// Octal escape?
983-
if (nextChar >= '0' && nextChar <= '7') {
984-
String code = "" + nextChar;
978+
boolean emittedDeprecationWarning = false;
979+
int substringStart = 0;
980+
do {
981+
if (backslashIndex != 0) {
982+
sb.appendString(st.substring(substringStart, backslashIndex));
983+
}
984+
if (backslashIndex + 1 < st.length()) {
985+
substringStart = processEscapeSequence(sourceRange, errorCallback, st, sb, backslashIndex + 1);
986+
if (substringStart == backslashIndex + 1) {
987+
sb.appendCodePoint('\\');
988+
if (!emittedDeprecationWarning) {
989+
emittedDeprecationWarning = true;
990+
warnInvalidEscapeSequence(errorCallback, sourceRange, st.codePointAt(substringStart));
991+
}
992+
}
993+
} else {
994+
// Lone backslash at the end, can occur in f-strings
995+
substringStart = backslashIndex;
996+
break;
997+
}
998+
} while ((backslashIndex = st.indexOf('\\', substringStart)) >= 0);
999+
if (substringStart < st.length()) {
1000+
sb.appendString(st.substring(substringStart));
1001+
}
1002+
return sb.build();
1003+
}
1004+
1005+
private static <T> int processEscapeSequence(SourceRange sourceRange, ErrorCallback errorCallback, String st, PythonStringFactory.PythonStringBuilder<T> sb, int startIndex) {
1006+
int cp = st.codePointAt(startIndex);
1007+
int i = startIndex + Character.charCount(cp);
1008+
return switch (cp) {
1009+
case '\\' -> {
1010+
sb.appendCodePoint('\\');
1011+
yield i;
1012+
}
1013+
case 'a' -> {
1014+
sb.appendCodePoint('\u0007');
1015+
yield i;
1016+
}
1017+
case 'b' -> {
1018+
sb.appendCodePoint('\b');
1019+
yield i;
1020+
}
1021+
case 'f' -> {
1022+
sb.appendCodePoint('\f');
1023+
yield i;
1024+
}
1025+
case 'n' -> {
1026+
sb.appendCodePoint('\n');
1027+
yield i;
1028+
}
1029+
case 'r' -> {
1030+
sb.appendCodePoint('\r');
1031+
yield i;
1032+
}
1033+
case 't' -> {
1034+
sb.appendCodePoint('\t');
1035+
yield i;
1036+
}
1037+
case 'v' -> {
1038+
sb.appendCodePoint('\u000b');
1039+
yield i;
1040+
}
1041+
case '\"' -> {
1042+
sb.appendCodePoint('\"');
1043+
yield i;
1044+
}
1045+
case '\'' -> {
1046+
sb.appendCodePoint('\'');
1047+
yield i;
1048+
}
1049+
case '\r', '\n' -> i;
1050+
// Octal code point
1051+
case '0', '1', '2', '3', '4', '5', '6', '7' -> {
1052+
int octalValue = cp - '0';
1053+
cp = i < st.length() ? st.codePointAt(i) : 0;
1054+
if (cp >= '0' && cp <= '7') {
9851055
i++;
986-
if ((i < st.length() - 1) && st.charAt(i + 1) >= '0' && st.charAt(i + 1) <= '7') {
987-
code += st.charAt(i + 1);
1056+
octalValue = octalValue * 8 + cp - '0';
1057+
cp = i < st.length() ? st.codePointAt(i) : 0;
1058+
if (cp >= '0' && cp <= '7') {
9881059
i++;
989-
if ((i < st.length() - 1) && st.charAt(i + 1) >= '0' && st.charAt(i + 1) <= '7') {
990-
code += st.charAt(i + 1);
991-
i++;
992-
}
1060+
octalValue = octalValue * 8 + cp - '0';
9931061
}
994-
sb.appendCodePoint(Integer.parseInt(code, 8));
995-
continue;
9961062
}
997-
switch (nextChar) {
998-
case '\\':
999-
ch = '\\';
1000-
break;
1001-
case 'a':
1002-
ch = '\u0007';
1003-
break;
1004-
case 'b':
1005-
ch = '\b';
1006-
break;
1007-
case 'f':
1008-
ch = '\f';
1009-
break;
1010-
case 'n':
1011-
ch = '\n';
1012-
break;
1013-
case 'r':
1014-
ch = '\r';
1015-
break;
1016-
case 't':
1017-
ch = '\t';
1018-
break;
1019-
case 'v':
1020-
ch = '\u000b';
1021-
break;
1022-
case '\"':
1023-
ch = '\"';
1024-
break;
1025-
case '\'':
1026-
ch = '\'';
1027-
break;
1028-
case '\r':
1029-
nextChar = (i == st.length() - 2) ? '\\' : st.charAt(i + 2);
1030-
if (nextChar == '\n') {
1031-
i++;
1032-
}
1033-
i++;
1034-
continue;
1035-
case '\n':
1036-
i++;
1037-
continue;
1038-
// Hex Unicode: u????
1039-
case 'u':
1040-
int code = getHexValue(st, sourceRange, i + 2, 4, errorCallback);
1041-
if (code < 0) {
1042-
return stringFactory.fromJavaString(st);
1043-
}
1044-
sb.appendCodePoint(code);
1045-
i += 5;
1046-
continue;
1047-
// Hex Unicode: U????????
1048-
case 'U':
1049-
code = getHexValue(st, sourceRange, i + 2, 8, errorCallback);
1050-
if (Character.isValidCodePoint(code)) {
1051-
sb.appendCodePoint(code);
1052-
} else {
1053-
errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, String.format(UNICODE_ERROR + ILLEGAL_CHARACTER, i, i + 9));
1054-
return stringFactory.fromJavaString(st);
1055-
}
1056-
i += 9;
1057-
continue;
1058-
// Hex Unicode: x??
1059-
case 'x':
1060-
code = getHexValue(st, sourceRange, i + 2, 2, errorCallback);
1061-
if (code < 0) {
1062-
return stringFactory.fromJavaString(st);
1063-
}
1064-
sb.appendCodePoint(code);
1065-
i += 3;
1066-
continue;
1067-
case 'N':
1068-
// a character from Unicode Data Database
1069-
i = doCharacterName(st, sourceRange, sb, i + 2, errorCallback);
1070-
if (i < 0) {
1071-
return stringFactory.fromJavaString(st);
1072-
}
1073-
continue;
1074-
default:
1075-
if (!wasDeprecationWarning) {
1076-
wasDeprecationWarning = true;
1077-
warnInvalidEscapeSequence(errorCallback, sourceRange, nextChar);
1078-
}
1079-
sb.appendCodePoint(ch);
1080-
sb.appendCodePoint(nextChar);
1081-
i++;
1082-
continue;
1063+
sb.appendCodePoint(octalValue);
1064+
yield i;
1065+
}
1066+
// Hex Unicode: u????
1067+
case 'u' -> {
1068+
int code = getHexValue(st, sourceRange, i, 4, errorCallback);
1069+
if (code < 0) {
1070+
yield startIndex;
10831071
}
1084-
i++;
1072+
sb.appendCodePoint(code);
1073+
yield i + 4;
10851074
}
1086-
sb.appendCodePoint(ch);
1087-
}
1088-
return sb.build();
1075+
// Hex Unicode: U????????
1076+
case 'U' -> {
1077+
int code = getHexValue(st, sourceRange, i, 8, errorCallback);
1078+
if (Character.isValidCodePoint(code)) {
1079+
sb.appendCodePoint(code);
1080+
} else {
1081+
errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, String.format(UNICODE_ERROR + ILLEGAL_CHARACTER, i, i + 9));
1082+
yield startIndex;
1083+
}
1084+
yield i + 8;
1085+
}
1086+
// Hex Unicode: x??
1087+
case 'x' -> {
1088+
int code = getHexValue(st, sourceRange, i, 2, errorCallback);
1089+
if (code < 0) {
1090+
yield startIndex;
1091+
}
1092+
sb.appendCodePoint(code);
1093+
yield i + 2;
1094+
}
1095+
case 'N' -> {
1096+
i = doCharacterName(st, sourceRange, sb, i, errorCallback);
1097+
if (i < 0) {
1098+
yield startIndex;
1099+
}
1100+
yield i;
1101+
// a character from Unicode Data Database
1102+
}
1103+
default -> startIndex;
1104+
};
10891105
}
10901106

10911107
private static int getHexValue(String text, SourceRange sourceRange, int start, int len, ErrorCallback errorCb) {
@@ -1130,7 +1146,7 @@ private static int createTruncatedError(SourceRange sourceRange, int startIndex,
11301146
* @param text a text that contains /N{...} escape sequence
11311147
* @param sb string builder where the result code point will be written
11321148
* @param offset the offset of the open brace
1133-
* @return offset of the close brace or {@code -1} if an error was signaled
1149+
* @return offset after the close brace or {@code -1} if an error was signaled
11341150
*/
11351151
private static int doCharacterName(String text, SourceRange sourceRange, PythonStringFactory.PythonStringBuilder<?> sb, int offset, ErrorCallback errorCallback) {
11361152
if (offset >= text.length()) {
@@ -1155,7 +1171,7 @@ private static int doCharacterName(String text, SourceRange sourceRange, PythonS
11551171
errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + UNKNOWN_UNICODE_ERROR, offset - 2, closeIndex);
11561172
return -1;
11571173
}
1158-
return closeIndex;
1174+
return closeIndex + 1;
11591175
}
11601176

11611177
// Names for most control characters that mean 0-31, not some symbol
@@ -1216,8 +1232,8 @@ public static int getCodePoint(String charName) {
12161232
return -1;
12171233
}
12181234

1219-
public static void warnInvalidEscapeSequence(ErrorCallback errorCallback, SourceRange sourceRange, char nextChar) {
1220-
errorCallback.onWarning(WarningType.Deprecation, sourceRange, "invalid escape sequence '\\%c'", nextChar);
1235+
public static void warnInvalidEscapeSequence(ErrorCallback errorCallback, SourceRange sourceRange, int nextCodePoint) {
1236+
errorCallback.onWarning(WarningType.Deprecation, sourceRange, "invalid escape sequence '\\%c'", nextCodePoint);
12211237
}
12221238

12231239
private static final String UNICODE_ERROR = "(unicode error) 'unicodeescape' codec can't decode bytes in position %d-%d:";

graalpython/com.oracle.graal.python.test/src/tests/test_string.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, 2023, Oracle and/or its affiliates.
1+
# Copyright (c) 2018, 2024, Oracle and/or its affiliates.
22
# Copyright (C) 1996-2017 Python Software Foundation
33
#
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
@@ -1191,3 +1191,7 @@ class S(str): pass
11911191
assert ('' + 'a').__class__ == str
11921192
assert ('a' + '').__class__ == str
11931193
assert ('a' + 'a').__class__ == str
1194+
1195+
def test_literal_with_nonbmp_and_escapes():
1196+
# Check that escape processing didn't accidentally break the emoji into surrogates
1197+
assert len("\\🤗\\") == 3

0 commit comments

Comments
 (0)