Skip to content

Commit cf5baa4

Browse files
committed
Share unicode name alias table with TRegex
1 parent 4a3f01c commit cf5baa4

File tree

3 files changed

+27
-59
lines changed

3 files changed

+27
-59
lines changed

graalpython/com.oracle.graal.python.test/src/com/oracle/graal/python/test/parser/StringUtilsTests.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,14 @@ public void blockCjkUnifiedIdeographUnknownCharacters() throws Exception {
113113
checkUnknownChar("CJK Unified Ideograph-2A6E0");
114114
}
115115

116+
@Test
117+
public void controlCharacters() {
118+
Assert.assertEquals("\f", StringUtils.unescapeJavaString(errorCallback, "\\N{FORM FEED}"));
119+
Assert.assertEquals("\f", StringUtils.unescapeJavaString(errorCallback, "\\N{FF}"));
120+
Assert.assertEquals("\u0096", StringUtils.unescapeJavaString(errorCallback, "\\N{START OF GUARDED AREA}"));
121+
Assert.assertEquals("\udb40\udd57", StringUtils.unescapeJavaString(errorCallback, "\\N{VS104}"));
122+
}
123+
116124
@Test
117125
public void malformedError() throws Exception {
118126
checkSyntaxErrorMessage("'\\N'", "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: malformed \\N character escape");

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/sst/StringUtils.java

Lines changed: 18 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@
4141

4242
package com.oracle.graal.python.parser.sst;
4343

44-
import java.util.HashMap;
4544
import java.util.Locale;
46-
import java.util.Map;
4745

4846
import com.ibm.icu.lang.UCharacter;
4947
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
@@ -57,8 +55,9 @@
5755
import com.oracle.graal.python.runtime.PythonParser.ParserErrorCallback;
5856
import com.oracle.graal.python.runtime.exception.PException;
5957
import com.oracle.graal.python.util.PythonUtils;
60-
import com.oracle.truffle.api.CompilerDirectives;
58+
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
6159
import com.oracle.truffle.api.source.Source;
60+
import com.oracle.truffle.regex.chardata.UnicodeCharacterAliases;
6261

6362
public class StringUtils {
6463

@@ -263,7 +262,7 @@ private static PException createTruncatedError(String text, int startIndex, int
263262
* @param offset this is offset of the open brace
264263
* @return offset of the close brace
265264
*/
266-
@CompilerDirectives.TruffleBoundary
265+
@TruffleBoundary
267266
private static int doCharacterName(String text, StringBuilder sb, int offset) {
268267
if (offset >= text.length()) {
269268
throw PConstructAndRaiseNode.raiseUncachedUnicodeDecodeError("unicodeescape", text, offset - 2, offset, MALFORMED_ERROR);
@@ -288,61 +287,21 @@ private static int doCharacterName(String text, StringBuilder sb, int offset) {
288287
return closeIndex;
289288
}
290289

291-
// ICU4J doesn't have names for most control characters
292-
private static final Map<String, Integer> CONTROL_CHAR_NAMES = new HashMap<>(32);
293-
static {
294-
CONTROL_CHAR_NAMES.put("NULL", 0x0000);
295-
CONTROL_CHAR_NAMES.put("START OF HEADING", 0x0001);
296-
CONTROL_CHAR_NAMES.put("START OF TEXT", 0x0002);
297-
CONTROL_CHAR_NAMES.put("END OF TEXT", 0x0003);
298-
CONTROL_CHAR_NAMES.put("END OF TRANSMISSION", 0x0004);
299-
CONTROL_CHAR_NAMES.put("ENQUIRY", 0x0005);
300-
CONTROL_CHAR_NAMES.put("ACKNOWLEDGE", 0x0006);
301-
CONTROL_CHAR_NAMES.put("BELL", 0x0007);
302-
CONTROL_CHAR_NAMES.put("BACKSPACE", 0x0008);
303-
CONTROL_CHAR_NAMES.put("CHARACTER TABULATION", 0x0009);
304-
CONTROL_CHAR_NAMES.put("LINE FEED", 0x000A);
305-
CONTROL_CHAR_NAMES.put("LINE TABULATION", 0x000B);
306-
CONTROL_CHAR_NAMES.put("FORM FEED", 0x000C);
307-
CONTROL_CHAR_NAMES.put("CARRIAGE RETURN", 0x000D);
308-
CONTROL_CHAR_NAMES.put("SHIFT OUT", 0x000E);
309-
CONTROL_CHAR_NAMES.put("SHIFT IN", 0x000F);
310-
CONTROL_CHAR_NAMES.put("DATA LINK ESCAPE", 0x0010);
311-
CONTROL_CHAR_NAMES.put("DEVICE CONTROL ONE", 0x0011);
312-
CONTROL_CHAR_NAMES.put("DEVICE CONTROL TWO", 0x0012);
313-
CONTROL_CHAR_NAMES.put("DEVICE CONTROL THREE", 0x0013);
314-
CONTROL_CHAR_NAMES.put("DEVICE CONTROL FOUR", 0x0014);
315-
CONTROL_CHAR_NAMES.put("NEGATIVE ACKNOWLEDGE", 0x0015);
316-
CONTROL_CHAR_NAMES.put("SYNCHRONOUS IDLE", 0x0016);
317-
CONTROL_CHAR_NAMES.put("END OF TRANSMISSION BLOCK", 0x0017);
318-
CONTROL_CHAR_NAMES.put("CANCEL", 0x0018);
319-
CONTROL_CHAR_NAMES.put("END OF MEDIUM", 0x0019);
320-
CONTROL_CHAR_NAMES.put("SUBSTITUTE", 0x001A);
321-
CONTROL_CHAR_NAMES.put("ESCAPE", 0x001B);
322-
CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR FOUR", 0x001C);
323-
CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR THREE", 0x001D);
324-
CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR TWO", 0x001E);
325-
CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR ONE", 0x001F);
326-
}
327-
328-
@CompilerDirectives.TruffleBoundary
329-
public static int getCodePoint(String charName) {
330-
int possibleChar = UCharacter.getCharFromName(charName);
331-
if (possibleChar > -1) {
332-
return possibleChar;
333-
}
334-
possibleChar = UCharacter.getCharFromExtendedName(charName);
335-
if (possibleChar > -1) {
336-
return possibleChar;
337-
}
338-
possibleChar = UCharacter.getCharFromNameAlias(charName);
339-
if (possibleChar > -1) {
340-
return possibleChar;
341-
}
342-
possibleChar = CONTROL_CHAR_NAMES.getOrDefault(charName.toUpperCase(Locale.ROOT), -1);
343-
if (possibleChar > -1) {
344-
return possibleChar;
290+
@TruffleBoundary
291+
public static int getCodePoint(String characterName) {
292+
// CPython's logic for resolving these character names goes like this:
293+
// 1) handle Hangul Syllables in region AC00-D7A3
294+
// 2) handle CJK Ideographs
295+
// 3) handle character names as given in UnicodeData.txt
296+
// 4) handle all aliases as given in NameAliases.txt
297+
// With ICU's UCharacter, we get cases 1), 2) and 3). As for 4), the aliases, ICU only
298+
// handles aliases of type 'correction'. Therefore, we extract the contents of
299+
// NameAliases.txt and handle the aliases ourselves.
300+
String normalizedName = characterName.trim().toUpperCase(Locale.ROOT);
301+
if (UnicodeCharacterAliases.CHARACTER_ALIASES.containsKey(normalizedName)) {
302+
return UnicodeCharacterAliases.CHARACTER_ALIASES.get(normalizedName);
303+
} else {
304+
return UCharacter.getCharFromName(characterName);
345305
}
346-
return -1;
347306
}
348307
}

mx.graalpython/suite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@
256256
"XZ-1.8",
257257
"truffle:ICU4J",
258258
"truffle:ICU4J-CHARSET",
259+
"regex:TREGEX",
259260
"sdk:JLINE3",
260261
],
261262
"requires": [

0 commit comments

Comments
 (0)