1
1
/*
2
- * Copyright (c) 2019, 2021 , Oracle and/or its affiliates. All rights reserved.
2
+ * Copyright (c) 2019, 2022 , Oracle and/or its affiliates. All rights reserved.
3
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
4
*
5
5
* The Universal Permissive License (UPL), Version 1.0
41
41
42
42
package com .oracle .graal .python .parser .sst ;
43
43
44
- import java .util .HashMap ;
45
44
import java .util .Locale ;
46
- import java .util .Map ;
47
45
48
46
import com .ibm .icu .lang .UCharacter ;
49
47
import com .oracle .graal .python .builtins .PythonBuiltinClassType ;
57
55
import com .oracle .graal .python .runtime .PythonParser .ParserErrorCallback ;
58
56
import com .oracle .graal .python .runtime .exception .PException ;
59
57
import com .oracle .graal .python .util .PythonUtils ;
60
- import com .oracle .truffle .api .CompilerDirectives ;
58
+ import com .oracle .truffle .api .CompilerDirectives . TruffleBoundary ;
61
59
import com .oracle .truffle .api .source .Source ;
60
+ import com .oracle .truffle .regex .chardata .UnicodeCharacterAliases ;
62
61
63
62
public class StringUtils {
64
63
@@ -263,7 +262,7 @@ private static PException createTruncatedError(String text, int startIndex, int
263
262
* @param offset this is offset of the open brace
264
263
* @return offset of the close brace
265
264
*/
266
- @ CompilerDirectives . TruffleBoundary
265
+ @ TruffleBoundary
267
266
private static int doCharacterName (String text , StringBuilder sb , int offset ) {
268
267
if (offset >= text .length ()) {
269
268
throw PConstructAndRaiseNode .raiseUncachedUnicodeDecodeError ("unicodeescape" , text , offset - 2 , offset , MALFORMED_ERROR );
@@ -288,61 +287,21 @@ private static int doCharacterName(String text, StringBuilder sb, int offset) {
288
287
return closeIndex ;
289
288
}
290
289
291
- // ICU4J doesn't have names for most control characters
292
- private static final Map <String , Integer > CONTROL_CHAR_NAMES = new HashMap <>(32 );
293
- static {
294
- CONTROL_CHAR_NAMES .put ("NULL" , 0x0000 );
295
- CONTROL_CHAR_NAMES .put ("START OF HEADING" , 0x0001 );
296
- CONTROL_CHAR_NAMES .put ("START OF TEXT" , 0x0002 );
297
- CONTROL_CHAR_NAMES .put ("END OF TEXT" , 0x0003 );
298
- CONTROL_CHAR_NAMES .put ("END OF TRANSMISSION" , 0x0004 );
299
- CONTROL_CHAR_NAMES .put ("ENQUIRY" , 0x0005 );
300
- CONTROL_CHAR_NAMES .put ("ACKNOWLEDGE" , 0x0006 );
301
- CONTROL_CHAR_NAMES .put ("BELL" , 0x0007 );
302
- CONTROL_CHAR_NAMES .put ("BACKSPACE" , 0x0008 );
303
- CONTROL_CHAR_NAMES .put ("CHARACTER TABULATION" , 0x0009 );
304
- CONTROL_CHAR_NAMES .put ("LINE FEED" , 0x000A );
305
- CONTROL_CHAR_NAMES .put ("LINE TABULATION" , 0x000B );
306
- CONTROL_CHAR_NAMES .put ("FORM FEED" , 0x000C );
307
- CONTROL_CHAR_NAMES .put ("CARRIAGE RETURN" , 0x000D );
308
- CONTROL_CHAR_NAMES .put ("SHIFT OUT" , 0x000E );
309
- CONTROL_CHAR_NAMES .put ("SHIFT IN" , 0x000F );
310
- CONTROL_CHAR_NAMES .put ("DATA LINK ESCAPE" , 0x0010 );
311
- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL ONE" , 0x0011 );
312
- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL TWO" , 0x0012 );
313
- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL THREE" , 0x0013 );
314
- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL FOUR" , 0x0014 );
315
- CONTROL_CHAR_NAMES .put ("NEGATIVE ACKNOWLEDGE" , 0x0015 );
316
- CONTROL_CHAR_NAMES .put ("SYNCHRONOUS IDLE" , 0x0016 );
317
- CONTROL_CHAR_NAMES .put ("END OF TRANSMISSION BLOCK" , 0x0017 );
318
- CONTROL_CHAR_NAMES .put ("CANCEL" , 0x0018 );
319
- CONTROL_CHAR_NAMES .put ("END OF MEDIUM" , 0x0019 );
320
- CONTROL_CHAR_NAMES .put ("SUBSTITUTE" , 0x001A );
321
- CONTROL_CHAR_NAMES .put ("ESCAPE" , 0x001B );
322
- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR FOUR" , 0x001C );
323
- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR THREE" , 0x001D );
324
- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR TWO" , 0x001E );
325
- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR ONE" , 0x001F );
326
- }
327
-
328
- @ CompilerDirectives .TruffleBoundary
329
- public static int getCodePoint (String charName ) {
330
- int possibleChar = UCharacter .getCharFromName (charName );
331
- if (possibleChar > -1 ) {
332
- return possibleChar ;
333
- }
334
- possibleChar = UCharacter .getCharFromExtendedName (charName );
335
- if (possibleChar > -1 ) {
336
- return possibleChar ;
337
- }
338
- possibleChar = UCharacter .getCharFromNameAlias (charName );
339
- if (possibleChar > -1 ) {
340
- return possibleChar ;
341
- }
342
- possibleChar = CONTROL_CHAR_NAMES .getOrDefault (charName .toUpperCase (Locale .ROOT ), -1 );
343
- if (possibleChar > -1 ) {
344
- return possibleChar ;
290
+ @ TruffleBoundary
291
+ public static int getCodePoint (String characterName ) {
292
+ // CPython's logic for resolving these character names goes like this:
293
+ // 1) handle Hangul Syllables in region AC00-D7A3
294
+ // 2) handle CJK Ideographs
295
+ // 3) handle character names as given in UnicodeData.txt
296
+ // 4) handle all aliases as given in NameAliases.txt
297
+ // With ICU's UCharacter, we get cases 1), 2) and 3). As for 4), the aliases, ICU only
298
+ // handles aliases of type 'correction'. Therefore, we extract the contents of
299
+ // NameAliases.txt and handle aliases by ourselves.
300
+ String normalizedName = characterName .trim ().toUpperCase (Locale .ROOT );
301
+ if (UnicodeCharacterAliases .CHARACTER_ALIASES .containsKey (normalizedName )) {
302
+ return UnicodeCharacterAliases .CHARACTER_ALIASES .get (normalizedName );
303
+ } else {
304
+ return UCharacter .getCharFromName (characterName );
345
305
}
346
- return -1 ;
347
306
}
348
307
}
0 commit comments