74
74
/*
 * sjis_lead(c): non-zero when c is below 0xFD and is neither 0x80 nor 0xA0.
 * sjis_trail(c): non-zero when c is in 0x40..0xFC and is not 0x7F.
 * NOTE(review): presumably the lead/trail byte ranges of Shift_JIS; the
 * callers are outside this view.
 *
 * The parameter list must follow the macro name without a space, otherwise
 * the preprocessor defines an object-like macro. The argument is evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD)
#define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD)
76
76
77
/*
 * Lookup table for php_htmlspecialchars.
 *
 * Both arrays are indexed by byte value (0..255):
 *   entity[b]     - replacement text for byte b, or NULL when b is copied
 *                   through unchanged
 *   entity_len[b] - number of bytes to copy from entity[b]
 *
 * The lengths use plain `unsigned short` rather than the non-standard
 * `ushort` typedef, which is not available on every toolchain.
 */
typedef struct {
	const char *entity[256];
	unsigned short entity_len[256];
} htmlspecialchars_lut;
82
+
77
83
/* {{{ get_default_charset */
78
84
static char * get_default_charset (void ) {
79
85
if (PG (internal_encoding ) && PG (internal_encoding )[0 ]) {
@@ -752,6 +758,60 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c
752
758
}
753
759
/* }}} */
754
760
761
+ /* {{{ is_codepoint_allowed */
762
+ static inline zend_bool is_codepoint_allowed (
763
+ unsigned int cp , /* The codepoint to check */
764
+ enum entity_charset charset , /* Current charset */
765
+ int doctype , /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */
766
+ const enc_to_uni * to_uni_table /* Mapping table if needed */
767
+ ) {
768
+ // If charset is Unicode-compatible, the code point is used as-is
769
+ if (CHARSET_UNICODE_COMPAT (charset )) {
770
+ return unicode_cp_is_allowed (cp , doctype );
771
+ }
772
+ // If we have a mapping table (i.e., a non-UTF charset)
773
+ if (to_uni_table ) {
774
+ map_to_unicode (cp , to_uni_table , & cp );
775
+ return unicode_cp_is_allowed (cp , doctype );
776
+ }
777
+
778
+ if (cp <= 0x7D ) {
779
+ return unicode_cp_is_allowed (cp , doctype );
780
+ }
781
+
782
+ return 1 ;
783
+ }
784
+ /* }}} */
785
+
786
+ /* {{{ init_htmlspecialchars_lut */
787
+ static void init_htmlspecialchars_lut (htmlspecialchars_lut * lut , const int flags , const int doctype ) {
788
+ memset (lut , 0 , sizeof (* lut ));
789
+
790
+ lut -> entity ['&' ] = "&" ;
791
+ lut -> entity ['>' ] = ">" ;
792
+ lut -> entity ['<' ] = "<" ;
793
+ lut -> entity_len ['&' ] = 5 ;
794
+ lut -> entity_len ['>' ] = 4 ;
795
+ lut -> entity_len ['<' ] = 4 ;
796
+
797
+ if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE ) {
798
+ lut -> entity ['"' ] = """ ;
799
+ lut -> entity_len ['"' ] = 6 ;
800
+ }
801
+
802
+ if (flags & ENT_QUOTES & ENT_HTML_QUOTE_SINGLE ) {
803
+ char * apos = "'" ;
804
+ if (doctype != ENT_HTML401 ) {
805
+ if (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5 )) {
806
+ apos = "'" ;
807
+ }
808
+ }
809
+ lut -> entity ['\'' ] = apos ;
810
+ lut -> entity_len ['\'' ] = 6 ;
811
+ }
812
+ }
813
+ /* }}} */
814
+
755
815
static inline size_t write_octet_sequence (unsigned char * buf , enum entity_charset charset , unsigned code ) {
756
816
/* code is not necessarily a unicode code point */
757
817
switch (charset ) {
@@ -1304,6 +1364,179 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t
1304
1364
}
1305
1365
/* }}} */
1306
1366
1367
+ /* {{{ php_htmlspecialchars */
1368
+ PHPAPI zend_string * php_htmlspecialchars_ex (
1369
+ const zend_string * input , const int flags ,
1370
+ const char * hint_charset , const bool double_encode ,
1371
+ const bool quiet
1372
+ ) {
1373
+ const entity_ht * inv_map = NULL ;
1374
+ htmlspecialchars_lut lut ;
1375
+ const int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
1376
+
1377
+ const size_t initial_size = (ZSTR_LEN (input ) < 64 )
1378
+ ? 256
1379
+ : zend_safe_addmult (ZSTR_LEN (input ), 2 , 0 , "htmlspecialchars" );
1380
+ zend_string * output = zend_string_alloc (initial_size , 0 );
1381
+
1382
+ size_t free_space = initial_size ;
1383
+ char * output_ptr = ZSTR_VAL (output );
1384
+ const char * input_ptr = ZSTR_VAL (input );
1385
+ const char * input_end = input_ptr + input -> len ;
1386
+
1387
+ const enum entity_charset charset = determine_charset (hint_charset , quiet );
1388
+ const enc_to_uni * to_uni_table = NULL ;
1389
+ if (!CHARSET_UNICODE_COMPAT (charset )) {
1390
+ to_uni_table = enc_to_uni_index [charset ];
1391
+ }
1392
+
1393
+ /* Replacement for invalid characters and byte sequences */
1394
+ const unsigned char * replacement = NULL ;
1395
+ size_t replacement_len = 0 ;
1396
+ if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS )) {
1397
+ if (charset == cs_utf_8 ) {
1398
+ replacement = (const unsigned char * )"\xEF\xBF\xBD" ;
1399
+ replacement_len = sizeof ("\xEF\xBF\xBD" ) - 1 ;
1400
+ } else {
1401
+ replacement = (const unsigned char * )"�" ;
1402
+ replacement_len = sizeof ("�" ) - 1 ;
1403
+ }
1404
+ }
1405
+
1406
+ init_htmlspecialchars_lut (& lut , flags , doctype );
1407
+
1408
+ if (!double_encode ) {
1409
+ inv_map = unescape_inverse_map (1 , flags );
1410
+ }
1411
+
1412
+ while (input_ptr < input_end ) {
1413
+ const unsigned char c = * input_ptr ;
1414
+ /* ASCII chars */
1415
+ if (c < 0x80 ) {
1416
+ /* Handle HTML entities */
1417
+ if (c == '&' && !double_encode ) {
1418
+ const char * semicolon = memchr (input_ptr , ';' , MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - input_ptr ));
1419
+ if (semicolon ) {
1420
+ const size_t candidate_len = semicolon - (const char * )input_ptr + 1 ;
1421
+ unsigned dummy1 , dummy2 ;
1422
+
1423
+ /* Named entity */
1424
+ if (resolve_named_entity_html ((const char * )input_ptr + 1 , candidate_len - 2 , inv_map , & dummy1 ,
1425
+ & dummy2 ) == SUCCESS ) {
1426
+ memcpy (output_ptr , input_ptr , candidate_len );
1427
+ output_ptr += candidate_len ;
1428
+ input_ptr += candidate_len ;
1429
+ free_space -= candidate_len ;
1430
+ goto ensure_memory ;
1431
+ }
1432
+
1433
+ /* Numeric entity */
1434
+ if (input_ptr [1 ] == '#' ) {
1435
+ unsigned code_point ;
1436
+ char * start = (char * )input_ptr + 2 ;
1437
+ const int valid = process_numeric_entity ((const char * * )& start , & code_point );
1438
+ if (valid == SUCCESS && start == semicolon ) {
1439
+ if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS ) ||
1440
+ numeric_entity_is_allowed (code_point , doctype )) {
1441
+ memcpy (output_ptr , input_ptr , candidate_len );
1442
+ output_ptr += candidate_len ;
1443
+ input_ptr += candidate_len ;
1444
+ free_space -= candidate_len ;
1445
+ goto ensure_memory ;
1446
+ }
1447
+ }
1448
+ }
1449
+ }
1450
+
1451
+ /* Invalid entity */
1452
+ memcpy (output_ptr , "&" , 5 );
1453
+ output_ptr += 5 ;
1454
+ free_space -= 5 ;
1455
+ input_ptr ++ ;
1456
+ goto ensure_memory ;
1457
+ }
1458
+
1459
+ /* Check disallowed chars */
1460
+ if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS ) {
1461
+ if (!is_codepoint_allowed (c , charset , doctype , NULL )) {
1462
+ memcpy (output_ptr , replacement , replacement_len );
1463
+ output_ptr += replacement_len ;
1464
+ free_space -= replacement_len ;
1465
+ input_ptr ++ ;
1466
+ goto ensure_memory ;
1467
+ }
1468
+ }
1469
+
1470
+ /* Use lookup table for fast replace */
1471
+ if (lut .entity [c ]) {
1472
+ const size_t entity_len = lut .entity_len [c ];
1473
+ memcpy (output_ptr , lut .entity [c ], entity_len );
1474
+ output_ptr += entity_len ;
1475
+ free_space -= entity_len ;
1476
+ } else {
1477
+ * output_ptr ++ = c ;
1478
+ free_space -- ;
1479
+ }
1480
+
1481
+ input_ptr ++ ;
1482
+ } else {
1483
+ /* Multibyte chars */
1484
+ zend_result status ;
1485
+ const size_t original_pos = (const char * )input_ptr - ZSTR_VAL (input );
1486
+ size_t cursor = original_pos ;
1487
+ const unsigned int this_char = get_next_char (charset , (unsigned char * )ZSTR_VAL (input ), ZSTR_LEN (input ),
1488
+ & cursor , & status );
1489
+ const size_t processed_len = cursor - original_pos ;
1490
+
1491
+ if (status == FAILURE ) {
1492
+ if (flags & ENT_HTML_IGNORE_ERRORS ) {
1493
+ input_ptr += processed_len ;
1494
+ continue ;
1495
+ }
1496
+ if (flags & ENT_HTML_SUBSTITUTE_ERRORS ) {
1497
+ memcpy (output_ptr , replacement , replacement_len );
1498
+ output_ptr += replacement_len ;
1499
+ free_space -= replacement_len ;
1500
+ input_ptr += processed_len ;
1501
+ } else {
1502
+ zend_string_release (output );
1503
+ return ZSTR_EMPTY_ALLOC ();
1504
+ }
1505
+ } else {
1506
+ /* Check disallowed chars */
1507
+ const unsigned char * sequence = (unsigned char * )input_ptr ;
1508
+ size_t sequence_len = processed_len ;
1509
+
1510
+ if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS ) {
1511
+ if (!is_codepoint_allowed (this_char , charset , doctype , to_uni_table )) {
1512
+ sequence = replacement ;
1513
+ sequence_len = replacement_len ;
1514
+ }
1515
+ }
1516
+
1517
+ memcpy (output_ptr , sequence , sequence_len );
1518
+ output_ptr += sequence_len ;
1519
+ free_space -= sequence_len ;
1520
+ input_ptr += processed_len ;
1521
+ }
1522
+ }
1523
+
1524
+ ensure_memory :
1525
+ if (free_space < 128 ) {
1526
+ const size_t used = ZSTR_LEN (output ) - free_space ;
1527
+ const size_t new_size = used + 1024 ;
1528
+ output = zend_string_realloc (output , new_size , 0 );
1529
+ output_ptr = ZSTR_VAL (output ) + used ;
1530
+ free_space = new_size - used ;
1531
+ }
1532
+ }
1533
+
1534
+ * output_ptr = '\0' ;
1535
+ ZSTR_LEN (output ) = (output_ptr - ZSTR_VAL (output ));
1536
+ return output ;
1537
+ }
1538
+ /* }}} */
1539
+
1307
1540
/* {{{ php_html_entities */
1308
1541
static void php_html_entities (INTERNAL_FUNCTION_PARAMETERS , int all )
1309
1542
{
@@ -1327,10 +1560,33 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1327
1560
}
1328
1561
/* }}} */
1329
1562
1563
+ /* {{{ php_html_entities */
1564
+ static void php_htmlspecialchars (INTERNAL_FUNCTION_PARAMETERS )
1565
+ {
1566
+ zend_string * str , * hint_charset = NULL ;
1567
+ zend_long flags = ENT_QUOTES |ENT_SUBSTITUTE ;
1568
+ zend_string * replaced ;
1569
+ bool double_encode = 1 ;
1570
+
1571
+ ZEND_PARSE_PARAMETERS_START (1 , 4 )
1572
+ Z_PARAM_STR (str )
1573
+ Z_PARAM_OPTIONAL
1574
+ Z_PARAM_LONG (flags )
1575
+ Z_PARAM_STR_OR_NULL (hint_charset )
1576
+ Z_PARAM_BOOL (double_encode );
1577
+ ZEND_PARSE_PARAMETERS_END ();
1578
+
1579
+ replaced = php_htmlspecialchars_ex (
1580
+ str , (int ) flags ,
1581
+ hint_charset ? ZSTR_VAL (hint_charset ) : NULL, double_encode , /* quiet */ 0 );
1582
+ RETVAL_STR (replaced );
1583
+ }
1584
+ /* }}} */
1585
+
1330
1586
/* {{{ Convert special characters to HTML entities */
1331
1587
PHP_FUNCTION (htmlspecialchars )
1332
1588
{
1333
- php_html_entities (INTERNAL_FUNCTION_PARAM_PASSTHRU , 0 );
1589
+ php_htmlspecialchars (INTERNAL_FUNCTION_PARAM_PASSTHRU );
1334
1590
}
1335
1591
/* }}} */
1336
1592
0 commit comments