Skip to content

Commit 75bc970

Browse files
committed
Optimization for htmlspecialchars function.
Add a dedicated php_htmlspecialchars function to replace the “universal” php_escape_html_entities_ex. Because we work with ASCII-compatible encodings, we can employ byte-by-byte scanning and a lookup table to identify special characters. For c < 0x80, the lookup table is used; for potentially multi-byte characters, we continue to rely on get_next_char. This approach provides a noticeable performance improvement for ASCII strings and some improvement for multi-byte strings due to more optimized logic.
1 parent e954bf6 commit 75bc970

File tree

1 file changed

+257
-1
lines changed

1 file changed

+257
-1
lines changed

ext/standard/html.c

Lines changed: 257 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@
7474
#define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD)
7575
#define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD)
7676

77+
/* Lookup table for php_htmlspecialchars.
 *
 * Both arrays are indexed by byte value:
 *   entity[c]     - replacement entity for byte c, or NULL when the byte is
 *                   copied through unchanged;
 *   entity_len[c] - length in bytes of entity[c] (0 when entity[c] is NULL).
 *
 * The entries point at string literals, hence the const-qualified pointers.
 * `unsigned short` is spelled out because `ushort` is not a standard C type. */
typedef struct {
	const char *entity[256];
	unsigned short entity_len[256];
} htmlspecialchars_lut;
82+
7783
/* {{{ get_default_charset */
7884
static char *get_default_charset(void) {
7985
if (PG(internal_encoding) && PG(internal_encoding)[0]) {
@@ -752,6 +758,60 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c
752758
}
753759
/* }}} */
754760

761+
/* {{{ is_codepoint_allowed */
762+
static inline zend_bool is_codepoint_allowed(
763+
unsigned int cp, /* The codepoint to check */
764+
enum entity_charset charset, /* Current charset */
765+
int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */
766+
const enc_to_uni* to_uni_table /* Mapping table if needed */
767+
) {
768+
// If charset is Unicode-compatible, the code point is used as-is
769+
if (CHARSET_UNICODE_COMPAT(charset)) {
770+
return unicode_cp_is_allowed(cp, doctype);
771+
}
772+
// If we have a mapping table (i.e., a non-UTF charset)
773+
if (to_uni_table) {
774+
map_to_unicode(cp, to_uni_table, &cp);
775+
return unicode_cp_is_allowed(cp, doctype);
776+
}
777+
778+
if (cp <= 0x7D) {
779+
return unicode_cp_is_allowed(cp, doctype);
780+
}
781+
782+
return 1;
783+
}
784+
/* }}} */
785+
786+
/* {{{ init_htmlspecialchars_lut */
787+
static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags, const int doctype) {
788+
memset(lut, 0, sizeof(*lut));
789+
790+
lut->entity['&'] = "&amp;";
791+
lut->entity['>'] = "&gt;";
792+
lut->entity['<'] = "&lt;";
793+
lut->entity_len['&'] = 5;
794+
lut->entity_len['>'] = 4;
795+
lut->entity_len['<'] = 4;
796+
797+
if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE) {
798+
lut->entity['"'] = "&quot;";
799+
lut->entity_len['"'] = 6;
800+
}
801+
802+
if (flags & ENT_QUOTES & ENT_HTML_QUOTE_SINGLE) {
803+
char* apos = "&#039;";
804+
if (doctype != ENT_HTML401) {
805+
if (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5)) {
806+
apos = "&apos;";
807+
}
808+
}
809+
lut->entity['\''] = apos;
810+
lut->entity_len['\''] = 6;
811+
}
812+
}
813+
/* }}} */
814+
755815
static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) {
756816
/* code is not necessarily a unicode code point */
757817
switch (charset) {
@@ -1304,6 +1364,179 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t
13041364
}
13051365
/* }}} */
13061366

1367+
/* {{{ php_htmlspecialchars */
1368+
PHPAPI zend_string* php_htmlspecialchars_ex(
1369+
const zend_string* input, const int flags,
1370+
const char* hint_charset, const bool double_encode,
1371+
const bool quiet
1372+
) {
1373+
const entity_ht* inv_map = NULL;
1374+
htmlspecialchars_lut lut;
1375+
const int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
1376+
1377+
const size_t initial_size = (ZSTR_LEN(input) < 64)
1378+
? 256
1379+
: zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars");
1380+
zend_string* output = zend_string_alloc(initial_size, 0);
1381+
1382+
size_t free_space = initial_size;
1383+
char* output_ptr = ZSTR_VAL(output);
1384+
const char* input_ptr = ZSTR_VAL(input);
1385+
const char* input_end = input_ptr + input->len;
1386+
1387+
const enum entity_charset charset = determine_charset(hint_charset, quiet);
1388+
const enc_to_uni* to_uni_table = NULL;
1389+
if (!CHARSET_UNICODE_COMPAT(charset)) {
1390+
to_uni_table = enc_to_uni_index[charset];
1391+
}
1392+
1393+
/* Replacement for invalid characters and byte sequences */
1394+
const unsigned char* replacement = NULL;
1395+
size_t replacement_len = 0;
1396+
if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
1397+
if (charset == cs_utf_8) {
1398+
replacement = (const unsigned char*)"\xEF\xBF\xBD";
1399+
replacement_len = sizeof("\xEF\xBF\xBD") - 1;
1400+
} else {
1401+
replacement = (const unsigned char*)"&#xFFFD;";
1402+
replacement_len = sizeof("&#xFFFD;") - 1;
1403+
}
1404+
}
1405+
1406+
init_htmlspecialchars_lut(&lut, flags, doctype);
1407+
1408+
if (!double_encode) {
1409+
inv_map = unescape_inverse_map(1, flags);
1410+
}
1411+
1412+
while (input_ptr < input_end) {
1413+
const unsigned char c = *input_ptr;
1414+
/* ASCII chars */
1415+
if (c < 0x80) {
1416+
/* Handle HTML entities */
1417+
if (c == '&' && !double_encode) {
1418+
const char* semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr));
1419+
if (semicolon) {
1420+
const size_t candidate_len = semicolon - (const char*)input_ptr + 1;
1421+
unsigned dummy1, dummy2;
1422+
1423+
/* Named entity */
1424+
if (resolve_named_entity_html((const char*)input_ptr + 1, candidate_len - 2, inv_map, &dummy1,
1425+
&dummy2) == SUCCESS) {
1426+
memcpy(output_ptr, input_ptr, candidate_len);
1427+
output_ptr += candidate_len;
1428+
input_ptr += candidate_len;
1429+
free_space -= candidate_len;
1430+
goto ensure_memory;
1431+
}
1432+
1433+
/* Numeric entity */
1434+
if (input_ptr[1] == '#') {
1435+
unsigned code_point;
1436+
char* start = (char*)input_ptr + 2;
1437+
const int valid = process_numeric_entity((const char**)&start, &code_point);
1438+
if (valid == SUCCESS && start == semicolon) {
1439+
if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) ||
1440+
numeric_entity_is_allowed(code_point, doctype)) {
1441+
memcpy(output_ptr, input_ptr, candidate_len);
1442+
output_ptr += candidate_len;
1443+
input_ptr += candidate_len;
1444+
free_space -= candidate_len;
1445+
goto ensure_memory;
1446+
}
1447+
}
1448+
}
1449+
}
1450+
1451+
/* Invalid entity */
1452+
memcpy(output_ptr, "&amp;", 5);
1453+
output_ptr += 5;
1454+
free_space -= 5;
1455+
input_ptr++;
1456+
goto ensure_memory;
1457+
}
1458+
1459+
/* Check disallowed chars */
1460+
if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
1461+
if (!is_codepoint_allowed(c, charset, doctype, NULL)) {
1462+
memcpy(output_ptr, replacement, replacement_len);
1463+
output_ptr += replacement_len;
1464+
free_space -= replacement_len;
1465+
input_ptr++;
1466+
goto ensure_memory;
1467+
}
1468+
}
1469+
1470+
/* Use lookup table for fast replace */
1471+
if (lut.entity[c]) {
1472+
const size_t entity_len = lut.entity_len[c];
1473+
memcpy(output_ptr, lut.entity[c], entity_len);
1474+
output_ptr += entity_len;
1475+
free_space -= entity_len;
1476+
} else {
1477+
*output_ptr++ = c;
1478+
free_space--;
1479+
}
1480+
1481+
input_ptr++;
1482+
} else {
1483+
/* Multibyte chars */
1484+
zend_result status;
1485+
const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input);
1486+
size_t cursor = original_pos;
1487+
const unsigned int this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
1488+
&cursor, &status);
1489+
const size_t processed_len = cursor - original_pos;
1490+
1491+
if (status == FAILURE) {
1492+
if (flags & ENT_HTML_IGNORE_ERRORS) {
1493+
input_ptr += processed_len;
1494+
continue;
1495+
}
1496+
if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
1497+
memcpy(output_ptr, replacement, replacement_len);
1498+
output_ptr += replacement_len;
1499+
free_space -= replacement_len;
1500+
input_ptr += processed_len;
1501+
} else {
1502+
zend_string_release(output);
1503+
return ZSTR_EMPTY_ALLOC();
1504+
}
1505+
} else {
1506+
/* Check disallowed chars */
1507+
const unsigned char* sequence = (unsigned char*)input_ptr;
1508+
size_t sequence_len = processed_len;
1509+
1510+
if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
1511+
if (!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) {
1512+
sequence = replacement;
1513+
sequence_len = replacement_len;
1514+
}
1515+
}
1516+
1517+
memcpy(output_ptr, sequence, sequence_len);
1518+
output_ptr += sequence_len;
1519+
free_space -= sequence_len;
1520+
input_ptr += processed_len;
1521+
}
1522+
}
1523+
1524+
ensure_memory:
1525+
if (free_space < 128) {
1526+
const size_t used = ZSTR_LEN(output) - free_space;
1527+
const size_t new_size = used + 1024;
1528+
output = zend_string_realloc(output, new_size, 0);
1529+
output_ptr = ZSTR_VAL(output) + used;
1530+
free_space = new_size - used;
1531+
}
1532+
}
1533+
1534+
*output_ptr = '\0';
1535+
ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output));
1536+
return output;
1537+
}
1538+
/* }}} */
1539+
13071540
/* {{{ php_html_entities */
13081541
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
13091542
{
@@ -1327,10 +1560,33 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
13271560
}
13281561
/* }}} */
13291562

1563+
/* {{{ php_html_entities */
1564+
static void php_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS)
1565+
{
1566+
zend_string *str, *hint_charset = NULL;
1567+
zend_long flags = ENT_QUOTES|ENT_SUBSTITUTE;
1568+
zend_string *replaced;
1569+
bool double_encode = 1;
1570+
1571+
ZEND_PARSE_PARAMETERS_START(1, 4)
1572+
Z_PARAM_STR(str)
1573+
Z_PARAM_OPTIONAL
1574+
Z_PARAM_LONG(flags)
1575+
Z_PARAM_STR_OR_NULL(hint_charset)
1576+
Z_PARAM_BOOL(double_encode);
1577+
ZEND_PARSE_PARAMETERS_END();
1578+
1579+
replaced = php_htmlspecialchars_ex(
1580+
str, (int) flags,
1581+
hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode, /* quiet */ 0);
1582+
RETVAL_STR(replaced);
1583+
}
1584+
/* }}} */
1585+
13301586
/* {{{ Convert special characters to HTML entities */
13311587
PHP_FUNCTION(htmlspecialchars)
13321588
{
1333-
php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1589+
/* Forwards the call's arguments unchanged to php_htmlspecialchars(). */
php_htmlspecialchars(INTERNAL_FUNCTION_PARAM_PASSTHRU);
13341590
}
13351591
/* }}} */
13361592

0 commit comments

Comments
 (0)