Skip to content

Optimize PHP html_entity_decode function #18092

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 133 additions & 96 deletions ext/standard/html.c
Original file line number Diff line number Diff line change
Expand Up @@ -809,112 +809,149 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
static void traverse_for_entities(
const char *old,
size_t oldlen,
zend_string *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
const char *input,
size_t input_len,
zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
int all,
int flags,
const entity_ht *inv_map,
enum entity_charset charset)
{
const char *p,
*lim;
char *q;
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;

lim = old + oldlen; /* terminator address */
assert(*lim == '\0');

for (p = old, q = ZSTR_VAL(ret); p < lim;) {
unsigned code, code2 = 0;
const char *next = NULL; /* when set, next > p, otherwise possible inf loop */

/* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
* ASCII range byte can be part of a multi-byte sequence.
* However, they start at 0x40, therefore if we find a 0x26 byte,
* we're sure it represents the '&' character. */

/* assumes there are no single-char entities */
if (p[0] != '&' || (p + 3 >= lim)) {
*(q++) = *(p++);
continue;
}

/* now p[3] is surely valid and is no terminator */

/* numerical entity */
if (p[1] == '#') {
next = &p[2];
if (process_numeric_entity(&next, &code) == FAILURE)
goto invalid_code;

/* If we're in htmlspecialchars_decode, we're only decoding entities
const char *current_ptr = input;
const char *input_end = input + input_len; /* terminator address */
char *output_ptr = ZSTR_VAL(output);
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;

assert(*input_end == '\0');

while (current_ptr < input_end) {
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
if (!ampersand_ptr) {
size_t tail_len = input_end - current_ptr;
if (tail_len > 0) {
memcpy(output_ptr, current_ptr, tail_len);
output_ptr += tail_len;
}
break;
}

/* Copy everything up to the found '&' */
size_t chunk_len = ampersand_ptr - current_ptr;
if (chunk_len > 0) {
memcpy(output_ptr, current_ptr, chunk_len);
output_ptr += chunk_len;
}

/* Now current_ptr points to the '&' character. */
current_ptr = ampersand_ptr;

/* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
if (current_ptr + 3 >= input_end) {
*output_ptr++ = *current_ptr++;
continue;
}

unsigned code = 0, code2 = 0;
const char *entity_end_ptr = NULL;
int valid_entity = 1;

if (current_ptr[1] == '#') {
/* Processing numeric entity */
const char *num_start = current_ptr + 2;
entity_end_ptr = num_start;
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
valid_entity = 0;
}
/* If we're in htmlspecialchars_decode, we're only decoding entities
* that represent &, <, >, " and '. Is this one of them? */
if (!all && (code > 63U ||
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
goto invalid_code;

/* are we allowed to decode this entity in this document type?
if (valid_entity && !all &&
(code > 63U ||
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
{
valid_entity = 0;
}
/* are we allowed to decode this entity in this document type?
* HTML 5 is the only that has a character that cannot be used in
* a numeric entity but is allowed literally (U+000D). The
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
if (!unicode_cp_is_allowed(code, doctype) ||
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))
goto invalid_code;
} else {
const char *start;
size_t ent_len;

next = &p[1];
start = next;

if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
goto invalid_code;

if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
&& start[1] == 'p' && start[2] == 'o' && start[3] == 's') {
/* uses html4 inv_map, which doesn't include apos;. This is a
* hack to support it */
code = (unsigned) '\'';
} else {
goto invalid_code;
}
}
}

assert(*next == ';');

if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
/* && code2 == '\0' always true for current maps */)
goto invalid_code;

/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)))
{
valid_entity = 0;
}
} else {
/* Processing named entity */
const char *name_start = current_ptr + 1;
/* Search for ';' */
const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1);
if (!semi_colon_ptr) {
valid_entity = 0;
} else {
size_t name_len = semi_colon_ptr - name_start;
if (name_len == 0) {
valid_entity = 0;
} else {
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
name_start[0] == 'a' && name_start[1] == 'p' &&
name_start[2] == 'o' && name_start[3] == 's')
{
/* uses html4 inv_map, which doesn't include apos;. This is a
* hack to support it */
code = (unsigned)'\'';
} else {
valid_entity = 0;
}
}
entity_end_ptr = semi_colon_ptr;
}
}
}

/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
*output_ptr++ = *current_ptr++;
continue;
}

/* Check if quotes are allowed for entities representing ' or " */
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
{
valid_entity = 0;
}

/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
* the call is needed to ensure the codepoint <= U+00FF) */
if (charset != cs_utf_8) {
/* replace unicode code point */
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
goto invalid_code; /* not representable in target charset */
}

q += write_octet_sequence((unsigned char*)q, charset, code);
if (code2) {
q += write_octet_sequence((unsigned char*)q, charset, code2);
}

/* jump over the valid entity; may go beyond size of buffer; np */
p = next + 1;
continue;

invalid_code:
for (; p < next; p++) {
*(q++) = *p;
}
}

*q = '\0';
ZSTR_LEN(ret) = (size_t)(q - ZSTR_VAL(ret));
if (valid_entity && charset != cs_utf_8) {
/* replace unicode code point */
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
valid_entity = 0;
}

if (valid_entity) {
/* Write the parsed entity into the output buffer */
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
if (code2) {
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
}
/* Move current_ptr past the semicolon */
current_ptr = entity_end_ptr + 1;
} else {
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
if (entity_end_ptr) {
size_t len = entity_end_ptr - current_ptr;
memcpy(output_ptr, current_ptr, len);
output_ptr += len;
current_ptr = entity_end_ptr;
} else {
*output_ptr++ = *current_ptr++;
}
}
}

*output_ptr = '\0';
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
}
/* }}} */

Expand Down
Loading