Skip to content

Commit 55fbf83

Browse files
committed
CR, refactoring, codestyle
1 parent 66f5709 commit 55fbf83

File tree

1 file changed

+133
-139
lines changed

1 file changed

+133
-139
lines changed

ext/standard/html.c

Lines changed: 133 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -809,149 +809,143 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
810810
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811811
static void traverse_for_entities(
812-
const char *input,
813-
size_t input_len,
812+
const zend_string *input,
814813
zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) */
815-
int all,
816-
int flags,
814+
const int all,
815+
const int flags,
817816
const entity_ht *inv_map,
818-
enum entity_charset charset)
817+
const enum entity_charset charset)
819818
{
820-
const char *current_ptr = input;
821-
const char *input_end = input + input_len; /* terminator address */
822-
char *output_ptr = ZSTR_VAL(output);
823-
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
824-
825-
assert(*input_end == '\0');
826-
827-
while (current_ptr < input_end) {
828-
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
829-
if (!ampersand_ptr) {
830-
size_t tail_len = input_end - current_ptr;
831-
if (tail_len > 0) {
832-
memcpy(output_ptr, current_ptr, tail_len);
833-
output_ptr += tail_len;
834-
}
835-
break;
836-
}
837-
838-
/* Copy everything up to the found '&' */
839-
size_t chunk_len = ampersand_ptr - current_ptr;
840-
if (chunk_len > 0) {
841-
memcpy(output_ptr, current_ptr, chunk_len);
842-
output_ptr += chunk_len;
843-
}
844-
845-
/* Now current_ptr points to the '&' character. */
846-
current_ptr = ampersand_ptr;
847-
848-
/* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849-
if (current_ptr + 3 >= input_end) {
850-
*output_ptr++ = *current_ptr++;
851-
continue;
852-
}
853-
854-
unsigned code = 0, code2 = 0;
855-
const char *entity_end_ptr = NULL;
856-
int valid_entity = 1;
857-
858-
if (current_ptr[1] == '#') {
859-
/* Processing numeric entity */
860-
const char *num_start = current_ptr + 2;
861-
entity_end_ptr = num_start;
862-
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
863-
valid_entity = 0;
864-
}
865-
/* If we're in htmlspecialchars_decode, we're only decoding entities
866-
* that represent &, <, >, " and '. Is this one of them? */
867-
if (valid_entity && !all &&
868-
(code > 63U ||
869-
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
870-
{
871-
valid_entity = 0;
872-
}
873-
/* are we allowed to decode this entity in this document type?
874-
* HTML 5 is the only that has a character that cannot be used in
875-
* a numeric entity but is allowed literally (U+000D). The
876-
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
877-
if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
878-
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)))
879-
{
880-
valid_entity = 0;
881-
}
882-
} else {
883-
/* Processing named entity */
884-
const char *name_start = current_ptr + 1;
885-
/* Search for ';' */
886-
const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1);
887-
if (!semi_colon_ptr) {
888-
valid_entity = 0;
889-
} else {
890-
size_t name_len = semi_colon_ptr - name_start;
891-
if (name_len == 0) {
892-
valid_entity = 0;
893-
} else {
894-
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
895-
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896-
name_start[0] == 'a' && name_start[1] == 'p' &&
897-
name_start[2] == 'o' && name_start[3] == 's')
898-
{
899-
/* uses html4 inv_map, which doesn't include apos;. This is a
819+
const char *current_ptr = ZSTR_VAL(input);
820+
const char *input_end = current_ptr + input->len; /* terminator address */
821+
char *output_ptr = ZSTR_VAL(output);
822+
const int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
823+
824+
while (current_ptr < input_end) {
825+
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
826+
if (!ampersand_ptr) {
827+
const size_t tail_len = input_end - current_ptr;
828+
if (tail_len > 0) {
829+
memcpy(output_ptr, current_ptr, tail_len);
830+
output_ptr += tail_len;
831+
}
832+
break;
833+
}
834+
835+
/* Copy everything up to the found '&' */
836+
const size_t chunk_len = ampersand_ptr - current_ptr;
837+
if (chunk_len > 0) {
838+
memcpy(output_ptr, current_ptr, chunk_len);
839+
output_ptr += chunk_len;
840+
}
841+
842+
/* Now current_ptr points to the '&' character. */
843+
current_ptr = ampersand_ptr;
844+
845+
/* If fewer than 4 bytes remain, there isn't enough for an entity – copy the remaining bytes verbatim and stop */
846+
if (input_end - current_ptr < 4){
847+
const size_t remaining = input_end - current_ptr;
848+
memcpy(output_ptr, current_ptr, remaining);
849+
output_ptr += remaining;
850+
break;
851+
}
852+
853+
unsigned code = 0, code2 = 0;
854+
const char *entity_end_ptr = NULL;
855+
bool valid_entity = true;
856+
857+
if (current_ptr[1] == '#') {
858+
/* Processing numeric entity */
859+
const char *num_start = current_ptr + 2;
860+
entity_end_ptr = num_start;
861+
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
862+
valid_entity = false;
863+
} else if (!all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) {
864+
/* If we're in htmlspecialchars_decode, we're only decoding entities
865+
* that represent &, <, >, " and '. Is this one of them? */
866+
valid_entity = false;
867+
} else if (!unicode_cp_is_allowed(code, doctype) ||
868+
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)) {
869+
/* are we allowed to decode this entity in this document type?
870+
* HTML 5 is the only one that has a character that cannot be used in
871+
* a numeric entity but is allowed literally (U+000D). The
872+
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
873+
valid_entity = false;
874+
}
875+
} else {
876+
/* Processing named entity */
877+
const char *name_start = current_ptr + 1;
878+
/* Search for ';' */
879+
const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start);
880+
const char *semi_colon_ptr = memchr(name_start, ';', max_search_len);
881+
if (!semi_colon_ptr) {
882+
valid_entity = false;
883+
} else {
884+
const size_t name_len = semi_colon_ptr - name_start;
885+
if (name_len == 0) {
886+
valid_entity = false;
887+
} else {
888+
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
889+
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
890+
name_start[0] == 'a' && name_start[1] == 'p' &&
891+
name_start[2] == 'o' && name_start[3] == 's')
892+
{
893+
/* uses html4 inv_map, which doesn't include apos;. This is a
900894
* hack to support it */
901-
code = (unsigned)'\'';
902-
} else {
903-
valid_entity = 0;
904-
}
905-
}
906-
entity_end_ptr = semi_colon_ptr;
907-
}
908-
}
909-
}
910-
911-
/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912-
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
913-
*output_ptr++ = *current_ptr++;
914-
continue;
915-
}
916-
917-
/* Check if quotes are allowed for entities representing ' or " */
918-
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
919-
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
920-
{
921-
valid_entity = 0;
922-
}
923-
924-
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
895+
code = (unsigned)'\'';
896+
} else {
897+
valid_entity = false;
898+
}
899+
}
900+
entity_end_ptr = semi_colon_ptr;
901+
}
902+
}
903+
}
904+
905+
/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
906+
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
907+
*output_ptr++ = *current_ptr++;
908+
continue;
909+
}
910+
911+
/* Check if quotes are allowed for entities representing ' or " */
912+
if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
913+
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
914+
{
915+
valid_entity = false;
916+
}
917+
918+
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
925919
* the call is needed to ensure the codepoint <= U+00FF) */
926-
if (valid_entity && charset != cs_utf_8) {
927-
/* replace unicode code point */
928-
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
929-
valid_entity = 0;
930-
}
931-
932-
if (valid_entity) {
933-
/* Write the parsed entity into the output buffer */
934-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
935-
if (code2) {
936-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
937-
}
938-
/* Move current_ptr past the semicolon */
939-
current_ptr = entity_end_ptr + 1;
940-
} else {
941-
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942-
if (entity_end_ptr) {
943-
size_t len = entity_end_ptr - current_ptr;
944-
memcpy(output_ptr, current_ptr, len);
945-
output_ptr += len;
946-
current_ptr = entity_end_ptr;
947-
} else {
948-
*output_ptr++ = *current_ptr++;
949-
}
950-
}
951-
}
952-
953-
*output_ptr = '\0';
954-
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
920+
if (valid_entity && charset != cs_utf_8) {
921+
/* replace unicode code point */
922+
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
923+
valid_entity = false;
924+
}
925+
926+
if (valid_entity) {
927+
/* Write the parsed entity into the output buffer */
928+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
929+
if (code2) {
930+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
931+
}
932+
/* Move current_ptr past the semicolon */
933+
current_ptr = entity_end_ptr + 1;
934+
} else {
935+
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
936+
if (entity_end_ptr) {
937+
const size_t len = entity_end_ptr - current_ptr;
938+
memcpy(output_ptr, current_ptr, len);
939+
output_ptr += len;
940+
current_ptr = entity_end_ptr;
941+
} else {
942+
*output_ptr++ = *current_ptr++;
943+
}
944+
}
945+
}
946+
947+
*output_ptr = '\0';
948+
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
955949
}
956950
/* }}} */
957951

@@ -1036,7 +1030,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
10361030
inverse_map = unescape_inverse_map(all, flags);
10371031

10381032
/* replace numeric entities */
1039-
traverse_for_entities(ZSTR_VAL(str), ZSTR_LEN(str), ret, all, flags, inverse_map, charset);
1033+
traverse_for_entities(str, ret, all, flags, inverse_map, charset);
10401034

10411035
return ret;
10421036
}

0 commit comments

Comments
 (0)