Skip to content

Commit 63a9484

Browse files
committed
validate_utf8_char
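Add a dedicated validate_utf8_char() function for decoding and validating multi-byte UTF-8 characters. php_htmlspecialchars_ex() now calls it directly when the charset is cs_utf_8 and keeps using get_next_char() for every other charset. The intended speedup comes from more straightforward conditional logic and bitwise operations on the UTF-8 path.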
1 parent 914fa23 commit 63a9484

File tree

1 file changed

+137
-9
lines changed

1 file changed

+137
-9
lines changed

ext/standard/html.c

Lines changed: 137 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,126 @@ static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags
812812
}
813813
/* }}} */
814814

815+
static unsigned int validate_utf8_char(
816+
const unsigned char *str,
817+
const size_t str_len,
818+
size_t* cursor,
819+
zend_result* status
820+
) {
821+
const size_t pos = *cursor;
822+
*status = SUCCESS;
823+
const size_t tail_len = str_len - pos;
824+
825+
/* Check if at least 1 byte is available */
826+
if (tail_len < 1) {
827+
MB_FAILURE(pos, 1);
828+
}
829+
830+
const unsigned char c = str[pos];
831+
832+
/* ASCII (single byte) */
833+
if (c < 0x80) {
834+
*cursor = pos + 1;
835+
return c;
836+
}
837+
838+
/* Leading byte < 0xC2 => invalid multibyte start */
839+
if (c < 0xC2) {
840+
MB_FAILURE(pos, 1);
841+
}
842+
843+
/* 2-byte sequence (0xC2..0xDF) */
844+
if (c < 0xE0) {
845+
/* Need 2 bytes total */
846+
if (tail_len < 2) {
847+
MB_FAILURE(pos, 1);
848+
}
849+
const unsigned char b2 = str[pos + 1];
850+
851+
/* Check continuation byte 10xxxxxx */
852+
if ((b2 & 0xC0) != 0x80) {
853+
MB_FAILURE(pos, ((b2 < 0x80) || (b2 >= 0xC2 && b2 <= 0xF4)) ? 1 : 2);
854+
}
855+
856+
/* Combine bits into code point and check range >= 0x80 */
857+
const unsigned int cp = ((c & 0x1F) << 6) | (b2 & 0x3F);
858+
if (cp < 0x80) {
859+
MB_FAILURE(pos, 2);
860+
}
861+
862+
*cursor = pos + 2;
863+
return cp;
864+
}
865+
866+
/* 3-byte sequence (0xE0..0xEF) */
867+
if (c < 0xF0) {
868+
/* Need 3 bytes total and valid continuation bytes */
869+
if (tail_len < 3 ||
870+
((str[pos + 1] & 0xC0) != 0x80) ||
871+
((str[pos + 2] & 0xC0) != 0x80)) {
872+
if (tail_len < 2 ||
873+
((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) {
874+
MB_FAILURE(pos, 1);
875+
} else if (tail_len < 3 ||
876+
((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) {
877+
MB_FAILURE(pos, 2);
878+
} else {
879+
MB_FAILURE(pos, 3);
880+
}
881+
}
882+
883+
/* Combine bits and check for >= 0x800 and not in surrogate area */
884+
const unsigned int cp = ((c & 0x0F) << 12)
885+
| ((str[pos + 1] & 0x3F) << 6)
886+
| (str[pos + 2] & 0x3F);
887+
888+
if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
889+
MB_FAILURE(pos, 3);
890+
}
891+
892+
*cursor = pos + 3;
893+
return cp;
894+
}
895+
896+
/* 4-byte sequence (0xF0..0xF4) */
897+
if (c < 0xF5) {
898+
/* Need 4 bytes total and valid continuation bytes */
899+
if (tail_len < 4 ||
900+
((str[pos + 1] & 0xC0) != 0x80) ||
901+
((str[pos + 2] & 0xC0) != 0x80) ||
902+
((str[pos + 3] & 0xC0) != 0x80)) {
903+
if (tail_len < 2 ||
904+
((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) {
905+
MB_FAILURE(pos, 1);
906+
} else if (tail_len < 3 ||
907+
((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) {
908+
MB_FAILURE(pos, 2);
909+
} else if (tail_len < 4 ||
910+
((str[pos + 3] < 0x80) || (str[pos + 3] >= 0xC2 && str[pos + 3] <= 0xF4))) {
911+
MB_FAILURE(pos, 3);
912+
} else {
913+
MB_FAILURE(pos, 4);
914+
}
915+
}
916+
917+
/* Combine bits and check range 0x10000..0x10FFFF */
918+
const unsigned int cp = ((c & 0x07) << 18)
919+
| ((str[pos + 1] & 0x3F) << 12)
920+
| ((str[pos + 2] & 0x3F) << 6)
921+
| (str[pos + 3] & 0x3F);
922+
923+
if (cp < 0x10000 || cp > 0x10FFFF) {
924+
MB_FAILURE(pos, 4);
925+
}
926+
927+
*cursor = pos + 4;
928+
return cp;
929+
}
930+
931+
/* Leading byte >= 0xF5 is invalid */
932+
MB_FAILURE(pos, 1);
933+
}
934+
815935
static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) {
816936
/* code is not necessarily a unicode code point */
817937
switch (charset) {
@@ -1478,15 +1598,23 @@ PHPAPI zend_string* php_htmlspecialchars_ex(
14781598
free_space--;
14791599
}
14801600

1481-
input_ptr++;
1482-
} else {
1483-
/* Multibyte chars */
1484-
zend_result status;
1485-
const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input);
1486-
size_t cursor = original_pos;
1487-
const unsigned int this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
1488-
&cursor, &status);
1489-
const size_t processed_len = cursor - original_pos;
1601+
input_ptr++;
1602+
} else {
1603+
/* Multibyte chars */
1604+
zend_result status;
1605+
const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input);
1606+
size_t cursor = original_pos;
1607+
1608+
unsigned int this_char = 0;
1609+
if (charset == cs_utf_8) {
1610+
this_char = validate_utf8_char((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
1611+
&cursor, &status);
1612+
} else {
1613+
this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
1614+
&cursor, &status);
1615+
}
1616+
1617+
const size_t processed_len = cursor - original_pos;
14901618

14911619
if (status == FAILURE) {
14921620
if (flags & ENT_HTML_IGNORE_ERRORS) {

0 commit comments

Comments
 (0)