Skip to content

Commit 27a1bec

Browse files
committed
Split out a specialized function to decode multibyte UTF-8 sequences
Decoding purely multibyte UTF-8 is common, for example in the case of JSON. Furthermore, we want to avoid the switch on the character set in such hot code. Finally, we also add UNEXPECTED markers to move the error-handling code to the cold section, which reduces pressure on the µop and instruction caches.
1 parent 7478517 commit 27a1bec

File tree

3 files changed

+97
-62
lines changed

3 files changed

+97
-62
lines changed

UPGRADING.INTERNALS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ PHP 8.5 INTERNALS UPGRADE NOTES
5555
is still valid. This is useful when a GC cycle is collected and the
5656
database object can be destroyed prior to destroying the statement.
5757

58+
- ext/standard
59+
. Added `php_next_utf8_char_mb()` to decode the next UTF-8 multibyte
60+
codepoint (i.e. >= 2 bytes). It takes no status argument; failure is signalled by a return value of 0.
61+
5862
========================
5963
4. OpCode changes
6064
========================

ext/standard/html.c

Lines changed: 92 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,16 @@
5353
(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
5454
} while (0)
5555

56-
#define MB_FAILURE(pos, advance) do { \
56+
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
5757
*cursor = pos + (advance); \
58-
*status = FAILURE; \
5958
return 0; \
6059
} while (0)
6160

61+
#define MB_FAILURE(pos, advance) do { \
62+
*status = FAILURE; \
63+
MB_FAILURE_NO_STATUS(pos, advance); \
64+
} while (0)
65+
6266
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
6367

6468
/* valid as single byte character or leading byte */
@@ -85,6 +89,87 @@ static char *get_default_charset(void) {
8589
}
8690
/* }}} */
8791

92+
/* Decodes the next UTF-8 multibyte codepoint (i.e. >= 2 bytes).
93+
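* Uses `c` as the leading byte. On an invalid sequence, returns 0 (never a valid multibyte codepoint) and still advances *cursor past the rejected bytes. */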
94+
PHPAPI unsigned int php_next_utf8_char_mb(
95+
const unsigned char *str,
96+
unsigned char c,
97+
size_t str_len,
98+
size_t *cursor)
99+
{
100+
size_t pos = *cursor;
101+
unsigned int this_char = 0;
102+
103+
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
104+
* "In a reported illegal byte sequence, do not include any
105+
* non-initial byte that encodes a valid character or is a leading
106+
* byte for a valid sequence." */
107+
108+
ZEND_ASSERT(c >= 0x80);
109+
110+
if (UNEXPECTED(c < 0xc2)) {
111+
MB_FAILURE_NO_STATUS(pos, 1);
112+
} else if (c < 0xe0) {
113+
if (UNEXPECTED(!CHECK_LEN(pos, 2)))
114+
MB_FAILURE_NO_STATUS(pos, 1);
115+
116+
if (UNEXPECTED(!utf8_trail(str[pos + 1]))) {
117+
MB_FAILURE_NO_STATUS(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
118+
}
119+
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
120+
if (UNEXPECTED(this_char < 0x80)) { /* non-shortest form */
121+
MB_FAILURE_NO_STATUS(pos, 2);
122+
}
123+
pos += 2;
124+
} else if (c < 0xf0) {
125+
size_t avail = str_len - pos;
126+
127+
if (UNEXPECTED(avail < 3 ||
128+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]))) {
129+
if (avail < 2 || utf8_lead(str[pos + 1]))
130+
MB_FAILURE_NO_STATUS(pos, 1);
131+
else if (avail < 3 || utf8_lead(str[pos + 2]))
132+
MB_FAILURE_NO_STATUS(pos, 2);
133+
else
134+
MB_FAILURE_NO_STATUS(pos, 3);
135+
}
136+
137+
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
138+
if (UNEXPECTED(this_char < 0x800)) { /* non-shortest form */
139+
MB_FAILURE_NO_STATUS(pos, 3);
140+
} else if (UNEXPECTED(this_char >= 0xd800 && this_char <= 0xdfff)) { /* surrogate */
141+
MB_FAILURE_NO_STATUS(pos, 3);
142+
}
143+
pos += 3;
144+
} else if (c < 0xf5) {
145+
size_t avail = str_len - pos;
146+
147+
if (UNEXPECTED(avail < 4 ||
148+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
149+
!utf8_trail(str[pos + 3]))) {
150+
if (avail < 2 || utf8_lead(str[pos + 1]))
151+
MB_FAILURE_NO_STATUS(pos, 1);
152+
else if (avail < 3 || utf8_lead(str[pos + 2]))
153+
MB_FAILURE_NO_STATUS(pos, 2);
154+
else if (avail < 4 || utf8_lead(str[pos + 3]))
155+
MB_FAILURE_NO_STATUS(pos, 3);
156+
else
157+
MB_FAILURE_NO_STATUS(pos, 4);
158+
}
159+
160+
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
161+
if (UNEXPECTED(this_char < 0x10000 || this_char > 0x10FFFF)) { /* non-shortest form or outside range */
162+
MB_FAILURE_NO_STATUS(pos, 4);
163+
}
164+
pos += 4;
165+
} else {
166+
MB_FAILURE_NO_STATUS(pos, 1);
167+
}
168+
169+
*cursor = pos;
170+
return this_char;
171+
}
172+
88173
/* {{{ get_next_char */
89174
static inline unsigned int get_next_char(
90175
enum entity_charset charset,
@@ -105,72 +190,17 @@ static inline unsigned int get_next_char(
105190
switch (charset) {
106191
case cs_utf_8:
107192
{
108-
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
109-
* "In a reported illegal byte sequence, do not include any
110-
* non-initial byte that encodes a valid character or is a leading
111-
* byte for a valid sequence." */
112193
unsigned char c;
113194
c = str[pos];
114195
if (c < 0x80) {
115196
this_char = c;
116197
pos++;
117-
} else if (c < 0xc2) {
118-
MB_FAILURE(pos, 1);
119-
} else if (c < 0xe0) {
120-
if (!CHECK_LEN(pos, 2))
121-
MB_FAILURE(pos, 1);
122-
123-
if (!utf8_trail(str[pos + 1])) {
124-
MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
125-
}
126-
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
127-
if (this_char < 0x80) { /* non-shortest form */
128-
MB_FAILURE(pos, 2);
129-
}
130-
pos += 2;
131-
} else if (c < 0xf0) {
132-
size_t avail = str_len - pos;
133-
134-
if (avail < 3 ||
135-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
136-
if (avail < 2 || utf8_lead(str[pos + 1]))
137-
MB_FAILURE(pos, 1);
138-
else if (avail < 3 || utf8_lead(str[pos + 2]))
139-
MB_FAILURE(pos, 2);
140-
else
141-
MB_FAILURE(pos, 3);
142-
}
143-
144-
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
145-
if (this_char < 0x800) { /* non-shortest form */
146-
MB_FAILURE(pos, 3);
147-
} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
148-
MB_FAILURE(pos, 3);
149-
}
150-
pos += 3;
151-
} else if (c < 0xf5) {
152-
size_t avail = str_len - pos;
153-
154-
if (avail < 4 ||
155-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
156-
!utf8_trail(str[pos + 3])) {
157-
if (avail < 2 || utf8_lead(str[pos + 1]))
158-
MB_FAILURE(pos, 1);
159-
else if (avail < 3 || utf8_lead(str[pos + 2]))
160-
MB_FAILURE(pos, 2);
161-
else if (avail < 4 || utf8_lead(str[pos + 3]))
162-
MB_FAILURE(pos, 3);
163-
else
164-
MB_FAILURE(pos, 4);
165-
}
166-
167-
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
168-
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
169-
MB_FAILURE(pos, 4);
170-
}
171-
pos += 4;
172198
} else {
173-
MB_FAILURE(pos, 1);
199+
this_char = php_next_utf8_char_mb(str, c, str_len, cursor);
200+
if (UNEXPECTED(this_char == 0)) {
201+
*status = FAILURE;
202+
}
203+
return this_char;
174204
}
175205
}
176206
break;

ext/standard/html.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol
4848
PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet);
4949
PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset);
5050
PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status);
51+
PHPAPI unsigned int php_next_utf8_char_mb(const unsigned char *str, unsigned char c, size_t str_len, size_t *cursor);
5152

5253
#endif /* HTML_H */

0 commit comments

Comments
 (0)