Skip to content

Commit 3f12d26

Browse files
committed
Merge branch 'PHP-8.1'
* PHP-8.1: Error handling for UTF-8 complies with WHATWG specification
2 parents 4bd7f4e + 04e59c9 commit 3f12d26

File tree

4 files changed

+93
-83
lines changed

4 files changed

+93
-83
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf8.c

Lines changed: 20 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
127127
CK((*filter->output_function)(s, filter->data));
128128
} else {
129129
CK(mbfl_filt_put_invalid_char(filter));
130-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
131-
goto retry;
132-
}
130+
goto retry;
133131
}
134132
break;
135133
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-0x9f,1-C,E-F:0x80-0xbf */
@@ -144,9 +142,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
144142
filter->status++;
145143
} else {
146144
CK(mbfl_filt_put_invalid_char(filter));
147-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
148-
goto retry;
149-
}
145+
goto retry;
150146
}
151147
break;
152148
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -161,9 +157,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
161157
filter->status++;
162158
} else {
163159
CK(mbfl_filt_put_invalid_char(filter));
164-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
165-
goto retry;
166-
}
160+
goto retry;
167161
}
168162
break;
169163
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -172,9 +166,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
172166
filter->status++;
173167
} else {
174168
CK(mbfl_filt_put_invalid_char(filter));
175-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
176-
goto retry;
177-
}
169+
goto retry;
178170
}
179171
break;
180172

@@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
237229
unsigned char c2 = *p++;
238230
if ((c2 & 0xC0) != 0x80) {
239231
*out++ = MBFL_BAD_INPUT;
240-
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
241-
p--;
242-
}
232+
p--;
243233
} else {
244234
*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
245235
}
@@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
252242
unsigned char c3 = *p++;
253243
if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) {
254244
*out++ = MBFL_BAD_INPUT;
255-
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
256-
p -= 2;
257-
} else {
258-
p--;
259-
}
245+
p -= 2;
260246
} else if ((c3 & 0xC0) != 0x80) {
261247
*out++ = MBFL_BAD_INPUT;
262-
if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
263-
p--;
264-
}
248+
p--;
265249
} else {
266250
uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
267-
if (decoded >= 0xD800 && decoded <= 0xDFFF) {
251+
if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
268252
*out++ = MBFL_BAD_INPUT;
269253
} else {
270-
*out++ = (decoded < 0x800) ? MBFL_BAD_INPUT : decoded;
254+
*out++ = decoded;
271255
}
272256
}
273257
} else {
274258
*out++ = MBFL_BAD_INPUT;
275-
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
276-
while (p < e) {
277-
c = *p;
278-
if ((c & 0xC0) != 0x80) {
279-
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
280-
p++;
281-
break;
282-
}
259+
while (p < e && (*p & 0xC0) == 0x80) {
283260
p++;
284261
}
285262
}
@@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
288265
unsigned char c2 = *p++;
289266
unsigned char c3 = *p++;
290267
unsigned char c4 = *p++;
291-
if ((c2 & 0xC0) != 0x80) {
268+
/* If c == 0xF0 and c2 < 0x90, then this is an overlong encoding; the codepoint
269+
* would have fit in just 3 bytes. If c == 0xF4 and c2 >= 0x90, then this codepoint is
270+
* greater than U+10FFFF, which is the highest legal codepoint */
271+
if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
292272
*out++ = MBFL_BAD_INPUT;
293-
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
294-
p -= 3;
295-
} else {
296-
p -= 2;
297-
}
298-
} else if ((c3 & 0xC0) != 0x80 || !((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
273+
p -= 3;
274+
} else if ((c3 & 0xC0) != 0x80) {
299275
*out++ = MBFL_BAD_INPUT;
300-
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
301-
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) {
302-
p -= 2;
303-
} else {
304-
p -= 3;
305-
}
306-
} else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
307-
p -= 2;
308-
} else {
309-
p--;
310-
}
276+
p -= 2;
311277
} else if ((c4 & 0xC0) != 0x80) {
312278
*out++ = MBFL_BAD_INPUT;
313-
if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4)) {
314-
p--;
315-
}
279+
p--;
316280
} else {
317281
uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
318282
*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded;
319283
}
320284
} else {
321285
*out++ = MBFL_BAD_INPUT;
322-
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
323286
if (p < e) {
324287
unsigned char c2 = *p;
325-
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
326-
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4))
327-
p++;
328-
} else {
329-
while (p < e) {
330-
c = *p;
331-
if ((c & 0xC0) != 0x80) {
332-
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
333-
p++;
334-
break;
335-
}
288+
if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || c == 0xF2 || c == 0xF3) {
289+
while (p < e && (*p & 0xC0) == 0x80) {
336290
p++;
337291
}
338292
}

ext/mbstring/tests/illformed_utf_sequences.phpt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,28 +21,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
2121
var_dump(chk_enc("\x41\x42\x43", 0));
2222
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
2323
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
24-
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
25-
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
26-
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
27-
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
24+
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
25+
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
26+
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
27+
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
2828
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
2929
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
3030
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
3131
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
3232

3333
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
34-
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
35-
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
34+
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
35+
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
3636
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
3737
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
3838

3939
var_dump(chk_enc("\xc1\xbf", 2));
4040
var_dump(chk_enc("\xc2\x80", 0));
4141
var_dump(chk_enc("\xdf\xbf", 0));
42-
var_dump(chk_enc("\xe0\x9f\xff", 2));
42+
var_dump(chk_enc("\xe0\x9f\xff", 3));
4343
var_dump(chk_enc("\xe0\xa0\x80", 2));
4444
var_dump(chk_enc("\xef\xbf\xbf", 0));
45-
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
45+
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
4646
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
4747
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
4848
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@@ -57,7 +57,7 @@ echo "UTF-8 and surrogates area\n";
5757
$out = '';
5858
$cnt = 0;
5959
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
60-
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
60+
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
6161
if ($s === false) {
6262
$cnt++;
6363
} else {
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
--TEST--
2+
Confirm error handling for UTF-8 complies with WHATWG spec
3+
--EXTENSIONS--
4+
mbstring
5+
--FILE--
6+
<?php
7+
/* The WHATWG Encoding Standard specifies not just how web browsers should handle _valid_
8+
* UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
9+
* as how many error markers each invalid byte sequence should decode
10+
* to).
11+
* That specification is followed by the JavaScript Encoding API.
12+
*
13+
* The API documentation for mb_convert_encoding does not specify how
14+
* many error markers we will emit for each possible invalid byte
15+
* sequence, so we might as well comply with the WHATWG specification.
16+
*
17+
* Thanks to Martin Auswöger for pointing this out... and another big
18+
* thanks for providing test cases!
19+
*
20+
* Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
21+
*/
22+
mb_substitute_character(0x25);
23+
24+
$testCases = [
25+
["\x80", "%"],
26+
["\xFF", "%"],
27+
["\xC2\x7F", "%\x7F"],
28+
["\xC2\x80", "\xC2\x80"],
29+
["\xDF\xBF", "\xDF\xBF"],
30+
["\xDF\xC0", "%%"],
31+
["\xE0\xA0\x7F", "%\x7F"],
32+
["\xE0\xA0\x80", "\xE0\xA0\x80"],
33+
["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
34+
["\xEF\xBF\xC0", "%%"],
35+
["\xF0\x90\x80\x7F", "%\x7F"],
36+
["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
37+
["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
38+
["\xF4\x8F\xBF\xC0", "%%"],
39+
["\xFA\x80\x80\x80\x80", "%%%%%"],
40+
["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
41+
["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
42+
["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
43+
];
44+
45+
foreach ($testCases as $testCase) {
46+
$result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
47+
if ($result !== $testCase[1]) {
48+
die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
49+
}
50+
}
51+
52+
echo "All done!\n";
53+
54+
?>
55+
--EXPECT--
56+
All done!

ext/mbstring/tests/utf_encodings.phpt

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
761761

762762
$invalid = array(
763763
// Codepoints outside of valid 0-0x10FFFF range for Unicode
764-
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
764+
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
765765
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
766766
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
767767

768768
// Reserved range for UTF-16 surrogate pairs
769-
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
770-
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
771-
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
769+
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
770+
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
771+
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
772772

773773
// Truncated characters
774774
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
@@ -788,8 +788,8 @@ $invalid = array(
788788

789789
// Multi-byte characters which end too soon and go to a junk byte
790790
// (Which isn't even valid to start a new character)
791-
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
792-
"\xF0\xBF\xFF" => "\x00\x00\x00%",
791+
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
792+
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
793793

794794
// Continuation bytes which appear outside of a MB char
795795
"\x80" => "\x00\x00\x00%",
@@ -799,8 +799,8 @@ $invalid = array(
799799
// Overlong code units
800800
// (Using more bytes than needed to encode a character)
801801
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
802-
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
803-
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
802+
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
803+
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
804804
);
805805

806806
testInvalidCodepoints($invalid, 'UTF-8');

0 commit comments

Comments
 (0)