Skip to content

Commit bfcb754

Browse files
committed
- Fixed get_next_char(), used by htmlentities()/htmlspecialchars(), so that it
no longer accepts certain ill-formed UTF-8 sequences.
1 parent 3943351 commit bfcb754

File tree

4 files changed

+90
-7
lines changed

4 files changed

+90
-7
lines changed

ext/standard/html.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
129129
MB_WRITE(c);
130130
this_char = c;
131131
pos++;
132-
} else if (c < 0xc0) {
132+
} else if (c < 0xc2) {
133133
MB_FAILURE(pos);
134134
} else if (c < 0xe0) {
135135
CHECK_LEN(pos, 2);
@@ -161,7 +161,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
161161
MB_WRITE((unsigned char)str[pos + 1]);
162162
MB_WRITE((unsigned char)str[pos + 2]);
163163
pos += 3;
164-
} else if (c < 0xf8) {
164+
} else if (c < 0xf5) {
165165
CHECK_LEN(pos, 4);
166166
if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
167167
MB_FAILURE(pos);
@@ -173,7 +173,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
173173
MB_FAILURE(pos);
174174
}
175175
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
176-
if (this_char < 0x10000) {
176+
if (this_char < 0x10000 || this_char > 0x10FFFF) {
177177
MB_FAILURE(pos);
178178
}
179179
MB_WRITE((unsigned char)c);

ext/standard/tests/strings/htmlentities-utf-2.phpt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ foreach($strings as $string) {
5050
%unicode|string%(16) "266561637574653b"
5151
%unicode|string%(2) "79"
5252
%unicode|string%(2) "79"
53-
%unicode|string%(8) "f7bfbfbf"
54-
%unicode|string%(8) "f7bfbfbf"
53+
%unicode|string%(0) ""
54+
%unicode|string%(0) ""
5555
%unicode|string%(0) ""
5656
%unicode|string%(0) ""
5757
%unicode|string%(0) ""
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
--TEST--
2+
Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences
3+
--FILE--
4+
<?php
5+
6+
/* Byte ranges of well-formed UTF-8 sequences, for checking conformance to Unicode 5.2, section 3.9, D92 (Table 3-7) */
7+
8+
$val_ranges = array(
9+
array(array(0x00, 0x7F)),
10+
array(array(0xC2, 0xDF), array(0x80, 0xBF)),
11+
array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)),
12+
array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)),
13+
array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)),
14+
array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)),
15+
array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
16+
array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
17+
array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)),
18+
);
19+
20+
function is_valid($seq) {
21+
global $val_ranges;
22+
$b = ord($seq[0]);
23+
foreach ($val_ranges as $l) {
24+
if ($b >= $l[0][0] && $b <= $l[0][1]) {
25+
if (count($l) != strlen($seq)) {
26+
return false;
27+
}
28+
for ($n = 1; $n < strlen($seq); $n++) {
29+
if (ord($seq[$n]) < $l[$n][0] || ord($seq[$n]) > $l[$n][1]) {
30+
return false;
31+
}
32+
}
33+
return true;
34+
}
35+
}
36+
return false;
37+
}
38+
39+
function concordance($s) {
40+
$vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0;
41+
$v = is_valid($s);
42+
return ($vhe === $v);
43+
}
44+
45+
for ($b1 = 0xC0; $b1 < 0xE0; $b1++) {
46+
for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
47+
$s = chr($b1).chr($b2);
48+
if (!concordance($s))
49+
echo "Discordance for ".bin2hex($s),"\n";
50+
}
51+
}
52+
53+
54+
for ($b1 = 0xE0; $b1 < 0xEF; $b1++) {
55+
for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
56+
$s = chr($b1).chr($b2)."\x80";
57+
if (!concordance($s))
58+
echo "Discordance for ".bin2hex($s),"\n";
59+
$s = chr($b1).chr($b2)."\xBF";
60+
if (!concordance($s))
61+
echo "Discordance for ".bin2hex($s),"\n";
62+
}
63+
}
64+
65+
for ($b1 = 0xF0; $b1 < 0xFF; $b1++) {
66+
for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
67+
$s = chr($b1).chr($b2)."\x80\x80";
68+
if (!concordance($s))
69+
echo "Discordance for ".bin2hex($s),"\n";
70+
$s = chr($b1).chr($b2)."\xBF\x80";
71+
if (!concordance($s))
72+
echo "Discordance for ".bin2hex($s),"\n";
73+
$s = chr($b1).chr($b2)."\x80\xBF";
74+
if (!concordance($s))
75+
echo "Discordance for ".bin2hex($s),"\n";
76+
$s = chr($b1).chr($b2)."\xBF\xBF";
77+
if (!concordance($s))
78+
echo "Discordance for ".bin2hex($s),"\n";
79+
}
80+
}
81+
echo "Done.\n";
82+
--EXPECT--
83+
Done.

ext/standard/tests/strings/htmlentities-utf.phpt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ foreach($strings as $string) {
5050
%unicode|string%(16) "266561637574653b"
5151
%unicode|string%(0) ""
5252
%unicode|string%(0) ""
53-
%unicode|string%(8) "f7bfbfbf"
54-
%unicode|string%(8) "f7bfbfbf"
53+
%unicode|string%(0) ""
54+
%unicode|string%(0) ""
5555
%unicode|string%(0) ""
5656
%unicode|string%(0) ""
5757
%unicode|string%(0) ""

0 commit comments

Comments
 (0)