Skip to content

Commit 6dd7547

Browse files
committed
Leading BOM is stripped for UTF-32
For consistency with UTF-16 and UCS-4. Also, do some code cleanup.
1 parent 1cf12c0 commit 6dd7547

File tree

2 files changed

+45
-124
lines changed

2 files changed

+45
-124
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf32.c

Lines changed: 36 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -129,106 +129,53 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
129129

130130
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
131131

132-
/*
133-
* UTF-32 => wchar
134-
*/
135-
int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
132+
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
136133
{
137-
int n, endian;
134+
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
135+
CK((*filter->output_function)(n, filter->data));
136+
} else {
137+
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
138+
CK((*filter->output_function)(n, filter->data));
139+
}
140+
return 0;
141+
}
138142

139-
endian = filter->status & 0xff00;
140-
switch (filter->status & 0xff) {
141-
case 0:
142-
if (endian) {
143-
n = c & 0xff;
144-
} else {
145-
n = (c & 0xffu) << 24;
146-
}
147-
filter->cache = n;
148-
filter->status++;
149-
break;
150-
case 1:
151-
if (endian) {
152-
n = (c & 0xff) << 8;
153-
} else {
154-
n = (c & 0xff) << 16;
155-
}
156-
filter->cache |= n;
157-
filter->status++;
158-
break;
159-
case 2:
160-
if (endian) {
161-
n = (c & 0xff) << 16;
162-
} else {
163-
n = (c & 0xff) << 8;
164-
}
165-
filter->cache |= n;
143+
int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
144+
{
145+
if (filter->status < 3) {
146+
filter->cache = (filter->cache << 8) | (c & 0xFF);
166147
filter->status++;
167-
break;
168-
default:
169-
if (endian) {
170-
n = (c & 0xffu) << 24;
171-
} else {
172-
n = c & 0xff;
173-
}
174-
n |= filter->cache;
175-
if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) {
176-
if (endian) {
177-
filter->status = 0; /* big-endian */
178-
} else {
179-
filter->status = 0x100; /* little-endian */
180-
}
181-
CK((*filter->output_function)(0xfeff, filter->data));
148+
} else {
149+
int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
150+
filter->cache = filter->status = 0;
151+
152+
if (n == 0xFFFE0000) {
153+
/* Found a little-endian byte order mark */
154+
filter->filter_function = mbfl_filt_conv_utf32le_wchar;
182155
} else {
183-
filter->status &= ~0xff;
184-
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
185-
CK((*filter->output_function)(n, filter->data));
186-
} else {
187-
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
188-
CK((*filter->output_function)(n, filter->data));
156+
filter->filter_function = mbfl_filt_conv_utf32be_wchar;
157+
if (n != 0xFEFF) {
158+
CK(emit_char_if_valid(n, filter));
189159
}
190160
}
191-
break;
192161
}
193162

194163
return c;
195164
}
196165

197-
/*
198-
* UTF-32BE => wchar
199-
*/
200166
int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
201167
{
202-
int n;
203-
204-
if (filter->status == 0) {
205-
filter->status = 1;
206-
n = (c & 0xffu) << 24;
207-
filter->cache = n;
208-
} else if (filter->status == 1) {
209-
filter->status = 2;
210-
n = (c & 0xff) << 16;
211-
filter->cache |= n;
212-
} else if (filter->status == 2) {
213-
filter->status = 3;
214-
n = (c & 0xff) << 8;
215-
filter->cache |= n;
168+
if (filter->status < 3) {
169+
filter->cache = (filter->cache << 8) | (c & 0xFF);
170+
filter->status++;
216171
} else {
217-
filter->status = 0;
218-
n = (c & 0xff) | filter->cache;
219-
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
220-
CK((*filter->output_function)(n, filter->data));
221-
} else {
222-
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
223-
CK((*filter->output_function)(n, filter->data));
224-
}
172+
int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
173+
filter->cache = filter->status = 0;
174+
CK(emit_char_if_valid(n, filter));
225175
}
226176
return c;
227177
}
228178

229-
/*
230-
* wchar => UTF-32BE
231-
*/
232179
int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
233180
{
234181
if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -243,41 +190,19 @@ int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
243190
return c;
244191
}
245192

246-
/*
247-
* UTF-32LE => wchar
248-
*/
249193
int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
250194
{
251-
int n;
252-
253-
if (filter->status == 0) {
254-
filter->status = 1;
255-
n = (c & 0xff);
256-
filter->cache = n;
257-
} else if (filter->status == 1) {
258-
filter->status = 2;
259-
n = (c & 0xff) << 8;
260-
filter->cache |= n;
261-
} else if (filter->status == 2) {
262-
filter->status = 3;
263-
n = (c & 0xff) << 16;
264-
filter->cache |= n;
195+
if (filter->status < 3) {
196+
filter->cache |= ((c & 0xFFU) << (8 * filter->status));
197+
filter->status++;
265198
} else {
266-
filter->status = 0;
267-
n = ((c & 0xffu) << 24) | filter->cache;
268-
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
269-
CK((*filter->output_function)(n, filter->data));
270-
} else {
271-
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
272-
CK((*filter->output_function)(n, filter->data));
273-
}
199+
int n = ((c & 0xFFU) << 24) | filter->cache;
200+
filter->cache = filter->status = 0;
201+
CK(emit_char_if_valid(n, filter));
274202
}
275203
return c;
276204
}
277205

278-
/*
279-
* wchar => UTF-32LE
280-
*/
281206
int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
282207
{
283208
if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -294,7 +219,7 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
294219

295220
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
296221
{
297-
if (filter->status & 0xF) {
222+
if (filter->status) {
298223
/* Input string was truncated */
299224
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
300225
}
@@ -303,6 +228,5 @@ static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
303228
(*filter->flush_function)(filter->data);
304229
}
305230

306-
filter->status = filter->cache = 0;
307231
return 0;
308232
}

ext/mbstring/tests/illformed_utf_sequences.phpt

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,9 @@ Unicode standard conformance test (ill-formed UTF sequences.)
44
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
55
--FILE--
66
<?php
7-
function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
7+
function chk_enc($str, $n, $enc = "UTF-8") {
88
$src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
99
$dst = str_repeat("0000fffd", $n);
10-
if ($with_bom) {
11-
$dst = "0000feff" . $dst;
12-
}
1310
if ($dst == $src) {
1411
return false;
1512
} else {
@@ -129,29 +126,29 @@ $out = '';
129126
$cnt = 0;
130127
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
131128
$s = chk_enc("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff),
132-
1, "UTF-32", true);
129+
1, "UTF-32");
133130
if ($s === false) {
134131
$cnt++;
135132
} else {
136133
$out .= $s;
137134
}
138135
}
139136
var_dump($cnt);
140-
var_dump(str_replace("0000feff","",$out));
137+
var_dump($out);
141138

142139
$out = '';
143140
$cnt = 0;
144141
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
145142
$s = chk_enc("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff),
146-
1, "UTF-32", true);
143+
1, "UTF-32");
147144
if ($s === false) {
148145
$cnt++;
149146
} else {
150147
$out .= $s;
151148
}
152149
}
153150
var_dump($cnt);
154-
var_dump(str_replace("0000feff","",$out));
151+
var_dump($out);
155152

156153
?>
157154
--EXPECT--
@@ -199,10 +196,10 @@ bool(false)
199196
string(8) "0010ffff"
200197
bool(false)
201198
string(8) "0010ffff"
202-
string(16) "0000feff0000fffd"
203-
string(16) "0000feff0010ffff"
204-
string(16) "0000feff0000fffd"
205-
string(16) "0000feff0010ffff"
199+
string(8) "0000fffd"
200+
string(8) "0010ffff"
201+
string(8) "0000fffd"
202+
string(8) "0010ffff"
206203
UTF-32 and surrogates area
207204
int(2048)
208205
string(16) "0000d7ff0000e000"

0 commit comments

Comments
 (0)