Skip to content

Commit d8895cd

Browse files
committed
Improve error handling for UTF-16{,BE,LE}
Catch various errors such as the first part of a surrogate pair not being followed by a proper second part, the first part of a surrogate pair appearing at the end of a string, the second part of a surrogate pair appearing out of place, and so on.
1 parent: d9ddeb6 · commit: d8895cd

File tree

2 files changed

+77
-94
lines changed

2 files changed

+77
-94
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf16.c

Lines changed: 76 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
150150

151151
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
152152

153-
/*
154-
* UTF-16 => wchar
155-
*/
156153
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
157154
{
158-
int n, endian;
159-
160-
endian = filter->status & 0xff00;
161-
switch (filter->status & 0x0f) {
162-
case 0:
163-
if (endian) {
164-
n = c & 0xff;
165-
} else {
166-
n = (c & 0xff) << 8;
167-
}
168-
filter->cache |= n;
169-
filter->status++;
170-
break;
171-
default:
172-
if (endian) {
173-
n = (c & 0xff) << 8;
155+
/* Start with the assumption that the string is big-endian;
156+
* If we find a little-endian BOM, then we will change that assumption */
157+
if (filter->status == 0) {
158+
filter->cache = c & 0xFF;
159+
filter->status = 1;
160+
} else {
161+
int n = (filter->cache << 8) | (c & 0xFF);
162+
if (n == 0xFFFE) {
163+
/* Switch to little-endian mode */
164+
filter->filter_function = mbfl_filt_conv_utf16le_wchar;
165+
filter->cache = filter->status = 0;
174166
} else {
175-
n = c & 0xff;
176-
}
177-
n |= filter->cache & 0xffff;
178-
filter->status &= ~0x0f;
179-
if (n >= 0xd800 && n < 0xdc00) {
180-
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
181-
} else if (n >= 0xdc00 && n < 0xe000) {
182-
n &= 0x3ff;
183-
n |= (filter->cache & 0xfff0000) >> 6;
184-
filter->cache = 0;
185-
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
186-
CK((*filter->output_function)(n, filter->data));
187-
} else { /* illegal character */
188-
n &= MBFL_WCSGROUP_MASK;
189-
n |= MBFL_WCSGROUP_THROUGH;
167+
filter->filter_function = mbfl_filt_conv_utf16be_wchar;
168+
if (n >= 0xD800 && n <= 0xDBFF) {
169+
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
170+
filter->status = 2;
171+
return c;
172+
} else if (n >= 0xDC00 && n <= 0xDFFF) {
173+
/* This is wrong; second part of surrogate pair has come first */
174+
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
175+
} else if (n != 0xFEFF) {
190176
CK((*filter->output_function)(n, filter->data));
191177
}
192-
} else {
193-
int is_first = filter->status & 0x10;
194-
filter->cache = 0;
195-
filter->status |= 0x10;
196-
if (!is_first) {
197-
if (n == 0xfffe) {
198-
if (endian) {
199-
filter->status &= ~0x100; /* big-endian */
200-
} else {
201-
filter->status |= 0x100; /* little-endian */
202-
}
203-
break;
204-
} else if (n == 0xfeff) {
205-
break;
206-
}
207-
}
208-
CK((*filter->output_function)(n, filter->data));
178+
filter->cache = filter->status = 0;
209179
}
210-
break;
211180
}
212181

213182
return c;
214183
}
215184

216-
/*
217-
* UTF-16BE => wchar
218-
*/
219185
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
220186
{
221187
int n;
222188

223189
switch (filter->status) {
224-
case 0:
190+
case 0: /* First byte */
191+
filter->cache = c & 0xFF;
225192
filter->status = 1;
226-
n = (c & 0xff) << 8;
227-
filter->cache |= n;
228193
break;
229-
default:
230-
filter->status = 0;
231-
n = (filter->cache & 0xff00) | (c & 0xff);
232-
if (n >= 0xd800 && n < 0xdc00) {
233-
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
234-
} else if (n >= 0xdc00 && n < 0xe000) {
235-
n &= 0x3ff;
236-
n |= (filter->cache & 0xfff0000) >> 6;
237-
filter->cache = 0;
238-
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
239-
CK((*filter->output_function)(n, filter->data));
240-
} else { /* illegal character */
241-
n &= MBFL_WCSGROUP_MASK;
242-
n |= MBFL_WCSGROUP_THROUGH;
243-
CK((*filter->output_function)(n, filter->data));
244-
}
194+
195+
case 1: /* Second byte */
196+
n = (filter->cache << 8) | (c & 0xFF);
197+
if (n >= 0xD800 && n <= 0xDBFF) {
198+
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
199+
filter->status = 2;
200+
} else if (n >= 0xDC00 && n <= 0xDFFF) {
201+
/* This is wrong; second part of surrogate pair has come first */
202+
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
203+
filter->status = 0;
245204
} else {
246-
filter->cache = 0;
247205
CK((*filter->output_function)(n, filter->data));
206+
filter->status = 0;
248207
}
249208
break;
209+
210+
case 2: /* Second part of surrogate, first byte */
211+
filter->cache = (filter->cache << 8) | (c & 0xFF);
212+
filter->status = 3;
213+
break;
214+
215+
case 3: /* Second part of surrogate, second byte */
216+
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
217+
if (n >= 0xD800 && n <= 0xDBFF) {
218+
/* Wrong; that's the first half of a surrogate pair, not the second */
219+
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
220+
filter->cache = n & 0x3FF;
221+
filter->status = 2;
222+
} else if (n >= 0xDC00 && n <= 0xDFFF) {
223+
n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
224+
CK((*filter->output_function)(n, filter->data));
225+
filter->status = 0;
226+
} else {
227+
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
228+
CK((*filter->output_function)(n, filter->data));
229+
filter->status = 0;
230+
}
250231
}
251232

252233
return c;
253234
}
254235

255-
/*
256-
* wchar => UTF-16BE
257-
*/
258236
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
259237
{
260238
int n;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
276254
return c;
277255
}
278256

279-
/*
280-
* UTF-16LE => wchar
281-
*/
282257
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
283258
{
259+
int n;
260+
284261
switch (filter->status) {
285262
case 0:
286263
filter->cache = c & 0xff;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
296273
/* This is wrong; the second part of the surrogate pair has come first
297274
* Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
298275
* the error */
299-
int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
300-
filter->status = 0;
276+
n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
301277
CK((*filter->output_function)(n, filter->data));
302-
} else {
303278
filter->status = 0;
279+
} else {
304280
CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
281+
filter->status = 0;
305282
}
306283
break;
307284

@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
311288
break;
312289

313290
case 3:
314-
filter->status = 0;
315-
int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
316-
CK((*filter->output_function)(n, filter->data));
291+
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
292+
if (n >= 0xD800 && n <= 0xDBFF) {
293+
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
294+
filter->cache = n & 0x3FF;
295+
filter->status = 2;
296+
} else if (n >= 0xDC00 && n <= 0xDFFF) {
297+
n = filter->cache + ((c & 0x3) << 8) + 0x10000;
298+
CK((*filter->output_function)(n, filter->data));
299+
filter->status = 0;
300+
} else {
301+
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
302+
CK((*filter->output_function)(n, filter->data));
303+
filter->status = 0;
304+
}
317305
break;
318306
}
319307

320308
return c;
321309
}
322310

323-
/*
324-
* wchar => UTF-16LE
325-
*/
326311
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
327312
{
328313
int n;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
350335
int cache = filter->cache;
351336
filter->status = filter->cache = 0;
352337

353-
if (status & 0xF) {
338+
if (status) {
354339
/* Input string was truncated */
355340
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
356341
}

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
200200
size_t n;
201201
unsigned char *p;
202202
mbfl_convert_filter *filter;
203-
int (*filter_function)(int c, mbfl_convert_filter *filter);
204203

205204
ZEND_ASSERT(convd);
206205
ZEND_ASSERT(string);
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
212211

213212
filter = convd->filter1;
214213
if (filter != NULL) {
215-
filter_function = filter->filter_function;
216214
while (n > 0) {
217-
if ((*filter_function)(*p++, filter) < 0) {
215+
if ((*filter->filter_function)(*p++, filter) < 0) {
218216
return p - string->val;
219217
}
220218
n--;

0 commit comments

Comments
 (0)