Skip to content

Commit d9ddeb6

Browse files
committed
UTF-16 text conversion handles truncated characters as illegal
This broke one old test (Zend/tests/multibyte_encoding_003.phpt), which uses a PHP script encoded as UTF-16. The problem is that, to terminate the test script, the test file needs the text "\n--EXPECT--". Out of that text, the terminating newline (a single 0x0A byte) becomes part of the resulting test script; but a bare 0x0A byte with no accompanying 0x00 byte is not valid UTF-16. Since we now treat truncated UTF-16 characters as erroneous, an extra '?' is appended to the output as an 'illegal character' marker. Really, if we are running a PHP script which is treated as encoded in UTF-16 (or in any other non-ASCII text encoding), and the script is not actually a valid string in that encoding, then inserting '?' characters into the code which the PHP interpreter runs is a bad thing to do. In such cases, the script shouldn't be treated as UTF-16 (or as whichever encoding was assumed) at all. I wonder if mbstring's encoding detection is being used in 'non-strict' mode?
1 parent 9bfb158 commit d9ddeb6

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed
Binary file not shown.

ext/mbstring/libmbfl/filters/mbfilter_utf16.c

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter);
3434
static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter);
3535
static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter);
36+
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
3637

3738
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
3839

@@ -93,7 +94,7 @@ const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
9394
mbfl_filt_conv_common_ctor,
9495
NULL,
9596
mbfl_filt_conv_utf16_wchar,
96-
mbfl_filt_conv_common_flush,
97+
mbfl_filt_conv_utf16_wchar_flush,
9798
NULL,
9899
};
99100

@@ -113,7 +114,7 @@ const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
113114
mbfl_filt_conv_common_ctor,
114115
NULL,
115116
mbfl_filt_conv_utf16be_wchar,
116-
mbfl_filt_conv_common_flush,
117+
mbfl_filt_conv_utf16_wchar_flush,
117118
NULL,
118119
};
119120

@@ -133,7 +134,7 @@ const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
133134
mbfl_filt_conv_common_ctor,
134135
NULL,
135136
mbfl_filt_conv_utf16le_wchar,
136-
mbfl_filt_conv_common_flush,
137+
mbfl_filt_conv_utf16_wchar_flush,
137138
NULL,
138139
};
139140

@@ -343,6 +344,24 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
343344
return c;
344345
}
345346

347+
/*
 * Flush handler installed in the UTF-16, UTF-16BE and UTF-16LE -> wchar
 * conversion vtables in place of mbfl_filt_conv_common_flush.
 *
 * It snapshots and clears the filter's pending state. If the low four bits
 * of the saved status are non-zero, the input is considered to have ended
 * in the middle of a character, and the cached partial value is passed
 * downstream tagged with MBFL_WCSGROUP_THROUGH. Afterwards the downstream
 * flush_function, if any, is invoked.
 *
 * Returns 0 when the end of the function is reached.
 * NOTE(review): CK is defined outside this view; presumably it returns early
 * when output_function reports failure, in which case flush_function would
 * not be called -- confirm against the macro definition.
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
348+
{
349+
/* Take copies first: the filter's own state is reset before any output is produced */
int status = filter->status;
350+
int cache = filter->cache;
351+
filter->status = filter->cache = 0;
352+
353+
/* Only the low four bits of status are tested; any higher bits are ignored here */
if (status & 0xF) {
354+
/* Input string was truncated: forward the leftover cached value, marked with
 * MBFL_WCSGROUP_THROUGH (presumably so later stages treat it as an illegal
 * character -- verify against the consumers of this flag) */
355+
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
356+
}
357+
358+
/* Propagate the flush to the next stage, if one is attached; its return value is not checked */
if (filter->flush_function) {
359+
(*filter->flush_function)(filter->data);
360+
}
361+
362+
return 0;
363+
}
364+
346365
static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter)
347366
{
348367
if (filter->status == 0) {

0 commit comments

Comments
 (0)