Skip to content

Commit ebe6500

Browse files
committed
Fix error reporting bug for Unicode -> CP50220 conversion
To detect errors in conversion from Unicode to another text encoding, each mbstring conversion filter object maintains a count of 'bad' characters. After a conversion operation finishes, this count is checked to see if there was any error. The problem with CP50220 was that mbstring used a chain of two conversion filter objects. The 'bad character count' would be incremented on the second object in the chain, but this didn't do anything, as only the count on the first such object is ever checked. Fix this by implementing the conversion using a single conversion filter object, rather than a chain of two. This is possible because of the recent refactoring, which pulled out the needed logic for CP50220 conversion into a helper function.
1 parent 1f130d4 commit ebe6500

File tree

3 files changed

+46
-30
lines changed

3 files changed

+46
-30
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131
#include "unicode_table_jis.h"
3232
#include "cp932_table.h"
3333

34-
static void mbfl_filt_conv_wchar_cp50220_ctor(mbfl_convert_filter *filt);
35-
static void mbfl_filt_conv_wchar_cp50220_dtor(mbfl_convert_filter *filt);
3634
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
35+
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
36+
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
3737

3838
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
3939
* This was just CP50220, but the implementation was less strict regarding
@@ -92,10 +92,10 @@ const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
9292
const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = { /* vtable for the wchar => CP50220 conversion filter */
9393
mbfl_no_encoding_wchar,
9494
mbfl_no_encoding_cp50220,
95-
mbfl_filt_conv_wchar_cp50220_ctor,
96-
mbfl_filt_conv_wchar_cp50220_dtor,
97-
mbfl_filt_conv_wchar_cp50221,
98-
mbfl_filt_conv_any_jis_flush,
95+
mbfl_filt_conv_common_ctor, /* constructor slot: the shared constructor replaces the CP50220-specific one */
96+
NULL, /* destructor slot: nothing to free, since no second filter object is allocated any more */
97+
mbfl_filt_conv_wchar_cp50220, /* per-codepoint filter function (previously mbfl_filt_conv_wchar_cp50221) */
98+
mbfl_filt_conv_wchar_cp50220_flush, /* flush function (previously mbfl_filt_conv_any_jis_flush) */
9999
NULL,
100100
};
101101

@@ -318,35 +318,45 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
318318
return 0;
319319
}
320320

321-
/*
322-
* wchar => CP50220
323-
*/
324-
static void mbfl_filt_conv_wchar_cp50220_ctor(mbfl_convert_filter *filt)
321+
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
325322
{
326-
/* Insert a new convert filter into the chain, after this one, which will
327-
* actually perform the CP50220 conversion. Alter this filter so that it
328-
* converts halfwidth katakana instead */
329-
mbfl_convert_filter *cp50220_filt = emalloc(sizeof(mbfl_convert_filter));
330-
*cp50220_filt = *filt;
331-
332-
/* Reinitialize */
333-
mbfl_filt_conv_common_ctor(filt);
334-
filt->filter_function = vtbl_tl_jisx0201_jisx0208.filter_function;
335-
filt->filter_flush = (filter_flush_t)vtbl_tl_jisx0201_jisx0208.filter_flush;
336-
filt->output_function = (output_function_t)cp50220_filt->filter_function;
337-
filt->flush_function = (flush_function_t)cp50220_filt->filter_flush;
338-
filt->data = cp50220_filt;
339-
filt->opaque = (void*)(MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE);
323+
int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE, second = 0;
324+
bool consumed = false;
325+
326+
if (filter->cache) {
327+
int s = mbfl_convert_kana(filter->cache, c, &consumed, &second, mode);
328+
filter->cache = consumed ? 0 : c;
329+
mbfl_filt_conv_wchar_cp50221(s, filter);
330+
if (second) {
331+
mbfl_filt_conv_wchar_cp50221(second, filter);
332+
}
333+
} else if (c == 0) {
334+
/* This case has to be handled separately, since `filter->cache == 0` means
335+
* no codepoint is cached */
336+
(*filter->output_function)(0, filter->data);
337+
} else {
338+
filter->cache = c;
339+
}
340+
341+
return 0;
340342
}
341343

342-
static void mbfl_filt_conv_wchar_cp50220_dtor(mbfl_convert_filter *filt)
344+
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
343345
{
344-
efree(filt->data);
346+
int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE, second = 0;
347+
348+
if (filter->cache) {
349+
int s = mbfl_convert_kana(filter->cache, 0, NULL, &second, mode);
350+
mbfl_filt_conv_wchar_cp50221(s, filter);
351+
if (second) {
352+
mbfl_filt_conv_wchar_cp50221(s, filter);
353+
}
354+
filter->cache = 0;
355+
}
356+
357+
return mbfl_filt_conv_any_jis_flush(filter);
345358
}
346359

347-
/*
348-
* wchar => CP50221
349-
*/
350360
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
351361
{
352362
int s = 0;

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ extern const struct mbfl_convert_vtbl vtbl_cp50222_wchar;
4444
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50222;
4545

4646
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter);
47-
int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
4847
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
4948
int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter);
5049
int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter);

ext/mbstring/tests/cp5022x_encoding.phpt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,16 @@ foreach ($fullwidthKatakana as $cp => $kuten) {
285285

286286
echo "Folding of fullwidth katakana for CP50220 OK\n";
287287

288+
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50220');
289+
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50221');
290+
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');
291+
292+
echo "Invalid Unicode is flagged when converting to CP5022x\n";
293+
288294
?>
289295
--EXPECT--
290296
ASCII support OK
291297
JIS X 0201 support OK
292298
CP932 support OK
293299
Folding of fullwidth katakana for CP50220 OK
300+
Invalid Unicode is flagged when converting to CP5022x

0 commit comments

Comments
 (0)