Skip to content

Commit 5c80565

Browse files
committed
Enhance handling of CP51932 encoding
- Don't pass 'control' characters through in the middle of a multi-byte char
- Treat truncated multi-byte characters as an error
1 parent beef597 commit 5c80565

File tree

3 files changed

+7741
-20
lines changed

3 files changed

+7741
-20
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp51932.c

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
#include "unicode_table_jis.h"
3535
#include "cp932_table.h"
3636

37+
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter);
38+
3739
static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
3840
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3941
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -72,7 +74,7 @@ const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
7274
mbfl_filt_conv_common_ctor,
7375
NULL,
7476
mbfl_filt_conv_cp51932_wchar,
75-
mbfl_filt_conv_common_flush,
77+
mbfl_filt_conv_cp51932_wchar_flush,
7678
NULL,
7779
};
7880

@@ -105,17 +107,15 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
105107

106108
switch (filter->status) {
107109
case 0:
108-
if (c >= 0 && c < 0x80) { /* latin */
110+
if (c >= 0 && c < 0x80) { /* latin */
109111
CK((*filter->output_function)(c, filter->data));
110-
} else if (c > 0xa0 && c < 0xff) { /* CP932 first char */
112+
} else if (c >= 0xA1 && c <= 0xFE) { /* CP932, first byte */
111113
filter->status = 1;
112114
filter->cache = c;
113-
} else if (c == 0x8e) { /* kana first char */
115+
} else if (c == 0x8e) { /* kana first char */
114116
filter->status = 2;
115117
} else {
116-
w = c & MBFL_WCSGROUP_MASK;
117-
w |= MBFL_WCSGROUP_THROUGH;
118-
CK((*filter->output_function)(w, filter->data));
118+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
119119
}
120120
break;
121121

@@ -152,17 +152,11 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
152152
}
153153
}
154154
if (w <= 0) {
155-
w = ((c1 & 0x7f) << 8) | (c & 0x7f);
156-
w &= MBFL_WCSPLANE_MASK;
157-
w |= MBFL_WCSPLANE_WINCP932;
155+
w = ((c1 & 0x7f) << 8) | (c & 0x7f) | MBFL_WCSPLANE_WINCP932;
158156
}
159157
CK((*filter->output_function)(w, filter->data));
160-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
161-
CK((*filter->output_function)(c, filter->data));
162158
} else {
163-
w = (c1 << 8) | c;
164-
w &= MBFL_WCSGROUP_MASK;
165-
w |= MBFL_WCSGROUP_THROUGH;
159+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
166160
CK((*filter->output_function)(w, filter->data));
167161
}
168162
break;
@@ -172,12 +166,8 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
172166
if (c > 0xa0 && c < 0xe0) {
173167
w = 0xfec0 + c;
174168
CK((*filter->output_function)(w, filter->data));
175-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
176-
CK((*filter->output_function)(c, filter->data));
177169
} else {
178-
w = 0x8e00 | c;
179-
w &= MBFL_WCSGROUP_MASK;
180-
w |= MBFL_WCSGROUP_THROUGH;
170+
w = 0x8e00 | c | MBFL_WCSGROUP_THROUGH;
181171
CK((*filter->output_function)(w, filter->data));
182172
}
183173
break;
@@ -190,6 +180,20 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
190180
return c;
191181
}
192182

183+
/*
 * Flush handler for the CP51932 => wchar conversion filter.
 *
 * A non-zero filter->status at flush time means the input ended after the
 * lead byte of a multi-byte character, before its trailing byte arrived.
 * The dangling lead byte is handed to the output function tagged with
 * MBFL_WCSGROUP_THROUGH so that it is reported as invalid input, and the
 * decoder state is then cleared so the leftover byte cannot be paired with
 * any input fed to the filter afterwards.
 *
 * The downstream flush function, if one is set, is always invoked.
 * Always returns 0.
 */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* Input string was truncated.
		 *
		 * In the conversion code visible alongside this function, only
		 * status 1 stores its lead byte in filter->cache; status 2 is entered
		 * on the kana lead byte 0x8e without touching the cache. Reconstruct
		 * the lead byte for that state instead of reporting a stale value. */
		int lead = (filter->status == 2) ? 0x8e : filter->cache;

		(*filter->output_function)(lead | MBFL_WCSGROUP_THROUGH, filter->data);

		/* Do not leave a half-consumed character pending after the flush */
		filter->status = 0;
		filter->cache = 0;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
196+
193197
/*
194198
* wchar => cp51932
195199
*/
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
--TEST--
Exhaustive test of CP51932 encoding verification and conversion
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
srand(2020); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in CP51932 */
$validChars = array(); /* CP51932 string -> UTF-16BE string */
$fromUnicode = array(); /* UTF-16BE string -> CP51932 string */

/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the test data */
$fp = fopen(realpath(__DIR__ . '/data/CP51932.txt'), 'r');
/* Compare against false explicitly so that a line which happens to be a
 * falsy string (such as "0") cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] == '#')
    continue;

  $byte2 = null;
  if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
    /* The table we are using tries to map as many Unicode codepoints into
     * CP51932 as possible, including by mapping latin characters with accents
     * to the equivalent without accents; but since CP51932 is based on the
     * CP932 character set, we don't need to handle codepoints which are not
     * mapped from any character in CP932 */
    if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
        ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
        ($codepoint >= 0xF8 && $codepoint <= 0xFF))
      continue;
    $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
    $utf16 = pack('n', $codepoint);
    $validChars[$cp51932] = $utf16;
    $fromUnicode[$utf16] = $cp51932;
  }
}
fclose($fp);

/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
 * But when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */
$fromUnicode["\x30\x1C"] = "\xA1\xC1";
/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),
 * but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */
$fromUnicode["\x22\x12"] = "\xA1\xDD";
/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
 * but when converting Unicode to CP51932, we also accept U+2016
 * (DOUBLE VERTICAL LINE) */
$fromUnicode["\x20\x16"] = "\xA1\xC2";

/* There are a number of duplicate, irreversible mappings in the CP51932 table
 * In most cases, the one which we primarily use appears last in the table,
 * but in some cases, it is first and will be overwritten in the above loop
 *
 * Interestingly, the "collisions" happen in both directions! Part of this is
 * because the table we are using attempts to map as many Unicode codepoints
 * as possible to CP932 characters */
$fromUnicode["\x22\x20"] = "\xA2\xDC";
$fromUnicode["\x22\x29"] = "\xA2\xC1";
$fromUnicode["\x22\x2B"] = "\xA2\xE9";
$fromUnicode["\x22\x35"] = "\xA2\xE8";
$fromUnicode["\x22\x1A"] = "\xA2\xE5";
$fromUnicode["\x22\x2A"] = "\xA2\xC0";
$fromUnicode["\x22\x61"] = "\xA2\xE1";
$fromUnicode["\x22\xA5"] = "\xA2\xDD";
$fromUnicode["\x22\x52"] = "\xA2\xE2";
$fromUnicode["\xFF\xE2"] = "\xA2\xCC";
unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !
unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character
unset($fromUnicode["\x00\xA9"]); // Don't map © to c
unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator
unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"
unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen
unset($fromUnicode["\x00\xAE"]); // Don't map ® to R
unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron
unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2
unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3
unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu
unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot
unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma
unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1
unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"
unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"
unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu

/* Every 7-bit byte is a valid single-byte character */
for ($i = 0; $i <= 0x7F; $i++)
  $validChars[chr($i)] = "\x00" . chr($i);

testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
echo "CP51932 verification and conversion works on all valid characters\n";

findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + array(0xFD => 2, 0xFE => 2));

testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
echo "CP51932 verification and conversion works on all invalid characters\n";

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints

0 commit comments

Comments
 (0)