Skip to content

Commit 2a93a8b

Browse files
committed
Add test suite for CP5022{0,1,2}
1 parent cebdad4 commit 2a93a8b

File tree

1 file changed

+286
-0
lines changed

1 file changed

+286
-0
lines changed
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
--TEST--
2+
Exhaustive test of CP50220, CP50221, and CP50222 encodings
3+
--SKIPIF--
4+
<?php
/* Skip when mbstring is missing, or when slow tests were disabled through
 * the SKIP_SLOW_TESTS environment variable (this test is exhaustive) */
if (!extension_loaded('mbstring')) {
    die('skip mbstring not available');
}
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
8+
--FILE--
9+
<?php
10+
include('encoding_tests.inc');
11+
/* Set the mbstring substitute character to '%'; the invalid-input checks
 * further down pass "\x00%" (a UTF-16BE '%') as their expected output */
mb_substitute_character(0x25); // '%'
12+
13+
/**
 * Convert a two-byte CP932 (Shift-JIS) code into a kuten code.
 *
 * The first Shift-JIS byte only carries the upper 7 bits of the ku (row)
 * number, because 94 distinct first bytes were not available. The second byte
 * falls into one of two ranges of 94 values; which range it is in supplies
 * the low bit of the ku number, and its position in the range gives the ten.
 *
 * @param int $bytes Two-byte Shift-JIS code, first byte in bits 8-15
 * @return int (ku << 8) + ten, where both ku and ten are offset by 0x21
 */
function shiftJISDecode($bytes) {
    $lead  = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes above 0x9F are rebased so that 0xE0 continues at 31;
     * the others are counted from 0x81 */
    if ($lead > 0x9F) {
        $rowPair = $lead - (0xE0 - 31);
    } else {
        $rowPair = $lead - 0x81;
    }

    if ($trail > 0x9E) {
        /* Upper range of second bytes: the even ku of the pair */
        $ku  = ($rowPair << 1) + 0x22;
        $ten = $trail - 0x9F + 0x21;
    } elseif ($trail > 0x7F) {
        /* Lower range, past the gap at 0x7F */
        $ku  = ($rowPair << 1) + 0x21;
        $ten = $trail - 0x80 + 63 + 0x21;
    } else {
        /* Lower range, before the gap at 0x7F */
        $ku  = ($rowPair << 1) + 0x21;
        $ten = $trail - 0x40 + 0x21;
    }

    return ($ku << 8) + $ten;
}
32+
33+
/* Read in table of all characters in CP932 charset.
 * Keys are the 2-byte kuten code produced by shiftJISDecode() (packed
 * big-endian); values are the matching codepoint packed as UTF-16BE. */
$cp932Chars = array(); /* CP932 -> UTF-16BE */
/* The table is only ever read, so open it read-only; 'r+' also demands write
 * permission and fails when the source tree is not writable */
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
if ($fp === false)
    die("Could not open " . __DIR__ . "/data/CP932.txt\n");
/* Compare against false explicitly so that only EOF ends the loop, not a
 * line which merely happens to be falsy */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
        continue;

    /* Lines which do not hold two hex numbers separated by a tab are ignored */
    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
        if ($bytes < 256) // Single-byte entries are not part of this table
            continue;
        if ($bytes > 0xFA00) // We don't handle these extra characters from ku 114 and above
            continue;
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
    }
}
fclose($fp);
48+
49+
/* Besides the characters listed in that table, a 'user' area is supported.
 * Its Shift-JIS codes (first byte 0xF0-0xF9, second byte 0x40-0xFC except
 * 0x7F) map, in order, onto the Unicode 'private' codepoints 0xE000-E757 */
$codepoint = 0xE000;
foreach (range(0xF0, 0xF9) as $i) {
    foreach (range(0x40, 0xFC) as $j) {
        if ($j != 0x7F) {
            $sjis = ($i << 8) + $j;
            $cp932Chars[pack('n', shiftJISDecode($sjis))] = pack('n', $codepoint);
            $codepoint++;
        }
    }
}
60+
61+
/* 396 Unicode codepoints are non-invertible in CP932: more than one CP932
 * byte sequence maps to the same codepoint. Those entries are moved out of
 * $cp932Chars into $nonInvertible so that they can be tested separately. */
$nonInvertible = array();

/* Shift-JIS codes 0xED00-0xEEFF: move whichever of them are in the table */
foreach (range(0xED00, 0xEEFF) as $i) {
    $bytes = pack('n', shiftJISDecode($i));
    if (!isset($cp932Chars[$bytes]))
        continue;
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
    unset($cp932Chars[$bytes]);
}

/* A handful of individual codes, moved without checking for presence */
$extraDuplicates = [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C];
foreach ($extraDuplicates as $i) {
    $bytes = pack('n', shiftJISDecode($i));
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
    unset($cp932Chars[$bytes]);
}
76+
77+
/* Read in table of all characters in JISX-0201 charset.
 * Keys are the single JIS X 0201 byte; values are the matching codepoint
 * packed as UTF-16BE. */
$jisx0201Chars = array(); /* JISX0201 -> UTF-16BE */
/* The table is only ever read, so open it read-only; 'r+' also demands write
 * permission and fails when the source tree is not writable */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
if ($fp === false)
    die("Could not open " . __DIR__ . "/data/JISX0201.txt\n");
/* Compare against false explicitly so that only EOF ends the loop, not a
 * line which merely happens to be falsy */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
        continue;

    /* Lines which do not hold two hex numbers separated by a tab are ignored */
    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
        $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
}
fclose($fp);
87+
88+
/* Where our conversions between CP5022x (with the CP932 charset selected)
 * and Unicode deliberately differ from the table provided by the Unicode
 * Consortium, overwrite the table entry with the value we produce */
$cp932Overrides = [
    "\x21\x41" => "\x30\x1C", /* WAVE DASH instead of FULLWIDTH TILDE */
    "\x21\x42" => "\x20\x16", /* DOUBLE VERTICAL LINE instead of PARALLEL TO */
    "\x21\x5D" => "\x22\x12", /* MINUS SIGN instead of FULLWIDTH HYPHEN-MINUS */
    "\x21\x71" => "\x00\xA2", /* CENT SIGN instead of FULLWIDTH CENT SIGN */
    "\x21\x72" => "\x00\xA3", /* POUND SIGN instead of FULLWIDTH POUND SIGN */
    "\x22\x4C" => "\x00\xAC", /* NOT SIGN instead of FULLWIDTH NOT SIGN */
];
foreach ($cp932Overrides as $kutenBytes => $replacement) {
    $cp932Chars[$kutenBytes] = $replacement;
}
97+
/**
 * Check a valid string in one of the CP5022x encodings against its UTF-16BE
 * equivalent, and optionally check the reverse conversion as well.
 *
 * The checks themselves are done by identifyValidString() and
 * convertValidString(), which are not defined in this file (presumably they
 * come from encoding_tests.inc -- confirm there for their exact behaviour).
 *
 * For the reverse direction $from is first normalised into the byte sequence
 * the encoder is expected to emit: redundant leading switches to ASCII are
 * removed and the trailing switch back to ASCII is appended.
 *
 * @param string $from     Bytes in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 * @param bool   $bothWays Also check the conversion from $to back to $encoding
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-16BE', false);

    if (!$bothWays)
        return;

    /* substr() is used rather than $from[0] so that an empty string (or one
     * which becomes empty after stripping) does not raise an
     * "Uninitialized string offset" warning */

    /* An 0xF at the beginning is redundant; it switches to ASCII mode, but
     * ASCII mode is default */
    if (substr($from, 0, 1) === "\x0F")
        $from = substr($from, 1);
    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (substr($from, 0, 3) === "\x1B(B")
        $from = substr($from, 3);
    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false || strpos($from, "\x1B(I") !== false)
        $from .= "\x1B(B";
    /* For CP50222, a string which starts with 0xE is expected to end with 0xF */
    if ($encoding == 'CP50222' && substr($from, 0, 1) === "\x0E")
        $from .= "\x0F";

    convertValidString($to, $from, 'UTF-16BE', $encoding, false);
}
119+
120+
/**
 * Check an invalid string in one of the CP5022x encodings; thin wrapper which
 * forwards to testInvalidString() with UTF-16BE as the other encoding.
 *
 * @param string $from     Bytes in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 */
function testInvalid($from, $to, $encoding) {
    $targetEncoding = 'UTF-16BE';
    testInvalidString($from, $to, $encoding, $targetEncoding);
}
123+
124+
/* Every 7-bit byte is tested as plain ASCII, both bare and after an explicit
 * ESC ( B. The bytes 0xE, 0xF and 0x1B (ESC) are left out; elsewhere in this
 * test they serve as shift codes and as the start of escape sequences. */
for ($i = 0; $i < 0x80; $i++) {
    if (in_array($i, [0xE, 0xF, 0x1B], true))
        continue;

    $ascii = chr($i);
    $utf16 = "\x00" . $ascii;

    foreach (['', "\x1B(B"] as $prefix) {
        foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
            testValid($prefix . $ascii, $utf16, $encoding);
        }
    }

    /* 0xF is the ASCII 'Shift In' control code; it selects ASCII mode, which
     * is already the default, so only the decoding direction is checked */
    testValid("\x0F" . $ascii, $utf16, 'CP50222', false);
}
135+
136+
/* Bytes with the high bit set are invalid in ASCII mode, whether that mode is
 * the default or was selected with ESC ( B or 0xF. The range 0xA1-0xDF is
 * left out: we convert single bytes from 0xA1-0xDF as JIS X 0201 kana. */
foreach (array_merge(range(0x80, 0xA0), range(0xE0, 0xFF)) as $i) {
    $badByte = chr($i);
    foreach (['', "\x1B(B", "\x0F"] as $prefix) {
        foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
            testInvalid($prefix . $badByte, "\x00%", $encoding);
        }
    }
}

echo "ASCII support OK\n";
151+
152+
/* All valid JIS X 0201 characters.
 * Those with a 1 in the high bit are JIS X 0201 kana; the rest are Latin. */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
    $code = ord($jisx0201);

    if ($code < 128) { /* Latin */
        /* The reverse conversion is only checked for characters whose
         * UTF-16BE form sorts above "\x00\x80" */
        $bothWays = $utf16BE > "\x00\x80";
        foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
            testValid("\x1B(J" . $jisx0201, $utf16BE, $encoding, $bothWays);
        }
        continue;
    }

    /* Kana: the same character with the high bit cleared, for use after an
     * escape sequence or shift code which selects kana */
    $kana = chr($code - 128);
    testValid("\x1B(I" . $kana, $utf16BE, 'CP50221');
    /* 0xE is the ASCII 'Shift Out' control code; here it selects kana */
    testValid("\x1B(J\x0E" . $kana, $utf16BE, 'CP50222', false);
    testValid("\x0E" . $kana, $utf16BE, 'CP50222', false);

    /* The raw 8-bit byte is accepted as kana by all three encodings */
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
        testValid($jisx0201, $utf16BE, $encoding, false);
    }
}
169+
170+
/* After ESC ( I or ESC ( J, bytes with the high bit set are invalid, apart
 * from the range 0xA1-0xDF which is not tested here */
foreach (array_merge(range(0x80, 0xA0), range(0xE0, 0xFF)) as $i) {
    $badByte = chr($i);
    foreach (["\x1B(I", "\x1B(J"] as $escape) {
        foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
            testInvalid($escape . $badByte, "\x00%", $encoding);
        }
    }
}

echo "JIS X 0201 support OK\n";
182+
183+
/* All valid CP932 characters, each introduced by ESC $ B.
 * Entries left in $cp932Chars are checked in both directions. */
foreach ($cp932Chars as $cp932 => $utf16BE) {
    $escaped = "\x1B\$B" . $cp932;
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
        testValid($escaped, $utf16BE, $encoding);
    }
}
/* Non-invertible entries are only checked from CP5022x to Unicode */
foreach ($nonInvertible as $cp932 => $utf16BE) {
    $escaped = "\x1B\$B" . $cp932;
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
        testValid($escaped, $utf16BE, $encoding, false);
    }
}
194+
195+
/* All invalid 2-byte CP932 characters: every pair with a first byte in
 * 0x21-0x7E and any second byte which is in neither of the two tables */
for ($i = 0x21; $i <= 0x7E; $i++) {
    for ($j = 0; $j < 256; $j++) {
        $testString = chr($i) . chr($j);
        if (isset($cp932Chars[$testString]) || isset($nonInvertible[$testString]))
            continue;

        foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
            testInvalid("\x1B\$B" . $testString, "\x00%", $encoding);
        }
    }
}

echo "CP932 support OK\n";
208+
209+
/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana.
 * Despite the variable name, the keys are the halfwidth forms (U+FF61-U+FF9F);
 * the values are the kuten codes they are expected to be folded to. */
$fullwidthKatakana = [
    0xFF61 => 0x2123, // Ideographic full stop
    0xFF62 => 0x2156, // Left corner bracket
    0xFF63 => 0x2157, // Right corner bracket
    0xFF64 => 0x2122, // Ideographic comma
    0xFF65 => 0x2126, // Katakana middle dot
    0xFF66 => 0x2572, // Wo
    0xFF67 => 0x2521, // Small A
    0xFF68 => 0x2523, // Small I
    0xFF69 => 0x2525, // Small U
    0xFF6A => 0x2527, // Small E
    0xFF6B => 0x2529, // Small O
    0xFF6C => 0x2563, // Small Ya
    0xFF6D => 0x2565, // Small Yu
    0xFF6E => 0x2567, // Small Yo
    0xFF6F => 0x2543, // Small Tsu
    0xFF70 => 0x213C, // Prolonged Sound Marker
    0xFF71 => 0x2522, // A
    0xFF72 => 0x2524, // I
    0xFF73 => 0x2526, // U
    0xFF74 => 0x2528, // E
    0xFF75 => 0x252A, // O
    0xFF76 => 0x252B, // Ka
    0xFF77 => 0x252D, // Ki
    0xFF78 => 0x252F, // Ku
    0xFF79 => 0x2531, // Ke
    0xFF7A => 0x2533, // Ko
    0xFF7B => 0x2535, // Sa
    0xFF7C => 0x2537, // Shi
    0xFF7D => 0x2539, // Su
    0xFF7E => 0x253B, // Se
    0xFF7F => 0x253D, // So
    0xFF80 => 0x253F, // Ta
    0xFF81 => 0x2541, // Chi
    0xFF82 => 0x2544, // Tsu
    0xFF83 => 0x2546, // Te
    0xFF84 => 0x2548, // To
    0xFF85 => 0x254A, // Na
    0xFF86 => 0x254B, // Ni
    0xFF87 => 0x254C, // Nu
    0xFF88 => 0x254D, // Ne
    0xFF89 => 0x254E, // No
    0xFF8A => 0x254F, // Ha
    0xFF8B => 0x2552, // Hi
    0xFF8C => 0x2555, // Fu
    0xFF8D => 0x2558, // He
    0xFF8E => 0x255B, // Ho
    0xFF8F => 0x255E, // Ma
    0xFF90 => 0x255F, // Mi
    0xFF91 => 0x2560, // Mu
    0xFF92 => 0x2561, // Me
    0xFF93 => 0x2562, // Mo
    0xFF94 => 0x2564, // Ya
    0xFF95 => 0x2566, // Yu
    0xFF96 => 0x2568, // Yo
    0xFF97 => 0x2569, // Ra
    0xFF98 => 0x256A, // Ri
    0xFF99 => 0x256B, // Ru
    0xFF9A => 0x256C, // Re
    0xFF9B => 0x256D, // Ro
    0xFF9C => 0x256F, // Wa
    0xFF9D => 0x2573, // N
    0xFF9E => 0x212B, // Voice Mark
    0xFF9F => 0x212C, // Semi-voice Mark
];

/* Converting each halfwidth codepoint to CP50220 must give ESC $ B, the
 * kuten code from the table, and then ESC ( B to return to ASCII */
foreach ($fullwidthKatakana as $cp => $kuten) {
    $halfwidth = pack('n', $cp);
    $expected  = "\x1B\$B" . pack('n', $kuten) . "\x1B(B";
    convertValidString($halfwidth, $expected, 'UTF-16BE', 'CP50220', false);
}

echo "Folding of fullwidth katakana for CP50220 OK\n";
280+
281+
?>
282+
--EXPECT--
283+
ASCII support OK
284+
JIS X 0201 support OK
285+
CP932 support OK
286+
Folding of fullwidth katakana for CP50220 OK

0 commit comments

Comments
 (0)