|
| 1 | +--TEST-- |
| 2 | +Exhaustive test of CP50220, CP50221, and CP50222 encodings |
| 3 | +--SKIPIF-- |
| 4 | +<?php |
/* The whole test exercises mbstring, so skip when the extension is missing */
if (!extension_loaded('mbstring')) {
    die('skip mbstring not available');
}
/* This test is exhaustive; allow it to be skipped via SKIP_SLOW_TESTS */
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
| 7 | +?> |
| 8 | +--FILE-- |
| 9 | +<?php |
/* Shared test helpers; presumably the source of identifyValidString(),
 * convertValidString() and testInvalidString() used below */
include 'encoding_tests.inc';
/* Use '%' (0x25) as the substitute character, which is why the invalid-input
 * tests below expect "\x00%" in their UTF-16BE output */
mb_substitute_character(ord('%'));
| 12 | + |
/**
 * Convert a two-byte Shift-JIS (CP932) code to the matching kuten code,
 * expressed as two bytes each offset by 0x20 (the form used after ESC $ B).
 *
 * The lead byte only carries the upper bits of the ku (row) number, because
 * 94 lead bytes were not available. The trail byte falls into one of two
 * ranges of 94 values, and which range it is in supplies the low bit of the
 * row number.
 *
 * @param int $bytes Two-byte code, lead byte in bits 8-15, trail byte in bits 0-7
 * @return int Kuten code, row byte in bits 8-15, cell byte in bits 0-7
 */
function shiftJISDecode($bytes) {
    $lead  = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes come in two runs; the run above 0x9F continues numbering at 31 */
    $runBase = ($lead > 0x9F) ? (0xE0 - 31) : 0x81;
    $row = (($lead - $runBase) << 1) + 0x21;

    if ($trail > 0x9E) {
        /* Upper trail range: this is the second row of the pair */
        $row += 1;
        $cell = $trail - 0x9F;
    } elseif ($trail > 0x7F) {
        /* Lower trail range, above the gap at 0x7F */
        $cell = $trail - 0x80 + 63;
    } else {
        /* Lower trail range, below the gap at 0x7F */
        $cell = $trail - 0x40;
    }

    return ($row << 8) + ($cell + 0x21);
}
| 32 | + |
/* Read in table of all characters in CP932 charset.
 * Data lines are parsed as "0x<bytes>\t0x<codepoint>"; lines starting with
 * '#' are comments, and lines which do not match are ignored. */
$cp932Chars = array(); /* CP932 (as packed 2-byte kuten code) -> UTF-16BE */
/* Open read-only: the table is never written to, and 'r+' would fail if the
 * file or its filesystem is not writable */
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
if ($fp === false) {
    die("Could not open " . __DIR__ . "/data/CP932.txt\n");
}
/* Compare against false explicitly so that a falsy line (such as a final "0"
 * with no newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
        if ($bytes < 256) // Single-byte entries are not part of the 2-byte charset
            continue;
        if ($bytes > 0xFA00) // We don't handle these extra characters from ku 114 and above
            continue;
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
    }
}
fclose($fp);
| 48 | + |
/* Besides the characters in that table, a 'user' area is also supported:
 * lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC (except 0x7F) are assigned
 * consecutive Unicode 'private' codepoints, 0xE000-E757 */
$codepoint = 0xE000;
foreach (range(0xF0, 0xF9) as $lead) {
    foreach (range(0x40, 0xFC) as $trail) {
        if ($trail != 0x7F) {
            $kuten = shiftJISDecode(($lead << 8) + $trail);
            $cp932Chars[pack('n', $kuten)] = pack('n', $codepoint);
            $codepoint++;
        }
    }
}
| 60 | + |
/* 396 Unicode codepoints are non-invertible in CP932, meaning that several
 * CP932 byte sequences map to the same codepoint. Move those entries out of
 * $cp932Chars and into $nonInvertible, so they can be tested in one
 * direction only. */
$nonInvertible = [];
foreach (range(0xED00, 0xEEFF) as $sjis) {
    $bytes = pack('n', shiftJISDecode($sjis));
    if (!isset($cp932Chars[$bytes]))
        continue; // not a mapped character
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
    unset($cp932Chars[$bytes]); // will test these separately
}

/* A few individual codes outside that range are treated the same way; these
 * are taken from the table without an isset() check */
$extraDuplicates = [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C];
foreach ($extraDuplicates as $sjis) {
    $bytes = pack('n', shiftJISDecode($sjis));
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
    unset($cp932Chars[$bytes]); // will test these separately
}
| 76 | + |
/* Read in table of all characters in JISX-0201 charset.
 * Data lines are parsed as "0x<byte>\t0x<codepoint>"; lines starting with
 * '#' are comments, and lines which do not match are ignored. */
$jisx0201Chars = array(); /* JISX0201 (single byte) -> UTF-16BE */
/* Open read-only: the table is never written to, and 'r+' would fail if the
 * file or its filesystem is not writable */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
if ($fp === false) {
    die("Could not open " . __DIR__ . "/data/JISX0201.txt\n");
}
/* Compare against false explicitly so that a falsy line (such as a final "0"
 * with no newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
        $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
}
fclose($fp);
| 87 | + |
/* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
 * differ in a number of places from the table provided by the Unicode
 * Consortium. Each entry below replaces the table's mapping for one kuten
 * code with the codepoint expected from our implementation. */
$cp932Overrides = [
    "\x21\x41" => "\x30\x1C", /* WAVE DASH instead of FULLWIDTH TILDE */
    "\x21\x42" => "\x20\x16", /* DOUBLE VERTICAL LINE instead of PARALLEL TO */
    "\x21\x5D" => "\x22\x12", /* MINUS SIGN instead of FULLWIDTH HYPHEN-MINUS */
    "\x21\x71" => "\x00\xA2", /* CENT SIGN instead of FULLWIDTH CENT SIGN */
    "\x21\x72" => "\x00\xA3", /* POUND SIGN instead of FULLWIDTH POUND SIGN */
    "\x22\x4C" => "\x00\xAC", /* NOT SIGN instead of FULLWIDTH NOT SIGN */
];
foreach ($cp932Overrides as $kutenBytes => $expectedUtf16) {
    $cp932Chars[$kutenBytes] = $expectedUtf16;
}
| 96 | + |
/**
 * Check that $from is identified as valid in $encoding and converts to the
 * UTF-16BE string $to. Unless $bothWays is false, also check that $to
 * converts back to the canonical form of $from.
 *
 * The canonical form drops a redundant leading switch to ASCII and appends
 * the switch back to ASCII which is expected at the end of the output.
 *
 * @param string $from     Input bytes in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 * @param bool   $bothWays Also test the UTF-16BE -> $encoding direction
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-16BE', false);

    if (!$bothWays)
        return;

    /* A leading 0x0F is redundant; it switches to ASCII mode, which is
     * already the default */
    if ($from[0] === "\x0F")
        $from = substr($from, 1);

    /* Likewise for a leading ESC ( B */
    if (strncmp($from, "\x1B(B", 3) === 0)
        $from = substr($from, 3);

    /* If the string switches to a different charset anywhere, it should
     * switch back to ASCII at the end */
    $switchesCharset = false;
    foreach (["\x1B\$B", "\x1B(J", "\x1B(I"] as $escape) {
        if (strpos($from, $escape) !== false) {
            $switchesCharset = true;
            break;
        }
    }
    if ($switchesCharset)
        $from .= "\x1B(B";

    /* For CP50222, input which starts with 0x0E is expected to be closed
     * with 0x0F */
    if ($encoding === 'CP50222' && $from[0] === "\x0E")
        $from .= "\x0F";

    convertValidString($to, $from, 'UTF-16BE', $encoding, false);
}
| 119 | + |
/**
 * Check that the invalid $encoding input $from converts to the UTF-16BE
 * string $to. Delegates to testInvalidString() with UTF-16BE as the target.
 *
 * @param string $from     Invalid input bytes in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 */
function testInvalid($from, $to, $encoding)
{
    $targetEncoding = 'UTF-16BE';
    testInvalidString($from, $to, $encoding, $targetEncoding);
}
| 123 | + |
$cp5022xEncodings = ['CP50220', 'CP50221', 'CP50222'];

/* Every 7-bit byte except the shift codes (0x0E, 0x0F) and ESC (0x1B) is
 * plain ASCII, both by default and after an explicit ESC ( B */
for ($byte = 0; $byte < 0x80; $byte++) {
    if (in_array($byte, [0x0E, 0x0F, 0x1B], true))
        continue;

    $char = chr($byte);
    $expected = "\x00" . $char;

    foreach ($cp5022xEncodings as $enc)
        testValid($char, $expected, $enc);
    foreach ($cp5022xEncodings as $enc)
        testValid("\x1B(B" . $char, $expected, $enc);

    /* 0x0F is the 'Shift In' control code; one direction only */
    testValid("\x0F" . $char, $expected, 'CP50222', false);
}

/* Bytes with the high bit set are invalid in ASCII mode */
for ($byte = 0x80; $byte < 256; $byte++) {
    // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
    if ($byte >= 0xA1 && $byte <= 0xDF)
        continue;

    foreach (['', "\x1B(B", "\x0F"] as $prefix) {
        foreach ($cp5022xEncodings as $enc)
            testInvalid($prefix . chr($byte), "\x00%", $enc);
    }
}

echo "ASCII support OK\n";
| 151 | + |
$cp5022xEncodings = ['CP50220', 'CP50221', 'CP50222'];

/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana, the rest are Latin */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
    $isKana = ord($jisx0201) >= 128;

    if (!$isKana) {
        /* Latin: test the reverse direction only for codepoints above U+0080;
         * presumably lower ones are emitted as plain ASCII instead */
        $roundTrip = $utf16BE > "\x00\x80";
        foreach ($cp5022xEncodings as $enc)
            testValid("\x1B(J" . $jisx0201, $utf16BE, $enc, $roundTrip);
        continue;
    }

    /* Kana: 7-bit form (high bit stripped) after ESC ( I or 0x0E, and the raw
     * 8-bit byte in all three encodings */
    $kana = chr(ord($jisx0201) - 128);
    testValid("\x1B(I" . $kana, $utf16BE, 'CP50221');
    testValid("\x1B(J\x0E" . $kana, $utf16BE, 'CP50222', false); /* 0x0E is the 'Shift Out' control code */
    testValid("\x0E" . $kana, $utf16BE, 'CP50222', false);
    foreach ($cp5022xEncodings as $enc)
        testValid($jisx0201, $utf16BE, $enc, false);
}

/* High-bit bytes outside 0xA1-0xDF are invalid in both JIS X 0201 modes */
for ($byte = 0x80; $byte < 256; $byte++) {
    if ($byte >= 0xA1 && $byte <= 0xDF)
        continue;

    foreach (["\x1B(I", "\x1B(J"] as $escape) {
        foreach ($cp5022xEncodings as $enc)
            testInvalid($escape . chr($byte), "\x00%", $enc);
    }
}

echo "JIS X 0201 support OK\n";
| 182 | + |
$cp5022xEncodings = ['CP50220', 'CP50221', 'CP50222'];

/* All valid CP932 characters, tested in both directions */
foreach ($cp932Chars as $cp932 => $utf16BE) {
    foreach ($cp5022xEncodings as $enc)
        testValid("\x1B\$B" . $cp932, $utf16BE, $enc);
}

/* Non-invertible characters, tested towards Unicode only */
foreach ($nonInvertible as $cp932 => $utf16BE) {
    foreach ($cp5022xEncodings as $enc)
        testValid("\x1B\$B" . $cp932, $utf16BE, $enc, false);
}

/* All invalid 2-byte CP932 characters: every first byte from 0x21 to 0x7E
 * paired with every possible second byte, minus the known valid pairs */
for ($hi = 0x21; $hi <= 0x7E; $hi++) {
    for ($lo = 0; $lo < 256; $lo++) {
        $testString = chr($hi) . chr($lo);
        if (isset($cp932Chars[$testString]) || isset($nonInvertible[$testString]))
            continue;

        foreach ($cp5022xEncodings as $enc)
            testInvalid("\x1B\$B" . $testString, "\x00%", $enc);
    }
}

echo "CP932 support OK\n";
| 208 | + |
/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana.
 * The loop below checks that CP50220 output for each halfwidth codepoint is
 * the ordinary (fullwidth) kuten code rather than a halfwidth form. */
$fullwidthKatakana = [
    0xFF61 => 0x2123, /* Ideographic full stop */
    0xFF62 => 0x2156, /* Left corner bracket */
    0xFF63 => 0x2157, /* Right corner bracket */
    0xFF64 => 0x2122, /* Ideographic comma */
    0xFF65 => 0x2126, /* Katakana middle dot */
    0xFF66 => 0x2572, /* Wo */
    0xFF67 => 0x2521, /* Small A */
    0xFF68 => 0x2523, /* Small I */
    0xFF69 => 0x2525, /* Small U */
    0xFF6A => 0x2527, /* Small E */
    0xFF6B => 0x2529, /* Small O */
    0xFF6C => 0x2563, /* Small Ya */
    0xFF6D => 0x2565, /* Small Yu */
    0xFF6E => 0x2567, /* Small Yo */
    0xFF6F => 0x2543, /* Small Tsu */
    0xFF70 => 0x213C, /* Prolonged Sound Marker */
    0xFF71 => 0x2522, /* A */
    0xFF72 => 0x2524, /* I */
    0xFF73 => 0x2526, /* U */
    0xFF74 => 0x2528, /* E */
    0xFF75 => 0x252A, /* O */
    0xFF76 => 0x252B, /* Ka */
    0xFF77 => 0x252D, /* Ki */
    0xFF78 => 0x252F, /* Ku */
    0xFF79 => 0x2531, /* Ke */
    0xFF7A => 0x2533, /* Ko */
    0xFF7B => 0x2535, /* Sa */
    0xFF7C => 0x2537, /* Shi */
    0xFF7D => 0x2539, /* Su */
    0xFF7E => 0x253B, /* Se */
    0xFF7F => 0x253D, /* So */
    0xFF80 => 0x253F, /* Ta */
    0xFF81 => 0x2541, /* Chi */
    0xFF82 => 0x2544, /* Tsu */
    0xFF83 => 0x2546, /* Te */
    0xFF84 => 0x2548, /* To */
    0xFF85 => 0x254A, /* Na */
    0xFF86 => 0x254B, /* Ni */
    0xFF87 => 0x254C, /* Nu */
    0xFF88 => 0x254D, /* Ne */
    0xFF89 => 0x254E, /* No */
    0xFF8A => 0x254F, /* Ha */
    0xFF8B => 0x2552, /* Hi */
    0xFF8C => 0x2555, /* Fu */
    0xFF8D => 0x2558, /* He */
    0xFF8E => 0x255B, /* Ho */
    0xFF8F => 0x255E, /* Ma */
    0xFF90 => 0x255F, /* Mi */
    0xFF91 => 0x2560, /* Mu */
    0xFF92 => 0x2561, /* Me */
    0xFF93 => 0x2562, /* Mo */
    0xFF94 => 0x2564, /* Ya */
    0xFF95 => 0x2566, /* Yu */
    0xFF96 => 0x2568, /* Yo */
    0xFF97 => 0x2569, /* Ra */
    0xFF98 => 0x256A, /* Ri */
    0xFF99 => 0x256B, /* Ru */
    0xFF9A => 0x256C, /* Re */
    0xFF9B => 0x256D, /* Ro */
    0xFF9C => 0x256F, /* Wa */
    0xFF9D => 0x2573, /* N */
    0xFF9E => 0x212B, /* Voice Mark */
    0xFF9F => 0x212C, /* Semi-voice Mark */
];
foreach ($fullwidthKatakana as $cp => $kuten) {
    $halfwidthUtf16 = pack('n', $cp);
    /* Expected output: switch to the 2-byte charset, emit the kuten bytes,
     * then switch back to ASCII */
    $expectedOutput = "\x1B\$B" . pack('n', $kuten) . "\x1B(B";
    convertValidString($halfwidthUtf16, $expectedOutput, 'UTF-16BE', 'CP50220', false);
}

echo "Folding of fullwidth katakana for CP50220 OK\n";
| 280 | + |
| 281 | +?> |
| 282 | +--EXPECT-- |
| 283 | +ASCII support OK |
| 284 | +JIS X 0201 support OK |
| 285 | +CP932 support OK |
| 286 | +Folding of fullwidth katakana for CP50220 OK |
0 commit comments