|
--TEST--
Exhaustive test of CP51932 encoding verification and conversion
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
srand(2020); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in CP51932 */
$validChars = array(); /* CP51932 string -> UTF-16BE string */
$fromUnicode = array(); /* UTF-16BE string -> CP51932 string */

/* The table is only ever read, so open it read-only; a read/write mode would
 * make the test fail needlessly when the source tree is not writable */
$tablePath = __DIR__ . '/data/CP51932.txt';
$fp = fopen($tablePath, 'r');
if ($fp === false)
  die("Could not open conversion table: $tablePath\n");

/* Compare against false explicitly, so that a line which happens to be a
 * falsy string (such as a trailing "0") cannot end the loop prematurely */
while (($line = fgets($fp, 256)) !== false) {
  /* Skip comment lines */
  if ($line[0] === '#')
    continue;

  /* Reset for every line; single-byte entries have no second byte, and the
   * check below relies on $byte2 being falsy in that case */
  $byte2 = null;
  if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
    /* The table we are using tries to map as many Unicode codepoints into
     * CP51932 as possible, including by mapping latin characters with accents
     * to the equivalent without accents; but since CP51932 is based on the
     * CP932 character set, we don't need to handle codepoints which are not
     * mapped from any character in CP932 */
    if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
        ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
        ($codepoint >= 0xF8 && $codepoint <= 0xFF))
      continue;
    $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
    $utf16 = pack('n', $codepoint);
    $validChars[$cp51932] = $utf16;
    $fromUnicode[$utf16] = $cp51932;
  }
}
fclose($fp);

/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
 * But when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */
$fromUnicode["\x30\x1C"] = "\xA1\xC1";
/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS),
 * but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */
$fromUnicode["\x22\x12"] = "\xA1\xDD";
/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
 * but when converting Unicode to CP51932, we also accept U+2016
 * (DOUBLE VERTICAL LINE) */
$fromUnicode["\x20\x16"] = "\xA1\xC2";

/* There are a number of duplicate, irreversible mappings in the CP51932 table
 * In most cases, the one which we primarily use appears last in the table,
 * but in some cases, it is first and will be overwritten in the above loop
 *
 * Interestingly, the "collisions" happen in both directions! Part of this is
 * because the table we are using attempts to map as many Unicode codepoints
 * as possible to CP932 characters */
$fromUnicode["\x22\x20"] = "\xA2\xDC";
$fromUnicode["\x22\x29"] = "\xA2\xC1";
$fromUnicode["\x22\x2B"] = "\xA2\xE9";
$fromUnicode["\x22\x35"] = "\xA2\xE8";
$fromUnicode["\x22\x1A"] = "\xA2\xE5";
$fromUnicode["\x22\x2A"] = "\xA2\xC0";
$fromUnicode["\x22\x61"] = "\xA2\xE1";
$fromUnicode["\x22\xA5"] = "\xA2\xDD";
$fromUnicode["\x22\x52"] = "\xA2\xE2";
$fromUnicode["\xFF\xE2"] = "\xA2\xCC";
unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !
unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character
unset($fromUnicode["\x00\xA9"]); // Don't map © to c
unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator
unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"
unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen
unset($fromUnicode["\x00\xAE"]); // Don't map ® to R
unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron
unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2
unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3
unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu
unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot
unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma
unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1
unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"
unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"
unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu

/* Add the single-byte range 0x00-0x7F to the valid CP51932 -> UTF-16BE
 * mappings; each byte maps to the codepoint with the same value */
for ($i = 0; $i <= 0x7F; $i++)
  $validChars[chr($i)] = "\x00" . chr($i);

testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
echo "CP51932 verification and conversion works on all valid characters\n";

findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + array(0xFD => 2, 0xFE => 2));

testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
echo "CP51932 verification and conversion works on all invalid characters\n";

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints
0 commit comments