Enhance handling of CP51932 encoding

alexdowad · alexdowad · commit 5c805655db11 · 2020-11-25T20:51:44.000+02:00
- Don't pass 'control' characters through in the middle of a multi-byte char
- Treat truncated multi-byte characters as an error
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c
@@ -34,6 +34,8 @@
 #include "unicode_table_jis.h"
 #include "cp932_table.h"
 
+static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter);
+
 static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -72,7 +74,7 @@ const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
 	mbfl_filt_conv_common_ctor,
 	NULL,
 	mbfl_filt_conv_cp51932_wchar,
-	mbfl_filt_conv_common_flush,
+	mbfl_filt_conv_cp51932_wchar_flush,
 	NULL,
 };
 
@@ -105,17 +107,15 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
 
 	switch (filter->status) {
 	case 0:
-		if (c >= 0 && c < 0x80) {	/* latin */
+		if (c >= 0 && c < 0x80) { /* latin */
 			CK((*filter->output_function)(c, filter->data));
-		} else if (c > 0xa0 && c < 0xff) {	/* CP932 first char */
+		} else if (c >= 0xA1 && c <= 0xFE) { /* CP932, first byte */
 			filter->status = 1;
 			filter->cache = c;
-		} else if (c == 0x8e) {	/* kana first char */
+		} else if (c == 0x8e) { /* kana first char */
 			filter->status = 2;
 		} else {
-			w = c & MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
-			CK((*filter->output_function)(w, filter->data));
+			CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
 		}
 		break;
 
@@ -152,17 +152,11 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
 				}
 			}
 			if (w <= 0) {
-				w = ((c1 & 0x7f) << 8) | (c & 0x7f);
-				w &= MBFL_WCSPLANE_MASK;
-				w |= MBFL_WCSPLANE_WINCP932;
+				w = ((c1 & 0x7f) << 8) | (c & 0x7f) | MBFL_WCSPLANE_WINCP932;
 			}
 			CK((*filter->output_function)(w, filter->data));
-		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
-			CK((*filter->output_function)(c, filter->data));
 		} else {
-			w = (c1 << 8) | c;
-			w &= MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
+			w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
@@ -172,12 +166,8 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
 		if (c > 0xa0 && c < 0xe0) {
 			w = 0xfec0 + c;
 			CK((*filter->output_function)(w, filter->data));
-		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
-			CK((*filter->output_function)(c, filter->data));
 		} else {
-			w = 0x8e00 | c;
-			w &= MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
+			w = 0x8e00 | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
@@ -190,6 +180,20 @@ mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
 	return c;
 }
 
+/* Flush handler: report a multi-byte character cut short by end of input, then flush downstream. */
+static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
+{
+	if (filter->status) {
+		/* Truncated input. The 0x8e branch sets status 2 without storing the byte in cache, so name it here. */
+		(*filter->output_function)((filter->status == 2 ? 0x8e : filter->cache) | MBFL_WCSGROUP_THROUGH, filter->data);
+		filter->status = 0; /* pending byte is consumed; a repeated flush must not report it again */
+	}
+	if (filter->flush_function) {
+		(*filter->flush_function)(filter->data);
+	}
+	return 0; /* always 0: return values of the output and flush callbacks are not propagated */
+}
+
 /*
  * wchar => cp51932
  */
diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt
@@ -0,0 +1,104 @@
+--TEST--
+Exhaustive test of CP51932 encoding verification and conversion
+--SKIPIF--
+<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
+--FILE--
+<?php
+srand(2020); /* Make results consistent */
+include('encoding_tests.inc');
+mb_substitute_character(0x25); // '%'
+
+/* Read in the table of all characters in CP51932 */
+$validChars = array(); /* CP51932 string -> UTF-16BE string */
+$fromUnicode = array(); /* UTF-16BE string -> CP51932 string */
+
+$fp = fopen(realpath(__DIR__ . '/data/CP51932.txt'), 'r'); /* only read, so it must also work on read-only trees */
+while ($line = fgets($fp, 256)) {
+  if ($line[0] == '#')
+    continue;
+
+  $byte2 = null; /* reset so a one-byte entry never reuses the previous line's second byte */
+  if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
+    /* The table we are using tries to map as many Unicode codepoints into
+     * CP51932 as possible, including by mapping latin characters with accents
+     * to the equivalent without accents; but since CP51932 is based on the
+     * CP932 character set, we don't need to handle codepoints which are not
+     * mapped from any character in CP932 */
+    if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
+        ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
+        ($codepoint >= 0xF8 && $codepoint <= 0xFF))
+      continue;
+    $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
+    $utf16 = pack('n', $codepoint);
+    $validChars[$cp51932] = $utf16;
+    $fromUnicode[$utf16] = $cp51932;
+  }
+}
+
+/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE),
+ * but when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */
+$fromUnicode["\x30\x1C"] = "\xA1\xC1";
+/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS),
+ * but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */
+$fromUnicode["\x22\x12"] = "\xA1\xDD";
+/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
+ * but when converting Unicode to CP51932, we also accept U+2016
+ * (DOUBLE VERTICAL LINE) */
+$fromUnicode["\x20\x16"] = "\xA1\xC2";
+
+/* There are a number of duplicate, irreversible mappings in the CP51932 table.
+ * In most cases, the one which we primarily use appears last in the table,
+ * but in some cases, it is first and will be overwritten in the above loop;
+ * the assignments and unset() calls below set the mappings this test expects.
+ * Interestingly, the "collisions" happen in both directions! Part of this is
+ * because the table we are using attempts to map as many Unicode codepoints
+ * as possible to CP932 characters. */
+$fromUnicode["\x22\x20"] = "\xA2\xDC";
+$fromUnicode["\x22\x29"] = "\xA2\xC1";
+$fromUnicode["\x22\x2B"] = "\xA2\xE9";
+$fromUnicode["\x22\x35"] = "\xA2\xE8";
+$fromUnicode["\x22\x1A"] = "\xA2\xE5";
+$fromUnicode["\x22\x2A"] = "\xA2\xC0";
+$fromUnicode["\x22\x61"] = "\xA2\xE1";
+$fromUnicode["\x22\xA5"] = "\xA2\xDD";
+$fromUnicode["\x22\x52"] = "\xA2\xE2";
+$fromUnicode["\xFF\xE2"] = "\xA2\xCC";
+unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !
+unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character
+unset($fromUnicode["\x00\xA9"]); // Don't map © to c
+unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator
+unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"
+unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen
+unset($fromUnicode["\x00\xAE"]); // Don't map ® to R
+unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron
+unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2
+unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3
+unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu
+unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot
+unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma
+unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1
+unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"
+unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"
+unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu
+
+for ($i = 0; $i <= 0x7F; $i++)
+  $validChars[chr($i)] = "\x00" . chr($i); /* bytes 0x00-0x7F: same value, written as one UTF-16BE code unit */
+
+testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
+testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
+echo "CP51932 verification and conversion works on all valid characters\n";
+/* NOTE(review): the last argument looks like a lead byte => sequence length map for ranges absent from the table; confirm in encoding_tests.inc */
+findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + array(0xFD => 2, 0xFE => 2));
+/* "\x00%" is '%' (the substitute character configured above) encoded as UTF-16BE */
+testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
+testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
+echo "CP51932 verification and conversion works on all invalid characters\n";
+/* Reverse direction: here the expected replacement is a plain '%' byte in the CP51932 output */
+findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
+convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
+echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
+?>
+--EXPECT--
+CP51932 verification and conversion works on all valid characters
+CP51932 verification and conversion works on all invalid characters
+Unicode -> CP51932 conversion works on all invalid codepoints
diff --git a/ext/mbstring/tests/data/CP51932.txt b/ext/mbstring/tests/data/CP51932.txt