Add specialized implementation of mb_strcut for GB18030

alexdowad · alexdowad · commit cffdeb81d5b8 · 2023-12-18T17:01:20.000+02:00
For GB18030, it is not generally possible to identify character
boundaries without scanning through the entire string. Therefore,
implement mb_strcut using a strategy similar to that of the mblen_table-based
implementation in mbstring.c. The difference is that for GB18030, we
need to look at two leading bytes to determine the byte length of a
multi-byte character.

The new implementation is 4-5x faster for short strings, and more than
10x faster for long strings. (Part of the reason this new code has
such a great performance advantage is that it replaces code
based on the older text conversion filters provided by libmbfl, which
were quite slow.)

The behavior is the same as before for valid GB18030 strings; for
some invalid strings, mb_strcut will choose different 'cut' points
as compared to before. (Clang's libFuzzer was used to compare the
old and new implementations, searching for test cases where they had
different behavior; no such cases were found.)
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c
@@ -11560,6 +11560,45 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
 
+/* Step through a GB18030 string one character at a time, treating `p` as the start of a character.
+ * Return the last position at or before `limit` which falls directly after the end of a (single or multi-byte) character */
+static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
+{
+	while (p < limit) {
+		unsigned char c = *p;
+		if (c < 0x81 || c == 0xFF) { /* Bytes outside 0x81-0xFE are stepped over one at a time */
+			p++;
+		} else {
+			if (limit - p == 1) { /* Only this byte remains before `limit`; stop without reading p[1] */
+				break;
+			}
+			unsigned char c2 = p[1];
+			/* For a 4-byte char, the 2nd byte will be 0x30-0x39; any other 2nd byte is stepped over as a 2-byte char */
+			unsigned int w = (c2 >= 0x30 && c2 <= 0x39) ? 4 : 2;
+			if (limit - p < w) { /* The character would run past `limit`; leave `p` at its first byte */
+				break;
+			}
+			p += w;
+		}
+	}
+	return p;
+}
+
+/* Return the bytes of GB18030 string [str, end) from offset `from` up to `len` bytes on, with both cut points
+ * moved back to a character boundary (a cut which reaches `end` is kept as is). `len` is clamped as an integer,
+ * because forming `str + from + len` for an oversized `len` is out-of-bounds pointer arithmetic (UB). */
+static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, unsigned char *end)
+{
+	ZEND_ASSERT(str + from <= end);
+	unsigned char *start = step_through_gb18030_str(str, str + from);
+	size_t avail = (size_t)(end - str) - from; /* Bytes between the requested offset and `end` */
+	if (len > avail) {
+		len = avail;
+	}
+	unsigned char *stop = (start + len >= end) ? end : step_through_gb18030_str(start, start + len);
+	return zend_string_init_fast((const char*)start, stop - start);
+}
+
 static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
 
 static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
@@ -11594,7 +11633,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
 	mb_gb18030_to_wchar,
 	mb_wchar_to_gb18030,
 	NULL,
-	NULL,
+	mb_cut_gb18030,
 };
 
 static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -2363,7 +2363,7 @@ PHP_FUNCTION(mb_strcut)
 	zend_string *encoding = NULL;
 	char *string_val;
 	zend_long from, len;
-	bool len_is_null = 1;
+	bool len_is_null = true;
 	mbfl_string string, result, *ret;
 
 	ZEND_PARSE_PARAMETERS_START(2, 4)
diff --git a/ext/mbstring/tests/mb_strcut.phpt b/ext/mbstring/tests/mb_strcut.phpt
@@ -26,6 +26,7 @@ $jis = mb_convert_encoding("漢字 abc カナ", 'JIS', 'UTF-8');
 $iso2022jp2004 = mb_convert_encoding("漢字 abc カナ凜", 'ISO-2022-JP-2004', 'UTF-8'); // [1b242851 3441 3b7a 1b2842 20 61 62 63 20 1b242851 252b 254a 7425 1b2842]
 $iso2022jpms = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-MS', 'UTF-8'); // [1b2442 3441 3b7a 1b2842 20 61 62 63 20 1b2442 252b 254a 1b2842]
 $iso2022jp_kddi = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-KDDI', 'UTF-8');
+$gb18030 = mb_convert_encoding("漢字 abc カナ", 'GB18030', 'UTF-8');
 
 print "== EUC-JP ==\n";
 print MBStringChars(mb_strcut($euc_jp,  6,   5, 'EUC-JP'), 'EUC-JP') . "\n";
@@ -218,9 +219,37 @@ print "UTF-16 section is terminated improperly: [" . mb_strcut("&i6o\x83", 0, 10
 
 print "== GB18030 ==\n";
 
+print "Empty string: [" . bin2hex(mb_strcut("", 0, 5, 'GB18030')) . "]\n";
+print "Empty string 2: [" . bin2hex(mb_strcut("", -2, 1, 'GB18030')) . "]\n";
+print "Empty string 3: [" . bin2hex(mb_strcut("", 0, -1, 'GB18030')) . "]\n";
 print "Invalid byte 0xF5: [" . bin2hex(mb_strcut("\xF5a", 1, 100, 'GB18030')) . "]\n";
 print "Double-byte char: [" . bin2hex(mb_strcut("\xAFw", -1, 100, "GB18030")) . "]\n";
 
+print MBStringChars(mb_strcut($gb18030, 0, 0, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 0, 1, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 0, 2, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 0, 3, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 0, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 0, 5, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 1, 2, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 1, 3, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($gb18030, 1, 4, 'GB18030'), 'GB18030') . "\n";
+
+// U+210A is encoded using 4 bytes in GB18030
+print "Operating on 4-byte GB18030 character:\n";
+$fourbyte = mb_convert_encoding("\x21\x0A", 'GB18030', 'UTF-16BE');
+print MBStringChars(mb_strcut($fourbyte, 0, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 1, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 3, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 4, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 1, 3, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 2, 3, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n";
+print MBStringChars(mb_strcut($fourbyte, 0, -1, 'GB18030'), 'GB18030') . "\n";
+
+print "[" . bin2hex(mb_strcut(hex2bin("84308130"), 2, null, "GB18030")) . "]\n";
+
 print "== UHC ==\n";
 
 print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n";
@@ -405,8 +434,31 @@ UTF-16 section ends abruptly: []
 UTF-16 section ends abruptly in middle of 2nd codepoint: []
 UTF-16 section is terminated improperly: []
 == GB18030 ==
+Empty string: []
+Empty string 2: []
+Empty string 3: []
 Invalid byte 0xF5: []
 Double-byte char: []
+[]
+[]
+[9d68]
+[9d68]
+[9d68 d7d6]
+[9d68 d7d6 20]
+[9d68]
+[9d68]
+[9d68 d7d6]
+Operating on 4-byte GB18030 character:
+[8136bc32]
+[]
+[]
+[]
+[]
+[]
+[]
+[]
+[]
+[]
 == UHC ==
 Single byte 0x96: [96]
 == ASCII ==