Add fast mb_strcut implementation for UTF-16

alexdowad · alexdowad · commit d04854b38c40 · 2023-10-28T19:09:08.000+02:00
Similar to the fast, specialized mb_strcut implementation for UTF-8 in 1f0cf13, this new implementation of mb_strcut for UTF-16 strings just examines a few bytes before each cut point. Even for short strings, the new implementation is around 2x faster. For strings around 10,000 bytes in length, it comes out about 100-500x faster in my microbenchmarks. The new implementation behaves identically to the old one on valid UTF-16 strings; a fuzzer was used to help verify this.
diff --git a/UPGRADING b/UPGRADING
@@ -101,6 +101,10 @@ PHP 8.4 UPGRADE NOTES
 5. Changed Functions
 ========================================
 
+- MBString:
+  . The behavior of mb_strcut is more consistent now on invalid UTF-8 and UTF-16
+    strings. (For valid UTF-8 and UTF-16 strings, there is no change.)
+
 - PGSQL:
   . pg_select, the conditions arguments accepts an empty array and is optional.
 
@@ -177,3 +181,5 @@ PHP 8.4 UPGRADE NOTES
 
 * The performance of strspn() is greatly improved. It now runs in linear time
   instead of being bounded by quadratic time.
+
+* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -175,6 +175,9 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf
 
 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end);
+static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end);
+static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end);
 
 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
 
@@ -190,7 +193,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
 	mb_utf16_to_wchar,
 	mb_wchar_to_utf16be,
 	NULL,
-	NULL,
+	mb_cut_utf16
 };
 
 const mbfl_encoding mbfl_encoding_utf16be = {
@@ -205,7 +208,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
 	mb_utf16be_to_wchar,
 	mb_wchar_to_utf16be,
 	NULL,
-	NULL,
+	mb_cut_utf16be
 };
 
 const mbfl_encoding mbfl_encoding_utf16le = {
@@ -220,7 +223,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
 	mb_utf16le_to_wchar,
 	mb_wchar_to_utf16le,
 	NULL,
-	NULL,
+	mb_cut_utf16le
 };
 
 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
@@ -1043,3 +1046,89 @@ static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *b
 }
 
 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
+
+static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end)
+{
+	/* Cut at most `len` bytes from the UTF-16BE bytes in [str, end), starting at byte
+	 * offset `from`, without splitting a 16-bit unit or a surrogate pair. `len` is
+	 * limited to the bytes after the requested `from` before both are rounded down */
+	if (len > end - (str + from)) {
+		len = end - (str + from);
+	}
+	from &= ~1;
+	len &= ~1;
+	unsigned char *start = str + from;
+	if (len < 2 || (end - start) < 2) {
+		return zend_empty_string;
+	}
+	/* If the cut would begin on the low half of a surrogate pair, begin on the high
+	 * half instead; `from` is at least 2 here, so the preceding unit is readable */
+	if (from > 0) {
+		uint32_t start_cp = (*start << 8) + *(start + 1);
+		uint32_t preceding_cp = (*(start - 2) << 8) + *(start - 1);
+		if (start_cp >= 0xDC00 && start_cp <= 0xDFFF && preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
+			start -= 2;
+		}
+	}
+	/* `len` is counted from the (possibly moved) start, so the result never exceeds
+	 * it; a high surrogate sitting in the last position of the cut is dropped */
+	unsigned char *_end = (start + len > end) ? end : start + len;
+	uint32_t ending_cp = (*(_end - 2) << 8) + *(_end - 1);
+	if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
+		_end -= 2;
+	}
+	return zend_string_init_fast((char*)start, _end - start);
+}
+
+static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end)
+{
+	/* Cut at most `len` bytes from the UTF-16LE bytes in [str, end), starting at byte
+	 * offset `from`, without splitting a 16-bit unit or a surrogate pair. `len` is
+	 * limited to the bytes after the requested `from` before both are rounded down */
+	if (len > end - (str + from)) {
+		len = end - (str + from);
+	}
+	from &= ~1;
+	len &= ~1;
+	unsigned char *start = str + from;
+	if (len < 2 || (end - start) < 2) {
+		return zend_empty_string;
+	}
+	/* If the cut would begin on the low half of a surrogate pair, begin on the high
+	 * half instead; `from` is at least 2 here, so the preceding unit is readable */
+	if (from > 0) {
+		uint32_t start_cp = (*(start + 1) << 8) + *start;
+		uint32_t preceding_cp = (*(start - 1) << 8) + *(start - 2);
+		if (start_cp >= 0xDC00 && start_cp <= 0xDFFF && preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
+			start -= 2;
+		}
+	}
+	/* `len` is counted from the (possibly moved) start, so the result never exceeds
+	 * it; a high surrogate sitting in the last position of the cut is dropped */
+	unsigned char *_end = (start + len > end) ? end : start + len;
+	uint32_t ending_cp = (*(_end - 1) << 8) + *(_end - 2);
+	if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
+		_end -= 2;
+	}
+	return zend_string_init_fast((char*)start, _end - start);
+}
+
+static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end)
+{
+	/* Cutter for "UTF-16" with an optional byte order mark: the first two bytes,
+	 * read big-endian, pick the byte order. FF FE selects the little-endian
+	 * cutter; everything else, FE FF included, goes to the big-endian cutter */
+	if (len < 2 || (end - str) < 2) {
+		return zend_empty_string;
+	}
+	const uint32_t first_unit = ((uint32_t)str[0] << 8) | str[1];
+	const int is_le = (first_unit == 0xFFFE);
+	/* A cut requested inside the BOM starts right after it; `len` is left as is */
+	if (from < 2 && (is_le || first_unit == 0xFEFF)) {
+		from = 2;
+	}
+	if (is_le) {
+		return mb_cut_utf16le(str, from, len, end);
+	}
+	return mb_cut_utf16be(str, from, len, end);
+}
diff --git a/ext/mbstring/tests/mb_strcut.phpt b/ext/mbstring/tests/mb_strcut.phpt
@@ -248,7 +248,7 @@ OK
 Single byte: []
 With from=1: []
 Bad surrogate: []
-Bad surrogate followed by other bytes: [003f1243]
+Bad surrogate followed by other bytes: [d9001243]
 BE byte order mark: []
 LE byte order mark: []
 Length=0: []