Skip to content

Commit d04854b

Browse files
committed
Add fast mb_strcut implementation for UTF-16
Similar to the fast, specialized mb_strcut implementation for UTF-8 in 1f0cf13, this new implementation of mb_strcut for UTF-16 strings just examines a few bytes before each cut point. Even for short strings, the new implementation is around 2x faster. For strings around 10,000 bytes in length, it comes out about 100-500x faster in my microbenchmarks. The new implementation behaves identically to the old one on valid UTF-16 strings; a fuzzer was used to help verify this.
1 parent 00c567a commit d04854b

File tree

3 files changed

+99
-4
lines changed

3 files changed

+99
-4
lines changed

UPGRADING

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ PHP 8.4 UPGRADE NOTES
101101
5. Changed Functions
102102
========================================
103103

104+
- MBString:
105+
. The behavior of mb_strcut is more consistent now on invalid UTF-8 and UTF-16
106+
strings. (For valid UTF-8 and UTF-16 strings, there is no change.)
107+
104108
- PGSQL:
105109
. pg_select, the conditions arguments accepts an empty array and is optional.
106110

@@ -177,3 +181,5 @@ PHP 8.4 UPGRADE NOTES
177181

178182
* The performance of strspn() is greatly improved. It now runs in linear time
179183
instead of being bounded by quadratic time.
184+
185+
* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.

ext/mbstring/libmbfl/filters/mbfilter_utf16.c

Lines changed: 92 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf
175175

176176
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
177177
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
178+
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end);
179+
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end);
180+
static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end);
178181

179182
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
180183

@@ -190,7 +193,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
190193
mb_utf16_to_wchar,
191194
mb_wchar_to_utf16be,
192195
NULL,
193-
NULL,
196+
mb_cut_utf16
194197
};
195198

196199
const mbfl_encoding mbfl_encoding_utf16be = {
@@ -205,7 +208,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
205208
mb_utf16be_to_wchar,
206209
mb_wchar_to_utf16be,
207210
NULL,
208-
NULL,
211+
mb_cut_utf16be
209212
};
210213

211214
const mbfl_encoding mbfl_encoding_utf16le = {
@@ -220,7 +223,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
220223
mb_utf16le_to_wchar,
221224
mb_wchar_to_utf16le,
222225
NULL,
223-
NULL,
226+
mb_cut_utf16le
224227
};
225228

226229
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
@@ -1043,3 +1046,89 @@ static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *b
10431046
}
10441047

10451048
#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
1049+
1050+
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end)
1051+
{
1052+
if (len > end - (str + from)) {
1053+
len = end - (str + from);
1054+
}
1055+
from &= ~1;
1056+
len &= ~1;
1057+
unsigned char *start = str + from;
1058+
if (len < 2 || (end - start) < 2) {
1059+
return zend_empty_string;
1060+
}
1061+
/* Check if 1st codepoint is 2nd part of surrogate pair */
1062+
if (from > 0) {
1063+
uint32_t start_cp = (*start << 8) + *(start + 1);
1064+
if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
1065+
uint32_t preceding_cp = (*(start - 2) << 8) + *(start - 1);
1066+
if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
1067+
from -= 2;
1068+
}
1069+
}
1070+
}
1071+
/* Same for ending cut point */
1072+
unsigned char *_end = start + len;
1073+
if (_end > end) {
1074+
_end = end;
1075+
}
1076+
uint32_t ending_cp = (*(_end - 2) << 8) + *(_end - 1);
1077+
if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
1078+
_end -= 2;
1079+
}
1080+
return zend_string_init_fast((char*)start, _end - start);
1081+
}
1082+
1083+
static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end)
1084+
{
1085+
if (len > end - (str + from)) {
1086+
len = end - (str + from);
1087+
}
1088+
from &= ~1;
1089+
len &= ~1;
1090+
unsigned char *start = str + from;
1091+
if (len < 2 || (end - start) < 2) {
1092+
return zend_empty_string;
1093+
}
1094+
/* Check if 1st codepoint is 2nd part of surrogate pair */
1095+
if (from > 0) {
1096+
uint32_t start_cp = (*(start + 1) << 8) + *start;
1097+
if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
1098+
uint32_t preceding_cp = (*(start - 1) << 8) + *(start - 2);
1099+
if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
1100+
from -= 2;
1101+
}
1102+
}
1103+
}
1104+
/* Same for ending cut point */
1105+
unsigned char *_end = start + len;
1106+
if (_end > end) {
1107+
_end = end;
1108+
}
1109+
uint32_t ending_cp = (*(_end - 1) << 8) + *(_end - 2);
1110+
if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
1111+
_end -= 2;
1112+
}
1113+
return zend_string_init_fast((char*)start, _end - start);
1114+
}
1115+
1116+
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end)
1117+
{
1118+
if (len < 2 || (end - str) < 2) {
1119+
return zend_empty_string;
1120+
}
1121+
uint32_t cp = (*str << 8) + *(str + 1);
1122+
if (cp == 0xFFFE) {
1123+
/* Little-endian BOM */
1124+
if (from < 2) {
1125+
from = 2;
1126+
}
1127+
return mb_cut_utf16le(str, from, len, end);
1128+
} else {
1129+
if (cp == 0xFEFF && from < 2) {
1130+
from = 2;
1131+
}
1132+
return mb_cut_utf16be(str, from, len, end);
1133+
}
1134+
}

ext/mbstring/tests/mb_strcut.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ OK
248248
Single byte: []
249249
With from=1: []
250250
Bad surrogate: []
251-
Bad surrogate followed by other bytes: [003f1243]
251+
Bad surrogate followed by other bytes: [d9001243]
252252
BE byte order mark: []
253253
LE byte order mark: []
254254
Length=0: []

0 commit comments

Comments
 (0)