Skip to content

Commit cffdeb8

Browse files
committed
Add specialized implementation of mb_strcut for GB18030
For GB18030, it is not generally possible to identify character boundaries without scanning through the entire string. Therefore, implement mb_strcut using a similar strategy as the mblen_table based implementation in mbstring.c. The difference is that for GB18030, we need to look at two leading bytes to determine the byte length of a multi-byte character. The new implementation is 4-5x faster for short strings, and more than 10x faster for long strings. (Part of the reason this new code has such a great performance advantage is that it replaces code based on the older text conversion filters provided by libmbfl, which were quite slow.) The behavior is the same as before for valid GB18030 strings; for some invalid strings, mb_strcut will choose different 'cut' points as compared to before. (Clang's libFuzzer was used to compare the old and new implementations, searching for test cases where they had different behavior; no such cases were found.)
1 parent dd0f2ab commit cffdeb8

File tree

3 files changed

+93
-2
lines changed

3 files changed

+93
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11560,6 +11560,45 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
1156011560
MB_CONVERT_BUF_STORE(buf, out, limit);
1156111561
}
1156211562

11563+
/* Advance over a GB18030 string one whole character at a time, starting from `p`.
 *
 * Returns the furthest position that is not beyond `limit` and that lies immediately
 * after a complete (single or multi-byte) character. A character which would straddle
 * `limit` is not stepped over, so the result may be smaller than `limit`.
 *
 * Bytes below 0x81 and the byte 0xFF are treated as single-byte characters. Any other
 * byte starts a multi-byte character whose width is decided by the byte after it. */
static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
{
	while (p < limit) {
		size_t remaining = (size_t)(limit - p);
		unsigned char lead = *p;
		size_t width;

		if (lead >= 0x81 && lead != 0xFF) {
			/* The second byte is needed to learn the width; if it lies beyond
			 * `limit`, this character cannot be included */
			if (remaining < 2) {
				return p;
			}
			/* A second byte of 0x30-0x39 marks a 4-byte character; anything else
			 * means a 2-byte character */
			unsigned char second = p[1];
			width = (second < 0x30 || second > 0x39) ? 2 : 4;
		} else {
			width = 1;
		}

		/* Stop rather than step over a character which crosses `limit` */
		if (remaining < width) {
			return p;
		}
		p += width;
	}
	return p;
}
11586+
11587+
/* Extract a byte-limited piece of a GB18030 string without splitting a multi-byte
 * character.
 *
 * `str` .. `end` delimits the input string. `from` is the requested starting byte
 * offset and must not point past `end`; `len` is the maximum number of bytes to
 * return and may exceed what remains in the string.
 *
 * The starting point is moved back to the last character boundary at or before
 * `from` (as determined by scanning from the beginning of the string). The result
 * then runs from that adjusted start to the last character boundary which is no
 * more than `len` bytes after it. Returns a newly initialized zend_string. */
static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	ZEND_ASSERT(str + from <= end);
	unsigned char *start = step_through_gb18030_str(str, str + from);

	/* Clamp `len` to the bytes available after `from`. Compare byte counts rather
	 * than pointers: forming `str + from + len` for an oversized `len` would be
	 * out-of-bounds pointer arithmetic, which is undefined and may wrap around */
	size_t available = (size_t)(end - str) - from;
	if (len > available) {
		len = available;
	}

	/* The requested range reaches the end of the string, which is always a valid
	 * cut point, so no further scanning is needed */
	if (len >= (size_t)(end - start)) {
		return zend_string_init_fast((const char*)start, end - start);
	}

	/* Otherwise, back the end position up so it does not fall inside a character */
	unsigned char *cut = step_through_gb18030_str(start, start + len);
	return zend_string_init_fast((const char*)start, cut - start);
}
11601+
1156311602
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
1156411603

1156511604
static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
@@ -11594,7 +11633,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
1159411633
mb_gb18030_to_wchar,
1159511634
mb_wchar_to_gb18030,
1159611635
NULL,
11597-
NULL,
11636+
mb_cut_gb18030,
1159811637
};
1159911638

1160011639
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};

ext/mbstring/mbstring.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2363,7 +2363,7 @@ PHP_FUNCTION(mb_strcut)
23632363
zend_string *encoding = NULL;
23642364
char *string_val;
23652365
zend_long from, len;
2366-
bool len_is_null = 1;
2366+
bool len_is_null = true;
23672367
mbfl_string string, result, *ret;
23682368

23692369
ZEND_PARSE_PARAMETERS_START(2, 4)

ext/mbstring/tests/mb_strcut.phpt

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ $jis = mb_convert_encoding("漢字 abc カナ", 'JIS', 'UTF-8');
2626
$iso2022jp2004 = mb_convert_encoding("漢字 abc カナ凜", 'ISO-2022-JP-2004', 'UTF-8'); // [1b242851 3441 3b7a 1b2842 20 61 62 63 20 1b242851 252b 254a 7425 1b2842]
2727
$iso2022jpms = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-MS', 'UTF-8'); // [1b2442 3441 3b7a 1b2842 20 61 62 63 20 1b2442 252b 254a 1b2842]
2828
$iso2022jp_kddi = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-KDDI', 'UTF-8');
29+
$gb18030 = mb_convert_encoding("漢字 abc カナ", 'GB18030', 'UTF-8');
2930

3031
print "== EUC-JP ==\n";
3132
print MBStringChars(mb_strcut($euc_jp, 6, 5, 'EUC-JP'), 'EUC-JP') . "\n";
@@ -218,9 +219,37 @@ print "UTF-16 section is terminated improperly: [" . mb_strcut("&i6o\x83", 0, 10
218219

219220
print "== GB18030 ==\n";
220221

222+
print "Empty string: [" . bin2hex(mb_strcut("", 0, 5, 'GB18030')) . "]\n";
223+
print "Empty string 2: [" . bin2hex(mb_strcut("", -2, 1, 'GB18030')) . "]\n";
224+
print "Empty string 3: [" . bin2hex(mb_strcut("", 0, -1, 'GB18030')) . "]\n";
221225
print "Invalid byte 0xF5: [" . bin2hex(mb_strcut("\xF5a", 1, 100, 'GB18030')) . "]\n";
222226
print "Double-byte char: [" . bin2hex(mb_strcut("\xAFw", -1, 100, "GB18030")) . "]\n";
223227

228+
print MBStringChars(mb_strcut($gb18030, 0, 0, 'GB18030'), 'GB18030') . "\n";
229+
print MBStringChars(mb_strcut($gb18030, 0, 1, 'GB18030'), 'GB18030') . "\n";
230+
print MBStringChars(mb_strcut($gb18030, 0, 2, 'GB18030'), 'GB18030') . "\n";
231+
print MBStringChars(mb_strcut($gb18030, 0, 3, 'GB18030'), 'GB18030') . "\n";
232+
print MBStringChars(mb_strcut($gb18030, 0, 4, 'GB18030'), 'GB18030') . "\n";
233+
print MBStringChars(mb_strcut($gb18030, 0, 5, 'GB18030'), 'GB18030') . "\n";
234+
print MBStringChars(mb_strcut($gb18030, 1, 2, 'GB18030'), 'GB18030') . "\n";
235+
print MBStringChars(mb_strcut($gb18030, 1, 3, 'GB18030'), 'GB18030') . "\n";
236+
print MBStringChars(mb_strcut($gb18030, 1, 4, 'GB18030'), 'GB18030') . "\n";
237+
238+
// U+210A is encoded using 4 bytes in GB18030
239+
print "Operating on 4-byte GB18030 character:\n";
240+
$fourbyte = mb_convert_encoding("\x21\x0A", 'GB18030', 'UTF-16BE');
241+
print MBStringChars(mb_strcut($fourbyte, 0, 4, 'GB18030'), 'GB18030') . "\n";
242+
print MBStringChars(mb_strcut($fourbyte, 1, 4, 'GB18030'), 'GB18030') . "\n";
243+
print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n";
244+
print MBStringChars(mb_strcut($fourbyte, 3, 4, 'GB18030'), 'GB18030') . "\n";
245+
print MBStringChars(mb_strcut($fourbyte, 4, 4, 'GB18030'), 'GB18030') . "\n";
246+
print MBStringChars(mb_strcut($fourbyte, 1, 3, 'GB18030'), 'GB18030') . "\n";
247+
print MBStringChars(mb_strcut($fourbyte, 2, 3, 'GB18030'), 'GB18030') . "\n";
248+
print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n";
249+
print MBStringChars(mb_strcut($fourbyte, 0, -1, 'GB18030'), 'GB18030') . "\n";
250+
251+
print "[" . bin2hex(mb_strcut(hex2bin("84308130"), 2, null, "GB18030")) . "]\n";
252+
224253
print "== UHC ==\n";
225254

226255
print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n";
@@ -405,8 +434,31 @@ UTF-16 section ends abruptly: []
405434
UTF-16 section ends abruptly in middle of 2nd codepoint: []
406435
UTF-16 section is terminated improperly: []
407436
== GB18030 ==
437+
Empty string: []
438+
Empty string 2: []
439+
Empty string 3: []
408440
Invalid byte 0xF5: []
409441
Double-byte char: []
442+
[]
443+
[]
444+
[9d68]
445+
[9d68]
446+
[9d68 d7d6]
447+
[9d68 d7d6 20]
448+
[9d68]
449+
[9d68]
450+
[9d68 d7d6]
451+
Operating on 4-byte GB18030 character:
452+
[8136bc32]
453+
[]
454+
[]
455+
[]
456+
[]
457+
[]
458+
[]
459+
[]
460+
[]
461+
[]
410462
== UHC ==
411463
Single byte 0x96: [96]
412464
== ASCII ==

0 commit comments

Comments
 (0)