Skip to content

Commit 1f0cf13

Browse files
committed
Add fast mb_strcut implementation for UTF-8
The old implementation runs through the entire string to pick out the part which should be returned by mb_strcut. This creates significant performance overhead.

The new specialized implementation of mb_strcut for UTF-8 usually only examines a few bytes around the starting and ending cut points, meaning it generally runs in constant time.

For UTF-8 strings just a few bytes long, the new implementation is around 10% faster (according to microbenchmarks which I ran locally). For strings around 10,000 bytes in length, it is 50 to 300 times faster. (Yes, that is 300x and not 300%.)

The new implementation behaves identically to the old one on VALID UTF-8 strings; a fuzzer was used to help ensure this is the case.

On invalid UTF-8 strings, there is a difference: in some cases, the old implementation will pass invalid byte sequences through unchanged, while in others it will remove them. The new implementation has behavior which is perhaps slightly more predictable: it simply backs up the starting and ending cut points to the preceding "starter byte" (that is, a byte which is not a UTF-8 continuation byte).
1 parent 3fa836f commit 1f0cf13

20 files changed

+151
-70
lines changed

ext/mbstring/libmbfl/filters/mbfilter_7bit.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
6565
&vtbl_wchar_7bit,
6666
mb_7bit_to_wchar,
6767
mb_wchar_to_7bit,
68-
NULL
68+
NULL,
69+
NULL,
6970
};
7071

7172
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

ext/mbstring/libmbfl/filters/mbfilter_base64.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
4545
NULL,
4646
mb_base64_to_wchar,
4747
mb_wchar_to_base64,
48-
NULL
48+
NULL,
49+
NULL,
4950
};
5051

5152
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 60 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = {
43924392
&vtbl_wchar_jis,
43934393
mb_iso2022jp_to_wchar,
43944394
mb_wchar_to_jis,
4395-
mb_check_jis
4395+
mb_check_jis,
4396+
NULL,
43964397
};
43974398

43984399
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
@@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
44264427
&vtbl_wchar_2022jp,
44274428
mb_iso2022jp_to_wchar,
44284429
mb_wchar_to_iso2022jp,
4429-
mb_check_iso2022jp
4430+
mb_check_iso2022jp,
4431+
NULL,
44304432
};
44314433

44324434
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
@@ -4462,7 +4464,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
44624464
&vtbl_wchar_2022jp_kddi,
44634465
mb_iso2022jp_kddi_to_wchar,
44644466
mb_wchar_to_iso2022jp_kddi,
4465-
NULL
4467+
NULL,
4468+
NULL,
44664469
};
44674470

44684471
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
@@ -4496,7 +4499,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
44964499
&vtbl_wchar_2022jp_2004,
44974500
mb_iso2022jp2004_to_wchar,
44984501
mb_wchar_to_iso2022jp2004,
4499-
NULL
4502+
NULL,
4503+
NULL,
45004504
};
45014505

45024506
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
@@ -4581,7 +4585,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
45814585
&vtbl_wchar_cp50220,
45824586
mb_cp5022x_to_wchar,
45834587
mb_wchar_to_cp50220,
4584-
NULL
4588+
NULL,
4589+
NULL,
45854590
};
45864591

45874592
const mbfl_encoding mbfl_encoding_cp50221 = {
@@ -4595,7 +4600,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
45954600
&vtbl_wchar_cp50221,
45964601
mb_cp5022x_to_wchar,
45974602
mb_wchar_to_cp50221,
4598-
NULL
4603+
NULL,
4604+
NULL,
45994605
};
46004606

46014607
const mbfl_encoding mbfl_encoding_cp50222 = {
@@ -4609,7 +4615,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
46094615
&vtbl_wchar_cp50222,
46104616
mb_cp5022x_to_wchar,
46114617
mb_wchar_to_cp50222,
4612-
NULL
4618+
NULL,
4619+
NULL,
46134620
};
46144621

46154622
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
@@ -4645,7 +4652,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
46454652
&vtbl_wchar_2022jpms,
46464653
mb_iso2022jpms_to_wchar,
46474654
mb_wchar_to_iso2022jpms,
4648-
NULL
4655+
NULL,
4656+
NULL,
46494657
};
46504658

46514659
/* ISO-2022-KR is defined in RFC 1557
@@ -4687,7 +4695,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
46874695
&vtbl_wchar_2022kr,
46884696
mb_iso2022kr_to_wchar,
46894697
mb_wchar_to_iso2022kr,
4690-
NULL
4698+
NULL,
4699+
NULL,
46914700
};
46924701

46934702
/*
@@ -7832,7 +7841,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
78327841
&vtbl_wchar_sjis,
78337842
mb_sjis_to_wchar,
78347843
mb_wchar_to_sjis,
7835-
NULL
7844+
NULL,
7845+
NULL,
78367846
};
78377847

78387848
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};
@@ -7868,7 +7878,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
78687878
&vtbl_wchar_sjis_mac,
78697879
mb_sjismac_to_wchar,
78707880
mb_wchar_to_sjismac,
7871-
NULL
7881+
NULL,
7882+
NULL,
78727883
};
78737884

78747885
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
@@ -7906,7 +7917,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
79067917
&vtbl_wchar_sjis_docomo,
79077918
mb_sjis_docomo_to_wchar,
79087919
mb_wchar_to_sjis_docomo,
7909-
NULL
7920+
NULL,
7921+
NULL,
79107922
};
79117923

79127924
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
@@ -7940,7 +7952,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
79407952
&vtbl_wchar_sjis_kddi,
79417953
mb_sjis_kddi_to_wchar,
79427954
mb_wchar_to_sjis_kddi,
7943-
NULL
7955+
NULL,
7956+
NULL,
79447957
};
79457958

79467959
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
@@ -7974,7 +7987,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
79747987
&vtbl_wchar_sjis_sb,
79757988
mb_sjis_sb_to_wchar,
79767989
mb_wchar_to_sjis_sb,
7977-
NULL
7990+
NULL,
7991+
NULL,
79787992
};
79797993

79807994
/* Although the specification for Shift-JIS-2004 indicates that 0x5C and
@@ -8017,7 +8031,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
80178031
&vtbl_wchar_sjis2004,
80188032
mb_sjis2004_to_wchar,
80198033
mb_wchar_to_sjis2004,
8020-
NULL
8034+
NULL,
8035+
NULL,
80218036
};
80228037

80238038
/* CP932 is Microsoft's version of Shift-JIS.
@@ -8103,7 +8118,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
81038118
&vtbl_wchar_cp932,
81048119
mb_cp932_to_wchar,
81058120
mb_wchar_to_cp932,
8106-
NULL
8121+
NULL,
8122+
NULL,
81078123
};
81088124

81098125
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
@@ -8137,7 +8153,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
81378153
&vtbl_wchar_sjiswin,
81388154
mb_cp932_to_wchar,
81398155
mb_wchar_to_sjiswin,
8140-
NULL
8156+
NULL,
8157+
NULL,
81418158
};
81428159

81438160
/*
@@ -10346,7 +10363,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
1034610363
&vtbl_wchar_eucjp,
1034710364
mb_eucjp_to_wchar,
1034810365
mb_wchar_to_eucjp,
10349-
NULL
10366+
NULL,
10367+
NULL,
1035010368
};
1035110369

1035210370
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};
@@ -10382,7 +10400,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
1038210400
&vtbl_wchar_eucjp2004,
1038310401
mb_eucjp2004_to_wchar,
1038410402
mb_wchar_to_eucjp2004,
10385-
NULL
10403+
NULL,
10404+
NULL,
1038610405
};
1038710406

1038810407
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};
@@ -10418,7 +10437,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
1041810437
&vtbl_wchar_eucjpwin,
1041910438
mb_eucjpwin_to_wchar,
1042010439
mb_wchar_to_eucjpwin,
10421-
NULL
10440+
NULL,
10441+
NULL,
1042210442
};
1042310443

1042410444
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};
@@ -10454,7 +10474,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
1045410474
&vtbl_wchar_cp51932,
1045510475
mb_cp51932_to_wchar,
1045610476
mb_wchar_to_cp51932,
10457-
NULL
10477+
NULL,
10478+
NULL,
1045810479
};
1045910480

1046010481
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
@@ -10509,7 +10530,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
1050910530
&vtbl_wchar_euccn,
1051010531
mb_euccn_to_wchar,
1051110532
mb_wchar_to_euccn,
10512-
NULL
10533+
NULL,
10534+
NULL,
1051310535
};
1051410536

1051510537
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
@@ -10545,7 +10567,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
1054510567
&vtbl_wchar_euctw,
1054610568
mb_euctw_to_wchar,
1054710569
mb_wchar_to_euctw,
10548-
NULL
10570+
NULL,
10571+
NULL,
1054910572
};
1055010573

1055110574
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
@@ -10581,7 +10604,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
1058110604
&vtbl_wchar_euckr,
1058210605
mb_euckr_to_wchar,
1058310606
mb_wchar_to_euckr,
10584-
NULL
10607+
NULL,
10608+
NULL,
1058510609
};
1058610610

1058710611
/* UHC was introduced by MicroSoft in Windows 95, and is also known as CP949.
@@ -10640,7 +10664,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
1064010664
&vtbl_wchar_uhc,
1064110665
mb_uhc_to_wchar,
1064210666
mb_wchar_to_uhc,
10643-
NULL
10667+
NULL,
10668+
NULL,
1064410669
};
1064510670

1064610671
/*
@@ -11555,7 +11580,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
1155511580
&vtbl_wchar_gb18030,
1155611581
mb_gb18030_to_wchar,
1155711582
mb_wchar_to_gb18030,
11558-
NULL
11583+
NULL,
11584+
NULL,
1155911585
};
1156011586

1156111587
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};
@@ -11591,7 +11617,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
1159111617
&vtbl_wchar_cp936,
1159211618
mb_cp936_to_wchar,
1159311619
mb_wchar_to_cp936,
11594-
NULL
11620+
NULL,
11621+
NULL,
1159511622
};
1159611623

1159711624
/*
@@ -12160,7 +12187,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
1216012187
&vtbl_wchar_big5,
1216112188
mb_big5_to_wchar,
1216212189
mb_wchar_to_big5,
12163-
NULL
12190+
NULL,
12191+
NULL,
1216412192
};
1216512193

1216612194
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
@@ -12194,7 +12222,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
1219412222
&vtbl_wchar_cp950,
1219512223
mb_cp950_to_wchar,
1219612224
mb_wchar_to_cp950,
12197-
NULL
12225+
NULL,
12226+
NULL,
1219812227
};
1219912228

1220012229
/*
@@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = {
1256712596
&vtbl_wchar_hz,
1256812597
mb_hz_to_wchar,
1256912598
mb_wchar_to_hz,
12570-
NULL
12599+
NULL,
12600+
NULL,
1257112601
};

ext/mbstring/libmbfl/filters/mbfilter_htmlent.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
6767
&vtbl_wchar_html,
6868
mb_htmlent_to_wchar,
6969
mb_wchar_to_htmlent,
70-
NULL
70+
NULL,
71+
NULL,
7172
};
7273

7374
const struct mbfl_convert_vtbl vtbl_wchar_html = {

ext/mbstring/libmbfl/filters/mbfilter_qprint.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = {
4646
NULL,
4747
mb_qprint_to_wchar,
4848
mb_wchar_to_qprint,
49-
NULL
49+
NULL,
50+
NULL,
5051
};
5152

5253
const struct mbfl_convert_vtbl vtbl_8bit_qprint = {

ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
8787
&vtbl_wchar_##id, \
8888
mb_##id##_to_wchar, \
8989
mb_wchar_to_##id, \
90+
NULL, \
9091
NULL \
9192
}
9293

ext/mbstring/libmbfl/filters/mbfilter_ucs2.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
5757
&vtbl_wchar_ucs2,
5858
mb_ucs2_to_wchar,
5959
mb_wchar_to_ucs2be,
60-
NULL
60+
NULL,
61+
NULL,
6162
};
6263

6364
const mbfl_encoding mbfl_encoding_ucs2be = {
@@ -71,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
7172
&vtbl_wchar_ucs2be,
7273
mb_ucs2be_to_wchar,
7374
mb_wchar_to_ucs2be,
74-
NULL
75+
NULL,
76+
NULL,
7577
};
7678

7779
const mbfl_encoding mbfl_encoding_ucs2le = {
@@ -85,7 +87,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
8587
&vtbl_wchar_ucs2le,
8688
mb_ucs2le_to_wchar,
8789
mb_wchar_to_ucs2le,
88-
NULL
90+
NULL,
91+
NULL,
8992
};
9093

9194
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {

0 commit comments

Comments
 (0)