Skip to content

Commit 7658220

Browse files
authored
Improve performance of mbfl_name2encoding() by using perfect hashing (#12707)
mbfl_name2encoding() looks up an encoding by looping linearly through the list of encodings and comparing the name one by one, which is very slow. For the benchmark [1], just looking up the name takes about 50% of the run time. By using perfect hashing instead, we no longer have to loop over the list, and the number of string comparisons is reduced to a single one. The perfect hashing table was generated using GNU gperf, then manually amended to fit in with mbstring and to reduce its cache footprint. [1] #12684 (comment)
1 parent 931a8b0 commit 7658220

File tree

2 files changed

+204
-3
lines changed

2 files changed

+204
-3
lines changed

UPGRADING

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,5 @@ PHP 8.4 UPGRADE NOTES
227227
* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.
228228

229229
* get_browser() is much faster now, up to 1.5x - 2.5x for some test cases.
230+
231+
* Looking up mbstring encoding names is much faster now.

ext/mbstring/libmbfl/mbfl/mbfl_encoding.c

Lines changed: 202 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,13 +144,212 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
144144
NULL
145145
};
146146

147+
/* Perfect-hash lookup table for encoding names.
 *
 * Both this table and the accompanying hashing code originate from GNU gperf:
 *   gperf encodings.txt --readonly-tables --null-strings --ignore-case
 * where encodings.txt contains all the contents of the name fields of the
 * mbfl_encoding_ptr_list table.
 *
 * The gperf output was amended so that each slot holds an index into
 * mbfl_encoding_ptr_list (the table above) instead of a copy of the entry,
 * which makes it lighter for the data cache. A slot is addressed by the hash
 * key of a name; -1 marks a key that belongs to no encoding name. */
static const int8_t mbfl_encoding_ptr_list_after_hashing[187] = {
	/*   0 */ -1, -1, -1, 65, 23,  9, -1, 60, 36, -1,
	/*  10 */ -1, 58, 42, -1, -1, 18, 27, 77, 26, 40,
	/*  20 */ 72, 12, 10,  2, 31, -1, -1, 75, 74, 33,
	/*  30 */ 45, -1, 67, 13, -1, 51, 53, 11,  1, -1,
	/*  40 */ 48, 56, -1, 38, 20, 46, 54, -1, 14, 24,
	/*  50 */ 44, 39, 43, -1, 30, 49, 57, 76, -1, -1,
	/*  60 */ 68, 73,  7, 16, -1, 35, 66, -1, -1, -1,
	/*  70 */ 47, 55, -1, -1, -1, 63, 15,  8, 17, -1,
	/*  80 */ 21, 70, -1, 29,  5,  6, 61, -1, -1, 71,
	/*  90 */ 52,  3, 37, -1, -1, 28, -1, -1, -1, 32,
	/* 100 */ 50, 34, -1, -1, -1, 62, -1, -1, -1, -1,
	/* 110 */ -1, -1, -1, -1, -1, 59,  0, -1, -1, -1,
	/* 120 */ -1, 22, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 130 */ -1, 25, 41, -1, -1, -1, -1, -1, -1, -1,
	/* 140 */ -1, -1, -1, -1, -1, 19, -1, -1, -1,  4,
	/* 150 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 160 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 170 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 180 */ -1, 69, -1, -1, -1, -1, 64
};
260+
261+
/* Computes the perfect-hash key of an encoding name.
 *
 * The key is the name length plus the weights of the bytes at offsets
 * 0, 2, 4, 5 and 6 (each only when the name is long enough to contain it)
 * plus the weight of the last byte.
 *
 * str: the name to hash; only the first `len` bytes are inspected.
 * len: length of the name in bytes; must be at least 1, because the last
 *      byte (str[len - 1]) is always read.
 *
 * Returns the key. Any hashed byte with weight 187 makes the result at
 * least 187. */
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
{
	/* Weight of every possible byte value. ASCII upper- and lowercase
	 * letters carry identical weights, so the key ignores letter case. */
	static const unsigned char asso_values[] =
	{
		/*   0 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/*  10 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/*  20 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/*  30 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/*  40 */ 187, 187, 187, 187, 187,   0, 187, 187,   5,  20,
		/*  50 */   0,  15,  40,  10,  25,  70,   5,  60, 187, 187,
		/*  60 */ 187, 187, 187, 187, 187,  75,   5,   0,  20,   5,
		/*  70 */   0,  75,   5,   0,  40,  75,  20,   0,   0,   0,
		/*  80 */  35,  45,  50,   0,  75,   0, 187,   0, 187, 187,
		/*  90 */   0, 187, 187, 187, 187, 187, 187,  75,   5,   0,
		/* 100 */  20,   5,   0,  75,   5,   0,  40,  75,  20,   0,
		/* 110 */   0,   0,  35,  45,  50,   0,  75,   0, 187,   0,
		/* 120 */ 187, 187,   0, 187, 187, 187, 187, 187, 187, 187,
		/* 130 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 140 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 150 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 160 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 170 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 180 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 190 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 200 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 210 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 220 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 230 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 240 */ 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
		/* 250 */ 187, 187, 187, 187, 187, 187
	};
	const unsigned char *bytes = (const unsigned char *) str;
	unsigned int hval = (unsigned int) len;
	/* Offset of the last byte as an unsigned value. A length of 0 wraps
	 * around to the largest value and is thereby treated like a long name. */
	const unsigned int last = hval - 1u;

	if (last >= 6u) {
		hval += asso_values[bytes[6]];
	}
	if (last >= 5u) {
		hval += asso_values[bytes[5]];
	}
	if (last >= 4u) {
		hval += asso_values[bytes[4]];
	}
	if (last >= 2u) {
		hval += asso_values[bytes[2]];
	}
	/* The first byte takes part in the key for every length. */
	hval += asso_values[bytes[0]];

	return hval + asso_values[bytes[len - 1]];
}
316+
317+
/* Shortest name length for which mbfl_name2encoding() attempts the perfect-hash lookup. */
#define NAME_HASH_MIN_NAME_LENGTH 2
318+
/* Longest name length for which mbfl_name2encoding() attempts the perfect-hash lookup. */
#define NAME_HASH_MAX_NAME_LENGTH 23
319+
147320
const mbfl_encoding *mbfl_name2encoding(const char *name)
148321
{
149-
const mbfl_encoding **encoding;
322+
const mbfl_encoding *const *encoding;
150323

324+
/* Sanity check perfect hash for name.
325+
* Never enable this in production, this is only a development-time sanity check! */
326+
#if ZEND_DEBUG && 0
151327
for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
152-
if (strcasecmp((*encoding)->name, name) == 0) {
153-
return *encoding;
328+
size_t name_length = strlen((*encoding)->name);
329+
if (!(name_length <= NAME_HASH_MAX_NAME_LENGTH && name_length >= NAME_HASH_MIN_NAME_LENGTH)) {
330+
fprintf(stderr, "name length is not satisfying bound check: %zu %s\n", name_length, (*encoding)->name);
331+
abort();
332+
}
333+
unsigned int key = mbfl_name2encoding_perfect_hash((*encoding)->name, name_length);
334+
if (mbfl_encoding_ptr_list[mbfl_encoding_ptr_list_after_hashing[key]] != *encoding) {
335+
fprintf(stderr, "mbfl_name2encoding_perfect_hash: key %u %s mismatch\n", key, (*encoding)->name);
336+
abort();
337+
}
338+
}
339+
#endif
340+
341+
/* Use perfect hash lookup for name */
342+
size_t name_len = strlen(name);
343+
if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
344+
unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
345+
if (key <= 186) {
346+
int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
347+
if (offset >= 0) {
348+
encoding = mbfl_encoding_ptr_list + offset;
349+
if (strcasecmp((*encoding)->name, name) == 0) {
350+
return *encoding;
351+
}
352+
}
154353
}
155354
}
156355

0 commit comments

Comments
 (0)