Skip to content

Commit ea3f0ee

Browse files
committed
Optimize php_unicode_convert_case (cuts mbstring case conversion time ~15%)
This function uses various subfunctions to convert the case of Unicode wchars. Previously, these subfunctions would store the case-converted characters in a buffer, and the parent function would then pass them (one codepoint at a time) to the next filter in the filter chain. Rather than passing around that buffer, it's better for the subfunctions to pass the case-converted codepoints directly to the next filter in the filter chain. This speeds things up nicely.
1 parent ddc76e5 commit ea3f0ee

File tree

1 file changed

+47
-44
lines changed

1 file changed

+47
-44
lines changed

ext/mbstring/php_unicode.c

Lines changed: 47 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -249,53 +249,60 @@ static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_enc
249249
return code;
250250
}
251251

252-
static inline unsigned php_unicode_tolower_full(
253-
unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
252+
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
253+
mbfl_convert_filter* next_filter) {
254254
code = php_unicode_tolower_raw(code, enc);
255255
if (UNEXPECTED(code > 0xffffff)) {
256256
unsigned len = code >> 24;
257257
const unsigned *p = &_uccase_extra_table[code & 0xffffff];
258-
memcpy(out, p + 1, len * sizeof(unsigned));
259-
return len;
258+
while (len--) {
259+
(next_filter->filter_function)(*++p, next_filter);
260+
}
261+
} else {
262+
(next_filter->filter_function)(code, next_filter);
260263
}
261-
*out = code;
262-
return 1;
263264
}
264-
static inline unsigned php_unicode_toupper_full(
265-
unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
265+
266+
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
267+
mbfl_convert_filter* next_filter) {
266268
code = php_unicode_toupper_raw(code, enc);
267269
if (UNEXPECTED(code > 0xffffff)) {
268270
unsigned len = code >> 24;
269271
const unsigned *p = &_uccase_extra_table[code & 0xffffff];
270-
memcpy(out, p + 1, len * sizeof(unsigned));
271-
return len;
272+
while (len--) {
273+
(next_filter->filter_function)(*++p, next_filter);
274+
}
275+
} else {
276+
(next_filter->filter_function)(code, next_filter);
272277
}
273-
*out = code;
274-
return 1;
275278
}
276-
static inline unsigned php_unicode_totitle_full(
277-
unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
279+
280+
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
281+
mbfl_convert_filter* next_filter) {
278282
code = php_unicode_totitle_raw(code, enc);
279283
if (UNEXPECTED(code > 0xffffff)) {
280284
unsigned len = code >> 24;
281285
const unsigned *p = &_uccase_extra_table[code & 0xffffff];
282-
memcpy(out, p + 1, len * sizeof(unsigned));
283-
return len;
286+
while (len--) {
287+
(next_filter->filter_function)(*++p, next_filter);
288+
}
289+
} else {
290+
(next_filter->filter_function)(code, next_filter);
284291
}
285-
*out = code;
286-
return 1;
287292
}
288-
static inline unsigned php_unicode_tofold_full(
289-
unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
293+
294+
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
295+
mbfl_convert_filter* next_filter) {
290296
code = php_unicode_tofold_raw(code, enc);
291297
if (UNEXPECTED(code > 0xffffff)) {
292298
unsigned len = code >> 24;
293299
const unsigned *p = &_uccase_extra_table[code & 0xffffff];
294-
memcpy(out, p + 1, len * sizeof(unsigned));
295-
return len;
300+
while (len--) {
301+
(next_filter->filter_function)(*++p, next_filter);
302+
}
303+
} else {
304+
(next_filter->filter_function)(code, next_filter);
296305
}
297-
*out = code;
298-
return 1;
299306
}
300307

301308
struct convert_case_data {
@@ -308,8 +315,7 @@ struct convert_case_data {
308315
static int convert_case_filter(int c, void *void_data)
309316
{
310317
struct convert_case_data *data = (struct convert_case_data *) void_data;
311-
unsigned out[3];
312-
unsigned len, i;
318+
unsigned code;
313319

314320
/* Handle invalid characters early, as we assign special meaning to
315321
* codepoints above 0xffffff. */
@@ -320,48 +326,48 @@ static int convert_case_filter(int c, void *void_data)
320326

321327
switch (data->case_mode) {
322328
case PHP_UNICODE_CASE_UPPER_SIMPLE:
323-
out[0] = php_unicode_toupper_simple(c, data->no_encoding);
324-
len = 1;
329+
code = php_unicode_toupper_simple(c, data->no_encoding);
330+
(data->next_filter->filter_function)(code, data->next_filter);
325331
break;
326332

327333
case PHP_UNICODE_CASE_UPPER:
328-
len = php_unicode_toupper_full(c, data->no_encoding, out);
334+
php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
329335
break;
330336

331337
case PHP_UNICODE_CASE_LOWER_SIMPLE:
332-
out[0] = php_unicode_tolower_simple(c, data->no_encoding);
333-
len = 1;
338+
code = php_unicode_tolower_simple(c, data->no_encoding);
339+
(data->next_filter->filter_function)(code, data->next_filter);
334340
break;
335341

336342
case PHP_UNICODE_CASE_LOWER:
337-
len = php_unicode_tolower_full(c, data->no_encoding, out);
343+
php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
338344
break;
339345

340346
case PHP_UNICODE_CASE_FOLD:
341-
len = php_unicode_tofold_full(c, data->no_encoding, out);
347+
php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
342348
break;
343349

344350
case PHP_UNICODE_CASE_FOLD_SIMPLE:
345-
out[0] = php_unicode_tofold_simple(c, data->no_encoding);
346-
len = 1;
351+
code = php_unicode_tofold_simple(c, data->no_encoding);
352+
(data->next_filter->filter_function)(code, data->next_filter);
347353
break;
348354

349355
case PHP_UNICODE_CASE_TITLE_SIMPLE:
350356
case PHP_UNICODE_CASE_TITLE:
351357
{
352358
if (data->title_mode) {
353359
if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
354-
out[0] = php_unicode_tolower_simple(c, data->no_encoding);
355-
len = 1;
360+
code = php_unicode_tolower_simple(c, data->no_encoding);
361+
(data->next_filter->filter_function)(code, data->next_filter);
356362
} else {
357-
len = php_unicode_tolower_full(c, data->no_encoding, out);
363+
php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
358364
}
359365
} else {
360366
if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
361-
out[0] = php_unicode_totitle_simple(c, data->no_encoding);
362-
len = 1;
367+
code = php_unicode_totitle_simple(c, data->no_encoding);
368+
(data->next_filter->filter_function)(code, data->next_filter);
363369
} else {
364-
len = php_unicode_totitle_full(c, data->no_encoding, out);
370+
php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
365371
}
366372
}
367373
if (!php_unicode_is_case_ignorable(c)) {
@@ -372,9 +378,6 @@ static int convert_case_filter(int c, void *void_data)
372378
EMPTY_SWITCH_DEFAULT_CASE()
373379
}
374380

375-
for (i = 0; i < len; i++) {
376-
(*data->next_filter->filter_function)(out[i], data->next_filter);
377-
}
378381
return 0;
379382
}
380383

0 commit comments

Comments
 (0)