Skip to content

Commit d0d8344

Browse files
committed
Cache UTF-8-validity status of strings in GC flags
The PCRE extension is already doing this. The flag is set when a string is determined to be valid UTF-8, and cleared in zend_string_forget_hash_val. We might as well make good use of it in mbstring as well. This should result in a negligible slowdown for non-UTF-8 strings, bad UTF-8 strings, and good UTF-8 strings which are checked only once. However, when microbenchmarking this change using a variety of text encodings and string lengths, I found that in most of these cases, the 'new' code was a few percent faster. In a couple of cases, the 'old' code was a few percent faster. This was not a result of sampling error, since I could reproduce these test results repeatedly, and even when running a large number of iterations. Both the new and old code were compiled with -O3 -march=native. My (unproved) hypothesis is that although the new code appears to only add one more conditional branch, the compiler may emit slightly different code from before (perhaps with different register allocation and so on), and this may cause some cases to run slightly faster and others to run slightly slower. I have not disassembled the old and new binaries to see if an examination of the emitted assembly code would support this hypothesis. For good UTF-8 strings which are checked repeatedly, the speedup is about 40% even for strings 1-5 bytes in length. For ~100 byte strings, it is ~700%, and for ~10000 byte strings, it is ~80000%. I tried fuzzing MBString's php_mb_check_encoding function and pcre2lib's valid_utf function to see if I could find any cases where their output would be different. After running the fuzzer for a couple of minutes, it had tried more than 1 million test cases without finding any where the output was different. Therefore, it appears that MBString's UTF-8 validation is compatible with PCRE's.
1 parent d665142 commit d0d8344

File tree

2 files changed

+32
-16
lines changed

2 files changed

+32
-16
lines changed

ext/mbstring/mbstring.c

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_en
8080

8181
static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
8282

83+
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
84+
8385
/* See mbfilter_cp5022x.c */
8486
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
8587
/* }}} */
@@ -2829,19 +2831,17 @@ static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t
28292831
/* {{{ Encodings of the given string is returned (as a string) */
28302832
PHP_FUNCTION(mb_detect_encoding)
28312833
{
2832-
char *str;
2833-
size_t str_len;
2834-
zend_string *encoding_str = NULL;
2834+
zend_string *str, *encoding_str = NULL;
28352835
HashTable *encoding_ht = NULL;
2836-
bool strict = 0;
2836+
bool strict = false;
28372837

28382838
mbfl_string string;
28392839
const mbfl_encoding *ret;
28402840
const mbfl_encoding **elist;
28412841
size_t size;
28422842

28432843
ZEND_PARSE_PARAMETERS_START(1, 3)
2844-
Z_PARAM_STRING(str, str_len)
2844+
Z_PARAM_STR(str)
28452845
Z_PARAM_OPTIONAL
28462846
Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
28472847
Z_PARAM_BOOL(strict)
@@ -2879,11 +2879,11 @@ PHP_FUNCTION(mb_detect_encoding)
28792879

28802880
if (strict && size == 1) {
28812881
/* If there is only a single candidate encoding, mb_check_encoding is faster */
2882-
ret = (php_mb_check_encoding(str, str_len, *elist)) ? *elist : NULL;
2882+
ret = (mb_check_str_encoding(str, *elist)) ? *elist : NULL;
28832883
} else {
28842884
mbfl_string_init(&string);
2885-
string.val = (unsigned char *)str;
2886-
string.len = str_len;
2885+
string.val = (unsigned char*)ZSTR_VAL(str);
2886+
string.len = ZSTR_LEN(str);
28872887
ret = mbfl_identify_encoding(&string, elist, size, strict);
28882888
}
28892889

@@ -4357,7 +4357,7 @@ PHP_FUNCTION(mb_get_info)
43574357
}
43584358
/* }}} */
43594359

4360-
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4360+
MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
43614361
{
43624362
uint32_t wchar_buf[128];
43634363
unsigned char *in = (unsigned char*)input;
@@ -4370,7 +4370,7 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m
43704370
ZEND_ASSERT(out_len <= 8);
43714371
for (int i = 0; i < out_len; i++) {
43724372
if (wchar_buf[i] == MBFL_BAD_INPUT) {
4373-
return 0;
4373+
return false;
43744374
}
43754375
}
43764376

@@ -4379,12 +4379,28 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m
43794379
ZEND_ASSERT(out_len <= 128);
43804380
for (int i = 0; i < out_len; i++) {
43814381
if (wchar_buf[i] == MBFL_BAD_INPUT) {
4382-
return 0;
4382+
return false;
43834383
}
43844384
}
43854385
}
43864386

4387-
return 1;
4387+
return true;
4388+
}
4389+
4390+
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
4391+
{
4392+
if (encoding == &mbfl_encoding_utf8) {
4393+
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
4394+
return true;
4395+
}
4396+
bool result = php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
4397+
if (result && !ZSTR_IS_INTERNED(str)) {
4398+
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
4399+
}
4400+
return result;
4401+
} else {
4402+
return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
4403+
}
43884404
}
43894405

43904406
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
@@ -4404,14 +4420,14 @@ static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding
44044420
ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
44054421
ZVAL_DEREF(entry);
44064422
if (key) {
4407-
if (!php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
4423+
if (!mb_check_str_encoding(key, encoding)) {
44084424
valid = 0;
44094425
break;
44104426
}
44114427
}
44124428
switch (Z_TYPE_P(entry)) {
44134429
case IS_STRING:
4414-
if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
4430+
if (!mb_check_str_encoding(Z_STR_P(entry), encoding)) {
44154431
valid = 0;
44164432
break;
44174433
}
@@ -4459,7 +4475,7 @@ PHP_FUNCTION(mb_check_encoding)
44594475
if (input_ht) {
44604476
RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
44614477
} else if (input_str) {
4462-
RETURN_BOOL(php_mb_check_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), encoding));
4478+
RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
44634479
} else {
44644480
php_error_docref(NULL, E_DEPRECATED,
44654481
"Calling mb_check_encoding() without argument is deprecated");

ext/mbstring/mbstring.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(
6565
MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc);
6666

6767
MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *encoding);
68-
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);
68+
MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);
6969

7070
ZEND_BEGIN_MODULE_GLOBALS(mbstring)
7171
char *internal_encoding_name;

0 commit comments

Comments
 (0)