diff --git a/Zend/zend_string.c b/Zend/zend_string.c index a0f379556d05e..4715d4ec0af0e 100644 --- a/Zend/zend_string.c +++ b/Zend/zend_string.c @@ -180,6 +180,10 @@ static zend_always_inline zend_string *zend_add_interned_string(zend_string *str GC_SET_REFCOUNT(str, 1); GC_ADD_FLAGS(str, IS_STR_INTERNED | flags); + if (!ZSTR_IS_VALID_UTF8(str) && zend_string_validate_utf8(str)) { + GC_ADD_FLAGS(str, IS_STR_VALID_UTF8); + } + ZVAL_INTERNED_STR(&val, str); zend_hash_add_new(interned_strings, str, &val); @@ -493,3 +497,45 @@ ZEND_API zend_string *zend_string_concat3( return res; } + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +// https://stackoverflow.com/a/22135005/1320374 + +enum { + UTF8_ACCEPT = 0, + UTF8_REJECT = 1, +}; + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +ZEND_API bool zend_string_validate_utf8(zend_string *string) { + char *str = ZSTR_VAL(string); + size_t len = ZSTR_LEN(string); + uint32_t state = UTF8_ACCEPT; + + for (size_t i = 0; i < len; i++) { + uint32_t type = utf8d[(uint8_t)str[i]]; + state = utf8d[256 + state * 16 + type]; + + if (state == UTF8_REJECT) + break; + } + + return state == UTF8_ACCEPT; +} diff --git a/Zend/zend_string.h b/Zend/zend_string.h index cc4f0751bec63..70af76a1f391a 100644 --- a/Zend/zend_string.h +++ b/Zend/zend_string.h @@ -78,6 +78,8 @@ ZEND_API extern zend_string *zend_empty_string; ZEND_API extern zend_string *zend_one_char_string[256]; ZEND_API extern zend_string **zend_known_strings; +ZEND_API bool zend_string_validate_utf8(zend_string *string); + END_EXTERN_C() /* Shortcuts */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3277284be51cb..9d0c2b9b8f803 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1804,7 +1804,7 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding) unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4); if (char_len) { return ZSTR_LEN(string) / char_len; - } else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) { + } else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) { return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string)); } @@ -2254,7 +2254,7 @@ PHP_FUNCTION(mb_substr_count) if (php_mb_is_no_encoding_utf8(enc->no_encoding)) { /* No need to do any conversion if haystack/needle are already known-valid UTF-8 * (If they are not valid, then not passing them through conversion filters could affect output) */ - if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) { + if (ZSTR_IS_VALID_UTF8(haystack)) { haystack_u8 = haystack; } else { unsigned int num_errors = 0; @@ -2264,7 +2264,7 @@ PHP_FUNCTION(mb_substr_count) } } - if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) { + if (ZSTR_IS_VALID_UTF8(needle)) { needle_u8 = needle; } else { unsigned int num_errors = 0; @@ -3152,7 +3152,7 @@ PHP_FUNCTION(mb_detect_encoding) strict = MBSTRG(strict_detection); } - if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) { + if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) { ret = &mbfl_encoding_utf8; } else { ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict); @@ -5172,11 +5172,13 @@ static bool mb_fast_check_utf8_avx2(zend_string *str) static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding) { if (encoding == &mbfl_encoding_utf8) { - if (GC_FLAGS(str) & IS_STR_VALID_UTF8) { + if (ZSTR_IS_VALID_UTF8(str)) { return true; + } else if (ZSTR_IS_INTERNED(str)) { + return false; } bool result = mb_fast_check_utf8(str); - if (result && !ZSTR_IS_INTERNED(str)) { + if (result) { GC_ADD_FLAGS(str, IS_STR_VALID_UTF8); } return result; @@ -5439,7 +5441,7 @@ PHP_FUNCTION(mb_scrub) RETURN_THROWS(); } - if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) { + if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) { /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */ RETURN_STR_COPY(str); } diff --git a/ext/opcache/ZendAccelerator.c b/ext/opcache/ZendAccelerator.c index 3d0254e8307ee..348bffd58cf2c 100644 --- a/ext/opcache/ZendAccelerator.c +++ b/ext/opcache/ZendAccelerator.c @@ -550,6 +550,9 @@ zend_string* ZEND_FASTCALL accel_new_interned_string(zend_string *str) *hash_slot = STRTAB_STR_TO_POS(&ZCSG(interned_strings), s); GC_SET_REFCOUNT(s, 2); GC_TYPE_INFO(s) = GC_STRING | ((IS_STR_INTERNED | IS_STR_PERMANENT) << GC_FLAGS_SHIFT)| (ZSTR_IS_VALID_UTF8(str) ? IS_STR_VALID_UTF8 : 0); + if (!ZSTR_IS_VALID_UTF8(s) && zend_string_validate_utf8(str)) { + GC_ADD_FLAGS(s, IS_STR_VALID_UTF8); + } ZSTR_H(s) = h; ZSTR_LEN(s) = ZSTR_LEN(str); memcpy(ZSTR_VAL(s), ZSTR_VAL(str), ZSTR_LEN(s) + 1); diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index db5c3d9158055..a0a346e8ae667 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -1125,7 +1125,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ * static zend_always_inline bool is_known_valid_utf8( zend_string *subject_str, PCRE2_SIZE start_offset) { - if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) { + if (!ZSTR_IS_VALID_UTF8(subject_str)) { /* We don't know whether the string is valid UTF-8 or not. */ return 0; } diff --git a/ext/zend_test/tests/strings_marked_as_utf8.phpt b/ext/zend_test/tests/strings_marked_as_utf8.phpt index 9eaa43e63902c..d827e92dade33 100644 --- a/ext/zend_test/tests/strings_marked_as_utf8.phpt +++ b/ext/zend_test/tests/strings_marked_as_utf8.phpt @@ -47,12 +47,6 @@ $s = "f" . "o"; var_dump($s); var_dump(zend_test_is_string_marked_as_valid_utf8($s)); -// The "foo" string matches with a "Foo" class which is registered by the zend_test extension. -// That class name does not have the "valid UTF-8" flag because class names in general -// don't have to be UTF-8. As the "foo" string here goes through the interning logic, -// the string gets replaced by the "foo" string from the class, which does -// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo". -// The previous "foo" test case works because it is not interned. echo "Multiple concatenation known valid UTF-8 in assignment:\n"; $s = "f" . "o" . "o"; var_dump($s); @@ -167,7 +161,7 @@ string(2) "fo" bool(true) Multiple concatenation known valid UTF-8 in assignment: string(3) "foo" -bool(false) +bool(true) string(3) "fxo" bool(true) Concatenation known valid UTF-8 string with empty string in variables: