diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index d2ff3079f0d73..c3e9ed5ae1fd6 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1748,9 +1748,11 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len) #ifdef __SSE2__ if (len >= sizeof(__m128i)) { + e -= sizeof(__m128i); + const __m128i threshold = _mm_set1_epi8(-64); const __m128i delta = _mm_set1_epi8(1); - __m128i counter = _mm_set1_epi8(0); /* Vector of 16 continuation-byte counters */ + __m128i counter = _mm_setzero_si128(); /* Vector of 16 continuation-byte counters */ int reset_counter = 255; do { @@ -1762,13 +1764,14 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len) * and reset them to zero */ if (--reset_counter == 0) { len -= _mm_sum_epu8(counter); - counter = _mm_set1_epi8(0); + counter = _mm_setzero_si128(); reset_counter = 255; } p += sizeof(__m128i); - } while (p + sizeof(__m128i) <= e); + } while (p <= e); + e += sizeof(__m128i); len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */ } #endif @@ -4587,13 +4590,212 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const return true; } +static bool mb_fast_check_utf8(zend_string *str) +{ +#ifdef __SSE2__ + unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str); + /* To make the condition for the main `while` loop below tigher, `e` will point not 1 byte + * past the end of the string, but 16 bytes before that... BUT, we treat the terminating + * null byte which is included in each zend_string as part of the content to check + * This ensures that multi-byte characters which are truncated abruptly at the end of + * the string will be detected as invalid */ + e = e - sizeof(__m128i) + 1; + + /* For checking for illegal bytes 0xF5-FF */ + const __m128i over_f5 = _mm_set1_epi8(-117); + /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */ + const __m128i over_9f = _mm_set1_epi8(-97); + /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */ + const __m128i over_8f = _mm_set1_epi8(-113); + /* For checking for illegal bytes 0xC0-C1 */ + const __m128i find_c0 = _mm_set1_epi8(-64); + const __m128i c0_to_c1 = _mm_set1_epi8(-126); + /* For checking structure of continuation bytes */ + const __m128i find_e0 = _mm_set1_epi8(-32); + const __m128i find_f0 = _mm_set1_epi8(-16); + + __m128i last_block = _mm_setzero_si128(); + __m128i operand; + + while (p <= e) { + operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */ + +check_operand: + /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */ + if (!_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) { + /* Even if this block only contains single-byte characters, there may have been a + * multi-byte character at the end of the previous block, which was supposed to + * have continuation bytes in this block + * This bitmask will pick out a 2/3/4-byte character starting from the last byte of + * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting + * from the 3rd last */ + __m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask); + if (_mm_movemask_epi8(bad)) { + return false; + } + + /* Consume as many full blocks of single-byte characters as we can */ + while (true) { + p += sizeof(__m128i); + if (p > e) { + goto finish_up_remaining_bytes; + } + operand = _mm_loadu_si128((__m128i*)p); + if (_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) { + break; + } + } + } + + /* Check for >= 0xF5, which are illegal byte values in UTF-8 + * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare + * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range + * Then a single signed compare will pick out any bad bytes + * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */ + __m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5); + + /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF + * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and + * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint + * We can check for both problems at once by generating a vector where each byte < 0xA0 + * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED + * Shift the original block right by one byte, and XOR the shifted block with the bitmask + * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the + * bad positions, and OR them into `bad` */ + __m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15)); + __m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f))); + bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask1))); + + /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF + * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte + * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF + * Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */ + __m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f))); + bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask2))); + + /* Check for overlong 2-byte code units + * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit + * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed + * byte range, do a signed compare to pick out any bad bytes */ + bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1)); + + /* Check structure of continuation bytes + * A UTF-8 byte should be a continuation byte if, and only if, it is: + * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character + * 2) 2 bytes after the start of a 3-byte or 4-byte character + * 3) 3 bytes after the start of a 4-byte character + * We build 3 bitmasks with 0xFF in each such position, and OR them together to + * get a single bitmask with 0xFF in each position where a continuation byte should be */ + __m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0); + __m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14)); + cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0)); + __m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13)); + cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0)); + + /* Now, use a signed comparison to get another bitmask with 0xFF in each position where + * a continuation byte actually is + * XOR those two bitmasks together; if everything is good, the result should be zero + * However, if a byte which should have been a continuation wasn't, or if a byte which + * shouldn't have been a continuation was, we will get 0xFF in that position */ + __m128i continuation = _mm_cmplt_epi8(operand, find_c0); + bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask)); + + /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register) + * If that value is non-zero, then we found a bad byte somewhere! */ + if (_mm_movemask_epi8(bad)) { + return false; + } + + last_block = operand; + p += sizeof(__m128i); + } + +finish_up_remaining_bytes: ; + /* Finish up 1-15 remaining bytes */ + uint8_t shift_dist = p - e; + if (shift_dist < 16) { + /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16 + * bytes, but there is no good way to read a variable number of bytes into an XMM register + * However, we know that these bytes are part of a zend_string, and a zend_string has some + * 'header' fields which occupy the memory just before its content + * And, those header fields occupy more than 16 bytes... + * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there, + * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15 + * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault. + * Then, we do a left shift to get rid of the unwanted bytes + * Conveniently, the same left shift also zero-fills the tail end of the XMM register */ + operand = _mm_loadu_si128((__m128i*)e); + + /* This looks useless, but it's not + * The PSRLDQ instruction used for this 128-bit left shift requires an immediate (literal) + * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist) */ + switch (shift_dist) { + case 0: + goto check_operand; + case 1: + operand = _mm_srli_si128(operand, 1); + goto check_operand; + case 2: + operand = _mm_srli_si128(operand, 2); + goto check_operand; + case 3: + operand = _mm_srli_si128(operand, 3); + goto check_operand; + case 4: + operand = _mm_srli_si128(operand, 4); + goto check_operand; + case 5: + operand = _mm_srli_si128(operand, 5); + goto check_operand; + case 6: + operand = _mm_srli_si128(operand, 6); + goto check_operand; + case 7: + operand = _mm_srli_si128(operand, 7); + goto check_operand; + case 8: + operand = _mm_srli_si128(operand, 8); + goto check_operand; + case 9: + operand = _mm_srli_si128(operand, 9); + goto check_operand; + case 10: + operand = _mm_srli_si128(operand, 10); + goto check_operand; + case 11: + operand = _mm_srli_si128(operand, 11); + goto check_operand; + case 12: + operand = _mm_srli_si128(operand, 12); + goto check_operand; + case 13: + operand = _mm_srli_si128(operand, 13); + goto check_operand; + case 14: + operand = _mm_srli_si128(operand, 14); + goto check_operand; + case 15: + operand = _mm_srli_si128(operand, 15); + goto check_operand; + } + + ZEND_UNREACHABLE(); + } + + return true; +#else + return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), &mbfl_encoding_utf8); +#endif +} + static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding) { if (encoding == &mbfl_encoding_utf8) { if (GC_FLAGS(str) & IS_STR_VALID_UTF8) { return true; } - bool result = php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding); + bool result = mb_fast_check_utf8(str); if (result && !ZSTR_IS_INTERNED(str)) { GC_ADD_FLAGS(str, IS_STR_VALID_UTF8); } diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 2f050c657c2f3..d253d009571ac 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -812,6 +812,10 @@ $invalid = array( ); testInvalidCodepoints($invalid, 'UTF-8'); +// Regression test for bug in SSE2-based accelerated UTF-8 validation function + +if (mb_check_encoding("k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6", 'UTF-8')) + die("UTF-8 validation was incorrect on 16-byte string with truncated multi-byte char at end"); echo "== UTF-16 ==\n";