Skip to content

More performance increases for mbstring functions on UTF-8 text #10313

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 206 additions & 4 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -1748,9 +1748,11 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)

#ifdef __SSE2__
if (len >= sizeof(__m128i)) {
e -= sizeof(__m128i);

const __m128i threshold = _mm_set1_epi8(-64);
const __m128i delta = _mm_set1_epi8(1);
__m128i counter = _mm_set1_epi8(0); /* Vector of 16 continuation-byte counters */
__m128i counter = _mm_setzero_si128(); /* Vector of 16 continuation-byte counters */

int reset_counter = 255;
do {
Expand All @@ -1762,13 +1764,14 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
* and reset them to zero */
if (--reset_counter == 0) {
len -= _mm_sum_epu8(counter);
counter = _mm_set1_epi8(0);
counter = _mm_setzero_si128();
reset_counter = 255;
}

p += sizeof(__m128i);
} while (p + sizeof(__m128i) <= e);
} while (p <= e);

e += sizeof(__m128i);
len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */
}
#endif
Expand Down Expand Up @@ -4587,13 +4590,212 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
return true;
}

static bool mb_fast_check_utf8(zend_string *str)
{
#ifdef __SSE2__
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
/* To make the condition for the main `while` loop below tigher, `e` will point not 1 byte
* past the end of the string, but 16 bytes before that... BUT, we treat the terminating
* null byte which is included in each zend_string as part of the content to check
* This ensures that multi-byte characters which are truncated abruptly at the end of
* the string will be detected as invalid */
e = e - sizeof(__m128i) + 1;

/* For checking for illegal bytes 0xF5-FF */
const __m128i over_f5 = _mm_set1_epi8(-117);
/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
const __m128i over_9f = _mm_set1_epi8(-97);
/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
const __m128i over_8f = _mm_set1_epi8(-113);
/* For checking for illegal bytes 0xC0-C1 */
const __m128i find_c0 = _mm_set1_epi8(-64);
const __m128i c0_to_c1 = _mm_set1_epi8(-126);
/* For checking structure of continuation bytes */
const __m128i find_e0 = _mm_set1_epi8(-32);
const __m128i find_f0 = _mm_set1_epi8(-16);

__m128i last_block = _mm_setzero_si128();
__m128i operand;

while (p <= e) {
operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

check_operand:
/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
if (!_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) {
/* Even if this block only contains single-byte characters, there may have been a
* multi-byte character at the end of the previous block, which was supposed to
* have continuation bytes in this block
* This bitmask will pick out a 2/3/4-byte character starting from the last byte of
* the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
* from the 3rd last */
__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
if (_mm_movemask_epi8(bad)) {
return false;
}

/* Consume as many full blocks of single-byte characters as we can */
while (true) {
p += sizeof(__m128i);
if (p > e) {
goto finish_up_remaining_bytes;
}
operand = _mm_loadu_si128((__m128i*)p);
if (_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) {
break;
}
}
}

/* Check for >= 0xF5, which are illegal byte values in UTF-8
* AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
* So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
* Then a single signed compare will pick out any bad bytes
* `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
* 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
* 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
* We can check for both problems at once by generating a vector where each byte < 0xA0
* is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
* Shift the original block right by one byte, and XOR the shifted block with the bitmask
* Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
* bad positions, and OR them into `bad` */
__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask1)));

/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
* Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
* code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
* Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask2)));

/* Check for overlong 2-byte code units
* Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
* Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
* byte range, do a signed compare to pick out any bad bytes */
bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

/* Check structure of continuation bytes
* A UTF-8 byte should be a continuation byte if, and only if, it is:
* 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
* 2) 2 bytes after the start of a 3-byte or 4-byte character
* 3) 3 bytes after the start of a 4-byte character
* We build 3 bitmasks with 0xFF in each such position, and OR them together to
* get a single bitmask with 0xFF in each position where a continuation byte should be */
__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
* a continuation byte actually is
* XOR those two bitmasks together; if everything is good, the result should be zero
* However, if a byte which should have been a continuation wasn't, or if a byte which
* shouldn't have been a continuation was, we will get 0xFF in that position */
__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
* If that value is non-zero, then we found a bad byte somewhere! */
if (_mm_movemask_epi8(bad)) {
return false;
}

last_block = operand;
p += sizeof(__m128i);
}

finish_up_remaining_bytes: ;
/* Finish up 1-15 remaining bytes */
uint8_t shift_dist = p - e;
if (shift_dist < 16) {
/* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
* bytes, but there is no good way to read a variable number of bytes into an XMM register
* However, we know that these bytes are part of a zend_string, and a zend_string has some
* 'header' fields which occupy the memory just before its content
* And, those header fields occupy more than 16 bytes...
* So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
* we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
* bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
* Then, we do a left shift to get rid of the unwanted bytes
* Conveniently, the same left shift also zero-fills the tail end of the XMM register */
operand = _mm_loadu_si128((__m128i*)e);

/* This looks useless, but it's not
* The PSRLDQ instruction used for this 128-bit left shift requires an immediate (literal)
* shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist) */
switch (shift_dist) {
case 0:
goto check_operand;
case 1:
operand = _mm_srli_si128(operand, 1);
goto check_operand;
case 2:
operand = _mm_srli_si128(operand, 2);
goto check_operand;
case 3:
operand = _mm_srli_si128(operand, 3);
goto check_operand;
case 4:
operand = _mm_srli_si128(operand, 4);
goto check_operand;
case 5:
operand = _mm_srli_si128(operand, 5);
goto check_operand;
case 6:
operand = _mm_srli_si128(operand, 6);
goto check_operand;
case 7:
operand = _mm_srli_si128(operand, 7);
goto check_operand;
case 8:
operand = _mm_srli_si128(operand, 8);
goto check_operand;
case 9:
operand = _mm_srli_si128(operand, 9);
goto check_operand;
case 10:
operand = _mm_srli_si128(operand, 10);
goto check_operand;
case 11:
operand = _mm_srli_si128(operand, 11);
goto check_operand;
case 12:
operand = _mm_srli_si128(operand, 12);
goto check_operand;
case 13:
operand = _mm_srli_si128(operand, 13);
goto check_operand;
case 14:
operand = _mm_srli_si128(operand, 14);
goto check_operand;
case 15:
operand = _mm_srli_si128(operand, 15);
goto check_operand;
}

ZEND_UNREACHABLE();
}

return true;
#else
return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), &mbfl_encoding_utf8);
#endif
}

static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
if (encoding == &mbfl_encoding_utf8) {
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
return true;
}
bool result = php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
bool result = mb_fast_check_utf8(str);
if (result && !ZSTR_IS_INTERNED(str)) {
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
}
Expand Down
4 changes: 4 additions & 0 deletions ext/mbstring/tests/utf_encodings.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,10 @@ $invalid = array(
);

testInvalidCodepoints($invalid, 'UTF-8');
// Regression test for bug in SSE2-based accelerated UTF-8 validation function

if (mb_check_encoding("k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6", 'UTF-8'))
die("UTF-8 validation was incorrect on 16-byte string with truncated multi-byte char at end");

echo "== UTF-16 ==\n";

Expand Down