Skip to content

Commit 3ae4779

Browse files
committed
Add accelerated (SIMD-based) implementation of mb_check_encoding for UTF-8
The new SSE2-based implementation of mb_check_encoding for UTF-8 is about 10% faster for 0-5 byte strings, more than 3 times faster for ~100-byte strings, and just under 4 times faster for ~10,000-byte strings. I believe it may be possible to make this function much faster again. Some possible directions for further performance optimization include: • If other ISA extensions like AVX or AVX-512 are available, use a similar algorithm, but process text in blocks of 32 or 64 bytes (instead of 16 bytes). • If other SIMD ISA extensions are available, use the greater variety of available instructions to make some of the checks tighter. • Even if only SSE/SSE2 are available, find clever ways to squeeze instructions out of the hot path. This would probably require a lot of perusing instruction manuals and thinking hard about which SIMD instructions could be used to perform the same checks with fewer instructions. • Find a better algorithm, possibly one where more checks could be combined (just as the current algorithm combines the checks for certain overlong code units and reserved codepoints).
1 parent 585ac79 commit 3ae4779

File tree

2 files changed

+217
-1
lines changed

2 files changed

+217
-1
lines changed

ext/mbstring/mbstring.c

Lines changed: 203 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4587,13 +4587,215 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
45874587
return true;
45884588
}
45894589

4590+
static bool mb_fast_check_utf8(zend_string *str)
4591+
{
4592+
#ifdef __SSE2__
4593+
unsigned char *p = (unsigned char*)ZSTR_VAL(str);
4594+
/* `e` points 1 byte past the last full 16-byte block of string content
4595+
* Note that we include the terminating null byte which is included in each zend_string
4596+
* as part of the content to check; this ensures that multi-byte characters which are
4597+
* truncated abruptly at the end of the string will be detected as invalid */
4598+
unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));
4599+
4600+
/* For checking for illegal bytes 0xF5-FF */
4601+
const __m128i over_f5 = _mm_set1_epi8(-117);
4602+
/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4603+
const __m128i over_9f = _mm_set1_epi8(-97);
4604+
/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4605+
const __m128i over_8f = _mm_set1_epi8(-113);
4606+
/* For checking for illegal bytes 0xC0-C1 */
4607+
const __m128i find_c0 = _mm_set1_epi8(-64);
4608+
const __m128i c0_to_c1 = _mm_set1_epi8(-126);
4609+
/* For checking structure of continuation bytes */
4610+
const __m128i find_e0 = _mm_set1_epi8(-32);
4611+
const __m128i find_f0 = _mm_set1_epi8(-16);
4612+
4613+
__m128i last_block = _mm_setzero_si128();
4614+
__m128i operand;
4615+
4616+
while (p < e) {
4617+
operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
4618+
4619+
check_operand:
4620+
/* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4621+
if (!_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) {
4622+
/* Even if this block only contains single-byte characters, there may have been a
4623+
* multi-byte character at the end of the previous block, which was supposed to
4624+
* have continuation bytes in this block
4625+
* This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4626+
* the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4627+
* from the 3rd last */
4628+
__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4629+
__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4630+
if (_mm_movemask_epi8(bad)) {
4631+
return false;
4632+
}
4633+
4634+
/* Consume as many full blocks of single-byte characters as we can */
4635+
while (true) {
4636+
p += sizeof(__m128i);
4637+
if (p >= e) {
4638+
goto finish_up_remaining_bytes;
4639+
}
4640+
operand = _mm_loadu_si128((__m128i*)p);
4641+
if (_mm_movemask_epi8(_mm_cmplt_epi8(operand, _mm_setzero_si128()))) {
4642+
break;
4643+
}
4644+
}
4645+
}
4646+
4647+
/* Check for >= 0xF5, which are illegal byte values in UTF-8
4648+
* AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4649+
* So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4650+
* Then a single signed compare will pick out any bad bytes
4651+
* `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4652+
__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);
4653+
4654+
/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4655+
* 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4656+
* 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4657+
* We can check for both problems at once by generating a vector where each byte < 0xA0
4658+
* is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4659+
* Shift the original block right by one byte, and XOR the shifted block with the bitmask
4660+
* Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4661+
* bad positions, and OR them into `bad` */
4662+
__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
4663+
__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4664+
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask1)));
4665+
4666+
/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4667+
* Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4668+
* code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4669+
* Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
4670+
__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4671+
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask2)));
4672+
4673+
/* Check for overlong 2-byte code units
4674+
* Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4675+
* Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4676+
* byte range, do a signed compare to pick out any bad bytes */
4677+
bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));
4678+
4679+
/* Check structure of continuation bytes
4680+
* A UTF-8 byte should be a continuation byte if, and only if, it is:
4681+
* 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4682+
* 2) 2 bytes after the start of a 3-byte or 4-byte character
4683+
* 3) 3 bytes after the start of a 4-byte character
4684+
* We build 3 bitmasks with 0xFF in each such position, and OR them together to
4685+
* get a single bitmask with 0xFF in each position where a continuation byte should be */
4686+
__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
4687+
__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
4688+
cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
4689+
__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
4690+
cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));
4691+
4692+
/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4693+
* a continuation byte actually is
4694+
* XOR those two bitmasks together; if everything is good, the result should be zero
4695+
* However, if a byte which should have been a continuation wasn't, or if a byte which
4696+
* shouldn't have been a continuation was, we will get 0xFF in that position */
4697+
__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
4698+
bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));
4699+
4700+
/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4701+
* If that value is non-zero, then we found a bad byte somewhere! */
4702+
if (_mm_movemask_epi8(bad)) {
4703+
return false;
4704+
}
4705+
4706+
last_block = operand;
4707+
p += sizeof(__m128i);
4708+
}
4709+
4710+
finish_up_remaining_bytes: ;
4711+
/* Finish up 1-15 remaining bytes */
4712+
if (p == e) {
4713+
uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
4714+
4715+
/* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4716+
* bytes, but there is no good way to read a variable number of bytes into an XMM register
4717+
* However, we know that these bytes are part of a zend_string, and a zend_string has some
4718+
* 'header' fields which occupy the memory just before its content
4719+
* And, those header fields occupy more than 16 bytes...
4720+
* So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4721+
* we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4722+
* bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4723+
* Then, we do a left shift to get rid of the unwanted bytes
4724+
* Conveniently, the same left shift also zero-fills the tail end of the XMM register
4725+
*
4726+
* The following `switch` looks useless, but it's not
4727+
* The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
4728+
* shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4729+
*/
4730+
switch (remaining_bytes) {
4731+
case 0:
4732+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 15)), 15);
4733+
goto check_operand;
4734+
case 1:
4735+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 14)), 14);
4736+
goto check_operand;
4737+
case 2:
4738+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 13)), 13);
4739+
goto check_operand;
4740+
case 3:
4741+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 12)), 12);
4742+
goto check_operand;
4743+
case 4:
4744+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 11)), 11);
4745+
goto check_operand;
4746+
case 5:
4747+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
4748+
goto check_operand;
4749+
case 6:
4750+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
4751+
goto check_operand;
4752+
case 7:
4753+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 8)), 8);
4754+
goto check_operand;
4755+
case 8:
4756+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 7)), 7);
4757+
goto check_operand;
4758+
case 9:
4759+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
4760+
goto check_operand;
4761+
case 10:
4762+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
4763+
goto check_operand;
4764+
case 11:
4765+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
4766+
goto check_operand;
4767+
case 12:
4768+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
4769+
goto check_operand;
4770+
case 13:
4771+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
4772+
goto check_operand;
4773+
case 14:
4774+
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
4775+
goto check_operand;
4776+
case 15:
4777+
/* No trailing bytes are left which need to be checked
4778+
* We get 15 because we did not include the terminating null when
4779+
* calculating `remaining_bytes`, so the value wraps around */
4780+
return true;
4781+
}
4782+
4783+
ZEND_UNREACHABLE();
4784+
}
4785+
4786+
return true;
4787+
#else
4788+
return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), &mbfl_encoding_utf8);
4789+
#endif
4790+
}
4791+
45904792
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
45914793
{
45924794
if (encoding == &mbfl_encoding_utf8) {
45934795
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
45944796
return true;
45954797
}
4596-
bool result = php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
4798+
bool result = mb_fast_check_utf8(str);
45974799
if (result && !ZSTR_IS_INTERNED(str)) {
45984800
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
45994801
}

ext/mbstring/tests/utf_encodings.phpt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,20 @@ $invalid = array(
813813

814814
testInvalidCodepoints($invalid, 'UTF-8');
815815

816+
// Regression test for bug in SSE2-based accelerated UTF-8 validation function
817+
$truncated16byte = [
818+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6",
819+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xef",
820+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xef\xbf",
821+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0",
822+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf",
823+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf\xbf"
824+
];
825+
foreach ($truncated16byte as $trunc) {
826+
if (mb_check_encoding($trunc, 'UTF-8'))
827+
die("UTF-8 validation was incorrect on 16-byte string with truncated multi-byte char at end");
828+
}
829+
816830
echo "== UTF-16 ==\n";
817831

818832
testValidCodepoints("UTF-16");

0 commit comments

Comments
 (0)