Skip to content

Commit fa06647

Browse files
committed
Add accelerated (SIMD-based) implementation of mb_check_encoding for UTF-8
The new SSE2-based implementation of mb_check_encoding for UTF-8 is about 10% faster for 0-5 byte strings, more than 3 times faster for ~100-byte strings, and just under 4 times faster for ~10,000-byte strings. I believe it may be possible to make this function much faster again. Some possible directions for further performance optimization include: • If other ISA extensions like AVX or AVX-512 are available, use a similar algorithm, but process text in blocks of 32 or 64 bytes (instead of 16 bytes). • If other SIMD ISA extensions are available, use the greater variety of available instructions to make some of the checks tighter. • Even if only SSE/SSE2 are available, find clever ways to squeeze instructions out of the hot path. This would probably require a lot of perusing instruction manuals and thinking hard about which SIMD instructions could be used to perform the same checks with fewer instructions. • Find a better algorithm, possibly one where more checks could be combined (just as the current algorithm combines the checks for certain overlong code units and reserved codepoints).
1 parent a903586 commit fa06647

File tree

1 file changed

+172
-1
lines changed

1 file changed

+172
-1
lines changed

ext/mbstring/mbstring.c

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4587,13 +4587,184 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
45874587
return true;
45884588
}
45894589

4590+
#ifdef __SSE2__
/* Report whether a 16-byte block ends in the middle of a multi-byte UTF-8 character, i.e.
 * whether at least one continuation byte is still owed by whatever follows the block.
 *
 * That is the case if:
 *   - byte 15 is >= 0xC0 (start of a 2/3/4-byte character), or
 *   - byte 14 is >= 0xE0 (start of a 3/4-byte character), or
 *   - byte 13 is >= 0xF0 (start of a 4-byte character)
 *
 * The caller must only pass blocks which were already validated (or are all zero), so that
 * bytes >= 0xC0 are known to be lead bytes. */
static inline bool mb_utf8_block_ends_mid_char(__m128i block)
{
	/* _mm_set_epi8 takes its arguments from byte 15 down to byte 0 */
	const __m128i lead_bits = _mm_set_epi8(-64, -32, -16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	__m128i hit = _mm_cmpeq_epi8(_mm_and_si128(block, lead_bits), lead_bits);
	/* Bytes 0-12 always compare equal (0 == 0), so only look at the bits for bytes 13-15 */
	return (_mm_movemask_epi8(hit) & 0xE000) != 0;
}
#endif

/* Check whether the content of `str` is valid UTF-8.
 *
 * When SSE2 is available, the string is validated 16 bytes at a time using vector
 * instructions; otherwise this simply defers to php_mb_check_encoding().
 *
 * Returns true if the whole string is well-formed UTF-8 (the empty string is valid),
 * false as soon as an invalid byte, overlong encoding, surrogate, codepoint > U+10FFFF,
 * misplaced continuation byte or truncated character is found. */
static bool mb_fast_check_utf8(zend_string *str)
{
	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);

#ifdef __SSE2__
	/* From here on, `e` is the last position from which a full 16-byte block can be loaded
	 * For strings shorter than 16 bytes, it points into the zend_string header; it is only
	 * dereferenced by the tail handling below */
	e -= sizeof(__m128i);

	/* For checking for illegal bytes 0xF5-FF (-117 == 0x8B) */
	const __m128i over_f5 = _mm_set1_epi8(-117);
	/* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF (-97 == 0x9F) */
	const __m128i over_9f = _mm_set1_epi8(-97);
	/* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF (-113 == 0x8F) */
	const __m128i over_8f = _mm_set1_epi8(-113);
	/* For checking for illegal bytes 0xC0-C1 (-64 == 0xC0, -126 == 0x82) */
	const __m128i find_c0 = _mm_set1_epi8(-64);
	const __m128i c0_to_c1 = _mm_set1_epi8(-126);
	/* For checking structure of continuation bytes (-32 == 0xE0, -16 == 0xF0) */
	const __m128i find_e0 = _mm_set1_epi8(-32);
	const __m128i find_f0 = _mm_set1_epi8(-16);

	/* The previously checked block; needed because a character may straddle two blocks */
	__m128i last_block = _mm_setzero_si128();
	__m128i operand;

	while (p <= e) {
		operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */

		/* The tail handling after the loop jumps back here with a zero-padded final block
		 * in `operand` */
check_operand: ;
		/* Signed compare against zero picks out all bytes with the high bit set */
		__m128i multibyte = _mm_cmplt_epi8(operand, _mm_setzero_si128());
		if (_mm_movemask_epi8(multibyte)) {
			/* Check for >= 0xF5, which are illegal byte values in UTF-8
			 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
			 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
			 * Then a single signed compare will pick out any bad bytes
			 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
			__m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);

			/* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
			 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
			 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
			 * We can check for both problems at once by generating a vector where each byte < 0xA0
			 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
			 * `operand2` holds, in each position, the byte which PRECEDES that position in the
			 * text (position 0 gets the last byte of the previous block)
			 * XOR it with the bitmask; any matches will give a 0x00 byte, so do a compare with a
			 * zero vector to pick out the bad positions, and OR them into `bad` */
			__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
			__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
			bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask1)));

			/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
			 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
			 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
			 * Build the bitmask, XOR it with the preceding bytes, check for 0x00 bytes in the result */
			__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
			bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask2)));

			/* Check for overlong 2-byte code units
			 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
			 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
			 * byte range, do a signed compare to pick out any bad bytes */
			bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));

			/* Check structure of continuation bytes
			 * A UTF-8 byte should be a continuation byte if, and only if, it is:
			 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
			 * 2) 2 bytes after the start of a 3-byte or 4-byte character
			 * 3) 3 bytes after the start of a 4-byte character
			 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
			 * get a single bitmask with 0xFF in each position where a continuation byte should be
			 * `operand3` and `operand4` hold the bytes 2 and 3 positions earlier in the text */
			__m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
			__m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
			cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
			__m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
			cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));

			/* Now, use a signed comparison to get another bitmask with 0xFF in each position where
			 * a continuation byte actually is (0x80-0xBF are the only bytes below 0xC0 when
			 * treated as signed)
			 * XOR those two bitmasks together; if everything is good, the result should be zero
			 * However, if a byte which should have been a continuation wasn't, or if a byte which
			 * shouldn't have been a continuation was, we will get 0xFF in that position */
			__m128i continuation = _mm_cmplt_epi8(operand, find_c0);
			bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));

			/* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
			 * If that value is non-zero, then we found a bad byte somewhere! */
			if (_mm_movemask_epi8(bad)) {
				return false;
			}
		} else if (mb_utf8_block_ends_mid_char(last_block)) {
			/* This block contains only single-byte characters, so none of the per-byte checks
			 * are needed... but if the previous block ended with the start of a multi-byte
			 * character, the continuation bytes it required are missing */
			return false;
		}

		last_block = operand;
		p += sizeof(__m128i);
	}

	/* Finish up 1-15 remaining bytes
	 * On first arrival here, `p - e` is 16 minus the number of unprocessed bytes (so 1-16)
	 * After the tail has been processed via `goto check_operand`, the loop body advances `p` by
	 * another 16 bytes, so on second arrival `p - e` is 17-31 and this branch is skipped */
	uint8_t shift_dist = p - e;
	if (shift_dist < 16) {
		/* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
		 * bytes, but there is no good way to read a variable number of bytes into an XMM register
		 * However, we know that these bytes are part of a zend_string, and a zend_string has some
		 * 'header' fields which occupy the memory just before its content
		 * And, those header fields occupy at least 16 bytes...
		 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
		 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
		 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
		 * Then, we shift the register contents down by `shift_dist` byte positions to get rid of the
		 * unwanted bytes
		 * Conveniently, the same shift also zero-fills the tail end of the XMM register */
		operand = _mm_loadu_si128((__m128i*)e);

		/* This looks useless, but it's not
		 * The PSRLDQ instruction used for this 128-bit byte shift requires an immediate (literal)
		 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
		 * `shift_dist` cannot be 0 here, because the loop above only exits when p > e */
		switch (shift_dist) {
		case 1:
			operand = _mm_srli_si128(operand, 1);
			goto check_operand;
		case 2:
			operand = _mm_srli_si128(operand, 2);
			goto check_operand;
		case 3:
			operand = _mm_srli_si128(operand, 3);
			goto check_operand;
		case 4:
			operand = _mm_srli_si128(operand, 4);
			goto check_operand;
		case 5:
			operand = _mm_srli_si128(operand, 5);
			goto check_operand;
		case 6:
			operand = _mm_srli_si128(operand, 6);
			goto check_operand;
		case 7:
			operand = _mm_srli_si128(operand, 7);
			goto check_operand;
		case 8:
			operand = _mm_srli_si128(operand, 8);
			goto check_operand;
		case 9:
			operand = _mm_srli_si128(operand, 9);
			goto check_operand;
		case 10:
			operand = _mm_srli_si128(operand, 10);
			goto check_operand;
		case 11:
			operand = _mm_srli_si128(operand, 11);
			goto check_operand;
		case 12:
			operand = _mm_srli_si128(operand, 12);
			goto check_operand;
		case 13:
			operand = _mm_srli_si128(operand, 13);
			goto check_operand;
		case 14:
			operand = _mm_srli_si128(operand, 14);
			goto check_operand;
		case 15:
			operand = _mm_srli_si128(operand, 15);
			goto check_operand;
		}

		ZEND_UNREACHABLE();
	}

	/* Every byte has been checked, but a character which starts in the last 3 bytes of the
	 * final full block may still be waiting for continuation bytes that never came
	 * (A zero-padded tail block can never trip this check, since any truncated character in
	 * it was already caught by the continuation check against the padding bytes) */
	return !mb_utf8_block_ends_mid_char(last_block);
#else
	return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), &mbfl_encoding_utf8);
#endif
}
4760+
45904761
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
45914762
{
45924763
if (encoding == &mbfl_encoding_utf8) {
45934764
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
45944765
return true;
45954766
}
4596-
bool result = php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
4767+
bool result = mb_fast_check_utf8(str);
45974768
if (result && !ZSTR_IS_INTERNED(str)) {
45984769
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
45994770
}

0 commit comments

Comments
 (0)