@@ -1748,9 +1748,11 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
1748
1748
1749
1749
#ifdef __SSE2__
1750
1750
if (len >= sizeof (__m128i )) {
1751
+ e -= sizeof (__m128i );
1752
+
1751
1753
const __m128i threshold = _mm_set1_epi8 (-64 );
1752
1754
const __m128i delta = _mm_set1_epi8 (1 );
1753
- __m128i counter = _mm_set1_epi8 ( 0 ); /* Vector of 16 continuation-byte counters */
1755
+ __m128i counter = _mm_setzero_si128 ( ); /* Vector of 16 continuation-byte counters */
1754
1756
1755
1757
int reset_counter = 255 ;
1756
1758
do {
@@ -1762,13 +1764,14 @@ static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
1762
1764
* and reset them to zero */
1763
1765
if (-- reset_counter == 0 ) {
1764
1766
len -= _mm_sum_epu8 (counter );
1765
- counter = _mm_set1_epi8 ( 0 );
1767
+ counter = _mm_setzero_si128 ( );
1766
1768
reset_counter = 255 ;
1767
1769
}
1768
1770
1769
1771
p += sizeof (__m128i );
1770
- } while (p + sizeof ( __m128i ) <= e );
1772
+ } while (p <= e );
1771
1773
1774
+ e += sizeof (__m128i );
1772
1775
len -= _mm_sum_epu8 (counter ); /* Fold in any remaining non-zero values in the 16 counters */
1773
1776
}
1774
1777
#endif
@@ -4587,13 +4590,215 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
4587
4590
return true;
4588
4591
}
4589
4592
4593
+ static bool mb_fast_check_utf8 (zend_string * str )
4594
+ {
4595
+ #ifdef __SSE2__
4596
+ unsigned char * p = (unsigned char * )ZSTR_VAL (str );
4597
+ /* `e` points 1 byte past the last full 16-byte block of string content
4598
+ * Note that we include the terminating null byte which is included in each zend_string
4599
+ * as part of the content to check; this ensures that multi-byte characters which are
4600
+ * truncated abruptly at the end of the string will be detected as invalid */
4601
+ unsigned char * e = p + ((ZSTR_LEN (str ) + 1 ) & ~(sizeof (__m128i ) - 1 ));
4602
+
4603
+ /* For checking for illegal bytes 0xF5-FF */
4604
+ const __m128i over_f5 = _mm_set1_epi8 (-117 );
4605
+ /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4606
+ const __m128i over_9f = _mm_set1_epi8 (-97 );
4607
+ /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4608
+ const __m128i over_8f = _mm_set1_epi8 (-113 );
4609
+ /* For checking for illegal bytes 0xC0-C1 */
4610
+ const __m128i find_c0 = _mm_set1_epi8 (-64 );
4611
+ const __m128i c0_to_c1 = _mm_set1_epi8 (-126 );
4612
+ /* For checking structure of continuation bytes */
4613
+ const __m128i find_e0 = _mm_set1_epi8 (-32 );
4614
+ const __m128i find_f0 = _mm_set1_epi8 (-16 );
4615
+
4616
+ __m128i last_block = _mm_setzero_si128 ();
4617
+ __m128i operand ;
4618
+
4619
+ while (p < e ) {
4620
+ operand = _mm_loadu_si128 ((__m128i * )p ); /* Load 16 bytes */
4621
+
4622
+ check_operand :
4623
+ /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4624
+ if (!_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4625
+ /* Even if this block only contains single-byte characters, there may have been a
4626
+ * multi-byte character at the end of the previous block, which was supposed to
4627
+ * have continuation bytes in this block
4628
+ * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4629
+ * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4630
+ * from the 3rd last */
4631
+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4632
+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4633
+ if (_mm_movemask_epi8 (bad )) {
4634
+ return false;
4635
+ }
4636
+
4637
+ /* Consume as many full blocks of single-byte characters as we can */
4638
+ while (true) {
4639
+ p += sizeof (__m128i );
4640
+ if (p >= e ) {
4641
+ goto finish_up_remaining_bytes ;
4642
+ }
4643
+ operand = _mm_loadu_si128 ((__m128i * )p );
4644
+ if (_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4645
+ break ;
4646
+ }
4647
+ }
4648
+ }
4649
+
4650
+ /* Check for >= 0xF5, which are illegal byte values in UTF-8
4651
+ * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4652
+ * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4653
+ * Then a single signed compare will pick out any bad bytes
4654
+ * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4655
+ __m128i bad = _mm_cmplt_epi8 (_mm_add_epi8 (operand , over_f5 ), over_f5 );
4656
+
4657
+ /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4658
+ * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4659
+ * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4660
+ * We can check for both problems at once by generating a vector where each byte < 0xA0
4661
+ * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4662
+ * Shift the original block right by one byte, and XOR the shifted block with the bitmask
4663
+ * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4664
+ * bad positions, and OR them into `bad` */
4665
+ __m128i operand2 = _mm_or_si128 (_mm_slli_si128 (operand , 1 ), _mm_srli_si128 (last_block , 15 ));
4666
+ __m128i mask1 = _mm_or_si128 (find_e0 , _mm_and_si128 (_mm_set1_epi8 (0xD ), _mm_cmpgt_epi8 (operand , over_9f )));
4667
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask1 )));
4668
+
4669
+ /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4670
+ * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4671
+ * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4672
+ * Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
4673
+ __m128i mask2 = _mm_or_si128 (find_f0 , _mm_and_si128 (_mm_set1_epi8 (0x4 ), _mm_cmpgt_epi8 (operand , over_8f )));
4674
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask2 )));
4675
+
4676
+ /* Check for overlong 2-byte code units
4677
+ * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4678
+ * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4679
+ * byte range, do a signed compare to pick out any bad bytes */
4680
+ bad = _mm_or_si128 (bad , _mm_cmplt_epi8 (_mm_add_epi8 (operand , find_c0 ), c0_to_c1 ));
4681
+
4682
+ /* Check structure of continuation bytes
4683
+ * A UTF-8 byte should be a continuation byte if, and only if, it is:
4684
+ * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4685
+ * 2) 2 bytes after the start of a 3-byte or 4-byte character
4686
+ * 3) 3 bytes after the start of a 4-byte character
4687
+ * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4688
+ * get a single bitmask with 0xFF in each position where a continuation byte should be */
4689
+ __m128i cont_mask = _mm_cmpeq_epi8 (_mm_and_si128 (operand2 , find_c0 ), find_c0 );
4690
+ __m128i operand3 = _mm_or_si128 (_mm_slli_si128 (operand , 2 ), _mm_srli_si128 (last_block , 14 ));
4691
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand3 , find_e0 ), find_e0 ));
4692
+ __m128i operand4 = _mm_or_si128 (_mm_slli_si128 (operand , 3 ), _mm_srli_si128 (last_block , 13 ));
4693
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand4 , find_f0 ), find_f0 ));
4694
+
4695
+ /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4696
+ * a continuation byte actually is
4697
+ * XOR those two bitmasks together; if everything is good, the result should be zero
4698
+ * However, if a byte which should have been a continuation wasn't, or if a byte which
4699
+ * shouldn't have been a continuation was, we will get 0xFF in that position */
4700
+ __m128i continuation = _mm_cmplt_epi8 (operand , find_c0 );
4701
+ bad = _mm_or_si128 (bad , _mm_xor_si128 (continuation , cont_mask ));
4702
+
4703
+ /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4704
+ * If that value is non-zero, then we found a bad byte somewhere! */
4705
+ if (_mm_movemask_epi8 (bad )) {
4706
+ return false;
4707
+ }
4708
+
4709
+ last_block = operand ;
4710
+ p += sizeof (__m128i );
4711
+ }
4712
+
4713
+ finish_up_remaining_bytes : ;
4714
+ /* Finish up 1-15 remaining bytes */
4715
+ if (p == e ) {
4716
+ uint8_t remaining_bytes = ZSTR_LEN (str ) & (sizeof (__m128i ) - 1 ); /* Not including terminating null */
4717
+
4718
+ /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4719
+ * bytes, but there is no good way to read a variable number of bytes into an XMM register
4720
+ * However, we know that these bytes are part of a zend_string, and a zend_string has some
4721
+ * 'header' fields which occupy the memory just before its content
4722
+ * And, those header fields occupy more than 16 bytes...
4723
+ * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4724
+ * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4725
+ * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4726
+ * Then, we do a left shift to get rid of the unwanted bytes
4727
+ * Conveniently, the same left shift also zero-fills the tail end of the XMM register
4728
+ *
4729
+ * The following `switch` looks useless, but it's not
4730
+ * The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
4731
+ * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4732
+ */
4733
+ switch (remaining_bytes ) {
4734
+ case 0 :
4735
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 15 )), 15 );
4736
+ goto check_operand ;
4737
+ case 1 :
4738
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 14 )), 14 );
4739
+ goto check_operand ;
4740
+ case 2 :
4741
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 13 )), 13 );
4742
+ goto check_operand ;
4743
+ case 3 :
4744
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 12 )), 12 );
4745
+ goto check_operand ;
4746
+ case 4 :
4747
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 11 )), 11 );
4748
+ goto check_operand ;
4749
+ case 5 :
4750
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 10 )), 10 );
4751
+ goto check_operand ;
4752
+ case 6 :
4753
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 9 )), 9 );
4754
+ goto check_operand ;
4755
+ case 7 :
4756
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 8 )), 8 );
4757
+ goto check_operand ;
4758
+ case 8 :
4759
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 7 )), 7 );
4760
+ goto check_operand ;
4761
+ case 9 :
4762
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 6 )), 6 );
4763
+ goto check_operand ;
4764
+ case 10 :
4765
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 5 )), 5 );
4766
+ goto check_operand ;
4767
+ case 11 :
4768
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 4 )), 4 );
4769
+ goto check_operand ;
4770
+ case 12 :
4771
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 3 )), 3 );
4772
+ goto check_operand ;
4773
+ case 13 :
4774
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 2 )), 2 );
4775
+ goto check_operand ;
4776
+ case 14 :
4777
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 1 )), 1 );
4778
+ goto check_operand ;
4779
+ case 15 :
4780
+ /* No trailing bytes are left which need to be checked
4781
+ * We get 15 because we did not include the terminating null when
4782
+ * calculating `remaining_bytes`, so the value wraps around */
4783
+ return true;
4784
+ }
4785
+
4786
+ ZEND_UNREACHABLE ();
4787
+ }
4788
+
4789
+ return true;
4790
+ #else
4791
+ return php_mb_check_encoding (ZSTR_VAL (str ), ZSTR_LEN (str ), & mbfl_encoding_utf8 );
4792
+ #endif
4793
+ }
4794
+
4590
4795
static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding )
4591
4796
{
4592
4797
if (encoding == & mbfl_encoding_utf8 ) {
4593
4798
if (GC_FLAGS (str ) & IS_STR_VALID_UTF8 ) {
4594
4799
return true;
4595
4800
}
4596
- bool result = php_mb_check_encoding ( ZSTR_VAL ( str ), ZSTR_LEN ( str ), encoding );
4801
+ bool result = mb_fast_check_utf8 ( str );
4597
4802
if (result && !ZSTR_IS_INTERNED (str )) {
4598
4803
GC_ADD_FLAGS (str , IS_STR_VALID_UTF8 );
4599
4804
}
0 commit comments