@@ -4587,13 +4587,212 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
4587
4587
return true;
4588
4588
}
4589
4589
4590
/* Check whether the content of `str` is valid UTF-8; returns true if it is, false otherwise
 *
 * When __SSE2__ is defined, the string is examined in 16-byte blocks using SSE2 intrinsics;
 * otherwise, the check is delegated to php_mb_check_encoding()
 *
 * The SSE2 path also reads the byte just past the string content (treated below as the
 * terminating null byte), and for strings shorter than 15 bytes, the final load starts at an
 * address before the string content; see the comments in the body for why that is expected
 * to be safe */
+ static bool mb_fast_check_utf8 (zend_string * str )
4591
+ {
4592
+ #ifdef __SSE2__
4593
+ unsigned char * p = (unsigned char * )ZSTR_VAL (str ), * e = p + ZSTR_LEN (str );
4594
+ /* To make the condition for the main `while` loop below tighter, `e` will point not 1 byte
4595
+ * past the end of the string, but 16 bytes before that... BUT, we treat the terminating
4596
+ * null byte which is included in each zend_string as part of the content to check
4597
+ * This ensures that multi-byte characters which are truncated abruptly at the end of
4598
+ * the string will be detected as invalid
+ * In other words, `e` is the last address from which a full 16-byte block (ending with the
+ * terminating null byte) can be loaded; for strings shorter than 15 bytes, it is lower than `p` */
4599
+ e = e - sizeof (__m128i ) + 1 ;
4600
+
4601
+ /* For checking for illegal bytes 0xF5-FF */
4602
+ const __m128i over_f5 = _mm_set1_epi8 (-117 );
4603
+ /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4604
+ const __m128i over_9f = _mm_set1_epi8 (-97 );
4605
+ /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4606
+ const __m128i over_8f = _mm_set1_epi8 (-113 );
4607
+ /* For checking for illegal bytes 0xC0-C1 */
4608
+ const __m128i find_c0 = _mm_set1_epi8 (-64 );
4609
+ const __m128i c0_to_c1 = _mm_set1_epi8 (-126 );
4610
+ /* For checking structure of continuation bytes */
4611
+ const __m128i find_e0 = _mm_set1_epi8 (-32 );
4612
+ const __m128i find_f0 = _mm_set1_epi8 (-16 );
4613
+
4614
+ /* `last_block` holds the most recent block which went through the full set of checks below;
+ * it starts out as all zeroes, which behaves like a block of single-byte characters */
+ __m128i last_block = _mm_setzero_si128 ();
4615
+ __m128i operand ;
4616
+
4617
+ while (p <= e ) {
4618
+ operand = _mm_loadu_si128 ((__m128i * )p ); /* Load 16 bytes */
4619
+
4620
+ /* The tail-handling code after the loop jumps back to this label with the final,
+ * zero-padded partial block in `operand` */
+ check_operand :
4621
+ /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4622
+ if (!_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4623
+ /* Even if this block only contains single-byte characters, there may have been a
4624
+ * multi-byte character at the end of the previous block, which was supposed to
4625
+ * have continuation bytes in this block
4626
+ * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4627
+ * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4628
+ * from the 3rd last
+ * (_mm_set_epi8 takes its arguments from the highest byte of the register down to the
+ * lowest, so -64, -32, and -16 line up with the last, 2nd last, and 3rd last bytes of
+ * the previous block; in the other 13 positions, only a 0xFF byte would match) */
4629
+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4630
+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4631
+ if (_mm_movemask_epi8 (bad )) {
4632
+ return false;
4633
+ }
4634
+
4635
+ /* Consume as many full blocks of single-byte characters as we can
+ * NOTE: `last_block` is not updated while blocks are skipped here; after the check
+ * just above, none of its last 3 bytes can start a character which continues into
+ * the next block, so the checks below treat it just as they would treat a block of
+ * single-byte characters */
4636
+ while (true) {
4637
+ p += sizeof (__m128i );
4638
+ if (p > e ) {
4639
+ goto finish_up_remaining_bytes ;
4640
+ }
4641
+ operand = _mm_loadu_si128 ((__m128i * )p );
4642
+ if (_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4643
+ break ;
4644
+ }
4645
+ }
4646
+ }
4647
+
4648
+ /* Check for >= 0xF5, which are illegal byte values in UTF-8
4649
+ * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4650
+ * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4651
+ * Then a single signed compare will pick out any bad bytes
4652
+ * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4653
+ __m128i bad = _mm_cmplt_epi8 (_mm_add_epi8 (operand , over_f5 ), over_f5 );
4654
+
4655
+ /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4656
+ * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4657
+ * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4658
+ * We can check for both problems at once by generating a vector where each byte < 0xA0
4659
+ * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
+ * (Since the compare is signed, only bytes 0x80-0x9F are actually mapped to 0xE0; bytes
+ * 0x00-0x7F are mapped to 0xED as well, which is harmless, because 0xED followed by a
+ * byte which is not a continuation byte is invalid in any case)
4660
+ * `operand2` holds, in each position, the byte which comes just before that position in
+ * the string (position 0 gets the last byte of `last_block`); XOR it with the bitmask
4661
+ * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4662
+ * bad positions, and OR them into `bad` */
4663
+ __m128i operand2 = _mm_or_si128 (_mm_slli_si128 (operand , 1 ), _mm_srli_si128 (last_block , 15 ));
4664
+ __m128i mask1 = _mm_or_si128 (find_e0 , _mm_and_si128 (_mm_set1_epi8 (0xD ), _mm_cmpgt_epi8 (operand , over_9f )));
4665
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask1 )));
4666
+
4667
+ /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4668
+ * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4669
+ * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4670
+ * Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result
+ * (As above, the signed compare also maps bytes 0x00-0x7F to 0xF4, which is harmless) */
4671
+ __m128i mask2 = _mm_or_si128 (find_f0 , _mm_and_si128 (_mm_set1_epi8 (0x4 ), _mm_cmpgt_epi8 (operand , over_8f )));
4672
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask2 )));
4673
+
4674
+ /* Check for overlong 2-byte code units
4675
+ * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4676
+ * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4677
+ * byte range, do a signed compare to pick out any bad bytes */
4678
+ bad = _mm_or_si128 (bad , _mm_cmplt_epi8 (_mm_add_epi8 (operand , find_c0 ), c0_to_c1 ));
4679
+
4680
+ /* Check structure of continuation bytes
4681
+ * A UTF-8 byte should be a continuation byte if, and only if, it is:
4682
+ * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4683
+ * 2) 2 bytes after the start of a 3-byte or 4-byte character
4684
+ * 3) 3 bytes after the start of a 4-byte character
4685
+ * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4686
+ * get a single bitmask with 0xFF in each position where a continuation byte should be
+ * `operand3` and `operand4` are built like `operand2`, but hold the bytes which come 2 and 3
+ * positions earlier in the string, with the first positions filled from `last_block` */
4687
+ __m128i cont_mask = _mm_cmpeq_epi8 (_mm_and_si128 (operand2 , find_c0 ), find_c0 );
4688
+ __m128i operand3 = _mm_or_si128 (_mm_slli_si128 (operand , 2 ), _mm_srli_si128 (last_block , 14 ));
4689
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand3 , find_e0 ), find_e0 ));
4690
+ __m128i operand4 = _mm_or_si128 (_mm_slli_si128 (operand , 3 ), _mm_srli_si128 (last_block , 13 ));
4691
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand4 , find_f0 ), find_f0 ));
4692
+
4693
+ /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4694
+ * a continuation byte actually is
+ * (Treated as signed values, bytes 0x80-0xBF are exactly the ones which are less than 0xC0)
4695
+ * XOR those two bitmasks together; if everything is good, the result should be zero
4696
+ * However, if a byte which should have been a continuation wasn't, or if a byte which
4697
+ * shouldn't have been a continuation was, we will get 0xFF in that position */
4698
+ __m128i continuation = _mm_cmplt_epi8 (operand , find_c0 );
4699
+ bad = _mm_or_si128 (bad , _mm_xor_si128 (continuation , cont_mask ));
4700
+
4701
+ /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4702
+ * If that value is non-zero, then we found a bad byte somewhere! */
4703
+ if (_mm_movemask_epi8 (bad )) {
4704
+ return false;
4705
+ }
4706
+
4707
+ last_block = operand ;
4708
+ p += sizeof (__m128i );
4709
+ }
4710
+
4711
+ finish_up_remaining_bytes : ;
4712
+ /* Finish up 1-15 remaining bytes
+ * The first time we get here, `p` is 1-16 bytes past `e`; a distance of 16 means that the
+ * full blocks covered everything (including the terminating null byte) and nothing remains
+ * After the final partial block has been checked, `p` has moved forward by another 16 bytes,
+ * so when we get here for the second time, the distance is 17 or more and we return true */
4713
+ uint8_t shift_dist = p - e ;
4714
+ if (shift_dist < 16 ) {
4715
+ /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4716
+ * bytes, but there is no good way to read a variable number of bytes into an XMM register
4717
+ * However, we know that these bytes are part of a zend_string, and a zend_string has some
4718
+ * 'header' fields which occupy the memory just before its content
4719
+ * And, those header fields occupy more than 16 bytes...
+ * (NOTE(review): this relies on the layout of zend_string, which is defined elsewhere;
+ * confirm that it holds on every platform where this code path is compiled)
4720
+ * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4721
+ * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4722
+ * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4723
+ * Then, we shift the register by `shift_dist` bytes to get rid of the unwanted bytes, which
+ * moves the bytes we do want to the front of the block
4724
+ * Conveniently, the same shift also zero-fills the tail end of the XMM register */
4725
+ operand = _mm_loadu_si128 ((__m128i * )e );
4726
+
4727
+ /* This looks useless, but it's not
4728
+ * The PSRLDQ instruction used for this 128-bit byte shift requires an immediate (literal)
4729
+ * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist) */
4730
+ switch (shift_dist ) {
4731
+ case 0 : /* Not expected; `p` is always greater than `e` when control reaches this point */
4732
+ goto check_operand ;
4733
+ case 1 :
4734
+ operand = _mm_srli_si128 (operand , 1 );
4735
+ goto check_operand ;
4736
+ case 2 :
4737
+ operand = _mm_srli_si128 (operand , 2 );
4738
+ goto check_operand ;
4739
+ case 3 :
4740
+ operand = _mm_srli_si128 (operand , 3 );
4741
+ goto check_operand ;
4742
+ case 4 :
4743
+ operand = _mm_srli_si128 (operand , 4 );
4744
+ goto check_operand ;
4745
+ case 5 :
4746
+ operand = _mm_srli_si128 (operand , 5 );
4747
+ goto check_operand ;
4748
+ case 6 :
4749
+ operand = _mm_srli_si128 (operand , 6 );
4750
+ goto check_operand ;
4751
+ case 7 :
4752
+ operand = _mm_srli_si128 (operand , 7 );
4753
+ goto check_operand ;
4754
+ case 8 :
4755
+ operand = _mm_srli_si128 (operand , 8 );
4756
+ goto check_operand ;
4757
+ case 9 :
4758
+ operand = _mm_srli_si128 (operand , 9 );
4759
+ goto check_operand ;
4760
+ case 10 :
4761
+ operand = _mm_srli_si128 (operand , 10 );
4762
+ goto check_operand ;
4763
+ case 11 :
4764
+ operand = _mm_srli_si128 (operand , 11 );
4765
+ goto check_operand ;
4766
+ case 12 :
4767
+ operand = _mm_srli_si128 (operand , 12 );
4768
+ goto check_operand ;
4769
+ case 13 :
4770
+ operand = _mm_srli_si128 (operand , 13 );
4771
+ goto check_operand ;
4772
+ case 14 :
4773
+ operand = _mm_srli_si128 (operand , 14 );
4774
+ goto check_operand ;
4775
+ case 15 :
4776
+ operand = _mm_srli_si128 (operand , 15 );
4777
+ goto check_operand ;
4778
+ }
4779
+
4780
+ ZEND_UNREACHABLE (); /* Every value below 16 is handled by a case above which jumps away */
4781
+ }
4782
+
4783
+ return true;
4784
+ #else
4785
+ /* No SSE2 available at compile time; fall back on the generic encoding checker */
+ return php_mb_check_encoding (ZSTR_VAL (str ), ZSTR_LEN (str ), & mbfl_encoding_utf8 );
4786
+ #endif
4787
+ }
4788
+
4590
4789
static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding )
4591
4790
{
4592
4791
if (encoding == & mbfl_encoding_utf8 ) {
4593
4792
if (GC_FLAGS (str ) & IS_STR_VALID_UTF8 ) {
4594
4793
return true;
4595
4794
}
4596
- bool result = php_mb_check_encoding ( ZSTR_VAL ( str ), ZSTR_LEN ( str ), encoding );
4795
+ bool result = mb_fast_check_utf8 ( str );
4597
4796
if (result && !ZSTR_IS_INTERNED (str )) {
4598
4797
GC_ADD_FLAGS (str , IS_STR_VALID_UTF8 );
4599
4798
}
0 commit comments