@@ -4587,13 +4587,215 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
4587
4587
return true;
4588
4588
}
4589
4589
4590
+ static bool mb_fast_check_utf8 (zend_string * str )
4591
+ {
4592
+ #ifdef __SSE2__
4593
+ unsigned char * p = (unsigned char * )ZSTR_VAL (str );
4594
+ /* `e` points 1 byte past the last full 16-byte block of string content
4595
+ * Note that we include the terminating null byte which is included in each zend_string
4596
+ * as part of the content to check; this ensures that multi-byte characters which are
4597
+ * truncated abruptly at the end of the string will be detected as invalid */
4598
+ unsigned char * e = p + ((ZSTR_LEN (str ) + 1 ) & ~(sizeof (__m128i ) - 1 ));
4599
+
4600
+ /* For checking for illegal bytes 0xF5-FF */
4601
+ const __m128i over_f5 = _mm_set1_epi8 (-117 );
4602
+ /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4603
+ const __m128i over_9f = _mm_set1_epi8 (-97 );
4604
+ /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4605
+ const __m128i over_8f = _mm_set1_epi8 (-113 );
4606
+ /* For checking for illegal bytes 0xC0-C1 */
4607
+ const __m128i find_c0 = _mm_set1_epi8 (-64 );
4608
+ const __m128i c0_to_c1 = _mm_set1_epi8 (-126 );
4609
+ /* For checking structure of continuation bytes */
4610
+ const __m128i find_e0 = _mm_set1_epi8 (-32 );
4611
+ const __m128i find_f0 = _mm_set1_epi8 (-16 );
4612
+
4613
+ __m128i last_block = _mm_setzero_si128 ();
4614
+ __m128i operand ;
4615
+
4616
+ while (p < e ) {
4617
+ operand = _mm_loadu_si128 ((__m128i * )p ); /* Load 16 bytes */
4618
+
4619
+ check_operand :
4620
+ /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4621
+ if (!_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4622
+ /* Even if this block only contains single-byte characters, there may have been a
4623
+ * multi-byte character at the end of the previous block, which was supposed to
4624
+ * have continuation bytes in this block
4625
+ * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4626
+ * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4627
+ * from the 3rd last */
4628
+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4629
+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4630
+ if (_mm_movemask_epi8 (bad )) {
4631
+ return false;
4632
+ }
4633
+
4634
+ /* Consume as many full blocks of single-byte characters as we can */
4635
+ while (true) {
4636
+ p += sizeof (__m128i );
4637
+ if (p >= e ) {
4638
+ goto finish_up_remaining_bytes ;
4639
+ }
4640
+ operand = _mm_loadu_si128 ((__m128i * )p );
4641
+ if (_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4642
+ break ;
4643
+ }
4644
+ }
4645
+ }
4646
+
4647
+ /* Check for >= 0xF5, which are illegal byte values in UTF-8
4648
+ * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4649
+ * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4650
+ * Then a single signed compare will pick out any bad bytes
4651
+ * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4652
+ __m128i bad = _mm_cmplt_epi8 (_mm_add_epi8 (operand , over_f5 ), over_f5 );
4653
+
4654
+ /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4655
+ * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4656
+ * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4657
+ * We can check for both problems at once by generating a vector where each byte < 0xA0
4658
+ * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4659
+ * Shift the original block right by one byte, and XOR the shifted block with the bitmask
4660
+ * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4661
+ * bad positions, and OR them into `bad` */
4662
+ __m128i operand2 = _mm_or_si128 (_mm_slli_si128 (operand , 1 ), _mm_srli_si128 (last_block , 15 ));
4663
+ __m128i mask1 = _mm_or_si128 (find_e0 , _mm_and_si128 (_mm_set1_epi8 (0xD ), _mm_cmpgt_epi8 (operand , over_9f )));
4664
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask1 )));
4665
+
4666
+ /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4667
+ * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4668
+ * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4669
+ * Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
4670
+ __m128i mask2 = _mm_or_si128 (find_f0 , _mm_and_si128 (_mm_set1_epi8 (0x4 ), _mm_cmpgt_epi8 (operand , over_8f )));
4671
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask2 )));
4672
+
4673
+ /* Check for overlong 2-byte code units
4674
+ * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4675
+ * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4676
+ * byte range, do a signed compare to pick out any bad bytes */
4677
+ bad = _mm_or_si128 (bad , _mm_cmplt_epi8 (_mm_add_epi8 (operand , find_c0 ), c0_to_c1 ));
4678
+
4679
+ /* Check structure of continuation bytes
4680
+ * A UTF-8 byte should be a continuation byte if, and only if, it is:
4681
+ * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4682
+ * 2) 2 bytes after the start of a 3-byte or 4-byte character
4683
+ * 3) 3 bytes after the start of a 4-byte character
4684
+ * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4685
+ * get a single bitmask with 0xFF in each position where a continuation byte should be */
4686
+ __m128i cont_mask = _mm_cmpeq_epi8 (_mm_and_si128 (operand2 , find_c0 ), find_c0 );
4687
+ __m128i operand3 = _mm_or_si128 (_mm_slli_si128 (operand , 2 ), _mm_srli_si128 (last_block , 14 ));
4688
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand3 , find_e0 ), find_e0 ));
4689
+ __m128i operand4 = _mm_or_si128 (_mm_slli_si128 (operand , 3 ), _mm_srli_si128 (last_block , 13 ));
4690
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand4 , find_f0 ), find_f0 ));
4691
+
4692
+ /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4693
+ * a continuation byte actually is
4694
+ * XOR those two bitmasks together; if everything is good, the result should be zero
4695
+ * However, if a byte which should have been a continuation wasn't, or if a byte which
4696
+ * shouldn't have been a continuation was, we will get 0xFF in that position */
4697
+ __m128i continuation = _mm_cmplt_epi8 (operand , find_c0 );
4698
+ bad = _mm_or_si128 (bad , _mm_xor_si128 (continuation , cont_mask ));
4699
+
4700
+ /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4701
+ * If that value is non-zero, then we found a bad byte somewhere! */
4702
+ if (_mm_movemask_epi8 (bad )) {
4703
+ return false;
4704
+ }
4705
+
4706
+ last_block = operand ;
4707
+ p += sizeof (__m128i );
4708
+ }
4709
+
4710
+ finish_up_remaining_bytes : ;
4711
+ /* Finish up 1-15 remaining bytes */
4712
+ if (p == e ) {
4713
+ uint8_t remaining_bytes = ZSTR_LEN (str ) & (sizeof (__m128i ) - 1 ); /* Not including terminating null */
4714
+
4715
+ /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4716
+ * bytes, but there is no good way to read a variable number of bytes into an XMM register
4717
+ * However, we know that these bytes are part of a zend_string, and a zend_string has some
4718
+ * 'header' fields which occupy the memory just before its content
4719
+ * And, those header fields occupy more than 16 bytes...
4720
+ * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4721
+ * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4722
+ * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4723
+ * Then, we do a left shift to get rid of the unwanted bytes
4724
+ * Conveniently, the same left shift also zero-fills the tail end of the XMM register
4725
+ *
4726
+ * The following `switch` looks useless, but it's not
4727
+ * The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
4728
+ * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4729
+ */
4730
+ switch (remaining_bytes ) {
4731
+ case 0 :
4732
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 15 )), 15 );
4733
+ goto check_operand ;
4734
+ case 1 :
4735
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 14 )), 14 );
4736
+ goto check_operand ;
4737
+ case 2 :
4738
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 13 )), 13 );
4739
+ goto check_operand ;
4740
+ case 3 :
4741
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 12 )), 12 );
4742
+ goto check_operand ;
4743
+ case 4 :
4744
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 11 )), 11 );
4745
+ goto check_operand ;
4746
+ case 5 :
4747
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 10 )), 10 );
4748
+ goto check_operand ;
4749
+ case 6 :
4750
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 9 )), 9 );
4751
+ goto check_operand ;
4752
+ case 7 :
4753
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 8 )), 8 );
4754
+ goto check_operand ;
4755
+ case 8 :
4756
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 7 )), 7 );
4757
+ goto check_operand ;
4758
+ case 9 :
4759
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 6 )), 6 );
4760
+ goto check_operand ;
4761
+ case 10 :
4762
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 5 )), 5 );
4763
+ goto check_operand ;
4764
+ case 11 :
4765
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 4 )), 4 );
4766
+ goto check_operand ;
4767
+ case 12 :
4768
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 3 )), 3 );
4769
+ goto check_operand ;
4770
+ case 13 :
4771
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 2 )), 2 );
4772
+ goto check_operand ;
4773
+ case 14 :
4774
+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 1 )), 1 );
4775
+ goto check_operand ;
4776
+ case 15 :
4777
+ /* No trailing bytes are left which need to be checked
4778
+ * We get 15 because we did not include the terminating null when
4779
+ * calculating `remaining_bytes`, so the value wraps around */
4780
+ return true;
4781
+ }
4782
+
4783
+ ZEND_UNREACHABLE ();
4784
+ }
4785
+
4786
+ return true;
4787
+ #else
4788
+ return php_mb_check_encoding (ZSTR_VAL (str ), ZSTR_LEN (str ), & mbfl_encoding_utf8 );
4789
+ #endif
4790
+ }
4791
+
4590
4792
static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding )
4591
4793
{
4592
4794
if (encoding == & mbfl_encoding_utf8 ) {
4593
4795
if (GC_FLAGS (str ) & IS_STR_VALID_UTF8 ) {
4594
4796
return true;
4595
4797
}
4596
- bool result = php_mb_check_encoding ( ZSTR_VAL ( str ), ZSTR_LEN ( str ), encoding );
4798
+ bool result = mb_fast_check_utf8 ( str );
4597
4799
if (result && !ZSTR_IS_INTERNED (str )) {
4598
4800
GC_ADD_FLAGS (str , IS_STR_VALID_UTF8 );
4599
4801
}
0 commit comments