@@ -4587,13 +4587,184 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
4587
4587
return true;
4588
4588
}
4589
4589
4590
+ static bool mb_fast_check_utf8 (zend_string * str )
4591
+ {
4592
+ unsigned char * p = (unsigned char * )ZSTR_VAL (str ), * e = p + ZSTR_LEN (str );
4593
+
4594
+ #ifdef __SSE2__
4595
+ e -= sizeof (__m128i );
4596
+
4597
+ /* For checking for illegal bytes 0xF5-FF */
4598
+ const __m128i over_f5 = _mm_set1_epi8 (-117 );
4599
+ /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4600
+ const __m128i over_9f = _mm_set1_epi8 (-97 );
4601
+ /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4602
+ const __m128i over_8f = _mm_set1_epi8 (-113 );
4603
+ /* For checking for illegal bytes 0xC0-C1 */
4604
+ const __m128i find_c0 = _mm_set1_epi8 (-64 );
4605
+ const __m128i c0_to_c1 = _mm_set1_epi8 (-126 );
4606
+ /* For checking structure of continuation bytes */
4607
+ const __m128i find_e0 = _mm_set1_epi8 (-32 );
4608
+ const __m128i find_f0 = _mm_set1_epi8 (-16 );
4609
+
4610
+ __m128i last_block = _mm_setzero_si128 ();
4611
+ __m128i operand ;
4612
+
4613
+ while (p <= e ) {
4614
+ operand = _mm_loadu_si128 ((__m128i * )p ); /* Load 16 bytes */
4615
+
4616
+ check_operand :
4617
+ /* If all 16 bytes are single-byte characters, there is no need to check anything */
4618
+ __m128i multibyte = _mm_cmplt_epi8 (operand , _mm_setzero_si128 ());
4619
+ if (_mm_movemask_epi8 (multibyte )) {
4620
+ /* Check for >= 0xF5, which are illegal byte values in UTF-8
4621
+ * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4622
+ * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4623
+ * Then a single signed compare will pick out any bad bytes
4624
+ * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4625
+ __m128i bad = _mm_cmplt_epi8 (_mm_add_epi8 (operand , over_f5 ), over_f5 );
4626
+
4627
+ /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4628
+ * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4629
+ * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4630
+ * We can check for both problems at once by generating a vector where each byte < 0xA0
4631
+ * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4632
+ * Shift the original block right by one byte, and XOR the shifted block with the bitmask
4633
+ * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4634
+ * bad positions, and OR them into `bad` */
4635
+ __m128i operand2 = _mm_or_si128 (_mm_slli_si128 (operand , 1 ), _mm_srli_si128 (last_block , 15 ));
4636
+ __m128i mask1 = _mm_or_si128 (find_e0 , _mm_and_si128 (_mm_set1_epi8 (0xD ), _mm_cmpgt_epi8 (operand , over_9f )));
4637
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask1 )));
4638
+
4639
+ /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4640
+ * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4641
+ * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4642
+ * Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
4643
+ __m128i mask2 = _mm_or_si128 (find_f0 , _mm_and_si128 (_mm_set1_epi8 (0x4 ), _mm_cmpgt_epi8 (operand , over_8f )));
4644
+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask2 )));
4645
+
4646
+ /* Check for overlong 2-byte code units
4647
+ * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4648
+ * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4649
+ * byte range, do a signed compare to pick out any bad bytes */
4650
+ bad = _mm_or_si128 (bad , _mm_cmplt_epi8 (_mm_add_epi8 (operand , find_c0 ), c0_to_c1 ));
4651
+
4652
+ /* Check structure of continuation bytes (the grand finale!)
4653
+ * A UTF-8 byte should be a continuation byte if, and only if, it is:
4654
+ * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4655
+ * 2) 2 bytes after the start of a 3-byte or 4-byte character
4656
+ * 3) 3 bytes after the start of a 4-byte character
4657
+ * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4658
+ * get a single bitmask with 0xFF in each position where a continuation byte should be */
4659
+ __m128i cont_mask = _mm_cmpeq_epi8 (_mm_and_si128 (operand2 , find_c0 ), find_c0 );
4660
+ __m128i operand3 = _mm_or_si128 (_mm_slli_si128 (operand , 2 ), _mm_srli_si128 (last_block , 14 ));
4661
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand3 , find_e0 ), find_e0 ));
4662
+ __m128i operand4 = _mm_or_si128 (_mm_slli_si128 (operand , 3 ), _mm_srli_si128 (last_block , 13 ));
4663
+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand4 , find_f0 ), find_f0 ));
4664
+
4665
+ /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4666
+ * a continuation byte actually is
4667
+ * XOR those two bitmasks together; if everything is good, the result should be zero
4668
+ * However, if a byte which should have been a continuation wasn't, or if a byte which
4669
+ * shouldn't have been a continuation was, we will get 0xFF in that position */
4670
+ __m128i continuation = _mm_cmplt_epi8 (operand , find_c0 );
4671
+ bad = _mm_or_si128 (bad , _mm_xor_si128 (continuation , cont_mask ));
4672
+
4673
+ /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4674
+ * If that value is non-zero, then we found a bad byte somewhere! */
4675
+ if (_mm_movemask_epi8 (bad )) {
4676
+ return false;
4677
+ }
4678
+ }
4679
+
4680
+ last_block = operand ;
4681
+ p += sizeof (__m128i );
4682
+ }
4683
+
4684
+ /* Finish up 1-15 remaining bytes */
4685
+ uint8_t shift_dist = p - e ;
4686
+ if (shift_dist < 16 ) {
4687
+ /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4688
+ * bytes, but there is no good way to read a variable number of bytes into an XMM register
4689
+ * However, we know that these bytes are part of a zend_string, and a zend_string has some
4690
+ * 'header' fields which occupy the memory just before its content
4691
+ * And, those header fields occupy more than 16 bytes...
4692
+ * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
4693
+ * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4694
+ * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4695
+ * Then, we do a left shift to get rid of the unwanted bytes
4696
+ * Conveniently, the same left shift also zero-fills the tail end of the XMM register */
4697
+ operand = _mm_loadu_si128 ((__m128i * )e );
4698
+
4699
+ /* This looks useless, but it's not
4700
+ * The PSRLDQ instruction used for this 128-bit left shift requires an immediate (literal)
4701
+ * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist) */
4702
+ switch (shift_dist ) {
4703
+ case 0 :
4704
+ goto check_operand ;
4705
+ case 1 :
4706
+ operand = _mm_srli_si128 (operand , 1 );
4707
+ goto check_operand ;
4708
+ case 2 :
4709
+ operand = _mm_srli_si128 (operand , 2 );
4710
+ goto check_operand ;
4711
+ case 3 :
4712
+ operand = _mm_srli_si128 (operand , 3 );
4713
+ goto check_operand ;
4714
+ case 4 :
4715
+ operand = _mm_srli_si128 (operand , 4 );
4716
+ goto check_operand ;
4717
+ case 5 :
4718
+ operand = _mm_srli_si128 (operand , 5 );
4719
+ goto check_operand ;
4720
+ case 6 :
4721
+ operand = _mm_srli_si128 (operand , 6 );
4722
+ goto check_operand ;
4723
+ case 7 :
4724
+ operand = _mm_srli_si128 (operand , 7 );
4725
+ goto check_operand ;
4726
+ case 8 :
4727
+ operand = _mm_srli_si128 (operand , 8 );
4728
+ goto check_operand ;
4729
+ case 9 :
4730
+ operand = _mm_srli_si128 (operand , 9 );
4731
+ goto check_operand ;
4732
+ case 10 :
4733
+ operand = _mm_srli_si128 (operand , 10 );
4734
+ goto check_operand ;
4735
+ case 11 :
4736
+ operand = _mm_srli_si128 (operand , 11 );
4737
+ goto check_operand ;
4738
+ case 12 :
4739
+ operand = _mm_srli_si128 (operand , 12 );
4740
+ goto check_operand ;
4741
+ case 13 :
4742
+ operand = _mm_srli_si128 (operand , 13 );
4743
+ goto check_operand ;
4744
+ case 14 :
4745
+ operand = _mm_srli_si128 (operand , 14 );
4746
+ goto check_operand ;
4747
+ case 15 :
4748
+ operand = _mm_srli_si128 (operand , 15 );
4749
+ goto check_operand ;
4750
+ }
4751
+
4752
+ ZEND_UNREACHABLE ();
4753
+ }
4754
+
4755
+ return true;
4756
+ #else
4757
+ return php_mb_check_encoding (ZSTR_VAL (str ), ZSTR_LEN (str ), & mbfl_encoding_utf8 );
4758
+ #endif
4759
+ }
4760
+
4590
4761
static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding )
4591
4762
{
4592
4763
if (encoding == & mbfl_encoding_utf8 ) {
4593
4764
if (GC_FLAGS (str ) & IS_STR_VALID_UTF8 ) {
4594
4765
return true;
4595
4766
}
4596
- bool result = php_mb_check_encoding ( ZSTR_VAL ( str ), ZSTR_LEN ( str ), encoding );
4767
+ bool result = mb_fast_check_utf8 ( str );
4597
4768
if (result && !ZSTR_IS_INTERNED (str )) {
4598
4769
GC_ADD_FLAGS (str , IS_STR_VALID_UTF8 );
4599
4770
}
0 commit comments