@@ -812,6 +812,126 @@ static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags
812
812
}
813
813
/* }}} */
814
814
815
+ static unsigned int validate_utf8_char (
816
+ const unsigned char * str ,
817
+ const size_t str_len ,
818
+ size_t * cursor ,
819
+ zend_result * status
820
+ ) {
821
+ const size_t pos = * cursor ;
822
+ * status = SUCCESS ;
823
+ const size_t tail_len = str_len - pos ;
824
+
825
+ /* Check if at least 1 byte is available */
826
+ if (tail_len < 1 ) {
827
+ MB_FAILURE (pos , 1 );
828
+ }
829
+
830
+ const unsigned char c = str [pos ];
831
+
832
+ /* ASCII (single byte) */
833
+ if (c < 0x80 ) {
834
+ * cursor = pos + 1 ;
835
+ return c ;
836
+ }
837
+
838
+ /* Leading byte < 0xC2 => invalid multibyte start */
839
+ if (c < 0xC2 ) {
840
+ MB_FAILURE (pos , 1 );
841
+ }
842
+
843
+ /* 2-byte sequence (0xC2..0xDF) */
844
+ if (c < 0xE0 ) {
845
+ /* Need 2 bytes total */
846
+ if (tail_len < 2 ) {
847
+ MB_FAILURE (pos , 1 );
848
+ }
849
+ const unsigned char b2 = str [pos + 1 ];
850
+
851
+ /* Check continuation byte 10xxxxxx */
852
+ if ((b2 & 0xC0 ) != 0x80 ) {
853
+ MB_FAILURE (pos , ((b2 < 0x80 ) || (b2 >= 0xC2 && b2 <= 0xF4 )) ? 1 : 2 );
854
+ }
855
+
856
+ /* Combine bits into code point and check range >= 0x80 */
857
+ const unsigned int cp = ((c & 0x1F ) << 6 ) | (b2 & 0x3F );
858
+ if (cp < 0x80 ) {
859
+ MB_FAILURE (pos , 2 );
860
+ }
861
+
862
+ * cursor = pos + 2 ;
863
+ return cp ;
864
+ }
865
+
866
+ /* 3-byte sequence (0xE0..0xEF) */
867
+ if (c < 0xF0 ) {
868
+ /* Need 3 bytes total and valid continuation bytes */
869
+ if (tail_len < 3 ||
870
+ ((str [pos + 1 ] & 0xC0 ) != 0x80 ) ||
871
+ ((str [pos + 2 ] & 0xC0 ) != 0x80 )) {
872
+ if (tail_len < 2 ||
873
+ ((str [pos + 1 ] < 0x80 ) || (str [pos + 1 ] >= 0xC2 && str [pos + 1 ] <= 0xF4 ))) {
874
+ MB_FAILURE (pos , 1 );
875
+ } else if (tail_len < 3 ||
876
+ ((str [pos + 2 ] < 0x80 ) || (str [pos + 2 ] >= 0xC2 && str [pos + 2 ] <= 0xF4 ))) {
877
+ MB_FAILURE (pos , 2 );
878
+ } else {
879
+ MB_FAILURE (pos , 3 );
880
+ }
881
+ }
882
+
883
+ /* Combine bits and check for >= 0x800 and not in surrogate area */
884
+ const unsigned int cp = ((c & 0x0F ) << 12 )
885
+ | ((str [pos + 1 ] & 0x3F ) << 6 )
886
+ | (str [pos + 2 ] & 0x3F );
887
+
888
+ if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF )) {
889
+ MB_FAILURE (pos , 3 );
890
+ }
891
+
892
+ * cursor = pos + 3 ;
893
+ return cp ;
894
+ }
895
+
896
+ /* 4-byte sequence (0xF0..0xF4) */
897
+ if (c < 0xF5 ) {
898
+ /* Need 4 bytes total and valid continuation bytes */
899
+ if (tail_len < 4 ||
900
+ ((str [pos + 1 ] & 0xC0 ) != 0x80 ) ||
901
+ ((str [pos + 2 ] & 0xC0 ) != 0x80 ) ||
902
+ ((str [pos + 3 ] & 0xC0 ) != 0x80 )) {
903
+ if (tail_len < 2 ||
904
+ ((str [pos + 1 ] < 0x80 ) || (str [pos + 1 ] >= 0xC2 && str [pos + 1 ] <= 0xF4 ))) {
905
+ MB_FAILURE (pos , 1 );
906
+ } else if (tail_len < 3 ||
907
+ ((str [pos + 2 ] < 0x80 ) || (str [pos + 2 ] >= 0xC2 && str [pos + 2 ] <= 0xF4 ))) {
908
+ MB_FAILURE (pos , 2 );
909
+ } else if (tail_len < 4 ||
910
+ ((str [pos + 3 ] < 0x80 ) || (str [pos + 3 ] >= 0xC2 && str [pos + 3 ] <= 0xF4 ))) {
911
+ MB_FAILURE (pos , 3 );
912
+ } else {
913
+ MB_FAILURE (pos , 4 );
914
+ }
915
+ }
916
+
917
+ /* Combine bits and check range 0x10000..0x10FFFF */
918
+ const unsigned int cp = ((c & 0x07 ) << 18 )
919
+ | ((str [pos + 1 ] & 0x3F ) << 12 )
920
+ | ((str [pos + 2 ] & 0x3F ) << 6 )
921
+ | (str [pos + 3 ] & 0x3F );
922
+
923
+ if (cp < 0x10000 || cp > 0x10FFFF ) {
924
+ MB_FAILURE (pos , 4 );
925
+ }
926
+
927
+ * cursor = pos + 4 ;
928
+ return cp ;
929
+ }
930
+
931
+ /* Leading byte >= 0xF5 is invalid */
932
+ MB_FAILURE (pos , 1 );
933
+ }
934
+
815
935
static inline size_t write_octet_sequence (unsigned char * buf , enum entity_charset charset , unsigned code ) {
816
936
/* code is not necessarily a unicode code point */
817
937
switch (charset ) {
@@ -1478,15 +1598,23 @@ PHPAPI zend_string* php_htmlspecialchars_ex(
1478
1598
free_space -- ;
1479
1599
}
1480
1600
1481
- input_ptr ++ ;
1482
- } else {
1483
- /* Multibyte chars */
1484
- zend_result status ;
1485
- const size_t original_pos = (const char * )input_ptr - ZSTR_VAL (input );
1486
- size_t cursor = original_pos ;
1487
- const unsigned int this_char = get_next_char (charset , (unsigned char * )ZSTR_VAL (input ), ZSTR_LEN (input ),
1488
- & cursor , & status );
1489
- const size_t processed_len = cursor - original_pos ;
1601
+ input_ptr ++ ;
1602
+ } else {
1603
+ /* Multibyte chars */
1604
+ zend_result status ;
1605
+ const size_t original_pos = (const char * )input_ptr - ZSTR_VAL (input );
1606
+ size_t cursor = original_pos ;
1607
+
1608
+ unsigned int this_char = 0 ;
1609
+ if (charset == cs_utf_8 ) {
1610
+ this_char = validate_utf8_char ((unsigned char * )ZSTR_VAL (input ), ZSTR_LEN (input ),
1611
+ & cursor , & status );
1612
+ } else {
1613
+ this_char = get_next_char (charset , (unsigned char * )ZSTR_VAL (input ), ZSTR_LEN (input ),
1614
+ & cursor , & status );
1615
+ }
1616
+
1617
+ const size_t processed_len = cursor - original_pos ;
1490
1618
1491
1619
if (status == FAILURE ) {
1492
1620
if (flags & ENT_HTML_IGNORE_ERRORS ) {
0 commit comments