@@ -1109,6 +1109,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
1109
1109
}
1110
1110
/* }}} */
1111
1111
1112
+ static zend_always_inline zend_bool is_known_valid_utf8 (
1113
+ zend_string * subject_str , PCRE2_SIZE start_offset ) {
1114
+ if (!(GC_FLAGS (subject_str ) & IS_STR_VALID_UTF8 )) {
1115
+ /* We don't know whether the string is valid UTF-8 or not. */
1116
+ return 0 ;
1117
+ }
1118
+
1119
+ if (start_offset == ZSTR_LEN (subject_str )) {
1120
+ /* Degenerate case: Offset points to end of string. */
1121
+ return 1 ;
1122
+ }
1123
+
1124
+ /* Check that the offset does not point to an UTF-8 continuation byte. */
1125
+ return (ZSTR_VAL (subject_str )[start_offset ] & 0xc0 ) != 0x80 ;
1126
+ }
1127
+
1112
1128
/* {{{ php_pcre_match_impl() */
1113
1129
PHPAPI void php_pcre_match_impl (pcre_cache_entry * pce , zend_string * subject_str , zval * return_value ,
1114
1130
zval * subpats , int global , int use_flags , zend_long flags , zend_off_t start_offset )
@@ -1130,7 +1146,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
1130
1146
PCRE2_SPTR mark = NULL ; /* Target for MARK name */
1131
1147
zval marks ; /* Array of marks for PREG_PATTERN_ORDER */
1132
1148
pcre2_match_data * match_data ;
1133
- PCRE2_SIZE start_offset2 ;
1149
+ PCRE2_SIZE start_offset2 , orig_start_offset ;
1134
1150
1135
1151
char * subject = ZSTR_VAL (subject_str );
1136
1152
size_t subject_len = ZSTR_LEN (subject_str );
@@ -1226,8 +1242,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
1226
1242
}
1227
1243
}
1228
1244
1229
- options = (pce -> compile_options & PCRE2_UTF ) && !(GC_FLAGS (subject_str ) & IS_STR_VALID_UTF8 )
1230
- ? 0 : PCRE2_NO_UTF_CHECK ;
1245
+ orig_start_offset = start_offset2 ;
1246
+ options =
1247
+ (pce -> compile_options & PCRE2_UTF ) && !is_known_valid_utf8 (subject_str , orig_start_offset )
1248
+ ? 0 : PCRE2_NO_UTF_CHECK ;
1231
1249
1232
1250
/* Execute the regular expression. */
1233
1251
#ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1417,7 +1435,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
1417
1435
1418
1436
if (PCRE_G (error_code ) == PHP_PCRE_NO_ERROR ) {
1419
1437
/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1420
- if ((pce -> compile_options & PCRE2_UTF ) && !ZSTR_IS_INTERNED (subject_str )) {
1438
+ if ((pce -> compile_options & PCRE2_UTF )
1439
+ && !ZSTR_IS_INTERNED (subject_str ) && orig_start_offset == 0 ) {
1421
1440
GC_ADD_FLAGS (subject_str , IS_STR_VALID_UTF8 );
1422
1441
}
1423
1442
0 commit comments