Skip to content

Commit c9e78e6

Browse files
committed
PCRE: Check whether start offset is on char boundary
It is not enough for the whole string to be valid UTF-8; the start offset must also fall on a character boundary. Check this by testing whether the byte at the start offset is a UTF-8 continuation byte.
1 parent 0d49cf4 commit c9e78e6

File tree

2 files changed

+39
-1
lines changed

2 files changed

+39
-1
lines changed

ext/pcre/php_pcre.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
11301130
}
11311131
/* }}} */
11321132

1133+
/* Decide whether a match on subject_str starting at byte offset start_offset
 * is already known to be safe with respect to UTF-8.
 *
 * Returns 1 only if the string carries the IS_STR_VALID_UTF8 flag AND
 * start_offset lies on a character boundary (it is the end of the string, or
 * the byte at that offset is not a continuation byte). Returns 0 in every
 * other case, including an offset beyond the end of the string, so that the
 * caller does not skip the library's own UTF-8 / offset validation. */
static zend_always_inline zend_bool is_known_valid_utf8(
		zend_string *subject_str, PCRE2_SIZE start_offset) {
	if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
		/* We don't know whether the string is valid UTF-8 or not. */
		return 0;
	}

	if (start_offset >= ZSTR_LEN(subject_str)) {
		/* Degenerate case: an offset pointing exactly to the end of the
		 * string is trivially on a boundary. An offset past the end must
		 * never be dereferenced; report it as "not known valid" instead. */
		return start_offset == ZSTR_LEN(subject_str);
	}

	/* Check that the offset does not point to a UTF-8 continuation byte
	 * (bit pattern 10xxxxxx). The byte is read as unsigned so the mask
	 * does not depend on the signedness of plain char. */
	return (((unsigned char) ZSTR_VAL(subject_str)[start_offset]) & 0xc0) != 0x80;
}
1148+
11331149
/* {{{ php_pcre_match_impl() */
11341150
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
11351151
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
@@ -1247,7 +1263,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
12471263
}
12481264
}
12491265

1250-
options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
1266+
options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2)
12511267
? 0 : PCRE2_NO_UTF_CHECK;
12521268

12531269
/* Execute the regular expression. */

ext/pcre/tests/bug79241.phpt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
--TEST--
2+
Bug #79241: Segmentation fault on preg_match()
3+
--FILE--
4+
<?php
5+
6+
// if "’" string is used directly without json_decode,
7+
// the issue does not reproduce
8+
$text = json_decode('"’"');
9+
10+
$pattern = '/\b/u';
11+
12+
// it has to be exactly two calls to preg_match(),
13+
// with the second call offsetting after the tick symbol
14+
var_dump(preg_match($pattern, $text, $matches, 0, 0));
15+
var_dump(preg_match($pattern, $text, $matches, 0, 1));
16+
var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR);
17+
18+
?>
19+
--EXPECT--
20+
int(0)
21+
bool(false)
22+
bool(true)

0 commit comments

Comments
 (0)