Skip to content

Commit d2befbc

Browse files
committed
Merge branch 'PHP-7.4'
* PHP-7.4:
  PCRE: Only remember valid UTF-8 if start offset zero
  PCRE: Check whether start offset is on char boundary
2 parents 24127d2 + cd5591a commit d2befbc

File tree

2 files changed

+56
-4
lines changed

2 files changed

+56
-4
lines changed

ext/pcre/php_pcre.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
11091109
}
11101110
/* }}} */
11111111

1112+
/* Report whether a match starting at start_offset inside subject_str can be
 * treated as operating on already-validated UTF-8.
 *
 * Returns 1 only when the string carries the IS_STR_VALID_UTF8 flag and
 * start_offset either equals the string length or addresses a byte that is
 * not a UTF-8 continuation byte (bit pattern 10xxxxxx). Returns 0 otherwise.
 *
 * NOTE(review): the byte at start_offset is read without a bounds check, so
 * this presumably relies on the caller guaranteeing
 * start_offset <= ZSTR_LEN(subject_str) -- confirm at the call site. */
static zend_always_inline zend_bool is_known_valid_utf8(
		zend_string *subject_str, PCRE2_SIZE start_offset) {
	if (GC_FLAGS(subject_str) & IS_STR_VALID_UTF8) {
		if (start_offset == ZSTR_LEN(subject_str)) {
			/* An offset at the very end of the string cannot fall inside a character. */
			return 1;
		}

		/* Any byte other than a 10xxxxxx continuation byte is a character boundary. */
		return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
	}

	/* Flag not set: the string's UTF-8 validity is unknown. */
	return 0;
}
1127+
11121128
/* {{{ php_pcre_match_impl() */
11131129
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
11141130
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
@@ -1130,7 +1146,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
11301146
PCRE2_SPTR mark = NULL; /* Target for MARK name */
11311147
zval marks; /* Array of marks for PREG_PATTERN_ORDER */
11321148
pcre2_match_data *match_data;
1133-
PCRE2_SIZE start_offset2;
1149+
PCRE2_SIZE start_offset2, orig_start_offset;
11341150

11351151
char *subject = ZSTR_VAL(subject_str);
11361152
size_t subject_len = ZSTR_LEN(subject_str);
@@ -1226,8 +1242,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
12261242
}
12271243
}
12281244

1229-
options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
1230-
? 0 : PCRE2_NO_UTF_CHECK;
1245+
orig_start_offset = start_offset2;
1246+
options =
1247+
(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1248+
? 0 : PCRE2_NO_UTF_CHECK;
12311249

12321250
/* Execute the regular expression. */
12331251
#ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1417,7 +1435,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
14171435

14181436
if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
14191437
/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1420-
if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
1438+
if ((pce->compile_options & PCRE2_UTF)
1439+
&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
14211440
GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
14221441
}
14231442

ext/pcre/tests/bug79241.phpt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
--TEST--
2+
Bug #79241: Segmentation fault on preg_match()
3+
--FILE--
4+
<?php
5+
6+
// if "’" string is used directly without json_decode,
7+
// the issue does not reproduce
8+
$text = json_decode('"’"');
9+
10+
$pattern = '/\b/u';
11+
12+
// it has to be exactly two calls to preg_match(),
13+
// with the second call offsetting after the tick symbol
14+
var_dump(preg_match($pattern, $text, $matches, 0, 0));
15+
var_dump(preg_match($pattern, $text, $matches, 0, 1));
16+
var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR);
17+
18+
echo "\n";
19+
20+
$text = "VA\xff"; $text .= "LID";
21+
var_dump(preg_match($pattern, $text, $matches, 0, 4));
22+
var_dump(preg_match($pattern, $text, $matches, 0, 0));
23+
var_dump(preg_last_error() == PREG_BAD_UTF8_ERROR);
24+
25+
?>
26+
--EXPECT--
27+
int(0)
28+
bool(false)
29+
bool(true)
30+
31+
int(1)
32+
bool(false)
33+
bool(true)

0 commit comments

Comments
 (0)