Skip to content

Commit b9f23c2

Browse files
committed
Fix #70232: Incorrect bump-along behavior with \K and empty string match
To do global matching (/g), every empty match has to be followed by a second match attempt at the same position with PCRE_NOTEMPTY turned on. That second attempt may fail incorrectly, however, when the \K escape sequence is involved. For this purpose libpcre 8.0 introduced the PCRE_NOTEMPTY_ATSTART flag, which we use if it is available; otherwise we fall back to the old (possibly buggy) behavior.
1 parent f94bcb1 commit b9f23c2

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed

ext/pcre/php_pcre.c

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@
4343

4444
#define PCRE_CACHE_SIZE 4096
4545

46+
/* not fully functional workaround for libpcre < 8.0, see bug #70232 */
47+
#ifndef PCRE_NOTEMPTY_ATSTART
48+
# define PCRE_NOTEMPTY_ATSTART PCRE_NOTEMPTY
49+
#endif
50+
4651
enum {
4752
PHP_PCRE_NO_ERROR = 0,
4853
PHP_PCRE_INTERNAL_ERROR,
@@ -794,7 +799,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
794799
pcre_free((void *) stringlist);
795800
}
796801
} else if (count == PCRE_ERROR_NOMATCH) {
797-
/* If we previously set PCRE_NOTEMPTY after a null match,
802+
/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
798803
this is not necessarily the end. We need to advance
799804
the start offset, and continue. Fudge the offset values
800805
to achieve this, unless we're already at the end of the string. */
@@ -811,10 +816,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
811816
}
812817

813818
/* If we have matched an empty string, mimic what Perl's /g option does.
814-
This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
819+
This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
815820
the match again at the same point. If this fails (picked up above) we
816821
advance to the next character. */
817-
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
822+
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
818823

819824
/* Advance to the position right after the last full match */
820825
start_offset = offsets[1];
@@ -1256,7 +1261,7 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
12561261
limit--;
12571262

12581263
} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1259-
/* If we previously set PCRE_NOTEMPTY after a null match,
1264+
/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
12601265
this is not necessarily the end. We need to advance
12611266
the start offset, and continue. Fudge the offset values
12621267
to achieve this, unless we're already at the end of the string. */
@@ -1290,10 +1295,10 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
12901295
}
12911296

12921297
/* If we have matched an empty string, mimic what Perl's /g option does.
1293-
This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1298+
This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
12941299
the match again at the same point. If this fails (picked up above) we
12951300
advance to the next character. */
1296-
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1301+
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
12971302

12981303
/* Advance to the next piece. */
12991304
start_offset = offsets[1];
@@ -1659,7 +1664,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
16591664
}
16601665
}
16611666
} else if (count == PCRE_ERROR_NOMATCH) {
1662-
/* If we previously set PCRE_NOTEMPTY after a null match,
1667+
/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
16631668
this is not necessarily the end. We need to advance
16641669
the start offset, and continue. Fudge the offset values
16651670
to achieve this, unless we're already at the end of the string. */
@@ -1691,10 +1696,10 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subjec
16911696
}
16921697

16931698
/* If we have matched an empty string, mimic what Perl's /g option does.
1694-
This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1699+
This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
16951700
the match again at the same point. If this fails (picked up above) we
16961701
advance to the next character. */
1697-
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1702+
g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
16981703

16991704
/* Advance to the position right after the last full match */
17001705
start_offset = offsets[1];

ext/pcre/tests/bug70232.phpt

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
--TEST--
2+
Bug #70232 (Incorrect bump-along behavior with \K and empty string match)
3+
--SKIPIF--
4+
<?php
5+
if (version_compare(explode(' ', PCRE_VERSION)[0], '8.0', 'lt')) {
6+
die("skip this test requires libpcre >= 8.0");
7+
}
8+
?>
9+
--FILE--
10+
<?php
11+
$pattern = '~(?: |\G)\d\B\K~';
12+
$subject = "123 a123 1234567 b123 123";
13+
preg_match_all($pattern, $subject, $matches);
14+
var_dump($matches);
15+
var_dump(preg_replace($pattern, "*", $subject));
16+
var_dump(preg_split($pattern, $subject));
17+
?>
18+
--EXPECT--
19+
array(1) {
20+
[0]=>
21+
array(10) {
22+
[0]=>
23+
string(0) ""
24+
[1]=>
25+
string(0) ""
26+
[2]=>
27+
string(0) ""
28+
[3]=>
29+
string(0) ""
30+
[4]=>
31+
string(0) ""
32+
[5]=>
33+
string(0) ""
34+
[6]=>
35+
string(0) ""
36+
[7]=>
37+
string(0) ""
38+
[8]=>
39+
string(0) ""
40+
[9]=>
41+
string(0) ""
42+
}
43+
}
44+
string(35) "1*2*3 a123 1*2*3*4*5*6*7 b123 1*2*3"
45+
array(11) {
46+
[0]=>
47+
string(1) "1"
48+
[1]=>
49+
string(1) "2"
50+
[2]=>
51+
string(8) "3 a123 1"
52+
[3]=>
53+
string(1) "2"
54+
[4]=>
55+
string(1) "3"
56+
[5]=>
57+
string(1) "4"
58+
[6]=>
59+
string(1) "5"
60+
[7]=>
61+
string(1) "6"
62+
[8]=>
63+
string(8) "7 b123 1"
64+
[9]=>
65+
string(1) "2"
66+
[10]=>
67+
string(1) "3"
68+
}

0 commit comments

Comments
 (0)