Skip to content

Commit 8f17826

Browse files
ju1iusnikic
ju1ius
authored and committed
adds support for named subpatterns to mb_ereg_replace
Named subpatterns are now passed to `mb_ereg_replace_callback`. This commit also adds a subset of the oniguruma back-reference syntax for replacements: * `\k<name>` and `\k'name'` for named subpatterns. * `\k<n>` and `\k'n'` for numbered subpatterns. These last two notations allow referencing numbered groups where n > 9.
1 parent 212f56b commit 8f17826

File tree

3 files changed

+181
-31
lines changed

3 files changed

+181
-31
lines changed

ext/mbstring/php_mbregex.c

Lines changed: 135 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,136 @@ mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngrou
690690
}
691691
/* }}} */
692692

693+
/*
694+
* Helper for _php_mb_regex_ereg_replace_exec
695+
*/
696+
/* {{{ mb_regex_substitute */
697+
static inline void mb_regex_substitute(
698+
smart_str *pbuf,
699+
const char *subject,
700+
size_t subject_len,
701+
char *replace,
702+
size_t replace_len,
703+
php_mb_regex_t *regexp,
704+
OnigRegion *regs,
705+
const mbfl_encoding *enc
706+
) {
707+
char *p, *sp, *eos;
708+
int no; /* bakreference group number */
709+
int clen; /* byte-length of the current character */
710+
711+
p = replace;
712+
eos = replace + replace_len;
713+
714+
while (p < eos) {
715+
clen = (int) php_mb_mbchar_bytes_ex(p, enc);
716+
if (clen != 1 || p == eos || p[0] != '\\') {
717+
/* skip anything that's not an ascii backslash */
718+
smart_str_appendl(pbuf, p, clen);
719+
p += clen;
720+
continue;
721+
}
722+
sp = p; /* save position */
723+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
724+
if (clen != 1 || p == eos) {
725+
/* skip escaped multibyte char */
726+
p += clen;
727+
smart_str_appendl(pbuf, sp, p - sp);
728+
continue;
729+
}
730+
no = -1;
731+
switch (p[0]) {
732+
case '0':
733+
no = 0;
734+
p++;
735+
break;
736+
case '1': case '2': case '3': case '4':
737+
case '5': case '6': case '7': case '8': case '9':
738+
if (!onig_noname_group_capture_is_active(regexp)) {
739+
/*
740+
* FIXME:
741+
* Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
742+
* For now we just ignore them, but in the future we might want to raise a warning
743+
* and abort the whole replace operation.
744+
*/
745+
p++;
746+
smart_str_appendl(pbuf, sp, p - sp);
747+
continue;
748+
}
749+
no = p[0] - '0';
750+
p++;
751+
break;
752+
case 'k':
753+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
754+
if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
755+
/* not a backref delimiter */
756+
p += clen;
757+
smart_str_appendl(pbuf, sp, p - sp);
758+
continue;
759+
}
760+
/* try to consume everything until next delimiter */
761+
char delim = p[0] == '<' ? '>' : '\'';
762+
char *name, *name_end;
763+
char maybe_num = 1;
764+
name_end = name = p + 1;
765+
while (name_end < eos) {
766+
clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
767+
if (clen != 1) {
768+
name_end += clen;
769+
maybe_num = 0;
770+
continue;
771+
}
772+
if (name_end[0] == delim) break;
773+
if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
774+
name_end++;
775+
}
776+
p = name_end + 1;
777+
if (name_end - name < 1 || name_end >= eos) {
778+
/* the backref was empty or we failed to find the end delimiter */
779+
smart_str_appendl(pbuf, sp, p - sp);
780+
continue;
781+
}
782+
/* we have either a name or a number */
783+
if (maybe_num) {
784+
if (!onig_noname_group_capture_is_active(regexp)) {
785+
/* see above note on mixing numbered & named backrefs */
786+
smart_str_appendl(pbuf, sp, p - sp);
787+
continue;
788+
}
789+
if (name_end - name == 1) {
790+
no = name[0] - '0';
791+
break;
792+
}
793+
if (name[0] == '0') {
794+
/* 01 is not a valid number */
795+
break;
796+
}
797+
no = (int) strtoul(name, NULL, 10);
798+
break;
799+
}
800+
no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
801+
break;
802+
default:
803+
p += clen;
804+
smart_str_appendl(pbuf, sp, p - sp);
805+
continue;
806+
}
807+
if (no < 0 || no >= regs->num_regs) {
808+
/* invalid group number reference, keep the escape sequence in the output */
809+
smart_str_appendl(pbuf, sp, p - sp);
810+
continue;
811+
}
812+
if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
813+
smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
814+
}
815+
}
816+
817+
if (p < eos) {
818+
smart_str_appendl(pbuf, p, eos - p);
819+
}
820+
}
821+
/* }}} */
822+
693823
/*
694824
* php functions
695825
*/
@@ -857,14 +987,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
857987
char *string;
858988
size_t string_len;
859989

860-
char *p;
861990
php_mb_regex_t *re;
862991
OnigSyntaxType *syntax;
863992
OnigRegion *regs = NULL;
864993
smart_str out_buf = {0};
865994
smart_str eval_buf = {0};
866995
smart_str *pbuf;
867-
size_t i;
868996
int err, eval, n;
869997
OnigUChar *pos;
870998
OnigUChar *string_lim;
@@ -974,38 +1102,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
9741102
break;
9751103
}
9761104
if (err >= 0) {
977-
#if moriyoshi_0
978-
if (regs->beg[0] == regs->end[0]) {
979-
php_error_docref(NULL, E_WARNING, "Empty regular expression");
980-
break;
981-
}
982-
#endif
9831105
/* copy the part of the string before the match */
9841106
smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
9851107

9861108
if (!is_callable) {
987-
/* copy replacement and backrefs */
988-
i = 0;
989-
p = replace;
990-
while (i < replace_len) {
991-
int fwd = (int) php_mb_mbchar_bytes_ex(p, enc);
992-
n = -1;
993-
if ((replace_len - i) >= 2 && fwd == 1 &&
994-
p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
995-
n = p[1] - '0';
996-
}
997-
if (n >= 0 && n < regs->num_regs) {
998-
if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) {
999-
smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]);
1000-
}
1001-
p += 2;
1002-
i += 2;
1003-
} else {
1004-
smart_str_appendl(pbuf, p, fwd);
1005-
p += fwd;
1006-
i += fwd;
1007-
}
1008-
}
1109+
mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
10091110
}
10101111

10111112
if (eval) {
@@ -1045,6 +1146,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
10451146
for (i = 0; i < regs->num_regs; i++) {
10461147
add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
10471148
}
1149+
if (onig_number_of_names(re) > 0) {
1150+
mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1151+
onig_foreach_name(re, mb_regex_groups_iter, &args);
1152+
}
10481153

10491154
ZVAL_COPY_VALUE(&args[0], &subpats);
10501155
/* null terminate buffer */

ext/mbstring/tests/mb_ereg_replace_callback.phpt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ function_exists('mb_ereg_replace_callback') or die("skip mb_ereg_replace_callbac
88
--FILE--
99
<?php
1010
$str = 'abc 123 #",; $foo';
11-
echo mb_ereg_replace_callback('(\S+)', function($m){return $m[1].'('.strlen($m[1]).')';}, $str);
11+
12+
echo mb_ereg_replace_callback('(\S+)', function ($m) {
13+
return $m[1].'('.strlen($m[1]).')';
14+
}, $str), "\n";
15+
16+
echo mb_ereg_replace_callback('(?<word>\w+) (?<digit>\d+).*', function ($m) {
17+
return sprintf("%s-%s", $m['digit'], $m['word']);
18+
}, $str), "\n";
1219
?>
1320
--EXPECT--
1421
abc(3) 123(3) #",;(4) $foo(4)
22+
123-abc
1523

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
--TEST--
2+
mb_ereg_replace() with named subpatterns
3+
--SKIPIF--
4+
<?php
5+
extension_loaded('mbstring') or die('skip mbstring not available');
6+
function_exists('mb_ereg_replace') or die("skip mb_ereg_replace() is not available in this build");
7+
?>
8+
--FILE--
9+
<?php
10+
mb_regex_set_options('');
11+
// \k<word> syntax
12+
echo mb_ereg_replace('(?<a>\s*)(?<b>\w+)(?<c>\s*)', '\k<a>_\k<b>_\k<c>', 'a b c d e' ), "\n";
13+
// \k'word' syntax
14+
echo mb_ereg_replace('(?<word>[a-z]+)',"<\k'word'>", 'abc def ghi'), PHP_EOL;
15+
// numbered captures with \k<n> syntax
16+
echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', '\k<0>-\k<10>-', '123456789aa'), PHP_EOL;
17+
// numbered captures with \k'n' syntax
18+
echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', "\k'0'-\k'10'-", '123456789aa'), PHP_EOL;
19+
// backref 0 works, but 01 is ignored
20+
echo mb_ereg_replace('a', "\k'0'_\k<01>", 'a'), PHP_EOL;
21+
// Numbered backref is ignored if named backrefs are present
22+
echo mb_ereg_replace('(?<a>A)\k<a>', '-\1-', 'AA'), PHP_EOL;
23+
// An empty backref is ignored
24+
echo mb_ereg_replace('(\w)\1', '-\k<>-', 'AA'), PHP_EOL;
25+
// An unclosed backref is ignored
26+
echo mb_ereg_replace('(?<a>\w+)', '-\k<a', 'AA'), PHP_EOL;
27+
?>
28+
29+
--EXPECT--
30+
_a_ _b_ _c_ _d_ _e_
31+
<abc> <def> <ghi>
32+
123456789aa-a-
33+
123456789aa-a-
34+
a_\k<01>
35+
-\1-
36+
-\k<>-
37+
-\k<a

0 commit comments

Comments
 (0)