From 34e5bf02dff59baf1df130e48b741667a5e791d8 Mon Sep 17 00:00:00 2001 From: Ayesh Karunaratne Date: Sat, 2 Mar 2024 19:22:01 +0700 Subject: [PATCH] ext/pcre: Add "/r" modifier Adds support for "Caseless restricted" matching added in PCRE2lib 10.43 with the "r" modifier. This is `PCRE2_EXTRA_CASELESS_RESTRICT` in PCRE2. This is an "extra" option, which means it is not possible to pass this option as a pcre2_compile() function parameter. This option is passed in a pcre2_set_compile_extra_options() call. Previously, these extra options were set in php_pcre_init_pcre2(), but after this change, it is possible to customize the options by adding bits to `eoptions` in pcre_get_compiled_regex_cache_ex(). The tests for this change are ported from the upstream test suite[^1]. [^1]: https://github.com/PCRE2Project/pcre2/commit/c13d54f6581#diff-8c8312e4eb2d35bb16485404b7b5cc0eaef0bca1aa95ff5febf6a1890048305c --- UPGRADING | 4 + UPGRADING.INTERNALS | 3 + ext/pcre/php_pcre.c | 4 + .../tests/preg_match_caseless_restrict.phpt | 101 ++++++++++++++++++ 4 files changed, 112 insertions(+) create mode 100644 ext/pcre/tests/preg_match_caseless_restrict.phpt diff --git a/UPGRADING b/UPGRADING index 78956ae29a6fb..0ce30cf66fd6b 100644 --- a/UPGRADING +++ b/UPGRADING @@ -210,6 +210,10 @@ PHP 8.4 UPGRADE NOTES As a consequence, LoongArch JIT support has been added, spaces are now allowed between braces in Perl-compatible items, and variable-length lookbehind assertions are now supported. + . Added support for the "r" (PCRE2_EXTRA_CASELESS_RESTRICT) modifier, as well + as the (?r) mode modifier. When enabled along with the case-insensitive + modifier ("i"), the expression locks out mixing of ASCII and non-ASCII + characters. - PDO: . Added support for driver-specific subclasses. 
diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS index 217d86809ad46..0489224bc5a90 100644 --- a/UPGRADING.INTERNALS +++ b/UPGRADING.INTERNALS @@ -185,6 +185,9 @@ PHP 8.4 INTERNALS UPGRADE NOTES When flags should be ignored, pass 0 to the flags argument. - php_pcre_match_impl() and pcre_get_compiled_regex_cache_ex() now use proper boolean argument types instead of integer types. + - pcre_get_compiled_regex_cache_ex() now provides an option to collect extra + options (from modifiers used in the expression, for example), and calls + pcre2_set_compile_extra_options() with those options. ======================== 4. OpCode changes diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index c14cc524c5b3a..49526330cff1a 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -592,6 +592,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo #else uint32_t coptions = 0; #endif + uint32_t eoptions = PHP_PCRE_DEFAULT_EXTRA_COPTIONS; PCRE2_UCHAR error[128]; PCRE2_SIZE erroffset; int errnumber; @@ -722,6 +723,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo /* PCRE specific options */ case 'A': coptions |= PCRE2_ANCHORED; break; case 'D': coptions |= PCRE2_DOLLAR_ENDONLY;break; + case 'r': eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break; case 'S': /* Pass. */ break; case 'X': /* Pass. */ break; case 'U': coptions |= PCRE2_UNGREEDY; break; @@ -776,6 +778,8 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo } pcre2_set_character_tables(cctx, tables); + pcre2_set_compile_extra_options(cctx, eoptions); + /* Compile pattern and display a warning if compilation failed. 
*/ re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx); diff --git a/ext/pcre/tests/preg_match_caseless_restrict.phpt b/ext/pcre/tests/preg_match_caseless_restrict.phpt new file mode 100644 index 0000000000000..8238e286b1cf1 --- /dev/null +++ b/ext/pcre/tests/preg_match_caseless_restrict.phpt @@ -0,0 +1,101 @@ +--TEST-- +testing /r modifier in preg_* functions +--FILE-- + +--EXPECT-- +SK substitute matching +int(1) +int(1) +int(0) +int(0) +int(1) +int(1) +int(1) +int(1) +K substitute matching +int(1) +int(0) +non-ASCII in expressions +int(1) +int(0) +Character sets +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +non-ASCII in character sets +int(0) +int(1) +Meta characters and negate character sets +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +int(1) +Modifier used within the expression +int(1) +int(0) +int(0) +int(1) +int(0) +Done