From 197f810f5e0b8e214178e55f6b1340876097297f Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 20 Jan 2023 10:28:26 +0200 Subject: [PATCH 1/2] mb_scrub does not attempt to scrub known-valid UTF-8 strings --- ext/mbstring/mbstring.c | 13 ++++++++----- ext/mbstring/tests/mb_scrub.phpt | 8 ++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 630b926af4684..1ef21530b047e 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -5066,12 +5066,10 @@ PHP_FUNCTION(mb_chr) /* {{{ */ PHP_FUNCTION(mb_scrub) { - char* str; - size_t str_len; - zend_string *enc_name = NULL; + zend_string *str, *enc_name = NULL; ZEND_PARSE_PARAMETERS_START(1, 2) - Z_PARAM_STRING(str, str_len) + Z_PARAM_STR(str) Z_PARAM_OPTIONAL Z_PARAM_STR_OR_NULL(enc_name) ZEND_PARSE_PARAMETERS_END(); @@ -5081,7 +5079,12 @@ PHP_FUNCTION(mb_scrub) RETURN_THROWS(); } - RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc)); + if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) { + /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */ + RETURN_STR_COPY(str); + } + + RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc)); } /* }}} */ diff --git a/ext/mbstring/tests/mb_scrub.phpt b/ext/mbstring/tests/mb_scrub.phpt index 1b2d8ab4e34e2..6eb580bf31cc9 100644 --- a/ext/mbstring/tests/mb_scrub.phpt +++ b/ext/mbstring/tests/mb_scrub.phpt @@ -8,7 +8,15 @@ var_dump( "?" === mb_scrub("\x80"), "?" === mb_scrub("\x80", 'UTF-8') ); + +$utf8str = "abc 日本語 Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞"; +// Check $utf8str so it is marked as 'valid UTF-8' +// This will enable optimized implementation of mb_scrub +if (!mb_check_encoding($utf8str, 'UTF-8')) + die("Test string should be valid UTF-8"); +var_dump(mb_scrub($utf8str)); ?> --EXPECT-- bool(true) bool(true) +string(122) "abc 日本語 Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞" From 69c5af8f744c4b8a1b00826e191736a69495c9c0 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 22 Jan 2023 07:39:50 +0200 Subject: [PATCH 2/2] Use RETURN_STR_COPY in mb_output_handler This means the same thing and makes the code read a tiny bit better. Thanks to Nikita Popov for the tip. --- ext/mbstring/mbstring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 1ef21530b047e..964d459543efd 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1534,7 +1534,7 @@ PHP_FUNCTION(mb_output_handler) const mbfl_encoding *encoding = MBSTRG(current_http_output_encoding); if (encoding == &mbfl_encoding_pass) { - RETURN_STR(zend_string_copy(str)); + RETURN_STR_COPY(str); } if (arg_status & PHP_OUTPUT_HANDLER_START) { @@ -1574,7 +1574,7 @@ PHP_FUNCTION(mb_output_handler) } if (!MBSTRG(outconv_enabled)) { - RETURN_STR(zend_string_copy(str)); + RETURN_STR_COPY(str); } mb_convert_buf buf;