diff --git a/NEWS b/NEWS
index 53d6a71945a4..736220ecafb8 100644
--- a/NEWS
+++ b/NEWS
@@ -19,4 +19,7 @@ Standard:
     (timwolla)
   . Fix GH-12252 (round(): Validate the rounding mode). (timwolla)
 
+MBString:
+  . Added mb_trim, mb_ltrim and mb_rtrim. (Yuya Hamada)
+
 <<< NOTE: Insert NEWS from last stable release here prior to actual release! >>>
diff --git a/UPGRADING b/UPGRADING
index 54a2f0e8c33e..1a941ab4f082 100644
--- a/UPGRADING
+++ b/UPGRADING
@@ -64,6 +64,9 @@ PHP 8.4 UPGRADE NOTES
     inputs might also be affected and result in different outputs compared to
     earlier PHP versions.
 
+- MBString:
+  . Added mb_trim, mb_ltrim and mb_rtrim functions.
+
 ========================================
 6. New Functions
 ========================================
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 3e59806b8675..89605b4aa968 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -2939,6 +2939,162 @@ PHP_FUNCTION(mb_strtolower)
 	RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
 }
 
+typedef enum {
+	MB_LTRIM = 1,
+	MB_RTRIM = 2,
+	MB_BOTH_TRIM = 3 /* MB_LTRIM | MB_RTRIM */
+} mb_trim_mode;
+
+/* Returns true when code point `w` is a key of the trim set `ht`. */
+static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
+{
+	return zend_hash_index_exists(ht, w);
+}
+
+/* Decodes `str` (in encoding `enc`) chunk by chunk and counts the leading
+ * and/or trailing code points that are keys of `what_ht`, as selected by
+ * `mode`; returns the substring that remains once those are removed. */
+static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
+{
+	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
+	uint32_t wchar_buf[128];
+	size_t in_len = ZSTR_LEN(str);
+	size_t out_len = 0;
+	unsigned int state = 0;
+	size_t left = 0;
+	size_t right = 0;
+	size_t total_len = 0;
+
+	while (in_len) {
+		out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
+		ZEND_ASSERT(out_len <= 128);
+		total_len += out_len;
+
+		for (size_t i = 0; i < out_len; i++) {
+			uint32_t w = wchar_buf[i];
+			if (is_trim_wchar(w, what_ht)) {
+				if (mode & MB_LTRIM) {
+					left += 1;
+				}
+				if (mode & MB_RTRIM) {
+					right += 1;
+				}
+			} else {
+				/* A kept code point ends the leading run and invalidates any
+				 * trailing run counted so far. */
+				mode &= ~MB_LTRIM;
+				if (mode & MB_RTRIM) {
+					right = 0;
+				}
+			}
+		}
+	}
+
+	/* When every code point is trimmable and both sides are trimmed, each one
+	 * was counted in `left` and in `right`, so `right + left` exceeds
+	 * `total_len`. Bail out instead of passing a wrapped-around length on. */
+	if (left + right >= total_len) {
+		return ZSTR_EMPTY_ALLOC();
+	}
+
+	return mb_get_substr(str, left, total_len - (right + left), enc);
+}
+
+/* Trims `str` using the built-in default set of code points listed below. */
+static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
+{
+	static const uint32_t trim_default_chars[] = {
+		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
+		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
+		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
+		0x85, 0x180E
+	};
+	size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(uint32_t);
+
+	HashTable what_ht;
+	zval val;
+	ZVAL_TRUE(&val);
+
+	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);
+
+	for (size_t i = 0; i < trim_default_chars_length; i++) {
+		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
+	}
+	zend_string* retval = trim_each_wchar(str, &what_ht, mode, enc);
+	zend_hash_destroy(&what_ht);
+
+	return retval;
+}
+
+/* Trims `str` using the code points decoded from `what` with encoding `enc`. */
+static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
+{
+	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
+	uint32_t what_wchar_buf[128];
+	size_t what_out_len = 0;
+	unsigned int state = 0;
+	size_t what_len = ZSTR_LEN(what);
+	HashTable what_ht;
+	zval val;
+	ZVAL_TRUE(&val);
+	zend_hash_init(&what_ht, what_len, NULL, NULL, false);
+
+	while (what_len) {
+		what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
+		ZEND_ASSERT(what_out_len <= 128);
+		for (size_t i = 0; i < what_out_len; i++) {
+			/* Not _add_new: `what` may contain the same code point more than once. */
+			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
+		}
+	}
+
+	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
+	zend_hash_destroy(&what_ht);
+
+	return retval;
+}
+
+/* Shared body of mb_trim(), mb_ltrim() and mb_rtrim(); `mode` selects the side(s). */
+static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
+{
+	zend_string *str;
+	zend_string *what = NULL;
+	zend_string *encoding = NULL;
+
+	ZEND_PARSE_PARAMETERS_START(1, 3)
+		Z_PARAM_STR(str)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_STR(what)
+		Z_PARAM_STR_OR_NULL(encoding)
+	ZEND_PARSE_PARAMETERS_END();
+
+	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
+	if (!enc) {
+		RETURN_THROWS();
+	}
+
+	if (what) {
+		RETURN_STR(mb_trim_what_chars(str, what, mode, enc));
+	} else {
+		RETURN_STR(mb_trim_default_chars(str, mode, enc));
+	}
+}
+
+PHP_FUNCTION(mb_trim)
+{
+	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
+}
+
+PHP_FUNCTION(mb_ltrim)
+{
+	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
+}
+
+PHP_FUNCTION(mb_rtrim)
+{
+	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
+}
+
 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
 {
 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php
index 6c31ff492105..501128b74784 100644
--- a/ext/mbstring/mbstring.stub.php
+++ b/ext/mbstring/mbstring.stub.php
@@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
 /** @refcount 1 */
 function mb_strtolower(string $string, ?string $encoding = null): string {}
 
+function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
+
+function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
+
+function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
+
 /** @refcount 1 */
 function
mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {} diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h index e964a83c118b..c24571a0b3c0 100644 --- a/ext/mbstring/mbstring_arginfo.h +++ b/ext/mbstring/mbstring_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. - * Stub hash: 141073d610f862b525406fb7f48ac58b6691080e */ + * Stub hash: 4071d9df39c4ec0d544edd9ff74e5d85f8863b0d */ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null") @@ -118,6 +118,16 @@ ZEND_END_ARG_INFO() #define arginfo_mb_strtolower arginfo_mb_strtoupper +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_trim, 0, 1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, characters, IS_STRING, 0, "\" \\f\\n\\r\\t\\v\\x00             

   …᠎\"") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null") +ZEND_END_ARG_INFO() + +#define arginfo_mb_ltrim arginfo_mb_trim + +#define arginfo_mb_rtrim arginfo_mb_trim + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null") @@ -339,6 +349,9 @@ ZEND_FUNCTION(mb_convert_encoding); ZEND_FUNCTION(mb_convert_case); ZEND_FUNCTION(mb_strtoupper); ZEND_FUNCTION(mb_strtolower); +ZEND_FUNCTION(mb_trim); +ZEND_FUNCTION(mb_ltrim); +ZEND_FUNCTION(mb_rtrim); ZEND_FUNCTION(mb_detect_encoding); ZEND_FUNCTION(mb_list_encodings); ZEND_FUNCTION(mb_encoding_aliases); @@ -434,6 +447,9 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(mb_convert_case, arginfo_mb_convert_case) ZEND_FE(mb_strtoupper, arginfo_mb_strtoupper) ZEND_FE(mb_strtolower, arginfo_mb_strtolower) + ZEND_FE(mb_trim, arginfo_mb_trim) + ZEND_FE(mb_ltrim, arginfo_mb_ltrim) + ZEND_FE(mb_rtrim, arginfo_mb_rtrim) ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding) ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings) ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases) diff --git a/ext/mbstring/tests/mb_trim.phpt b/ext/mbstring/tests/mb_trim.phpt new file mode 100644 index 000000000000..872915b210a2 --- /dev/null +++ b/ext/mbstring/tests/mb_trim.phpt @@ -0,0 +1,125 @@ +--TEST-- +mb_trim() function tests +--EXTENSIONS-- +mbstring +--FILE-- +getMessage()); +} + +?> +--EXPECT-- +== Copy from trim == +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +== Empty string == +string(0) "" +string(0) "" +string(0) "" +== Single string == +string(6) " test " +string(21) "あいうえおあお" +string(11) "foo BAR Spa" +string(12) "oo BAR Spaß" +== Multi strings == +string(10) "oo BAR Spa" +string(10) "oo BAR Spa" +string(16) "いうおえお " +string(16) "いうおえお " +string(25) " 
あいうおえお  " +string(26) " あいうおえお  a" +== Many strings == +string(0) "" +string(1) "a" +string(388) "                                                                                                                                 a" +== mb_ltrim == +string(15) "いああああ" +== mb_rtrim == +string(102) "あああああああああああああああああああああああああああああああああい" +== default params == +string(0) "" +== Byte Order Mark == +string(6) "漢字" +string(8) "226f575b" +string(8) "6f225b57" +== Empty string == +string(6) " abcd " +string(6) " abcd " +string(6) " abcd " +== SJIS == +string(3) "あ" +== Same strings == +string(1) "f" +== $encoding throws ValueError == +string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given" diff --git a/ext/mbstring/tests/mbregex_stack_limit2.phpt b/ext/mbstring/tests/mbregex_stack_limit2.phpt index c0ced11f200b..9c2efcc39ec2 100644 --- a/ext/mbstring/tests/mbregex_stack_limit2.phpt +++ b/ext/mbstring/tests/mbregex_stack_limit2.phpt @@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) { ?> --FILE--