Skip to content

[RFC] Add trim for multibyte function #12459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ Standard:
(timwolla)
. Fix GH-12252 (round(): Validate the rounding mode). (timwolla)

MBString:
. Added mb_trim, mb_ltrim and mb_rtrim. (Yuya Hamada)

<<< NOTE: Insert NEWS from last stable release here prior to actual release! >>>
3 changes: 3 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ PHP 8.4 UPGRADE NOTES
inputs might also be affected and result in different outputs compared to
earlier PHP versions.

- MBString:
. Added mb_trim, mb_ltrim and mb_rtrim functions.

========================================
6. New Functions
========================================
Expand Down
139 changes: 139 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -2939,6 +2939,145 @@ PHP_FUNCTION(mb_strtolower)
RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
}

/* Which end(s) of a string to trim. The values are bit flags so that a mode
 * can be tested (and narrowed) with bitwise operators. */
typedef enum {
	MB_LTRIM     = 1 << 0,
	MB_RTRIM     = 1 << 1,
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM
} mb_trim_mode;

/* Returns true when codepoint `w` is present as an integer key in `ht`,
 * the set of codepoints that should be trimmed. */
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
{
	const bool listed = zend_hash_index_exists(ht, w);

	return listed;
}

/*
 * Strips leading and/or trailing codepoints that are keys of `what_ht` from `str`.
 *
 * The input is decoded with enc->to_wchar() in chunks of at most 128 codepoints,
 * so all bookkeeping below is in codepoints, not bytes:
 *   left      - trimmable codepoints seen before the first codepoint that is kept
 *   right     - trimmable codepoints seen since the last codepoint that is kept
 *   total_len - codepoints decoded so far
 *
 * `mode` selects which end(s) are trimmed (MB_LTRIM, MB_RTRIM or both).
 * Returns whatever mb_get_substr() produces for the surviving codepoint range.
 */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	const size_t wchar_buf_len = sizeof(wchar_buf) / sizeof(wchar_buf[0]);
	size_t in_len = ZSTR_LEN(str);
	unsigned int state = 0;
	size_t left = 0;
	size_t right = 0;
	size_t total_len = 0;

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, wchar_buf_len, &state);
		ZEND_ASSERT(out_len <= wchar_buf_len);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			if (is_trim_wchar(wchar_buf[i], what_ht)) {
				if (mode & MB_LTRIM) {
					left++;
				}
				if (mode & MB_RTRIM) {
					right++;
				}
			} else {
				/* First kept codepoint ends the leading run for good... */
				mode &= ~MB_LTRIM;
				/* ...and any kept codepoint restarts the trailing run. */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	/* If every codepoint is trimmable and both ends are being trimmed, each
	 * codepoint was counted in `left` AND in `right`, so left + right exceeds
	 * total_len. Clamp to zero instead of letting the size_t subtraction wrap. */
	const size_t trimmed = left + right;
	const size_t keep_len = (trimmed < total_len) ? total_len - trimmed : 0;

	return mb_get_substr(str, left, keep_len, enc);
}

/*
 * Trims `str` using the built-in default character list: ASCII whitespace and
 * NUL, plus further Unicode codepoints such as U+00A0, U+2000..U+200A and U+3000.
 *
 * The list is loaded into a temporary hash table keyed by codepoint, handed to
 * trim_each_wchar(), and the table is destroyed before returning.
 */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Read-only table of codepoints; static so it is not rebuilt on the stack
	 * on every call. */
	static const uint32_t trim_default_chars[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(trim_default_chars[0]);

	HashTable what_ht;
	zval val;
	ZVAL_TRUE(&val);

	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);

	/* Every codepoint appears exactly once in the table above, hence the
	 * _add_new variant. */
	for (size_t i = 0; i < trim_default_chars_length; i++) {
		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}

/*
 * Trims `str` using a caller-supplied character list.
 *
 * `what` is decoded with `enc` into codepoints, 128 at a time, and each
 * codepoint becomes an integer key of a temporary hash table. That table is
 * passed to trim_each_wchar() and destroyed before returning.
 */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t bytes_left = ZSTR_LEN(what);
	unsigned int decoder_state = 0;
	uint32_t codepoints[128];

	zval marker;
	ZVAL_TRUE(&marker);

	HashTable trim_set;
	zend_hash_init(&trim_set, bytes_left, NULL, NULL, false);

	while (bytes_left != 0) {
		size_t decoded = enc->to_wchar(&cursor, &bytes_left, codepoints, 128, &decoder_state);
		ZEND_ASSERT(decoded <= 128);

		/* `what` may repeat a character (e.g. "oo"), hence zend_hash_index_add
		 * rather than the _add_new variant. */
		for (const uint32_t *cp = codepoints; cp < codepoints + decoded; cp++) {
			zend_hash_index_add(&trim_set, *cp, &marker);
		}
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}

/*
 * Shared implementation behind mb_trim(), mb_ltrim() and mb_rtrim().
 *
 * PHP-level parameters (1 required, up to 3 accepted):
 *   str      - the string to trim
 *   what     - optional list of characters to strip; when it is omitted the
 *              built-in default list is used (mb_trim_default_chars)
 *   encoding - optional and nullable; resolved through php_mb_get_encoding()
 *
 * `mode` is supplied by the calling PHP_FUNCTION wrapper and selects which
 * end(s) of the string are trimmed.
 *
 * If php_mb_get_encoding() yields no encoding, the function leaves through
 * RETURN_THROWS(); otherwise the trimmed string is returned with RETURN_STR().
 * NOTE(review): the literal 3 passed to php_mb_get_encoding() is presumably the
 * argument position of $encoding used in the error message - confirm.
 */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
zend_string *str;
/* Stays NULL when the argument is omitted; checked below to pick the default list. */
zend_string *what = NULL;
zend_string *encoding = NULL;

ZEND_PARSE_PARAMETERS_START(1, 3)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_STR(what)
Z_PARAM_STR_OR_NULL(encoding)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, just thinking about this. Is it true that most mbstring functions take the encoding last as an optional parameter? If so, then this is good. 👍🏻

It just means that if a user wants to pass an explicit encoding, they have to pass the list of whitespace chars as well. If the order was (str, encoding, what), then the user could pass an explicit encoding, but use the default value of what.

What do you think is better?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(If this was already discussed at the RFC stage, then please disregard my comment.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just my 2c: The mbstring functions take encoding as last. If you don't want to pass what then you can use named arguments:
mb_trim("my string", encoding: "GB18030") for example.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @nielsdos said, I am assuming to use mb_trim("my string", encoding: "GB18030");

ZEND_PARSE_PARAMETERS_END();

const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
if (!enc) {
RETURN_THROWS();
}

if (what) {
RETURN_STR(mb_trim_what_chars(str, what, mode, enc));
} else {
RETURN_STR(mb_trim_default_chars(str, mode, enc));
}
}

/* mb_trim(): trims both ends of the string by delegating to php_do_mb_trim() with MB_BOTH_TRIM. */
PHP_FUNCTION(mb_trim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
}

/* mb_ltrim(): trims only the start of the string by delegating to php_do_mb_trim() with MB_LTRIM. */
PHP_FUNCTION(mb_ltrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
}

/* mb_rtrim(): trims only the end of the string by delegating to php_do_mb_trim() with MB_RTRIM. */
PHP_FUNCTION(mb_rtrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}

static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
Expand Down
6 changes: 6 additions & 0 deletions ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
/** @refcount 1 */
function mb_strtolower(string $string, ?string $encoding = null): string {}

// Strips every character listed in $characters from both ends of $string.
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

// Strips every character listed in $characters from the start of $string only.
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

// Strips every character listed in $characters from the end of $string only.
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}

/** @refcount 1 */
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}

Expand Down
18 changes: 17 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

125 changes: 125 additions & 0 deletions ext/mbstring/tests/mb_trim.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
--TEST--
mb_trim() function tests
--EXTENSIONS--
mbstring
--FILE--
<?php
mb_internal_encoding("UTF-8");

echo "== Copy from trim ==\n";
var_dump('ABC' === mb_trim('ABC'));
var_dump('ABC' === mb_ltrim('ABC'));
var_dump('ABC' === mb_rtrim('ABC'));
var_dump('ABC' === mb_trim(" \0\t\nABC \0\t\n"));
var_dump("ABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC" === mb_rtrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC \0\t\n" === mb_trim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_rtrim(" \0\t\nABC \0\t\n",''));
echo "== Empty string ==\n";
var_dump(mb_trim(""));
var_dump(mb_ltrim(""));
var_dump(mb_rtrim(""));

echo "== Single string ==\n";
var_dump(mb_ltrim(' test ', ''));
var_dump(mb_trim(" あいうえおあお ", " ", "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'ß', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'f', "UTF-8"));

echo "== Multi strings ==\n";
var_dump(mb_trim('foo BAR Spaß', 'ßf', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'fß', "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", " あ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", "あ ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "あa", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "\xe3", "UTF-8"));

echo "== Many strings ==\n";
var_dump(mb_trim(str_repeat(" ", 129)));
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));

echo "== mb_ltrim ==\n";
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
echo "== mb_rtrim ==\n";
var_dump(mb_rtrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));

echo "== default params ==\n";
var_dump(mb_trim(" \f\n\r\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"));

echo "== Byte Order Mark ==\n";
var_dump(mb_ltrim("\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));

echo "== Empty string ==\n";
var_dump(mb_trim(" abcd ", ""));
var_dump(mb_ltrim(" abcd ", ""));
var_dump(mb_rtrim(" abcd ", ""));

echo "== SJIS ==\n";
var_dump(mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));

echo "== Same strings ==\n";
var_dump(mb_trim("foo", "oo"));

echo "== \$encoding throws ValueError ==\n";
try {
var_dump(mb_trim( "\u{180F}", "", "NULL"));
} catch (ValueError $e) {
var_dump($e->getMessage());
}

?>
--EXPECT--
== Copy from trim ==
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
== Empty string ==
string(0) ""
string(0) ""
string(0) ""
== Single string ==
string(6) " test "
string(21) "あいうえおあお"
string(11) "foo BAR Spa"
string(12) "oo BAR Spaß"
== Multi strings ==
string(10) "oo BAR Spa"
string(10) "oo BAR Spa"
string(16) "いうおえお "
string(16) "いうおえお "
string(25) " あいうおえお  "
string(26) " あいうおえお  a"
== Many strings ==
string(0) ""
string(1) "a"
string(388) "                                                                                                                                 a"
== mb_ltrim ==
string(15) "いああああ"
== mb_rtrim ==
string(102) "あああああああああああああああああああああああああああああああああい"
== default params ==
string(0) ""
== Byte Order Mark ==
string(6) "漢字"
string(8) "226f575b"
string(8) "6f225b57"
== Empty string ==
string(6) " abcd "
string(6) " abcd "
string(6) " abcd "
== SJIS ==
string(3) "あ"
== Same strings ==
string(1) "f"
== $encoding throws ValueError ==
string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given"
4 changes: 2 additions & 2 deletions ext/mbstring/tests/mbregex_stack_limit2.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) {
?>
--FILE--
<?php
function mb_trim( $string, $chars = "", $chars_array = array() )
function mb_trim_regex( $string, $chars = "", $chars_array = array() )
{
for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) );
$encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) );
Expand All @@ -23,7 +23,7 @@ function mb_trim( $string, $chars = "", $chars_array = array() )
}

ini_set('mbstring.regex_stack_limit', 10000);
var_dump(mb_trim(str_repeat(' ', 10000)));
var_dump(mb_trim_regex(str_repeat(' ', 10000)));

echo 'OK';
?>
Expand Down