From 3a9f5c6c55dfbbcabe9a42d4f5031a50310269b5 Mon Sep 17 00:00:00 2001 From: nielsdos <7771979+nielsdos@users.noreply.github.com> Date: Sun, 14 May 2023 16:29:13 +0200 Subject: [PATCH] [RFC] Implement mb_str_pad() Closes GH-10203. --- NEWS | 2 + UPGRADING | 4 + ext/mbstring/mbstring.c | 126 ++++++++++++++++++++++++++ ext/mbstring/mbstring.stub.php | 2 + ext/mbstring/mbstring_arginfo.h | 12 ++- ext/mbstring/tests/mb_str_pad.phpt | 138 +++++++++++++++++++++++++++++ 6 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 ext/mbstring/tests/mb_str_pad.phpt diff --git a/NEWS b/NEWS index a147e14718d00..deb75af657ca0 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,8 @@ PHP NEWS ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ?? ??? ????, PHP 8.3.0alpha3 +- MBString: + . Implement mb_str_pad() RFC. (nielsdos) 22 Jun 2023, PHP 8.3.0alpha2 diff --git a/UPGRADING b/UPGRADING index 5d06a74ce4b3f..df3d0396620d4 100644 --- a/UPGRADING +++ b/UPGRADING @@ -208,6 +208,10 @@ PHP 8.3 UPGRADE NOTES the given $depth and $options. RFC: https://wiki.php.net/rfc/json_validate +- MBString: + . Added mb_str_pad(), which is the mbstring equivalent of str_pad(). + RFC: https://wiki.php.net/rfc/mb_str_pad + - Posix: . Added posix_sysconf call to get runtime informations. . Added posix_pathconf call to get configuration value from a directory/file. 
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index e052d8e207815..d2fe1a7845a62 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -5522,6 +5522,142 @@ PHP_FUNCTION(mb_chr)
 }
 /* }}} */
 
+/* Multibyte-aware counterpart of str_pad(): pads `string` with `pad_string` until it is
+ * `length` characters long, where characters are counted in `encoding`.
+ *
+ * - Returns the input unchanged if `length` is negative or not larger than the input's
+ *   character count; `pad_string` and `pad_type` are only validated past that point.
+ * - Raises a ValueError for an unknown encoding, an empty `pad_string` or an invalid
+ *   `pad_type`, and throws "String size overflow" if the result would be too large.
+ * - STR_PAD_BOTH puts the smaller half on the left when the padding is uneven. */
+PHP_FUNCTION(mb_str_pad)
+{
+	/* The stub declares " " as the default for $pad_string. Z_PARAM_STR() leaves `pad`
+	 * untouched when the argument is omitted, so it must start out as a valid string. */
+	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
+	zend_long pad_to_length;
+	zend_long pad_type_val = PHP_STR_PAD_RIGHT;
+
+	ZEND_PARSE_PARAMETERS_START(2, 5)
+		Z_PARAM_STR(input)
+		Z_PARAM_LONG(pad_to_length)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_STR(pad)
+		Z_PARAM_LONG(pad_type_val)
+		Z_PARAM_STR_OR_NULL(encoding_str)
+	ZEND_PARSE_PARAMETERS_END();
+
+	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
+	if (!encoding) {
+		RETURN_THROWS();
+	}
+
+	size_t input_length = mb_get_strlen(input, encoding);
+
+	/* If resulting string turns out to be shorter than input string,
+	   we simply copy the input and return. */
+	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
+		RETURN_STR_COPY(input);
+	}
+
+	if (ZSTR_LEN(pad) == 0) {
+		zend_argument_value_error(3, "must be a non-empty string");
+		RETURN_THROWS();
+	}
+
+	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
+		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
+		RETURN_THROWS();
+	}
+
+	size_t pad_length = mb_get_strlen(pad, encoding);
+
+	size_t num_mb_pad_chars = pad_to_length - input_length;
+
+	/* We need to figure out the left/right padding lengths. */
+	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
+	switch (pad_type_val) {
+		case PHP_STR_PAD_RIGHT:
+			right_pad = num_mb_pad_chars;
+			break;
+
+		case PHP_STR_PAD_LEFT:
+			left_pad = num_mb_pad_chars;
+			break;
+
+		case PHP_STR_PAD_BOTH:
+			left_pad = num_mb_pad_chars / 2;
+			right_pad = num_mb_pad_chars - left_pad;
+			break;
+	}
+
+	/* How many full block copies need to happen, and how many characters are then left over? */
+	size_t full_left_pad_copies = left_pad / pad_length;
+	size_t full_right_pad_copies = right_pad / pad_length;
+	size_t remaining_left_pad_chars = left_pad % pad_length;
+	size_t remaining_right_pad_chars = right_pad % pad_length;
+
+	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
+		goto overflow_no_release;
+	}
+
+	/* Compute the number of bytes required for the padding */
+	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
+	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
+
+	/* No special fast-path handling necessary for zero-length pads because these functions will not
+	 * allocate memory in case a zero-length pad is required. */
+	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
+	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
+
+	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
+		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
+		goto overflow;
+	}
+
+	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
+	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
+
+	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
+		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
+		goto overflow;
+	}
+
+	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
+	char *buffer = ZSTR_VAL(result);
+
+	/* First we pad the left. */
+	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
+		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
+	}
+	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
+	buffer += ZSTR_LEN(remaining_left_pad_str);
+
+	/* Then we copy the input string. */
+	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
+	buffer += ZSTR_LEN(input);
+
+	/* Finally, we pad on the right. */
+	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
+		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
+	}
+	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
+
+	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
+
+	zend_string_release_ex(remaining_left_pad_str, false);
+	zend_string_release_ex(remaining_right_pad_str, false);
+
+	RETURN_NEW_STR(result);
+
+overflow:
+	zend_string_release_ex(remaining_left_pad_str, false);
+	zend_string_release_ex(remaining_right_pad_str, false);
+overflow_no_release:
+	zend_throw_error(NULL, "String size overflow");
+	RETURN_THROWS();
+}
+
 /* {{{ */
 PHP_FUNCTION(mb_scrub)
 {
diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php
index add0a35e05b80..6c31ff492105f 100644
--- a/ext/mbstring/mbstring.stub.php
+++ b/ext/mbstring/mbstring.stub.php
@@ -183,6 +183,8 @@ function mb_ord(string $string, ?string $encoding = null): int|false {}
 
 function mb_chr(int $codepoint, ?string $encoding = null): string|false {}
 
+function mb_str_pad(string $string, int $length, string $pad_string = " ", int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string {}
+
 #ifdef HAVE_MBREGEX
 /** @refcount 1 */
 function mb_regex_encoding(?string $encoding = null): string|bool {}
diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h
index e8985793e380f..e964a83c118b9 100644
--- a/ext/mbstring/mbstring_arginfo.h
+++ b/ext/mbstring/mbstring_arginfo.h
@@ -1,5 +1,5 @@
 /* This is a generated file, edit the .stub.php file instead.
- * Stub hash: 26a027093075613056921c4d1a7eee65d52ec5eb */
+ * Stub hash: 141073d610f862b525406fb7f48ac58b6691080e */
 
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL)
 	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null")
@@ -198,6 +198,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_chr, 0, 1, MAY_BE_STRING|MAY_
 	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null")
 ZEND_END_ARG_INFO()
 
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_str_pad, 0, 2, IS_STRING, 0)
+	ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
+	ZEND_ARG_TYPE_INFO(0, length, IS_LONG, 0)
+	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, pad_string, IS_STRING, 0, "\" \"")
+	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, pad_type, IS_LONG, 0, "STR_PAD_RIGHT")
+	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null")
+ZEND_END_ARG_INFO()
+
 #if defined(HAVE_MBREGEX)
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_regex_encoding, 0, 0, MAY_BE_STRING|MAY_BE_BOOL)
 	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null")
@@ -346,6 +354,7 @@ ZEND_FUNCTION(mb_check_encoding);
 ZEND_FUNCTION(mb_scrub);
 ZEND_FUNCTION(mb_ord);
 ZEND_FUNCTION(mb_chr);
+ZEND_FUNCTION(mb_str_pad);
 #if defined(HAVE_MBREGEX)
 ZEND_FUNCTION(mb_regex_encoding);
 #endif
@@ -440,6 +449,7 @@ static const zend_function_entry ext_functions[] = {
 	ZEND_FE(mb_scrub, arginfo_mb_scrub)
 	ZEND_FE(mb_ord, arginfo_mb_ord)
 	ZEND_FE(mb_chr, arginfo_mb_chr)
+	ZEND_FE(mb_str_pad, arginfo_mb_str_pad)
 #if defined(HAVE_MBREGEX)
 	ZEND_FE(mb_regex_encoding, arginfo_mb_regex_encoding)
 #endif
diff --git a/ext/mbstring/tests/mb_str_pad.phpt b/ext/mbstring/tests/mb_str_pad.phpt
new file mode 100644
index 0000000000000..136938eb2604d
--- /dev/null
+++ b/ext/mbstring/tests/mb_str_pad.phpt
@@ -0,0 +1,138 @@
+--TEST--
+mb_str_pad()
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+
+echo "--- Error conditions ---\n";
+try {
+    var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_RIGHT));
+} catch (ValueError $e) {
+    var_dump($e->getMessage());
+}
+try {
+    var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_LEFT));
+} catch (ValueError $e) {
+    var_dump($e->getMessage());
+}
+try {
+    var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_BOTH));
+} catch (ValueError $e) {
+    var_dump($e->getMessage());
+}
+try {
+    var_dump(mb_str_pad('▶▶', 6, ' ', 123456));
+} catch (ValueError $e) {
+    var_dump($e->getMessage());
+}
+try {
+    var_dump(mb_str_pad('▶▶', 6, ' ', STR_PAD_BOTH, 'unexisting'));
+} catch (ValueError $e) {
+    var_dump($e->getMessage());
+}
+
+echo "--- Simple ASCII strings ---\n";
+var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_BOTH));
+var_dump(mb_str_pad('World', 10, '+-', STR_PAD_BOTH));
+var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_LEFT));
+var_dump(mb_str_pad('World', 10, '+-', STR_PAD_LEFT));
+var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_RIGHT));
+var_dump(mb_str_pad('World', 10, '+-', STR_PAD_RIGHT));
+
+echo "--- Edge cases pad length ---\n";
+var_dump(mb_str_pad('▶▶', 2, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('▶▶', 1, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('▶▶', 0, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('▶▶', -1, ' ', STR_PAD_BOTH));
+
+echo "--- Empty input string ---\n";
+var_dump(mb_str_pad('', 2, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('', 1, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('', 0, ' ', STR_PAD_BOTH));
+var_dump(mb_str_pad('', -1, ' ', STR_PAD_BOTH));
+
+echo "--- No default argument ---\n";
+var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_RIGHT));
+var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_LEFT));
+var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_BOTH));
+
+echo "--- UTF-8 emojis ---\n";
+for ($i = 6; $i > 0; $i--) {
+    var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_RIGHT));
+    var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_LEFT));
+    var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_BOTH));
+}
+
+echo "--- UTF-8, 32, 7 test ---\n";
+
+// Taken from mb_substr.phpt
+$utf8 = "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь";
+$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
+$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');
+$tests = ["UTF-8" => $utf8, "UTF-32" => $utf32, "UTF-7" => $utf7];
+
+foreach ($tests as $encoding => $test) {
+    $pad_str = mb_convert_encoding('▶▶', $encoding, 'UTF-8');
+    var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_RIGHT, $encoding), 'UTF-8', $encoding));
+    var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_LEFT, $encoding), 'UTF-8', $encoding));
+    var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_BOTH, $encoding), 'UTF-8', $encoding));
+}
+?>
+--EXPECT--
+--- Error conditions ---
+string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
+string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
+string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
+string(90) "mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH"
+string(82) "mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given"
+--- Simple ASCII strings ---
+string(7) "+Hello+"
+string(10) "+-World+-+"
+string(7) "+-Hello"
+string(10) "+-+-+World"
+string(7) "Hello+-"
+string(10) "World+-+-+"
+--- Edge cases pad length ---
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+--- Empty input string ---
+string(2) "  "
+string(1) " "
+string(0) ""
+string(0) ""
+--- No default argument ---
+string(10) "▶▶    "
+string(10) "    ▶▶"
+string(10) "  ▶▶  "
+--- UTF-8 emojis ---
+string(18) "▶▶❤❓❇❤"
+string(18) "❤❓❇❤▶▶"
+string(18) "❤❓▶▶❤❓"
+string(15) "▶▶❤❓❇"
+string(15) "❤❓❇▶▶"
+string(15) "❤▶▶❤❓"
+string(12) "▶▶❤❓"
+string(12) "❤❓▶▶"
+string(12) "❤▶▶❤"
+string(9) "▶▶❤"
+string(9) "❤▶▶"
+string(9) "▶▶❤"
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+string(6) "▶▶"
+--- UTF-8, 32, 7 test ---
+string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
+string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
+string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
+string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
+string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
+string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
+string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
+string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
+string(92) "▶Σὲ γνωρίζω ἀпὸ τὴν κόψη Зарегистрируйтесь▶▶"