Skip to content

[RFC] Implement mb_str_pad() #11284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ PHP NEWS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
?? ??? ????, PHP 8.3.0alpha3

- MBString:
. Implement mb_str_pad() RFC. (nielsdos)

22 Jun 2023, PHP 8.3.0alpha2

Expand Down
4 changes: 4 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ PHP 8.3 UPGRADE NOTES
the given $depth and $options.
RFC: https://wiki.php.net/rfc/json_validate

- MBString:
. Added mb_str_pad(), which is the mbstring equivalent of str_pad().
RFC: https://wiki.php.net/rfc/mb_str_pad

- Posix:
. Added posix_sysconf call to get runtime information.
. Added posix_pathconf call to get configuration value from a directory/file.
Expand Down
126 changes: 126 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -5522,6 +5522,132 @@ PHP_FUNCTION(mb_chr)
}
/* }}} */

/* mb_str_pad(string $string, int $length, string $pad_string = " ",
 *            int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string
 *
 * Pads `input` with repetitions of `pad` until the result is `pad_to_length` characters long,
 * where characters are counted with mb_get_strlen() in the requested encoding.
 *
 * - If `pad_to_length` is negative or not larger than the character count of `input`,
 *   a copy of `input` is returned unchanged (and the pad arguments are not validated).
 * - Throws a ValueError for an empty pad string, for a pad type outside
 *   STR_PAD_LEFT..STR_PAD_BOTH, or (via php_mb_get_encoding) for an unknown encoding.
 * - Throws an Error ("String size overflow") if the resulting byte length cannot be represented.
 */
PHP_FUNCTION(mb_str_pad)
{
	/* The argument parser leaves `pad` untouched when the optional argument is not passed
	 * positionally, so the documented default of a single space must be supplied here;
	 * leaving it NULL would make the ZSTR_LEN(pad) check below dereference a NULL pointer.
	 * ZSTR_CHAR() yields an interned string, so it never needs to be released. */
	zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
	zend_long pad_to_length;
	zend_long pad_type_val = PHP_STR_PAD_RIGHT;

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(input)
		Z_PARAM_LONG(pad_to_length)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(pad)
		Z_PARAM_LONG(pad_type_val)
		Z_PARAM_STR_OR_NULL(encoding_str)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
	if (!encoding) {
		RETURN_THROWS();
	}

	size_t input_length = mb_get_strlen(input, encoding);

	/* If the requested length does not exceed the input's character count,
	 * we simply copy the input and return. */
	if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
		RETURN_STR_COPY(input);
	}

	if (ZSTR_LEN(pad) == 0) {
		zend_argument_value_error(3, "must be a non-empty string");
		RETURN_THROWS();
	}

	if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
		zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
		RETURN_THROWS();
	}

	size_t pad_length = mb_get_strlen(pad, encoding);

	/* pad_length is used as a divisor below. The byte-length check above does not by itself
	 * guarantee a non-zero character count.
	 * NOTE(review): presumably a pad consisting only of shift/escape sequences in a stateful
	 * encoding decodes to zero characters -- confirm; either way, never divide by zero. */
	if (UNEXPECTED(pad_length == 0)) {
		zend_argument_value_error(3, "must be a non-empty string");
		RETURN_THROWS();
	}

	/* Cannot underflow: the early return above guarantees pad_to_length > input_length. */
	size_t num_mb_pad_chars = pad_to_length - input_length;

	/* We need to figure out the left/right padding lengths (in characters). */
	size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
	switch (pad_type_val) {
		case PHP_STR_PAD_RIGHT:
			right_pad = num_mb_pad_chars;
			break;

		case PHP_STR_PAD_LEFT:
			left_pad = num_mb_pad_chars;
			break;

		case PHP_STR_PAD_BOTH:
			/* With an odd amount of padding the extra character goes to the right. */
			left_pad = num_mb_pad_chars / 2;
			right_pad = num_mb_pad_chars - left_pad;
			break;
	}

	/* How many full block copies need to happen, and how many characters are then left over? */
	size_t full_left_pad_copies = left_pad / pad_length;
	size_t full_right_pad_copies = right_pad / pad_length;
	size_t remaining_left_pad_chars = left_pad % pad_length;
	size_t remaining_right_pad_chars = right_pad % pad_length;

	/* Guard the copies * byte-length multiplications below against wrapping.
	 * Nothing has been allocated yet, so skip the release step on failure. */
	if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
		goto overflow_no_release;
	}

	/* Compute the number of bytes required for the padding */
	size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
	size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);

	/* No special fast-path handling necessary for zero-length pads because these functions will not
	 * allocate memory in case a zero-length pad is required. */
	zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
	zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);

	/* From here on the partial pad strings are owned by us and must be released on every path. */
	if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
		|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
		goto overflow;
	}

	size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
	size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);

	/* The first comparison makes the subtraction chain in the second one safe from underflow. */
	if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
		|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
		goto overflow;
	}

	zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
	char *buffer = ZSTR_VAL(result);

	/* First we pad the left. */
	for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
	}
	memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
	buffer += ZSTR_LEN(remaining_left_pad_str);

	/* Then we copy the input string. */
	memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
	buffer += ZSTR_LEN(input);

	/* Finally, we pad on the right. */
	for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
		memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
	}
	memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));

	ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';

	zend_string_release_ex(remaining_left_pad_str, false);
	zend_string_release_ex(remaining_right_pad_str, false);

	RETURN_NEW_STR(result);

overflow:
	zend_string_release_ex(remaining_left_pad_str, false);
	zend_string_release_ex(remaining_right_pad_str, false);
overflow_no_release:
	zend_throw_error(NULL, "String size overflow");
	RETURN_THROWS();
}

/* {{{ */
PHP_FUNCTION(mb_scrub)
{
Expand Down
2 changes: 2 additions & 0 deletions ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ function mb_ord(string $string, ?string $encoding = null): int|false {}

function mb_chr(int $codepoint, ?string $encoding = null): string|false {}

/* Multibyte counterpart of str_pad(): pads $string with $pad_string until it is $length
 * characters long, counting characters in $encoding. $pad_type must be STR_PAD_LEFT,
 * STR_PAD_RIGHT or STR_PAD_BOTH. If $length does not exceed the character count of
 * $string, $string is returned unchanged. */
function mb_str_pad(string $string, int $length, string $pad_string = " ", int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string {}

#ifdef HAVE_MBREGEX
/** @refcount 1 */
function mb_regex_encoding(?string $encoding = null): string|bool {}
Expand Down
12 changes: 11 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

138 changes: 138 additions & 0 deletions ext/mbstring/tests/mb_str_pad.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
--TEST--
mb_str_pad()
--EXTENSIONS--
mbstring
--FILE--
<?php

// Invalid arguments: each call is expected to throw a ValueError whose message is dumped.
echo "--- Error conditions ---\n";
// An empty pad string is rejected for every pad type.
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_RIGHT));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_LEFT));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
try {
var_dump(mb_str_pad('▶▶', 6, '', STR_PAD_BOTH));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
// A pad type that is none of the STR_PAD_* constants.
try {
var_dump(mb_str_pad('▶▶', 6, ' ', 123456));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
// An encoding name that does not exist.
try {
var_dump(mb_str_pad('▶▶', 6, ' ', STR_PAD_BOTH, 'unexisting'));
} catch (ValueError $e) {
var_dump($e->getMessage());
}

// Single-byte input with a two-character pad; covers whole and partial pad repetitions
// (padding by 2 and by 5 characters) for each pad type.
echo "--- Simple ASCII strings ---\n";
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_BOTH));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_BOTH));
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_LEFT));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_LEFT));
var_dump(mb_str_pad('Hello', 7, '+-', STR_PAD_RIGHT));
var_dump(mb_str_pad('World', 10, '+-', STR_PAD_RIGHT));

// Target lengths equal to, smaller than, zero, or negative relative to the
// 2-character input: the input must come back unchanged.
echo "--- Edge cases pad length ---\n";
var_dump(mb_str_pad('▶▶', 2, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', 1, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', 0, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('▶▶', -1, ' ', STR_PAD_BOTH));

// An empty input yields pure padding for positive lengths and "" otherwise.
echo "--- Empty input string ---\n";
var_dump(mb_str_pad('', 2, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', 1, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', 0, ' ', STR_PAD_BOTH));
var_dump(mb_str_pad('', -1, ' ', STR_PAD_BOTH));

// $pad_string is skipped via named arguments, so its default (a single space) is used.
echo "--- No default argument ---\n";
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_RIGHT));
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_LEFT));
var_dump(mb_str_pad('▶▶', 6, pad_type: STR_PAD_BOTH));

// Multibyte input and a 3-character multibyte pad, for target lengths 6 down to 1.
// Lengths 2 and 1 do not exceed the input length and must leave it untouched.
echo "--- UTF-8 emojis ---\n";
for ($i = 6; $i > 0; $i--) {
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_RIGHT));
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_LEFT));
var_dump(mb_str_pad('▶▶', $i, '❤❓❇', STR_PAD_BOTH));
}

// The same text is padded in three encodings; each result is converted back to UTF-8
// before dumping so the expected output is identical across encodings.
echo "--- UTF-8, 32, 7 test ---\n";

// Taken from mb_substr.phpt
$utf8 = "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь";
$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');
$tests = ["UTF-8" => $utf8, "UTF-32" => $utf32, "UTF-7" => $utf7];

foreach ($tests as $encoding => $test) {
// The pad string must be in the same encoding as the string being padded.
$pad_str = mb_convert_encoding('▶▶', $encoding, 'UTF-8');
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_RIGHT, $encoding), 'UTF-8', $encoding));
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_LEFT, $encoding), 'UTF-8', $encoding));
var_dump(mb_convert_encoding(mb_str_pad($test, 44, $pad_str, STR_PAD_BOTH, $encoding), 'UTF-8', $encoding));
}
?>
--EXPECT--
--- Error conditions ---
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
string(90) "mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH"
string(82) "mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given"
--- Simple ASCII strings ---
string(7) "+Hello+"
string(10) "+-World+-+"
string(7) "+-Hello"
string(10) "+-+-+World"
string(7) "Hello+-"
string(10) "World+-+-+"
--- Edge cases pad length ---
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
--- Empty input string ---
string(2) "  "
string(1) " "
string(0) ""
string(0) ""
--- No default argument ---
string(10) "▶▶    "
string(10) "    ▶▶"
string(10) "  ▶▶  "
--- UTF-8 emojis ---
string(18) "▶▶❤❓❇❤"
string(18) "❤❓❇❤▶▶"
string(18) "❤❓▶▶❤❓"
string(15) "▶▶❤❓❇"
string(15) "❤❓❇▶▶"
string(15) "❤▶▶❤❓"
string(12) "▶▶❤❓"
string(12) "❤❓▶▶"
string(12) "❤▶▶❤"
string(9) "▶▶❤"
string(9) "❤▶▶"
string(9) "▶▶❤"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
string(6) "▶▶"
--- UTF-8, 32, 7 test ---
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"