Skip to content

Commit d77ad27

Browse files
legalenikic
authored and committed
Implement mb_str_split()
RFC: https://wiki.php.net/rfc/mb_str_split
1 parent 083cfc0 commit d77ad27

File tree

6 files changed

+408
-0
lines changed

6 files changed

+408
-0
lines changed

UPGRADING

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,11 @@ PHP 7.4 UPGRADE NOTES
114114
native variables and create/access data structures defined in C libraries.
115115
RFC: https://wiki.php.net/rfc/ffi
116116

117+
- Mbstring:
118+
. Added mb_str_split() function, which provides the same functionality as
119+
str_split(), but operating on code points rather than bytes.
120+
RFC: https://wiki.php.net/rfc/mb_str_split
121+
117122
- OPcache:
118123
. Support for preloading code has been added.
119124
RFC: https://wiki.php.net/rfc/preload

ext/mbstring/mbstring.c

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,12 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_output_handler, 0, 0, 2)
229229
ZEND_ARG_INFO(0, status)
230230
ZEND_END_ARG_INFO()
231231

232+
/* Argument info for mb_str_split(str [, split_length [, encoding]]); all three arguments are passed by value. */
ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_str_split, 0, 0, 1)
233+
ZEND_ARG_INFO(0, str)
234+
ZEND_ARG_INFO(0, split_length)
235+
ZEND_ARG_INFO(0, encoding)
236+
ZEND_END_ARG_INFO()
237+
232238
ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_strlen, 0, 0, 1)
233239
ZEND_ARG_INFO(0, str)
234240
ZEND_ARG_INFO(0, encoding)
@@ -526,6 +532,7 @@ static const zend_function_entry mbstring_functions[] = {
526532
PHP_FE(mb_parse_str, arginfo_mb_parse_str)
527533
PHP_FE(mb_output_handler, arginfo_mb_output_handler)
528534
PHP_FE(mb_preferred_mime_name, arginfo_mb_preferred_mime_name)
535+
PHP_FE(mb_str_split, arginfo_mb_str_split)
529536
PHP_FE(mb_strlen, arginfo_mb_strlen)
530537
PHP_FE(mb_strpos, arginfo_mb_strpos)
531538
PHP_FE(mb_strrpos, arginfo_mb_strrpos)
@@ -2273,6 +2280,169 @@ PHP_FUNCTION(mb_output_handler)
22732280
}
22742281
/* }}} */
22752282

2283+
/* {{{ proto array mb_str_split(string str [, int split_length] [, string encoding])
2284+
Convert a multibyte string to an array. If split_length is specified,
2285+
break the string down into chunks each split_length characters long. */
2286+
2287+
/* State shared between PHP_FUNCTION(mb_str_split) and its mbfl_split_output()
   callback; it is handed to the callback through the filter's opaque data pointer. */
2288+
struct mbfl_split_params {
2289+
zval *return_value; /* PHP array (the function's return value) that receives every finished chunk */
2290+
mbfl_string *result_string; /* scratch mbfl_string that temporarily owns the bytes of the chunk being emitted */
2291+
size_t mb_chunk_length; /* number of characters accumulated in the current chunk so far */
2292+
size_t split_length; /* requested chunk size, in characters */
2293+
mbfl_convert_filter *next_filter; /* wchar -> target encoding converter; its data pointer is the memory device collecting the chunk bytes */
2294+
};
2295+
2296+
/* callback function to fill split array */
2297+
static int mbfl_split_output(int c, void *data)
2298+
{
2299+
struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
2300+
2301+
(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
2302+
2303+
if(params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
2304+
mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
2305+
mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
2306+
mbfl_string *chunk = params->result_string;
2307+
mbfl_memory_device_result(device, chunk); /* make chunk */
2308+
add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
2309+
efree(chunk->val);
2310+
params->mb_chunk_length = 0; /* reset mb_chunk size */
2311+
}
2312+
return 0;
2313+
}
2314+
2315+
PHP_FUNCTION(mb_str_split)
2316+
{
2317+
zend_string *str, *encoding = NULL;
2318+
size_t mb_len, chunks, chunk_len;
2319+
const char *p, *last; /* pointer for the string cursor and last string char */
2320+
mbfl_string string, result_string;
2321+
const mbfl_encoding *mbfl_encoding;
2322+
zend_long split_length = 1;
2323+
2324+
ZEND_PARSE_PARAMETERS_START(1, 3)
2325+
Z_PARAM_STR(str)
2326+
Z_PARAM_OPTIONAL
2327+
Z_PARAM_LONG(split_length)
2328+
Z_PARAM_STR(encoding)
2329+
ZEND_PARSE_PARAMETERS_END();
2330+
2331+
if (split_length <= 0) {
2332+
php_error_docref(NULL, E_WARNING, "The length of each segment must be greater than zero");
2333+
RETURN_FALSE;
2334+
}
2335+
2336+
/* fill mbfl_string structure */
2337+
string.val = (unsigned char *) ZSTR_VAL(str);
2338+
string.len = ZSTR_LEN(str);
2339+
string.no_language = MBSTRG(language);
2340+
string.encoding = php_mb_get_encoding(encoding);
2341+
if (!string.encoding) {
2342+
RETURN_FALSE;
2343+
}
2344+
2345+
p = ZSTR_VAL(str); /* string cursor pointer */
2346+
last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
2347+
2348+
mbfl_encoding = string.encoding;
2349+
2350+
/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
2351+
if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
2352+
mb_len = string.len;
2353+
chunk_len = (size_t)split_length; /* chunk length in bytes */
2354+
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
2355+
mb_len = string.len / 2;
2356+
chunk_len = split_length * 2;
2357+
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
2358+
mb_len = string.len / 4;
2359+
chunk_len = split_length * 4;
2360+
} else if (mbfl_encoding->mblen_table != NULL) {
2361+
/* second scenario: variable width encodings with length table */
2362+
char unsigned const *mbtab = mbfl_encoding->mblen_table;
2363+
2364+
/* assume that we have 1-bytes characters */
2365+
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
2366+
2367+
while (p < last) { /* split cycle work until the cursor has reached the last byte */
2368+
char const *chunk_p = p; /* chunk first byte pointer */
2369+
chunk_len = 0; /* chunk length in bytes */
2370+
for (zend_long char_count = 0; char_count < split_length && p < last; ++char_count) {
2371+
char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
2372+
chunk_len += m;
2373+
p += m;
2374+
}
2375+
if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
2376+
add_next_index_stringl(return_value, chunk_p, chunk_len);
2377+
}
2378+
return;
2379+
} else {
2380+
/* third scenario: other multibyte encodings */
2381+
mbfl_convert_filter *filter, *decoder;
2382+
2383+
/* assume that we have 1-bytes characters */
2384+
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
2385+
2386+
/* decoder filter to decode wchar to encoding */
2387+
mbfl_memory_device device;
2388+
mbfl_memory_device_init(&device, split_length + 1, 0);
2389+
2390+
decoder = mbfl_convert_filter_new(
2391+
&mbfl_encoding_wchar,
2392+
string.encoding,
2393+
mbfl_memory_device_output,
2394+
NULL,
2395+
&device);
2396+
/* if something wrong with the decoded */
2397+
if (decoder == NULL) {
2398+
RETURN_FALSE;
2399+
}
2400+
2401+
/* wchar filter */
2402+
mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
2403+
struct mbfl_split_params params = { /* init callback function params structure */
2404+
.return_value = return_value,
2405+
.result_string = &result_string,
2406+
.mb_chunk_length = 0,
2407+
.split_length = (size_t)split_length,
2408+
.next_filter = decoder,
2409+
};
2410+
2411+
filter = mbfl_convert_filter_new(
2412+
string.encoding,
2413+
&mbfl_encoding_wchar,
2414+
mbfl_split_output,
2415+
NULL,
2416+
&params);
2417+
/* if something wrong with the filter */
2418+
if (filter == NULL){
2419+
mbfl_convert_filter_delete(decoder); /* this will free allocated memory for the decoded */
2420+
RETURN_FALSE;
2421+
}
2422+
2423+
while (p < last - 1) { /* cycle each byte except last with callback function */
2424+
(*filter->filter_function)(*p++, filter);
2425+
}
2426+
params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
2427+
(*filter->filter_function)(*p++, filter); /*process last char */
2428+
2429+
mbfl_convert_filter_delete(decoder);
2430+
mbfl_convert_filter_delete(filter);
2431+
return;
2432+
}
2433+
2434+
/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
2435+
chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
2436+
array_init_size(return_value, chunks);
2437+
if (chunks != 0) {
2438+
for (zend_long i = 0; i < chunks - 1; p += chunk_len, ++i) {
2439+
add_next_index_stringl(return_value, p, chunk_len);
2440+
}
2441+
add_next_index_stringl(return_value, p, last - p);
2442+
}
2443+
}
2444+
/* }}} */
2445+
22762446
/* {{{ proto int mb_strlen(string str [, string encoding])
22772447
Get character numbers of a string */
22782448
PHP_FUNCTION(mb_strlen)

ext/mbstring/mbstring.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ PHP_FUNCTION(mb_substitute_character);
7878
PHP_FUNCTION(mb_preferred_mime_name);
7979
PHP_FUNCTION(mb_parse_str);
8080
PHP_FUNCTION(mb_output_handler);
81+
PHP_FUNCTION(mb_str_split);
8182
PHP_FUNCTION(mb_strlen);
8283
PHP_FUNCTION(mb_strpos);
8384
PHP_FUNCTION(mb_strrpos);
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
--TEST--
2+
mb_str_split() tests for the japanese language
3+
--SKIPIF--
4+
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5+
--INI--
6+
output_handler=
7+
mbstring.func_overload=0
8+
--FILE--
9+
<?php
/* Converts a two-character Japanese sample into several encodings and checks
   both the number of chunks mb_str_split() returns and the bytes of each chunk. */
10+
ini_set('include_path','.');
11+
include_once('common.inc');
12+
13+
$string = "日本"; /* 2 chars */
14+
$len = 2; /* number of characters in $string */
15+
$charset = [
16+
"BIG-5",
17+
"EUC-JP",
18+
"ISO-2022-JP",
19+
"SJIS",
20+
"UTF-16BE",
21+
"UTF-16LE",
22+
"UTF-32BE",
23+
"UTF-32LE",
24+
"UTF-8"
25+
];
26+
27+
28+
/* for every encoding: convert the UTF-8 sample, then split it character by character */
foreach($charset as $cs){
29+
$enc = mb_convert_encoding($string, $cs, "UTF-8");
30+
$split = mb_str_split($enc, 1, $cs);
31+
32+
/* check chunks number: every split length from 1 to $len must give ceil($len / $i) chunks */
33+
for($i = 1; $i <= $len; ++$i){
34+
$ceil = ceil($len / $i);
35+
$cnt = count(mb_str_split($enc,$i,$cs));
36+
if($ceil != $cnt){
37+
echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
38+
}
39+
}
40+
41+
/* check content: print every single-character chunk as a hex string */
42+
echo "$cs:";
43+
for($i = 0; $i < $len; ++$i){
44+
echo " " . unpack("H*", $split[$i])[1];
45+
}
46+
echo "\n";
47+
}
48+
49+
/* long string test: split a long ISO-2022-JP string into chunks of $len characters */
50+
$size = 50000;
51+
$long = str_repeat($string, $size); /* 50k x 2 chars = 1e5 chars */
52+
$enc = mb_convert_encoding($long, "ISO-2022-JP", "UTF-8");
53+
$array = mb_str_split($enc, $len, "ISO-2022-JP");
54+
$count = count($array);
55+
56+
/* check array size: each chunk holds one copy of $string, so $size chunks are expected */
57+
if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
58+
59+
/* compare initial string and last array element after splitting */
60+
$enc = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8");
61+
if(end($array) !== $enc){
62+
printf("Long string splitting error:
63+
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
64+
}
65+
66+
?>
67+
--EXPECT--
68+
BIG-5: a4e9 a5bb
69+
EUC-JP: c6fc cbdc
70+
ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
71+
SJIS: 93fa 967b
72+
UTF-16BE: 65e5 672c
73+
UTF-16LE: e565 2c67
74+
UTF-32BE: 000065e5 0000672c
75+
UTF-32LE: e5650000 2c670000
76+
UTF-8: e697a5 e69cac
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
--TEST--
2+
mb_str_split() tests for the russian language
3+
--SKIPIF--
4+
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5+
--INI--
6+
output_handler=
7+
mbstring.func_overload=0
8+
--FILE--
9+
<?php
/* Converts a twelve-character Russian sample into several encodings and checks
   both the number of chunks mb_str_split() returns and the bytes of each chunk. */
10+
ini_set('include_path','.');
11+
include_once('common.inc');
12+
13+
$string = "рай рай рай "; /* 12 chars */
14+
$len = 12; /* number of characters in $string */
15+
$charset = [
16+
"EUC-JP",
17+
"CP866",
18+
"KOI8-R",
19+
"UTF-16BE",
20+
"UTF-16LE",
21+
"UTF-32BE",
22+
"UTF-32LE",
23+
"UTF-8"
24+
];
25+
26+
27+
/* for every encoding: convert the UTF-8 sample, then split it character by character */
foreach($charset as $cs){
28+
$enc = mb_convert_encoding($string, $cs, "UTF-8");
29+
$split = mb_str_split($enc, 1, $cs);
30+
31+
32+
/* check chunks number: every split length from 1 to $len must give ceil($len / $i) chunks */
33+
for($i = 1; $i <= $len; ++$i){
34+
$ceil = ceil($len / $i);
35+
$cnt = count(mb_str_split($enc,$i,$cs));
36+
if($ceil != $cnt){
37+
echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
38+
}
39+
}
40+
41+
/* check content: print every single-character chunk as a hex string */
42+
echo "$cs:";
43+
for($i = 0; $i < $len; ++$i){
44+
echo " " . unpack("H*", $split[$i])[1];
45+
}
46+
echo "\n";
47+
}
48+
49+
/* long string test: split a long EUC-JP string into chunks of $len characters */
50+
$size = 25000;
51+
$long = str_repeat($string, $size); /* 25k x 12 chars = 3e5 chars */
52+
$enc = mb_convert_encoding($long, "EUC-JP", "UTF-8");
53+
$array = mb_str_split($enc, $len, "EUC-JP");
54+
$count = count($array);
55+
56+
/* check array size: each chunk holds one copy of $string, so $size chunks are expected */
57+
if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
58+
59+
/* compare initial string and last array element after splitting */
60+
$enc = mb_convert_encoding($string, "EUC-JP", "UTF-8");
61+
if(end($array) !== $enc){
62+
printf("Long string splitting error:
63+
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
64+
}
65+
66+
?>
67+
--EXPECT--
68+
EUC-JP: a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20
69+
CP866: e0 a0 a9 20 e0 a0 a9 20 e0 a0 a9 20
70+
KOI8-R: d2 c1 ca 20 d2 c1 ca 20 d2 c1 ca 20
71+
UTF-16BE: 0440 0430 0439 0020 0440 0430 0439 0020 0440 0430 0439 0020
72+
UTF-16LE: 4004 3004 3904 2000 4004 3004 3904 2000 4004 3004 3904 2000
73+
UTF-32BE: 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020
74+
UTF-32LE: 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000
75+
UTF-8: d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 d180 d0b0 d0b9 20

0 commit comments

Comments
 (0)