Skip to content

Commit 5f1477d

Browse files
committed
Optimize mb_strcut for fixed-byte-length text encodings
In microbenchmarks run on my dev machine, mb_strcut is now ~50% faster for fixed-byte-length text encodings like ASCII. (This is because the previous code performed an extra, unnecessary copy of the resulting output string.)
1 parent 58fc521 commit 5f1477d

File tree

2 files changed

+61
-5
lines changed

2 files changed

+61
-5
lines changed

ext/mbstring/mbstring.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2443,12 +2443,22 @@ PHP_FUNCTION(mb_strcut)
24432443

24442444
if (enc->cut) {
24452445
RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2446-
} else {
2447-
ret = mbfl_strcut(&string, &result, from, len);
2448-
ZEND_ASSERT(ret != NULL);
2449-
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2450-
efree(ret->val);
24512446
}
2447+
2448+
unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2449+
if (char_len) {
2450+
/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2451+
from &= -char_len;
2452+
if (len > string.len - from) {
2453+
len = string.len - from;
2454+
}
2455+
RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2456+
}
2457+
2458+
ret = mbfl_strcut(&string, &result, from, len);
2459+
ZEND_ASSERT(ret != NULL);
2460+
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2461+
efree(ret->val);
24522462
}
24532463
/* }}} */
24542464

ext/mbstring/tests/mb_strcut.phpt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,33 @@ print "== UHC ==\n";
225225

226226
print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n";
227227

228+
print "== ASCII ==\n";
229+
230+
print "Empty: [" . bin2hex(mb_strcut("ABC", 0, 0, "ASCII")) . "]\n";
231+
print "Empty: [" . bin2hex(mb_strcut("ABC", 1, 0, "ASCII")) . "]\n";
232+
print "Empty: [" . bin2hex(mb_strcut("ABC", 2, 0, "ASCII")) . "]\n";
233+
234+
print "One char: [" . bin2hex(mb_strcut("ABC", 2, 1, "ASCII")) . "]\n";
235+
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 2, "ASCII")) . "]\n";
236+
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 3, "ASCII")) . "]\n";
237+
238+
print "== UCS-2BE ==\n";
239+
240+
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 0, 0, "UCS-2BE")) . "]\n";
241+
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 0, "UCS-2BE")) . "]\n";
242+
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 0, "UCS-2BE")) . "]\n";
243+
244+
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 1, "UCS-2BE")) . "]\n";
245+
print "One char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 2, "UCS-2BE")) . "]\n";
246+
print "Cut in middle of following char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 3, "UCS-2BE")) . "]\n";
247+
print "Two chars: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 4, "UCS-2BE")) . "]\n";
248+
249+
print "== UCS-4BE ==\n";
250+
251+
print "From 1, Length 5: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 5, "UCS-4BE")) . "]\n";
252+
print "From 1, Length 6: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 6, "UCS-4BE")) . "]\n";
253+
print "From 1, Length 8: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 8, "UCS-4BE")) . "]\n";
254+
228255
?>
229256
--EXPECT--
230257
== EUC-JP ==
@@ -382,3 +409,22 @@ Invalid byte 0xF5: []
382409
Double-byte char: []
383410
== UHC ==
384411
Single byte 0x96: [96]
412+
== ASCII ==
413+
Empty: []
414+
Empty: []
415+
Empty: []
416+
One char: [43]
417+
Two chars: [4243]
418+
Two chars: [4243]
419+
== UCS-2BE ==
420+
Empty: []
421+
Empty: []
422+
Empty: []
423+
Empty: []
424+
One char: [0041]
425+
Cut in middle of following char: [0041]
426+
Two chars: [00410042]
427+
== UCS-4BE ==
428+
From 1, Length 5: [00000041]
429+
From 1, Length 6: [00000041]
430+
From 1, Length 8: [0000004100000042]

0 commit comments

Comments
 (0)