Skip to content

Commit b9cd1cd

Browse files
committed
Implement mb_substr_count using fast text conversion filters
The performance gain from this change depends on the text encoding and input string size. For very small strings, other overheads tend to swamp the performance gains to some extent, such that the speedup is less than 2x. For medium-length strings (~100 bytes or so), the speedup is typically around 2.5x. The greatest performance gains are for UTF-8 strings which have already been marked as valid (using the GC flags on the zend_string object); for those, the speedup is more than 10x in many cases. The previous implementation first converted the haystack and needle to wchars, then searched for matches between the two sequences of wchars. Because we use -1 as an error marker when converting to wchars, error markers from invalid byte sequences in the haystack would match error markers from invalid byte sequences in the needle, even if the specific invalid byte sequences were different. I am not sure whether this behavior is really desirable or not, but in any case, this new implementation follows the same behavior so as not to cause backward-compatibility (BC) breaks.
1 parent f9a1a90 commit b9cd1cd

File tree

5 files changed

+132
-175
lines changed

5 files changed

+132
-175
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 0 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -429,138 +429,6 @@ const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_enco
429429
return enc;
430430
}
431431

432-
/*
433-
* strpos
434-
*/
435-
struct collector_strpos_data {
436-
mbfl_convert_filter *next_filter;
437-
mbfl_wchar_device needle;
438-
size_t needle_len;
439-
size_t start;
440-
size_t output;
441-
size_t found_pos;
442-
size_t needle_pos;
443-
size_t matched_pos;
444-
};
445-
446-
static int
447-
collector_strpos(int c, void* data)
448-
{
449-
int *p, *h, *m;
450-
ssize_t n;
451-
struct collector_strpos_data *pc = (struct collector_strpos_data*)data;
452-
453-
if (pc->output >= pc->start) {
454-
if (c == (int)pc->needle.buffer[pc->needle_pos]) {
455-
if (pc->needle_pos == 0) {
456-
pc->found_pos = pc->output; /* found position */
457-
}
458-
pc->needle_pos++; /* needle pointer */
459-
if (pc->needle_pos >= pc->needle_len) {
460-
pc->matched_pos = pc->found_pos; /* matched position */
461-
pc->needle_pos--;
462-
goto retry;
463-
}
464-
} else if (pc->needle_pos != 0) {
465-
retry:
466-
h = (int *)pc->needle.buffer;
467-
h++;
468-
for (;;) {
469-
pc->found_pos++;
470-
p = h;
471-
m = (int *)pc->needle.buffer;
472-
n = pc->needle_pos - 1;
473-
while (n > 0 && *p == *m) {
474-
n--;
475-
p++;
476-
m++;
477-
}
478-
if (n <= 0) {
479-
if (*m != c) {
480-
pc->needle_pos = 0;
481-
}
482-
break;
483-
} else {
484-
h++;
485-
pc->needle_pos--;
486-
}
487-
}
488-
}
489-
}
490-
491-
pc->output++;
492-
return 0;
493-
}
494-
495-
/*
496-
* substr_count
497-
*/
498-
499-
size_t
500-
mbfl_substr_count(
501-
mbfl_string *haystack,
502-
mbfl_string *needle
503-
)
504-
{
505-
size_t n, result = 0;
506-
unsigned char *p;
507-
mbfl_convert_filter *filter;
508-
struct collector_strpos_data pc;
509-
510-
/* needle is converted into wchar */
511-
mbfl_wchar_device_init(&pc.needle);
512-
filter = mbfl_convert_filter_new(
513-
needle->encoding,
514-
&mbfl_encoding_wchar,
515-
mbfl_wchar_device_output, 0, &pc.needle);
516-
ZEND_ASSERT(filter);
517-
mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
518-
mbfl_convert_filter_flush(filter);
519-
mbfl_convert_filter_delete(filter);
520-
pc.needle_len = pc.needle.pos;
521-
if (pc.needle.buffer == NULL) {
522-
return MBFL_ERROR_ENCODING;
523-
}
524-
if (pc.needle_len == 0) {
525-
mbfl_wchar_device_clear(&pc.needle);
526-
return MBFL_ERROR_EMPTY;
527-
}
528-
/* initialize filter and collector data */
529-
filter = mbfl_convert_filter_new(
530-
haystack->encoding,
531-
&mbfl_encoding_wchar,
532-
collector_strpos, 0, &pc);
533-
ZEND_ASSERT(filter);
534-
pc.start = 0;
535-
pc.output = 0;
536-
pc.needle_pos = 0;
537-
pc.found_pos = 0;
538-
pc.matched_pos = MBFL_ERROR_NOT_FOUND;
539-
540-
/* feed data */
541-
p = haystack->val;
542-
n = haystack->len;
543-
if (p != NULL) {
544-
while (n > 0) {
545-
if ((*filter->filter_function)(*p++, filter) < 0) {
546-
pc.matched_pos = MBFL_ERROR_ENCODING;
547-
break;
548-
}
549-
if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
550-
++result;
551-
pc.matched_pos = MBFL_ERROR_NOT_FOUND;
552-
pc.needle_pos = 0;
553-
}
554-
n--;
555-
}
556-
}
557-
mbfl_convert_filter_flush(filter);
558-
mbfl_convert_filter_delete(filter);
559-
mbfl_wchar_device_clear(&pc.needle);
560-
561-
return result;
562-
}
563-
564432
/*
565433
* strcut
566434
*/

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,11 @@
112112
#define MBFL_VERSION_MINOR 3
113113
#define MBFL_VERSION_TEENY 2
114114

115-
/*
116-
* convert filter
117-
*/
118115
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
119116
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
120117
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
121118
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
119+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4 /* For internal use only; deliberately uses invalid UTF-8 byte sequence as error marker */
122120

123121
/*
124122
* convenience macros
@@ -195,12 +193,6 @@ static inline int mbfl_is_error(size_t len) {
195193
#define MBFL_ERROR_EMPTY ((size_t) -8)
196194
#define MBFL_ERROR_OFFSET ((size_t) -16)
197195

198-
/*
199-
* substr_count
200-
*/
201-
MBFLAPI extern size_t
202-
mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle);
203-
204196
/*
205197
* If specified as length, the substr until the end of the string is taken.
206198
*/

ext/mbstring/libmbfl/mbfl/mbfl_convert.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,9 +394,15 @@ static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err
394394
{
395395
uint32_t *start = out;
396396

397-
if (bad_cp == MBFL_BAD_INPUT && err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
398-
*out++ = replacement_char;
397+
if (bad_cp == MBFL_BAD_INPUT) {
398+
/* Input string contained a byte sequence which was invalid in the 'from' encoding.
399+
* Unless the error handling mode is set to NONE, insert the replacement character */
400+
if (err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
401+
*out++ = replacement_char;
402+
}
399403
} else {
404+
/* Input string contained a byte sequence which was valid in the 'from' encoding,
405+
* but decoded to a Unicode codepoint which cannot be represented in the 'to' encoding */
400406
switch (err_mode) {
401407
case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
402408
*out++ = replacement_char;
@@ -427,6 +433,17 @@ void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf
427433
uint32_t repl_char = buf->replacement_char;
428434
unsigned int err_mode = buf->error_mode;
429435

436+
if (err_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
437+
/* This mode is for internal use only, when converting a string to
438+
* UTF-8 before searching it; it uses a byte which is illegal in
439+
* UTF-8 as an error marker. This ensures that error markers will
440+
* never 'accidentally' match valid text, as could happen when a
441+
* character like '?' is used as an error marker. */
442+
MB_CONVERT_BUF_ENSURE(buf, buf->out, buf->limit, 1);
443+
buf->out = mb_convert_buf_add(buf->out, 0xFF);
444+
return;
445+
}
446+
430447
size_t len = mb_illegal_marker(bad_cp, temp, err_mode, repl_char);
431448

432449
/* Avoid infinite loop if `fn` is not able to handle `repl_char` */

ext/mbstring/mbstring.c

Lines changed: 69 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1961,10 +1961,10 @@ static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const m
19611961
}
19621962

19631963
out:
1964-
if (haystack_u8 != NULL && haystack_u8 != haystack) {
1964+
if (haystack_u8 != haystack) {
19651965
zend_string_free(haystack_u8);
19661966
}
1967-
if (needle_u8 != NULL && needle_u8 != needle) {
1967+
if (needle_u8 != needle) {
19681968
zend_string_free(needle_u8);
19691969
}
19701970
return result;
@@ -2263,42 +2263,89 @@ PHP_FUNCTION(mb_strrichr)
22632263
#undef MB_STRISTR
22642264
#undef MB_STRRICHR
22652265

2266-
/* {{{ Count the number of substring occurrences */
22672266
PHP_FUNCTION(mb_substr_count)
22682267
{
2269-
mbfl_string haystack, needle;
2270-
char *haystack_val, *needle_val;
2271-
zend_string *enc_name = NULL;
2268+
zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
22722269

22732270
ZEND_PARSE_PARAMETERS_START(2, 3)
2274-
Z_PARAM_STRING(haystack_val, haystack.len)
2275-
Z_PARAM_STRING(needle_val, needle.len)
2271+
Z_PARAM_STR(haystack)
2272+
Z_PARAM_STR(needle)
22762273
Z_PARAM_OPTIONAL
22772274
Z_PARAM_STR_OR_NULL(enc_name)
22782275
ZEND_PARSE_PARAMETERS_END();
22792276

2280-
haystack.val = (unsigned char*)haystack_val;
2281-
needle.val = (unsigned char*)needle_val;
2282-
2283-
if (needle.len == 0) {
2277+
if (ZSTR_LEN(needle) == 0) {
22842278
zend_argument_value_error(2, "must not be empty");
22852279
RETURN_THROWS();
22862280
}
22872281

2288-
haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
2289-
if (!haystack.encoding) {
2282+
const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2283+
if (!enc) {
22902284
RETURN_THROWS();
22912285
}
22922286

2293-
size_t n = mbfl_substr_count(&haystack, &needle);
2294-
/* An error can only occur if needle is empty,
2295-
* an encoding error happens (which should not happen at this stage and is a bug)
2296-
* or the haystack is more than sizeof(size_t) bytes
2297-
* If one of these things occur this is a bug and should be flagged as such */
2298-
ZEND_ASSERT(!mbfl_is_error(n));
2299-
RETVAL_LONG(n);
2287+
if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2288+
/* No need to do any conversion if haystack/needle are already known-valid UTF-8
2289+
* (If they are not valid, then not passing them through conversion filters could affect output) */
2290+
if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
2291+
haystack_u8 = haystack;
2292+
} else {
2293+
unsigned int num_errors = 0;
2294+
haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2295+
if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2296+
GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
2297+
}
2298+
}
2299+
2300+
if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
2301+
needle_u8 = needle;
2302+
} else {
2303+
unsigned int num_errors = 0;
2304+
needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2305+
if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2306+
GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
2307+
}
2308+
}
2309+
} else {
2310+
unsigned int num_errors = 0;
2311+
haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2312+
needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2313+
/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2314+
* may be only escape sequences */
2315+
if (ZSTR_LEN(needle_u8) == 0) {
2316+
zend_string_free(haystack_u8);
2317+
zend_string_free(needle_u8);
2318+
zend_argument_value_error(2, "must not be empty");
2319+
RETURN_THROWS();
2320+
}
2321+
}
2322+
2323+
size_t result = 0;
2324+
2325+
if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2326+
goto out;
2327+
}
2328+
2329+
const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2330+
while (true) {
2331+
p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2332+
if (!p) {
2333+
break;
2334+
}
2335+
p += ZSTR_LEN(needle_u8);
2336+
result++;
2337+
}
2338+
2339+
out:
2340+
if (haystack_u8 != haystack) {
2341+
zend_string_free(haystack_u8);
2342+
}
2343+
if (needle_u8 != needle) {
2344+
zend_string_free(needle_u8);
2345+
}
2346+
2347+
RETVAL_LONG(result);
23002348
}
2301-
/* }}} */
23022349

23032350
/* {{{ Returns part of a string */
23042351
PHP_FUNCTION(mb_substr)

0 commit comments

Comments
 (0)