Skip to content

Commit 4427b2e

Browse files
committed
Mark UTF-8 strings emitted by mbstring functions as valid UTF-8
We now have a couple of mbstring functions which have fast paths for strings marked as 'valid UTF-8'. Later, we will likely have more. So that these fast paths can be used more frequently, mark UTF-8 strings emitted by mbstring as 'valid UTF-8'. This is always a correct thing to do, because mbstring never returns invalid UTF-8 as the result of a conversion (or similar) operation. Internally, we do have a conversion mode which deliberately emits invalid UTF-8 in some cases. (This is done to prevent unwanted matches when we are converting strings to UTF-8 before performing matching operations on them.) For such strings, don't set the 'valid UTF-8' flag. It probably wouldn't hurt anything to set it, because strings generated using that special conversion mode should *never* be returned to userland, and I don't think we do anything with them which cares about the IS_STR_VALID_UTF8 flag... but still, it would likely cause confusion for developers.
1 parent e7c0f4e commit 4427b2e

File tree

5 files changed

+32
-12
lines changed

5 files changed

+32
-12
lines changed

ext/mbstring/libmbfl/mbfl/mbfl_consts.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,10 @@
4747
#define MBFL_QPRINT_STS_MIME_HEADER 0x1000000
4848
#define MBFL_BASE64_STS_MIME_HEADER 0x1000000
4949

50+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
51+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
52+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
53+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
54+
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4 /* For internal use only; deliberately uses invalid UTF-8 byte sequence as error marker */
55+
5056
#endif /* MBFL_CONSTS_H */

ext/mbstring/libmbfl/mbfl/mbfl_convert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encodi
365365
}
366366

367367
*num_errors = buf.errors;
368-
return mb_convert_buf_result(&buf);
368+
return mb_convert_buf_result(&buf, to);
369369
}
370370

371371
static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)

ext/mbstring/libmbfl/mbfl/mbfl_encoding.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#define MBFL_ENCODING_H
3333

3434
#include "mbfl_defs.h"
35+
#include "mbfl_consts.h"
3536
#include "zend.h"
3637

3738
enum mbfl_no_encoding {
@@ -208,7 +209,7 @@ static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, ch
208209
return out;
209210
}
210211

211-
static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf)
212+
static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
212213
{
213214
ZEND_ASSERT(buf->out <= buf->limit);
214215
zend_string *ret = buf->str;
@@ -234,6 +235,17 @@ typedef struct {
234235
mb_from_wchar_fn from_wchar;
235236
} mbfl_encoding;
236237

238+
extern const mbfl_encoding mbfl_encoding_utf8;
239+
240+
static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl_encoding *enc)
241+
{
242+
zend_string *ret = mb_convert_buf_result_raw(buf);
243+
if (enc == &mbfl_encoding_utf8 && buf->error_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
244+
GC_ADD_FLAGS(ret, IS_STR_VALID_UTF8);
245+
}
246+
return ret;
247+
}
248+
237249
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
238250
MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
239251
MBFLAPI extern enum mbfl_no_encoding mbfl_name2no_encoding(const char *name);

ext/mbstring/mbstring.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,7 +1591,7 @@ PHP_FUNCTION(mb_output_handler)
15911591
}
15921592

15931593
MBSTRG(illegalchars) += buf.errors;
1594-
RETVAL_STR(mb_convert_buf_result(&buf));
1594+
RETVAL_STR(mb_convert_buf_result_raw(&buf));
15951595

15961596
if (last_feed) {
15971597
MBSTRG(outconv_enabled) = false;
@@ -1679,7 +1679,7 @@ PHP_FUNCTION(mb_str_split)
16791679
enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
16801680
i += split_len - char_count;
16811681
char_count = 0;
1682-
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1682+
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
16831683
} else {
16841684
/* Output from this iteration is not enough to finish the next chunk;
16851685
* output what we can, and leave 'buf' to be used again on next iteration */
@@ -1696,7 +1696,7 @@ PHP_FUNCTION(mb_str_split)
16961696
if (out_len - i >= split_len) {
16971697
enc->from_wchar(wchar_buf + i, split_len, &buf, true);
16981698
i += split_len;
1699-
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1699+
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
17001700
} else {
17011701
/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
17021702
* leave them for the next iteration */
@@ -1710,7 +1710,7 @@ PHP_FUNCTION(mb_str_split)
17101710
if (char_count) {
17111711
/* The main loop above has finished processing the input string, but
17121712
* has left a partial chunk in 'buf' */
1713-
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1713+
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
17141714
}
17151715
}
17161716
}
@@ -2076,7 +2076,7 @@ static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t
20762076
}
20772077
}
20782078

2079-
return mb_convert_buf_result(&buf);
2079+
return mb_convert_buf_result(&buf, enc);
20802080
}
20812081

20822082
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
@@ -2590,7 +2590,9 @@ static zend_string* mb_trim_string(zend_string *input, zend_string *marker, cons
25902590
buf.out += ZSTR_LEN(marker);
25912591
}
25922592

2593-
return mb_convert_buf_result(&buf);
2593+
/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
2594+
* we have no guarantee that the trim marker string is valid UTF-8 */
2595+
return mb_convert_buf_result_raw(&buf);
25942596
}
25952597

25962598
/* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
@@ -3298,7 +3300,7 @@ static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *enc
32983300
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
32993301
}
33003302

3301-
return mb_convert_buf_result(&buf);
3303+
return mb_convert_buf_result(&buf, encoding);
33023304
}
33033305

33043306
char mb_convert_kana_flags[17] = {
@@ -3697,7 +3699,7 @@ static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_en
36973699
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
36983700
}
36993701

3700-
return mb_convert_buf_result(&buf);
3702+
return mb_convert_buf_result(&buf, encoding);
37013703
}
37023704

37033705
/* {{{ Converts specified characters to HTML numeric entities */
@@ -3929,7 +3931,7 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
39293931
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
39303932
}
39313933

3932-
return mb_convert_buf_result(&buf);
3934+
return mb_convert_buf_result(&buf, encoding);
39333935
}
39343936

39353937
/* {{{ Converts HTML numeric entities to character code */

ext/mbstring/php_unicode.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,5 +366,5 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
366366
dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
367367
}
368368

369-
return mb_convert_buf_result(&buf);
369+
return mb_convert_buf_result(&buf, dst_encoding);
370370
}

0 commit comments

Comments
 (0)