diff --git a/NEWS b/NEWS index ad42da6774b2a..30153beae1ba1 100644 --- a/NEWS +++ b/NEWS @@ -59,6 +59,8 @@ PHP NEWS MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) . mb_detect_encoding is better able to identify UTF-8 and UTF-16 strings with a byte-order mark. (Alex Dowad) + . mb_decode_mimeheader handles underscores in QPrint-encoded MIME encoded + words properly according to the standard (RFC 2047). (Alex Dowad) - Opcache: . Added start, restart and force restart time to opcache's diff --git a/UPGRADING b/UPGRADING index 9b5febd600377..21c4bd94a39b5 100644 --- a/UPGRADING +++ b/UPGRADING @@ -63,6 +63,10 @@ PHP 8.3 UPGRADE NOTES casing rules for the Greek letter sigma. For mb_convert_case, conditional casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) + . mb_decode_mimeheader handles underscores in QPrint-encoded MIME encoded + words as dictated by RFC 2047; they are converted to spaces (byte 0x20). + To include an underscore in a QPrint-encoded MIME encoded word, it must + be encoded as "=5F". (Alex Dowad) - Standard: . E_NOTICEs emitted by unserialized() have been promoted to E_WARNING. 
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index c278f24f40674..99ca334d50bf4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -2250,11 +2250,7 @@ static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *bu /* Continue what we were doing on the previous call */ w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { @@ -2482,11 +2478,7 @@ static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, if (buf->state) { w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { @@ -2793,11 +2785,7 @@ static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, b if (buf->state) { w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 3c1d9071d3a98..cbf487b1a5b7d 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -832,276 +832,3 @@ mbfl_mime_header_encode( return result; } - - -/* - * MIME header decode - */ -struct mime_header_decoder_data { - mbfl_convert_filter *deco_filter; - mbfl_convert_filter *conv1_filter; - mbfl_convert_filter *conv2_filter; - mbfl_memory_device outdev; - mbfl_memory_device tmpdev; - size_t cspos; - int status; - const mbfl_encoding *encoding; - const mbfl_encoding *incode; - const mbfl_encoding *outcode; -}; - -static int -mime_header_decoder_collector(int c, void* data) -{ - const mbfl_encoding *encoding; - struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data; - - switch (pd->status) { - case 1: - if (c == 0x3f) { /* ? 
*/ - mbfl_memory_device_output(c, &pd->tmpdev); - pd->cspos = pd->tmpdev.pos; - pd->status = 2; - } else { - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - if (c == 0x3d) { /* = */ - mbfl_memory_device_output(c, &pd->tmpdev); - } else if (c == 0x0d || c == 0x0a) { /* CR or LF */ - pd->status = 9; - } else { - (*pd->conv1_filter->filter_function)(c, pd->conv1_filter); - pd->status = 0; - } - } - break; - case 2: /* store charset string */ - if (c == 0x3f) { /* ? */ - /* identify charset */ - mbfl_memory_device_output('\0', &pd->tmpdev); - encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]); - if (encoding != NULL) { - pd->incode = encoding; - pd->status = 3; - } - mbfl_memory_device_unput(&pd->tmpdev); - mbfl_memory_device_output(c, &pd->tmpdev); - } else { - mbfl_memory_device_output(c, &pd->tmpdev); - if (pd->tmpdev.pos > 100) { /* too long charset string */ - pd->status = 0; - } else if (c == 0x0d || c == 0x0a) { /* CR or LF */ - mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } - if (pd->status != 2) { - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - } - } - break; - case 3: /* identify encoding */ - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x42 || c == 0x62) { /* 'B' or 'b' */ - pd->encoding = &mbfl_encoding_base64; - pd->status = 4; - } else if (c == 0x51 || c == 0x71) { /* 'Q' or 'q' */ - pd->encoding = &mbfl_encoding_qprint; - pd->status = 4; - } else { - if (c == 0x0d || c == 0x0a) { /* CR or LF */ - mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } else { - pd->status = 0; - } - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - } - break; - case 4: /* reset filter */ - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x3f) { /* ? 
*/ - /* charset convert filter */ - mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar); - /* decode filter */ - mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit); - pd->status = 5; - } else { - if (c == 0x0d || c == 0x0a) { /* CR or LF */ - mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } else { - pd->status = 0; - } - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - } - mbfl_memory_device_reset(&pd->tmpdev); - break; - case 5: /* encoded block */ - if (c == 0x3f) { /* ? */ - pd->status = 6; - } else { - (*pd->deco_filter->filter_function)(c, pd->deco_filter); - } - break; - case 6: /* check end position */ - if (c == 0x3d) { /* = */ - /* flush and reset filter */ - (*pd->deco_filter->filter_flush)(pd->deco_filter); - (*pd->conv1_filter->filter_flush)(pd->conv1_filter); - mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar); - pd->status = 7; - } else { - (*pd->deco_filter->filter_function)(0x3f, pd->deco_filter); - if (c != 0x3f) { /* ? 
*/ - (*pd->deco_filter->filter_function)(c, pd->deco_filter); - pd->status = 5; - } - } - break; - case 7: /* after encoded block */ - if (c == 0x0d || c == 0x0a) { /* CR LF */ - pd->status = 8; - } else { - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x3d) { /* = */ - pd->status = 1; - } else if (c != 0x20 && c != 0x09) { /* not space */ - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - } - } - break; - case 8: /* folding */ - case 9: /* folding */ - if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) { - if (c == 0x3d) { /* = */ - if (pd->status == 8) { - mbfl_memory_device_output(0x20, &pd->tmpdev); /* SPACE */ - } else { - (*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter); - } - mbfl_memory_device_output(c, &pd->tmpdev); - pd->status = 1; - } else { - mbfl_memory_device_output(0x20, &pd->tmpdev); - mbfl_memory_device_output(c, &pd->tmpdev); - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - } - } - break; - default: /* non encoded block */ - if (c == 0x0d || c == 0x0a) { /* CR LF */ - pd->status = 9; - } else if (c == 0x3d) { /* = */ - mbfl_memory_device_output(c, &pd->tmpdev); - pd->status = 1; - } else { - (*pd->conv1_filter->filter_function)(c, pd->conv1_filter); - } - break; - } - - return 0; -} - -mbfl_string * -mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result) -{ - switch (pd->status) { - case 1: - case 2: - case 3: - case 4: - case 7: - case 8: - case 9: - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - break; - case 5: - case 6: - (*pd->deco_filter->filter_flush)(pd->deco_filter); - (*pd->conv1_filter->filter_flush)(pd->conv1_filter); - break; - } - (*pd->conv2_filter->filter_flush)(pd->conv2_filter); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - - return mbfl_memory_device_result(&pd->outdev, result); -} - -struct 
mime_header_decoder_data* -mime_header_decoder_new(const mbfl_encoding *outcode) -{ - struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data)); - - mbfl_memory_device_init(&pd->outdev, 0, 0); - mbfl_memory_device_init(&pd->tmpdev, 0, 0); - pd->cspos = 0; - pd->status = 0; - pd->encoding = &mbfl_encoding_8bit; - pd->incode = &mbfl_encoding_ascii; - pd->outcode = outcode; - /* charset convert filter */ - pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev); - pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter); - /* decode filter */ - pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter); - - if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) { - mime_header_decoder_delete(pd); - return NULL; - } - - return pd; -} - -void -mime_header_decoder_delete(struct mime_header_decoder_data *pd) -{ - if (pd) { - mbfl_convert_filter_delete(pd->conv2_filter); - mbfl_convert_filter_delete(pd->conv1_filter); - mbfl_convert_filter_delete(pd->deco_filter); - mbfl_memory_device_clear(&pd->outdev); - mbfl_memory_device_clear(&pd->tmpdev); - efree((void*)pd); - } -} - -mbfl_string * -mbfl_mime_header_decode( - mbfl_string *string, - mbfl_string *result, - const mbfl_encoding *outcode) -{ - size_t n; - unsigned char *p; - struct mime_header_decoder_data *pd; - - mbfl_string_init(result); - result->encoding = outcode; - - pd = mime_header_decoder_new(outcode); - if (pd == NULL) { - return NULL; - } - - /* feed data */ - n = string->len; - p = string->val; - while (n > 0) { - mime_header_decoder_collector(*p++, pd); - n--; - } - - result = mime_header_decoder_result(pd, result); - mime_header_decoder_delete(pd); - - return result; -} diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.h b/ext/mbstring/libmbfl/mbfl/mbfilter.h 
index 86720330018f3..e3678584fa340 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.h +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h @@ -193,24 +193,4 @@ mbfl_mime_header_encode( const char *linefeed, int indent); -/* - * MIME header decode - */ -struct mime_header_decoder_data; /* forward declaration */ - -MBFLAPI extern struct mime_header_decoder_data * -mime_header_decoder_new(const mbfl_encoding *outcode); - -MBFLAPI extern void -mime_header_decoder_delete(struct mime_header_decoder_data *pd); - -MBFLAPI extern mbfl_string * -mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result); - -MBFLAPI extern mbfl_string * -mbfl_mime_header_decode( - mbfl_string *string, - mbfl_string *result, - const mbfl_encoding *outcode); - #endif /* MBFL_MBFILTER_H */ diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index 2495f7447aa3a..1d44756ee051a 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -212,12 +212,6 @@ const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding) return NULL; } -enum mbfl_no_encoding mbfl_name2no_encoding(const char *name) -{ - const mbfl_encoding *encoding = mbfl_name2encoding(name); - return encoding ? 
encoding->no_encoding : mbfl_no_encoding_invalid; -} - const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding) { const mbfl_encoding *encoding = mbfl_no2encoding(no_encoding); @@ -229,11 +223,6 @@ const mbfl_encoding **mbfl_get_supported_encodings(void) return mbfl_encoding_ptr_list; } -const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding) -{ - return mbfl_encoding_preferred_mime_name(mbfl_no2encoding(no_encoding)); -} - const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding) { if (encoding->mime_name && encoding->mime_name[0] != '\0') { diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index 93ea632a83503..c20cb7bded40b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -248,10 +248,8 @@ static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding); -MBFLAPI extern enum mbfl_no_encoding mbfl_name2no_encoding(const char *name); MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void); MBFLAPI extern const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding); -MBFLAPI extern const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding); MBFLAPI extern const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding); #endif /* MBFL_ENCODING_H */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 270dc2d36d7d2..a31201682ac57 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1469,7 +1469,6 @@ PHP_FUNCTION(mb_substitute_character) /* {{{ Return the preferred MIME name (charset) as a string */ PHP_FUNCTION(mb_preferred_mime_name) { - enum mbfl_no_encoding no_encoding; char *name = NULL; size_t name_len; @@ -1477,13 +1476,13 @@ 
PHP_FUNCTION(mb_preferred_mime_name) Z_PARAM_STRING(name, name_len) ZEND_PARSE_PARAMETERS_END(); - no_encoding = mbfl_name2no_encoding(name); - if (no_encoding == mbfl_no_encoding_invalid) { + const mbfl_encoding *enc = mbfl_name2encoding(name); + if (enc == NULL) { zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name); RETURN_THROWS(); } - const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding); + const char *preferred_name = mbfl_encoding_preferred_mime_name(enc); if (preferred_name == NULL || *preferred_name == '\0') { php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name); RETVAL_FALSE; @@ -3247,28 +3246,6 @@ PHP_FUNCTION(mb_encode_mimeheader) } /* }}} */ -/* {{{ Decodes the MIME "encoded-word" in the string */ -PHP_FUNCTION(mb_decode_mimeheader) -{ - char *string_val; - mbfl_string string, result, *ret; - - string.encoding = MBSTRG(current_internal_encoding); - - ZEND_PARSE_PARAMETERS_START(1, 1) - Z_PARAM_STRING(string_val, string.len) - ZEND_PARSE_PARAMETERS_END(); - - string.val = (unsigned char*)string_val; - mbfl_string_init(&result); - ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding)); - ZEND_ASSERT(ret != NULL); - // TODO: avoid reallocation ??? - RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */ - efree(ret->val); -} -/* }}} */ - static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode) { /* Each wchar may potentially expand to 2 when we perform kana conversion... 
@@ -5343,7 +5320,6 @@ PHP_FUNCTION(mb_check_encoding) } /* }}} */ - static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name, const uint32_t enc_name_arg_num) { @@ -5376,7 +5352,6 @@ static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string return wchar_buf[0]; } - /* {{{ */ PHP_FUNCTION(mb_ord) { @@ -5409,7 +5384,6 @@ PHP_FUNCTION(mb_ord) } /* }}} */ - static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num) { const mbfl_encoding *enc; @@ -5480,7 +5454,6 @@ static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint3 return ret; } - /* {{{ */ PHP_FUNCTION(mb_chr) { @@ -5527,7 +5500,6 @@ PHP_FUNCTION(mb_scrub) } /* }}} */ - /* {{{ php_mb_populate_current_detect_order_list */ static void php_mb_populate_current_detect_order_list(void) { @@ -5641,3 +5613,235 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding; } /* }}} */ + +static int8_t decode_base64(unsigned char c) /* Returns the 6-bit value of one Base64 alphabet byte (A-Z: 0-25, a-z: 26-51, 0-9: 52-61, plus: 62, slash: 63), or -1 for any other byte */ +{ + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c - 'a' + 26; + } else if (c >= '0' && c <= '9') { + return c - '0' + 52; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + return -1; +} + +static int8_t qprint_map[] = { /* Indexed by byte value: hex digits 0-9, A-F and a-f map to 0-15, every other byte maps to -1. NOTE(review): never written in the code shown here, so it could presumably be const */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +/* Decode one MIME encoded word as defined in RFC 2047. p points at its leading =? and e at the end of the input. The decoded bytes are converted from the named charset to outcode and appended to outbuf. Returns the position at which the caller should resume, or NULL with nothing appended when the input is too short, a ? separator is missing, the charset is unknown, or the encoding is neither Q nor B */ +static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state) +{ + if ((e - p) < 6) { + return NULL; + } + + ZEND_ASSERT(p[0] == '='); + ZEND_ASSERT(p[1] == '?'); + p += 2; + + unsigned char *charset = p; + unsigned char *charset_end = memchr(charset, '?', e - charset); + if (charset_end == NULL) { + return NULL; + } + + unsigned char *encoding = charset_end + 1; /* one-byte encoding selector; the byte after it must be ? */ + p = encoding + 1; + if (p >= e || *p++ != '?') { + return NULL; + } + + char *charset_name = estrndup((const char*)charset, charset_end - charset); + const mbfl_encoding *incode = mbfl_name2encoding(charset_name); + efree(charset_name); + if (incode == NULL) { + return NULL; + } + + unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e); + if (end_marker) { + e = end_marker; + } else if (p < e && *(e-1) == '?') { + /* If encoded word is not properly terminated, but last byte is '?', + * take that as a terminator (legacy behavior) */ + e--; + } + + unsigned char *buf = emalloc(e - p), *bufp = buf; /* scratch buffer sized to the encoded payload; neither decoder below writes more bytes than it reads */ + if (*encoding == 'Q' || *encoding == 'q') { + /* Fill `buf` with bytes from decoding QPrint */ + while (p < e) { + unsigned char c = *p++; + if (c == '_') { + *bufp++ = ' '; /* RFC 2047: an underscore stands for a space */ + continue; + } else if (c == '=' && (e - p) >= 2) { /* = followed by two hex digits, or by a line break */ + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) { + *bufp++ = (qprint_map[c2] << 4) | 
(qprint_map[c3] & 0xF); + continue; + } else if (c2 == '\r') { /* = followed by CR or CRLF produces no output */ + if (c3 != '\n') { + p--; /* c3 was not the LF of a CRLF pair; process it as ordinary input */ + } + continue; + } else if (c2 == '\n') { /* = followed by LF produces no output */ + p--; /* put c3 back */ + continue; + } + } + *bufp++ = c; /* copied through unchanged. NOTE(review): also reached for = followed by two bytes that are neither hex digits nor a line break, after both bytes were already consumed - confirm that dropping them is intended */ + } + } else if (*encoding == 'B' || *encoding == 'b') { + /* Fill `buf` with bytes from decoding Base64 */ + unsigned int bits = 0, cache = 0; + while (p < e) { + unsigned char c = *p++; + if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') { /* line breaks, blanks and padding are skipped */ + continue; + } + int8_t decoded = decode_base64(c); + if (decoded == -1) { + *bufp++ = '?'; /* a byte outside the Base64 alphabet becomes a question mark */ + continue; + } + bits += 6; + cache = (cache << 6) | (decoded & 0x3F); + if (bits == 24) { /* four sextets collected: flush three bytes */ + *bufp++ = (cache >> 16) & 0xFF; + *bufp++ = (cache >> 8) & 0xFF; + *bufp++ = cache & 0xFF; + bits = cache = 0; + } + } + if (bits == 18) { /* three leftover sextets hold two complete bytes */ + *bufp++ = (cache >> 10) & 0xFF; + *bufp++ = (cache >> 2) & 0xFF; + } else if (bits == 12) { /* two leftover sextets hold one complete byte */ + *bufp++ = (cache >> 4) & 0xFF; + } + } else { + efree(buf); + return NULL; + } + + size_t in_len = bufp - buf; + uint32_t wchar_buf[128]; + + bufp = buf; + while (in_len) { /* convert the decoded bytes to outcode, at most 128 code points per pass */ + size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state); + ZEND_ASSERT(out_len <= 128); + outcode->from_wchar(wchar_buf, out_len, outbuf, false); + } + + efree(buf); + return e + 2; /* two bytes past the end of the payload, i.e. just after the ?= terminator. NOTE(review): when no ?= was found this lies beyond the input end; the caller only compares it against its own end pointer - confirm this is acceptable */ +} + +static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode) /* Decodes every encoded word found in input, passes the remaining bytes through the ASCII decoder, folds a line break plus the blanks after it into one space (with the exceptions commented below), and returns the result converted to outcode */ +{ + unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input); + unsigned int state = 0; + bool space_pending = false; + + mb_convert_buf buf; + mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR); + + while (p < e) { + unsigned char c = *p; + + if (c == '=' && *(p + 1) == '?' && (e - p) >= 6) { /* NOTE(review): p[1] is read before the length test; on the last input byte this reads input[len], presumably the zend_string terminator - confirm */ + /* Does this look like a MIME encoded word? 
If so, try to decode it as one */ + unsigned char *incode_end = memchr(p + 2, '?', e - p - 2); + if (incode_end && (e - incode_end) >= 3) { /* a second ? exists and at least three bytes remain from it to the end */ + unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state); + if (temp) { + p = temp; + /* Decoding of MIME encoded word was successful; + * Try to collapse a run of whitespace */ + if (p < e && (*p == '\n' || *p == '\r')) { + do { + p++; + } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' ')); + /* We will only actually output a space if this is not immediately followed + * by another valid encoded word */ + space_pending = true; + } + continue; /* NOTE(review): space_pending is not cleared when the next encoded word decodes successfully, so it appears to stay set and produce a space before later plain text - confirm this is intended */ + } + } + } + + if (space_pending) { + uint32_t space = ' '; + outcode->from_wchar(&space, 1, &buf, false); + space_pending = false; + } + + /* Consume a run of plain ASCII characters */ + if (c != '\n' && c != '\r') { + unsigned char *end = p + 1; + while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) { /* the run stops at the next = or line break */ + end++; + } + uint32_t wchar_buf[128]; + size_t in_len = end - p; + while (in_len) { + size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state); + ZEND_ASSERT(out_len <= 128); + outcode->from_wchar(wchar_buf, out_len, &buf, false); + } + } + /* Collapse a run of whitespace into a single space */ + if (p < e && (*p == '\n' || *p == '\r')) { + do { + p++; + } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' ')); + if (p < e) { + /* Emulating legacy behavior of mb_decode_mimeheader here; + * a run of whitespace is not converted to a space at the very + * end of the input string */ + uint32_t space = ' '; + outcode->from_wchar(&space, 1, &buf, false); + } + } + } + + outcode->from_wchar(NULL, 0, &buf, true); /* the only call that passes true as the last argument, with no input; presumably signals end of input so pending output is flushed - confirm */ + + return mb_convert_buf_result(&buf, outcode); +} + +PHP_FUNCTION(mb_decode_mimeheader) /* PHP function mb_decode_mimeheader: takes exactly one string argument and returns it decoded into MBSTRG(current_internal_encoding) */ +{ + zend_string *str; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(str) + ZEND_PARSE_PARAMETERS_END(); + + RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding))); +} 
diff --git a/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt b/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt new file mode 100644 index 0000000000000..4579e6e834c83 --- /dev/null +++ b/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt @@ -0,0 +1,117 @@ +--TEST-- +Test mb_decode_mimeheader() function: weird variations found by fuzzer +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +string(36) "0032002c0020004700430047003f00470053" +string(6) "203869" +string(0) "" +string(0) "" +string(10) "2c13403d2c" +string(16) "3d3f493f423f3f3d" +string(200) "3d3f3d203f3d3f523f3d3f3d203f3f003d3d3d3d3f3d3d3d3f3f3d3f55432d523f3d3f3d203f3d3f3d3d3d3d3d3f3d203f3d3d3d3d3d3d3f3d3d3d3d3d3d3f3d203f3d3d3d3d3d3d3f3d3d3d3f3f3d3f55432d4b523f3d3f3d203f3d3f3d3d3d3f3d3d3f" +string(400) "003d003f003f003f007400660037002c0055000100000060004000000004007c003f004400180000000000000076003f003f003f003f003f003f003f003f003f003f003f003f001300660037002c0055002600000053000100000017002c0044003f003f003f003f003f003f003f0001000000000014003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f00000011000000000000000000000000" +string(0) "" +string(2) "3f" +string(2) "3f" +string(0) "" +string(2) "3f" +string(4) "3d3f" +string(6) "3d3f3d" +string(6) "3d3f2c" +string(42) "626567696e20303634342066696c656e616d650a20" +string(2) "36" +string(2) "36" +string(2) "36" diff --git a/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt b/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt new file mode 100644 index 0000000000000..a313ff14e0f15 --- /dev/null +++ b/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt @@ -0,0 +1,23 @@ +--TEST-- +Test mb_decode_mimeheader() function: use of underscores in QPrint-encoded data +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +string(3) "abc" +string(7) "abc def" +string(9) "_abc def_" +string(10) " 汉字 " +string(1) "_"