Skip to content

Commit 157ca65

Browse files
committed
Implement mb_decode_mimeheader using fast text conversion filters
The new implementation is 2.5x-3x faster. If an invalid charset name was used, the old implementation would get 'stuck' trying to parse the charset name and would not interpret any subsequent MIME-encoded words up to the end of the input string. The new implementation fixes this bug. If an (invalid) encoded word is truncated (has no closing `?=`) and a new (valid) encoded word starts immediately afterwards, the old implementation would not decode the valid encoded word. The new implementation fixes this bug as well. Otherwise, the behavior of the new implementation has been designed to closely match that of the old implementation.
1 parent 117f226 commit 157ca65

File tree

5 files changed

+349
-335
lines changed

5 files changed

+349
-335
lines changed

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2250,11 +2250,7 @@ static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *bu
22502250
/* Continue what we were doing on the previous call */
22512251
w = buf->state;
22522252
buf->state = 0;
2253-
if (len) {
2254-
goto reprocess_wchar;
2255-
} else {
2256-
goto emit_output;
2257-
}
2253+
goto reprocess_wchar;
22582254
}
22592255

22602256
while (len--) {
@@ -2482,11 +2478,7 @@ static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf,
24822478
if (buf->state) {
24832479
w = buf->state;
24842480
buf->state = 0;
2485-
if (len) {
2486-
goto reprocess_wchar;
2487-
} else {
2488-
goto emit_output;
2489-
}
2481+
goto reprocess_wchar;
24902482
}
24912483

24922484
while (len--) {
@@ -2793,11 +2785,7 @@ static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, b
27932785
if (buf->state) {
27942786
w = buf->state;
27952787
buf->state = 0;
2796-
if (len) {
2797-
goto reprocess_wchar;
2798-
} else {
2799-
goto emit_output;
2800-
}
2788+
goto reprocess_wchar;
28012789
}
28022790

28032791
while (len--) {

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 0 additions & 273 deletions
Original file line numberDiff line numberDiff line change
@@ -832,276 +832,3 @@ mbfl_mime_header_encode(
832832

833833
return result;
834834
}
835-
836-
837-
/*
838-
* MIME header decode
839-
*/
840-
struct mime_header_decoder_data {
841-
mbfl_convert_filter *deco_filter;
842-
mbfl_convert_filter *conv1_filter;
843-
mbfl_convert_filter *conv2_filter;
844-
mbfl_memory_device outdev;
845-
mbfl_memory_device tmpdev;
846-
size_t cspos;
847-
int status;
848-
const mbfl_encoding *encoding;
849-
const mbfl_encoding *incode;
850-
const mbfl_encoding *outcode;
851-
};
852-
853-
static int
854-
mime_header_decoder_collector(int c, void* data)
855-
{
856-
const mbfl_encoding *encoding;
857-
struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;
858-
859-
switch (pd->status) {
860-
case 1:
861-
if (c == 0x3f) { /* ? */
862-
mbfl_memory_device_output(c, &pd->tmpdev);
863-
pd->cspos = pd->tmpdev.pos;
864-
pd->status = 2;
865-
} else {
866-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
867-
mbfl_memory_device_reset(&pd->tmpdev);
868-
if (c == 0x3d) { /* = */
869-
mbfl_memory_device_output(c, &pd->tmpdev);
870-
} else if (c == 0x0d || c == 0x0a) { /* CR or LF */
871-
pd->status = 9;
872-
} else {
873-
(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
874-
pd->status = 0;
875-
}
876-
}
877-
break;
878-
case 2: /* store charset string */
879-
if (c == 0x3f) { /* ? */
880-
/* identify charset */
881-
mbfl_memory_device_output('\0', &pd->tmpdev);
882-
encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
883-
if (encoding != NULL) {
884-
pd->incode = encoding;
885-
pd->status = 3;
886-
}
887-
mbfl_memory_device_unput(&pd->tmpdev);
888-
mbfl_memory_device_output(c, &pd->tmpdev);
889-
} else {
890-
mbfl_memory_device_output(c, &pd->tmpdev);
891-
if (pd->tmpdev.pos > 100) { /* too long charset string */
892-
pd->status = 0;
893-
} else if (c == 0x0d || c == 0x0a) { /* CR or LF */
894-
mbfl_memory_device_unput(&pd->tmpdev);
895-
pd->status = 9;
896-
}
897-
if (pd->status != 2) {
898-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
899-
mbfl_memory_device_reset(&pd->tmpdev);
900-
}
901-
}
902-
break;
903-
case 3: /* identify encoding */
904-
mbfl_memory_device_output(c, &pd->tmpdev);
905-
if (c == 0x42 || c == 0x62) { /* 'B' or 'b' */
906-
pd->encoding = &mbfl_encoding_base64;
907-
pd->status = 4;
908-
} else if (c == 0x51 || c == 0x71) { /* 'Q' or 'q' */
909-
pd->encoding = &mbfl_encoding_qprint;
910-
pd->status = 4;
911-
} else {
912-
if (c == 0x0d || c == 0x0a) { /* CR or LF */
913-
mbfl_memory_device_unput(&pd->tmpdev);
914-
pd->status = 9;
915-
} else {
916-
pd->status = 0;
917-
}
918-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
919-
mbfl_memory_device_reset(&pd->tmpdev);
920-
}
921-
break;
922-
case 4: /* reset filter */
923-
mbfl_memory_device_output(c, &pd->tmpdev);
924-
if (c == 0x3f) { /* ? */
925-
/* charset convert filter */
926-
mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
927-
/* decode filter */
928-
mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
929-
pd->status = 5;
930-
} else {
931-
if (c == 0x0d || c == 0x0a) { /* CR or LF */
932-
mbfl_memory_device_unput(&pd->tmpdev);
933-
pd->status = 9;
934-
} else {
935-
pd->status = 0;
936-
}
937-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
938-
}
939-
mbfl_memory_device_reset(&pd->tmpdev);
940-
break;
941-
case 5: /* encoded block */
942-
if (c == 0x3f) { /* ? */
943-
pd->status = 6;
944-
} else {
945-
(*pd->deco_filter->filter_function)(c, pd->deco_filter);
946-
}
947-
break;
948-
case 6: /* check end position */
949-
if (c == 0x3d) { /* = */
950-
/* flush and reset filter */
951-
(*pd->deco_filter->filter_flush)(pd->deco_filter);
952-
(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
953-
mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
954-
pd->status = 7;
955-
} else {
956-
(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
957-
if (c != 0x3f) { /* ? */
958-
(*pd->deco_filter->filter_function)(c, pd->deco_filter);
959-
pd->status = 5;
960-
}
961-
}
962-
break;
963-
case 7: /* after encoded block */
964-
if (c == 0x0d || c == 0x0a) { /* CR LF */
965-
pd->status = 8;
966-
} else {
967-
mbfl_memory_device_output(c, &pd->tmpdev);
968-
if (c == 0x3d) { /* = */
969-
pd->status = 1;
970-
} else if (c != 0x20 && c != 0x09) { /* not space */
971-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
972-
mbfl_memory_device_reset(&pd->tmpdev);
973-
pd->status = 0;
974-
}
975-
}
976-
break;
977-
case 8: /* folding */
978-
case 9: /* folding */
979-
if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
980-
if (c == 0x3d) { /* = */
981-
if (pd->status == 8) {
982-
mbfl_memory_device_output(0x20, &pd->tmpdev); /* SPACE */
983-
} else {
984-
(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
985-
}
986-
mbfl_memory_device_output(c, &pd->tmpdev);
987-
pd->status = 1;
988-
} else {
989-
mbfl_memory_device_output(0x20, &pd->tmpdev);
990-
mbfl_memory_device_output(c, &pd->tmpdev);
991-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
992-
mbfl_memory_device_reset(&pd->tmpdev);
993-
pd->status = 0;
994-
}
995-
}
996-
break;
997-
default: /* non encoded block */
998-
if (c == 0x0d || c == 0x0a) { /* CR LF */
999-
pd->status = 9;
1000-
} else if (c == 0x3d) { /* = */
1001-
mbfl_memory_device_output(c, &pd->tmpdev);
1002-
pd->status = 1;
1003-
} else {
1004-
(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
1005-
}
1006-
break;
1007-
}
1008-
1009-
return 0;
1010-
}
1011-
1012-
mbfl_string *
1013-
mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
1014-
{
1015-
switch (pd->status) {
1016-
case 1:
1017-
case 2:
1018-
case 3:
1019-
case 4:
1020-
case 7:
1021-
case 8:
1022-
case 9:
1023-
mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1024-
break;
1025-
case 5:
1026-
case 6:
1027-
(*pd->deco_filter->filter_flush)(pd->deco_filter);
1028-
(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
1029-
break;
1030-
}
1031-
(*pd->conv2_filter->filter_flush)(pd->conv2_filter);
1032-
mbfl_memory_device_reset(&pd->tmpdev);
1033-
pd->status = 0;
1034-
1035-
return mbfl_memory_device_result(&pd->outdev, result);
1036-
}
1037-
1038-
struct mime_header_decoder_data*
1039-
mime_header_decoder_new(const mbfl_encoding *outcode)
1040-
{
1041-
struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
1042-
1043-
mbfl_memory_device_init(&pd->outdev, 0, 0);
1044-
mbfl_memory_device_init(&pd->tmpdev, 0, 0);
1045-
pd->cspos = 0;
1046-
pd->status = 0;
1047-
pd->encoding = &mbfl_encoding_8bit;
1048-
pd->incode = &mbfl_encoding_ascii;
1049-
pd->outcode = outcode;
1050-
/* charset convert filter */
1051-
pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
1052-
pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
1053-
/* decode filter */
1054-
pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
1055-
1056-
if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
1057-
mime_header_decoder_delete(pd);
1058-
return NULL;
1059-
}
1060-
1061-
return pd;
1062-
}
1063-
1064-
void
1065-
mime_header_decoder_delete(struct mime_header_decoder_data *pd)
1066-
{
1067-
if (pd) {
1068-
mbfl_convert_filter_delete(pd->conv2_filter);
1069-
mbfl_convert_filter_delete(pd->conv1_filter);
1070-
mbfl_convert_filter_delete(pd->deco_filter);
1071-
mbfl_memory_device_clear(&pd->outdev);
1072-
mbfl_memory_device_clear(&pd->tmpdev);
1073-
efree((void*)pd);
1074-
}
1075-
}
1076-
1077-
mbfl_string *
1078-
mbfl_mime_header_decode(
1079-
mbfl_string *string,
1080-
mbfl_string *result,
1081-
const mbfl_encoding *outcode)
1082-
{
1083-
size_t n;
1084-
unsigned char *p;
1085-
struct mime_header_decoder_data *pd;
1086-
1087-
mbfl_string_init(result);
1088-
result->encoding = outcode;
1089-
1090-
pd = mime_header_decoder_new(outcode);
1091-
if (pd == NULL) {
1092-
return NULL;
1093-
}
1094-
1095-
/* feed data */
1096-
n = string->len;
1097-
p = string->val;
1098-
while (n > 0) {
1099-
mime_header_decoder_collector(*p++, pd);
1100-
n--;
1101-
}
1102-
1103-
result = mime_header_decoder_result(pd, result);
1104-
mime_header_decoder_delete(pd);
1105-
1106-
return result;
1107-
}

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -193,24 +193,4 @@ mbfl_mime_header_encode(
193193
const char *linefeed,
194194
int indent);
195195

196-
/*
197-
* MIME header decode
198-
*/
199-
struct mime_header_decoder_data; /* forward declaration */
200-
201-
MBFLAPI extern struct mime_header_decoder_data *
202-
mime_header_decoder_new(const mbfl_encoding *outcode);
203-
204-
MBFLAPI extern void
205-
mime_header_decoder_delete(struct mime_header_decoder_data *pd);
206-
207-
MBFLAPI extern mbfl_string *
208-
mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result);
209-
210-
MBFLAPI extern mbfl_string *
211-
mbfl_mime_header_decode(
212-
mbfl_string *string,
213-
mbfl_string *result,
214-
const mbfl_encoding *outcode);
215-
216196
#endif /* MBFL_MBFILTER_H */

0 commit comments

Comments
 (0)