Skip to content

Commit caeaa66

Browse files
committed
Strict conversion of UHC text to Unicode
Previously, mbstring would accept a lot of things which were not valid UHC text. No more.

- Don't allow single-byte control characters to appear where the 2nd byte of a multi-byte character should be.
- Validate that the 2nd byte of a multi-byte character is in the expected range.
- Treat it as an error if a multi-byte character is truncated.

Also add a test suite to confirm that UHC conversion (both to and from Unicode) works according to spec.
1 parent 4550036 commit caeaa66

File tree

3 files changed

+17365
-13
lines changed

3 files changed

+17365
-13
lines changed

ext/mbstring/libmbfl/filters/mbfilter_uhc.c

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#define UNICODE_TABLE_UHC_DEF
3737
#include "unicode_table_uhc.h"
3838

39+
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter);
40+
3941
static const unsigned char mblen_table_uhc[] = { /* 0x81-0xFE */
4042
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4143
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -74,7 +76,7 @@ const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
7476
mbfl_filt_conv_common_ctor,
7577
NULL,
7678
mbfl_filt_conv_uhc_wchar,
77-
mbfl_filt_conv_common_flush,
79+
mbfl_filt_conv_uhc_wchar_flush,
7880
NULL,
7981
};
8082

@@ -99,7 +101,7 @@ mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
99101
case 0:
100102
if (c >= 0 && c < 0x80) { /* latin */
101103
CK((*filter->output_function)(c, filter->data));
102-
} else if (c > 0x80 && c < 0xff && c != 0xc9) { /* dbcs lead byte */
104+
} else if (c > 0x80 && c < 0xfe && c != 0xc9) { /* dbcs lead byte */
103105
filter->status = 1;
104106
filter->cache = c;
105107
} else {
@@ -113,23 +115,23 @@ mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
113115
filter->status = 0;
114116
c1 = filter->cache;
115117

116-
if (c1 >= 0x81 && c1 <= 0xa0) {
118+
if (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x41 && c <= 0xfe) {
117119
w = (c1 - 0x81)*190 + (c - 0x41);
118120
if (w >= 0 && w < uhc1_ucs_table_size) {
119121
flag = 1;
120122
w = uhc1_ucs_table[w];
121123
} else {
122124
w = 0;
123125
}
124-
} else if (c1 >= 0xa1 && c1 <= 0xc6) {
126+
} else if (c1 >= 0xa1 && c1 <= 0xc6 && c >= 0x41 && c <= 0xfe) {
125127
w = (c1 - 0xa1)*190 + (c - 0x41);
126128
if (w >= 0 && w < uhc2_ucs_table_size) {
127129
flag = 2;
128130
w = uhc2_ucs_table[w];
129131
} else {
130132
w = 0;
131133
}
132-
} else if (c1 >= 0xc7 && c1 <= 0xfe) {
134+
} else if (c1 >= 0xc7 && c1 < 0xfe && c >= 0xa1 && c <= 0xfe) {
133135
w = (c1 - 0xc7)*94 + (c - 0xa1);
134136
if (w >= 0 && w < uhc3_ucs_table_size) {
135137
flag = 3;
@@ -147,14 +149,10 @@ mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
147149
}
148150
CK((*filter->output_function)(w, filter->data));
149151
} else {
150-
if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
151-
CK((*filter->output_function)(c, filter->data));
152-
} else {
153-
w = (c1 << 8) | c;
154-
w &= MBFL_WCSGROUP_MASK;
155-
w |= MBFL_WCSGROUP_THROUGH;
156-
CK((*filter->output_function)(w, filter->data));
157-
}
152+
w = (c1 << 8) | c;
153+
w &= MBFL_WCSGROUP_MASK;
154+
w |= MBFL_WCSGROUP_THROUGH;
155+
CK((*filter->output_function)(w, filter->data));
158156
}
159157
break;
160158

@@ -186,6 +184,7 @@ mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
186184
} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
187185
s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
188186
}
187+
189188
if (s <= 0) {
190189
c1 = c & ~MBFL_WCSPLANE_MASK;
191190
if (c1 == MBFL_WCSPLANE_UHC) {
@@ -197,6 +196,7 @@ mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
197196
s = -1;
198197
}
199198
}
199+
200200
if (s >= 0) {
201201
if (s < 0x80) { /* latin */
202202
CK((*filter->output_function)(s, filter->data));
@@ -210,3 +210,17 @@ mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
210210

211211
return c;
212212
}
213+
214+
/*
 * Flush handler for the UHC => wchar conversion filter.
 *
 * If the filter is still in status 1 (the decoder stored a DBCS lead byte
 * in filter->cache and is waiting for its second byte), the input ended in
 * the middle of a 2-byte character. The cached lead byte is then passed to
 * the output function tagged with MBFL_WCSGROUP_THROUGH, the same marker
 * the decoder uses for byte pairs it cannot map.
 *
 * Afterwards the next filter's flush_function is invoked, if one is set.
 *
 * Returns 0 on the normal path.
 * NOTE(review): CK() is defined elsewhere; presumably it returns early when
 * the output function reports failure -- confirm against its definition.
 * NOTE(review): filter->status is not reset here, so the filter would still
 * be in status 1 if it were fed more input after a flush -- confirm whether
 * filters can be reused after flushing.
 */
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter)
215+
{
216+
if (filter->status == 1) {
217+
/* 2-byte character was truncated: only the lead byte was received */
218+
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
219+
}
220+
221+
if (filter->flush_function) {
222+
(*filter->flush_function)(filter->data);
223+
}
224+
225+
return 0;
226+
}

0 commit comments

Comments
 (0)