Skip to content

Commit 73c6a5b

Browse files
committed
Fix conversion of Big5 and CP950 text (and add test suite)
- Truncated multi-byte characters are treated as an error
- Follow recommended mappings from the Unicode Consortium
1 parent b6dcab2 commit 73c6a5b

File tree

6 files changed

+28040
-274
lines changed

6 files changed

+28040
-274
lines changed

ext/mbstring/libmbfl/filters/mbfilter_big5.c

Lines changed: 152 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
#include "unicode_table_big5.h"
3434

35+
static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter);
36+
3537
static const unsigned char mblen_table_big5[] = { /* 0x81-0xFE */
3638
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3739
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -81,7 +83,7 @@ const struct mbfl_convert_vtbl vtbl_big5_wchar = {
8183
mbfl_filt_conv_common_ctor,
8284
NULL,
8385
mbfl_filt_conv_big5_wchar,
84-
mbfl_filt_conv_common_flush,
86+
mbfl_filt_conv_big5_wchar_flush,
8587
NULL,
8688
};
8789

@@ -101,7 +103,7 @@ const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
101103
mbfl_filt_conv_common_ctor,
102104
NULL,
103105
mbfl_filt_conv_big5_wchar,
104-
mbfl_filt_conv_common_flush,
106+
mbfl_filt_conv_big5_wchar_flush,
105107
NULL,
106108
};
107109

@@ -119,60 +121,48 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
119121

120122
/* 63 + 94 = 157 or 94 */
121123
static unsigned short cp950_pua_tbl[][4] = {
122-
{0xe000,0xe310,0xfa40,0xfefe},
123-
{0xe311,0xeeb7,0x8e40,0xa0fe},
124-
{0xeeb8,0xf6b0,0x8140,0x8dfe},
125-
{0xf6b1,0xf70e,0xc6a1,0xc6fe},
126-
{0xf70f,0xf848,0xc740,0xc8fe},
124+
{0xe000, 0xe310, 0xfa40, 0xfefe},
125+
{0xe311, 0xeeb7, 0x8e40, 0xa0fe},
126+
{0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
127+
{0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
128+
{0xf70f, 0xf848, 0xc740, 0xc8fe},
127129
};
128130

129-
static inline int is_in_cp950_pua(int c1, int c) {
131+
static inline int is_in_cp950_pua(int c1, int c)
132+
{
130133
if ((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) ||
131134
(c1 >= 0x81 && c1 <= 0x8d) || (c1 >= 0xc7 && c1 <= 0xc8)) {
132-
return (c >=0x40 && c <= 0x7e) || (c >= 0xa1 && c <= 0xfe);
133-
}
134-
if (c1 == 0xc6) {
135+
return (c >= 0x40 && c <= 0x7e) || (c >= 0xa1 && c <= 0xfe);
136+
} else if (c1 == 0xc6) {
135137
return c >= 0xa1 && c <= 0xfe;
136138
}
137139
return 0;
138140
}
139141

140-
/*
141-
* Big5 => wchar
142-
*/
143-
int
144-
mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
142+
int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
145143
{
146-
int k;
147-
int c1, w, c2;
144+
int k, c1, w;
148145

149146
switch (filter->status) {
150147
case 0:
151-
if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
152-
c1 = 0x80;
153-
} else {
154-
c1 = 0xa0;
155-
}
156-
157-
if (c >= 0 && c <= 0x80) { /* latin */
148+
if (c >= 0 && c < 0x80) { /* latin */
158149
CK((*filter->output_function)(c, filter->data));
159-
} else if (c == 0xff) {
160-
CK((*filter->output_function)(0xf8f8, filter->data));
161-
} else if (c > c1 && c < 0xff) { /* dbcs lead byte */
150+
} else if (filter->from->no_encoding != mbfl_no_encoding_cp950 && c > 0xA0 && c <= 0xF9 && c != 0xC8) {
151+
filter->status = 1;
152+
filter->cache = c;
153+
} else if (filter->from->no_encoding == mbfl_no_encoding_cp950 && c > 0x80 && c <= 0xFE) {
162154
filter->status = 1;
163155
filter->cache = c;
164156
} else {
165-
w = c & MBFL_WCSGROUP_MASK;
166-
w |= MBFL_WCSGROUP_THROUGH;
167-
CK((*filter->output_function)(w, filter->data));
157+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
168158
}
169159
break;
170160

171-
case 1: /* dbcs second byte */
161+
case 1: /* dbcs second byte */
172162
filter->status = 0;
173163
c1 = filter->cache;
174-
if ((c > 0x39 && c < 0x7f) | (c > 0xa0 && c < 0xff)) {
175-
if (c < 0x7f){
164+
if ((c > 0x3f && c < 0x7f) || (c > 0xa0 && c < 0xff)) {
165+
if (c < 0x7f) {
176166
w = (c1 - 0xa1)*157 + (c - 0x40);
177167
} else {
178168
w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f;
@@ -185,35 +175,67 @@ mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
185175

186176
if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
187177
/* PUA for CP950 */
188-
if (w <= 0 && is_in_cp950_pua(c1, c)) {
189-
c2 = c1 << 8 | c;
190-
for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
178+
if (is_in_cp950_pua(c1, c)) {
179+
int c2 = (c1 << 8) | c;
180+
181+
for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
191182
if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) {
192183
break;
193184
}
194185
}
195186

196187
if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
197-
w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40)
198-
+ cp950_pua_tbl[k][0];
188+
w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
199189
} else {
200190
w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
201191
}
192+
} else if (c1 == 0xA1) {
193+
if (c == 0x45) {
194+
w = 0x2027;
195+
} else if (c == 0x4E) {
196+
w = 0xFE51;
197+
} else if (c == 0x5A) {
198+
w = 0x2574;
199+
} else if (c == 0xC2) {
200+
w = 0x00AF;
201+
} else if (c == 0xC3) {
202+
w = 0xFFE3;
203+
} else if (c == 0xC5) {
204+
w = 0x02CD;
205+
} else if (c == 0xE3) {
206+
w = 0xFF5E;
207+
} else if (c == 0xF2) {
208+
w = 0x2295;
209+
} else if (c == 0xF3) {
210+
w = 0x2299;
211+
} else if (c == 0xFE) {
212+
w = 0xFF0F;
213+
}
214+
} else if (c1 == 0xA2) {
215+
if (c == 0x40) {
216+
w = 0xFF3C;
217+
} else if (c == 0x41) {
218+
w = 0x2215;
219+
} else if (c == 0x42) {
220+
w = 0xFE68;
221+
} else if (c == 0x46) {
222+
w = 0xFFE0;
223+
} else if (c == 0x47) {
224+
w = 0xFFE1;
225+
} else if (c == 0xCC) {
226+
w = 0x5341;
227+
} else if (c == 0xCE) {
228+
w = 0x5345;
229+
}
202230
}
203231
}
204232

205233
if (w <= 0) {
206-
w = (c1 << 8) | c;
207-
w &= MBFL_WCSPLANE_MASK;
208-
w |= MBFL_WCSPLANE_BIG5;
234+
w = (c1 << 8) | c | MBFL_WCSPLANE_BIG5;
209235
}
210236
CK((*filter->output_function)(w, filter->data));
211-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
212-
CK((*filter->output_function)(c, filter->data));
213237
} else {
214-
w = (c1 << 8) | c;
215-
w &= MBFL_WCSGROUP_MASK;
216-
w |= MBFL_WCSGROUP_THROUGH;
238+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
217239
CK((*filter->output_function)(w, filter->data));
218240
}
219241
break;
@@ -226,16 +248,24 @@ mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
226248
return c;
227249
}
228250

229-
/*
230-
* wchar => Big5
231-
*/
232-
int
233-
mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
251+
static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter)
234252
{
235-
int k;
236-
int c1, s, c2;
253+
if (filter->status == 1) {
254+
/* 2-byte character was truncated */
255+
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
256+
}
257+
258+
if (filter->flush_function) {
259+
(*filter->flush_function)(filter->data);
260+
}
261+
262+
return 0;
263+
}
264+
265+
int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
266+
{
267+
int k, s = 0;
237268

238-
s = 0;
239269
if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
240270
s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
241271
} else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
@@ -244,8 +274,6 @@ mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
244274
s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
245275
} else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
246276
s = ucs_i_big5_table[c - ucs_i_big5_table_min];
247-
} else if (c >= ucs_pua_big5_table_min && c < ucs_pua_big5_table_max) {
248-
s = ucs_pua_big5_table[c - ucs_pua_big5_table_min];
249277
} else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
250278
s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
251279
} else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
@@ -254,49 +282,92 @@ mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
254282

255283
if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
256284
if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
257-
for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
285+
for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
258286
if (c <= cp950_pua_tbl[k][1]) {
259287
break;
260288
}
261289
}
262-
c1 = c - cp950_pua_tbl[k][0];
290+
291+
int c1 = c - cp950_pua_tbl[k][0];
263292
if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
264-
c2 = cp950_pua_tbl[k][2] >> 8;
265-
s = ((c1 / 157) + c2) << 8; c1 %= 157;
293+
int c2 = cp950_pua_tbl[k][2] >> 8;
294+
s = ((c1 / 157) + c2) << 8;
295+
c1 %= 157;
266296
s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40);
267297
} else {
268298
s = c1 + cp950_pua_tbl[k][2];
269299
}
270-
}
271-
272-
if (c == 0x80) {
273-
s = 0x80;
274-
} else if (c == 0xf8f8) {
275-
s = 0xff;
276-
} else if (c == 0x256d) {
277-
s = 0xa27e;
278-
} else if (c == 0x256e) {
279-
s = 0xa2a1;
280-
} else if (c == 0x256f) {
281-
s = 0xa2a3;
282-
} else if (c == 0x2570) {
283-
s = 0xa2a2;
300+
} else if (c == 0x00A2) {
301+
s = 0;
302+
} else if (c == 0x00A3) {
303+
s = 0;
304+
} else if (c == 0x00AF) {
305+
s = 0xA1C2;
306+
} else if (c == 0x02CD) {
307+
s = 0xA1C5;
308+
} else if (c == 0x0401) {
309+
s = 0;
310+
} else if (c >= 0x0414 && c <= 0x041C) {
311+
s = 0;
312+
} else if (c >= 0x0423 && c <= 0x044F) {
313+
s = 0;
314+
} else if (c == 0x0451) {
315+
s = 0;
316+
} else if (c == 0x2022) {
317+
s = 0;
318+
} else if (c == 0x2027) {
319+
s = 0xA145;
320+
} else if (c == 0x203E) {
321+
s = 0;
322+
} else if (c == 0x2215) {
323+
s = 0xA241;
324+
} else if (c == 0x223C) {
325+
s = 0;
326+
} else if (c == 0x2295) {
327+
s = 0xA1F2;
328+
} else if (c == 0x2299) {
329+
s = 0xA1F3;
330+
} else if (c >= 0x2460 && c <= 0x247D) {
331+
s = 0;
332+
} else if (c == 0x2574) {
333+
s = 0xA15A;
334+
} else if (c == 0x2609) {
335+
s = 0;
336+
} else if (c == 0x2641) {
337+
s = 0;
338+
} else if (c == 0x3005 || (c >= 0x302A && c <= 0x30FF)) {
339+
s = 0;
340+
} else if (c == 0xFE51) {
341+
s = 0xA14E;
342+
} else if (c == 0xFE68) {
343+
s = 0xA242;
344+
} else if (c == 0xFF3C) {
345+
s = 0xA240;
346+
} else if (c == 0xFF5E) {
347+
s = 0xA1E3;
348+
} else if (c == 0xFF64) {
349+
s = 0;
350+
} else if (c == 0xFFE0) {
351+
s = 0xA246;
352+
} else if (c == 0xFFE1) {
353+
s = 0xA247;
354+
} else if (c == 0xFFE3) {
355+
s = 0xA1C3;
356+
} else if (c == 0xFF0F) {
357+
s = 0xA1FE;
284358
}
285359
}
286360

287361
if (s <= 0) {
288-
c1 = c & ~MBFL_WCSPLANE_MASK;
289-
if (c1 == MBFL_WCSPLANE_BIG5) {
290-
s = c & MBFL_WCSPLANE_MASK;
291-
}
292362
if (c == 0) {
293363
s = 0;
294-
} else if (s <= 0) {
364+
} else {
295365
s = -1;
296366
}
297367
}
368+
298369
if (s >= 0) {
299-
if (s <= 0x80 || s == 0xff) { /* latin */
370+
if (s <= 0x80) { /* latin */
300371
CK((*filter->output_function)(s, filter->data));
301372
} else {
302373
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));

0 commit comments

Comments
 (0)