@@ -127,9 +127,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
127
127
CK ((* filter -> output_function )(s , filter -> data ));
128
128
} else {
129
129
CK (mbfl_filt_put_invalid_char (filter ));
130
- if (c < 0x80 || (c >= 0xc2 && c <= 0xf4 )) {
131
- goto retry ;
132
- }
130
+ goto retry ;
133
131
}
134
132
break ;
135
133
case 0x20 : /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -144,9 +142,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
144
142
filter -> status ++ ;
145
143
} else {
146
144
CK (mbfl_filt_put_invalid_char (filter ));
147
- if (c < 0x80 || (c >= 0xc2 && c <= 0xf4 )) {
148
- goto retry ;
149
- }
145
+ goto retry ;
150
146
}
151
147
break ;
152
148
case 0x30 : /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -161,9 +157,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
161
157
filter -> status ++ ;
162
158
} else {
163
159
CK (mbfl_filt_put_invalid_char (filter ));
164
- if (c < 0x80 || (c >= 0xc2 && c <= 0xf4 )) {
165
- goto retry ;
166
- }
160
+ goto retry ;
167
161
}
168
162
break ;
169
163
case 0x31 : /* 4byte code 3rd char: 0x80-0xbf */
@@ -172,9 +166,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
172
166
filter -> status ++ ;
173
167
} else {
174
168
CK (mbfl_filt_put_invalid_char (filter ));
175
- if (c < 0x80 || (c >= 0xc2 && c <= 0xf4 )) {
176
- goto retry ;
177
- }
169
+ goto retry ;
178
170
}
179
171
break ;
180
172
@@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
237
229
unsigned char c2 = * p ++ ;
238
230
if ((c2 & 0xC0 ) != 0x80 ) {
239
231
* out ++ = MBFL_BAD_INPUT ;
240
- if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4 )) {
241
- p -- ;
242
- }
232
+ p -- ;
243
233
} else {
244
234
* out ++ = ((c & 0x1F ) << 6 ) | (c2 & 0x3F );
245
235
}
@@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
252
242
unsigned char c3 = * p ++ ;
253
243
if ((c2 & 0xC0 ) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF ) && ((c == 0xE0 && c2 >= 0xA0 ) || (c == 0xED && c2 < 0xA0 ) || (c > 0xE0 && c != 0xED )))) {
254
244
* out ++ = MBFL_BAD_INPUT ;
255
- if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4 )) {
256
- p -= 2 ;
257
- } else {
258
- p -- ;
259
- }
245
+ p -= 2 ;
260
246
} else if ((c3 & 0xC0 ) != 0x80 ) {
261
247
* out ++ = MBFL_BAD_INPUT ;
262
- if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4 )) {
263
- p -- ;
264
- }
248
+ p -- ;
265
249
} else {
266
250
uint32_t decoded = ((c & 0xF ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
267
- if (decoded >= 0xD800 && decoded <= 0xDFFF ) {
251
+ if (decoded < 0x800 || ( decoded >= 0xD800 && decoded <= 0xDFFF ) ) {
268
252
* out ++ = MBFL_BAD_INPUT ;
269
253
} else {
270
- * out ++ = ( decoded < 0x800 ) ? MBFL_BAD_INPUT : decoded ;
254
+ * out ++ = decoded ;
271
255
}
272
256
}
273
257
} else {
274
258
* out ++ = MBFL_BAD_INPUT ;
275
- /* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
276
- while (p < e ) {
277
- c = * p ;
278
- if ((c & 0xC0 ) != 0x80 ) {
279
- if (c >= 0x80 && (c < 0xC2 || c > 0xF4 ))
280
- p ++ ;
281
- break ;
282
- }
259
+ while (p < e && (* p & 0xC0 ) == 0x80 ) {
283
260
p ++ ;
284
261
}
285
262
}
@@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
288
265
unsigned char c2 = * p ++ ;
289
266
unsigned char c3 = * p ++ ;
290
267
unsigned char c4 = * p ++ ;
291
- if ((c2 & 0xC0 ) != 0x80 ) {
268
+ /* If c == 0xF0 and c2 < 0x90, then this is an over-long encoding; the codepoint
269
+ * could have been encoded in 3 bytes or fewer. If c == 0xF4 and c2 >= 0x90, then
270
+ * this codepoint is greater than U+10FFFF, which is the highest legal codepoint */
271
+ if ((c2 & 0xC0 ) != 0x80 || (c == 0xF0 && c2 < 0x90 ) || (c == 0xF4 && c2 >= 0x90 )) {
292
272
* out ++ = MBFL_BAD_INPUT ;
293
- if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4 )) {
294
- p -= 3 ;
295
- } else {
296
- p -= 2 ;
297
- }
298
- } else if ((c3 & 0xC0 ) != 0x80 || !((c == 0xF0 && c2 >= 0x90 ) || (c == 0xF4 && c2 < 0x90 ) || (c > 0xF0 && c < 0xF4 ))) {
273
+ p -= 3 ;
274
+ } else if ((c3 & 0xC0 ) != 0x80 ) {
299
275
* out ++ = MBFL_BAD_INPUT ;
300
- if (!((c == 0xF0 && c2 >= 0x90 ) || (c == 0xF4 && c2 < 0x90 ) || (c > 0xF0 && c < 0xF4 ))) {
301
- if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4 )) {
302
- p -= 2 ;
303
- } else {
304
- p -= 3 ;
305
- }
306
- } else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4 )) {
307
- p -= 2 ;
308
- } else {
309
- p -- ;
310
- }
276
+ p -= 2 ;
311
277
} else if ((c4 & 0xC0 ) != 0x80 ) {
312
278
* out ++ = MBFL_BAD_INPUT ;
313
- if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4 )) {
314
- p -- ;
315
- }
279
+ p -- ;
316
280
} else {
317
281
uint32_t decoded = ((c & 0x7 ) << 18 ) | ((c2 & 0x3F ) << 12 ) | ((c3 & 0x3F ) << 6 ) | (c4 & 0x3F );
318
282
* out ++ = (decoded < 0x10000 ) ? MBFL_BAD_INPUT : decoded ;
319
283
}
320
284
} else {
321
285
* out ++ = MBFL_BAD_INPUT ;
322
- /* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
323
286
if (p < e ) {
324
287
unsigned char c2 = * p ;
325
- if (!((c == 0xF0 && c2 >= 0x90 ) || (c == 0xF4 && c2 < 0x90 ) || (c > 0xF0 && c < 0xF4 ))) {
326
- if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4 ))
327
- p ++ ;
328
- } else {
329
- while (p < e ) {
330
- c = * p ;
331
- if ((c & 0xC0 ) != 0x80 ) {
332
- if (c >= 0x80 && (c < 0xC2 || c > 0xF4 ))
333
- p ++ ;
334
- break ;
335
- }
288
+ if ((c == 0xF0 && c2 >= 0x90 ) || (c == 0xF4 && c2 < 0x90 ) || c == 0xF2 || c == 0xF3 ) {
289
+ while (p < e && (* p & 0xC0 ) == 0x80 ) {
336
290
p ++ ;
337
291
}
338
292
}
0 commit comments