|
30 | 30 | #include "mbfilter.h"
|
31 | 31 | #include "mbfilter_utf32.h"
|
32 | 32 |
|
| 33 | +static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter); |
| 34 | +static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter); |
| 35 | +static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter); |
| 36 | + |
/* Alias table for the UTF-32 encoding: a single entry, "utf32", followed by a NULL terminator. */
33 | 37 | static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
|
34 | 38 |
|
35 | 39 | const mbfl_encoding mbfl_encoding_utf32 = {
|
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
|
65 | 69 | &vtbl_wchar_utf32le
|
66 | 70 | };
|
67 | 71 |
|
/* Identify vtable for UTF-32 with no declared byte order: groups the
 * mbfl_no_encoding_utf32 id, the shared mbfl_filt_ident_common_ctor and the
 * mbfl_filt_ident_utf32 filter (which looks for a byte-order mark first).
 * NOTE(review): members are positional; their field names are not visible
 * here - confirm against struct mbfl_identify_vtbl. */
| 72 | +const struct mbfl_identify_vtbl vtbl_identify_utf32 = { |
| 73 | + mbfl_no_encoding_utf32, |
| 74 | + mbfl_filt_ident_common_ctor, |
| 75 | + mbfl_filt_ident_utf32 |
| 76 | +}; |
| 77 | + |
/* Identify vtable for big-endian UTF-32: groups the mbfl_no_encoding_utf32be
 * id, the shared mbfl_filt_ident_common_ctor and the mbfl_filt_ident_utf32be
 * filter.
 * NOTE(review): members are positional; their field names are not visible
 * here - confirm against struct mbfl_identify_vtbl. */
| 78 | +const struct mbfl_identify_vtbl vtbl_identify_utf32be = { |
| 79 | + mbfl_no_encoding_utf32be, |
| 80 | + mbfl_filt_ident_common_ctor, |
| 81 | + mbfl_filt_ident_utf32be |
| 82 | +}; |
| 83 | + |
/* Identify vtable for little-endian UTF-32: groups the
 * mbfl_no_encoding_utf32le id, the shared mbfl_filt_ident_common_ctor and the
 * mbfl_filt_ident_utf32le filter.
 * NOTE(review): members are positional; their field names are not visible
 * here - confirm against struct mbfl_identify_vtbl. */
| 84 | +const struct mbfl_identify_vtbl vtbl_identify_utf32le = { |
| 85 | + mbfl_no_encoding_utf32le, |
| 86 | + mbfl_filt_ident_common_ctor, |
| 87 | + mbfl_filt_ident_utf32le |
| 88 | +}; |
| 89 | + |
68 | 90 | const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
|
69 | 91 | mbfl_no_encoding_utf32,
|
70 | 92 | mbfl_no_encoding_wchar,
|
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
|
289 | 311 |
|
290 | 312 | return c;
|
291 | 313 | }
|
| 314 | + |
| 315 | +static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter) |
| 316 | +{ |
| 317 | + /* The largest valid codepoint is 0x10FFFF; we don't want values above that |
| 318 | + * Neither do we want to see surrogates |
| 319 | + * For UTF-32 (not LE or BE), we do also need to look for a byte-order mark */ |
| 320 | + switch (filter->status) { |
| 321 | + case 0: /* 1st byte */ |
| 322 | + if (c == 0xff) { |
| 323 | + filter->status = 1; |
| 324 | + return c; |
| 325 | + } |
| 326 | + filter->filter_function = mbfl_filt_ident_utf32be; |
| 327 | + break; |
| 328 | + |
| 329 | + case 1: /* 2nd byte */ |
| 330 | + if (c == 0xfe) { |
| 331 | + filter->status = 2; |
| 332 | + return c; |
| 333 | + } |
| 334 | + filter->filter_function = mbfl_filt_ident_utf32be; |
| 335 | + (filter->filter_function)(0xff, filter); |
| 336 | + break; |
| 337 | + |
| 338 | + case 2: /* 3rd byte */ |
| 339 | + if (c == 0) { |
| 340 | + filter->status = 3; |
| 341 | + return c; |
| 342 | + } |
| 343 | + filter->filter_function = mbfl_filt_ident_utf32be; |
| 344 | + (filter->filter_function)(0xff, filter); |
| 345 | + (filter->filter_function)(0xfe, filter); |
| 346 | + break; |
| 347 | + |
| 348 | + case 3: /* 4th byte */ |
| 349 | + if (c == 0) { |
| 350 | + /* We found a little-endian byte-order mark! */ |
| 351 | + filter->status = 0; |
| 352 | + filter->filter_function = mbfl_filt_ident_utf32le; |
| 353 | + return c; |
| 354 | + } |
| 355 | + filter->filter_function = mbfl_filt_ident_utf32be; |
| 356 | + (filter->filter_function)(0xff, filter); |
| 357 | + (filter->filter_function)(0xfe, filter); |
| 358 | + (filter->filter_function)(0, filter); |
| 359 | + break; |
| 360 | + } |
| 361 | + |
| 362 | + return (filter->filter_function)(c, filter); |
| 363 | +} |
| 364 | + |
| 365 | +static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter) |
| 366 | +{ |
| 367 | + switch (filter->status) { |
| 368 | + case 0: /* 1st byte */ |
| 369 | + filter->status = 1; |
| 370 | + break; |
| 371 | + |
| 372 | + case 1: /* 2nd byte */ |
| 373 | + if (c >= 0xD8 && c <= 0xDF) { |
| 374 | + filter->status = 4; /* might be surrogate if we are in BMP */ |
| 375 | + } else { |
| 376 | + filter->status = 2; |
| 377 | + } |
| 378 | + break; |
| 379 | + |
| 380 | + case 2: /* 3rd byte */ |
| 381 | + if (c > 0x10) { |
| 382 | + filter->flag = 1; /* too big */ |
| 383 | + } |
| 384 | + filter->status = 3; |
| 385 | + break; |
| 386 | + |
| 387 | + case 3: /* 4th byte */ |
| 388 | + if (c) { |
| 389 | + filter->flag = 1; /* too big */ |
| 390 | + } |
| 391 | + filter->status = 0; |
| 392 | + break; |
| 393 | + |
| 394 | + case 4: /* 3rd byte, previous byte looked like surrogate */ |
| 395 | + if (!c) { |
| 396 | + filter->flag = 1; /* yep, it's a surrogate */ |
| 397 | + } |
| 398 | + filter->status = 3; |
| 399 | + } |
| 400 | + return c; |
| 401 | +} |
| 402 | + |
| 403 | +static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter) |
| 404 | +{ |
| 405 | + switch (filter->status) { |
| 406 | + case 0: /* 1st byte */ |
| 407 | + if (c) { |
| 408 | + filter->flag = 1; /* too big */ |
| 409 | + } |
| 410 | + filter->status = 1; |
| 411 | + break; |
| 412 | + |
| 413 | + case 1: /* 2nd byte */ |
| 414 | + if (c > 0x10) { |
| 415 | + filter->flag = 1; /* too big */ |
| 416 | + } if (c) { |
| 417 | + filter->status = 4; /* not in the BMP */ |
| 418 | + } else { |
| 419 | + filter->status = 2; |
| 420 | + } |
| 421 | + break; |
| 422 | + |
| 423 | + case 2: /* 3rd byte */ |
| 424 | + if (c >= 0xD8 && c <= 0xDF) { |
| 425 | + filter->flag = 1; /* reserved range for surrogates */ |
| 426 | + } |
| 427 | + filter->status = 3; |
| 428 | + break; |
| 429 | + |
| 430 | + case 3: /* 4th byte */ |
| 431 | + filter->status = 0; |
| 432 | + break; |
| 433 | + |
| 434 | + case 4: /* 3rd byte, not in BMP */ |
| 435 | + filter->status = 3; |
| 436 | + } |
| 437 | + return c; |
| 438 | +} |
0 commit comments