Skip to content

Commit 7047e5d

Browse files
committed
Add identify filter for UTF-32{,BE,LE}
1 parent d8895cd commit 7047e5d

File tree

3 files changed

+153
-0
lines changed

3 files changed

+153
-0
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf32.c

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
#include "mbfilter.h"
3131
#include "mbfilter_utf32.h"
3232

33+
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
34+
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
35+
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
36+
3337
/* NULL-terminated list of alternate names for the UTF-32 encoding. */
static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
3438

3539
const mbfl_encoding mbfl_encoding_utf32 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
6569
&vtbl_wchar_utf32le
6670
};
6771

72+
/* Identify-filter table entry for UTF-32 with no explicit byte order.
 * Groups the encoding id, the constructor shared by the UTF-32 identify
 * filters, and mbfl_filt_ident_utf32, which looks for a byte-order mark
 * before handing over to the big- or little-endian checker. */
const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
73+
mbfl_no_encoding_utf32,
74+
mbfl_filt_ident_common_ctor,
75+
mbfl_filt_ident_utf32
76+
};
77+
78+
/* Identify-filter table entry for big-endian UTF-32.
 * Groups the encoding id, the constructor shared by the UTF-32 identify
 * filters, and the per-byte checker mbfl_filt_ident_utf32be. */
const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
79+
mbfl_no_encoding_utf32be,
80+
mbfl_filt_ident_common_ctor,
81+
mbfl_filt_ident_utf32be
82+
};
83+
84+
/* Identify-filter table entry for little-endian UTF-32.
 * Groups the encoding id, the constructor shared by the UTF-32 identify
 * filters, and the per-byte checker mbfl_filt_ident_utf32le. */
const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
85+
mbfl_no_encoding_utf32le,
86+
mbfl_filt_ident_common_ctor,
87+
mbfl_filt_ident_utf32le
88+
};
89+
6890
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
6991
mbfl_no_encoding_utf32,
7092
mbfl_no_encoding_wchar,
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
289311

290312
return c;
291313
}
314+
315+
/* Identify filter for UTF-32 with no explicit byte order.
 *
 * Called once per input byte `c`. While the input still matches the
 * little-endian byte-order mark (FF FE 00 00), filter->status counts how many
 * BOM bytes have been seen and the bytes are swallowed. Once all four match,
 * the filter switches itself to the little-endian checker.
 *
 * On the first mismatch the filter switches to the big-endian checker, replays
 * the swallowed BOM bytes into it, then feeds it the current byte. The
 * big-endian checker also covers a big-endian BOM (00 00 FE FF), since that is
 * just the valid codepoint U+FEFF.
 *
 * Returns `c` in every case. */
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
{
	static const int utf32le_bom[4] = {0xff, 0xfe, 0x00, 0x00};
	int matched = filter->status; /* number of BOM bytes matched so far */
	int i;

	if (matched >= 0 && matched < 4) {
		if (c == utf32le_bom[matched]) {
			if (matched == 3) {
				/* We found a little-endian byte-order mark! */
				filter->status = 0;
				filter->filter_function = mbfl_filt_ident_utf32le;
			} else {
				filter->status = matched + 1;
			}
			return c;
		}
	} else {
		/* Unexpected state: nothing was swallowed, so nothing to replay.
		 * Falling through also avoids calling ourselves recursively. */
		matched = 0;
	}

	/* Not a little-endian BOM, so treat the stream as big-endian.
	 * The status must go back to 0 before replaying: it was counting BOM
	 * bytes, and the big-endian checker uses the same field as its position
	 * inside the current 4-byte unit. Without the reset the replayed bytes
	 * would be interpreted at the wrong position. */
	filter->filter_function = mbfl_filt_ident_utf32be;
	filter->status = 0;
	for (i = 0; i < matched; i++) {
		(filter->filter_function)(utf32le_bom[i], filter);
	}

	return (filter->filter_function)(c, filter);
}
364+
365+
/* Identify filter for little-endian UTF-32.
 *
 * Called once per input byte `c`; filter->status tracks the position inside
 * the current 4-byte unit (least significant byte first). filter->flag is set
 * to 1 when the unit cannot be a valid codepoint: a value above 0x10FFFF, or
 * a surrogate (0xD800..0xDFFF).
 *
 * Returns `c` in every case. */
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
{
	switch (filter->status) {
	case 0: /* 1st byte (bits 0-7): any value is acceptable */
		filter->status = 1;
		break;

	case 1: /* 2nd byte (bits 8-15) */
		if (c >= 0xD8 && c <= 0xDF) {
			filter->status = 4; /* might be surrogate if we are in BMP */
		} else {
			filter->status = 2;
		}
		break;

	case 2: /* 3rd byte (bits 16-23) */
		if (c > 0x10) {
			filter->flag = 1; /* too big */
		}
		filter->status = 3;
		break;

	case 3: /* 4th byte (bits 24-31): must be zero */
		if (c) {
			filter->flag = 1; /* too big */
		}
		filter->status = 0;
		break;

	case 4: /* 3rd byte, previous byte looked like surrogate */
		if (c == 0) {
			filter->flag = 1; /* yep, it's a surrogate */
		} else if (c > 0x10) {
			/* The range limit applies on this path too; e.g. the bytes
			 * 00 D8 11 00 encode 0x11D800, which is above 0x10FFFF. */
			filter->flag = 1; /* too big */
		}
		filter->status = 3;
		break;

	default:
		break;
	}
	return c;
}
402+
403+
/* Identify filter for big-endian UTF-32.
 *
 * Consumes one byte per call. filter->status records which byte of the
 * current 4-byte unit is expected next (most significant byte first), with
 * state 4 standing in for "3rd byte of a unit outside the BMP". filter->flag
 * is raised when the unit would exceed 0x10FFFF or fall in the surrogate
 * range. The input byte is always returned unchanged. */
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
{
	const int position = filter->status;

	if (position == 0) {
		/* Top byte: anything other than zero is beyond 0x10FFFF */
		if (c != 0) {
			filter->flag = 1;
		}
		filter->status = 1;
	} else if (position == 1) {
		/* Second byte: selects the plane, which may be at most 0x10 */
		if (c > 0x10) {
			filter->flag = 1;
		}
		/* Plane 0 (the BMP) still needs the surrogate check on the next byte;
		 * any other plane skips it */
		filter->status = (c != 0) ? 4 : 2;
	} else if (position == 2) {
		/* Third byte within the BMP: 0xD8..0xDF is reserved for surrogates */
		if (!(c < 0xD8 || c > 0xDF)) {
			filter->flag = 1;
		}
		filter->status = 3;
	} else if (position == 3) {
		/* Last byte: no constraint, start over with the next unit */
		filter->status = 0;
	} else if (position == 4) {
		/* Third byte outside the BMP: no constraint */
		filter->status = 3;
	}

	return c;
}

ext/mbstring/libmbfl/filters/mbfilter_utf32.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
extern const mbfl_encoding mbfl_encoding_utf32;
3434
extern const mbfl_encoding mbfl_encoding_utf32be;
3535
extern const mbfl_encoding mbfl_encoding_utf32le;
36+
extern const struct mbfl_identify_vtbl vtbl_identify_utf32;
37+
extern const struct mbfl_identify_vtbl vtbl_identify_utf32be;
38+
extern const struct mbfl_identify_vtbl vtbl_identify_utf32le;
3639
extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
3740
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
3841
extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;

ext/mbstring/libmbfl/mbfl/mbfl_ident.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
169169
&vtbl_identify_ucs2,
170170
&vtbl_identify_ucs2be,
171171
&vtbl_identify_ucs2le,
172+
&vtbl_identify_utf32,
173+
&vtbl_identify_utf32be,
174+
&vtbl_identify_utf32le,
172175
&vtbl_identify_false,
173176
NULL
174177
};

0 commit comments

Comments
 (0)