Skip to content

Commit dcaa010

Browse files
committed
Strict validation of conversion flags to mb_convert_kana
mb_convert_kana is controlled by user-provided flags, which specify what it should convert and to what. These flags come in inverse pairs, for example "fullwidth numerals to halfwidth numerals" and "halfwidth numerals to fullwidth numerals". It does not make sense to combine inverse flags. But, clever reader of commit logs, you will surely say: What if I want all my halfwidth numerals to become fullwidth, and all my fullwidth numerals to become halfwidth? Much too clever, you are! Let's put aside the fact that this bizarre switch-up is ridiculous and will never be used, and face up to another stark reality: mb_convert_kana does not work for that case, and never has. This was probably never noticed because nobody ever tried. Disallowing useless combinations of flags gives freedom to rearrange the kana conversion code without changing behavior. We can also reject unrecognized flags. This may help users to catch bugs. Interestingly, the existing tests used a 'Z' flag, which is useless (it's not recognized at all).
1 parent f6ac832 commit dcaa010

File tree

5 files changed

+214
-121
lines changed

5 files changed

+214
-121
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
322322

323323
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
324324
{
325-
int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
325+
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
326326
bool consumed = false;
327327

328328
if (filter->cache) {
@@ -342,7 +342,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
342342

343343
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
344344
{
345-
int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
345+
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
346346

347347
if (filter->cache) {
348348
int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode);

ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,21 @@
4646
*/
4747
int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
4848
{
49-
if ((mode & MBFL_FILT_TL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
49+
if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
5050
return c + 0xfee0;
51-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_ALPHA) && ((c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a))) { /* alphabetic */
51+
} else if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
5252
return c + 0xfee0;
53-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_NUMERIC) && c >= 0x30 && c <= 0x39) { /* num */
53+
} else if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
5454
return c + 0xfee0;
55-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_SPACE) && c == 0x20) { /* space */
55+
} else if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
5656
return 0x3000;
5757
}
5858

59-
if (mode & (MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_HIRAGANA)) {
59+
if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
6060
/* Convert Hankaku kana to Zenkaku kana
6161
* Either all Hankaku kana (including katakana and hiragana) will be converted
6262
* to Zenkaku katakana, or to Zenkaku hiragana */
63-
if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
63+
if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
6464
if (c >= 0xff61 && c <= 0xff9f) {
6565
int n = c - 0xff60;
6666
if (next >= 0xff61 && next <= 0xff9f) {
@@ -78,7 +78,7 @@ int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
7878

7979
return 0x3000 + hankana2zenkana_table[n];
8080
}
81-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
81+
} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
8282
if (c >= 0xff61 && c <= 0xff9f) {
8383
int n = c - 0xff60;
8484
if (next >= 0xff61 && next <= 0xff9f) {
@@ -93,14 +93,14 @@ int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
9393

9494
return 0x3000 + hankana2zenhira_table[n];
9595
}
96-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
96+
} else if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
9797
return 0x3000 + hankana2zenkana_table[c - 0xff60];
98-
} else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
98+
} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
9999
return 0x3000 + hankana2zenhira_table[c - 0xff60];
100100
}
101101
}
102102

103-
if (mode & MBFL_FILT_TL_HAN2ZEN_COMPAT1) { /* special ascii to symbol */
103+
if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
104104
if (c == 0x5c) {
105105
return 0xffe5; /* FULLWIDTH YEN SIGN */
106106
} else if (c == 0xa5) { /* YEN SIGN */
@@ -116,32 +116,32 @@ int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
116116
}
117117
}
118118

119-
if (mode & (MBFL_FILT_TL_ZEN2HAN_ALL | MBFL_FILT_TL_ZEN2HAN_ALPHA | MBFL_FILT_TL_ZEN2HAN_NUMERIC | MBFL_FILT_TL_ZEN2HAN_SPACE)) {
119+
if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
120120
/* Zenkaku to Hankaku */
121-
if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c!= 0xff3c) {
121+
if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c != 0xff3c) {
122122
/* all except " ' \ ~ */
123123
return c - 0xfee0;
124-
} else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
124+
} else if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
125125
return c - 0xfee0;
126-
} else if ((mode & MBFL_FILT_TL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
126+
} else if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
127127
return c - 0xfee0;
128-
} else if ((mode & MBFL_FILT_TL_ZEN2HAN_SPACE) && (c == 0x3000)) {
128+
} else if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
129129
return 0x20;
130-
} else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
130+
} else if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
131131
return 0x2d;
132132
}
133133
}
134134

135-
if (mode & (MBFL_FILT_TL_ZEN2HAN_KATAKANA | MBFL_FILT_TL_ZEN2HAN_HIRAGANA)) {
135+
if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
136136
/* Zenkaku kana to hankaku kana */
137-
if ((mode & MBFL_FILT_TL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
137+
if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
138138
/* Zenkaku katakana to hankaku kana */
139139
int n = c - 0x30a1;
140140
if (zenkana2hankana_table[n][1]) {
141141
*second = 0xff00 + zenkana2hankana_table[n][1];
142142
}
143143
return 0xff00 + zenkana2hankana_table[n][0];
144-
} else if ((mode & MBFL_FILT_TL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
144+
} else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
145145
/* Zenkaku hiragana to hankaku kana */
146146
int n = c - 0x3041;
147147
if (zenkana2hankana_table[n][1]) {
@@ -165,17 +165,17 @@ int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
165165
} else if (c == 0x30fb) {
166166
return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
167167
}
168-
} else if (mode & (MBFL_FILT_TL_ZENKAKU_HIRA2KANA | MBFL_FILT_TL_ZENKAKU_KANA2HIRA)) {
169-
if ((mode & MBFL_FILT_TL_ZENKAKU_HIRA2KANA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
168+
} else if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
169+
if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
170170
/* Zenkaku hiragana to Zenkaku katakana */
171171
return c + 0x60;
172-
} else if ((mode & MBFL_FILT_TL_ZENKAKU_KANA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
172+
} else if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
173173
/* Zenkaku katakana to Zenkaku hiragana */
174174
return c - 0x60;
175175
}
176176
}
177177

178-
if (mode & MBFL_FILT_TL_ZEN2HAN_COMPAT1) { /* special symbol to ascii */
178+
if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
179179
if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */
180180
return 0x5c;
181181
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */

ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,23 +29,25 @@
2929

3030
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
3131
* This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
32-
#define MBFL_FILT_TL_HAN2ZEN_ALL 0x00000001
33-
#define MBFL_FILT_TL_HAN2ZEN_ALPHA 0x00000002
34-
#define MBFL_FILT_TL_HAN2ZEN_NUMERIC 0x00000004
35-
#define MBFL_FILT_TL_HAN2ZEN_SPACE 0x00000008
36-
#define MBFL_FILT_TL_ZEN2HAN_ALL 0x00000010
37-
#define MBFL_FILT_TL_ZEN2HAN_ALPHA 0x00000020
38-
#define MBFL_FILT_TL_ZEN2HAN_NUMERIC 0x00000040
39-
#define MBFL_FILT_TL_ZEN2HAN_SPACE 0x00000080
40-
#define MBFL_FILT_TL_HAN2ZEN_KATAKANA 0x00000100
41-
#define MBFL_FILT_TL_HAN2ZEN_HIRAGANA 0x00000200
42-
#define MBFL_FILT_TL_HAN2ZEN_GLUE 0x00000800
43-
#define MBFL_FILT_TL_ZEN2HAN_KATAKANA 0x00001000
44-
#define MBFL_FILT_TL_ZEN2HAN_HIRAGANA 0x00002000
45-
#define MBFL_FILT_TL_ZENKAKU_HIRA2KANA 0x00010000
46-
#define MBFL_FILT_TL_ZENKAKU_KANA2HIRA 0x00020000
47-
#define MBFL_FILT_TL_HAN2ZEN_COMPAT1 0x00100000
48-
#define MBFL_FILT_TL_ZEN2HAN_COMPAT1 0x00200000
32+
#define MBFL_HAN2ZEN_ALL 0x00001
33+
#define MBFL_HAN2ZEN_ALPHA 0x00002
34+
#define MBFL_HAN2ZEN_NUMERIC 0x00004
35+
#define MBFL_HAN2ZEN_SPACE 0x00008
36+
#define MBFL_HAN2ZEN_KATAKANA 0x00010
37+
#define MBFL_HAN2ZEN_HIRAGANA 0x00020
38+
#define MBFL_HAN2ZEN_SPECIAL 0x00040
39+
#define MBFL_ZENKAKU_HIRA2KATA 0x00080
40+
41+
#define MBFL_ZEN2HAN_ALL 0x00100
42+
#define MBFL_ZEN2HAN_ALPHA 0x00200
43+
#define MBFL_ZEN2HAN_NUMERIC 0x00400
44+
#define MBFL_ZEN2HAN_SPACE 0x00800
45+
#define MBFL_ZEN2HAN_KATAKANA 0x01000
46+
#define MBFL_ZEN2HAN_HIRAGANA 0x02000
47+
#define MBFL_ZEN2HAN_SPECIAL 0x04000
48+
#define MBFL_ZENKAKU_KATA2HIRA 0x08000
49+
50+
#define MBFL_HAN2ZEN_GLUE 0x10000
4951

5052
extern const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208;
5153

ext/mbstring/mbstring.c

Lines changed: 71 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2848,6 +2848,12 @@ PHP_FUNCTION(mb_decode_mimeheader)
28482848
}
28492849
/* }}} */
28502850

2851+
char mb_convert_kana_flags[17] = {
2852+
'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
2853+
'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
2854+
'V'
2855+
};
2856+
28512857
/* {{{ Conversion between full-width character and half-width character (Japanese) */
28522858
PHP_FUNCTION(mb_convert_kana)
28532859
{
@@ -2866,70 +2872,78 @@ PHP_FUNCTION(mb_convert_kana)
28662872

28672873
string.val = (unsigned char*)string_val;
28682874

2869-
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
2870-
* This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
28712875
if (optstr != NULL) {
28722876
char *p = optstr, *e = p + optstr_len;
28732877
opt = 0;
2878+
next_option:
28742879
while (p < e) {
2875-
switch (*p++) {
2876-
case 'A':
2877-
opt |= MBFL_FILT_TL_HAN2ZEN_ALL;
2878-
break;
2879-
case 'a':
2880-
opt |= MBFL_FILT_TL_ZEN2HAN_ALL;
2881-
break;
2882-
case 'R':
2883-
opt |= MBFL_FILT_TL_HAN2ZEN_ALPHA;
2884-
break;
2885-
case 'r':
2886-
opt |= MBFL_FILT_TL_ZEN2HAN_ALPHA;
2887-
break;
2888-
case 'N':
2889-
opt |= MBFL_FILT_TL_HAN2ZEN_NUMERIC;
2890-
break;
2891-
case 'n':
2892-
opt |= MBFL_FILT_TL_ZEN2HAN_NUMERIC;
2893-
break;
2894-
case 'S':
2895-
opt |= MBFL_FILT_TL_HAN2ZEN_SPACE;
2896-
break;
2897-
case 's':
2898-
opt |= MBFL_FILT_TL_ZEN2HAN_SPACE;
2899-
break;
2900-
case 'K':
2901-
opt |= MBFL_FILT_TL_HAN2ZEN_KATAKANA;
2902-
break;
2903-
case 'k':
2904-
opt |= MBFL_FILT_TL_ZEN2HAN_KATAKANA;
2905-
break;
2906-
case 'H':
2907-
opt |= MBFL_FILT_TL_HAN2ZEN_HIRAGANA;
2908-
break;
2909-
case 'h':
2910-
opt |= MBFL_FILT_TL_ZEN2HAN_HIRAGANA;
2911-
break;
2912-
case 'V':
2913-
opt |= MBFL_FILT_TL_HAN2ZEN_GLUE;
2914-
break;
2915-
case 'C':
2916-
opt |= MBFL_FILT_TL_ZENKAKU_HIRA2KANA;
2917-
break;
2918-
case 'c':
2919-
opt |= MBFL_FILT_TL_ZENKAKU_KANA2HIRA;
2920-
break;
2921-
case 'M':
2922-
/* TODO: figure out what 'M' and 'm' are for, and rename the constant
2923-
* to something meaningful */
2924-
opt |= MBFL_FILT_TL_HAN2ZEN_COMPAT1;
2925-
break;
2926-
case 'm':
2927-
opt |= MBFL_FILT_TL_ZEN2HAN_COMPAT1;
2928-
break;
2880+
/* Walk through option string and convert to bit vector
2881+
* See mbfilter_tl_jisx0201_jisx0208.h for the values used */
2882+
char c = *p++;
2883+
if (c == 'A') {
2884+
opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
2885+
} else if (c == 'a') {
2886+
opt |= MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC;
2887+
} else {
2888+
for (int i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
2889+
if (c == mb_convert_kana_flags[i]) {
2890+
opt |= (1 << i);
2891+
goto next_option;
2892+
}
2893+
}
2894+
2895+
zend_argument_value_error(2, "contains invalid flag: '%c'", c);
2896+
RETURN_THROWS();
2897+
}
2898+
}
2899+
2900+
/* Check for illegal combinations of options */
2901+
if (((opt & 0xFF00) >> 8) & opt) {
2902+
/* It doesn't make sense to convert the same type of characters from halfwidth to
2903+
* fullwidth and then back to halfwidth again. Neither does it make sense to convert
2904+
* FW hiragana to FW katakana and then back again. */
2905+
int badflag = ((opt & 0xFF00) >> 8) & opt, i;
2906+
for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
2907+
char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
2908+
if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
2909+
flag1 = 'A';
2910+
if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
2911+
flag2 = 'a';
2912+
zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
2913+
RETURN_THROWS();
2914+
}
2915+
2916+
if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
2917+
/* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
2918+
zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
2919+
RETURN_THROWS();
2920+
}
2921+
2922+
/* We can convert FW hiragana to HW kana, or FW katakana to HW kana,
2923+
* or FW hiragana to FW katakana, or FW katakana to FW hiragana, but not
2924+
* more than one of these */
2925+
if (opt & MBFL_ZEN2HAN_HIRAGANA) {
2926+
if (opt & MBFL_ZEN2HAN_KATAKANA) {
2927+
zend_argument_value_error(2, "must not combine 'h' and 'k' flags");
2928+
RETURN_THROWS();
2929+
} else if (opt & MBFL_ZENKAKU_HIRA2KATA) {
2930+
zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
2931+
RETURN_THROWS();
2932+
} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
2933+
zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
2934+
RETURN_THROWS();
2935+
}
2936+
} else if (opt & MBFL_ZEN2HAN_KATAKANA) {
2937+
if (opt & MBFL_ZENKAKU_HIRA2KATA) {
2938+
zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
2939+
RETURN_THROWS();
2940+
} else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
2941+
zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
2942+
RETURN_THROWS();
29292943
}
29302944
}
29312945
} else {
2932-
opt = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
2946+
opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
29332947
}
29342948

29352949
/* encoding */

0 commit comments

Comments
 (0)