Skip to content

Commit 34ece40

Browse files
committed
Remove useless mbstring encoding 'JIS-ms'
Microsoft invented three encodings very similar to ISO-2022-JP/JIS7/JIS8, called CP50220, CP50221, and CP50222. All three are supported by mbstring. Since these encodings are very similar, some code can be shared. Actually, conversion of CP50220/1/2 to Unicode is exactly the same operation; it's when converting from Unicode to CP50220/1/2 that some small differences arise in how certain katakana are handled. The most important common code was a function called `mbfl_filt_wchar_jis_ms`. The `jis_ms` part doubtless refers to the fact that these encodings are modified versions of 'JIS' invented by 'MS'. mbstring also went a step further and exported 'JIS-ms' to userland as a separate encoding from CP50220/1/2. If users requested 'JIS-ms' conversion, they got something like CP50220/1/2, minus their special ways of handling half-width katakana when converting from Unicode. But... that 'encoding' is not something which actually exists in the world outside of mbstring. CP50220/1/2 do exist in Microsoft software, but not 'JIS-ms'. For a text encoding conversion library, inventing new variant encodings and implementing them is not very productive. Our interest is in handling text encodings which real people actually use for... you know, storing actual text and things like that.
1 parent fcbe45d commit 34ece40

File tree

4 files changed

+13
-198
lines changed

4 files changed

+13
-198
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

Lines changed: 12 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,12 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
4545
* This was just CP50220, but the implementation was less strict regarding
4646
* invalid characters; it would silently pass some through
4747
* This 'encoding' only existed in mbstring. In case some poor, lost soul is
48-
* still using it, retain minimal support by aliasing it to CP50220 */
49-
static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", NULL};
50-
51-
const mbfl_encoding mbfl_encoding_jis_ms = {
52-
mbfl_no_encoding_jis_ms,
53-
"JIS-ms",
54-
"ISO-2022-JP",
55-
NULL,
56-
NULL,
57-
MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
58-
&vtbl_jis_ms_wchar,
59-
&vtbl_wchar_jis_ms
60-
};
48+
* still using it, retain minimal support by aliasing it to CP50220
49+
*
50+
* Further, mbstring also had a made-up encoding called "JIS-ms"
51+
* This was the same as CP5022{0,1,2}, but without their special ways of
52+
* handling conversion of Unicode half-width katakana */
53+
static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};
6154

6255
const mbfl_encoding mbfl_encoding_cp50220 = {
6356
mbfl_no_encoding_cp50220,
@@ -92,32 +85,12 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
9285
&vtbl_wchar_cp50222
9386
};
9487

95-
const struct mbfl_convert_vtbl vtbl_jis_ms_wchar = {
96-
mbfl_no_encoding_jis_ms,
97-
mbfl_no_encoding_wchar,
98-
mbfl_filt_conv_common_ctor,
99-
NULL,
100-
mbfl_filt_conv_jis_ms_wchar,
101-
mbfl_filt_conv_common_flush,
102-
NULL,
103-
};
104-
105-
const struct mbfl_convert_vtbl vtbl_wchar_jis_ms = {
106-
mbfl_no_encoding_wchar,
107-
mbfl_no_encoding_jis_ms,
108-
mbfl_filt_conv_common_ctor,
109-
NULL,
110-
mbfl_filt_conv_wchar_jis_ms,
111-
mbfl_filt_conv_any_jis_flush,
112-
NULL,
113-
};
114-
11588
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
11689
mbfl_no_encoding_cp50220,
11790
mbfl_no_encoding_wchar,
11891
mbfl_filt_conv_common_ctor,
11992
NULL,
120-
mbfl_filt_conv_jis_ms_wchar,
93+
mbfl_filt_conv_cp5022x_wchar,
12194
mbfl_filt_conv_cp5022x_wchar_flush,
12295
NULL,
12396
};
@@ -137,7 +110,7 @@ const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
137110
mbfl_no_encoding_wchar,
138111
mbfl_filt_conv_common_ctor,
139112
NULL,
140-
mbfl_filt_conv_jis_ms_wchar,
113+
mbfl_filt_conv_cp5022x_wchar,
141114
mbfl_filt_conv_cp5022x_wchar_flush,
142115
NULL,
143116
};
@@ -157,7 +130,7 @@ const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
157130
mbfl_no_encoding_wchar,
158131
mbfl_filt_conv_common_ctor,
159132
NULL,
160-
mbfl_filt_conv_jis_ms_wchar,
133+
mbfl_filt_conv_cp5022x_wchar,
161134
mbfl_filt_conv_cp5022x_wchar_flush,
162135
NULL,
163136
};
@@ -174,11 +147,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
174147

175148
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
176149

177-
/*
178-
* JIS-ms => wchar
179-
*/
180-
int
181-
mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter)
150+
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
182151
{
183152
int c1, s, w;
184153

@@ -355,154 +324,6 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
355324
return 0;
356325
}
357326

358-
/*
359-
* wchar => JIS
360-
*/
361-
int
362-
mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter)
363-
{
364-
int s = 0;
365-
366-
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
367-
s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
368-
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
369-
s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
370-
} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
371-
s = ucs_i_jis_table[c - ucs_i_jis_table_min];
372-
} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
373-
s = ucs_r_jis_table[c - ucs_r_jis_table_min];
374-
} else if (c >= 0xe000 && c < (0xe000 + 10 * 94)) {
375-
/* PUE => Microsoft extended (pseudo 95ku - 114ku) */
376-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
377-
s = c - 0xe000;
378-
s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
379-
} else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
380-
/* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
381-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
382-
s = c - (0xe000 + 10 * 94);
383-
s = (s / 94 + 0xf5) << 8 | (s % 94 + 0xa1);
384-
}
385-
386-
/* do some transliteration */
387-
if (s <= 0) {
388-
if (c == 0xa5) { /* YEN SIGN */
389-
s = 0x1005c;
390-
} else if (c == 0x203e) { /* OVER LINE */
391-
s = 0x1007e;
392-
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
393-
s = 0x2140;
394-
} else if (c == 0x2225) { /* PARALLEL TO */
395-
s = 0x2142;
396-
} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
397-
s = 0x215d;
398-
} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
399-
s = 0x2171;
400-
} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
401-
s = 0x2172;
402-
} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
403-
s = 0x224c;
404-
}
405-
}
406-
if (s <= 0 || (s >= 0x8080 && s < 0x10000)) {
407-
int i;
408-
s = -1;
409-
410-
for (i = 0;
411-
i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
412-
const int oh = cp932ext1_ucs_table_min / 94;
413-
414-
if (c == cp932ext1_ucs_table[i]) {
415-
s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
416-
break;
417-
}
418-
}
419-
420-
if (s < 0) {
421-
const int oh = cp932ext2_ucs_table_min / 94;
422-
const int cp932ext2_ucs_table_size =
423-
cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
424-
for (i = 0; i < cp932ext2_ucs_table_size; i++) {
425-
if (c == cp932ext2_ucs_table[i]) {
426-
s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
427-
break;
428-
}
429-
}
430-
}
431-
432-
if (s < 0) {
433-
const int cp932ext3_ucs_table_size =
434-
cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
435-
const int limit = cp932ext3_ucs_table_size >
436-
cp932ext3_eucjp_table_size ?
437-
cp932ext3_eucjp_table_size:
438-
cp932ext3_ucs_table_size;
439-
for (i = 0; i < limit; i++) {
440-
if (c == cp932ext3_ucs_table[i]) {
441-
s = cp932ext3_eucjp_table[i];
442-
break;
443-
}
444-
}
445-
}
446-
447-
if (c == 0) {
448-
s = 0;
449-
} else if (s <= 0) {
450-
s = -1;
451-
}
452-
}
453-
454-
if (s >= 0) {
455-
if (s < 0x80) { /* ASCII */
456-
if ((filter->status & 0xff00) != 0) {
457-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
458-
CK((*filter->output_function)(0x28, filter->data)); /* '(' */
459-
CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
460-
}
461-
filter->status = 0;
462-
CK((*filter->output_function)(s, filter->data));
463-
} else if (s < 0x100) { /* kana */
464-
if ((filter->status & 0xff00) != 0x100) {
465-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
466-
CK((*filter->output_function)(0x28, filter->data)); /* '(' */
467-
CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
468-
}
469-
filter->status = 0x100;
470-
CK((*filter->output_function)(s & 0x7f, filter->data));
471-
} else if (s < 0x8080) { /* X 0208 */
472-
if ((filter->status & 0xff00) != 0x200) {
473-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
474-
CK((*filter->output_function)(0x24, filter->data)); /* '$' */
475-
CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
476-
}
477-
filter->status = 0x200;
478-
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
479-
CK((*filter->output_function)(s & 0xff, filter->data));
480-
} else if (s < 0x10000) { /* X 0212 */
481-
if ((filter->status & 0xff00) != 0x300) {
482-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
483-
CK((*filter->output_function)(0x24, filter->data)); /* '$' */
484-
CK((*filter->output_function)(0x28, filter->data)); /* '(' */
485-
CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
486-
}
487-
filter->status = 0x300;
488-
CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
489-
CK((*filter->output_function)(s & 0x7f, filter->data));
490-
} else { /* X 0201 latin */
491-
if ((filter->status & 0xff00) != 0x400) {
492-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
493-
CK((*filter->output_function)(0x28, filter->data)); /* '(' */
494-
CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
495-
}
496-
filter->status = 0x400;
497-
CK((*filter->output_function)(s & 0x7f, filter->data));
498-
}
499-
} else {
500-
CK(mbfl_filt_conv_illegal_output(c, filter));
501-
}
502-
503-
return c;
504-
}
505-
506327
/*
507328
* wchar => CP50220
508329
*/
@@ -843,8 +664,8 @@ mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
843664
}
844665
filter->status &= 0xff;
845666

846-
if (filter->flush_function != NULL) {
847-
return (*filter->flush_function)(filter->data);
667+
if (filter->flush_function) {
668+
(*filter->flush_function)(filter->data);
848669
}
849670

850671
return 0;

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,18 @@
3232

3333
#include "mbfilter.h"
3434

35-
extern const mbfl_encoding mbfl_encoding_jis_ms;
3635
extern const mbfl_encoding mbfl_encoding_cp50220;
3736
extern const mbfl_encoding mbfl_encoding_cp50221;
3837
extern const mbfl_encoding mbfl_encoding_cp50222;
3938

40-
extern const struct mbfl_convert_vtbl vtbl_jis_ms_wchar;
41-
extern const struct mbfl_convert_vtbl vtbl_wchar_jis_ms;
4239
extern const struct mbfl_convert_vtbl vtbl_cp50220_wchar;
4340
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50220;
4441
extern const struct mbfl_convert_vtbl vtbl_cp50221_wchar;
4542
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50221;
4643
extern const struct mbfl_convert_vtbl vtbl_cp50222_wchar;
4744
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50222;
4845

49-
int mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter);
50-
int mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter);
46+
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter);
5147
int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
5248
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
5349
int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter);

ext/mbstring/libmbfl/mbfl/mbfl_encoding.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
161161
&mbfl_encoding_koi8u,
162162
&mbfl_encoding_armscii8,
163163
&mbfl_encoding_cp850,
164-
&mbfl_encoding_jis_ms,
165164
&mbfl_encoding_2022jp_2004,
166165
&mbfl_encoding_2022jp_kddi,
167166
&mbfl_encoding_cp50220,

ext/mbstring/libmbfl/mbfl/mbfl_encoding.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ enum mbfl_no_encoding {
113113
mbfl_no_encoding_8859_16,
114114
mbfl_no_encoding_armscii8,
115115
mbfl_no_encoding_cp850,
116-
mbfl_no_encoding_jis_ms,
117116
mbfl_no_encoding_cp50220,
118117
mbfl_no_encoding_cp50221,
119118
mbfl_no_encoding_cp50222,

0 commit comments

Comments
 (0)