27
27
*
28
28
*/
29
29
30
+ /* CP932 is Microsoft's version of Shift-JIS.
31
+ *
32
+ * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
33
+ * and U+203E the same way as eucJP-win; namely, instead of mapping
34
+ * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
35
+ * these codepoints are mapped to appropriate JIS X 0208 characters.
36
+ *
37
+ * When converting from Shift-JIS to Unicode, there is no difference
38
+ * between CP932 and "SJIS-win".
39
+ *
40
+ * Additional facts:
41
+ *
42
+ * • In the libmbfl library which formed the base for mbstring, "CP932" and
43
+ * "SJIS-win" were originally aliases. The differing mappings were added in
44
+ * December 2002. The libmbfl author later stated that this was done so that
45
+ * "CP932" would comply with a certain specification, while "SJIS-win" would
46
+ * maintain the existing mappings. He does not remember which specification
47
+ * it was.
48
+ * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
49
+ * agrees with our mappings for "CP932".
50
+ * • Microsoft Windows' "best-fit" mappings for CP932 (via the
51
+ * WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
52
+ * our mappings for "CP932".
53
+ * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
54
+ * our mappings for "CP932".
55
+ * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
56
+ * Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
57
+ * 0x7E will go to 0x7E when converting Shift-JIS to CP932.
58
+ */
59
+
30
60
#include "mbfilter.h"
31
61
#include "mbfilter_cp932.h"
32
62
36
66
/*
 * Forward declarations of file-local (static) converters. They are needed
 * because the encoding/vtbl tables further down take the addresses of these
 * functions before their definitions appear.
 *
 * mb_wchar_to_sjiswin is the encoder paired with the separate "SJIS-win"
 * encoding table; it has the same signature as mb_wchar_to_cp932.
 */
static int mbfl_filt_conv_cp932_wchar_flush (mbfl_convert_filter * filter );
37
67
static size_t mb_cp932_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state );
38
68
static void mb_wchar_to_cp932 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end );
69
+ static void mb_wchar_to_sjiswin (uint32_t * in , size_t len , mb_convert_buf * buf , bool end );
39
70
40
71
static const unsigned char mblen_table_sjis [] = { /* 0x80-0x9f,0xE0-0xFF */
41
72
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
@@ -56,7 +87,8 @@ static const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
56
87
2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2
57
88
};
58
89
59
/*
 * NULL-terminated alias lists.
 *
 * "SJIS-win", "SJIS-ms" and "SJIS-open" are no longer aliases of CP932:
 * "SJIS-win" becomes the primary name of its own encoding table
 * (mbfl_encoding_sjiswin), and "SJIS-ms" / "SJIS-open" become aliases of
 * that encoding instead.
 */
- static const char * mbfl_encoding_cp932_aliases [] = {"MS932" , "Windows-31J" , "MS_Kanji" , "SJIS-win" , "SJIS-ms" , "SJIS-open" , NULL };
90
+ static const char * mbfl_encoding_cp932_aliases [] = {"MS932" , "Windows-31J" , "MS_Kanji" , NULL };
91
+ static const char * mbfl_encoding_sjiswin_aliases [] = {"SJIS-ms" , "SJIS-open" , NULL };
60
92
61
93
const mbfl_encoding mbfl_encoding_cp932 = {
62
94
mbfl_no_encoding_cp932 ,
@@ -91,6 +123,39 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
91
123
NULL ,
92
124
};
93
125
126
/*
 * Encoding descriptor for "SJIS-win".
 *
 * It shares the byte-length table (mblen_table_sjis) and the decoder
 * (mb_cp932_to_wchar) with CP932, and differs from it only in the encoder
 * (mb_wchar_to_sjiswin) and in the filter tables it points at.
 *
 * NOTE(review): the per-field remarks below are inferred from the values
 * used; confirm the field order against the mbfl_encoding declaration.
 */
+ const mbfl_encoding mbfl_encoding_sjiswin = {
127
+ mbfl_no_encoding_sjiswin , /* encoding identifier */
128
+ "SJIS-win" , /* primary name */
129
+ "Shift_JIS" , /* presumably the MIME name -- TODO confirm */
130
+ mbfl_encoding_sjiswin_aliases , /* "SJIS-ms", "SJIS-open" */
131
+ mblen_table_sjis , /* same length table as CP932 */
132
+ MBFL_ENCTYPE_GL_UNSAFE ,
133
+ & vtbl_sjiswin_wchar , /* SJIS-win => wchar filter table */
134
+ & vtbl_wchar_sjiswin , /* wchar => SJIS-win filter table */
135
+ mb_cp932_to_wchar , /* decoder shared with CP932 */
136
+ mb_wchar_to_sjiswin /* SJIS-win specific encoder */
137
+ };
138
+
139
/*
 * Conversion filter table: SJIS-win => wchar.
 *
 * The filter and flush callbacks are the CP932 ones
 * (mbfl_filt_conv_cp932_wchar / mbfl_filt_conv_cp932_wchar_flush); only the
 * source encoding id differs from the CP932 table.
 */
+ const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
140
+ mbfl_no_encoding_sjiswin , /* from */
141
+ mbfl_no_encoding_wchar , /* to */
142
+ mbfl_filt_conv_common_ctor ,
143
+ NULL ,
144
+ mbfl_filt_conv_cp932_wchar , /* decoder shared with CP932 */
145
+ mbfl_filt_conv_cp932_wchar_flush ,
146
+ NULL ,
147
+ };
148
+
149
/*
 * Conversion filter table: wchar => SJIS-win.
 *
 * Uses the SJIS-win specific filter mbfl_filt_conv_wchar_sjiswin together
 * with the common constructor and common flush callbacks.
 */
+ const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
150
+ mbfl_no_encoding_wchar , /* from */
151
+ mbfl_no_encoding_sjiswin , /* to */
152
+ mbfl_filt_conv_common_ctor ,
153
+ NULL ,
154
+ mbfl_filt_conv_wchar_sjiswin , /* SJIS-win specific encoder */
155
+ mbfl_filt_conv_common_flush ,
156
+ NULL ,
157
+ };
158
+
94
159
/*
 * CK(statement): evaluate `statement` exactly once and, if its value is
 * negative, return -1 from the enclosing function. Used to propagate
 * failures from filter output callbacks.
 *
 * The opening parenthesis must follow the macro name with no whitespace;
 * otherwise the preprocessor defines an object-like macro and every
 * CK(...) use expands incorrectly.
 */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
95
160
96
161
#define SJIS_ENCODE (c1 ,c2 ,s1 ,s2 ) \
@@ -136,12 +201,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
136
201
} \
137
202
} while (0)
138
203
139
-
140
- /*
141
- * SJIS-win => wchar
142
- */
143
- int
144
- mbfl_filt_conv_cp932_wchar (int c , mbfl_convert_filter * filter )
204
+ int mbfl_filt_conv_cp932_wchar (int c , mbfl_convert_filter * filter )
145
205
{
146
206
int c1 , s , s1 , s2 , w ;
147
207
@@ -227,18 +287,16 @@ static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
227
287
return 0 ;
228
288
}
229
289
230
- /*
231
- * wchar => SJIS-win
232
- */
233
- int
234
- mbfl_filt_conv_wchar_cp932 (int c , mbfl_convert_filter * filter )
290
+ int mbfl_filt_conv_wchar_cp932 (int c , mbfl_convert_filter * filter )
235
291
{
236
292
int c1 , c2 , s1 , s2 ;
237
293
238
294
s1 = 0 ;
239
295
s2 = 0 ;
240
296
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max ) {
241
297
s1 = ucs_a1_jis_table [c - ucs_a1_jis_table_min ];
298
+ } else if (c == 0x203E ) {
299
+ s1 = 0x7E ;
242
300
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max ) {
243
301
s1 = ucs_a2_jis_table [c - ucs_a2_jis_table_min ];
244
302
} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max ) {
@@ -254,7 +312,7 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
254
312
}
255
313
if (s1 <= 0 ) {
256
314
if (c == 0xa5 ) { /* YEN SIGN */
257
- s1 = 0x216F ; /* FULLWIDTH YEN SIGN */
315
+ s1 = 0x5C ;
258
316
} else if (c == 0xff3c ) { /* FULLWIDTH REVERSE SOLIDUS */
259
317
s1 = 0x2140 ;
260
318
} else if (c == 0x2225 ) { /* PARALLEL TO */
@@ -314,6 +372,20 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
314
372
return 0 ;
315
373
}
316
374
375
/*
 * wchar => SJIS-win
 *
 * Handles the two codepoints whose mapping differs from CP932 and hands
 * everything else to mbfl_filt_conv_wchar_cp932.
 *
 * c       codepoint to encode
 * filter  conversion filter; output bytes are passed to
 *         filter->output_function along with filter->data
 *
 * Returns 0 after emitting one of the two special-cased sequences, -1 if an
 * output call returned a negative value (via CK), and otherwise whatever
 * mbfl_filt_conv_wchar_cp932 returns.
 */
+ int mbfl_filt_conv_wchar_sjiswin (int c , mbfl_convert_filter * filter )
376
+ {
377
+ if (c == 0xA5 ) { /* U+00A5 YEN SIGN: two-byte form instead of CP932's 0x5C */
378
+ CK ((* filter -> output_function )(0x81 , filter -> data ));
379
+ CK ((* filter -> output_function )(0x8F , filter -> data ));
380
+ } else if (c == 0x203E ) { /* U+203E OVERLINE: two-byte form instead of CP932's 0x7E */
381
+ CK ((* filter -> output_function )(0x81 , filter -> data ));
382
+ CK ((* filter -> output_function )(0x50 , filter -> data ));
383
+ } else {
384
+ return mbfl_filt_conv_wchar_cp932 (c , filter ); /* all other codepoints map as in CP932 */
385
+ }
386
+ return 0 ;
387
+ }
388
+
317
389
static size_t mb_cp932_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state )
318
390
{
319
391
unsigned char * p = * in , * e = p + * in_len ;
@@ -389,6 +461,87 @@ static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, boo
389
461
MB_CONVERT_BUF_LOAD (buf , out , limit );
390
462
MB_CONVERT_BUF_ENSURE (buf , out , limit , len * 2 );
391
463
464
+ while (len -- ) {
465
+ uint32_t w = * in ++ ;
466
+ unsigned int s1 = 0 , s2 = 0 , c1 , c2 ;
467
+
468
+ if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max ) {
469
+ s1 = ucs_a1_jis_table [w - ucs_a1_jis_table_min ];
470
+ } else if (w == 0x203E ) {
471
+ s1 = 0x7E ;
472
+ } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max ) {
473
+ s1 = ucs_a2_jis_table [w - ucs_a2_jis_table_min ];
474
+ } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max ) {
475
+ s1 = ucs_i_jis_table [w - ucs_i_jis_table_min ];
476
+ } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max ) {
477
+ s1 = ucs_r_jis_table [w - ucs_r_jis_table_min ];
478
+ } else if (w >= 0xE000 && w < (0xE000 + 20 * 94 )) {
479
+ s1 = w - 0xE000 ;
480
+ c1 = s1 /94 + 0x7F ;
481
+ c2 = s1 %94 + 0x21 ;
482
+ s1 = (c1 << 8 ) | c2 ;
483
+ s2 = 1 ;
484
+ }
485
+
486
+ if (w == 0xA5 ) { /* YEN SIGN */
487
+ s1 = 0x5C ;
488
+ } else if (w == 0xFF3C ) { /* FULLWIDTH REVERSE SOLIDUS */
489
+ s1 = 0x2140 ;
490
+ } else if (w == 0x2225 ) { /* PARALLEL TO */
491
+ s1 = 0x2142 ;
492
+ } else if (w == 0xFF0D ) { /* FULLWIDTH HYPHEN-MINUS */
493
+ s1 = 0x215D ;
494
+ } else if (w == 0xFFE0 ) { /* FULLWIDTH CENT SIGN */
495
+ s1 = 0x2171 ;
496
+ } else if (w == 0xFFE1 ) { /* FULLWIDTH POUND SIGN */
497
+ s1 = 0x2172 ;
498
+ } else if (w == 0xFFE2 ) { /* FULLWIDTH NOT SIGN */
499
+ s1 = 0x224C ;
500
+ } else if (w == 0 ) {
501
+ out = mb_convert_buf_add (out , 0 );
502
+ continue ;
503
+ }
504
+
505
+ if (!s1 || (s1 >= 0x8080 && !s2 )) { /* not found or X 0212 */
506
+ for (unsigned int i = 0 ; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min ; i ++ ) {
507
+ if (cp932ext1_ucs_table [i ] == w ) {
508
+ s1 = ((i /94 + 0x2D ) << 8 ) + (i %94 + 0x21 );
509
+ goto emit_output ;
510
+ }
511
+ }
512
+
513
+ for (unsigned int i = 0 ; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min ; i ++ ) {
514
+ if (cp932ext3_ucs_table [i ] == w ) {
515
+ s1 = ((i /94 + 0x93 ) << 8 ) + (i %94 + 0x21 );
516
+ goto emit_output ;
517
+ }
518
+ }
519
+
520
+ MB_CONVERT_ERROR (buf , out , limit , w , mb_wchar_to_cp932 );
521
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len * 2 );
522
+ continue ;
523
+ }
524
+
525
+ emit_output :
526
+ if (s1 < 0x100 ) {
527
+ out = mb_convert_buf_add (out , s1 );
528
+ } else {
529
+ c1 = (s1 >> 8 ) & 0xFF ;
530
+ c2 = s1 & 0xFF ;
531
+ SJIS_ENCODE (c1 , c2 , s1 , s2 );
532
+ out = mb_convert_buf_add2 (out , s1 , s2 );
533
+ }
534
+ }
535
+
536
+ MB_CONVERT_BUF_STORE (buf , out , limit );
537
+ }
538
+
539
+ static void mb_wchar_to_sjiswin (uint32_t * in , size_t len , mb_convert_buf * buf , bool end )
540
+ {
541
+ unsigned char * out , * limit ;
542
+ MB_CONVERT_BUF_LOAD (buf , out , limit );
543
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len * 2 );
544
+
392
545
while (len -- ) {
393
546
uint32_t w = * in ++ ;
394
547
unsigned int s1 = 0 , s2 = 0 , c1 , c2 ;
@@ -403,7 +556,7 @@ static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, boo
403
556
s1 = ucs_r_jis_table [w - ucs_r_jis_table_min ];
404
557
} else if (w >= 0xE000 && w < (0xE000 + 20 * 94 )) {
405
558
s1 = w - 0xE000 ;
406
- c1 = s1 /94 + 0x7f ;
559
+ c1 = s1 /94 + 0x7F ;
407
560
c2 = s1 %94 + 0x21 ;
408
561
s1 = (c1 << 8 ) | c2 ;
409
562
s2 = 1 ;
0 commit comments