@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
150
150
151
151
/* Evaluate `statement` once; if it yields a negative value, make the
 * enclosing function return -1 immediately. Must be function-like (no space
 * between the name and the parameter list) so that CK(expr) expands correctly. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
152
152
153
- /*
154
- * UTF-16 => wchar
155
- */
156
153
/*
 * UTF-16 (byte order not yet known) => wchar
 *
 * Consumes one input byte `c` per call. The first 16-bit unit is assembled
 * as big-endian. Once it is complete, the filter replaces its own
 * `filter_function` with the little-endian decoder (if the unit is 0xFFFE)
 * or the big-endian decoder (anything else), so later bytes do not come
 * back through this function.
 *
 * Returns `c`, or -1 if the output function reports a negative result.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	int unit;

	if (filter->status == 0) {
		/* First byte of the first code unit; hold it until the second arrives */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return c;
	}

	/* Second byte: build the unit under the big-endian assumption */
	unit = (filter->cache << 8) | (c & 0xFF);

	if (unit == 0xFFFE) {
		/* Byte-swapped BOM: the stream is little-endian. Drop the BOM itself. */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		filter->cache = filter->status = 0;
		return c;
	}

	/* Anything else confirms big-endian for the rest of the stream */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (unit >= 0xD800 && unit <= 0xDBFF) {
		/* High surrogate: keep its 10 data bits and continue in state 2,
		 * which the big-endian decoder picks up on the next byte */
		filter->cache = unit & 0x3FF;
		filter->status = 2;
		return c;
	}

	if (unit >= 0xDC00 && unit <= 0xDFFF) {
		/* Low surrogate with no preceding high surrogate: pass it on flagged */
		CK((*filter->output_function)(unit | MBFL_WCSGROUP_THROUGH, filter->data));
	} else if (unit != 0xFEFF) {
		/* Ordinary code unit; a big-endian BOM (0xFEFF) is silently dropped */
		CK((*filter->output_function)(unit, filter->data));
	}

	filter->cache = filter->status = 0;
	return c;
}
215
184
216
- /*
217
- * UTF-16BE => wchar
218
- */
219
185
/*
 * UTF-16BE => wchar
 *
 * Consumes one input byte `c` per call and tracks progress in
 * `filter->status`:
 *   0 - expecting the first byte of a code unit
 *   1 - expecting the second byte of a code unit
 *   2 - a high surrogate was read; expecting the first byte of the next unit
 *   3 - expecting the second byte of the unit that follows a high surrogate
 *
 * Malformed units are passed to the output function OR'ed with
 * MBFL_WCSGROUP_THROUGH so that the following filter can handle the error.
 *
 * Returns `c`, or -1 if the output function reports a negative result.
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	switch (filter->status) {
	case 0: /* First byte */
		filter->cache = c & 0xFF;
		filter->status = 1;
		break;

	case 1: /* Second byte */
		n = (filter->cache << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			filter->cache = n & 0x3FF; /* Pick out 10 data bits */
			filter->status = 2;
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* This is wrong; second part of surrogate pair has come first */
			CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
			filter->status = 0;
		} else {
			CK((*filter->output_function)(n, filter->data));
			filter->status = 0;
		}
		break;

	case 2: /* Second part of surrogate, first byte */
		/* cache now holds: bits 8-17 = data bits of the high surrogate,
		 * bits 0-7 = first byte of the following code unit */
		filter->cache = (filter->cache << 8) | (c & 0xFF);
		filter->status = 3;
		break;

	case 3: /* Second part of surrogate, second byte */
		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			/* Wrong; that's the first half of a surrogate pair, not the second.
			 * Report the earlier, now dangling, high surrogate (rebuilt as
			 * 0xD800 plus its 10 data bits) and start over with the new one. */
			CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
			filter->cache = n & 0x3FF;
			filter->status = 2;
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* Valid pair: high 10 bits << 10, plus low 10 bits, plus 0x10000 */
			n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
			CK((*filter->output_function)(n, filter->data));
			filter->status = 0;
		} else {
			/* High surrogate followed by a non-surrogate unit: flag the dangling
			 * high surrogate, then emit the ordinary unit as-is */
			CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
			CK((*filter->output_function)(n, filter->data));
			filter->status = 0;
		}
		break;
	}

	return c;
}
254
235
255
- /*
256
- * wchar => UTF-16BE
257
- */
258
236
int mbfl_filt_conv_wchar_utf16be (int c , mbfl_convert_filter * filter )
259
237
{
260
238
int n ;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
276
254
return c ;
277
255
}
278
256
279
- /*
280
- * UTF-16LE => wchar
281
- */
282
257
int mbfl_filt_conv_utf16le_wchar (int c , mbfl_convert_filter * filter )
283
258
{
259
+ int n ;
260
+
284
261
switch (filter -> status ) {
285
262
case 0 :
286
263
filter -> cache = c & 0xff ;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
296
273
/* This is wrong; the second part of the surrogate pair has come first
297
274
* Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
298
275
* the error */
299
- int n = (filter -> cache + ((c & 0xff ) << 8 )) | MBFL_WCSGROUP_THROUGH ;
300
- filter -> status = 0 ;
276
+ n = (filter -> cache + ((c & 0xff ) << 8 )) | MBFL_WCSGROUP_THROUGH ;
301
277
CK ((* filter -> output_function )(n , filter -> data ));
302
- } else {
303
278
filter -> status = 0 ;
279
+ } else {
304
280
CK ((* filter -> output_function )(filter -> cache + ((c & 0xff ) << 8 ), filter -> data ));
281
+ filter -> status = 0 ;
305
282
}
306
283
break ;
307
284
@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
311
288
break ;
312
289
313
290
case 3 :
314
- filter -> status = 0 ;
315
- int n = filter -> cache + ((c & 0x3 ) << 8 ) + 0x10000 ;
316
- CK ((* filter -> output_function )(n , filter -> data ));
291
+ n = (filter -> cache & 0xFF ) | ((c & 0xFF ) << 8 );
292
+ if (n >= 0xD800 && n <= 0xDBFF ) {
293
+ CK ((* filter -> output_function )((0xD8 << 10 ) | (filter -> cache >> 10 ) | MBFL_WCSGROUP_THROUGH , filter -> data ));
294
+ filter -> cache = n & 0x3FF ;
295
+ filter -> status = 2 ;
296
+ } else if (n >= 0xDC00 && n <= 0xDFFF ) {
297
+ n = filter -> cache + ((c & 0x3 ) << 8 ) + 0x10000 ;
298
+ CK ((* filter -> output_function )(n , filter -> data ));
299
+ filter -> status = 0 ;
300
+ } else {
301
+ CK ((* filter -> output_function )((0xD8 << 10 ) | (filter -> cache >> 10 ) | MBFL_WCSGROUP_THROUGH , filter -> data ));
302
+ CK ((* filter -> output_function )(n , filter -> data ));
303
+ filter -> status = 0 ;
304
+ }
317
305
break ;
318
306
}
319
307
320
308
return c ;
321
309
}
322
310
323
- /*
324
- * wchar => UTF-16LE
325
- */
326
311
int mbfl_filt_conv_wchar_utf16le (int c , mbfl_convert_filter * filter )
327
312
{
328
313
int n ;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
350
335
int cache = filter -> cache ;
351
336
filter -> status = filter -> cache = 0 ;
352
337
353
- if (status & 0xF ) {
338
+ if (status ) {
354
339
/* Input string was truncated */
355
340
CK ((* filter -> output_function )(cache | MBFL_WCSGROUP_THROUGH , filter -> data ));
356
341
}
0 commit comments