|
32 | 32 | #include "mbfilter_htmlent.h"
|
33 | 33 | #include "html_entities.h"
|
34 | 34 |
|
| 35 | +static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); |
| 36 | +static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); |
| 37 | + |
35 | 38 | static const int htmlentitifieds[256] = {
|
36 | 39 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
37 | 40 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
@@ -62,8 +65,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
|
62 | 65 | MBFL_ENCTYPE_GL_UNSAFE,
|
63 | 66 | &vtbl_html_wchar,
|
64 | 67 | &vtbl_wchar_html,
|
65 |
| - NULL, |
66 |
| - NULL |
| 68 | + mb_htmlent_to_wchar, |
| 69 | + mb_wchar_to_htmlent |
67 | 70 | };
|
68 | 71 |
|
69 | 72 | const struct mbfl_convert_vtbl vtbl_wchar_html = {
|
@@ -311,3 +314,149 @@ void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter
|
311 | 314 | dest->opaque = emalloc(html_enc_buffer_size+1);
|
312 | 315 | memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1);
|
313 | 316 | }
|
| 317 | + |
| 318 | +static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) |
| 319 | +{ |
| 320 | + unsigned char *p = *in, *e = p + *in_len; |
| 321 | + uint32_t *out = buf, *limit = buf + bufsize; |
| 322 | + |
| 323 | + while (p < e && out < limit) { |
| 324 | + unsigned char c = *p++; |
| 325 | + |
| 326 | + if (c == '&') { |
| 327 | + /* Find terminating ; for HTML entity */ |
| 328 | + unsigned char *terminator = p; |
| 329 | + while (terminator < e && *terminator != ';') |
| 330 | + terminator++; |
| 331 | + if (terminator < e) { |
| 332 | + if (*p == '#' && (e - p) >= 2) { |
| 333 | + /* Numeric entity */ |
| 334 | + unsigned int value = 0; |
| 335 | + unsigned char *digits = p + 1; |
| 336 | + if (*digits == 'x' || *digits == 'X') { |
| 337 | + /* Hexadecimal */ |
| 338 | + digits++; |
| 339 | + if (digits == terminator) { |
| 340 | + goto bad_entity; |
| 341 | + } |
| 342 | + while (digits < terminator) { |
| 343 | + unsigned char digit = *digits++; |
| 344 | + if (digit >= '0' && digit <= '9') { |
| 345 | + value = (value * 16) + (digit - '0'); |
| 346 | + } else if (digit >= 'A' && digit <= 'F') { |
| 347 | + value = (value * 16) + (digit - 'A' + 10); |
| 348 | + } else if (digit >= 'a' && digit <= 'f') { |
| 349 | + value = (value * 16) + (digit - 'a' + 10); |
| 350 | + } else { |
| 351 | + goto bad_entity; |
| 352 | + } |
| 353 | + } |
| 354 | + } else { |
| 355 | + /* Decimal */ |
| 356 | + if (digits == terminator) { |
| 357 | + goto bad_entity; |
| 358 | + } |
| 359 | + while (digits < terminator) { |
| 360 | + unsigned char digit = *digits++; |
| 361 | + if (digit >= '0' && digit <= '9') { |
| 362 | + value = (value * 10) + (digit - '0'); |
| 363 | + } else { |
| 364 | + goto bad_entity; |
| 365 | + } |
| 366 | + } |
| 367 | + } |
| 368 | + if (value > 0x10FFFF) { |
| 369 | + goto bad_entity; |
| 370 | + } |
| 371 | + *out++ = value; |
| 372 | + p = terminator + 1; |
| 373 | + goto next_iteration; |
| 374 | + } else { |
| 375 | + /* Named entity */ |
| 376 | + mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list; |
| 377 | + while (entity->name) { |
| 378 | + if (!strncmp((char*)p, entity->name, terminator - p)) { |
| 379 | + *out++ = entity->code; |
| 380 | + p = terminator + 1; |
| 381 | + goto next_iteration; |
| 382 | + } |
| 383 | + entity++; |
| 384 | + } |
| 385 | + } |
| 386 | + } |
| 387 | + /* Either we didn't find ;, or the name of the entity was not recognized */ |
| 388 | +bad_entity: |
| 389 | + *out++ = '&'; |
| 390 | + while (p < terminator && out < limit) { |
| 391 | + *out++ = *p++; |
| 392 | + } |
| 393 | + if (terminator < e && out < limit) { |
| 394 | + *out++ = *p++; |
| 395 | + } |
| 396 | + } else { |
| 397 | + *out++ = c; |
| 398 | + } |
| 399 | + |
| 400 | +next_iteration: ; |
| 401 | + } |
| 402 | + |
| 403 | + *in_len = e - p; |
| 404 | + *in = p; |
| 405 | + return out - buf; |
| 406 | +} |
| 407 | + |
| 408 | +static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) |
| 409 | +{ |
| 410 | + unsigned char *out, *limit; |
| 411 | + MB_CONVERT_BUF_LOAD(buf, out, limit); |
| 412 | + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); |
| 413 | + |
| 414 | + while (len--) { |
| 415 | + uint32_t w = *in++; |
| 416 | + |
| 417 | + if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) { |
| 418 | + /* Fast path for most ASCII characters */ |
| 419 | + out = mb_convert_buf_add(out, w); |
| 420 | + } else { |
| 421 | + out = mb_convert_buf_add(out, '&'); |
| 422 | + |
| 423 | + /* See if there is a matching named entity */ |
| 424 | + mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list; |
| 425 | + while (entity->name) { |
| 426 | + if (w == entity->code) { |
| 427 | + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(entity->name)); |
| 428 | + for (char *str = entity->name; *str; str++) { |
| 429 | + out = mb_convert_buf_add(out, *str); |
| 430 | + } |
| 431 | + out = mb_convert_buf_add(out, ';'); |
| 432 | + goto next_iteration; |
| 433 | + } |
| 434 | + entity++; |
| 435 | + } |
| 436 | + |
| 437 | + /* There is no matching named entity; emit a numeric entity instead */ |
| 438 | + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12); |
| 439 | + out = mb_convert_buf_add(out, '#'); |
| 440 | + |
| 441 | + if (!w) { |
| 442 | + out = mb_convert_buf_add(out, '0'); |
| 443 | + } else { |
| 444 | + unsigned char buf[12]; |
| 445 | + unsigned char *converted = buf + sizeof(buf); |
| 446 | + while (w) { |
| 447 | + *(--converted) = "0123456789"[w % 10]; |
| 448 | + w /= 10; |
| 449 | + } |
| 450 | + while (converted < buf + sizeof(buf)) { |
| 451 | + out = mb_convert_buf_add(out, *converted++); |
| 452 | + } |
| 453 | + } |
| 454 | + |
| 455 | + out = mb_convert_buf_add(out, ';'); |
| 456 | + } |
| 457 | + |
| 458 | +next_iteration: ; |
| 459 | + } |
| 460 | + |
| 461 | + MB_CONVERT_BUF_STORE(buf, out, limit); |
| 462 | +} |
0 commit comments