Skip to content

Commit f7f3519

Browse files
committed
Implement fast text conversion interface for HTML-ENTITIES
1 parent da176a8 commit f7f3519

File tree

2 files changed

+213
-2
lines changed

2 files changed

+213
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_htmlent.c

Lines changed: 151 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
#include "mbfilter_htmlent.h"
3333
#include "html_entities.h"
3434

35+
static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36+
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37+
3538
static const int htmlentitifieds[256] = {
3639
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3740
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -62,8 +65,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
6265
MBFL_ENCTYPE_GL_UNSAFE,
6366
&vtbl_html_wchar,
6467
&vtbl_wchar_html,
65-
NULL,
66-
NULL
68+
mb_htmlent_to_wchar,
69+
mb_wchar_to_htmlent
6770
};
6871

6972
const struct mbfl_convert_vtbl vtbl_wchar_html = {
@@ -311,3 +314,149 @@ void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter
311314
dest->opaque = emalloc(html_enc_buffer_size+1);
312315
memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1);
313316
}
317+
318+
static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
319+
{
320+
unsigned char *p = *in, *e = p + *in_len;
321+
uint32_t *out = buf, *limit = buf + bufsize;
322+
323+
while (p < e && out < limit) {
324+
unsigned char c = *p++;
325+
326+
if (c == '&') {
327+
/* Find terminating ; for HTML entity */
328+
unsigned char *terminator = p;
329+
while (terminator < e && *terminator != ';')
330+
terminator++;
331+
if (terminator < e) {
332+
if (*p == '#' && (e - p) >= 2) {
333+
/* Numeric entity */
334+
unsigned int value = 0;
335+
unsigned char *digits = p + 1;
336+
if (*digits == 'x' || *digits == 'X') {
337+
/* Hexadecimal */
338+
digits++;
339+
if (digits == terminator) {
340+
goto bad_entity;
341+
}
342+
while (digits < terminator) {
343+
unsigned char digit = *digits++;
344+
if (digit >= '0' && digit <= '9') {
345+
value = (value * 16) + (digit - '0');
346+
} else if (digit >= 'A' && digit <= 'F') {
347+
value = (value * 16) + (digit - 'A' + 10);
348+
} else if (digit >= 'a' && digit <= 'f') {
349+
value = (value * 16) + (digit - 'a' + 10);
350+
} else {
351+
goto bad_entity;
352+
}
353+
}
354+
} else {
355+
/* Decimal */
356+
if (digits == terminator) {
357+
goto bad_entity;
358+
}
359+
while (digits < terminator) {
360+
unsigned char digit = *digits++;
361+
if (digit >= '0' && digit <= '9') {
362+
value = (value * 10) + (digit - '0');
363+
} else {
364+
goto bad_entity;
365+
}
366+
}
367+
}
368+
if (value > 0x10FFFF) {
369+
goto bad_entity;
370+
}
371+
*out++ = value;
372+
p = terminator + 1;
373+
goto next_iteration;
374+
} else {
375+
/* Named entity */
376+
mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
377+
while (entity->name) {
378+
if (!strncmp((char*)p, entity->name, terminator - p)) {
379+
*out++ = entity->code;
380+
p = terminator + 1;
381+
goto next_iteration;
382+
}
383+
entity++;
384+
}
385+
}
386+
}
387+
/* Either we didn't find ;, or the name of the entity was not recognized */
388+
bad_entity:
389+
*out++ = '&';
390+
while (p < terminator && out < limit) {
391+
*out++ = *p++;
392+
}
393+
if (terminator < e && out < limit) {
394+
*out++ = *p++;
395+
}
396+
} else {
397+
*out++ = c;
398+
}
399+
400+
next_iteration: ;
401+
}
402+
403+
*in_len = e - p;
404+
*in = p;
405+
return out - buf;
406+
}
407+
408+
/*
 * Encode codepoints as HTML-ENTITIES text.
 *
 * Codepoints below 256 which are not flagged in htmlentitifieds[] are copied
 * to the output as a single byte. Every other codepoint is written as
 * "&name;" if mbfl_html_entity_list has an entry with that code, or else as a
 * decimal numeric entity "&#NNN;".
 *
 * `end` is not used.
 */
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One byte per codepoint is reserved up front; entities reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		uint32_t w = *in++;
		/* From here on, `len` counts only the codepoints still to come */
		len--;

		if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) {
			/* Fast path for most ASCII characters */
			out = mb_convert_buf_add(out, w);
			continue;
		}

		/* This byte is covered by the space reserved for the current codepoint */
		out = mb_convert_buf_add(out, '&');

		/* Look for a named entity with this code; the list ends at a NULL name */
		mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
		while (entity->name && w != entity->code) {
			entity++;
		}

		if (entity->name) {
			/* Room for the name, the ';', and one byte per remaining codepoint */
			size_t name_len = strlen(entity->name);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + name_len);
			for (size_t i = 0; i < name_len; i++) {
				out = mb_convert_buf_add(out, entity->name[i]);
			}
			out = mb_convert_buf_add(out, ';');
			continue;
		}

		/* No named entity: emit '#', the decimal digits, and ';'.
		 * A 32-bit value has at most 10 decimal digits, so 12 bytes is enough. */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12);
		out = mb_convert_buf_add(out, '#');

		/* Build the digits from the least significant end of a scratch array;
		 * do/while yields a single '0' when the value is zero */
		unsigned char decimal[12];
		unsigned char *digit = decimal + sizeof(decimal);
		do {
			*(--digit) = (unsigned char)('0' + (w % 10));
			w /= 10;
		} while (w);
		while (digit < decimal + sizeof(decimal)) {
			out = mb_convert_buf_add(out, *digit++);
		}

		out = mb_convert_buf_add(out, ';');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
--TEST--
2+
Temporary test of mbstring's HTML-ENTITIES 'encoding'
3+
--EXTENSIONS--
4+
mbstring
5+
--FILE--
6+
<?php
7+
8+
/* Using mbstring to convert strings to and from HTML entities has already been deprecated,
9+
 * so this test should be removed when the HTML-ENTITIES 'encoding' is removed. */
10+
11+
// Convert $raw (UTF-8) to HTML-ENTITIES and abort the script with a
// diagnostic message unless the result is exactly $htmlent.
function convertToEntities($raw, $htmlent) {
  $actual = mb_convert_encoding($raw, 'HTML-ENTITIES', 'UTF-8');
  if ($actual === $htmlent) {
    return;
  }
  $message = 'Expected ' . bin2hex($raw) . ' to convert to "' . $htmlent . '"; actually got "' . $actual . '"';
  die($message);
}
16+
17+
// Convert $htmlent from HTML-ENTITIES to UTF-8 and abort the script with a
// diagnostic message unless the result is exactly $raw.
function convertFromEntities($raw, $htmlent) {
  $actual = mb_convert_encoding($htmlent, 'UTF-8', 'HTML-ENTITIES');
  if ($actual === $raw) {
    return;
  }
  $message = 'Expected "' . $htmlent . '" to convert to ' . bin2hex($raw) . '; actually got ' . bin2hex($actual);
  die($message);
}
22+
23+
// Check one raw/entity pair in both directions: encoding first, then decoding.
function testConversion($raw, $htmlent) {
  foreach (['convertToEntities', 'convertFromEntities'] as $check) {
    $check($raw, $htmlent);
  }
}
27+
28+
// Text without special characters is unchanged in both directions
testConversion('', '');
testConversion('abc', 'abc');
// The markup characters must be written as named entities
testConversion('&<>', '&amp;&lt;&gt;');

// Named entities which are only checked when decoding
convertFromEntities('"', '&quot;');
convertFromEntities("\xC3\xAC", '&igrave;');

// U+3042 (decimal 12354) as hexadecimal and decimal numeric entities
convertFromEntities('あ', '&#x3042;');
testConversion('あ', '&#12354;');
testConversion('abcあxyz', 'abc&#12354;xyz');

// Malformed or incomplete entities are passed through unchanged
convertFromEntities('&#x;', '&#x;');
convertFromEntities('&#;', '&#;');
convertFromEntities('&#', '&#');
convertFromEntities('&', '&');

// Leading zeroes are accepted in a numeric entity
convertFromEntities("\x00", '&#00000;');

// A longer string made up only of numeric entities
testConversion(str_repeat('あ', 100), str_repeat('&#12354;', 100));

echo "Done!\n";
49+
?>
50+
--EXPECTF--
51+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
52+
53+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
54+
55+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
56+
57+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
58+
59+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
60+
61+
Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s
62+
Done!

0 commit comments

Comments
 (0)