Skip to content

Commit 4de6c3a

Browse files
committed
- Added a 3rd parameter to get_html_translation_table. It now takes a charset
hint, like htmlentities et al. - Fixed bug #49407 (get_html_translation_table doesn't handle UTF-8). - Fixed bug #25927 (get_html_translation_table calls the ' &amp;#39; instead of &amp;#039;). - Fixed tests for get_html_translation_table and unified the Windows and non-Windows versions of the tests.
1 parent f4a896c commit 4de6c3a

12 files changed

+1828
-4678
lines changed

ext/standard/html.c

Lines changed: 63 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,60 +1263,91 @@ PHP_FUNCTION(htmlentities)
12631263
}
12641264
/* }}} */
12651265

1266-
/* {{{ proto array get_html_translation_table([int table [, int quote_style]])
1266+
/* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
12671267
Returns the internal translation table used by htmlspecialchars and htmlentities */
12681268
PHP_FUNCTION(get_html_translation_table)
12691269
{
12701270
long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
12711271
unsigned int i;
12721272
int j;
1273-
char ind[2];
1274-
enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
1273+
unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
1274+
void *dummy;
1275+
char *charset_hint = NULL;
1276+
int charset_hint_len;
1277+
enum entity_charset charset;
12751278

1276-
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
1279+
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
1280+
&which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
12771281
return;
12781282
}
12791283

1280-
array_init(return_value);
1284+
charset = determine_charset(charset_hint TSRMLS_CC);
12811285

1282-
ind[1] = 0;
1286+
array_init(return_value);
12831287

12841288
switch (which) {
1285-
case HTML_ENTITIES:
1286-
for (j=0; entity_map[j].charset != cs_terminator; j++) {
1287-
if (entity_map[j].charset != charset)
1289+
case HTML_ENTITIES:
1290+
for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1291+
if (entity_map[j].charset != charset)
1292+
continue;
1293+
for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1294+
char buffer[16];
1295+
unsigned k;
1296+
size_t written;
1297+
1298+
if (entity_map[j].table[i] == NULL)
12881299
continue;
1289-
for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1290-
char buffer[16];
1300+
1301+
k = i + entity_map[j].basechar;
12911302

1292-
if (entity_map[j].table[i] == NULL)
1293-
continue;
1294-
/* what about wide chars here ?? */
1295-
ind[0] = i + entity_map[j].basechar;
1296-
snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1297-
add_assoc_string(return_value, ind, buffer, 1);
1303+
switch (charset) {
1304+
case cs_utf_8:
1305+
written = php_utf32_utf8(ind, k);
1306+
ind[written] = '\0';
1307+
break;
1308+
/* we have no mappings for these, but if we had... */
1309+
case cs_big5:
1310+
case cs_gb2312:
1311+
case cs_big5hkscs:
1312+
case cs_sjis:
1313+
written = php_mb2_int_to_char(ind, k);
1314+
ind[written] = '\0';
1315+
break;
1316+
case cs_eucjp:
1317+
written = php_mb3_int_to_char(ind, k);
1318+
ind[written] = '\0';
1319+
break;
1320+
default: /* one byte */
1321+
written = 1;
1322+
ind[0] = (unsigned char)k;
1323+
ind[1] = '\0';
1324+
break;
1325+
}
12981326

1327+
snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1328+
if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
1329+
/* in case of the single quote, which is repeated, the first one wins,
1330+
* so don't replace the existing mapping */
1331+
add_assoc_string(return_value, (const char*)ind, buffer, 1);
12991332
}
13001333
}
1301-
/* break thru */
1302-
1303-
case HTML_SPECIALCHARS:
1304-
for (j = 0; basic_entities_ex[j].charcode != 0; j++) {
1305-
void *dummy;
1334+
}
1335+
/* fall through: HTML_ENTITIES also includes the basic special-character entities */
13061336

1307-
if (basic_entities_ex[j].flags && (quote_style & basic_entities_ex[j].flags) == 0)
1308-
continue;
1337+
case HTML_SPECIALCHARS:
1338+
for (j = 0; basic_entities_ex[j].charcode != 0; j++) {
1339+
if (basic_entities_ex[j].flags && (quote_style & basic_entities_ex[j].flags) == 0)
1340+
continue;
13091341

1310-
ind[0] = (unsigned char)basic_entities_ex[j].charcode;
1311-
if (zend_hash_find(Z_ARRVAL_P(return_value), ind, sizeof(ind), &dummy) == FAILURE) {
1312-
/* in case of the single quote, which is repeated, the first one wins,
1313-
* so don't replace the existint mapping */
1314-
add_assoc_stringl(return_value, ind, basic_entities_ex[j].entity,
1315-
basic_entities_ex[j].entitylen, 1);
1316-
}
1342+
ind[0] = (unsigned char)basic_entities_ex[j].charcode;
1343+
ind[1] = '\0';
1344+
if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
1345+
add_assoc_stringl(return_value, ind, basic_entities_ex[j].entity,
1346+
basic_entities_ex[j].entitylen, 1);
13171347
}
1348+
}
13181349

1319-
break;
1350+
break;
13201351
}
13211352
}
13221353
/* }}} */

0 commit comments

Comments
 (0)