Skip to content

Commit d6ac8b2

Browse files
committed
Clean up determine_charset() implementation
Also drop the code related to locale-based charset guessing, which is no longer in use.
1 parent 481b742 commit d6ac8b2

File tree

1 file changed

+20
-74
lines changed

1 file changed

+20
-74
lines changed

ext/standard/html.c

Lines changed: 20 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -370,90 +370,41 @@ static inline unsigned int get_next_char(
370370
static enum entity_charset determine_charset(char *charset_hint)
371371
{
372372
size_t i;
373-
enum entity_charset charset = cs_utf_8;
374-
size_t len = 0;
375373
const zend_encoding *zenc;
376374

377-
/* Default is now UTF-8 */
378-
if (charset_hint == NULL)
379-
return cs_utf_8;
375+
if (charset_hint && *charset_hint) {
376+
/* Explicitly passed charset */
377+
goto det_charset;
378+
}
380379

381-
if ((len = strlen(charset_hint)) != 0) {
380+
charset_hint = get_default_charset();
381+
if (charset_hint && *charset_hint) {
382+
/* default_charset or internal_encoding */
382383
goto det_charset;
383384
}
384385

385386
zenc = zend_multibyte_get_internal_encoding();
386387
if (zenc != NULL) {
388+
/* mbstring.internal_encoding or mb_internal_encoding() */
389+
// TODO: We *shouldn't* be taking this into account anymore.
387390
charset_hint = (char *)zend_multibyte_get_encoding_name(zenc);
388-
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
389-
if (len == sizeof("auto")-1 && !memcmp("auto", charset_hint, sizeof("auto")-1)) {
390-
charset_hint = NULL;
391-
len = 0;
392-
} else {
393-
goto det_charset;
394-
}
395-
}
396-
}
397-
398-
charset_hint = SG(default_charset);
399-
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
400-
goto det_charset;
401-
}
402-
403-
/* try to detect the charset for the locale */
404-
#if HAVE_NL_LANGINFO && defined(CODESET)
405-
charset_hint = nl_langinfo(CODESET);
406-
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
407-
goto det_charset;
408-
}
409-
#endif
410-
411-
/* try to figure out the charset from the locale */
412-
{
413-
char *localename;
414-
char *dot, *at;
415-
416-
/* lang[_territory][.codeset][@modifier] */
417-
localename = setlocale(LC_CTYPE, NULL);
418-
419-
dot = strchr(localename, '.');
420-
if (dot) {
421-
dot++;
422-
/* locale specifies a codeset */
423-
at = strchr(dot, '@');
424-
if (at)
425-
len = at - dot;
426-
else
427-
len = strlen(dot);
428-
charset_hint = dot;
429-
} else {
430-
/* no explicit name; see if the name itself
431-
* is the charset */
432-
charset_hint = localename;
433-
len = strlen(charset_hint);
434-
}
435391
}
436392

437393
det_charset:
438-
439394
if (charset_hint) {
440-
int found = 0;
441-
395+
size_t len = strlen(charset_hint);
442396
/* now walk the charset map and look for the codeset */
443397
for (i = 0; i < sizeof(charset_map)/sizeof(charset_map[0]); i++) {
444398
if (len == charset_map[i].codeset_len &&
445399
zend_binary_strcasecmp(charset_hint, len, charset_map[i].codeset, len) == 0) {
446-
charset = charset_map[i].charset;
447-
found = 1;
448-
break;
400+
return charset_map[i].charset;
449401
}
450402
}
451-
if (!found) {
452-
php_error_docref(NULL, E_WARNING, "Charset `%s' not supported, assuming utf-8",
453-
charset_hint);
454-
}
403+
404+
php_error_docref(NULL, E_WARNING, "Charset `%s' not supported, assuming utf-8",
405+
charset_hint);
455406
}
456-
return charset;
407+
return cs_utf_8;
457408
}
458409
/* }}} */
459410

@@ -1384,7 +1335,6 @@ PHPAPI zend_string *php_escape_html_entities_ex(unsigned char *old, size_t oldle
13841335
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
13851336
{
13861337
zend_string *str, *hint_charset = NULL;
1387-
char *default_charset;
13881338
zend_long flags = ENT_COMPAT;
13891339
zend_string *replaced;
13901340
zend_bool double_encode = 1;
@@ -1397,10 +1347,9 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
13971347
Z_PARAM_BOOL(double_encode);
13981348
ZEND_PARSE_PARAMETERS_END();
13991349

1400-
if (!hint_charset) {
1401-
default_charset = get_default_charset();
1402-
}
1403-
replaced = php_escape_html_entities_ex((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), all, (int) flags, (hint_charset ? ZSTR_VAL(hint_charset) : default_charset), double_encode);
1350+
replaced = php_escape_html_entities_ex(
1351+
(unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), all, (int) flags,
1352+
hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode);
14041353
RETVAL_STR(replaced);
14051354
}
14061355
/* }}} */
@@ -1462,7 +1411,6 @@ PHP_FUNCTION(htmlspecialchars_decode)
14621411
PHP_FUNCTION(html_entity_decode)
14631412
{
14641413
zend_string *str, *hint_charset = NULL;
1465-
char *default_charset;
14661414
zend_long quote_style = ENT_COMPAT;
14671415
zend_string *replaced;
14681416

@@ -1473,10 +1421,8 @@ PHP_FUNCTION(html_entity_decode)
14731421
Z_PARAM_STR(hint_charset)
14741422
ZEND_PARSE_PARAMETERS_END();
14751423

1476-
if (!hint_charset) {
1477-
default_charset = get_default_charset();
1478-
}
1479-
replaced = php_unescape_html_entities(str, 1 /*all*/, (int)quote_style, (hint_charset ? ZSTR_VAL(hint_charset) : default_charset));
1424+
replaced = php_unescape_html_entities(
1425+
str, 1 /*all*/, (int)quote_style, hint_charset ? ZSTR_VAL(hint_charset) : NULL);
14801426

14811427
if (replaced) {
14821428
RETURN_STR(replaced);

0 commit comments

Comments
 (0)