Skip to content

Commit a2bc57e

Browse files
committed
mb_detect_encoding will not return non-encodings
Among the text encodings supported by mbstring are several which are not really 'text encodings'. These include Base64, QPrint, UUencode, HTML entities, '7 bit', and '8 bit'. Rather than providing an explicit list of text encodings which they are interested in, users may pass the output of mb_list_encodings to mb_detect_encoding. Since Base64, QPrint, and so on are included in the output of mb_list_encodings, mb_detect_encoding can return one of these as its 'detected encoding' (and in fact, this often happens). Before mb_detect_encoding was enhanced so it could detect any of the supported text encodings, this did not happen, and it is never desired.
1 parent 28b346b commit a2bc57e

File tree

4 files changed

+29
-2
lines changed

4 files changed

+29
-2
lines changed

ext/mbstring/mbstring.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2664,6 +2664,23 @@ PHP_FUNCTION(mb_strtolower)
26642664
}
26652665
/* }}} */
26662666

2667+
static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2668+
{
2669+
/* mbstring supports some 'text encodings' which aren't really text encodings
2670+
* at all, but really 'byte encodings', like Base64, QPrint, and so on.
2671+
* These should never be returned by `mb_detect_encoding`. */
2672+
int shift = 0;
2673+
for (int i = 0; i < *size; i++) {
2674+
const mbfl_encoding *encoding = elist[i];
2675+
if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2676+
shift++; /* Remove this encoding from the list */
2677+
} else if (shift) {
2678+
elist[i - shift] = encoding;
2679+
}
2680+
}
2681+
*size -= shift;
2682+
}
2683+
26672684
/* {{{ Encodings of the given string is returned (as a string) */
26682685
PHP_FUNCTION(mb_detect_encoding)
26692686
{
@@ -2709,6 +2726,14 @@ PHP_FUNCTION(mb_detect_encoding)
27092726
RETURN_THROWS();
27102727
}
27112728

2729+
if (free_elist) {
2730+
remove_non_encodings_from_elist(elist, &size);
2731+
if (size == 0) {
2732+
efree(ZEND_VOIDP(elist));
2733+
RETURN_FALSE;
2734+
}
2735+
}
2736+
27122737
if (ZEND_NUM_ARGS() < 3) {
27132738
strict = MBSTRG(strict_detection);
27142739
}

ext/mbstring/tests/bug81298.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@ var_dump(mb_detect_encoding("foobar.", "ascii,html"));
1616
bool(false)
1717
string(5) "ASCII"
1818
string(5) "ASCII"
19-
string(13) "HTML-ENTITIES"
19+
bool(false)
2020
string(5) "ASCII"

ext/mbstring/tests/mb_detect_encoding.phpt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ echo mb_detect_encoding($test, ['UTF-8', 'UTF-16']), "\n";
6161

6262
// We once had a problem where all kind of strings would be detected as 'UUENCODE'
6363
echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
64+
echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
6465

6566
echo "== DETECT ORDER ==\n";
6667

@@ -242,6 +243,7 @@ ISO-8859-1
242243
UTF-8
243244
UTF-8
244245
UTF-8
246+
SJIS
245247
== DETECT ORDER ==
246248
JIS: JIS
247249
EUC-JP: EUC-JP

ext/mbstring/tests/other_encodings.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ var_dump(mb_convert_encoding("ABC", "8bit", "7bit"));
1717
echo "7bit done\n";
1818

1919
// "8bit"
20-
var_dump(mb_convert_encoding("\x01\x00", "8bit", "UTF-16BE")); // codepoints over 0xFF are illegal for '8-bit'
20+
var_dump(mb_convert_encoding("\x01\x00", "8bit", "UTF-16BE")); // codepoints over 0xFF are illegal or '8-bit'
2121
echo "8bit done\n";
2222

2323
// UCS-2

0 commit comments

Comments
 (0)