From 3a955323cd933e1970fcf4143a41eee383cb5467 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 16 Nov 2024 21:49:06 -0500 Subject: [PATCH 1/3] ext/iconv/iconv.c: make iconv with //IGNORE conform to the docs Some iconv implementations allow you to append "//IGNORE" to the target encoding as an indication that iconv() should skip input sequences that cannot be represented in that encoding. Both GNU iconv implementations support this, and apparently so does Solaris. The behavior has even been codified in the recent POSIX 2024 standard, but not all implementations support it yet; musl, for example, does not. There are actually two types of "bad sequences" that iconv can encounter. The iconv() function translates sequences from an input encoding to an output encoding. All versions of POSIX, as well as the documentation for the various implementations, are clear that //IGNORE should only ignore valid input sequences that cannot be represented in the target encoding. If the input sequences themselves are invalid, that is an error (EILSEQ), even when //IGNORE is used. The PHP documentation for iconv is aligned with this. A priori, iconv "returns the converted string, or false on failure." And, If the string //IGNORE is appended, characters that cannot be represented in the target charset are silently discarded. Otherwise, E_NOTICE is generated and the function will return false. But again, this should only have an effect on valid input sequences. Which brings us to the problem. The current behavior of, $text = "aa\xC3\xC3\xC3\xB8aa"; var_dump(urlencode(iconv("UTF-8", "UTF-8//IGNORE", $text))); with glibc iconv is to print, string(10) "aa%C3%B8aa" even though the input $text is invalid UTF-8. The reason for this goes back to PHP bug 48147 which was intended to address a bug in glibc's iconv implementation with //IGNORE. Prior to PHP bug 52211, PHP's iconv would return part of the string if iconv failed. And the glibc bug was making it return the wrong part. So, as part of bug 48147, PHP added an ICONV_BROKEN_IGNORE test to config.m4, and added an internal workaround for ignoring EILSEQ errors mid-translation. Unfortunately there are some problems with this test and the workaround: * The test supplies an invalid input sequence to iconv() with //IGNORE, and looks for an error. As discussed above, this should always be an error. Any implementation conforming to any version of POSIX should trigger ICONV_BROKEN_IGNORE. (Tested: glibc and musl.) * The internal workaround for ICONV_BROKEN_IGNORE ignores EILSEQ errors. Again, this is not right, because you will only get EILSEQ from invalid input sequences (in POSIX conforming implementations) or when //IGNORE is NOT used (GNU implementations). * The workaround leaves "//IGNORE" in the target encoding and so does nothing to aid implementations like musl where //IGNORE is "broken" because it never worked to begin with. In short, we're always getting the workaround behavior, and the workaround behavior runs contrary to both POSIX and the PHP documentation. Invalid input sequences should never be ignored. This commit removes the workaround, but will be a breaking change for anyone using //IGNORE on invalid inputs. --- ext/iconv/iconv.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/ext/iconv/iconv.c b/ext/iconv/iconv.c index 4241b7c2887fb..0ad2ee3520e59 100644 --- a/ext/iconv/iconv.c +++ b/ext/iconv/iconv.c @@ -415,24 +415,6 @@ static php_iconv_err_t _php_iconv_appendc(smart_str *d, const char c, iconv_t cd } /* }}} */ -/* {{{ */ -#ifdef ICONV_BROKEN_IGNORE -static int _php_check_ignore(const char *charset) -{ - size_t clen = strlen(charset); - if (clen >= 9 && strcmp("//IGNORE", charset+clen-8) == 0) { - return 1; - } - if (clen >= 19 && strcmp("//IGNORE//TRANSLIT", charset+clen-18) == 0) { - return 1; - } - return 0; -} -#else -#define _php_check_ignore(x) (0) -#endif -/* }}} */ - /* {{{ php_iconv_string() */ PHP_ICONV_API php_iconv_err_t php_iconv_string(const char *in_p, size_t in_len, zend_string **out, const char *out_charset, const char *in_charset) { @@ -442,7 +424,6 @@ PHP_ICONV_API php_iconv_err_t php_iconv_string(const char *in_p, size_t in_len, size_t bsz, result = 0; php_iconv_err_t retval = PHP_ICONV_ERR_SUCCESS; zend_string *out_buf; - int ignore_ilseq = _php_check_ignore(out_charset); *out = NULL; @@ -466,16 +447,6 @@ PHP_ICONV_API php_iconv_err_t php_iconv_string(const char *in_p, size_t in_len, result = iconv(cd, (ICONV_CONST char **) &in_p, &in_left, (char **) &out_p, &out_left); out_size = bsz - out_left; if (result == (size_t)(-1)) { - if (ignore_ilseq && errno == EILSEQ) { - if (in_left <= 1) { - result = 0; - } else { - errno = 0; - in_p++; - in_left--; - continue; - } - } if (errno == E2BIG && in_left > 0) { /* converted string is longer than out buffer */ From 0535da4fa3fb520a645fffe0e68ba9c7d86d2f87 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 16 Nov 2024 22:15:57 -0500 Subject: [PATCH 2/3] ext/iconv/config.m4: drop ICONV_BROKEN_IGNORE test The result of this test was used in PHP's iconv() function to implement a workaround, but the workaround has been removed and there are no remaining consumers of ICONV_BROKEN_IGNORE. --- ext/iconv/config.m4 | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/ext/iconv/config.m4 b/ext/iconv/config.m4 index 4d0dc36489833..85c8d3d2c949b 100644 --- a/ext/iconv/config.m4 +++ b/ext/iconv/config.m4 @@ -83,34 +83,6 @@ int main(void) { AS_VAR_IF([php_cv_iconv_errno], [yes],, [AC_MSG_FAILURE([The iconv check failed, 'errno' is missing.])]) - AC_CACHE_CHECK([if iconv supports //IGNORE], [php_cv_iconv_ignore], - [AC_RUN_IFELSE([AC_LANG_SOURCE([ -#include -#include - -int main(void) { - iconv_t cd = iconv_open( "UTF-8//IGNORE", "UTF-8" ); - if(cd == (iconv_t)-1) { - return 1; - } - char *in_p = "\xC3\xC3\xC3\xB8"; - size_t in_left = 4, out_left = 4096; - char *out = malloc(out_left); - char *out_p = out; - size_t result = iconv(cd, (char **) &in_p, &in_left, (char **) &out_p, &out_left); - if(result == (size_t)-1) { - return 1; - } - return 0; -} - ])], - [php_cv_iconv_ignore=yes], - [php_cv_iconv_ignore=no], - [php_cv_iconv_ignore=no])]) - AS_VAR_IF([php_cv_iconv_ignore], [no], - [AC_DEFINE([ICONV_BROKEN_IGNORE], [1], - [Define to 1 if iconv has broken IGNORE.])]) - LIBS=$save_LIBS CFLAGS=$save_CFLAGS From 9079d613e9688b9127b7989478744130b2b689bf Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Mon, 18 Nov 2024 09:59:02 -0500 Subject: [PATCH 3/3] ext/iconv/tests/bug48147.phpt: rename and repurpose This test ensures that iconv() with //IGNORE will return part of the translated string when it is given invalid inputs. POSIX and the PHP documentation however agree that an error should be returned in that case: the test is making sure that we get the wrong answer. PHP's iconv has recently been updated to agree with the standard and docs. As a result, this test fails, but that's OK -- it should. The history of iconv in PHP is convoluted, so rather than just update the expected output here, I think it's best to rename the test and sever its ties to bug 48147. Referencing that bug at this point can only cause confusion. --- ext/iconv/tests/bug48147.phpt | 27 ------------ .../tests/dont-ignore-invalid-inputs.phpt | 42 +++++++++++++++++++ 2 files changed, 42 insertions(+), 27 deletions(-) delete mode 100644 ext/iconv/tests/bug48147.phpt create mode 100644 ext/iconv/tests/dont-ignore-invalid-inputs.phpt diff --git a/ext/iconv/tests/bug48147.phpt b/ext/iconv/tests/bug48147.phpt deleted file mode 100644 index ce304eecfb3c7..0000000000000 --- a/ext/iconv/tests/bug48147.phpt +++ /dev/null @@ -1,27 +0,0 @@ ---TEST-- -Bug #48147 (iconv with //IGNORE cuts the string) ---EXTENSIONS-- -iconv ---FILE-- - ---EXPECTF-- -Notice: iconv(): Detected an illegal character in input string in %s on line %d -bool(false) -string(10) "aa%C3%B8aa" - -Notice: iconv(): Detected an incomplete multibyte character in input string in %s on line %d -string(0) "" -string(8) "%C3%B8aa" - -Notice: iconv(): Detected an incomplete multibyte character in input string in %s on line %d -string(0) "" diff --git a/ext/iconv/tests/dont-ignore-invalid-inputs.phpt b/ext/iconv/tests/dont-ignore-invalid-inputs.phpt new file mode 100644 index 0000000000000..0cf5d10330156 --- /dev/null +++ b/ext/iconv/tests/dont-ignore-invalid-inputs.phpt @@ -0,0 +1,42 @@ +--TEST-- +iconv with //IGNORE should not ignore invalid input sequences +--EXTENSIONS-- +iconv +--SKIPIF-- + +--FILE-- + +--EXPECTF-- +Notice: iconv(): Detected an illegal character in input string in %s on line %d +bool(false) + +Notice: iconv(): Detected an illegal character in input string in %s on line %d +bool(false) + +Notice: iconv(): Detected an incomplete multibyte character in input string in %s on line %d +bool(false) + +Notice: iconv(): Detected an illegal character in input string in %s on line %d +bool(false) + +Notice: iconv(): Detected an incomplete multibyte character in input string in %s on line %d +bool(false)