From 65d6a55e493845efb2f40c6715f6e44a1521b0c6 Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Tue, 2 Mar 2021 19:39:08 +0100 Subject: [PATCH 1/3] Fix #51903: simplexml_load_file() doesn't use HTTP headers The `encoding` attribute of the XML declaration is optional; it is good practice to use external encoding information where available if it is missing. Thus, we check for `charset` info of `Content-Type` headers, and see whether the encoding is supported. This is just a PHP 7.4+ compatible update of Mike's patch. Co-authored-by: Michael Wallner --- ext/libxml/libxml.c | 29 +++++++++++++++++++++++++++++ ext/libxml/tests/bug51903.phpt | 26 ++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 ext/libxml/tests/bug51903.phpt diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index c024e1667025c..1f6beb4d2a8b5 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -409,6 +409,35 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc) return(NULL); } + /* Check if there's been an external transport protocol with an encoding information */ + if (enc == XML_CHAR_ENCODING_NONE) { + php_stream *s = (php_stream *) context; + + if (Z_TYPE(s->wrapperdata) == IS_ARRAY) { + zval *header; + + ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) { + char *buf = "Content-Type:"; + if (Z_TYPE_P(header) == IS_STRING && + !zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) { + char *needle = estrdup("charset="); + char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header)); + char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), sizeof("charset=")-1); + + if (encoding) { + enc = xmlParseCharEncoding(encoding + sizeof("charset=")-1); + if (enc <= XML_CHAR_ENCODING_NONE) { + enc = XML_CHAR_ENCODING_NONE; + } + } + efree(haystack); + efree(needle); + break; /* found content-type */ + } + } ZEND_HASH_FOREACH_END(); + } + } + /* Allocate the 
Input buffer front-end. */ ret = xmlAllocParserInputBuffer(enc); if (ret != NULL) { diff --git a/ext/libxml/tests/bug51903.phpt b/ext/libxml/tests/bug51903.phpt new file mode 100644 index 0000000000000..e450b3f74810d --- /dev/null +++ b/ext/libxml/tests/bug51903.phpt @@ -0,0 +1,26 @@ +--TEST-- +Bug #51903 (simplexml_load_file() doesn't use HTTP headers) +--SKIPIF-- + +--FILE-- +\n" + . "\xE4\xF6\xFC\n", +]; +$pid = http_server('tcp://127.0.0.1:12342', $responses); + +$sxe = simplexml_load_file('http://127.0.0.1:12342/'); +echo "$sxe\n"; + +http_server_kill($pid); +?> +--EXPECT-- +äöü From aeca52b9a4dafa06a46e9f8c796b6ca6d882897b Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Wed, 3 Mar 2021 17:32:26 +0100 Subject: [PATCH 2/3] Fix erroneous type declaration Of course, a buffer should be an array. --- ext/libxml/libxml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index 1f6beb4d2a8b5..f4f53a3cdc6e6 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -417,7 +417,7 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc) zval *header; ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) { - char *buf = "Content-Type:"; + char buf[] = "Content-Type:"; if (Z_TYPE_P(header) == IS_STRING && !zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) { char *needle = estrdup("charset="); From 03cb853e0139d01592195395142928584076f20c Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Wed, 3 Mar 2021 19:23:39 +0100 Subject: [PATCH 3/3] Cater to trailing parameters and quoted-strings We do not need to cater to escaped backslashes and quotes in quoted-strings, since no known character encoding name contains these characters.
--- ext/libxml/libxml.c | 21 ++++++++++++++++++++- ext/libxml/tests/bug51903.phpt | 22 +++++++++++++++++----- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index f4f53a3cdc6e6..2f6e9adc886a9 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -425,7 +425,26 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc) char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), sizeof("charset=")-1); if (encoding) { - enc = xmlParseCharEncoding(encoding + sizeof("charset=")-1); + char *end; + + encoding += sizeof("charset=")-1; + if (*encoding == '"') { + encoding++; + } + end = strchr(encoding, ';'); + if (end == NULL) { + end = encoding + strlen(encoding); + } + end--; /* end == encoding-1 isn't a buffer underrun */ + while (*end == ' ' || *end == '\t') { + end--; + } + if (*end == '"') { + end--; + } + if (encoding >= end) continue; + *(end+1) = '\0'; + enc = xmlParseCharEncoding(encoding); if (enc <= XML_CHAR_ENCODING_NONE) { enc = XML_CHAR_ENCODING_NONE; } diff --git a/ext/libxml/tests/bug51903.phpt b/ext/libxml/tests/bug51903.phpt index e450b3f74810d..36a4b5570448a 100644 --- a/ext/libxml/tests/bug51903.phpt +++ b/ext/libxml/tests/bug51903.phpt @@ -11,16 +11,28 @@ http_server_skipif('tcp://127.0.0.1:12342'); require "./ext/standard/tests/http/server.inc"; $responses = [ "data://text/plain,HTTP/1.1 200 OK\r\n" - . "Content-Type: text/xml; charset=ISO-8859-1\r\n\r\n" - . "\n" - . "\xE4\xF6\xFC\n", + . "Content-Type: text/xml; charset=ISO-8859-1\r\n\r\n" + . "\n" + . "\xE4\xF6\xFC\n", + "data://text/plain,HTTP/1.1 200 OK\r\n" + . "Content-Type: text/xml; charset=ISO-8859-1; foo=bar\r\n\r\n" + . "\n" + . "\xE4\xF6\xFC\n", + "data://text/plain,HTTP/1.1 200 OK\r\n" + . "Content-Type: text/xml; charset=\"ISO-8859-1\" ; foo=bar\r\n\r\n" + . "\n" + . 
"\xE4\xF6\xFC\n", ]; $pid = http_server('tcp://127.0.0.1:12342', $responses); -$sxe = simplexml_load_file('http://127.0.0.1:12342/'); -echo "$sxe\n"; +for ($i = 0; $i < count($responses); $i++) { + $sxe = simplexml_load_file('http://127.0.0.1:12342/'); + echo "$sxe\n"; +} http_server_kill($pid); ?> --EXPECT-- äöü +äöü +äöü