Skip to content

Commit 219f8bf

Browse files
committed
Implement MIME sniff
1 parent 1499389 commit 219f8bf

File tree

7 files changed

+425
-44
lines changed

7 files changed

+425
-44
lines changed

ext/dom/html_document.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,6 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
715715
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
716716
}
717717

718-
// TODO: https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
719718
stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, /* opened_path */ NULL, /* context */ php_libxml_get_stream_context());
720719
if (!stream) {
721720
if (!EG(exception)) {
@@ -724,6 +723,19 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
724723
RETURN_THROWS();
725724
}
726725

726+
/* MIME sniff */
727+
if (should_determine_encoding_implicitly) {
728+
zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
729+
if (charset != NULL) {
730+
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(charset), ZSTR_LEN(charset));
731+
if (encoding_data != NULL) {
732+
should_determine_encoding_implicitly = false;
733+
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
734+
}
735+
zend_string_release_ex(charset, false);
736+
}
737+
}
738+
727739
lxb_html_document_t *document = lxb_html_document_create();
728740
if (UNEXPECTED(document == NULL)) {
729741
goto fail_oom;
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
--TEST--
2+
DOM\HTMLDocument::createFromFile() HTTP header Content-Type
3+
--EXTENSIONS--
4+
dom
5+
--SKIPIF--
6+
<?php
7+
if (@!include "./ext/standard/tests/http/server.inc") die('skip server.inc not available');
8+
http_server_skipif();
9+
?>
10+
--FILE--
11+
<?php
12+
require "./ext/standard/tests/http/server.inc";
13+
14+
$tests = [
15+
"Invalid type/subtype" => [
16+
"/html; Charset=\"ISO-8859-1\"",
17+
"text/; Charset=\"ISO-8859-1\"",
18+
"/; Charset=\"ISO-8859-1\"",
19+
"$/€; Charset=\"ISO-8859-1\"",
20+
"; Charset=\"ISO-8859-1\"",
21+
],
22+
"Valid type/subtype without charset" => [
23+
"text/html; x=ISO-8859-1",
24+
"text/html; x=\"ISO-8859-1\"",
25+
"text/html; charet=\"ISO-8859-1\"",
26+
"text/html; chars et=\"ISO-8859-1\"",
27+
],
28+
"All valid" => [
29+
"text/html; charset=ISO-8859-1",
30+
"\t\r text/html; charset=ISO-8859-1 \t",
31+
"text/html; foo=bar;charset=ISO-8859-1",
32+
"text/html; foo=bar;charset=ISO-8859-1;bar=\"foooooo\"",
33+
"text/html;;;; charset=ISO-8859-1",
34+
"text/html; Charset=\"ISO-8859-1\"",
35+
"text/html; Charset=\"ISO\\-8859-1\"",
36+
"text/html; ;; ; ;; Charset=\"ISO-8859-1\"",
37+
"text/html;Charset=\"ISO-8859-1",
38+
"text/html;Charset=\"ISO-8859-1\\",
39+
"text/html;Charset=\"ISO-8859-1\\\"",
40+
],
41+
];
42+
43+
foreach ($tests as $name => $headers) {
44+
echo "--- $name ---\n";
45+
$responses = array_map(fn ($header) => "data://text/plain,HTTP/1.1 200 OK\r\nContent-Type: " . $header . "\r\n\r\n" . "<p>\xE4\xF6\xFC</p>\n", $headers);
46+
['pid' => $pid, 'uri' => $uri] = http_server($responses);
47+
for ($i = 0; $i < count($responses); $i++) {
48+
$result = DOM\HTMLDocument::createFromFile($uri, LIBXML_NOERROR);
49+
echo $result->textContent;
50+
}
51+
http_server_kill($pid);
52+
}
53+
?>
54+
--EXPECT--
55+
--- Invalid type/subtype ---
56+
���
57+
���
58+
���
59+
���
60+
���
61+
--- Valid type/subtype without charset ---
62+
���
63+
���
64+
���
65+
���
66+
--- All valid ---
67+
äöü
68+
äöü
69+
äöü
70+
äöü
71+
äöü
72+
äöü
73+
äöü
74+
äöü
75+
äöü
76+
���
77+
���

ext/libxml/config.w32

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ if (PHP_LIBXML == "yes") {
99
CHECK_HEADER_ADD_INCLUDE("libxml/tree.h", "CFLAGS_LIBXML", PHP_PHP_BUILD + "\\include\\libxml2") &&
1010
ADD_EXTENSION_DEP('libxml', 'iconv')) {
1111

12-
EXTENSION("libxml", "libxml.c", false /* never shared */, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
12+
EXTENSION("libxml", "libxml.c mime_sniff.c", false /* never shared */, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
1313
AC_DEFINE("HAVE_LIBXML", 1, "LibXML support");
1414
ADD_FLAG("CFLAGS_LIBXML", "/D LIBXML_STATIC /D LIBXML_STATIC_FOR_DLL /D HAVE_WIN32_THREADS ");
1515
if (!PHP_LIBXML_SHARED) {

ext/libxml/config0.m4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if test "$PHP_LIBXML" != "no"; then
1111

1212
PHP_SETUP_LIBXML(LIBXML_SHARED_LIBADD, [
1313
AC_DEFINE(HAVE_LIBXML,1,[ ])
14-
PHP_NEW_EXTENSION(libxml, [libxml.c], $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1)
14+
PHP_NEW_EXTENSION(libxml, [libxml.c mime_sniff.c], $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1)
1515
PHP_INSTALL_HEADERS([ext/libxml/php_libxml.h])
1616
])
1717
fi

ext/libxml/libxml.c

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -501,47 +501,13 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc)
501501
/* Check if there's been an external transport protocol with an encoding information */
502502
if (enc == XML_CHAR_ENCODING_NONE) {
503503
php_stream *s = (php_stream *) context;
504-
505-
if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
506-
zval *header;
507-
508-
ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
509-
const char buf[] = "Content-Type:";
510-
if (Z_TYPE_P(header) == IS_STRING &&
511-
!zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {
512-
char needle[] = "charset=";
513-
char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header));
514-
char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), strlen(needle));
515-
516-
if (encoding) {
517-
char *end;
518-
519-
encoding += sizeof("charset=")-1;
520-
if (*encoding == '"') {
521-
encoding++;
522-
}
523-
end = strchr(encoding, ';');
524-
if (end == NULL) {
525-
end = encoding + strlen(encoding);
526-
}
527-
end--; /* end == encoding-1 isn't a buffer underrun */
528-
while (*end == ' ' || *end == '\t') {
529-
end--;
530-
}
531-
if (*end == '"') {
532-
end--;
533-
}
534-
if (encoding >= end) continue;
535-
*(end+1) = '\0';
536-
enc = xmlParseCharEncoding(encoding);
537-
if (enc <= XML_CHAR_ENCODING_NONE) {
538-
enc = XML_CHAR_ENCODING_NONE;
539-
}
540-
}
541-
efree(haystack);
542-
break; /* found content-type */
543-
}
544-
} ZEND_HASH_FOREACH_END();
504+
zend_string *charset = php_libxml_sniff_charset_from_stream(s);
505+
if (charset != NULL) {
506+
enc = xmlParseCharEncoding(ZSTR_VAL(charset));
507+
if (enc <= XML_CHAR_ENCODING_NONE) {
508+
enc = XML_CHAR_ENCODING_NONE;
509+
}
510+
zend_string_release_ex(charset, false);
545511
}
546512
}
547513

0 commit comments

Comments
 (0)