Skip to content

Commit 8d9fb64

Browse files
committed
Implement MIME sniff
1 parent 5cd92b1 commit 8d9fb64

File tree

7 files changed

+444
-44
lines changed

7 files changed

+444
-44
lines changed

ext/dom/html_document.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,6 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
715715
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
716716
}
717717

718-
// TODO: https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
719718
stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, /* opened_path */ NULL, /* context */ php_libxml_get_stream_context());
720719
if (!stream) {
721720
if (!EG(exception)) {
@@ -724,6 +723,19 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
724723
RETURN_THROWS();
725724
}
726725

726+
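/* MIME sniff: if the encoding is still to be determined implicitly, prefer a charset sniffed from the stream when it names a known encoding */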
727+
if (should_determine_encoding_implicitly) {
728+
zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
729+
if (charset != NULL) {
730+
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(charset), ZSTR_LEN(charset));
731+
if (encoding_data != NULL) {
732+
should_determine_encoding_implicitly = false;
733+
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
734+
}
735+
zend_string_release_ex(charset, false);
736+
}
737+
}
738+
727739
lxb_html_document_t *document = lxb_html_document_create();
728740
if (UNEXPECTED(document == NULL)) {
729741
goto fail_oom;
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
--TEST--
2+
DOM\HTMLDocument::createFromFile() HTTP header Content-Type
3+
--EXTENSIONS--
4+
dom
5+
--SKIPIF--
6+
<?php
7+
if (@!include "./ext/standard/tests/http/server.inc") die('skip server.inc not available');
8+
http_server_skipif();
9+
?>
10+
--FILE--
11+
<?php
12+
require "./ext/standard/tests/http/server.inc";
13+
14+
$tests = [
15+
"Invalid type/subtype" => [
16+
"/html; Charset=\"ISO-8859-1\"",
17+
"text/; Charset=\"ISO-8859-1\"",
18+
"tex°t/html; Charset=\"ISO-8859-1\"",
19+
"/; Charset=\"ISO-8859-1\"",
20+
"$/€; Charset=\"ISO-8859-1\"",
21+
"; Charset=\"ISO-8859-1\"",
22+
";",
23+
"",
24+
" \t",
25+
],
26+
"Valid type/subtype without charset" => [
27+
"text/html; x=ISO-8859-1",
28+
"text/html; x=\"ISO-8859-1\"",
29+
"text/html; charet=\"ISO-8859-1\"",
30+
"text/html; chars et=\"ISO-8859-1\"",
31+
],
32+
"All valid inputs" => [
33+
"text/html; charset=ISO-8859-1",
34+
"\t\r text/html; charset=ISO-8859-1 \t",
35+
"text/html; foo=bar;charset=ISO-8859-1",
36+
"text/html; foo=bar;charset=ISO-8859-1;bar=\"foooooo\"",
37+
"text/html;;;; charset=ISO-8859-1",
38+
"text/html; Charset=\"ISO-8859-1\"",
39+
"text/html; Charset=\"ISO\\-8859-1\"",
40+
"text/html; ;; ; ;; Charset=\"ISO-8859-1\"",
41+
"text/html;Charset=\"ISO-8859-1",
42+
"tex.t/h#\$%!&'*%2B-.^_`|~tml;Charset=\"ISO-8859-1\"", // Note: have to encode + as 2B because of implementation details of http_server()
43+
],
44+
"Valid input, but invalid encoding name" => [
45+
"text/html;Charset=\"ISO-8859-1\\",
46+
"text/html;Charset=\"ISO-8859-1\\\"",
47+
"text/html;Charset=\"foobar\\\"",
48+
"text/html;Charset=\"\\\"",
49+
"text/html;Charset=",
50+
],
51+
];
52+
53+
foreach ($tests as $name => $headers) {
54+
echo "--- $name ---\n";
55+
$responses = array_map(fn ($header) => "data://text/plain,HTTP/1.1 200 OK\r\nContent-Type: " . $header . "\r\n\r\n" . "<p>\xE4\xF6\xFC</p>\n", $headers);
56+
['pid' => $pid, 'uri' => $uri] = http_server($responses);
57+
for ($i = 0; $i < count($responses); $i++) {
58+
$result = DOM\HTMLDocument::createFromFile($uri, LIBXML_NOERROR);
59+
echo $result->textContent;
60+
}
61+
http_server_kill($pid);
62+
}
63+
?>
64+
--EXPECT--
65+
--- Invalid type/subtype ---
66+
���
67+
���
68+
���
69+
���
70+
���
71+
���
72+
���
73+
���
74+
���
75+
--- Valid type/subtype without charset ---
76+
���
77+
���
78+
���
79+
���
80+
--- All valid inputs ---
81+
äöü
82+
äöü
83+
äöü
84+
äöü
85+
äöü
86+
äöü
87+
äöü
88+
äöü
89+
äöü
90+
äöü
91+
--- Valid input, but invalid encoding name ---
92+
���
93+
���
94+
���
95+
���
96+
���

ext/libxml/config.w32

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ if (PHP_LIBXML == "yes") {
99
CHECK_HEADER_ADD_INCLUDE("libxml/tree.h", "CFLAGS_LIBXML", PHP_PHP_BUILD + "\\include\\libxml2") &&
1010
ADD_EXTENSION_DEP('libxml', 'iconv')) {
1111

12-
EXTENSION("libxml", "libxml.c", false /* never shared */, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
12+
EXTENSION("libxml", "libxml.c mime_sniff.c", false /* never shared */, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
1313
AC_DEFINE("HAVE_LIBXML", 1, "LibXML support");
1414
ADD_FLAG("CFLAGS_LIBXML", "/D LIBXML_STATIC /D LIBXML_STATIC_FOR_DLL /D HAVE_WIN32_THREADS ");
1515
if (!PHP_LIBXML_SHARED) {

ext/libxml/config0.m4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if test "$PHP_LIBXML" != "no"; then
1111

1212
PHP_SETUP_LIBXML(LIBXML_SHARED_LIBADD, [
1313
AC_DEFINE(HAVE_LIBXML,1,[ ])
14-
PHP_NEW_EXTENSION(libxml, [libxml.c], $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1)
14+
PHP_NEW_EXTENSION(libxml, [libxml.c mime_sniff.c], $ext_shared,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1)
1515
PHP_INSTALL_HEADERS([ext/libxml/php_libxml.h])
1616
])
1717
fi

ext/libxml/libxml.c

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -501,47 +501,13 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc)
501501
/* Check whether an external transport protocol supplied encoding information */
502502
if (enc == XML_CHAR_ENCODING_NONE) {
503503
php_stream *s = (php_stream *) context;
504-
505-
if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
506-
zval *header;
507-
508-
ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
509-
const char buf[] = "Content-Type:";
510-
if (Z_TYPE_P(header) == IS_STRING &&
511-
!zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {
512-
char needle[] = "charset=";
513-
char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header));
514-
char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), strlen(needle));
515-
516-
if (encoding) {
517-
char *end;
518-
519-
encoding += sizeof("charset=")-1;
520-
if (*encoding == '"') {
521-
encoding++;
522-
}
523-
end = strchr(encoding, ';');
524-
if (end == NULL) {
525-
end = encoding + strlen(encoding);
526-
}
527-
end--; /* end == encoding-1 isn't a buffer underrun */
528-
while (*end == ' ' || *end == '\t') {
529-
end--;
530-
}
531-
if (*end == '"') {
532-
end--;
533-
}
534-
if (encoding >= end) continue;
535-
*(end+1) = '\0';
536-
enc = xmlParseCharEncoding(encoding);
537-
if (enc <= XML_CHAR_ENCODING_NONE) {
538-
enc = XML_CHAR_ENCODING_NONE;
539-
}
540-
}
541-
efree(haystack);
542-
break; /* found content-type */
543-
}
544-
} ZEND_HASH_FOREACH_END();
504+
zend_string *charset = php_libxml_sniff_charset_from_stream(s);
505+
if (charset != NULL) {
506+
enc = xmlParseCharEncoding(ZSTR_VAL(charset));
507+
if (enc <= XML_CHAR_ENCODING_NONE) {
508+
enc = XML_CHAR_ENCODING_NONE;
509+
}
510+
zend_string_release_ex(charset, false);
545511
}
546512
}
547513

0 commit comments

Comments
 (0)