From 006e0f105455c039d85c8a75bc0faa9570ee6a26 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Fri, 25 Aug 2023 23:36:30 +0200 Subject: [PATCH 01/53] Split off and wrap cloning API --- ext/dom/document.c | 8 +------- ext/dom/node.c | 30 +----------------------------- ext/dom/php_dom.c | 10 ++++++++++ ext/dom/php_dom.h | 2 ++ 4 files changed, 14 insertions(+), 36 deletions(-) diff --git a/ext/dom/document.c b/ext/dom/document.c index 31b889125269b..2aebbccca719d 100644 --- a/ext/dom/document.c +++ b/ext/dom/document.c @@ -776,8 +776,6 @@ PHP_METHOD(DOMDocument, importNode) dom_object *intern, *nodeobj; int ret; bool recursive = 0; - /* See http://www.xmlsoft.org/html/libxml-tree.html#xmlDocCopyNode for meaning of values */ - int extended_recursive; if (zend_parse_parameters(ZEND_NUM_ARGS(), "O|b", &node, dom_node_class_entry, &recursive) == FAILURE) { RETURN_THROWS(); @@ -796,11 +794,7 @@ PHP_METHOD(DOMDocument, importNode) if (nodep->doc == docp) { retnodep = nodep; } else { - extended_recursive = recursive; - if ((recursive == 0) && (nodep->type == XML_ELEMENT_NODE)) { - extended_recursive = 2; - } - retnodep = xmlDocCopyNode(nodep, docp, extended_recursive); + retnodep = dom_clone_node(nodep, docp, recursive); if (!retnodep) { RETURN_FALSE; } diff --git a/ext/dom/node.c b/ext/dom/node.c index 5719bdde9999d..cd62565df884d 100644 --- a/ext/dom/node.c +++ b/ext/dom/node.c @@ -1329,40 +1329,12 @@ PHP_METHOD(DOMNode, cloneNode) DOM_GET_OBJ(n, id, xmlNodePtr, intern); - node = xmlDocCopyNode(n, n->doc, recursive); + node = dom_clone_node(n, n->doc, recursive); if (!node) { RETURN_FALSE; } - /* When deep is false Element nodes still require the attributes - Following taken from libxml as xmlDocCopyNode doesn't do this */ - if (n->type == XML_ELEMENT_NODE && recursive == 0) { - if (n->nsDef != NULL) { - node->nsDef = xmlCopyNamespaceList(n->nsDef); - } - if (n->ns != NULL) { - xmlNsPtr ns; - ns = xmlSearchNs(n->doc, node, 
n->ns->prefix); - if (ns == NULL) { - ns = xmlSearchNs(n->doc, n, n->ns->prefix); - if (ns != NULL) { - xmlNodePtr root = node; - - while (root->parent != NULL) { - root = root->parent; - } - node->ns = xmlNewNs(root, ns->href, ns->prefix); - } - } else { - node->ns = ns; - } - } - if (n->properties != NULL) { - node->properties = xmlCopyPropList(node, n->properties); - } - } - if (node->type == XML_ATTRIBUTE_NODE && n->ns != NULL && node->ns == NULL) { /* Let reconciliation deal with this. The lifetime of the namespace poses no problem * because we're increasing the refcount of the document proxy at the return. diff --git a/ext/dom/php_dom.c b/ext/dom/php_dom.c index ce540ad4a3b0a..99d2f7fdd1d2a 100644 --- a/ext/dom/php_dom.c +++ b/ext/dom/php_dom.c @@ -1819,4 +1819,14 @@ static int dom_nodemap_has_dimension(zend_object *object, zval *member, int chec return offset >= 0 && offset < php_dom_get_namednodemap_length(php_dom_obj_from_obj(object)); } /* }}} end dom_nodemap_has_dimension */ +xmlNodePtr dom_clone_node(xmlNodePtr node, xmlDocPtr doc, bool recursive) +{ + /* See http://www.xmlsoft.org/html/libxml-tree.html#xmlDocCopyNode for meaning of values */ + int extended_recursive = recursive; + if (!recursive && node->type == XML_ELEMENT_NODE) { + extended_recursive = 2; + } + return xmlDocCopyNode(node, doc, extended_recursive); +} + #endif /* HAVE_DOM */ diff --git a/ext/dom/php_dom.h b/ext/dom/php_dom.h index b77036e83c294..df13dcef0d6cc 100644 --- a/ext/dom/php_dom.h +++ b/ext/dom/php_dom.h @@ -174,6 +174,8 @@ void php_dom_nodelist_get_item_into_zval(dom_nnodemap_object *objmap, zend_long int php_dom_get_namednodemap_length(dom_object *obj); int php_dom_get_nodelist_length(dom_object *obj); +xmlNodePtr dom_clone_node(xmlNodePtr node, xmlDocPtr doc, bool recursive); + #define DOM_GET_INTERN(__id, __intern) { \ __intern = Z_DOMOBJ_P(__id); \ if (UNEXPECTED(__intern->ptr == NULL)) { \ From d0814070f74904b4a348e659c0b7338ec0d391ce Mon Sep 17 00:00:00 2001 From: 
Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 26 Aug 2023 01:11:26 +0200 Subject: [PATCH 02/53] Update ext/libxml APIs so that non-libxml users can hook into the error mechanism --- ext/libxml/libxml.c | 43 +++++++++++++++++++++++++++++++---------- ext/libxml/php_libxml.h | 1 + 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index 6ddbdff5fb800..baf5bfa5028e5 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -608,7 +608,7 @@ static void _php_libxml_free_error(void *ptr) xmlResetError((xmlErrorPtr) ptr); } -static void _php_list_set_error_structure(xmlErrorPtr error, const char *msg) +static void _php_list_set_error_structure(xmlErrorPtr error, const char *msg, int line, int column) { xmlError error_copy; int ret; @@ -621,6 +621,8 @@ static void _php_list_set_error_structure(xmlErrorPtr error, const char *msg) } else { error_copy.code = XML_ERR_INTERNAL_ERROR; error_copy.level = XML_ERR_ERROR; + error_copy.line = line; + error_copy.int2 = column; error_copy.message = (char*)xmlStrdup((const xmlChar*)msg); ret = 0; } @@ -630,7 +632,7 @@ static void _php_list_set_error_structure(xmlErrorPtr error, const char *msg) } } -static void php_libxml_ctx_error_level(int level, void *ctx, const char *msg) +static void php_libxml_ctx_error_level(int level, void *ctx, const char *msg, int line) { xmlParserCtxtPtr parser; @@ -638,9 +640,9 @@ static void php_libxml_ctx_error_level(int level, void *ctx, const char *msg) if (parser != NULL && parser->input != NULL) { if (parser->input->filename) { - php_error_docref(NULL, level, "%s in %s, line: %d", msg, parser->input->filename, parser->input->line); + php_error_docref(NULL, level, "%s in %s, line: %d", msg, parser->input->filename, line); } else { - php_error_docref(NULL, level, "%s in Entity, line: %d", msg, parser->input->line); + php_error_docref(NULL, level, "%s in Entity, line: %d", msg, line); } } else { php_error_docref(NULL, 
E_WARNING, "%s", msg); @@ -650,13 +652,13 @@ static void php_libxml_ctx_error_level(int level, void *ctx, const char *msg) void php_libxml_issue_error(int level, const char *msg) { if (LIBXML(error_list)) { - _php_list_set_error_structure(NULL, msg); + _php_list_set_error_structure(NULL, msg, 0, 0); } else { php_error_docref(NULL, level, "%s", msg); } } -static void php_libxml_internal_error_handler(int error_type, void *ctx, const char **msg, va_list ap) +static void php_libxml_internal_error_handler_ex(int error_type, void *ctx, const char **msg, va_list ap, int line, int column) { char *buf; int len, len_iter, output = 0; @@ -676,15 +678,15 @@ static void php_libxml_internal_error_handler(int error_type, void *ctx, const c if (output == 1) { if (LIBXML(error_list)) { - _php_list_set_error_structure(NULL, ZSTR_VAL(LIBXML(error_buffer).s)); + _php_list_set_error_structure(NULL, ZSTR_VAL(LIBXML(error_buffer).s), line, column); } else if (!EG(exception)) { /* Don't throw additional notices/warnings if an exception has already been thrown. 
*/ switch (error_type) { case PHP_LIBXML_CTX_ERROR: - php_libxml_ctx_error_level(E_WARNING, ctx, ZSTR_VAL(LIBXML(error_buffer).s)); + php_libxml_ctx_error_level(E_WARNING, ctx, ZSTR_VAL(LIBXML(error_buffer).s), line); break; case PHP_LIBXML_CTX_WARNING: - php_libxml_ctx_error_level(E_NOTICE, ctx, ZSTR_VAL(LIBXML(error_buffer).s)); + php_libxml_ctx_error_level(E_NOTICE, ctx, ZSTR_VAL(LIBXML(error_buffer).s), line); break; default: php_error_docref(NULL, E_WARNING, "%s", ZSTR_VAL(LIBXML(error_buffer).s)); @@ -694,6 +696,19 @@ static void php_libxml_internal_error_handler(int error_type, void *ctx, const c } } +static void php_libxml_internal_error_handler(int error_type, void *ctx, const char **msg, va_list ap) +{ + int line = 0; + int column = 0; + xmlParserCtxtPtr parser = (xmlParserCtxtPtr) ctx; + /* Context is not valid for PHP_LIBXML_ERROR, don't dereference it in that case */ + if (error_type != PHP_LIBXML_ERROR && parser != NULL && parser->input != NULL) { + line = parser->input->line; + column = parser->input->col; + } + php_libxml_internal_error_handler_ex(error_type, ctx, msg, ap, line, column); +} + static xmlParserInputPtr _php_libxml_external_entity_loader(const char *URL, const char *ID, xmlParserCtxtPtr context) { @@ -823,6 +838,14 @@ static xmlParserInputPtr _php_libxml_pre_ext_ent_loader(const char *URL, } } +PHP_LIBXML_API void php_libxml_pretend_ctx_error_ex(int line, int column, const char *msg,...) +{ + va_list args; + va_start(args, msg); + php_libxml_internal_error_handler_ex(PHP_LIBXML_CTX_ERROR, NULL, &msg, args, line, column); + va_end(args); +} + PHP_LIBXML_API void php_libxml_ctx_error(void *ctx, const char *msg, ...) { va_list args; @@ -841,7 +864,7 @@ PHP_LIBXML_API void php_libxml_ctx_warning(void *ctx, const char *msg, ...) 
static void php_libxml_structured_error_handler(void *userData, xmlErrorPtr error) { - _php_list_set_error_structure(error, NULL); + _php_list_set_error_structure(error, NULL, 0, 0); return; } diff --git a/ext/libxml/php_libxml.h b/ext/libxml/php_libxml.h index 7ce7def92ae5f..7050dd7ab842e 100644 --- a/ext/libxml/php_libxml.h +++ b/ext/libxml/php_libxml.h @@ -131,6 +131,7 @@ PHP_LIBXML_API void php_libxml_node_free_resource(xmlNodePtr node); PHP_LIBXML_API void php_libxml_node_decrement_resource(php_libxml_node_object *object); PHP_LIBXML_API void php_libxml_error_handler(void *ctx, const char *msg, ...); PHP_LIBXML_API void php_libxml_ctx_warning(void *ctx, const char *msg, ...); +PHP_LIBXML_API void php_libxml_pretend_ctx_error_ex(int line, int column, const char *msg,...); PHP_LIBXML_API void php_libxml_ctx_error(void *ctx, const char *msg, ...); PHP_LIBXML_API int php_libxml_xmlCheckUTF8(const unsigned char *s); PHP_LIBXML_API void php_libxml_switch_context(zval *context, zval *oldcontext); From c031fff4aad9a8f089097fa3fe7b8fd7abb4eba8 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 26 Aug 2023 01:11:52 +0200 Subject: [PATCH 03/53] Add is_html5_class field to document data in libxml --- ext/libxml/libxml.c | 1 + ext/libxml/php_libxml.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index baf5bfa5028e5..b55f57e2752d9 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -1356,6 +1356,7 @@ PHP_LIBXML_API int php_libxml_increment_doc_ref(php_libxml_node_object *object, object->document->refcount = ret_refcount; object->document->doc_props = NULL; object->document->cache_tag.modification_nr = 1; /* iterators start at 0, such that they will start in an uninitialised state */ + object->document->is_html5_class = false; } return ret_refcount; diff --git a/ext/libxml/php_libxml.h b/ext/libxml/php_libxml.h index 7050dd7ab842e..7ffb93274ba7d 100644 
--- a/ext/libxml/php_libxml.h +++ b/ext/libxml/php_libxml.h @@ -63,9 +63,10 @@ typedef struct { typedef struct _php_libxml_ref_obj { void *ptr; - int refcount; libxml_doc_props *doc_props; php_libxml_cache_tag cache_tag; + int refcount; + bool is_html5_class; } php_libxml_ref_obj; typedef struct _php_libxml_node_ptr { From 32747ec49d0e301d88be89dd006600daded23f3b Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 26 Aug 2023 01:12:23 +0200 Subject: [PATCH 04/53] Implement HTML5Document --- ext/dom/config.m4 | 19 +- ext/dom/config.w32 | 18 +- ext/dom/document.c | 33 +- ext/dom/dom_ce.h | 1 + ext/dom/dom_properties.h | 3 + ext/dom/html5_document.c | 1024 +++++++++++ ext/dom/html5_parser.c | 262 +++ ext/dom/html5_parser.h | 57 + ext/dom/html5_serializer.c | 351 ++++ ext/dom/html5_serializer.h | 31 + ext/dom/namespace_compat.c | 54 + ext/dom/namespace_compat.h | 39 + ext/dom/node.c | 2 +- ext/dom/php_dom.c | 50 +- ext/dom/php_dom.h | 12 +- ext/dom/php_dom.stub.php | 1575 +++++++++-------- ext/dom/php_dom_arginfo.h | 77 +- .../HTML5/encoding/Document_GB18030.phpt | 37 + .../HTML5/encoding/Document_Shift_JIS.phpt | 41 + .../HTML5/encoding/Document_UTF16BE_BOM.phpt | 39 + .../HTML5/encoding/Document_UTF16LE_BOM.phpt | 39 + .../HTML5/encoding/Document_UTF8_BOM.phpt | 39 + .../HTML5/encoding/Document_Windows1251.phpt | 41 + .../Document_encoding_edge_case_01.phpt | 16 + .../Document_encoding_edge_case_02.phpt | 28 + .../Document_encoding_edge_case_03.phpt | 18 + .../Document_encoding_edge_case_04.phpt | 16 + .../Document_encoding_edge_case_05.phpt | 22 + .../Document_encoding_edge_case_06.phpt | 16 + .../Document_encoding_edge_case_07.phpt | 22 + .../Document_encoding_field_test.phpt | 36 + .../Document_encoding_unicode_error.phpt | 27 + .../encoding/Document_fallback_encoding.phpt | 24 + .../Document_load_different_encoding.phpt | 19 + .../HTML5/encoding/fallback_encoding.html | 6 + ext/dom/tests/HTML5/encoding/gb18030.html 
| 7 + ext/dom/tests/HTML5/encoding/shift_jis.html | 7 + ext/dom/tests/HTML5/encoding/utf16be_bom.html | Bin 0 -> 212 bytes ext/dom/tests/HTML5/encoding/utf16le_bom.html | Bin 0 -> 212 bytes .../tests/HTML5/encoding/utf16le_error.html | Bin 0 -> 268 bytes ext/dom/tests/HTML5/encoding/utf8_bom.html | 7 + ext/dom/tests/HTML5/encoding/windows1251.html | 7 + .../Document_adopt_DOMDocument.phpt | 30 + .../HTML5/interactions/Document_clone.phpt | 31 + .../Document_node_ownerDocument_for_XML.phpt | 98 + .../Document_registerNodeClass_01.phpt | 28 + .../Document_registerNodeClass_02.phpt | 17 + .../Document_registerNodeClass_03.phpt | 26 + ...should_retain_properties_and_owner_01.phpt | 107 ++ ...should_retain_properties_and_owner_02.phpt | 106 ++ ...dHTMLFile_DOM_HTML_NO_DEFAULT_NS copy.phpt | 39 + .../Document_loadHTMLFile_empty_path.phpt | 19 + ...ment_loadHTMLFile_local_existing_file.phpt | 24 + ...oadHTMLFile_local_file_does_not_exist.phpt | 15 + ...oadHTMLFile_nul_terminator_cases_path.phpt | 21 + ...cument_loadHTMLFile_parser_warning_01.phpt | 20 + ...cument_loadHTMLFile_parser_warning_02.phpt | 21 + ...cument_loadHTMLFile_parser_warning_03.phpt | 17 + ...dHTMLFile_with_failing_stream_wrapper.phpt | 51 + ...dHTMLFile_with_working_stream_wrapper.phpt | 62 + ...ument_loadHTML_DOM_HTML_NO_DEFAULT_NS.phpt | 39 + .../Document_loadHTML_LIBXML_COMPACT.phpt | 41 + ...cument_loadHTML_LIBXML_HTML_NOIMPLIED.phpt | 93 + ...dHTML_LIBXML_HTML_NOIMPLIED_namespace.phpt | 17 + .../HTML5/parser/Document_loadHTML_empty.phpt | 15 + .../parser/Document_loadHTML_line_column.phpt | 59 + .../Document_loadHTML_normal_no_error.phpt | 41 + .../parser/Document_loadHTML_old_dtd.phpt | 40 + .../Document_loadHTML_parser_warning_01.phpt | 24 + .../Document_loadHTML_parser_warning_02.phpt | 33 + .../Document_loadHTML_parser_warning_03.phpt | 18 + ...oadHTML_parser_warning_internal_error.phpt | 31 + .../Document_loadHTML_without_body.phpt | 16 + .../HTML5/parser/Document_load_options.phpt | 109 ++ 
ext/dom/tests/HTML5/parser/paragraph.html | 1 + .../tests/HTML5/parser/parser_warning_01.html | 7 + .../tests/HTML5/parser/parser_warning_02.html | Bin 0 -> 191 bytes .../tests/HTML5/parser/parser_warning_03.html | 6 + .../HTML5/parser/predefined_namespaces.phpt | 101 ++ .../serializer/Document_escape_attribute.phpt | 16 + .../serializer/Document_escape_nbsp.phpt | 14 + .../Document_serialize_attribute_ns.phpt | 22 + .../serializer/Document_serialize_cdata.phpt | 14 + .../Document_serialize_comment.phpt | 14 + .../Document_serialize_doctype.phpt | 40 + .../Document_serialize_element_ns.phpt | 29 + .../Document_serialize_failing_stream.phpt | 49 + .../Document_serialize_fragment.phpt | 18 + .../Document_serialize_full_document.phpt | 52 + .../Document_serialize_ns_imported_01.phpt | 34 + .../Document_serialize_ns_imported_02.phpt | 33 + .../Document_serialize_ns_imported_03.phpt | 33 + .../Document_serialize_ns_imported_04.phpt | 33 + .../Document_serialize_ns_imported_05.phpt | 33 + .../Document_serialize_ns_imported_06.phpt | 33 + ...ment_serialize_processing_instruction.phpt | 15 + .../Document_serialize_roots_test_empty.phpt | 30 + .../Document_serialize_text_01.phpt | 16 + .../Document_serialize_text_02.phpt | 27 + .../Document_serialize_text_03.phpt | 27 + .../Document_serialize_void_elements.phpt | 100 ++ ext/dom/xpath.c | 1 - 102 files changed, 5422 insertions(+), 806 deletions(-) create mode 100644 ext/dom/html5_document.c create mode 100644 ext/dom/html5_parser.c create mode 100644 ext/dom/html5_parser.h create mode 100644 ext/dom/html5_serializer.c create mode 100644 ext/dom/html5_serializer.h create mode 100644 ext/dom/namespace_compat.c create mode 100644 ext/dom/namespace_compat.h create mode 100644 ext/dom/tests/HTML5/encoding/Document_GB18030.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_Shift_JIS.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_UTF16BE_BOM.phpt create mode 100644 
ext/dom/tests/HTML5/encoding/Document_UTF16LE_BOM.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_UTF8_BOM.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_Windows1251.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_01.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_02.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_03.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_04.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_05.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_06.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_edge_case_07.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_field_test.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_encoding_unicode_error.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_fallback_encoding.phpt create mode 100644 ext/dom/tests/HTML5/encoding/Document_load_different_encoding.phpt create mode 100644 ext/dom/tests/HTML5/encoding/fallback_encoding.html create mode 100644 ext/dom/tests/HTML5/encoding/gb18030.html create mode 100644 ext/dom/tests/HTML5/encoding/shift_jis.html create mode 100644 ext/dom/tests/HTML5/encoding/utf16be_bom.html create mode 100644 ext/dom/tests/HTML5/encoding/utf16le_bom.html create mode 100644 ext/dom/tests/HTML5/encoding/utf16le_error.html create mode 100644 ext/dom/tests/HTML5/encoding/utf8_bom.html create mode 100644 ext/dom/tests/HTML5/encoding/windows1251.html create mode 100644 ext/dom/tests/HTML5/interactions/Document_adopt_DOMDocument.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_clone.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_node_ownerDocument_for_XML.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_registerNodeClass_01.phpt create mode 100644 
ext/dom/tests/HTML5/interactions/Document_registerNodeClass_02.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_registerNodeClass_03.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_should_retain_properties_and_owner_01.phpt create mode 100644 ext/dom/tests/HTML5/interactions/Document_should_retain_properties_and_owner_02.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_DOM_HTML_NO_DEFAULT_NS copy.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_empty_path.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_local_existing_file.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_local_file_does_not_exist.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_nul_terminator_cases_path.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_parser_warning_01.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_parser_warning_02.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_parser_warning_03.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_with_failing_stream_wrapper.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTMLFile_with_working_stream_wrapper.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_DOM_HTML_NO_DEFAULT_NS.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_LIBXML_COMPACT.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_LIBXML_HTML_NOIMPLIED.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_LIBXML_HTML_NOIMPLIED_namespace.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_empty.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_line_column.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_normal_no_error.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_old_dtd.phpt 
create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_parser_warning_01.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_parser_warning_02.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_parser_warning_03.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_parser_warning_internal_error.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_loadHTML_without_body.phpt create mode 100644 ext/dom/tests/HTML5/parser/Document_load_options.phpt create mode 100644 ext/dom/tests/HTML5/parser/paragraph.html create mode 100644 ext/dom/tests/HTML5/parser/parser_warning_01.html create mode 100644 ext/dom/tests/HTML5/parser/parser_warning_02.html create mode 100644 ext/dom/tests/HTML5/parser/parser_warning_03.html create mode 100644 ext/dom/tests/HTML5/parser/predefined_namespaces.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_escape_attribute.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_escape_nbsp.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_attribute_ns.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_cdata.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_comment.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_doctype.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_element_ns.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_failing_stream.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_fragment.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_full_document.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_01.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_02.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_03.phpt create mode 100644 
ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_04.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_05.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_ns_imported_06.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_processing_instruction.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_roots_test_empty.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_text_01.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_text_02.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_text_03.phpt create mode 100644 ext/dom/tests/HTML5/serializer/Document_serialize_void_elements.phpt diff --git a/ext/dom/config.m4 b/ext/dom/config.m4 index 6a83d10c8e245..c43bb35f100b5 100644 --- a/ext/dom/config.m4 +++ b/ext/dom/config.m4 @@ -12,7 +12,21 @@ if test "$PHP_DOM" != "no"; then PHP_SETUP_LIBXML(DOM_SHARED_LIBADD, [ AC_DEFINE(HAVE_DOM,1,[ ]) + PHP_LEXBOR_CFLAGS="-I@ext_srcdir@/lexbor -DLEXBOR_STATIC" + LEXBOR_DIR="lexbor/lexbor" + LEXBOR_SOURCES="$LEXBOR_DIR/ports/posix/lexbor/core/memory.c \ + $LEXBOR_DIR/core/array_obj.c $LEXBOR_DIR/core/array.c $LEXBOR_DIR/core/avl.c $LEXBOR_DIR/core/bst.c $LEXBOR_DIR/core/diyfp.c $LEXBOR_DIR/core/conv.c $LEXBOR_DIR/core/dobject.c $LEXBOR_DIR/core/dtoa.c $LEXBOR_DIR/core/hash.c $LEXBOR_DIR/core/mem.c $LEXBOR_DIR/core/mraw.c $LEXBOR_DIR/core/print.c $LEXBOR_DIR/core/serialize.c $LEXBOR_DIR/core/shs.c $LEXBOR_DIR/core/str.c $LEXBOR_DIR/core/strtod.c \ + $LEXBOR_DIR/dom/interface.c $LEXBOR_DIR/dom/interfaces/attr.c $LEXBOR_DIR/dom/interfaces/cdata_section.c $LEXBOR_DIR/dom/interfaces/character_data.c $LEXBOR_DIR/dom/interfaces/comment.c $LEXBOR_DIR/dom/interfaces/document.c $LEXBOR_DIR/dom/interfaces/document_fragment.c $LEXBOR_DIR/dom/interfaces/document_type.c $LEXBOR_DIR/dom/interfaces/element.c $LEXBOR_DIR/dom/interfaces/node.c 
$LEXBOR_DIR/dom/interfaces/processing_instruction.c $LEXBOR_DIR/dom/interfaces/shadow_root.c $LEXBOR_DIR/dom/interfaces/text.c \ + $LEXBOR_DIR/html/tokenizer/error.c $LEXBOR_DIR/html/tokenizer/state_comment.c $LEXBOR_DIR/html/tokenizer/state_doctype.c $LEXBOR_DIR/html/tokenizer/state_rawtext.c $LEXBOR_DIR/html/tokenizer/state_rcdata.c $LEXBOR_DIR/html/tokenizer/state_script.c $LEXBOR_DIR/html/tokenizer/state.c \ + $LEXBOR_DIR/html/tree/active_formatting.c $LEXBOR_DIR/html/tree/error.c $LEXBOR_DIR/html/tree/insertion_mode/after_after_body.c $LEXBOR_DIR/html/tree/insertion_mode/after_after_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/after_body.c $LEXBOR_DIR/html/tree/insertion_mode/after_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/after_head.c $LEXBOR_DIR/html/tree/insertion_mode/before_head.c $LEXBOR_DIR/html/tree/insertion_mode/before_html.c $LEXBOR_DIR/html/tree/insertion_mode/foreign_content.c $LEXBOR_DIR/html/tree/insertion_mode/in_body.c $LEXBOR_DIR/html/tree/insertion_mode/in_caption.c $LEXBOR_DIR/html/tree/insertion_mode/in_cell.c $LEXBOR_DIR/html/tree/insertion_mode/in_column_group.c $LEXBOR_DIR/html/tree/insertion_mode/in_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/in_head.c $LEXBOR_DIR/html/tree/insertion_mode/in_head_noscript.c $LEXBOR_DIR/html/tree/insertion_mode/initial.c $LEXBOR_DIR/html/tree/insertion_mode/in_row.c $LEXBOR_DIR/html/tree/insertion_mode/in_select.c $LEXBOR_DIR/html/tree/insertion_mode/in_select_in_table.c $LEXBOR_DIR/html/tree/insertion_mode/in_table_body.c $LEXBOR_DIR/html/tree/insertion_mode/in_table.c $LEXBOR_DIR/html/tree/insertion_mode/in_table_text.c $LEXBOR_DIR/html/tree/insertion_mode/in_template.c $LEXBOR_DIR/html/tree/insertion_mode/text.c $LEXBOR_DIR/html/tree/open_elements.c \ + $LEXBOR_DIR/encoding/big5.c $LEXBOR_DIR/encoding/decode.c $LEXBOR_DIR/encoding/encode.c $LEXBOR_DIR/encoding/encoding.c $LEXBOR_DIR/encoding/euc_kr.c $LEXBOR_DIR/encoding/gb18030.c $LEXBOR_DIR/encoding/iso_2022_jp_katakana.c 
$LEXBOR_DIR/encoding/jis0208.c $LEXBOR_DIR/encoding/jis0212.c $LEXBOR_DIR/encoding/range.c $LEXBOR_DIR/encoding/res.c $LEXBOR_DIR/encoding/single.c \ + $LEXBOR_DIR/html/encoding.c $LEXBOR_DIR/html/interface.c $LEXBOR_DIR/html/parser.c $LEXBOR_DIR/html/token.c $LEXBOR_DIR/html/token_attr.c $LEXBOR_DIR/html/tokenizer.c $LEXBOR_DIR/html/tree.c \ + $LEXBOR_DIR/html/interfaces/anchor_element.c $LEXBOR_DIR/html/interfaces/area_element.c $LEXBOR_DIR/html/interfaces/audio_element.c $LEXBOR_DIR/html/interfaces/base_element.c $LEXBOR_DIR/html/interfaces/body_element.c $LEXBOR_DIR/html/interfaces/br_element.c $LEXBOR_DIR/html/interfaces/button_element.c $LEXBOR_DIR/html/interfaces/canvas_element.c $LEXBOR_DIR/html/interfaces/data_element.c $LEXBOR_DIR/html/interfaces/data_list_element.c $LEXBOR_DIR/html/interfaces/details_element.c $LEXBOR_DIR/html/interfaces/dialog_element.c $LEXBOR_DIR/html/interfaces/directory_element.c $LEXBOR_DIR/html/interfaces/div_element.c $LEXBOR_DIR/html/interfaces/d_list_element.c $LEXBOR_DIR/html/interfaces/document.c $LEXBOR_DIR/html/interfaces/element.c $LEXBOR_DIR/html/interfaces/embed_element.c $LEXBOR_DIR/html/interfaces/field_set_element.c $LEXBOR_DIR/html/interfaces/font_element.c $LEXBOR_DIR/html/interfaces/form_element.c $LEXBOR_DIR/html/interfaces/frame_element.c $LEXBOR_DIR/html/interfaces/frame_set_element.c $LEXBOR_DIR/html/interfaces/head_element.c $LEXBOR_DIR/html/interfaces/heading_element.c $LEXBOR_DIR/html/interfaces/hr_element.c $LEXBOR_DIR/html/interfaces/html_element.c $LEXBOR_DIR/html/interfaces/iframe_element.c $LEXBOR_DIR/html/interfaces/image_element.c $LEXBOR_DIR/html/interfaces/input_element.c $LEXBOR_DIR/html/interfaces/label_element.c $LEXBOR_DIR/html/interfaces/legend_element.c $LEXBOR_DIR/html/interfaces/li_element.c $LEXBOR_DIR/html/interfaces/link_element.c $LEXBOR_DIR/html/interfaces/map_element.c $LEXBOR_DIR/html/interfaces/marquee_element.c $LEXBOR_DIR/html/interfaces/media_element.c 
$LEXBOR_DIR/html/interfaces/menu_element.c $LEXBOR_DIR/html/interfaces/meta_element.c $LEXBOR_DIR/html/interfaces/meter_element.c $LEXBOR_DIR/html/interfaces/mod_element.c $LEXBOR_DIR/html/interfaces/object_element.c $LEXBOR_DIR/html/interfaces/o_list_element.c $LEXBOR_DIR/html/interfaces/opt_group_element.c $LEXBOR_DIR/html/interfaces/option_element.c $LEXBOR_DIR/html/interfaces/output_element.c $LEXBOR_DIR/html/interfaces/paragraph_element.c $LEXBOR_DIR/html/interfaces/param_element.c $LEXBOR_DIR/html/interfaces/picture_element.c $LEXBOR_DIR/html/interfaces/pre_element.c $LEXBOR_DIR/html/interfaces/progress_element.c $LEXBOR_DIR/html/interfaces/quote_element.c $LEXBOR_DIR/html/interfaces/script_element.c $LEXBOR_DIR/html/interfaces/select_element.c $LEXBOR_DIR/html/interfaces/slot_element.c $LEXBOR_DIR/html/interfaces/source_element.c $LEXBOR_DIR/html/interfaces/span_element.c $LEXBOR_DIR/html/interfaces/style_element.c $LEXBOR_DIR/html/interfaces/table_caption_element.c $LEXBOR_DIR/html/interfaces/table_cell_element.c $LEXBOR_DIR/html/interfaces/table_col_element.c $LEXBOR_DIR/html/interfaces/table_element.c $LEXBOR_DIR/html/interfaces/table_row_element.c $LEXBOR_DIR/html/interfaces/table_section_element.c $LEXBOR_DIR/html/interfaces/template_element.c $LEXBOR_DIR/html/interfaces/text_area_element.c $LEXBOR_DIR/html/interfaces/time_element.c $LEXBOR_DIR/html/interfaces/title_element.c $LEXBOR_DIR/html/interfaces/track_element.c $LEXBOR_DIR/html/interfaces/u_list_element.c $LEXBOR_DIR/html/interfaces/unknown_element.c $LEXBOR_DIR/html/interfaces/video_element.c $LEXBOR_DIR/html/interfaces/window.c \ + $LEXBOR_DIR/selectors/selectors.c \ + $LEXBOR_DIR/ns/ns.c \ + $LEXBOR_DIR/tag/tag.c" PHP_NEW_EXTENSION(dom, [php_dom.c attr.c document.c \ + html5_document.c html5_serializer.c html5_parser.c namespace_compat.c \ domexception.c parentnode.c \ processinginstruction.c cdatasection.c \ documentfragment.c domimplementation.c \ @@ -21,8 +35,9 @@ if test "$PHP_DOM" != 
"no"; then nodelist.c text.c comment.c \ entityreference.c \ notation.c xpath.c dom_iterators.c \ - namednodemap.c], - $ext_shared) + namednodemap.c \ + $LEXBOR_SOURCES], + $ext_shared,,$PHP_LEXBOR_CFLAGS) PHP_SUBST(DOM_SHARED_LIBADD) PHP_INSTALL_HEADERS([ext/dom/xml_common.h]) PHP_ADD_EXTENSION_DEP(dom, libxml) diff --git a/ext/dom/config.w32 b/ext/dom/config.w32 index 7795445019e1a..b663b64c69a5f 100644 --- a/ext/dom/config.w32 +++ b/ext/dom/config.w32 @@ -8,13 +8,29 @@ if (PHP_DOM == "yes") { CHECK_HEADER_ADD_INCLUDE("libxml/parser.h", "CFLAGS_DOM", PHP_PHP_BUILD + "\\include\\libxml2") ) { EXTENSION("dom", "php_dom.c attr.c document.c \ + html5_document.c html5_serializer.c html5_parser.c namespace_compat.c \ domexception.c parentnode.c processinginstruction.c \ cdatasection.c documentfragment.c domimplementation.c element.c \ node.c characterdata.c documenttype.c \ entity.c nodelist.c text.c comment.c \ entityreference.c \ notation.c xpath.c dom_iterators.c \ - namednodemap.c"); + namednodemap.c", null, "-Iext/dom/lexbor"); + + ADD_SOURCES("ext/dom/lexbor/lexbor/ports/windows_nt/lexbor/core", "memory.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c print.c serialize.c shs.c str.c strtod.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/dom", "interface.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/dom/interfaces", "attr.c cdata_section.c character_data.c comment.c document.c document_fragment.c document_type.c element.c node.c processing_instruction.c shadow_root.c text.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/html/tokenizer", "error.c state_comment.c state_doctype.c state_rawtext.c state_rcdata.c state_script.c state.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/html/tree", "active_formatting.c open_elements.c error.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/html/tree/insertion_mode", "after_after_body.c after_after_frameset.c after_body.c 
after_frameset.c after_head.c before_head.c before_html.c foreign_content.c in_body.c in_caption.c in_cell.c in_column_group.c in_frameset.c in_head.c in_head_noscript.c initial.c in_row.c in_select.c in_select_in_table.c in_table_body.c in_table.c in_table_text.c in_template.c text.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/html", "encoding.c interface.c parser.c token.c token_attr.c tokenizer.c tree.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/encoding", "big5.c decode.c encode.c encoding.c euc_kr.c gb18030.c iso_2022_jp_katakana.c jis0208.c jis0212.c range.c res.c single.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/html/interfaces", "anchor_element.c area_element.c audio_element.c base_element.c body_element.c br_element.c button_element.c canvas_element.c data_element.c data_list_element.c details_element.c dialog_element.c directory_element.c div_element.c d_list_element.c document.c element.c embed_element.c field_set_element.c font_element.c form_element.c frame_element.c frame_set_element.c head_element.c heading_element.c hr_element.c html_element.c iframe_element.c image_element.c input_element.c label_element.c legend_element.c li_element.c link_element.c map_element.c marquee_element.c media_element.c menu_element.c meta_element.c meter_element.c mod_element.c object_element.c o_list_element.c opt_group_element.c option_element.c output_element.c paragraph_element.c param_element.c picture_element.c pre_element.c progress_element.c quote_element.c script_element.c select_element.c slot_element.c source_element.c span_element.c style_element.c table_caption_element.c table_cell_element.c table_col_element.c table_element.c table_row_element.c table_section_element.c template_element.c text_area_element.c time_element.c title_element.c track_element.c u_list_element.c unknown_element.c video_element.c window.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/selectors", "selectors.c", "dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/ns", "ns.c", 
"dom"); + ADD_SOURCES("ext/dom/lexbor/lexbor/tag", "tag.c", "dom"); + ADD_FLAG("CFLAGS_DOM", "/D LEXBOR_STATIC "); AC_DEFINE("HAVE_DOM", 1, "DOM support"); diff --git a/ext/dom/document.c b/ext/dom/document.c index 2aebbccca719d..cbaa29fb775c4 100644 --- a/ext/dom/document.c +++ b/ext/dom/document.c @@ -35,9 +35,6 @@ struct _idsIterator { xmlNode *element; }; -#define DOM_LOAD_STRING 0 -#define DOM_LOAD_FILE 1 - /* * class DOMDocument extends DOMNode * @@ -794,7 +791,7 @@ PHP_METHOD(DOMDocument, importNode) if (nodep->doc == docp) { retnodep = nodep; } else { - retnodep = dom_clone_node(nodep, docp, recursive); + retnodep = dom_clone_node(nodep, docp, intern, recursive); if (!retnodep) { RETURN_FALSE; } @@ -1101,8 +1098,7 @@ PHP_METHOD(DOMDocument, normalizeDocument) } /* }}} end dom_document_normalize_document */ -/* {{{ */ -PHP_METHOD(DOMDocument, __construct) +void php_dom_document_constructor(INTERNAL_FUNCTION_PARAMETERS) { xmlDoc *docp = NULL, *olddoc; dom_object *intern; @@ -1141,6 +1137,12 @@ PHP_METHOD(DOMDocument, __construct) } php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)docp, (void *)intern); } + +/* {{{ */ +PHP_METHOD(DOMDocument, __construct) +{ + php_dom_document_constructor(INTERNAL_FUNCTION_PARAM_PASSTHRU); +} /* }}} end DOMDocument::__construct */ char *_dom_get_valid_file_path(char *source, char *resolved_path, int resolved_path_len ) /* {{{ */ @@ -1313,7 +1315,7 @@ static xmlDocPtr dom_document_parser(zval *id, int mode, char *source, size_t so } /* }}} */ -static void dom_finish_loading_document(zval *this, zval *return_value, xmlDocPtr newdoc) +void php_dom_finish_loading_document(zval *this, zval *return_value, xmlDocPtr newdoc) { if (!newdoc) RETURN_FALSE; @@ -1321,6 +1323,7 @@ static void dom_finish_loading_document(zval *this, zval *return_value, xmlDocPt dom_object *intern = Z_DOMOBJ_P(this); size_t old_modification_nr = 0; if (intern != NULL) { + bool is_html5_class = intern->document->is_html5_class; 
xmlDocPtr docp = (xmlDocPtr) dom_object_get_node(intern); dom_doc_propsptr doc_prop = NULL; if (docp != NULL) { @@ -1340,6 +1343,7 @@ static void dom_finish_loading_document(zval *this, zval *return_value, xmlDocPt RETURN_FALSE; } intern->document->doc_props = doc_prop; + intern->document->is_html5_class = is_html5_class; } php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern); @@ -1352,8 +1356,7 @@ static void dom_finish_loading_document(zval *this, zval *return_value, xmlDocPt RETURN_TRUE; } -/* {{{ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) */ -static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) { +void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode, xmlDocPtr *doc_out) { char *source; size_t source_len; zend_long options = 0; @@ -1376,17 +1379,18 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) { } xmlDocPtr newdoc = dom_document_parser(ZEND_THIS, mode, source, source_len, options); + *doc_out = newdoc; - dom_finish_loading_document(ZEND_THIS, return_value, newdoc); + php_dom_finish_loading_document(ZEND_THIS, return_value, newdoc); } -/* }}} end dom_parser_document */ /* {{{ URL: http://www.w3.org/TR/DOM-Level-3-LS/load-save.html#LS-DocumentLS-load Since: DOM Level 3 */ PHP_METHOD(DOMDocument, load) { - dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_FILE); + xmlDocPtr unused; + dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_FILE, &unused); } /* }}} end dom_document_load */ @@ -1395,7 +1399,8 @@ Since: DOM Level 3 */ PHP_METHOD(DOMDocument, loadXML) { - dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_STRING); + xmlDocPtr unused; + dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_STRING, &unused); } /* }}} end dom_document_loadxml */ @@ -1917,7 +1922,7 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */ xmlDocPtr newdoc = ctxt->myDoc; 
htmlFreeParserCtxt(ctxt); - dom_finish_loading_document(ZEND_THIS, return_value, newdoc); + php_dom_finish_loading_document(ZEND_THIS, return_value, newdoc); } /* }}} */ diff --git a/ext/dom/dom_ce.h b/ext/dom/dom_ce.h index b0faf3934df52..399e21d2900ce 100644 --- a/ext/dom/dom_ce.h +++ b/ext/dom/dom_ce.h @@ -23,6 +23,7 @@ extern PHP_DOM_EXPORT zend_class_entry *dom_domexception_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_domimplementation_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_documentfragment_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_document_class_entry; +extern PHP_DOM_EXPORT zend_class_entry *dom_html5_document_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_nodelist_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_namednodemap_class_entry; extern PHP_DOM_EXPORT zend_class_entry *dom_characterdata_class_entry; diff --git a/ext/dom/dom_properties.h b/ext/dom/dom_properties.h index 5116c310570e2..a5a144734e45a 100644 --- a/ext/dom/dom_properties.h +++ b/ext/dom/dom_properties.h @@ -61,6 +61,9 @@ zend_result dom_document_recover_write(dom_object *obj, zval *newval); zend_result dom_document_substitue_entities_read(dom_object *obj, zval *retval); zend_result dom_document_substitue_entities_write(dom_object *obj, zval *newval); +/* html5 document properties */ +zend_result dom_html5_document_encoding_write(dom_object *obj, zval *retval); + /* documenttype properties */ zend_result dom_documenttype_name_read(dom_object *obj, zval *retval); zend_result dom_documenttype_entities_read(dom_object *obj, zval *retval); diff --git a/ext/dom/html5_document.c b/ext/dom/html5_document.c new file mode 100644 index 0000000000000..9fb4d4411d39f --- /dev/null +++ b/ext/dom/html5_document.c @@ -0,0 +1,1024 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + +----------------------------------------------------------------------+ + | This source file is 
subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Niels Dossche | + +----------------------------------------------------------------------+ +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "php.h" +#if defined(HAVE_LIBXML) && defined(HAVE_DOM) +#include "php_dom.h" +#include "html5_parser.h" +#include "html5_serializer.h" +#include "namespace_compat.h" +/* NOTE(review): the three #include directives below carry no header name in this copy of the patch; restore the names from the original commit. */ +#include +#include +#include + +/* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */ +#define DOM_FALLBACK_ENCODING_NAME "UTF-8" +#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8 + +/* Position bookkeeping for error reporting: the line and column that correspond to last_offset, so that a later lookup can continue scanning from there. */ +typedef struct { + size_t last_line; + size_t last_column; + size_t last_offset; +} dom_line_column_cache; + +/* State shared with the error reporters through the parse context's application_data pointer. The loader sets one of current_input_codepoints / current_input_characters and leaves the other NULL. */ +typedef struct { + const char *input_name; + const lxb_codepoint_t *current_input_codepoints; + const char *current_input_characters; + size_t current_input_length; + size_t current_total_offset; + dom_line_column_cache cache_tokenizer; +} dom_lexbor_libxml2_bridge_application_data; + +/* Result of encoding detection: the detected encoding data and the number of leading BOM bytes the caller has to skip. */ +typedef struct { + const lxb_encoding_data_t *encoding_data; + size_t bom_shift; +} dom_character_encoding_data; + +typedef zend_result (*dom_write_output)(void*, const char *, size_t); + +typedef struct { + const lxb_encoding_data_t *encoding_data; + const lxb_encoding_data_t *decoding_data; + lxb_encoding_encode_t *encode; + lxb_encoding_decode_t *decode; + lxb_codepoint_t *codepoints; + lxb_char_t *encoding_output; + void *output_data; + dom_write_output write_output; +} dom_output_ctx; + +typedef struct { + /* We can skip 
some conversion if the input and output encoding are both UTF-8, we only have to validate and substitute replacement characters */ + bool fast_path; /* Put first, close to the encode & decode structures, for cache locality */ + lxb_encoding_encode_t encode; + lxb_encoding_decode_t decode; + const lxb_encoding_data_t *encode_data; + const lxb_encoding_data_t *decode_data; + lxb_char_t encoding_output[4096]; + lxb_codepoint_t codepoints[4096]; +} dom_decoding_encoding_ctx; + +/* Prepares the encoder half of the context: UTF-8 output into ctx->encoding_output, with replacement bytes configured. decode_data stays NULL here; the decoder half is set up by dom_setup_parser_encoding() once the input encoding is known. */ +static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx) +{ + ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8); + ctx->decode_data = NULL; + /* Set fast path on by default so that the decoder finishing is skipped if this was never initialised properly. */ + ctx->fast_path = true; + (void) lxb_encoding_encode_init(&ctx->encode, ctx->encode_data, ctx->encoding_output, sizeof(ctx->encoding_output) / sizeof(lxb_char_t)); + (void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); +} + +/* Maps a Lexbor tokenizer error id to a hyphenated, human-readable name used in diagnostics; ids without a mapping yield "unknown error". */ +static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id) +{ + switch (id) { + case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO: return "abrupt-closing-of-empty-comment"; + case LXB_HTML_TOKENIZER_ERROR_ABDOPUID: return "abrupt-doctype-public-identifier"; + case LXB_HTML_TOKENIZER_ERROR_ABDOSYID: return "abrupt-doctype-system-identifier"; + case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE: return "absence-of-digits-in-numeric-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_CDINHTCO: return "cdata-in-html-content"; + case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA: return "character-reference-outside-unicode-range"; + case LXB_HTML_TOKENIZER_ERROR_COCHININST: return "control-character-in-input-stream"; + case LXB_HTML_TOKENIZER_ERROR_COCHRE: return "control-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT: return "end-tag-with-attributes"; + case LXB_HTML_TOKENIZER_ERROR_DUAT: return 
"duplicate-attribute"; + case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO: return "end-tag-with-trailing-solidus"; + case LXB_HTML_TOKENIZER_ERROR_EOBETANA: return "eof-before-tag-name"; + case LXB_HTML_TOKENIZER_ERROR_EOINCD: return "eof-in-cdata"; + case LXB_HTML_TOKENIZER_ERROR_EOINCO: return "eof-in-comment"; + case LXB_HTML_TOKENIZER_ERROR_EOINDO: return "eof-in-doctype"; + case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE: return "eof-in-script-html-comment-like-text"; + case LXB_HTML_TOKENIZER_ERROR_EOINTA: return "eof-in-tag"; + case LXB_HTML_TOKENIZER_ERROR_INCLCO: return "incorrectly-closed-comment"; + case LXB_HTML_TOKENIZER_ERROR_INOPCO: return "incorrectly-opened-comment"; + case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA: return "invalid-character-sequence-after-doctype-name"; + case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA: return "invalid-first-character-of-tag-name"; + case LXB_HTML_TOKENIZER_ERROR_MIATVA: return "missing-attribute-value"; + case LXB_HTML_TOKENIZER_ERROR_MIDONA: return "missing-doctype-name"; + case LXB_HTML_TOKENIZER_ERROR_MIDOPUID: return "missing-doctype-public-identifier"; + case LXB_HTML_TOKENIZER_ERROR_MIDOSYID: return "missing-doctype-system-identifier"; + case LXB_HTML_TOKENIZER_ERROR_MIENTANA: return "missing-end-tag-name"; + case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID: return "missing-quote-before-doctype-public-identifier"; + case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID: return "missing-quote-before-doctype-system-identifier"; + case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE: return "missing-semicolon-after-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE: return "missing-whitespace-after-doctype-public-keyword"; + case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE: return "missing-whitespace-after-doctype-system-keyword"; + case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA: return "missing-whitespace-before-doctype-name"; + case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT: return "missing-whitespace-between-attributes"; + case 
LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID: return "missing-whitespace-between-doctype-public-and-system-identifiers"; + case LXB_HTML_TOKENIZER_ERROR_NECO: return "nested-comment"; + case LXB_HTML_TOKENIZER_ERROR_NOCHRE: return "noncharacter-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_NOININST: return "noncharacter-in-input-stream"; + case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus"; + case LXB_HTML_TOKENIZER_ERROR_NUCHRE: return "null-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_SUCHRE: return "surrogate-character-reference"; + case LXB_HTML_TOKENIZER_ERROR_SUININST: return "surrogate-in-input-stream"; + case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID: return "unexpected-character-after-doctype-system-identifier"; + case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA: return "unexpected-character-in-attribute-name"; + case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA: return "unexpected-character-in-unquoted-attribute-value"; + case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA: return "unexpected-equals-sign-before-attribute-name"; + case LXB_HTML_TOKENIZER_ERROR_UNNUCH: return "unexpected-null-character"; + case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA: return "unexpected-question-mark-instead-of-tag-name"; + case LXB_HTML_TOKENIZER_ERROR_UNSOINTA: return "unexpected-solidus-in-tag"; + case LXB_HTML_TOKENIZER_ERROR_UNNACHRE: return "unknown-named-character-reference"; + default: return "unknown error"; + } +} + +static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id) +{ + switch (id) { + case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token"; + case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token"; + case LXB_HTML_RULES_ERROR_NUCH: return "null-character"; + case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token"; + case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode"; + case LXB_HTML_RULES_ERROR_BADOTOININMO: return 
"bad-doctype-token-in-initial-mode"; + case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode"; + case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode"; + case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode"; + case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed_token-in-before-head-mode"; + case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode"; + case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus"; + case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode"; + case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode"; + case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode"; + case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode"; + case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode"; + case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode"; + case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode"; + case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode"; + case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong"; + case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong"; + case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack"; + case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack"; + case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope"; + case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope"; + case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope"; + case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack"; + case LXB_HTML_RULES_ERROR_UNENOFFI: return 
"unexpected-end-of-file"; + case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text"; + case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode"; + case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode"; + case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode"; + case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode"; + case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode"; + case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode"; + default: return "unknown error"; + } +} + +static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status) +{ + switch (status) { + case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT: return "cannot initialize data structures"; + case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE: return "fatal error in parsing"; + case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW: return "string length overflow"; + case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM: return "out of memory"; + default: return "unknown error"; + } +} + +static void dom_reset_line_column_cache(dom_line_column_cache *cache) +{ + cache->last_line = 1; + cache->last_column = 1; + cache->last_offset = 0; +} + +static void dom_find_line_and_column_using_cache(const dom_lexbor_libxml2_bridge_application_data *application_data, dom_line_column_cache *cache, size_t offset) +{ + offset -= application_data->current_total_offset; + if (offset > application_data->current_input_length) { + /* Possible with empty input, also just good for general safety */ + offset = application_data->current_input_length; + } + + /* Either unicode or UTF-8 data */ + if (application_data->current_input_codepoints != NULL) { + while (cache->last_offset < offset) { + if (application_data->current_input_codepoints[cache->last_offset] == 0x000A) { + cache->last_line++; + cache->last_column = 1; + } else { + cache->last_column++; + } + 
cache->last_offset++; + } + } else { + while (cache->last_offset < offset) { + const lxb_char_t current = application_data->current_input_characters[cache->last_offset]; + if (current == '\n') { + cache->last_line++; + cache->last_column = 1; + cache->last_offset++; + } else { + /* See Lexbor tokenizer patch + * Note for future self: branchlessly computing the length and jumping by the length would be nice, + * however it takes so many instructions to do so that it is slower than this naive method. */ + if ((current & 0b11000000) != 0b10000000) { + cache->last_column++; + } + cache->last_offset++; + } + } + } +} + +static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(void *application_data_voidptr, lxb_html_tokenizer_error_t *error, size_t offset) +{ + dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr; + dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, offset); + php_libxml_pretend_ctx_error_ex(application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column, "tokenizer error %s in %s, line: %zu, column: %zu\n", dom_lexbor_tokenizer_error_code_to_string(error->id), application_data->input_name, application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column); +} + +static void dom_lexbor_libxml2_bridge_tree_error_reporter(void *application_data_voidptr, lxb_html_tree_error_t *error, size_t line, size_t column, size_t len) +{ + dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr; + if (UNEXPECTED(len <= 1)) { + /* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */ + php_libxml_pretend_ctx_error_ex(line, column, "tree error %s in %s, line: %zu, column: %zu\n", dom_lexbor_tree_error_code_to_string(error->id), application_data->input_name, line, column); + } else { + php_libxml_pretend_ctx_error_ex(line, column, "tree error %s in %s, line: 
%zu, column: %zu-%zu\n", dom_lexbor_tree_error_code_to_string(error->id), application_data->input_name, line, column, column + len - 1); + } +} + +static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for) +{ + xmlNodePtr node = parent->children; + while (node != NULL) { + if (node->type == XML_ELEMENT_NODE && strcmp((const char *) node->name, searching_for) == 0) { + return node; + } + node = node->next; + } + return NULL; +} + +static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for) +{ + xmlNodePtr node = dom_search_child(parent, searching_for); + if (node != NULL) { + xmlUnlinkNode(node); + + xmlNodePtr child = node->children; + while (child != NULL) { + xmlUnlinkNode(child); + xmlAddChild(parent, child); + child = node->children; + } + + xmlFreeNode(node); + } +} + +static void dom_post_process_html5_loading(xmlDocPtr lxml_doc, zend_long options, const lexbor_libxml2_bridge_extracted_observations *observations) +{ + if (options & HTML_PARSE_NOIMPLIED) { + xmlNodePtr html_node = dom_search_child((xmlNodePtr) lxml_doc, "html"); + if (!observations->has_explicit_head_tag) { + dom_place_remove_element_and_hoist_children(html_node, "head"); + } + if (!observations->has_explicit_body_tag) { + dom_place_remove_element_and_hoist_children(html_node, "body"); + } + if (!observations->has_explicit_html_tag) { + /* The HTML node has a single namespace declaration, that we must preserve after removing the node. + * However, it's possible the namespace is NULL if DOM\HTML_NO_DEFAULT_NS was set. 
*/ + if (!(options & DOM_HTML_NO_DEFAULT_NS)) { + php_libxml_set_old_ns(lxml_doc, html_node->nsDef); + html_node->nsDef = NULL; + } + dom_place_remove_element_and_hoist_children((xmlNodePtr) lxml_doc, "html"); + if (!(options & DOM_HTML_NO_DEFAULT_NS) && EXPECTED(lxml_doc->children != NULL)) { + dom_reconcile_ns_list(lxml_doc, lxml_doc->children, lxml_doc->last); + } + } + } +} + +/* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */ +/* Detects the input encoding: a UTF-8 / UTF-16BE / UTF-16LE byte order mark wins (bom_shift is its length), otherwise a meta prescan of at most the first 1024 bytes is used, otherwise DOM_FALLBACK_ENCODING_ID. The returned encoding_data may be NULL when the name found by the prescan is unknown; the caller handles that. */ +static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len) +{ + dom_character_encoding_data result; + + /* BOM sniffing */ + if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') { + result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8); + result.bom_shift = 3; + return result; + } else if (source_len >= 2) { + if (source[0] == '\xFE' && source[1] == '\xFF') { + result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE); + result.bom_shift = 2; + return result; + } else if (source[0] == '\xFF' && source[1] == '\xFE') { + result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE); + result.bom_shift = 2; + return result; + } + } + + /* Perform prescan */ + lxb_html_encoding_t encoding; + lxb_status_t status = lxb_html_encoding_init(&encoding); + if (status != LXB_STATUS_OK) { + /* Initialisation failed, so the structure must not be handed to lxb_html_encoding_destroy(). */ + goto fallback_uninit; + } + /* This is the "wait either for 1024 bytes or 500ms" part */ + if (source_len > 1024) { + source_len = 1024; + } + status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len); + if (status != LXB_STATUS_OK) { + goto fallback; + } + lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0); + if (entry == NULL) { + goto fallback; + } + result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name); + result.bom_shift = 0; + lxb_html_encoding_destroy(&encoding, false); + return result; + +fallback: + lxb_html_encoding_destroy(&encoding, false); +fallback_uninit: + 
result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID); + result.bom_shift = 0; + return result; +} + +/* Detects the input encoding, advances *buf_ref / shrinks *read past a byte order mark, and initialises the decoder half of the context (falling back to DOM_FALLBACK_ENCODING_ID when detection returned no encoding data). fast_path is enabled when the input encoding equals the UTF-8 output encoding. */ +static void dom_setup_parser_encoding(const lxb_char_t **buf_ref, size_t *read, dom_decoding_encoding_ctx *decoding_encoding_ctx) +{ + static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT; + dom_character_encoding_data dom_encoding_data = dom_determine_encoding((const char *) *buf_ref, *read); + *buf_ref += dom_encoding_data.bom_shift; + *read -= dom_encoding_data.bom_shift; + + decoding_encoding_ctx->decode_data = dom_encoding_data.encoding_data; + if (decoding_encoding_ctx->decode_data == NULL) { + decoding_encoding_ctx->decode_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID); + ZEND_ASSERT(decoding_encoding_ctx->decode_data != NULL); + } + (void) lxb_encoding_decode_init(&decoding_encoding_ctx->decode, decoding_encoding_ctx->decode_data, decoding_encoding_ctx->codepoints, sizeof(decoding_encoding_ctx->codepoints) / sizeof(lxb_codepoint_t)); + (void) lxb_encoding_decode_replace_set(&decoding_encoding_ctx->decode, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN); + decoding_encoding_ctx->fast_path = decoding_encoding_ctx->decode_data == decoding_encoding_ctx->encode_data; /* Note: encode_data is for UTF-8 */ +} + +/* Feeds encoded_length bytes of encoding_output to the parser, reports the errors collected for that chunk, and advances the line/column and total-offset bookkeeping by input_buffer_length. Returns false when the parser rejects the chunk. */ +static bool dom_process_parse_chunk(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, size_t encoded_length, const lxb_char_t *encoding_output, size_t input_buffer_length, size_t *tokenizer_error_offset, size_t *tree_error_offset) +{ + dom_lexbor_libxml2_bridge_application_data *application_data = ctx->application_data; + application_data->current_input_length = input_buffer_length; + lexbor_status_t lexbor_status = lxb_html_document_parse_chunk(document, encoding_output, encoded_length); + if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) { + return false; + } + 
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset); + dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length); + application_data->current_total_offset += input_buffer_length; + /* The cached offset is relative to the current chunk (see the subtraction of current_total_offset in the lookup), so it restarts at 0 for the next chunk. */ + application_data->cache_tokenizer.last_offset = 0; + return true; +} + +/* Fast path, used when the input is already in the output encoding (UTF-8): walks the input one codepoint at a time with decode_single and passes the untouched input bytes to the parser. Each invalid sequence is left out and LXB_ENCODING_REPLACEMENT_BYTES is fed in its place. *buf_ref_ref is updated to the position reached; returns false when a chunk could not be parsed. */ +static bool dom_decode_encode_fast_path(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, const lxb_char_t **buf_ref_ref, const lxb_char_t *buf_end, dom_decoding_encoding_ctx *decoding_encoding_ctx, size_t *tokenizer_error_offset, size_t *tree_error_offset) +{ + const lxb_char_t *buf_ref = *buf_ref_ref; + const lxb_char_t *last_output = buf_ref; + while (buf_ref != buf_end) { + const lxb_char_t *buf_ref_backup = buf_ref; + lxb_codepoint_t codepoint = decoding_encoding_ctx->decode_data->decode_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); + if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) { + size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */ + if (!dom_process_parse_chunk(ctx, document, parser, buf_ref - last_output - skip, last_output, buf_ref - last_output, tokenizer_error_offset, tree_error_offset)) { + goto fail_oom; + } + if (!dom_process_parse_chunk(ctx, document, parser, LXB_ENCODING_REPLACEMENT_SIZE, LXB_ENCODING_REPLACEMENT_BYTES, 0, tokenizer_error_offset, tree_error_offset)) { + goto fail_oom; + } + last_output = buf_ref; + } + } + /* Flush the valid bytes that follow the last invalid sequence (or the whole input when all of it was valid). */ + if (buf_ref != last_output && !dom_process_parse_chunk(ctx, document, parser, buf_ref - last_output, last_output, buf_ref - last_output, tokenizer_error_offset, tree_error_offset)) { + goto fail_oom; + } + *buf_ref_ref = buf_ref; + return true; +fail_oom: + *buf_ref_ref = buf_ref; + return false; +} + +/* Slow path: decodes the input into the codepoint buffer, re-encodes those codepoints through encode_data into encoding_output, and feeds each filled output buffer to the parser. Both loops repeat while the respective step reports LXB_STATUS_SMALL_BUFFER, and each buffer is reset after it has been consumed. *buf_ref_ref is updated to the position reached; returns false when a chunk could not be parsed. */ +static bool 
dom_decode_encode_slow_path(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, const lxb_char_t **buf_ref_ref, const lxb_char_t *buf_end, dom_decoding_encoding_ctx *decoding_encoding_ctx, size_t *tokenizer_error_offset, size_t *tree_error_offset) +{ + const lxb_char_t *buf_ref = *buf_ref_ref; + lexbor_status_t decode_status, encode_status; + do { + decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &buf_ref, buf_end); + + const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints; + size_t decoding_buffer_used = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode); + const lxb_codepoint_t *codepoints_end = decoding_encoding_ctx->codepoints + decoding_buffer_used; + do { + encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end); + ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid"); + if (!dom_process_parse_chunk(ctx, document, parser, lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode), decoding_encoding_ctx->encoding_output, decoding_buffer_used, tokenizer_error_offset, tree_error_offset)) { + goto fail_oom; + } + lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0); + } while (encode_status == LXB_STATUS_SMALL_BUFFER); + lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0); + } while (decode_status == LXB_STATUS_SMALL_BUFFER); + *buf_ref_ref = buf_ref; + return true; +fail_oom: + *buf_ref_ref = buf_ref; + return false; +} + +/* Processes one input chunk [*buf_ref_ref, buf_end) through the fast or the slow path, depending on decoding_encoding_ctx->fast_path. */ +static bool dom_parse_decode_encode_step(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, const lxb_char_t **buf_ref_ref, const lxb_char_t *buf_end, dom_decoding_encoding_ctx *decoding_encoding_ctx, size_t *tokenizer_error_offset, size_t *tree_error_offset) +{ + if (decoding_encoding_ctx->fast_path) { + 
return dom_decode_encode_fast_path(ctx, document, parser, buf_ref_ref, buf_end, decoding_encoding_ctx, tokenizer_error_offset, tree_error_offset); + } else { + return dom_decode_encode_slow_path(ctx, document, parser, buf_ref_ref, buf_end, decoding_encoding_ctx, tokenizer_error_offset, tree_error_offset); + } +} + +static bool dom_parse_decode_encode_finish(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, dom_decoding_encoding_ctx *decoding_encoding_ctx, size_t *tokenizer_error_offset, size_t *tree_error_offset) +{ + if (!decoding_encoding_ctx->fast_path) { + /* Fast path handles codepoints one by one, so this part is not applicable in that case */ + (void) lxb_encoding_decode_finish(&decoding_encoding_ctx->decode); + size_t decoding_buffer_size = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode); + if (decoding_buffer_size > 0) { + const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints; + const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size; + (void) decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end); + if (!dom_process_parse_chunk(ctx, document, parser, lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode), decoding_encoding_ctx->encoding_output, decoding_buffer_size, tokenizer_error_offset, tree_error_offset)) { + return false; + } + } + } + (void) lxb_encoding_encode_finish(&decoding_encoding_ctx->encode); + if (lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode) && !dom_process_parse_chunk(ctx, document, parser, lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode), decoding_encoding_ctx->encoding_output, lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode), tokenizer_error_offset, tree_error_offset)) { + return false; + } + return true; +} + +static bool check_options_validity(zend_long options) +{ + const zend_long VALID_OPTIONS = 
XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS; + if ((options & ~VALID_OPTIONS) != 0) { + zend_argument_value_error(2, "contains invalid flags (allowed flags: LIBXML_NOERROR, LIBXML_COMPACT, LIBXML_HTML_NOIMPLIED, DOM\\NO_DEFAULT_NS)"); + return false; + } + return true; +} + +PHP_METHOD(DOM_HTML5Document, loadHTML) +{ + const char *source; + size_t source_len; + zend_long options = 0; + if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &source, &source_len, &options) == FAILURE) { + RETURN_THROWS(); + } + + if (!check_options_validity(options)) { + RETURN_THROWS(); + } + + dom_lexbor_libxml2_bridge_application_data application_data; + application_data.input_name = "Entity"; + application_data.current_total_offset = 0; + dom_reset_line_column_cache(&application_data.cache_tokenizer); + lexbor_libxml2_bridge_parse_context ctx; + lexbor_libxml2_bridge_parse_context_init(&ctx); + if (!(options & XML_PARSE_NOERROR)) { + lexbor_libxml2_bridge_parse_set_error_callbacks(&ctx, dom_lexbor_libxml2_bridge_tokenizer_error_reporter, dom_lexbor_libxml2_bridge_tree_error_reporter); + } + ctx.application_data = &application_data; + + lxb_html_document_t *document = lxb_html_document_create(); + if (UNEXPECTED(document == NULL)) { + goto fail_oom; + } + + lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document); + if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) { + goto fail_oom; + } + + /* Setup everything encoding & decoding related */ + dom_decoding_encoding_ctx decoding_encoding_ctx; + dom_decoding_encoding_ctx_init(&decoding_encoding_ctx); + + lxb_html_parser_t *parser = document->dom_document.parser; + size_t tokenizer_error_offset = 0; + size_t tree_error_offset = 0; + + const lxb_char_t *buf_ref = (const lxb_char_t *) source; + dom_setup_parser_encoding(&buf_ref, &source_len, &decoding_encoding_ctx); + + if (decoding_encoding_ctx.fast_path) { + application_data.current_input_codepoints = NULL; + 
application_data.current_input_characters = source; + } else { + application_data.current_input_codepoints = decoding_encoding_ctx.codepoints; + application_data.current_input_characters = NULL; + } + + while (source_len > 0) { + size_t chunk_size = source_len; + if (chunk_size > sizeof(decoding_encoding_ctx.encoding_output) / sizeof(lxb_char_t)) { + chunk_size = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(lxb_char_t); + } + source_len -= chunk_size; + + const lxb_char_t *buf_end = buf_ref + chunk_size; + bool result = dom_parse_decode_encode_step(&ctx, document, parser, &buf_ref, buf_end, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset); + if (!result) { + goto fail_oom; + } + } + + if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) { + goto fail_oom; + } + + lexbor_status = lxb_html_document_parse_chunk_end(document); + if (lexbor_status != LXB_STATUS_OK) { + goto fail_oom; + } + + xmlDocPtr lxml_doc; + lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(document, &lxml_doc, options & XML_PARSE_COMPACT, !(options & DOM_HTML_NO_DEFAULT_NS)); + lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations); + if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) { + php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), application_data.input_name); + lxb_html_document_destroy(document); + RETURN_FALSE; + } + lxb_html_document_destroy(document); + + dom_post_process_html5_loading(lxml_doc, options, &ctx.observations); + + if (decoding_encoding_ctx.decode_data) { + lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name); + } else { + lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8"); + } + + php_dom_finish_loading_document(ZEND_THIS, return_value, lxml_doc); + return; + +fail_oom: + 
lxb_html_document_destroy(document); + php_dom_throw_error(INVALID_STATE_ERR, 1); + RETURN_THROWS(); +} + +PHP_METHOD(DOM_HTML5Document, loadHTMLFile) +{ + const char *filename; + size_t filename_len; + zend_long options = 0; + php_stream *stream = NULL; + if (zend_parse_parameters(ZEND_NUM_ARGS(), "p|l", &filename, &filename_len, &options) == FAILURE) { + RETURN_THROWS(); + } + + if (!check_options_validity(options)) { + RETURN_THROWS(); + } + + /* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */ + if (strstr(filename, "%00")) { + php_error_docref(NULL, E_WARNING, "URI must not contain percent-encoded NUL bytes"); + RETURN_FALSE; + } + + dom_lexbor_libxml2_bridge_application_data application_data; + application_data.input_name = filename; + application_data.current_total_offset = 0; + dom_reset_line_column_cache(&application_data.cache_tokenizer); + lexbor_libxml2_bridge_parse_context ctx; + lexbor_libxml2_bridge_parse_context_init(&ctx); + if (!(options & XML_PARSE_NOERROR)) { + lexbor_libxml2_bridge_parse_set_error_callbacks(&ctx, dom_lexbor_libxml2_bridge_tokenizer_error_reporter, dom_lexbor_libxml2_bridge_tree_error_reporter); + } + ctx.application_data = &application_data; + + lxb_html_document_t *document = lxb_html_document_create(); + if (UNEXPECTED(document == NULL)) { + goto fail_oom; + } + + lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document); + if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) { + goto fail_oom; + } + + stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, /* opened_path */ NULL, /* context */ NULL); + if (!stream) { + lxb_html_document_destroy(document); + RETURN_FALSE; + } + + /* Setup everything encoding & decoding related */ + bool first_read = true; + dom_decoding_encoding_ctx decoding_encoding_ctx; + dom_decoding_encoding_ctx_init(&decoding_encoding_ctx); + + size_t tokenizer_error_offset = 0; + size_t tree_error_offset = 0; + ssize_t read; + char buf[4096]; 
+ lxb_html_parser_t *parser = document->dom_document.parser; + + while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) { + const lxb_char_t *buf_ref = (const lxb_char_t *) buf; + + /* First read => determine encoding */ + if (first_read) { + first_read = false; + dom_setup_parser_encoding(&buf_ref, (size_t *) &read, &decoding_encoding_ctx); + if (decoding_encoding_ctx.fast_path) { + application_data.current_input_codepoints = NULL; + application_data.current_input_characters = buf; + } else { + application_data.current_input_codepoints = decoding_encoding_ctx.codepoints; + application_data.current_input_characters = NULL; + } + } + + const lxb_char_t *buf_end = buf_ref + read; + bool result = dom_parse_decode_encode_step(&ctx, document, parser, &buf_ref, buf_end, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset); + if (!result) { + goto fail_oom; + } + } + + php_stream_close(stream); + stream = NULL; + + if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) { + goto fail_oom; + } + + lexbor_status = lxb_html_document_parse_chunk_end(document); + if (lexbor_status != LXB_STATUS_OK) { + goto fail_oom; + } + + xmlDocPtr lxml_doc; + lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(document, &lxml_doc, options & XML_PARSE_COMPACT, !(options & DOM_HTML_NO_DEFAULT_NS)); + lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations); + if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) { + php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename); + lxb_html_document_destroy(document); + RETURN_FALSE; + } + lxb_html_document_destroy(document); + + dom_post_process_html5_loading(lxml_doc, options, &ctx.observations); + + if (decoding_encoding_ctx.decode_data) { + lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name); + } 
else { + lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8"); + } + + php_dom_finish_loading_document(ZEND_THIS, return_value, lxml_doc); + return; + +fail_oom: + php_dom_throw_error(INVALID_STATE_ERR, 1); + lxb_html_document_destroy(document); + if (stream) { + php_stream_close(stream); + } + RETURN_THROWS(); +} + +/* Living spec never creates explicit namespace declaration nodes. + * They are only written upon serialization but never appear in the tree. + * So in principle we could just ignore them outright. + * However, step 10 in https://html.spec.whatwg.org/multipage/parsing.html#create-an-element-for-the-token + * requires us to have the declaration as an attribute available */ +static void dom_mark_namespaces_as_attributes_too(xmlDocPtr doc) +{ + if (!doc) { + return; + } + + xmlNodePtr node = doc->children; + while (node != NULL) { + if (node->type == XML_ELEMENT_NODE) { + dom_ns_compat_mark_attribute_list(node->nsDef); + + if (node->children) { + node = node->children; + continue; + } + } + + if (node->next) { + node = node->next; + } else { + /* Go upwards, until we find a parent node with a next sibling, or until we hit the base. 
*/
+			do {
+				node = node->parent;
+				if (node == NULL) {
+					return;
+				}
+			} while (node->next == NULL);
+			node = node->next;
+		}
+	}
+}
+
+/* Walks `copy` and `original` in lockstep (iterative pre-order) and, for every element node of the
+ * copy, calls dom_ns_compat_copy_attribute_list_mark() on the nsDef lists of the two nodes.
+ * Both cursors take exactly the same steps, so the two trees are expected to have the same shape. */
+void dom_mark_namespaces_for_copy_based_on_copy(xmlNodePtr copy, const xmlNode *original)
+{
+	xmlNodePtr copy_current = copy;
+	const xmlNode *original_current = original;
+	while (copy_current != NULL) {
+		ZEND_ASSERT(original_current != NULL);
+
+		if (copy_current->type == XML_ELEMENT_NODE) {
+			dom_ns_compat_copy_attribute_list_mark(copy_current->nsDef, original_current->nsDef);
+
+			if (copy_current->children) {
+				copy_current = copy_current->children;
+				original_current = original_current->children;
+				continue;
+			}
+		}
+
+		if (copy_current->next) {
+			copy_current = copy_current->next;
+			original_current = original_current->next;
+		} else {
+			/* Go upwards, until we find a parent node with a next sibling, or until we hit the base. */
+			do {
+				copy_current = copy_current->parent;
+				if (copy_current == NULL) {
+					return;
+				}
+				original_current = original_current->parent;
+			} while (copy_current->next == NULL);
+			copy_current = copy_current->next;
+			original_current = original_current->next;
+		}
+	}
+}
+
+/* Output callback that appends the serialized bytes to the smart_str passed as `ctx`. Always succeeds. */
+static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
+{
+	smart_str_appendl((smart_str *) ctx, buf, size);
+	return SUCCESS;
+}
+
+/* Output callback that forwards the serialized bytes to the php_stream passed as `application_data`.
+ * Returns FAILURE on a write error and also when fewer than `len` bytes were accepted, so that a
+ * truncated file is not reported as a successful save. */
+static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
+{
+	php_stream *stream = (php_stream *) application_data;
+	ssize_t written = php_stream_write(stream, buf, len);
+	if (UNEXPECTED(written < 0 || (size_t) written != len)) {
+		return FAILURE;
+	}
+	return SUCCESS;
+}
+
+static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
+{
+	dom_output_ctx *output = (dom_output_ctx *) application_data;
+	lxb_status_t decode_status, encode_status;
+	const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
+	const lxb_char_t *buf_end = buf_ref + len;
+
+	do {
+		decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
+
+		const
lxb_codepoint_t *codepoints_ref = output->codepoints;
+		const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
+		do {
+			encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
+			if (UNEXPECTED(output->write_output(output->output_data, (const char *) output->encoding_output, lxb_encoding_encode_buf_used(output->encode)) != SUCCESS)) {
+				return FAILURE;
+			}
+			lxb_encoding_encode_buf_used_set(output->encode, 0);
+		} while (encode_status == LXB_STATUS_SMALL_BUFFER);
+		lxb_encoding_decode_buf_used_set(output->decode, 0);
+	} while (decode_status == LXB_STATUS_SMALL_BUFFER);
+
+	return SUCCESS;
+}
+
+/* Convenience wrapper around dom_saveHTML_write_string_len() for NUL-terminated strings. */
+static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
+{
+	return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
+}
+
+/* Serializes `node` as HTML5 and transcodes the UTF-8 serializer output to the encoding named by
+ * docp->encoding (DOM_FALLBACK_ENCODING_ID when that name is missing or unknown), emitting the
+ * result through output_ctx->write_output.
+ * The caller must have set output_ctx->output_data and output_ctx->write_output; the remaining
+ * fields are filled in here and point at locals of this function, so they must not be used after
+ * it returns.
+ * Returns FAILURE when serialization or the output callback fails, SUCCESS otherwise. */
+static zend_result dom_common_save(dom_output_ctx *output_ctx, const xmlDoc *docp, const xmlNode *node)
+{
+	/* Initialize everything related to encoding & decoding */
+	const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
+	const lxb_encoding_data_t *encoding_data = NULL;
+	if (docp->encoding != NULL) {
+		encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) docp->encoding, strlen((const char *) docp->encoding));
+	}
+	if (encoding_data == NULL) {
+		encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
+		ZEND_ASSERT(encoding_data != NULL);
+	}
+	lxb_encoding_encode_t encode;
+	lxb_encoding_decode_t decode;
+	lxb_char_t encoding_output[4096];
+	lxb_codepoint_t codepoints[4096];
+	(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(lxb_char_t));
+	(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(lxb_codepoint_t));
+	if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
+		lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
+	} else {
+		/* Fallback if there is no replacement by default */
+		lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
+	}
+	lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
+
+	output_ctx->encoding_data = encoding_data;
+	output_ctx->decoding_data = decoding_data;
+	output_ctx->encode = &encode;
+	output_ctx->decode = &decode;
+	output_ctx->codepoints = codepoints;
+	output_ctx->encoding_output = encoding_output;
+
+	dom_html5_serialize_context ctx;
+	ctx.write_string_len = dom_saveHTML_write_string_len;
+	ctx.write_string = dom_saveHTML_write_string;
+	ctx.application_data = output_ctx;
+	if (UNEXPECTED(dom_html5_serialize(&ctx, node) != SUCCESS)) {
+		return FAILURE;
+	}
+
+	(void) lxb_encoding_decode_finish(&decode);
+	if (lxb_encoding_decode_buf_used(&decode)) {
+		const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
+		(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
+		if (UNEXPECTED(output_ctx->write_output(output_ctx->output_data, (const char *) encoding_output, lxb_encoding_encode_buf_used(&encode)) != SUCCESS)) {
+			return FAILURE;
+		}
+		/* These bytes have been written. Without the reset, the check after
+		 * lxb_encoding_encode_finish() below would write the same bytes a second time. */
+		lxb_encoding_encode_buf_used_set(&encode, 0);
+	}
+	(void) lxb_encoding_encode_finish(&encode);
+	if (lxb_encoding_encode_buf_used(&encode)) {
+		if (UNEXPECTED(output_ctx->write_output(output_ctx->output_data, (const char *) encoding_output, lxb_encoding_encode_buf_used(&encode)) != SUCCESS)) {
+			return FAILURE;
+		}
+	}
+
+	return SUCCESS;
+}
+
+/* Serializes the whole document as HTML5 into the file named by the first argument.
+ * Returns the stream position after writing (bytes written), or false when the stream cannot be
+ * opened or writing fails. */
+PHP_METHOD(DOM_HTML5Document, saveHTMLFile)
+{
+	zval *id;
+	xmlDoc *docp;
+	size_t file_len;
+	dom_object *intern;
+	char *file;
+
+	id = ZEND_THIS;
+	if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
+		RETURN_THROWS();
+	}
+
+	if (file_len == 0) {
+		zend_argument_value_error(1, "must not be empty");
+		RETURN_THROWS();
+	}
+
+	/* Resolve the document before touching the file: opening with "wb" truncates the target, and
+	 * DOM_GET_OBJ can leave the method early (NOTE(review): early-returning macro, confirm against
+	 * php_dom.h), which would otherwise clobber the file and leave the stream open. */
+	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
+
+	php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, /* context */ NULL);
+	if (!stream) {
+		RETURN_FALSE;
+	}
+
+	dom_output_ctx output_ctx;
+	output_ctx.output_data = stream;
+	output_ctx.write_output = dom_write_output_stream;
+	if (UNEXPECTED(dom_common_save(&output_ctx, docp, (const xmlNode *) docp) != SUCCESS)) {
+		php_stream_close(stream);
+		RETURN_FALSE;
+	}
+
+	zend_long bytes = php_stream_tell(stream);
+	php_stream_close(stream);
+
+	RETURN_LONG(bytes);
+}
+
+PHP_METHOD(DOM_HTML5Document, saveHTML)
+{
+	zval *nodep = NULL;
+	const xmlDoc *docp;
+	const xmlNode *node;
+	dom_object *intern, *nodeobj;
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_node_class_entry) == FAILURE) {
+		RETURN_THROWS();
+	}
+
+	DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
+
+	if (nodep != NULL) {
+		DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
+		if (node->doc != docp) {
+			php_dom_throw_error(WRONG_DOCUMENT_ERR, dom_get_strict_error(intern->document));
+			RETURN_FALSE;
+		}
+	} else {
+		node = (const xmlNode *) docp;
+	}
+
+	smart_str buf = {0};
+	dom_output_ctx output_ctx;
+	output_ctx.output_data = &buf;
+	output_ctx.write_output = dom_write_output_smart_str;
+	/* Can't fail because dom_write_output_smart_str() can't fail.
*/ + zend_result result = dom_common_save(&output_ctx, docp, node); + ZEND_ASSERT(result == SUCCESS); + + RETURN_STR(smart_str_extract(&buf)); +} + +PHP_METHOD(DOM_HTML5Document, __construct) +{ + php_dom_document_constructor(INTERNAL_FUNCTION_PARAM_PASSTHRU); + Z_DOMOBJ_P(ZEND_THIS)->document->is_html5_class = true; +} + +PHP_METHOD(DOM_HTML5Document, load) +{ + xmlDocPtr doc = NULL; + dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_FILE, &doc); + dom_mark_namespaces_as_attributes_too(doc); +} + +PHP_METHOD(DOM_HTML5Document, loadXML) +{ + xmlDocPtr doc = NULL; + dom_parse_document(INTERNAL_FUNCTION_PARAM_PASSTHRU, DOM_LOAD_STRING, &doc); + dom_mark_namespaces_as_attributes_too(doc); +} + +zend_result dom_html5_document_encoding_write(dom_object *obj, zval *newval) +{ + xmlDoc *docp = (xmlDocPtr) dom_object_get_node(obj); + if (docp == NULL) { + php_dom_throw_error(INVALID_STATE_ERR, 1); + return FAILURE; + } + + /* Typed property, can only be IS_STRING or IS_NULL. */ + ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING || Z_TYPE_P(newval) == IS_NULL); + + if (Z_TYPE_P(newval) == IS_NULL) { + goto invalid_encoding; + } + + zend_string *str = Z_STR_P(newval); + const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str)); + + if (encoding_data != NULL) { + xmlFree((xmlChar *) docp->encoding); + docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name); + } else { + goto invalid_encoding; + } + + return SUCCESS; + +invalid_encoding: + zend_value_error("Invalid document encoding"); + return FAILURE; +} + +#endif /* HAVE_LIBXML && HAVE_DOM */ diff --git a/ext/dom/html5_parser.c b/ext/dom/html5_parser.c new file mode 100644 index 0000000000000..bddccd17b153b --- /dev/null +++ b/ext/dom/html5_parser.c @@ -0,0 +1,262 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + 
+----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Niels Dossche | + +----------------------------------------------------------------------+ +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "php.h" +#if defined(HAVE_LIBXML) && defined(HAVE_DOM) +#include "html5_parser.h" +#include "namespace_compat.h" +#include +#include +#include +#include +#include +#include + +typedef struct { + lxb_dom_node_t *node; + uintptr_t current_active_namespace; + xmlNodePtr lxml_parent; + xmlNsPtr lxml_ns; +} work_list_item; + +static void lexbor_libxml2_bridge_work_list_item_push(lexbor_array_obj_t *array, lxb_dom_node_t *node, uintptr_t current_active_namespace, xmlNodePtr lxml_parent, xmlNsPtr lxml_ns) +{ + work_list_item *item = (work_list_item *) lexbor_array_obj_push_wo_cls(array); + item->node = node; + item->current_active_namespace = current_active_namespace; + item->lxml_parent = lxml_parent; + item->lxml_ns = lxml_ns; +} + +static unsigned short sanitize_line_nr(size_t line) +{ + if (line > USHRT_MAX) { + return USHRT_MAX; + } + return (unsigned short) line; +} + +static const xmlChar *get_libxml_namespace_href(uintptr_t lexbor_namespace) +{ + if (lexbor_namespace == LXB_NS_SVG) { + return (const xmlChar *) DOM_SVG_NS_URI; + } else if (lexbor_namespace == LXB_NS_MATH) { + return (const xmlChar *) DOM_MATHML_NS_URI; + } else { + return (const xmlChar *) DOM_XHTML_NS_URI; + } +} + +static 
lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(lxb_dom_node_t *start_node, xmlDocPtr lxml_doc, bool compact_text_nodes, bool create_default_ns) +{ + lexbor_libxml2_bridge_status retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OK; + + lexbor_array_obj_t work_list; + lexbor_array_obj_init(&work_list, 128, sizeof(work_list_item)); + + for (lxb_dom_node_t *node = start_node; node != NULL; node = node->prev) { + lexbor_libxml2_bridge_work_list_item_push(&work_list, node, LXB_NS__UNDEF, (xmlNodePtr) lxml_doc, NULL); + } + + work_list_item *current_stack_item; + while ((current_stack_item = lexbor_array_obj_pop(&work_list)) != NULL) { + lxb_dom_node_t *node = current_stack_item->node; + xmlNodePtr lxml_parent = current_stack_item->lxml_parent; + + /* CDATA section and processing instructions don't occur in parsed HTML documents. + * The historical types are not emitted by the parser either. */ + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + /* Note: HTML isn't exactly XML-namespace-aware; as this is an HTML parser we only care about the local name. + * If a prefix:name format is used, then the local name will be "prefix:name" and the prefix will be empty. + * There is however still somewhat of a concept of namespaces. There are three: HTML (the default), SVG, and MATHML. 
*/ + lxb_dom_element_t *element = lxb_dom_interface_element(node); + const lxb_char_t *name = lxb_dom_element_local_name(element, NULL); + xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc, NULL, name, NULL); + if (UNEXPECTED(lxml_element == NULL)) { + retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM; + goto out; + } + xmlAddChild(lxml_parent, lxml_element); + lxml_element->line = sanitize_line_nr(node->line); + + /* Namespaces, note: namespace switches are uncommon */ + uintptr_t entering_namespace = element->node.ns; + xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns; + if (create_default_ns && UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) { + current_lxml_ns = xmlNewNs(lxml_element, get_libxml_namespace_href(entering_namespace), NULL); + } + lxml_element->ns = current_lxml_ns; /* Instead of xmlSetNs() because we know the arguments are valid. Prevents overhead. */ + + for (lxb_dom_node_t *child_node = element->node.last_child; child_node != NULL; child_node = child_node->prev) { + lexbor_libxml2_bridge_work_list_item_push(&work_list, child_node, entering_namespace, lxml_element, current_lxml_ns); + } + + for (lxb_dom_attr_t *attr = element->last_attr; attr != NULL; attr = attr->prev) { + lexbor_libxml2_bridge_work_list_item_push(&work_list, (lxb_dom_node_t *) attr, entering_namespace, lxml_element, current_lxml_ns); + } + } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) { + lxb_dom_text_t *text = lxb_dom_interface_text(node); + const lxb_char_t *data = text->char_data.data.data; + size_t data_length = text->char_data.data.length; + if (UNEXPECTED(data_length >= INT_MAX)) { + retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW; + goto out; + } + xmlNodePtr lxml_text; + if (compact_text_nodes && data_length < sizeof(void *) * 2) { + /* See xmlSAX2TextNode() in libxml2 */ + lxml_text = xmlMalloc(sizeof(xmlNode)); + if (UNEXPECTED(lxml_text == NULL)) { + retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM; + goto out; + } + memset(lxml_text, 0, 
sizeof(xmlNode)); + lxml_text->name = xmlStringText; + lxml_text->type = XML_TEXT_NODE; + lxml_text->doc = lxml_doc; + lxml_text->content = (xmlChar *) &lxml_text->properties; + memcpy(lxml_text->content, data, data_length + 1 /* include '\0' */); + } else { + lxml_text = xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length); + if (UNEXPECTED(lxml_text == NULL)) { + retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM; + goto out; + } + } + xmlAddChild(lxml_parent, lxml_text); + if (node->line >= USHRT_MAX) { + lxml_text->line = USHRT_MAX; + lxml_text->psvi = (void *) (ptrdiff_t) node->line; + } else { + lxml_text->line = (unsigned short) node->line; + } + } else if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) { + lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node); + const lxb_char_t *name = lxb_dom_document_type_name(doctype, NULL); + size_t public_id_len, system_id_len; + const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_id_len); + const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_id_len); + xmlDtdPtr lxml_dtd = xmlCreateIntSubset(lxml_doc, name, public_id_len ? public_id : NULL, system_id_len ? 
system_id : NULL);
+			if (UNEXPECTED(lxml_dtd == NULL)) {
+				retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
+				goto out;
+			}
+			/* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
+		} else if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
+			/* Every attribute of an element is pushed onto the work list on its own, so only this
+			 * one is converted here. Also following attr->next would set each remaining attribute
+			 * again for every earlier work list entry. */
+			lxb_dom_attr_t *attr = lxb_dom_interface_attr(node);
+			/* Same namespace remark as for elements */
+			const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, NULL);
+			const lxb_char_t *value = lxb_dom_attr_value(attr, NULL);
+			xmlAttrPtr lxml_attr = xmlSetNsProp(lxml_parent, NULL, local_name, value);
+			if (UNEXPECTED(lxml_attr == NULL)) {
+				retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
+				goto out;
+			}
+			/* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
+		} else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
+			lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
+			xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
+			if (UNEXPECTED(lxml_comment == NULL)) {
+				retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
+				goto out;
+			}
+			xmlAddChild(lxml_parent, lxml_comment);
+			lxml_comment->line = sanitize_line_nr(node->line);
+		}
+	}
+
+out:
+	lexbor_array_obj_destroy(&work_list, false);
+	return retval;
+}
+
+/* Zero-initializes a parse context: no error reporters, no observations, no application data. */
+void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
+{
+	memset(ctx, 0, sizeof(*ctx));
+}
+
+/* Installs the callbacks that lexbor_libxml2_bridge_report_errors() invokes for tokenizer and tree errors. */
+void lexbor_libxml2_bridge_parse_set_error_callbacks(lexbor_libxml2_bridge_parse_context *ctx, lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter, lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter)
+{
+	ctx->tokenizer_error_reporter = tokenizer_error_reporter;
+	ctx->tree_error_reporter = tree_error_reporter;
+}
+
+/* Converts a parsed lexbor document into a newly allocated libxml2 HTML document.
+ * On success stores the document in *doc_out (owned by the caller) and returns
+ * LEXBOR_LIBXML2_BRIDGE_STATUS_OK. On failure nothing is stored, the partially built document is
+ * freed, and the failure status is returned. */
+lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(lxb_html_document_t *document, xmlDocPtr *doc_out, bool compact_text_nodes, bool create_default_ns)
+{
+#ifdef LIBXML_HTML_ENABLED
+	xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
+#else
+	xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
+#endif
+	if (!lxml_doc) {
+		return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
+	}
+#ifndef LIBXML_HTML_ENABLED
+	/* Only touch the document after the allocation check above: xmlNewDoc() may return NULL. */
+	lxml_doc->type = XML_HTML_DOCUMENT_NODE;
+#endif
+	lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(lxb_dom_interface_node(document)->last_child, lxml_doc, compact_text_nodes, create_default_ns);
+	if (status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK) {
+		xmlFreeDoc(lxml_doc);
+		return status;
+	}
+	*doc_out = lxml_doc;
+	return LEXBOR_LIBXML2_BRIDGE_STATUS_OK;
+}
+
+/* Reports the tokenizer and tree-builder errors that have not been reported yet.
+ * The two *error_index_offset_* values are the indices of the first unreported entry in the
+ * respective error arrays; they are advanced past everything visited, whether or not a reporter
+ * is installed. Tokenizer errors are reported with their byte offset relative to `input_html`
+ * plus `chunk_offset`; tree errors with their line and column incremented by one. */
+void lexbor_libxml2_bridge_report_errors(const lexbor_libxml2_bridge_parse_context *ctx, lxb_html_parser_t *parser, const lxb_char_t *input_html, size_t chunk_offset, size_t *error_index_offset_tokenizer, size_t *error_index_offset_tree)
+{
+	void *error;
+
+	/* Tokenizer errors */
+	lexbor_array_obj_t *parse_errors = lxb_html_parser_tokenizer(parser)->parse_errors;
+	size_t index = *error_index_offset_tokenizer;
+	while ((error = lexbor_array_obj_get(parse_errors, index)) != NULL) {
+		/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tokenizer/error.h */
+		lxb_html_tokenizer_error_t *token_error = error;
+		if (ctx->tokenizer_error_reporter) {
+			ctx->tokenizer_error_reporter(ctx->application_data, token_error, token_error->pos - input_html + chunk_offset);
+		}
+		index++;
+	}
+	*error_index_offset_tokenizer = index;
+
+	/* Tree parser errors */
+	parse_errors = lxb_html_parser_tree(parser)->parse_errors;
+	index = *error_index_offset_tree;
+	while ((error = lexbor_array_obj_get(parse_errors, index)) != NULL) {
+		/* See https://github.com/lexbor/lexbor/blob/master/source/lexbor/html/tree/error.h */
+		lxb_html_tree_error_t *tree_error = error;
+		if (ctx->tree_error_reporter) {
+			ctx->tree_error_reporter(ctx->application_data, tree_error, tree_error->line + 1, tree_error->column + 1, tree_error->length);
+		}
+		index++;
+	}
+	*error_index_offset_tree = index;
+}
+
+void
lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations) +{ + observations->has_explicit_html_tag = tree->has_explicit_html_tag; + observations->has_explicit_head_tag = tree->has_explicit_head_tag; + observations->has_explicit_body_tag = tree->has_explicit_body_tag; +} + +#endif /* HAVE_LIBXML && HAVE_DOM */ diff --git a/ext/dom/html5_parser.h b/ext/dom/html5_parser.h new file mode 100644 index 0000000000000..e0e5b7b55cbf6 --- /dev/null +++ b/ext/dom/html5_parser.h @@ -0,0 +1,57 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ + | Authors: Niels Dossche | + +----------------------------------------------------------------------+ +*/ + +#ifndef CONVERT_H +#define CONVERT_H + +#include +#include +#include + +typedef enum { + LEXBOR_LIBXML2_BRIDGE_STATUS_OK = 0, + LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT, + LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE, + LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW, + LEXBOR_LIBXML2_BRIDGE_STATUS_OOM, +} lexbor_libxml2_bridge_status; + +typedef void (*lexbor_libxml2_bridge_tokenizer_error_reporter)(void *application_data, lxb_html_tokenizer_error_t *error, size_t offset); +typedef void (*lexbor_libxml2_bridge_tree_error_reporter)(void *application_data, lxb_html_tree_error_t *error, size_t line, size_t column, size_t len); + +typedef struct { + bool has_explicit_html_tag; + bool has_explicit_head_tag; + bool has_explicit_body_tag; +} lexbor_libxml2_bridge_extracted_observations; + +typedef struct { + /* Private fields */ + lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter; + lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter; + /* Public fields */ + lexbor_libxml2_bridge_extracted_observations observations; + /* Application data, do what you want with this */ + void *application_data; +} lexbor_libxml2_bridge_parse_context; + +void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx); +void lexbor_libxml2_bridge_parse_set_error_callbacks(lexbor_libxml2_bridge_parse_context *ctx, lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter, lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter); +lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(lxb_html_document_t *document, xmlDocPtr *doc_out, bool compact_text_nodes, bool create_default_ns); +void lexbor_libxml2_bridge_report_errors(const lexbor_libxml2_bridge_parse_context *ctx, lxb_html_parser_t *parser, const lxb_char_t *input_html, size_t 
chunk_offset, size_t *error_index_offset_tokenizer, size_t *error_index_offset_tree); +void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations); + +#endif diff --git a/ext/dom/html5_serializer.c b/ext/dom/html5_serializer.c new file mode 100644 index 0000000000000..daa2e0ce2ec0a --- /dev/null +++ b/ext/dom/html5_serializer.c @@ -0,0 +1,351 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ + | Authors: Niels Dossche | + +----------------------------------------------------------------------+ +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "php.h" +#if defined(HAVE_LIBXML) && defined(HAVE_DOM) +#include "php_dom.h" +#include "html5_serializer.h" +#include "namespace_compat.h" +#include + +#define TRY(x) do { if (UNEXPECTED((x) != SUCCESS)) { return FAILURE; } } while (0) + +static bool dom_is_ns(const xmlNode *node, const char *uri) +{ + return node->ns != NULL && strcmp((const char *) node->ns->href, uri) == 0; +} + +static bool dom_is_html_ns(const xmlNode *node) +{ + return node->ns == NULL || dom_is_ns(node, DOM_XHTML_NS_URI); +} + +static bool dom_local_name_compare_ex(const xmlNode *node, const char *tag, size_t tag_length, size_t name_length) +{ + return name_length == tag_length && zend_binary_strcmp((const char *) node->name, name_length, tag, tag_length) == 0; +} + +static zend_result dom_html5_serialize_doctype(dom_html5_serialize_context *ctx, const xmlDtd *dtd) +{ + TRY(ctx->write_string_len(ctx->application_data, "write_string(ctx->application_data, (const char *) dtd->name)); + return ctx->write_string_len(ctx->application_data, ">", strlen(">")); +} + +static zend_result dom_html5_serialize_comment(dom_html5_serialize_context *ctx, const xmlNode *node) +{ + TRY(ctx->write_string_len(ctx->application_data, "", strlen("-->")); +} + +static zend_result dom_html5_serialize_processing_instruction(dom_html5_serialize_context *ctx, const xmlNode *node) +{ + TRY(ctx->write_string_len(ctx->application_data, "write_string(ctx->application_data, (const char *) node->name)); + TRY(ctx->write_string_len(ctx->application_data, " ", strlen(" "))); + TRY(ctx->write_string(ctx->application_data, (const char *) node->content)); + return ctx->write_string_len(ctx->application_data, ">", strlen(">")); +} + +/* 
https://html.spec.whatwg.org/multipage/parsing.html#escapingString */
+/*
+ * Writes content through the context's write_string_len callback, escaped as HTML text.
+ *
+ * ctx            - serialization context providing write_string_len and application_data
+ * content        - NUL-terminated input, scanned byte by byte (non-breaking space is matched as UTF-8 C2 A0)
+ * attribute_mode - when true '"' is escaped and '<' / '>' are left alone; when false it is the other way around
+ *
+ * Unescaped runs are flushed in one write each: last_output marks the start of the pending run.
+ * Returns FAILURE as soon as a write does not report SUCCESS, otherwise the result of the final flush.
+ */
+static zend_result dom_html5_escape_string(dom_html5_serialize_context *ctx, const char *content, bool attribute_mode)
+{
+	const char *last_output = content;
+
+	while (*content != '\0') {
+		switch (*content) {
+			/* Step 1 */
+			case '&': {
+				TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
+				TRY(ctx->write_string_len(ctx->application_data, "&amp;", strlen("&amp;")));
+				last_output = content + 1;
+				break;
+			}
+
+			/* Step 2 (non-breaking space) (note: uses UTF-8 internally) */
+			case '\xC2': {
+				/* Reading content[1] is in bounds: content[0] is not the terminator, so the next byte is at worst the NUL. */
+				if (content[1] == '\xA0') {
+					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
+					TRY(ctx->write_string_len(ctx->application_data, "&nbsp;", strlen("&nbsp;")));
+					content++; /* Consume A0 too */
+					last_output = content + 1;
+				}
+				break;
+			}
+
+			/* Step 3 */
+			case '"': {
+				if (attribute_mode) {
+					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
+					TRY(ctx->write_string_len(ctx->application_data, "&quot;", strlen("&quot;")));
+					last_output = content + 1;
+				}
+				break;
+			}
+
+			/* Step 4 */
+			case '<': {
+				if (!attribute_mode) {
+					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
+					TRY(ctx->write_string_len(ctx->application_data, "&lt;", strlen("&lt;")));
+					last_output = content + 1;
+				}
+				break;
+			}
+			case '>': {
+				if (!attribute_mode) {
+					TRY(ctx->write_string_len(ctx->application_data, last_output, content - last_output));
+					TRY(ctx->write_string_len(ctx->application_data, "&gt;", strlen("&gt;")));
+					last_output = content + 1;
+				}
+				break;
+			}
+		}
+
+		content++;
+	}
+
+	/* Flush whatever follows the last escaped character (possibly nothing). */
+	return ctx->write_string_len(ctx->application_data, last_output, content - last_output);
+}
+
+static zend_result dom_html5_serialize_text_node(dom_html5_serialize_context *ctx, const xmlNode *node)
+{
+	if (node->parent->type == XML_ELEMENT_NODE && dom_is_html_ns(node->parent)) {
+		const xmlNode *parent = node->parent;
+		size_t
name_length = strlen((const char *) parent->name); + /* Note: