From 6bf665c366adc350cfaccb50c9610082e27a3de4 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Wed, 13 Mar 2024 21:22:12 +0100 Subject: [PATCH 1/4] Add fast path for ASCII bytes in UTF-8 validation --- ext/dom/html_document.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index 1a05280f2c78..e2de5a48de44 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -517,8 +517,16 @@ static bool dom_decode_encode_fast_path( const lxb_char_t *buf_ref = *buf_ref_ref; const lxb_char_t *last_output = buf_ref; while (buf_ref != buf_end) { - const lxb_char_t *buf_ref_backup = buf_ref; /* Fast path converts non-validated UTF-8 -> validated UTF-8 */ + if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) { + /* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we + * need more UTF-8 bytes to complete a sequence. + * It might be tempting to use SIMD here, but it turns out that this is less efficient because + * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */ + buf_ref++; + continue; + } + const lxb_char_t *buf_ref_backup = buf_ref; lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) { size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */ From c16893845939da3d2655c93d03fb8b2b81ecbdc2 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Wed, 13 Mar 2024 21:22:31 +0100 Subject: [PATCH 2/4] Use temporary variables to reduce memory stores --- ext/dom/html_document.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index e2de5a48de44..c97fbbab39f7 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -245,35 +245,43 @@ static void dom_find_line_and_column_using_cache( offset = application_data->current_input_length; } + size_t last_column = cache->last_column; + size_t last_line = cache->last_line; + size_t last_offset = cache->last_offset; + /* Either unicode or UTF-8 data */ if (application_data->current_input_codepoints != NULL) { - while (cache->last_offset < offset) { - if (application_data->current_input_codepoints[cache->last_offset] == 0x000A /* Unicode codepoint for line feed */) { - cache->last_line++; - cache->last_column = 1; + while (last_offset < offset) { + if (application_data->current_input_codepoints[last_offset] == 0x000A /* Unicode codepoint for line feed */) { + last_line++; + last_column = 1; } else { - cache->last_column++; + last_column++; } - cache->last_offset++; + last_offset++; } } else { - while (cache->last_offset < offset) { - const lxb_char_t current = application_data->current_input_characters[cache->last_offset]; + while (last_offset < offset) { + const lxb_char_t current = application_data->current_input_characters[last_offset]; if (current == '\n') { - cache->last_line++; - cache->last_column = 1; - cache->last_offset++; + last_line++; + last_column = 1; + last_offset++; } else { /* See Lexbor tokenizer patch * Note for future self: branchlessly computing the length and jumping by the length would be nice, * however it takes so many instructions to do so that it is slower than this naive method. */ if ((current & 0b11000000) != 0b10000000) { - cache->last_column++; + last_column++; } - cache->last_offset++; + last_offset++; } } } + + cache->last_column = last_column; + cache->last_line = last_line; + cache->last_offset = last_offset; } static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter( From b1bea499397085f19b0c63f02a8608789864a5ab Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Wed, 13 Mar 2024 21:22:44 +0100 Subject: [PATCH 3/4] Only register error handling when observable --- UPGRADING.INTERNALS | 1 + ext/dom/html_document.c | 14 ++++++++++++-- ext/libxml/libxml.c | 15 +++++++-------- ext/libxml/php_libxml.h | 1 + 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS index e435e35add74..cdd3c59fb6bb 100644 --- a/UPGRADING.INTERNALS +++ b/UPGRADING.INTERNALS @@ -184,6 +184,7 @@ PHP 8.4 INTERNALS UPGRADE NOTES - Removed the "properties" HashTable field from php_libxml_node_object. - Added a way to attached private data to a php_libxml_ref_obj. - Added a way to fix a class type onto php_libxml_ref_obj. + - Added php_libxml_uses_internal_errors(). e. ext/date - Added the php_format_date_ex() API to format instances of php_date_obj. diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index c97fbbab39f7..7968a4143703 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -765,6 +765,16 @@ PHP_METHOD(DOM_HTMLDocument, createEmpty) RETURN_THROWS(); } +/* Only bother to register error handling when the error reports can become observable. */ +static bool dom_should_register_error_handlers(zend_long options) +{ + if (options & XML_PARSE_NOERROR) { + return false; + } + + return php_libxml_uses_internal_errors() || (EG(error_reporting) & E_WARNING); +} + PHP_METHOD(DOM_HTMLDocument, createFromString) { const char *source, *override_encoding = NULL; @@ -793,7 +803,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromString) dom_reset_line_column_cache(&application_data.cache_tokenizer); lexbor_libxml2_bridge_parse_context ctx; lexbor_libxml2_bridge_parse_context_init(&ctx); - if (!(options & XML_PARSE_NOERROR)) { + if (dom_should_register_error_handlers(options)) { lexbor_libxml2_bridge_parse_set_error_callbacks( &ctx, dom_lexbor_libxml2_bridge_tokenizer_error_reporter, @@ -952,7 +962,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile) dom_reset_line_column_cache(&application_data.cache_tokenizer); lexbor_libxml2_bridge_parse_context ctx; lexbor_libxml2_bridge_parse_context_init(&ctx); - if (!(options & XML_PARSE_NOERROR)) { + if (dom_should_register_error_handlers(options)) { lexbor_libxml2_bridge_parse_set_error_callbacks( &ctx, dom_lexbor_libxml2_bridge_tokenizer_error_reporter, diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index 15f6e0cb830a..9c94cb9da59f 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -1055,23 +1055,22 @@ PHP_FUNCTION(libxml_set_streams_context) } /* }}} */ +PHP_LIBXML_API bool php_libxml_uses_internal_errors(void) +{ + return xmlStructuredError == php_libxml_structured_error_handler; +} + /* {{{ Disable libxml errors and allow user to fetch error information as needed */ PHP_FUNCTION(libxml_use_internal_errors) { - xmlStructuredErrorFunc current_handler; - bool use_errors, use_errors_is_null = 1, retval; + bool use_errors, use_errors_is_null = true; ZEND_PARSE_PARAMETERS_START(0, 1) Z_PARAM_OPTIONAL Z_PARAM_BOOL_OR_NULL(use_errors, use_errors_is_null) ZEND_PARSE_PARAMETERS_END(); - current_handler = xmlStructuredError; - if (current_handler && current_handler == php_libxml_structured_error_handler) { - retval = 1; - } else { - retval = 0; - } + bool retval = php_libxml_uses_internal_errors(); if (use_errors_is_null) { RETURN_BOOL(retval); diff --git a/ext/libxml/php_libxml.h b/ext/libxml/php_libxml.h index 9385c1848a92..42bfb2e0fe2b 100644 --- a/ext/libxml/php_libxml.h +++ b/ext/libxml/php_libxml.h @@ -163,6 +163,7 @@ PHP_LIBXML_API void php_libxml_issue_error(int level, const char *msg); PHP_LIBXML_API bool php_libxml_disable_entity_loader(bool disable); PHP_LIBXML_API void php_libxml_set_old_ns(xmlDocPtr doc, xmlNsPtr ns); PHP_LIBXML_API php_stream_context *php_libxml_get_stream_context(void); +PHP_LIBXML_API bool php_libxml_uses_internal_errors(void); PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end); PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s); From ca25140db95fc1468036767971924c264afb136e Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 14 Mar 2024 19:39:23 +0100 Subject: [PATCH 4/4] Handle user error handlers --- ext/dom/html_document.c | 2 +- .../html/parser/user_error_handler.phpt | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 ext/dom/tests/modern/html/parser/user_error_handler.phpt diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index 7968a4143703..bc171ffab63c 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -772,7 +772,7 @@ static bool dom_should_register_error_handlers(zend_long options) return false; } - return php_libxml_uses_internal_errors() || (EG(error_reporting) & E_WARNING); + return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING); } PHP_METHOD(DOM_HTMLDocument, createFromString) diff --git a/ext/dom/tests/modern/html/parser/user_error_handler.phpt b/ext/dom/tests/modern/html/parser/user_error_handler.phpt new file mode 100644 index 000000000000..86b5a6d28f54 --- /dev/null +++ b/ext/dom/tests/modern/html/parser/user_error_handler.phpt @@ -0,0 +1,19 @@ +--TEST-- +Parse HTML document with user error handler and error_reporting(0) +--EXTENSIONS-- +dom +--INI-- +error_reporting=0 +--FILE-- +'); + +?> +--EXPECT-- +int(2) +string(113) "DOM\HTMLDocument::createFromString(): tree error unexpected-token-in-initial-mode in Entity, line: 1, column: 2-5"