Skip to content

HTML5 parser optimizations #13702

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions UPGRADING.INTERNALS
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ PHP 8.4 INTERNALS UPGRADE NOTES
- Removed the "properties" HashTable field from php_libxml_node_object.
- Added a way to attach private data to a php_libxml_ref_obj.
- Added a way to fix a class type onto php_libxml_ref_obj.
- Added php_libxml_uses_internal_errors().

e. ext/date
- Added the php_format_date_ex() API to format instances of php_date_obj.
Expand Down
58 changes: 42 additions & 16 deletions ext/dom/html_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,35 +245,43 @@ static void dom_find_line_and_column_using_cache(
offset = application_data->current_input_length;
}

size_t last_column = cache->last_column;
size_t last_line = cache->last_line;
size_t last_offset = cache->last_offset;

/* Either unicode or UTF-8 data */
if (application_data->current_input_codepoints != NULL) {
while (cache->last_offset < offset) {
if (application_data->current_input_codepoints[cache->last_offset] == 0x000A /* Unicode codepoint for line feed */) {
cache->last_line++;
cache->last_column = 1;
while (last_offset < offset) {
if (application_data->current_input_codepoints[last_offset] == 0x000A /* Unicode codepoint for line feed */) {
last_line++;
last_column = 1;
} else {
cache->last_column++;
last_column++;
}
cache->last_offset++;
last_offset++;
}
} else {
while (cache->last_offset < offset) {
const lxb_char_t current = application_data->current_input_characters[cache->last_offset];
while (last_offset < offset) {
const lxb_char_t current = application_data->current_input_characters[last_offset];
if (current == '\n') {
cache->last_line++;
cache->last_column = 1;
cache->last_offset++;
last_line++;
last_column = 1;
last_offset++;
} else {
/* See Lexbor tokenizer patch
* Note for future self: branchlessly computing the length and jumping by the length would be nice,
* however it takes so many instructions to do so that it is slower than this naive method. */
if ((current & 0b11000000) != 0b10000000) {
cache->last_column++;
last_column++;
}
cache->last_offset++;
last_offset++;
}
}
}

cache->last_column = last_column;
cache->last_line = last_line;
cache->last_offset = last_offset;
}

static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
Expand Down Expand Up @@ -517,8 +525,16 @@ static bool dom_decode_encode_fast_path(
const lxb_char_t *buf_ref = *buf_ref_ref;
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
const lxb_char_t *buf_ref_backup = buf_ref;
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
* need more UTF-8 bytes to complete a sequence.
* It might be tempting to use SIMD here, but it turns out that this is less efficient because
* we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
buf_ref++;
continue;
}
const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
Expand Down Expand Up @@ -749,6 +765,16 @@ PHP_METHOD(DOM_HTMLDocument, createEmpty)
RETURN_THROWS();
}

/* Tells the caller whether installing parser error callbacks is worthwhile,
 * i.e. whether an error report could be observed at all.
 *
 * options: parser option bitmask; XML_PARSE_NOERROR disables registration outright.
 * Returns true when php_libxml_uses_internal_errors() reports true, or when
 * E_WARNING is set in either the engine's error_reporting mask or the user
 * error handler's mask; false otherwise. */
static bool dom_should_register_error_handlers(zend_long options)
{
	/* The caller opted out of error reporting for this parse. */
	if ((options & XML_PARSE_NOERROR) != 0) {
		return false;
	}

	/* Checked first, so the error-level masks are only consulted when this is false. */
	if (php_libxml_uses_internal_errors()) {
		return true;
	}

	/* A warning is observable if either mask lets E_WARNING through. */
	return ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING) != 0;
}

PHP_METHOD(DOM_HTMLDocument, createFromString)
{
const char *source, *override_encoding = NULL;
Expand Down Expand Up @@ -777,7 +803,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
dom_reset_line_column_cache(&application_data.cache_tokenizer);
lexbor_libxml2_bridge_parse_context ctx;
lexbor_libxml2_bridge_parse_context_init(&ctx);
if (!(options & XML_PARSE_NOERROR)) {
if (dom_should_register_error_handlers(options)) {
lexbor_libxml2_bridge_parse_set_error_callbacks(
&ctx,
dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
Expand Down Expand Up @@ -936,7 +962,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
dom_reset_line_column_cache(&application_data.cache_tokenizer);
lexbor_libxml2_bridge_parse_context ctx;
lexbor_libxml2_bridge_parse_context_init(&ctx);
if (!(options & XML_PARSE_NOERROR)) {
if (dom_should_register_error_handlers(options)) {
lexbor_libxml2_bridge_parse_set_error_callbacks(
&ctx,
dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
Expand Down
19 changes: 19 additions & 0 deletions ext/dom/tests/modern/html/parser/user_error_handler.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
--TEST--
Parse HTML document with user error handler and error_reporting(0)
--EXTENSIONS--
dom
--INI--
error_reporting=0
--FILE--
<?php

// The --INI-- section sets error_reporting=0, so the only way the warning can
// surface in this test is through the user handler below, which is registered
// for E_WARNING and dumps the error number and message.
set_error_handler(function ($errno, $errstr, $errfile, $errline) {
var_dump($errno, $errstr);
}, E_WARNING);

// The --EXPECT-- section pins a single tree-error warning for this input.
DOM\HTMLDocument::createFromString('<html></html>');

?>
--EXPECT--
int(2)
string(113) "DOM\HTMLDocument::createFromString(): tree error unexpected-token-in-initial-mode in Entity, line: 1, column: 2-5"
15 changes: 7 additions & 8 deletions ext/libxml/libxml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1055,23 +1055,22 @@ PHP_FUNCTION(libxml_set_streams_context)
}
/* }}} */

PHP_LIBXML_API bool php_libxml_uses_internal_errors(void)
{
return xmlStructuredError == php_libxml_structured_error_handler;
}

/* {{{ Disable libxml errors and allow user to fetch error information as needed */
PHP_FUNCTION(libxml_use_internal_errors)
{
xmlStructuredErrorFunc current_handler;
bool use_errors, use_errors_is_null = 1, retval;
bool use_errors, use_errors_is_null = true;

ZEND_PARSE_PARAMETERS_START(0, 1)
Z_PARAM_OPTIONAL
Z_PARAM_BOOL_OR_NULL(use_errors, use_errors_is_null)
ZEND_PARSE_PARAMETERS_END();

current_handler = xmlStructuredError;
if (current_handler && current_handler == php_libxml_structured_error_handler) {
retval = 1;
} else {
retval = 0;
}
bool retval = php_libxml_uses_internal_errors();

if (use_errors_is_null) {
RETURN_BOOL(retval);
Expand Down
1 change: 1 addition & 0 deletions ext/libxml/php_libxml.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ PHP_LIBXML_API void php_libxml_issue_error(int level, const char *msg);
PHP_LIBXML_API bool php_libxml_disable_entity_loader(bool disable);
PHP_LIBXML_API void php_libxml_set_old_ns(xmlDocPtr doc, xmlNsPtr ns);
PHP_LIBXML_API php_stream_context *php_libxml_get_stream_context(void);
PHP_LIBXML_API bool php_libxml_uses_internal_errors(void);

PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end);
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s);
Expand Down