From 623cdb911a4905221fc72e1a74c79765491dfe0e Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 16 Jan 2025 20:11:35 +0100 Subject: [PATCH] Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument We need to properly handle the case where we previously returned because there were too few bytes. This needs to be handled separately because otherwise the while loop just performs a partial byte copy. --- ext/dom/html_document.c | 27 +++++++++++++-- .../tests/modern/html/encoding/gh17481.phpt | 33 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 ext/dom/tests/modern/html/encoding/gh17481.phpt diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index ed7454dd89d4..ca6d215154e2 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path( size_t *tree_error_offset ) { - decoding_encoding_ctx->decode.status = LXB_STATUS_OK; - const lxb_char_t *buf_ref = *buf_ref_ref; + + /* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. 
*/ + if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) { + lxb_char_t buf[4]; + lxb_char_t *buf_ptr = buf; + lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); + if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) { + buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); + } + decoding_encoding_ctx->decode.status = LXB_STATUS_OK; + + if (!dom_process_parse_chunk( + ctx, + document, + parser, + buf_ptr - buf, + buf, + buf_ptr - buf, + tokenizer_error_offset, + tree_error_offset + )) { + goto fail_oom; + } + } + const lxb_char_t *last_output = buf_ref; while (buf_ref != buf_end) { /* Fast path converts non-validated UTF-8 -> validated UTF-8 */ diff --git a/ext/dom/tests/modern/html/encoding/gh17481.phpt b/ext/dom/tests/modern/html/encoding/gh17481.phpt new file mode 100644 index 000000000000..74e13e130094 --- /dev/null +++ b/ext/dom/tests/modern/html/encoding/gh17481.phpt @@ -0,0 +1,33 @@ +--TEST-- +GH-17481 (UTF-8 corruption in \Dom\HTMLDocument) +--EXTENSIONS-- +dom +--FILE-- +$input"; + if ($endTag) { + $Data .= ''; + } + $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8'); + var_dump($Document->body->textContent === $input); +} + +?> +--EXPECT-- +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true)