From cd6debc2a008970e5c700c15358d1b6128a37a9a Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 10 Oct 2024 23:24:03 +0200 Subject: [PATCH] Use SWAR to seek for non-ASCII UTF-8 in DOM parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub FYP test case: ``` Benchmark 1: ./sapi/cli/php test.php Time (mean ± σ): 502.8 ms ± 6.2 ms [User: 498.3 ms, System: 3.2 ms] Range (min … max): 495.2 ms … 509.8 ms 10 runs Benchmark 2: ./sapi/cli/php_old test.php Time (mean ± σ): 518.4 ms ± 4.3 ms [User: 513.9 ms, System: 3.2 ms] Range (min … max): 511.5 ms … 525.5 ms 10 runs Summary ./sapi/cli/php test.php ran 1.03 ± 0.02 times faster than ./sapi/cli/php_old test.php ``` Wikipedia English homepage test case: ``` Benchmark 1: ./sapi/cli/php test.php Time (mean ± σ): 301.1 ms ± 4.2 ms [User: 295.5 ms, System: 4.8 ms] Range (min … max): 296.3 ms … 308.8 ms 10 runs Benchmark 2: ./sapi/cli/php_old test.php Time (mean ± σ): 308.2 ms ± 1.7 ms [User: 304.6 ms, System: 2.9 ms] Range (min … max): 306.9 ms … 312.8 ms 10 runs Summary ./sapi/cli/php test.php ran 1.02 ± 0.02 times faster than ./sapi/cli/php_old test.php ``` --- ext/dom/html_document.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index ed7454dd89d43..3ca812bde50b6 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -30,6 +30,7 @@ #include #include #include +#include /* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */ #define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8 @@ -517,6 +518,30 @@ static bool dom_process_parse_chunk( return true; } +/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input. + * Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. 
*/
+static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
+{
+	/* Compare the remaining length: `*data + sizeof(size_t)` could point beyond one-past-`end`, which is UB. */
+	while ((size_t) (end - *data) >= sizeof(size_t)) {
+		size_t bytes;
+		memcpy(&bytes, *data, sizeof(bytes));
+		/* A set top bit means this word holds a non-ASCII byte; stop here so the byte loop pinpoints it. */
+		if ((bytes & LEXBOR_SWAR_REPEAT(0x80)) != 0) {
+			break;
+		}
+		*data += sizeof(size_t);
+	}
+	/* Byte-wise scan: covers a tail shorter than one word as well as a word flagged above. */
+	while (*data < end) {
+		if (**data & 0x80) {
+			return false;
+		}
+		(*data)++;
+	}
+	return true;
+}
+
 static bool dom_decode_encode_fast_path(
 	lexbor_libxml2_bridge_parse_context *ctx,
 	lxb_html_document_t *document,
@@ -534,13 +559,13 @@ static bool dom_decode_encode_fast_path(
 	const lxb_char_t *last_output = buf_ref;
 	while (buf_ref != buf_end) {
 		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
-		if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
+		if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
 			/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
-			 * need more UTF-8 bytes to complete a sequence.
-			 * It might be tempting to use SIMD here, but it turns out that this is less efficient because
-			 * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
-			buf_ref++;
-			continue;
+			 * need more UTF-8 bytes to complete a sequence. */
+			if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
+				ZEND_ASSERT(buf_ref == buf_end);
+				break;
+			}
 		}
 		const lxb_char_t *buf_ref_backup = buf_ref;
 		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);