Skip to content

Commit baa76be

Browse files
authored
Use SWAR to seek for non-ASCII UTF-8 in DOM parsing (#16350)
GitHub FYP test case:
```
Benchmark 1: ./sapi/cli/php test.php
  Time (mean ± σ): 502.8 ms ± 6.2 ms [User: 498.3 ms, System: 3.2 ms]
  Range (min … max): 495.2 ms … 509.8 ms 10 runs
Benchmark 2: ./sapi/cli/php_old test.php
  Time (mean ± σ): 518.4 ms ± 4.3 ms [User: 513.9 ms, System: 3.2 ms]
  Range (min … max): 511.5 ms … 525.5 ms 10 runs
Summary
  ./sapi/cli/php test.php ran 1.03 ± 0.02 times faster than ./sapi/cli/php_old test.php
```
Wikipedia English homepage test case:
```
Benchmark 1: ./sapi/cli/php test.php
  Time (mean ± σ): 301.1 ms ± 4.2 ms [User: 295.5 ms, System: 4.8 ms]
  Range (min … max): 296.3 ms … 308.8 ms 10 runs
Benchmark 2: ./sapi/cli/php_old test.php
  Time (mean ± σ): 308.2 ms ± 1.7 ms [User: 304.6 ms, System: 2.9 ms]
  Range (min … max): 306.9 ms … 312.8 ms 10 runs
Summary
  ./sapi/cli/php test.php ran 1.02 ± 0.02 times faster than ./sapi/cli/php_old test.php
```
1 parent 497dbaa commit baa76be

File tree

1 file changed

+31
-6
lines changed

1 file changed

+31
-6
lines changed

ext/dom/html_document.c

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <Zend/zend_smart_string.h>
3131
#include <lexbor/html/encoding.h>
3232
#include <lexbor/encoding/encoding.h>
33+
#include <lexbor/core/swar.h>
3334

3435
/* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
3536
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
@@ -517,6 +518,30 @@ static bool dom_process_parse_chunk(
517518
return true;
518519
}
519520

521+
/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
522+
* Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
523+
static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
524+
{
525+
while (*data + sizeof(size_t) <= end) {
526+
size_t bytes;
527+
memcpy(&bytes, *data, sizeof(bytes));
528+
/* If the top bit is set, it's not ASCII. */
529+
if ((bytes & LEXBOR_SWAR_REPEAT(0x80)) != 0) {
530+
return false;
531+
}
532+
*data += sizeof(size_t);
533+
}
534+
535+
while (*data < end) {
536+
if (**data & 0x80) {
537+
return false;
538+
}
539+
(*data)++;
540+
}
541+
542+
return true;
543+
}
544+
520545
static bool dom_decode_encode_fast_path(
521546
lexbor_libxml2_bridge_parse_context *ctx,
522547
lxb_html_document_t *document,
@@ -534,13 +559,13 @@ static bool dom_decode_encode_fast_path(
534559
const lxb_char_t *last_output = buf_ref;
535560
while (buf_ref != buf_end) {
536561
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
537-
if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
562+
if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
538563
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
539-
* need more UTF-8 bytes to complete a sequence.
540-
* It might be tempting to use SIMD here, but it turns out that this is less efficient because
541-
* we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
542-
buf_ref++;
543-
continue;
564+
* need more UTF-8 bytes to complete a sequence. */
565+
if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
566+
ZEND_ASSERT(buf_ref == buf_end);
567+
break;
568+
}
544569
}
545570
const lxb_char_t *buf_ref_backup = buf_ref;
546571
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);

0 commit comments

Comments
 (0)