From 291f1fbf2041d9add6e13394cae74fffca6e3aa0 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 29 Jun 2024 22:09:36 +0200 Subject: [PATCH] Update Lexbor --- ext/dom/lexbor/lexbor/core/diyfp.h | 4 +- ext/dom/lexbor/lexbor/core/perf.h | 7 +- ext/dom/lexbor/lexbor/core/swar.h | 68 +++++++++++++++++++ ext/dom/lexbor/lexbor/css/parser.c | 10 +++ ext/dom/lexbor/lexbor/html/encoding.c | 2 + ext/dom/lexbor/lexbor/html/tokenizer/state.c | 5 ++ .../lexbor/html/tree/insertion_mode/in_head.c | 2 +- 7 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 ext/dom/lexbor/lexbor/core/swar.h diff --git a/ext/dom/lexbor/lexbor/core/diyfp.h b/ext/dom/lexbor/lexbor/core/diyfp.h index ca3d8e46a5bd..47fedb9da997 100644 --- a/ext/dom/lexbor/lexbor/core/diyfp.h +++ b/ext/dom/lexbor/lexbor/core/diyfp.h @@ -71,7 +71,7 @@ lexbor_cached_power_bin(int exp, int *dec_exp); /* * Inline functions */ -#if (LEXBOR_HAVE_BUILTIN_CLZLL) +#ifdef LEXBOR_HAVE_BUILTIN_CLZLL #define nxt_leading_zeros64(x) (((x) == 0) ? 
64 : __builtin_clzll(x)) #else @@ -199,7 +199,7 @@ lexbor_diyfp_sub(lexbor_diyfp_t lhs, lexbor_diyfp_t rhs) lxb_inline lexbor_diyfp_t lexbor_diyfp_mul(lexbor_diyfp_t lhs, lexbor_diyfp_t rhs) { -#if (LEXBOR_HAVE_UNSIGNED_INT128) +#ifdef LEXBOR_HAVE_UNSIGNED_INT128 uint64_t l, h; lxb_uint128_t u128; diff --git a/ext/dom/lexbor/lexbor/core/perf.h b/ext/dom/lexbor/lexbor/core/perf.h index 44041cd3740f..49bf0bb6460a 100644 --- a/ext/dom/lexbor/lexbor/core/perf.h +++ b/ext/dom/lexbor/lexbor/core/perf.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Alexander Borisov + * Copyright (C) 2018-2024 Alexander Borisov * * Author: Alexander Borisov */ @@ -14,9 +14,6 @@ extern "C" { #include "lexbor/core/base.h" -#ifdef LEXBOR_WITH_PERF - - LXB_API void * lexbor_perf_create(void); @@ -36,8 +33,6 @@ LXB_API double lexbor_perf_in_sec(void *perf); -#endif /* LEXBOR_WITH_PERF */ - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/ext/dom/lexbor/lexbor/core/swar.h b/ext/dom/lexbor/lexbor/core/swar.h new file mode 100644 index 000000000000..a93d30ad5dff --- /dev/null +++ b/ext/dom/lexbor/lexbor/core/swar.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2024 Alexander Borisov + * + * Author: Niels Dossche + */ + +#ifndef LEXBOR_SWAR_H +#define LEXBOR_SWAR_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include "lexbor/core/base.h" + + +/* + * Based on techniques from https://graphics.stanford.edu/~seander/bithacks.html + */ +#define LEXBOR_SWAR_ONES (~((size_t) 0) / 0xFF) +#define LEXBOR_SWAR_REPEAT(x) (LEXBOR_SWAR_ONES * (x)) +#define LEXBOR_SWAR_HAS_ZERO(v) (((v) - LEXBOR_SWAR_ONES) & ~(v) & LEXBOR_SWAR_REPEAT(0x80)) +#define LEXBOR_SWAR_IS_LITTLE_ENDIAN (*(unsigned char *) &(uint16_t){1}) + + +/* + * When handling hot loops that search for a set of characters, + * this function can be used to quickly move the data pointer much + * closer to the first occurrence of such a character. 
+ */ +lxb_inline const lxb_char_t * +lexbor_swar_seek4(const lxb_char_t *data, const lxb_char_t *end, + lxb_char_t c1, lxb_char_t c2, lxb_char_t c3, lxb_char_t c4) +{ + size_t bytes, matches, t1, t2, t3, t4; + + if (LEXBOR_SWAR_IS_LITTLE_ENDIAN) { + while (end - data >= (int) sizeof(size_t)) { + memcpy(&bytes, data, sizeof(size_t)); + + t1 = bytes ^ LEXBOR_SWAR_REPEAT(c1); + t2 = bytes ^ LEXBOR_SWAR_REPEAT(c2); + t3 = bytes ^ LEXBOR_SWAR_REPEAT(c3); + t4 = bytes ^ LEXBOR_SWAR_REPEAT(c4); + matches = LEXBOR_SWAR_HAS_ZERO(t1) | LEXBOR_SWAR_HAS_ZERO(t2) + | LEXBOR_SWAR_HAS_ZERO(t3) | LEXBOR_SWAR_HAS_ZERO(t4); + + if (matches) { + data += ((((matches - 1) & LEXBOR_SWAR_ONES) * LEXBOR_SWAR_ONES) + >> (sizeof(size_t) * 8 - 8)) - 1; + break; + } else { + data += sizeof(size_t); + } + } + } + + return data; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_SWAR_H */ + diff --git a/ext/dom/lexbor/lexbor/css/parser.c b/ext/dom/lexbor/lexbor/css/parser.c index 44a6a64e0cb1..5ceffed0f970 100644 --- a/ext/dom/lexbor/lexbor/css/parser.c +++ b/ext/dom/lexbor/lexbor/css/parser.c @@ -62,6 +62,16 @@ lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz) parser->rules_end = parser->rules_begin + lxb_rules_length; parser->rules = parser->rules_begin; + /* + * Zero those parameters that can be used (passed to the function). + * The parser->rules->phase parameter will be assigned at the end of the + * parsing. + * + * The point is that parser->rules[0] is used as a stub before exiting + * parsing. 
+ */ + parser->rules->context = NULL; + /* Temp */ parser->pos = NULL; parser->str.length = 0; diff --git a/ext/dom/lexbor/lexbor/html/encoding.c b/ext/dom/lexbor/lexbor/html/encoding.c index 1b36e0dcfa70..5368400dab84 100644 --- a/ext/dom/lexbor/lexbor/html/encoding.c +++ b/ext/dom/lexbor/lexbor/html/encoding.c @@ -477,6 +477,8 @@ lxb_html_get_attribute(const lxb_char_t *data, const lxb_char_t *end, data++; } + *name_end = data; + spaces_state: data = lxb_html_encoding_skip_spaces(data, end); diff --git a/ext/dom/lexbor/lexbor/html/tokenizer/state.c b/ext/dom/lexbor/lexbor/html/tokenizer/state.c index 2f3414fe79f4..69b78f288d14 100644 --- a/ext/dom/lexbor/lexbor/html/tokenizer/state.c +++ b/ext/dom/lexbor/lexbor/html/tokenizer/state.c @@ -15,6 +15,7 @@ #define LEXBOR_STR_RES_MAP_HEX #define LEXBOR_STR_RES_MAP_NUM #include "lexbor/core/str_res.h" +#include "lexbor/core/swar.h" #define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST #include "lexbor/html/tokenizer/res.h" @@ -226,6 +227,8 @@ lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz, { lxb_html_tokenizer_state_begin_set(tkz, data); + data = lexbor_swar_seek4(data, end, 0x3C, 0x26, 0x0D, 0x00); + while (data != end) { switch (*data) { /* U+003C LESS-THAN SIGN (<) */ @@ -906,6 +909,8 @@ lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz lxb_html_tokenizer_state_begin_set(tkz, data); + data = lexbor_swar_seek4(data, end, 0x22, 0x26, 0x0D, 0x00); + while (data != end) { switch (*data) { /* U+0022 QUOTATION MARK (") */ diff --git a/ext/dom/lexbor/lexbor/html/tree/insertion_mode/in_head.c b/ext/dom/lexbor/lexbor/html/tree/insertion_mode/in_head.c index 5d91e607b9b0..4d19708544c2 100644 --- a/ext/dom/lexbor/lexbor/html/tree/insertion_mode/in_head.c +++ b/ext/dom/lexbor/lexbor/html/tree/insertion_mode/in_head.c @@ -175,7 +175,7 @@ lxb_html_tree_insertion_mode_in_head_open(lxb_html_tree_t *tree, * We can create function for this, but... 
* * The "in head noscript" insertion mode use this - * is you change this code, please, change it in in head noscript" mode + * if you change this code, please change it in "in head noscript" mode */ case LXB_TAG__TEXT: { lxb_html_token_t ws_token = {0};